import contextlib import logging import mimetypes import os from typing import Sequence from sqlalchemy import select from sqlalchemy.orm import Session import app.assets.services.hashing as hashing from app.assets.database.models import Asset, AssetReference, Tag from app.assets.database.queries import ( add_tags_to_reference, fetch_reference_and_asset, get_asset_by_hash, get_reference_by_file_path, get_reference_tags, get_or_create_reference, remove_missing_tag_for_asset_id, set_reference_metadata, set_reference_tags, upsert_asset, upsert_reference, ) from app.assets.helpers import normalize_tags from app.assets.services.file_utils import get_size_and_mtime_ns from app.assets.services.path_utils import ( compute_filename_for_reference, resolve_destination_from_tags, validate_path_within_base, ) from app.assets.services.schemas import ( IngestResult, RegisterAssetResult, UploadResult, UserMetadata, extract_asset_data, extract_reference_data, ) from app.database.db import create_session def _ingest_file_from_path( abs_path: str, asset_hash: str, size_bytes: int, mtime_ns: int, mime_type: str | None = None, info_name: str | None = None, owner_id: str = "", preview_id: str | None = None, user_metadata: UserMetadata = None, tags: Sequence[str] = (), tag_origin: str = "manual", require_existing_tags: bool = False, ) -> IngestResult: locator = os.path.abspath(abs_path) asset_created = False asset_updated = False ref_created = False ref_updated = False reference_id: str | None = None with create_session() as session: if preview_id: if not session.get(Asset, preview_id): preview_id = None asset, asset_created, asset_updated = upsert_asset( session, asset_hash=asset_hash, size_bytes=size_bytes, mime_type=mime_type, ) ref_created, ref_updated = upsert_reference( session, asset_id=asset.id, file_path=locator, name=info_name or os.path.basename(locator), mtime_ns=mtime_ns, owner_id=owner_id, ) # Get the reference we just created/updated ref = get_reference_by_file_path(session, locator) if ref: reference_id = ref.id if preview_id and ref.preview_id != preview_id: ref.preview_id = preview_id norm = normalize_tags(list(tags)) if norm: if require_existing_tags: _validate_tags_exist(session, norm) add_tags_to_reference( session, reference_id=reference_id, tags=norm, origin=tag_origin, create_if_missing=not require_existing_tags, ) _update_metadata_with_filename( session, reference_id=reference_id, ref=ref, user_metadata=user_metadata, ) try: remove_missing_tag_for_asset_id(session, asset_id=asset.id) except Exception: logging.exception("Failed to clear 'missing' tag for asset %s", asset.id) session.commit() return IngestResult( asset_created=asset_created, asset_updated=asset_updated, ref_created=ref_created, ref_updated=ref_updated, reference_id=reference_id, ) def _register_existing_asset( asset_hash: str, name: str, user_metadata: UserMetadata = None, tags: list[str] | None = None, tag_origin: str = "manual", owner_id: str = "", ) -> RegisterAssetResult: with create_session() as session: asset = get_asset_by_hash(session, asset_hash=asset_hash) if not asset: raise ValueError(f"No asset with hash {asset_hash}") ref, ref_created = get_or_create_reference( session, asset_id=asset.id, owner_id=owner_id, name=name, ) if not ref_created: tag_names = get_reference_tags(session, reference_id=ref.id) result = RegisterAssetResult( ref=extract_reference_data(ref), asset=extract_asset_data(asset), tags=tag_names, created=False, ) session.commit() return result new_meta = dict(user_metadata or {}) computed_filename = compute_filename_for_reference(session, ref) if computed_filename: new_meta["filename"] = computed_filename if new_meta: set_reference_metadata( session, reference_id=ref.id, user_metadata=new_meta, ) if tags is not None: set_reference_tags( session, reference_id=ref.id, tags=tags, origin=tag_origin, ) tag_names = get_reference_tags(session, reference_id=ref.id) session.refresh(ref) result = RegisterAssetResult( ref=extract_reference_data(ref), asset=extract_asset_data(asset), tags=tag_names, created=True, ) session.commit() return result def _validate_tags_exist(session: Session, tags: list[str]) -> None: existing_tag_names = set( name for (name,) in session.execute(select(Tag.name).where(Tag.name.in_(tags))).all() ) missing = [t for t in tags if t not in existing_tag_names] if missing: raise ValueError(f"Unknown tags: {missing}") def _update_metadata_with_filename( session: Session, reference_id: str, ref: AssetReference, user_metadata: UserMetadata, ) -> None: computed_filename = compute_filename_for_reference(session, ref) current_meta = ref.user_metadata or {} new_meta = dict(current_meta) if user_metadata: for k, v in user_metadata.items(): new_meta[k] = v if computed_filename: new_meta["filename"] = computed_filename if new_meta != current_meta: set_reference_metadata( session, reference_id=reference_id, user_metadata=new_meta, ) def _sanitize_filename(name: str | None, fallback: str) -> str: n = os.path.basename((name or "").strip() or fallback) return n if n else fallback class HashMismatchError(Exception): pass class DependencyMissingError(Exception): def __init__(self, message: str): self.message = message super().__init__(message) def upload_from_temp_path( temp_path: str, name: str | None = None, tags: list[str] | None = None, user_metadata: dict | None = None, client_filename: str | None = None, owner_id: str = "", expected_hash: str | None = None, ) -> UploadResult: try: digest = hashing.compute_blake3_hash(temp_path) except ImportError as e: raise DependencyMissingError(str(e)) except Exception as e: raise RuntimeError(f"failed to hash uploaded file: {e}") asset_hash = "blake3:" + digest if expected_hash and asset_hash != expected_hash.strip().lower(): raise HashMismatchError("Uploaded file hash does not match provided hash.") with create_session() as session: existing = get_asset_by_hash(session, asset_hash=asset_hash) if existing is not None: with contextlib.suppress(Exception): if temp_path and os.path.exists(temp_path): os.remove(temp_path) display_name = _sanitize_filename(name or client_filename, fallback=digest) result = _register_existing_asset( asset_hash=asset_hash, name=display_name, user_metadata=user_metadata or {}, tags=tags or [], tag_origin="manual", owner_id=owner_id, ) return UploadResult( ref=result.ref, asset=result.asset, tags=result.tags, created_new=False, ) if not tags: raise ValueError("tags are required for new asset uploads") base_dir, subdirs = resolve_destination_from_tags(tags) dest_dir = os.path.join(base_dir, *subdirs) if subdirs else base_dir os.makedirs(dest_dir, exist_ok=True) src_for_ext = (client_filename or name or "").strip() _ext = os.path.splitext(os.path.basename(src_for_ext))[1] if src_for_ext else "" ext = _ext if 0 < len(_ext) <= 16 else "" hashed_basename = f"{digest}{ext}" dest_abs = os.path.abspath(os.path.join(dest_dir, hashed_basename)) validate_path_within_base(dest_abs, base_dir) content_type = ( mimetypes.guess_type(os.path.basename(src_for_ext), strict=False)[0] or mimetypes.guess_type(hashed_basename, strict=False)[0] or "application/octet-stream" ) try: os.replace(temp_path, dest_abs) except Exception as e: raise RuntimeError(f"failed to move uploaded file into place: {e}") try: size_bytes, mtime_ns = get_size_and_mtime_ns(dest_abs) except OSError as e: raise RuntimeError(f"failed to stat destination file: {e}") ingest_result = _ingest_file_from_path( asset_hash=asset_hash, abs_path=dest_abs, size_bytes=size_bytes, mtime_ns=mtime_ns, mime_type=content_type, info_name=_sanitize_filename(name or client_filename, fallback=digest), owner_id=owner_id, preview_id=None, user_metadata=user_metadata or {}, tags=tags, tag_origin="manual", require_existing_tags=False, ) reference_id = ingest_result.reference_id if not reference_id: raise RuntimeError("failed to create asset reference") with create_session() as session: pair = fetch_reference_and_asset( session, reference_id=reference_id, owner_id=owner_id ) if not pair: raise RuntimeError("inconsistent DB state after ingest") ref, asset = pair tag_names = get_reference_tags(session, reference_id=ref.id) return UploadResult( ref=extract_reference_data(ref), asset=extract_asset_data(asset), tags=tag_names, created_new=ingest_result.asset_created, ) def create_from_hash( hash_str: str, name: str, tags: list[str] | None = None, user_metadata: dict | None = None, owner_id: str = "", ) -> UploadResult | None: canonical = hash_str.strip().lower() with create_session() as session: asset = get_asset_by_hash(session, asset_hash=canonical) if not asset: return None result = _register_existing_asset( asset_hash=canonical, name=_sanitize_filename( name, fallback=canonical.split(":", 1)[1] if ":" in canonical else canonical ), user_metadata=user_metadata or {}, tags=tags or [], tag_origin="manual", owner_id=owner_id, ) return UploadResult( ref=result.ref, asset=result.asset, tags=result.tags, created_new=False, )