diff --git a/alembic_db/versions/0002_merge_to_asset_references.py b/alembic_db/versions/0002_merge_to_asset_references.py index 03b196b73..1ac1b980c 100644 --- a/alembic_db/versions/0002_merge_to_asset_references.py +++ b/alembic_db/versions/0002_merge_to_asset_references.py @@ -79,6 +79,7 @@ def upgrade() -> None: sa.Column("created_at", sa.DateTime(timezone=False), nullable=False), sa.Column("updated_at", sa.DateTime(timezone=False), nullable=False), sa.Column("last_access_time", sa.DateTime(timezone=False), nullable=False), + sa.Column("deleted_at", sa.DateTime(timezone=False), nullable=True), sa.CheckConstraint( "(mtime_ns IS NULL) OR (mtime_ns >= 0)", name="ck_ar_mtime_nonneg" ), @@ -104,6 +105,7 @@ def upgrade() -> None: op.create_index( "ix_asset_references_owner_name", "asset_references", ["owner_id", "name"] ) + op.create_index("ix_asset_references_deleted_at", "asset_references", ["deleted_at"]) # Create asset_reference_tags table op.create_table( @@ -188,6 +190,7 @@ def downgrade() -> None: op.drop_index("ix_asset_reference_tags_tag_name", table_name="asset_reference_tags") op.drop_table("asset_reference_tags") + op.drop_index("ix_asset_references_deleted_at", table_name="asset_references") op.drop_index("ix_asset_references_owner_name", table_name="asset_references") op.drop_index("ix_asset_references_last_access_time", table_name="asset_references") op.drop_index("ix_asset_references_created_at", table_name="asset_references") diff --git a/app/assets/database/models.py b/app/assets/database/models.py index 04ff6564d..03c1c1707 100644 --- a/app/assets/database/models.py +++ b/app/assets/database/models.py @@ -105,6 +105,9 @@ class AssetReference(Base): last_access_time: Mapped[datetime] = mapped_column( DateTime(timezone=False), nullable=False, default=get_utc_now ) + deleted_at: Mapped[datetime | None] = mapped_column( + DateTime(timezone=False), nullable=True, default=None + ) asset: Mapped[Asset] = relationship( "Asset", @@ -148,6 +151,7 @@ class AssetReference(Base): Index("ix_asset_references_enrichment_level", "enrichment_level"), Index("ix_asset_references_created_at", "created_at"), Index("ix_asset_references_last_access_time", "last_access_time"), + Index("ix_asset_references_deleted_at", "deleted_at"), Index("ix_asset_references_owner_name", "owner_id", "name"), CheckConstraint( "(mtime_ns IS NULL) OR (mtime_ns >= 0)", name="ck_ar_mtime_nonneg" diff --git a/app/assets/database/queries/__init__.py b/app/assets/database/queries/__init__.py index 645759272..7888d0645 100644 --- a/app/assets/database/queries/__init__.py +++ b/app/assets/database/queries/__init__.py @@ -38,6 +38,7 @@ from app.assets.database.queries.asset_reference import ( restore_references_by_paths, set_reference_metadata, set_reference_preview, + soft_delete_reference_by_id, update_reference_access_time, update_reference_name, update_reference_timestamps, @@ -107,6 +108,7 @@ __all__ = [ "restore_references_by_paths", "set_reference_metadata", "set_reference_preview", + "soft_delete_reference_by_id", "set_reference_tags", "update_asset_hash_and_mime", "update_reference_access_time", diff --git a/app/assets/database/queries/asset_reference.py b/app/assets/database/queries/asset_reference.py index 84cdc6033..6524791cc 100644 --- a/app/assets/database/queries/asset_reference.py +++ b/app/assets/database/queries/asset_reference.py @@ -173,11 +173,11 @@ def get_reference_with_owner_check( """Fetch a reference and verify ownership. Raises: - ValueError: if reference not found + ValueError: if reference not found or soft-deleted PermissionError: if owner_id doesn't match """ ref = get_reference_by_id(session, reference_id=reference_id) - if not ref: + if not ref or ref.deleted_at is not None: raise ValueError(f"AssetReference {reference_id} not found") if ref.owner_id and ref.owner_id != owner_id: raise PermissionError("not owner") @@ -206,6 +206,7 @@ def reference_exists_for_asset_id( select(sa.literal(True)) .select_from(AssetReference) .where(AssetReference.asset_id == asset_id) + .where(AssetReference.deleted_at.is_(None)) .limit(1) ) return session.execute(q).first() is not None @@ -327,6 +328,7 @@ def list_references_page( .join(Asset, Asset.id == AssetReference.asset_id) .where(build_visible_owner_clause(owner_id)) .where(AssetReference.is_missing == False) # noqa: E712 + .where(AssetReference.deleted_at.is_(None)) .options(noload(AssetReference.tags)) ) @@ -357,6 +359,7 @@ def list_references_page( .join(Asset, Asset.id == AssetReference.asset_id) .where(build_visible_owner_clause(owner_id)) .where(AssetReference.is_missing == False) # noqa: E712 + .where(AssetReference.deleted_at.is_(None)) ) if name_contains: escaped, esc = escape_sql_like_string(name_contains) @@ -400,6 +403,7 @@ def fetch_reference_asset_and_tags( .join(Tag, Tag.name == AssetReferenceTag.tag_name, isouter=True) .where( AssetReference.id == reference_id, + AssetReference.deleted_at.is_(None), build_visible_owner_clause(owner_id), ) .options(noload(AssetReference.tags)) @@ -430,6 +434,7 @@ def fetch_reference_and_asset( .join(Asset, Asset.id == AssetReference.asset_id) .where( AssetReference.id == reference_id, + AssetReference.deleted_at.is_(None), build_visible_owner_clause(owner_id), ) .limit(1) @@ -541,6 +546,28 @@ def delete_reference_by_id( return int(session.execute(stmt).rowcount or 0) > 0 +def soft_delete_reference_by_id( + session: Session, + reference_id: str, + owner_id: str, +) -> bool: + """Mark a reference as soft-deleted by setting deleted_at timestamp. + + Returns True if the reference was found and marked deleted. + """ + now = get_utc_now() + stmt = ( + sa.update(AssetReference) + .where( + AssetReference.id == reference_id, + AssetReference.deleted_at.is_(None), + build_visible_owner_clause(owner_id), + ) + .values(deleted_at=now) + ) + return int(session.execute(stmt).rowcount or 0) > 0 + + def set_reference_preview( session: Session, reference_id: str, @@ -633,10 +660,12 @@ def upsert_reference( AssetReference.mtime_ns.is_(None), AssetReference.mtime_ns != int(mtime_ns), AssetReference.is_missing == True, # noqa: E712 + AssetReference.deleted_at.isnot(None), ) ) .values( - asset_id=asset_id, mtime_ns=int(mtime_ns), is_missing=False, updated_at=now + asset_id=asset_id, mtime_ns=int(mtime_ns), is_missing=False, + deleted_at=None, updated_at=now, ) ) res2 = session.execute(upd) @@ -660,6 +689,7 @@ def mark_references_missing_outside_prefixes( result = session.execute( sa.update(AssetReference) .where(AssetReference.file_path.isnot(None)) + .where(AssetReference.deleted_at.is_(None)) .where(~matches_valid_prefix) .where(AssetReference.is_missing == False) # noqa: E712 .values(is_missing=True) @@ -681,6 +711,7 @@ def restore_references_by_paths(session: Session, file_paths: list[str]) -> int: sa.update(AssetReference) .where(AssetReference.file_path.in_(chunk)) .where(AssetReference.is_missing == True) # noqa: E712 + .where(AssetReference.deleted_at.is_(None)) .values(is_missing=False) ) total += result.rowcount @@ -699,6 +730,7 @@ def get_unreferenced_unhashed_asset_ids(session: Session) -> list[str]: sa.select(sa.literal(1)) .where(AssetReference.asset_id == Asset.id) .where(AssetReference.is_missing == False) # noqa: E712 + .where(AssetReference.deleted_at.is_(None)) .correlate(Asset) .exists() ) @@ -758,6 +790,7 @@ def get_references_for_prefixes( ) .join(Asset, Asset.id == AssetReference.asset_id) .where(AssetReference.file_path.isnot(None)) + .where(AssetReference.deleted_at.is_(None)) .where(sa.or_(*conds)) ) @@ -894,6 +927,7 @@ def get_unenriched_references( AssetReference.enrichment_level, ) .where(AssetReference.file_path.isnot(None)) + .where(AssetReference.deleted_at.is_(None)) .where(sa.or_(*conds)) .where(AssetReference.is_missing == False) # noqa: E712 .where(AssetReference.enrichment_level <= max_level) diff --git a/app/assets/database/queries/tags.py b/app/assets/database/queries/tags.py index 6719d058b..8b25fee67 100644 --- a/app/assets/database/queries/tags.py +++ b/app/assets/database/queries/tags.py @@ -272,6 +272,7 @@ def list_tags_with_usage( .select_from(AssetReferenceTag) .join(AssetReference, AssetReference.id == AssetReferenceTag.asset_reference_id) .where(build_visible_owner_clause(owner_id)) + .where(AssetReference.deleted_at.is_(None)) .group_by(AssetReferenceTag.tag_name) .subquery() ) @@ -307,6 +308,7 @@ def list_tags_with_usage( select(AssetReferenceTag.tag_name) .join(AssetReference, AssetReference.id == AssetReferenceTag.asset_reference_id) .where(build_visible_owner_clause(owner_id)) + .where(AssetReference.deleted_at.is_(None)) .group_by(AssetReferenceTag.tag_name) ) total_q = total_q.where(Tag.name.in_(visible_tags_sq)) diff --git a/app/assets/services/asset_management.py b/app/assets/services/asset_management.py index 81b0fce3c..3fe7115c8 100644 --- a/app/assets/services/asset_management.py +++ b/app/assets/services/asset_management.py @@ -10,6 +10,7 @@ from app.assets.database.queries import ( reference_exists_for_asset_id, delete_reference_by_id, fetch_reference_and_asset, + soft_delete_reference_by_id, fetch_reference_asset_and_tags, get_asset_by_hash as queries_get_asset_by_hash, get_reference_by_id, @@ -130,6 +131,14 @@ def delete_asset_reference( delete_content_if_orphan: bool = True, ) -> bool: with create_session() as session: + if not delete_content_if_orphan: + # Soft delete: mark the reference as deleted but keep everything + deleted = soft_delete_reference_by_id( + session, reference_id=reference_id, owner_id=owner_id + ) + session.commit() + return deleted + ref_row = get_reference_by_id(session, reference_id=reference_id) asset_id = ref_row.asset_id if ref_row else None file_path = ref_row.file_path if ref_row else None @@ -141,7 +150,7 @@ def delete_asset_reference( session.commit() return False - if not delete_content_if_orphan or not asset_id: + if not asset_id: session.commit() return True diff --git a/tests-unit/assets_test/conftest.py b/tests-unit/assets_test/conftest.py index 986e56edf..6c5c56113 100644 --- a/tests-unit/assets_test/conftest.py +++ b/tests-unit/assets_test/conftest.py @@ -212,7 +212,7 @@ def asset_factory(http: requests.Session, api_base: str): for aid in created: with contextlib.suppress(Exception): - http.delete(f"{api_base}/api/assets/{aid}", timeout=30) + http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30) @pytest.fixture @@ -258,4 +258,4 @@ def autoclean_unit_test_assets(http: requests.Session, api_base: str): break for aid in ids: with contextlib.suppress(Exception): - http.delete(f"{api_base}/api/assets/{aid}", timeout=30) + http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30) diff --git a/tests-unit/assets_test/services/test_asset_management.py b/tests-unit/assets_test/services/test_asset_management.py index 8f8c15be9..101ef7292 100644 --- a/tests-unit/assets_test/services/test_asset_management.py +++ b/tests-unit/assets_test/services/test_asset_management.py @@ -140,7 +140,7 @@ class TestUpdateAssetMetadata: class TestDeleteAssetReference: - def test_deletes_reference(self, mock_create_session, session: Session): + def test_soft_deletes_reference(self, mock_create_session, session: Session): asset = _make_asset(session) ref = _make_reference(session, asset) ref_id = ref.id @@ -153,7 +153,11 @@ class TestDeleteAssetReference: ) assert result is True - assert session.get(AssetReference, ref_id) is None + # Row still exists but is marked as soft-deleted + session.expire_all() + row = session.get(AssetReference, ref_id) + assert row is not None + assert row.deleted_at is not None def test_returns_false_for_nonexistent(self, mock_create_session): result = delete_asset_reference( diff --git a/tests-unit/assets_test/test_crud.py b/tests-unit/assets_test/test_crud.py index b8bfc3193..07310223e 100644 --- a/tests-unit/assets_test/test_crud.py +++ b/tests-unit/assets_test/test_crud.py @@ -42,8 +42,8 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse assert "user_metadata" in detail assert "filename" in detail["user_metadata"] - # DELETE - rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120) + # DELETE (hard delete to also remove underlying asset and file) + rd = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120) assert rd.status_code == 204 # GET again -> 404 @@ -53,6 +53,35 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse assert body["error"]["code"] == "ASSET_NOT_FOUND" +def test_soft_delete_hides_from_get(http: requests.Session, api_base: str, seeded_asset: dict): + aid = seeded_asset["id"] + asset_hash = seeded_asset["asset_hash"] + + # Soft-delete (default, no delete_content param) + rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120) + assert rd.status_code == 204 + + # GET by reference ID -> 404 (soft-deleted references are hidden) + rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120) + assert rg.status_code == 404 + + # Asset identity is preserved (underlying content still exists) + rh = http.head(f"{api_base}/api/assets/hash/{asset_hash}", timeout=120) + assert rh.status_code == 200 + + # Soft-deleted reference should not appear in listings + rl = http.get( + f"{api_base}/api/assets", + params={"include_tags": "unit-tests", "limit": "500"}, + timeout=120, + ) + ids = [a["id"] for a in rl.json().get("assets", [])] + assert aid not in ids + + # Clean up: hard-delete the soft-deleted reference and orphaned asset + http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120) + + def test_delete_upon_reference_count( http: requests.Session, api_base: str, seeded_asset: dict ): @@ -70,21 +99,32 @@ def test_delete_upon_reference_count( assert copy["asset_hash"] == src_hash assert copy["created_new"] is False - # Delete original reference -> asset identity must remain + # Soft-delete original reference (default) -> asset identity must remain aid1 = seeded_asset["id"] rd1 = http.delete(f"{api_base}/api/assets/{aid1}", timeout=120) assert rd1.status_code == 204 rh1 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120) - assert rh1.status_code == 200 # identity still present + assert rh1.status_code == 200 # identity still present (second ref exists) - # Delete the last reference with default semantics -> identity and cached files removed + # Soft-delete the last reference -> asset identity preserved (no hard delete) aid2 = copy["id"] rd2 = http.delete(f"{api_base}/api/assets/{aid2}", timeout=120) assert rd2.status_code == 204 rh2 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120) - assert rh2.status_code == 404 # orphan content removed + assert rh2.status_code == 200 # asset identity preserved (soft delete) + + # Re-associate via from-hash, then hard-delete -> orphan content removed + r3 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120) + assert r3.status_code == 201, r3.json() + aid3 = r3.json()["id"] + + rd3 = http.delete(f"{api_base}/api/assets/{aid3}?delete_content=true", timeout=120) + assert rd3.status_code == 204 + + rh3 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120) + assert rh3.status_code == 404 # orphan content removed def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset: dict): diff --git a/tests-unit/assets_test/test_downloads.py b/tests-unit/assets_test/test_downloads.py index 42c64a5fd..672ba9728 100644 --- a/tests-unit/assets_test/test_downloads.py +++ b/tests-unit/assets_test/test_downloads.py @@ -117,7 +117,7 @@ def test_download_missing_file_returns_404( assert body["error"]["code"] == "FILE_NOT_FOUND" finally: # We created asset without the "unit-tests" tag(see `autoclean_unit_test_assets`), we need to clear it manually. - dr = http.delete(f"{api_base}/api/assets/{aid}", timeout=120) + dr = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120) dr.content diff --git a/tests-unit/assets_test/test_sync_references.py b/tests-unit/assets_test/test_sync_references.py index e646c1a2b..94cc255bc 100644 --- a/tests-unit/assets_test/test_sync_references.py +++ b/tests-unit/assets_test/test_sync_references.py @@ -2,6 +2,7 @@ import os import tempfile +from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -16,6 +17,12 @@ from app.assets.database.models import ( Base, Tag, ) +from app.assets.database.queries.asset_reference import ( + bulk_insert_references_ignore_conflicts, + get_references_for_prefixes, + get_unenriched_references, + restore_references_by_paths, +) from app.assets.scanner import sync_references_with_filesystem from app.assets.services.file_utils import get_mtime_ns @@ -348,3 +355,128 @@ def test_no_references_is_noop(session, temp_dir): session.commit() assert survivors == set() + + +# --------------------------------------------------------------------------- +# Soft-delete persistence across scanner operations +# --------------------------------------------------------------------------- + +def _soft_delete_ref(session: Session, ref_id: str) -> None: + """Mark a reference as soft-deleted (mimics the API DELETE behaviour).""" + ref = session.get(AssetReference, ref_id) + ref.deleted_at = datetime(2025, 1, 1) + session.flush() + + +def test_soft_deleted_ref_excluded_from_get_references_for_prefixes(session, temp_dir): + """get_references_for_prefixes skips soft-deleted references.""" + fp = _create_file(temp_dir, "model.bin") + mtime = _stat_mtime_ns(fp) + _make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime) + _soft_delete_ref(session, "r1") + session.commit() + + rows = get_references_for_prefixes(session, [str(temp_dir)], include_missing=True) + assert len(rows) == 0 + + +def test_sync_does_not_resurrect_soft_deleted_ref(session, temp_dir): + """Scanner sync leaves soft-deleted refs untouched even when file exists on disk.""" + fp = _create_file(temp_dir, "model.bin") + mtime = _stat_mtime_ns(fp) + _make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime) + _soft_delete_ref(session, "r1") + session.commit() + + with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]): + sync_references_with_filesystem(session, "models") + session.commit() + + session.expire_all() + ref = session.get(AssetReference, "r1") + assert ref.deleted_at is not None, "soft-deleted ref must stay deleted after sync" + + +def test_bulk_insert_does_not_overwrite_soft_deleted_ref(session, temp_dir): + """bulk_insert_references_ignore_conflicts cannot replace a soft-deleted row.""" + fp = _create_file(temp_dir, "model.bin") + mtime = _stat_mtime_ns(fp) + _make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime) + _soft_delete_ref(session, "r1") + session.commit() + + now = datetime.now(tz=None) + bulk_insert_references_ignore_conflicts(session, [ + { + "id": "r_new", + "asset_id": "a1", + "file_path": fp, + "name": "model.bin", + "owner_id": "", + "mtime_ns": mtime, + "preview_id": None, + "user_metadata": None, + "created_at": now, + "updated_at": now, + "last_access_time": now, + } + ]) + session.commit() + + session.expire_all() + # Original row is still the soft-deleted one + ref = session.get(AssetReference, "r1") + assert ref is not None + assert ref.deleted_at is not None + # The new row was not inserted (conflict on file_path) + assert session.get(AssetReference, "r_new") is None + + +def test_restore_references_by_paths_skips_soft_deleted(session, temp_dir): + """restore_references_by_paths does not clear is_missing on soft-deleted refs.""" + fp = _create_file(temp_dir, "model.bin") + mtime = _stat_mtime_ns(fp) + _make_asset( + session, "a1", fp, "r1", + asset_hash="blake3:abc", mtime_ns=mtime, is_missing=True, + ) + _soft_delete_ref(session, "r1") + session.commit() + + restored = restore_references_by_paths(session, [fp]) + session.commit() + + assert restored == 0 + session.expire_all() + ref = session.get(AssetReference, "r1") + assert ref.is_missing is True, "is_missing must not be cleared on soft-deleted ref" + assert ref.deleted_at is not None + + +def test_get_unenriched_references_excludes_soft_deleted(session, temp_dir): + """Enrichment queries do not pick up soft-deleted references.""" + fp = _create_file(temp_dir, "model.bin") + mtime = _stat_mtime_ns(fp) + _make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime) + _soft_delete_ref(session, "r1") + session.commit() + + rows = get_unenriched_references(session, [str(temp_dir)], max_level=2) + assert len(rows) == 0 + + +def test_sync_ignores_soft_deleted_seed_asset(session, temp_dir): + """Soft-deleted seed ref is not garbage-collected even when file is missing.""" + fp = str(temp_dir / "gone.bin") # file does not exist + _make_asset(session, "seed1", fp, "r1", asset_hash=None, mtime_ns=999) + _soft_delete_ref(session, "r1") + session.commit() + + with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]): + sync_references_with_filesystem(session, "models") + session.commit() + + session.expire_all() + # Asset and ref must still exist — scanner did not see the soft-deleted row + assert session.get(Asset, "seed1") is not None + assert session.get(AssetReference, "r1") is not None diff --git a/tests-unit/assets_test/test_tags_api.py b/tests-unit/assets_test/test_tags_api.py index 6b1047802..595bf29c6 100644 --- a/tests-unit/assets_test/test_tags_api.py +++ b/tests-unit/assets_test/test_tags_api.py @@ -69,8 +69,8 @@ def test_tags_empty_usage(http: requests.Session, api_base: str, asset_factory, used_names = [t["name"] for t in body2["tags"]] assert custom_tag in used_names - # Delete the asset so the tag usage drops to zero - rd = http.delete(f"{api_base}/api/assets/{_asset['id']}", timeout=120) + # Hard-delete the asset so the tag usage drops to zero + rd = http.delete(f"{api_base}/api/assets/{_asset['id']}?delete_content=true", timeout=120) assert rd.status_code == 204 # Now the custom tag must not be returned when include_zero=false