feat: soft-delete for AssetReference with scanner persistence
Some checks are pending
Python Linting / Run Ruff (push) Waiting to run
Python Linting / Run Pylint (push) Waiting to run

- Add deleted_at column to AssetReference model and migration
- soft_delete_reference_by_id sets deleted_at instead of removing rows
- DELETE /api/assets/{id} defaults to soft-delete; delete_content=true
  for hard-delete
- Add deleted_at IS NULL filters to read queries, tag queries, and
  scanner queries so soft-deleted refs are invisible
- restore_references_by_paths skips soft-deleted refs
- upsert_reference clears deleted_at on explicit re-ingest
- Add tests for soft-delete API behavior, scanner persistence, bulk
  insert, enrichment exclusion, and seed asset garbage collection

Amp-Thread-ID: https://ampcode.com/threads/T-019cb6fc-c05c-761f-b855-6d5d1c9defa2
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Luke Mino-Altherr 2026-03-03 20:04:01 -08:00
parent ce17e303fc
commit d5cac66405
12 changed files with 247 additions and 17 deletions

View File

@ -79,6 +79,7 @@ def upgrade() -> None:
sa.Column("created_at", sa.DateTime(timezone=False), nullable=False),
sa.Column("updated_at", sa.DateTime(timezone=False), nullable=False),
sa.Column("last_access_time", sa.DateTime(timezone=False), nullable=False),
sa.Column("deleted_at", sa.DateTime(timezone=False), nullable=True),
sa.CheckConstraint(
"(mtime_ns IS NULL) OR (mtime_ns >= 0)", name="ck_ar_mtime_nonneg"
),
@ -104,6 +105,7 @@ def upgrade() -> None:
op.create_index(
"ix_asset_references_owner_name", "asset_references", ["owner_id", "name"]
)
op.create_index("ix_asset_references_deleted_at", "asset_references", ["deleted_at"])
# Create asset_reference_tags table
op.create_table(
@ -188,6 +190,7 @@ def downgrade() -> None:
op.drop_index("ix_asset_reference_tags_tag_name", table_name="asset_reference_tags")
op.drop_table("asset_reference_tags")
op.drop_index("ix_asset_references_deleted_at", table_name="asset_references")
op.drop_index("ix_asset_references_owner_name", table_name="asset_references")
op.drop_index("ix_asset_references_last_access_time", table_name="asset_references")
op.drop_index("ix_asset_references_created_at", table_name="asset_references")

View File

@ -105,6 +105,9 @@ class AssetReference(Base):
last_access_time: Mapped[datetime] = mapped_column(
DateTime(timezone=False), nullable=False, default=get_utc_now
)
deleted_at: Mapped[datetime | None] = mapped_column(
DateTime(timezone=False), nullable=True, default=None
)
asset: Mapped[Asset] = relationship(
"Asset",
@ -148,6 +151,7 @@ class AssetReference(Base):
Index("ix_asset_references_enrichment_level", "enrichment_level"),
Index("ix_asset_references_created_at", "created_at"),
Index("ix_asset_references_last_access_time", "last_access_time"),
Index("ix_asset_references_deleted_at", "deleted_at"),
Index("ix_asset_references_owner_name", "owner_id", "name"),
CheckConstraint(
"(mtime_ns IS NULL) OR (mtime_ns >= 0)", name="ck_ar_mtime_nonneg"

View File

@ -38,6 +38,7 @@ from app.assets.database.queries.asset_reference import (
restore_references_by_paths,
set_reference_metadata,
set_reference_preview,
soft_delete_reference_by_id,
update_reference_access_time,
update_reference_name,
update_reference_timestamps,
@ -107,6 +108,7 @@ __all__ = [
"restore_references_by_paths",
"set_reference_metadata",
"set_reference_preview",
"soft_delete_reference_by_id",
"set_reference_tags",
"update_asset_hash_and_mime",
"update_reference_access_time",

View File

@ -173,11 +173,11 @@ def get_reference_with_owner_check(
"""Fetch a reference and verify ownership.
Raises:
ValueError: if reference not found
ValueError: if reference not found or soft-deleted
PermissionError: if owner_id doesn't match
"""
ref = get_reference_by_id(session, reference_id=reference_id)
if not ref:
if not ref or ref.deleted_at is not None:
raise ValueError(f"AssetReference {reference_id} not found")
if ref.owner_id and ref.owner_id != owner_id:
raise PermissionError("not owner")
@ -206,6 +206,7 @@ def reference_exists_for_asset_id(
select(sa.literal(True))
.select_from(AssetReference)
.where(AssetReference.asset_id == asset_id)
.where(AssetReference.deleted_at.is_(None))
.limit(1)
)
return session.execute(q).first() is not None
@ -327,6 +328,7 @@ def list_references_page(
.join(Asset, Asset.id == AssetReference.asset_id)
.where(build_visible_owner_clause(owner_id))
.where(AssetReference.is_missing == False) # noqa: E712
.where(AssetReference.deleted_at.is_(None))
.options(noload(AssetReference.tags))
)
@ -357,6 +359,7 @@ def list_references_page(
.join(Asset, Asset.id == AssetReference.asset_id)
.where(build_visible_owner_clause(owner_id))
.where(AssetReference.is_missing == False) # noqa: E712
.where(AssetReference.deleted_at.is_(None))
)
if name_contains:
escaped, esc = escape_sql_like_string(name_contains)
@ -400,6 +403,7 @@ def fetch_reference_asset_and_tags(
.join(Tag, Tag.name == AssetReferenceTag.tag_name, isouter=True)
.where(
AssetReference.id == reference_id,
AssetReference.deleted_at.is_(None),
build_visible_owner_clause(owner_id),
)
.options(noload(AssetReference.tags))
@ -430,6 +434,7 @@ def fetch_reference_and_asset(
.join(Asset, Asset.id == AssetReference.asset_id)
.where(
AssetReference.id == reference_id,
AssetReference.deleted_at.is_(None),
build_visible_owner_clause(owner_id),
)
.limit(1)
@ -541,6 +546,28 @@ def delete_reference_by_id(
return int(session.execute(stmt).rowcount or 0) > 0
def soft_delete_reference_by_id(
session: Session,
reference_id: str,
owner_id: str,
) -> bool:
"""Mark a reference as soft-deleted by setting deleted_at timestamp.
Returns True if the reference was found and marked deleted.
"""
now = get_utc_now()
stmt = (
sa.update(AssetReference)
.where(
AssetReference.id == reference_id,
AssetReference.deleted_at.is_(None),
build_visible_owner_clause(owner_id),
)
.values(deleted_at=now)
)
return int(session.execute(stmt).rowcount or 0) > 0
def set_reference_preview(
session: Session,
reference_id: str,
@ -633,10 +660,12 @@ def upsert_reference(
AssetReference.mtime_ns.is_(None),
AssetReference.mtime_ns != int(mtime_ns),
AssetReference.is_missing == True, # noqa: E712
AssetReference.deleted_at.isnot(None),
)
)
.values(
asset_id=asset_id, mtime_ns=int(mtime_ns), is_missing=False, updated_at=now
asset_id=asset_id, mtime_ns=int(mtime_ns), is_missing=False,
deleted_at=None, updated_at=now,
)
)
res2 = session.execute(upd)
@ -660,6 +689,7 @@ def mark_references_missing_outside_prefixes(
result = session.execute(
sa.update(AssetReference)
.where(AssetReference.file_path.isnot(None))
.where(AssetReference.deleted_at.is_(None))
.where(~matches_valid_prefix)
.where(AssetReference.is_missing == False) # noqa: E712
.values(is_missing=True)
@ -681,6 +711,7 @@ def restore_references_by_paths(session: Session, file_paths: list[str]) -> int:
sa.update(AssetReference)
.where(AssetReference.file_path.in_(chunk))
.where(AssetReference.is_missing == True) # noqa: E712
.where(AssetReference.deleted_at.is_(None))
.values(is_missing=False)
)
total += result.rowcount
@ -699,6 +730,7 @@ def get_unreferenced_unhashed_asset_ids(session: Session) -> list[str]:
sa.select(sa.literal(1))
.where(AssetReference.asset_id == Asset.id)
.where(AssetReference.is_missing == False) # noqa: E712
.where(AssetReference.deleted_at.is_(None))
.correlate(Asset)
.exists()
)
@ -758,6 +790,7 @@ def get_references_for_prefixes(
)
.join(Asset, Asset.id == AssetReference.asset_id)
.where(AssetReference.file_path.isnot(None))
.where(AssetReference.deleted_at.is_(None))
.where(sa.or_(*conds))
)
@ -894,6 +927,7 @@ def get_unenriched_references(
AssetReference.enrichment_level,
)
.where(AssetReference.file_path.isnot(None))
.where(AssetReference.deleted_at.is_(None))
.where(sa.or_(*conds))
.where(AssetReference.is_missing == False) # noqa: E712
.where(AssetReference.enrichment_level <= max_level)

View File

@ -272,6 +272,7 @@ def list_tags_with_usage(
.select_from(AssetReferenceTag)
.join(AssetReference, AssetReference.id == AssetReferenceTag.asset_reference_id)
.where(build_visible_owner_clause(owner_id))
.where(AssetReference.deleted_at.is_(None))
.group_by(AssetReferenceTag.tag_name)
.subquery()
)
@ -307,6 +308,7 @@ def list_tags_with_usage(
select(AssetReferenceTag.tag_name)
.join(AssetReference, AssetReference.id == AssetReferenceTag.asset_reference_id)
.where(build_visible_owner_clause(owner_id))
.where(AssetReference.deleted_at.is_(None))
.group_by(AssetReferenceTag.tag_name)
)
total_q = total_q.where(Tag.name.in_(visible_tags_sq))

View File

@ -10,6 +10,7 @@ from app.assets.database.queries import (
reference_exists_for_asset_id,
delete_reference_by_id,
fetch_reference_and_asset,
soft_delete_reference_by_id,
fetch_reference_asset_and_tags,
get_asset_by_hash as queries_get_asset_by_hash,
get_reference_by_id,
@ -130,6 +131,14 @@ def delete_asset_reference(
delete_content_if_orphan: bool = True,
) -> bool:
with create_session() as session:
if not delete_content_if_orphan:
# Soft delete: mark the reference as deleted but keep everything
deleted = soft_delete_reference_by_id(
session, reference_id=reference_id, owner_id=owner_id
)
session.commit()
return deleted
ref_row = get_reference_by_id(session, reference_id=reference_id)
asset_id = ref_row.asset_id if ref_row else None
file_path = ref_row.file_path if ref_row else None
@ -141,7 +150,7 @@ def delete_asset_reference(
session.commit()
return False
if not delete_content_if_orphan or not asset_id:
if not asset_id:
session.commit()
return True

View File

@ -212,7 +212,7 @@ def asset_factory(http: requests.Session, api_base: str):
for aid in created:
with contextlib.suppress(Exception):
http.delete(f"{api_base}/api/assets/{aid}", timeout=30)
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30)
@pytest.fixture
@ -258,4 +258,4 @@ def autoclean_unit_test_assets(http: requests.Session, api_base: str):
break
for aid in ids:
with contextlib.suppress(Exception):
http.delete(f"{api_base}/api/assets/{aid}", timeout=30)
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30)

View File

@ -140,7 +140,7 @@ class TestUpdateAssetMetadata:
class TestDeleteAssetReference:
def test_deletes_reference(self, mock_create_session, session: Session):
def test_soft_deletes_reference(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ref_id = ref.id
@ -153,7 +153,11 @@ class TestDeleteAssetReference:
)
assert result is True
assert session.get(AssetReference, ref_id) is None
# Row still exists but is marked as soft-deleted
session.expire_all()
row = session.get(AssetReference, ref_id)
assert row is not None
assert row.deleted_at is not None
def test_returns_false_for_nonexistent(self, mock_create_session):
result = delete_asset_reference(

View File

@ -42,8 +42,8 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
assert "user_metadata" in detail
assert "filename" in detail["user_metadata"]
# DELETE
rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
# DELETE (hard delete to also remove underlying asset and file)
rd = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
assert rd.status_code == 204
# GET again -> 404
@ -53,6 +53,35 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
assert body["error"]["code"] == "ASSET_NOT_FOUND"
def test_soft_delete_hides_from_get(http: requests.Session, api_base: str, seeded_asset: dict):
aid = seeded_asset["id"]
asset_hash = seeded_asset["asset_hash"]
# Soft-delete (default, no delete_content param)
rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
assert rd.status_code == 204
# GET by reference ID -> 404 (soft-deleted references are hidden)
rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120)
assert rg.status_code == 404
# Asset identity is preserved (underlying content still exists)
rh = http.head(f"{api_base}/api/assets/hash/{asset_hash}", timeout=120)
assert rh.status_code == 200
# Soft-deleted reference should not appear in listings
rl = http.get(
f"{api_base}/api/assets",
params={"include_tags": "unit-tests", "limit": "500"},
timeout=120,
)
ids = [a["id"] for a in rl.json().get("assets", [])]
assert aid not in ids
# Clean up: hard-delete the soft-deleted reference and orphaned asset
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
def test_delete_upon_reference_count(
http: requests.Session, api_base: str, seeded_asset: dict
):
@ -70,21 +99,32 @@ def test_delete_upon_reference_count(
assert copy["asset_hash"] == src_hash
assert copy["created_new"] is False
# Delete original reference -> asset identity must remain
# Soft-delete original reference (default) -> asset identity must remain
aid1 = seeded_asset["id"]
rd1 = http.delete(f"{api_base}/api/assets/{aid1}", timeout=120)
assert rd1.status_code == 204
rh1 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh1.status_code == 200 # identity still present
assert rh1.status_code == 200 # identity still present (second ref exists)
# Delete the last reference with default semantics -> identity and cached files removed
# Soft-delete the last reference -> asset identity preserved (no hard delete)
aid2 = copy["id"]
rd2 = http.delete(f"{api_base}/api/assets/{aid2}", timeout=120)
assert rd2.status_code == 204
rh2 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh2.status_code == 404 # orphan content removed
assert rh2.status_code == 200 # asset identity preserved (soft delete)
# Re-associate via from-hash, then hard-delete -> orphan content removed
r3 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120)
assert r3.status_code == 201, r3.json()
aid3 = r3.json()["id"]
rd3 = http.delete(f"{api_base}/api/assets/{aid3}?delete_content=true", timeout=120)
assert rd3.status_code == 204
rh3 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh3.status_code == 404 # orphan content removed
def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset: dict):

View File

@ -117,7 +117,7 @@ def test_download_missing_file_returns_404(
assert body["error"]["code"] == "FILE_NOT_FOUND"
finally:
# We created asset without the "unit-tests" tag(see `autoclean_unit_test_assets`), we need to clear it manually.
dr = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
dr = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
dr.content

View File

@ -2,6 +2,7 @@
import os
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import patch
@ -16,6 +17,12 @@ from app.assets.database.models import (
Base,
Tag,
)
from app.assets.database.queries.asset_reference import (
bulk_insert_references_ignore_conflicts,
get_references_for_prefixes,
get_unenriched_references,
restore_references_by_paths,
)
from app.assets.scanner import sync_references_with_filesystem
from app.assets.services.file_utils import get_mtime_ns
@ -348,3 +355,128 @@ def test_no_references_is_noop(session, temp_dir):
session.commit()
assert survivors == set()
# ---------------------------------------------------------------------------
# Soft-delete persistence across scanner operations
# ---------------------------------------------------------------------------
def _soft_delete_ref(session: Session, ref_id: str) -> None:
"""Mark a reference as soft-deleted (mimics the API DELETE behaviour)."""
ref = session.get(AssetReference, ref_id)
ref.deleted_at = datetime(2025, 1, 1)
session.flush()
def test_soft_deleted_ref_excluded_from_get_references_for_prefixes(session, temp_dir):
"""get_references_for_prefixes skips soft-deleted references."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
rows = get_references_for_prefixes(session, [str(temp_dir)], include_missing=True)
assert len(rows) == 0
def test_sync_does_not_resurrect_soft_deleted_ref(session, temp_dir):
"""Scanner sync leaves soft-deleted refs untouched even when file exists on disk."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
ref = session.get(AssetReference, "r1")
assert ref.deleted_at is not None, "soft-deleted ref must stay deleted after sync"
def test_bulk_insert_does_not_overwrite_soft_deleted_ref(session, temp_dir):
"""bulk_insert_references_ignore_conflicts cannot replace a soft-deleted row."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
now = datetime.now(tz=None)
bulk_insert_references_ignore_conflicts(session, [
{
"id": "r_new",
"asset_id": "a1",
"file_path": fp,
"name": "model.bin",
"owner_id": "",
"mtime_ns": mtime,
"preview_id": None,
"user_metadata": None,
"created_at": now,
"updated_at": now,
"last_access_time": now,
}
])
session.commit()
session.expire_all()
# Original row is still the soft-deleted one
ref = session.get(AssetReference, "r1")
assert ref is not None
assert ref.deleted_at is not None
# The new row was not inserted (conflict on file_path)
assert session.get(AssetReference, "r_new") is None
def test_restore_references_by_paths_skips_soft_deleted(session, temp_dir):
"""restore_references_by_paths does not clear is_missing on soft-deleted refs."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(
session, "a1", fp, "r1",
asset_hash="blake3:abc", mtime_ns=mtime, is_missing=True,
)
_soft_delete_ref(session, "r1")
session.commit()
restored = restore_references_by_paths(session, [fp])
session.commit()
assert restored == 0
session.expire_all()
ref = session.get(AssetReference, "r1")
assert ref.is_missing is True, "is_missing must not be cleared on soft-deleted ref"
assert ref.deleted_at is not None
def test_get_unenriched_references_excludes_soft_deleted(session, temp_dir):
"""Enrichment queries do not pick up soft-deleted references."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
rows = get_unenriched_references(session, [str(temp_dir)], max_level=2)
assert len(rows) == 0
def test_sync_ignores_soft_deleted_seed_asset(session, temp_dir):
"""Soft-deleted seed ref is not garbage-collected even when file is missing."""
fp = str(temp_dir / "gone.bin") # file does not exist
_make_asset(session, "seed1", fp, "r1", asset_hash=None, mtime_ns=999)
_soft_delete_ref(session, "r1")
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
# Asset and ref must still exist — scanner did not see the soft-deleted row
assert session.get(Asset, "seed1") is not None
assert session.get(AssetReference, "r1") is not None

View File

@ -69,8 +69,8 @@ def test_tags_empty_usage(http: requests.Session, api_base: str, asset_factory,
used_names = [t["name"] for t in body2["tags"]]
assert custom_tag in used_names
# Delete the asset so the tag usage drops to zero
rd = http.delete(f"{api_base}/api/assets/{_asset['id']}", timeout=120)
# Hard-delete the asset so the tag usage drops to zero
rd = http.delete(f"{api_base}/api/assets/{_asset['id']}?delete_content=true", timeout=120)
assert rd.status_code == 204
# Now the custom tag must not be returned when include_zero=false