Fix asset API security and correctness issues

- Content-Disposition: drop raw filename= parameter, use only RFC 5987
  filename*=UTF-8'' to prevent header injection via ; and special chars
- delete_asset: default delete_content to False (non-destructive) when
  query parameter is omitted
- create_asset_from_hash: return 400 MISSING_INPUT instead of 404 when
  hash not found and no file uploaded (client input error, not missing resource)
- seeder: clear _progress when returning to IDLE so get_status() does not
  return stale progress after scan completion
- hashing: handle non-seekable streams in _hash_file_obj by checking
  seekable() before attempting tell/seek
- bulk_ingest: filter lost_paths to only include paths tied to actually
  inserted asset IDs, preventing inflated counts from ON CONFLICT drops

Amp-Thread-ID: https://ampcode.com/threads/T-019cb67a-9822-7438-ab05-d09991a9f7f3
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Luke Mino-Altherr 2026-03-03 18:47:48 -08:00
parent d638d81d6c
commit c268507ea7
4 changed files with 53 additions and 30 deletions

View File

@@ -54,8 +54,10 @@ def _require_assets_feature_enabled(handler):
"Assets system is disabled. Start the server with --enable-assets to use this feature.",
)
return await handler(request)
return wrapper
# UUID regex (canonical hyphenated form, case-insensitive)
UUID_RE = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
@@ -81,7 +83,8 @@ def get_query_dict(request: web.Request) -> dict[str, Any]:
def register_assets_routes(
app: web.Application, user_manager_instance: user_manager.UserManager | None = None,
app: web.Application,
user_manager_instance: user_manager.UserManager | None = None,
) -> None:
global USER_MANAGER, _ASSETS_ENABLED
if user_manager_instance is not None:
@@ -254,15 +257,19 @@ async def download_asset_content(request: web.Request) -> web.Response:
404, "FILE_NOT_FOUND", "Underlying file not found on disk."
)
quoted = (filename or "").replace("\r", "").replace("\n", "").replace('"', "'")
encoded = urllib.parse.quote(quoted)
cd = f"{disposition}; filename=\"{quoted}\"; filename*=UTF-8''{encoded}"
safe_name = (filename or "").replace("\r", "").replace("\n", "")
encoded = urllib.parse.quote(safe_name)
cd = f"{disposition}; filename*=UTF-8''{encoded}"
file_size = os.path.getsize(abs_path)
size_mb = file_size / (1024 * 1024)
logging.info(
"download_asset_content: path=%s, size=%d bytes (%.2f MB), type=%s, name=%s",
abs_path, file_size, size_mb, content_type, filename,
abs_path,
file_size,
size_mb,
content_type,
filename,
)
async def stream_file_chunks():
@@ -382,8 +389,8 @@ async def upload_asset(request: web.Request) -> web.Response:
# Otherwise, we must have a temp file path to ingest
if not parsed.tmp_path or not os.path.exists(parsed.tmp_path):
return _build_error_response(
404,
"ASSET_NOT_FOUND",
400,
"MISSING_INPUT",
"Provided hash not found and no file uploaded.",
)
@@ -459,9 +466,7 @@ async def update_asset_route(request: web.Request) -> web.Response:
updated_at=result.ref.updated_at,
)
except PermissionError as pe:
return _build_error_response(
403, "FORBIDDEN", str(pe), {"id": reference_id}
)
return _build_error_response(403, "FORBIDDEN", str(pe), {"id": reference_id})
except ValueError as ve:
return _build_error_response(
404, "ASSET_NOT_FOUND", str(ve), {"id": reference_id}
@@ -482,7 +487,7 @@ async def delete_asset_route(request: web.Request) -> web.Response:
reference_id = str(uuid.UUID(request.match_info["id"]))
delete_content_param = request.query.get("delete_content")
delete_content = (
True
False
if delete_content_param is None
else delete_content_param.lower() not in {"0", "false", "no"}
)
@@ -577,9 +582,7 @@ async def add_asset_tags(request: web.Request) -> web.Response:
total_tags=result.total_tags,
)
except PermissionError as pe:
return _build_error_response(
403, "FORBIDDEN", str(pe), {"id": reference_id}
)
return _build_error_response(403, "FORBIDDEN", str(pe), {"id": reference_id})
except ValueError as ve:
return _build_error_response(
404, "ASSET_NOT_FOUND", str(ve), {"id": reference_id}
@@ -626,9 +629,7 @@ async def delete_asset_tags(request: web.Request) -> web.Response:
total_tags=result.total_tags,
)
except PermissionError as pe:
return _build_error_response(
403, "FORBIDDEN", str(pe), {"id": reference_id}
)
return _build_error_response(403, "FORBIDDEN", str(pe), {"id": reference_id})
except ValueError as ve:
return _build_error_response(
404, "ASSET_NOT_FOUND", str(ve), {"id": reference_id}

View File

@@ -360,7 +360,9 @@ class _AssetSeeder:
"""
with self._lock:
if self._state != State.IDLE:
raise ScanInProgressError("Cannot mark missing assets while scan is running")
raise ScanInProgressError(
"Cannot mark missing assets while scan is running"
)
self._state = State.RUNNING
try:
@@ -378,6 +380,7 @@
finally:
with self._lock:
self._state = State.IDLE
self._progress = None
def _is_cancelled(self) -> bool:
"""Check if cancellation has been requested."""
@@ -543,7 +546,11 @@ class _AssetSeeder:
elapsed = time.perf_counter() - t_start
logging.info(
"Scan(%s, %s) done %.3fs: created=%d enriched=%d skipped=%d",
roots, phase.value, elapsed, total_created, total_enriched,
roots,
phase.value,
elapsed,
total_created,
total_enriched,
skipped_existing,
)
@@ -575,6 +582,7 @@
)
with self._lock:
self._state = State.IDLE
self._progress = None
def _run_fast_phase(self, roots: tuple[RootType, ...]) -> tuple[int, int, int]:
"""Run phase 1: fast scan to create stub records.
@@ -605,7 +613,10 @@
# Use stub specs (no metadata extraction, no hashing)
specs, tag_pool, skipped_existing = build_asset_specs(
paths, existing_paths, enable_metadata_extraction=False, compute_hashes=False,
paths,
existing_paths,
enable_metadata_extraction=False,
compute_hashes=False,
)
self._update_progress(skipped=skipped_existing)

View File

@@ -187,16 +187,18 @@ def batch_insert_seed_assets(
inserted_asset_ids = get_existing_asset_ids(
session, [r["asset_id"] for r in reference_rows]
)
reference_rows = [
r for r in reference_rows if r["asset_id"] in inserted_asset_ids
]
reference_rows = [r for r in reference_rows if r["asset_id"] in inserted_asset_ids]
bulk_insert_references_ignore_conflicts(session, reference_rows)
restore_references_by_paths(session, absolute_path_list)
winning_paths = get_references_by_paths_and_asset_ids(session, path_to_asset_id)
all_paths_set = set(absolute_path_list)
losing_paths = all_paths_set - winning_paths
inserted_paths = {
path
for path in absolute_path_list
if path_to_asset_id[path] in inserted_asset_ids
}
losing_paths = inserted_paths - winning_paths
lost_asset_ids = [path_to_asset_id[path] for path in losing_paths]
if lost_asset_ids:

View File

@@ -1,3 +1,4 @@
import io
import os
from typing import IO
@@ -21,12 +22,19 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
if chunk_size <= 0:
chunk_size = DEFAULT_CHUNK
orig_pos = file_obj.tell()
seekable = getattr(file_obj, "seekable", lambda: False)()
orig_pos = None
if seekable:
try:
orig_pos = file_obj.tell()
if orig_pos != 0:
file_obj.seek(0)
except io.UnsupportedOperation:
seekable = False
orig_pos = None
try:
if orig_pos != 0:
file_obj.seek(0)
h = blake3()
while True:
chunk = file_obj.read(chunk_size)
@@ -35,4 +43,5 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
h.update(chunk)
return h.hexdigest()
finally:
file_obj.seek(orig_pos)
if seekable and orig_pos is not None:
file_obj.seek(orig_pos)