From c268507ea7a3016e1b7ac537155663998344ea05 Mon Sep 17 00:00:00 2001 From: Luke Mino-Altherr Date: Tue, 3 Mar 2026 18:47:48 -0800 Subject: [PATCH] Fix asset API security and correctness issues - Content-Disposition: drop raw filename= parameter, use only RFC 5987 filename*=UTF-8'' to prevent header injection via ; and special chars - delete_asset: default delete_content to False (non-destructive) when query parameter is omitted - create_asset_from_hash: return 400 MISSING_INPUT instead of 404 when hash not found and no file uploaded (client input error, not missing resource) - seeder: clear _progress when returning to IDLE so get_status() does not return stale progress after scan completion - hashing: handle non-seekable streams in _hash_file_obj by checking seekable() before attempting tell/seek - bulk_ingest: filter lost_paths to only include paths tied to actually inserted asset IDs, preventing inflated counts from ON CONFLICT drops Amp-Thread-ID: https://ampcode.com/threads/T-019cb67a-9822-7438-ab05-d09991a9f7f3 Co-authored-by: Amp --- app/assets/api/routes.py | 35 +++++++++++++++--------------- app/assets/seeder.py | 17 ++++++++++++--- app/assets/services/bulk_ingest.py | 12 +++++----- app/assets/services/hashing.py | 19 +++++++++++----- 4 files changed, 53 insertions(+), 30 deletions(-) diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py index de33f51c2..d8a94ae96 100644 --- a/app/assets/api/routes.py +++ b/app/assets/api/routes.py @@ -54,8 +54,10 @@ def _require_assets_feature_enabled(handler): "Assets system is disabled. Start the server with --enable-assets to use this feature.", ) return await handler(request) + return wrapper + # UUID regex (canonical hyphenated form, case-insensitive) UUID_RE = r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}" @@ -81,7 +83,8 @@ def get_query_dict(request: web.Request) -> dict[str, Any]: def register_assets_routes( - app: web.Application, user_manager_instance: user_manager.UserManager | None = None, + app: web.Application, + user_manager_instance: user_manager.UserManager | None = None, ) -> None: global USER_MANAGER, _ASSETS_ENABLED if user_manager_instance is not None: @@ -254,15 +257,19 @@ async def download_asset_content(request: web.Request) -> web.Response: 404, "FILE_NOT_FOUND", "Underlying file not found on disk." ) - quoted = (filename or "").replace("\r", "").replace("\n", "").replace('"', "'") - encoded = urllib.parse.quote(quoted) - cd = f"{disposition}; filename=\"{quoted}\"; filename*=UTF-8''{encoded}" + safe_name = (filename or "").replace("\r", "").replace("\n", "") + encoded = urllib.parse.quote(safe_name) + cd = f"{disposition}; filename*=UTF-8''{encoded}" file_size = os.path.getsize(abs_path) size_mb = file_size / (1024 * 1024) logging.info( "download_asset_content: path=%s, size=%d bytes (%.2f MB), type=%s, name=%s", - abs_path, file_size, size_mb, content_type, filename, + abs_path, + file_size, + size_mb, + content_type, + filename, ) async def stream_file_chunks(): @@ -382,8 +389,8 @@ async def upload_asset(request: web.Request) -> web.Response: # Otherwise, we must have a temp file path to ingest if not parsed.tmp_path or not os.path.exists(parsed.tmp_path): return _build_error_response( - 404, - "ASSET_NOT_FOUND", + 400, + "MISSING_INPUT", "Provided hash not found and no file uploaded.", ) @@ -459,9 +466,7 @@ async def update_asset_route(request: web.Request) -> web.Response: updated_at=result.ref.updated_at, ) except PermissionError as pe: - return _build_error_response( - 403, "FORBIDDEN", str(pe), {"id": reference_id} - ) + return _build_error_response(403, "FORBIDDEN", str(pe), {"id": reference_id}) except ValueError as ve: return _build_error_response( 404, "ASSET_NOT_FOUND", str(ve), {"id": reference_id} @@ -482,7 +487,7 @@ async def delete_asset_route(request: web.Request) -> web.Response: reference_id = str(uuid.UUID(request.match_info["id"])) delete_content_param = request.query.get("delete_content") delete_content = ( - True + False if delete_content_param is None else delete_content_param.lower() not in {"0", "false", "no"} ) @@ -577,9 +582,7 @@ async def add_asset_tags(request: web.Request) -> web.Response: total_tags=result.total_tags, ) except PermissionError as pe: - return _build_error_response( - 403, "FORBIDDEN", str(pe), {"id": reference_id} - ) + return _build_error_response(403, "FORBIDDEN", str(pe), {"id": reference_id}) except ValueError as ve: return _build_error_response( 404, "ASSET_NOT_FOUND", str(ve), {"id": reference_id} @@ -626,9 +629,7 @@ async def delete_asset_tags(request: web.Request) -> web.Response: total_tags=result.total_tags, ) except PermissionError as pe: - return _build_error_response( - 403, "FORBIDDEN", str(pe), {"id": reference_id} - ) + return _build_error_response(403, "FORBIDDEN", str(pe), {"id": reference_id}) except ValueError as ve: return _build_error_response( 404, "ASSET_NOT_FOUND", str(ve), {"id": reference_id} diff --git a/app/assets/seeder.py b/app/assets/seeder.py index ec45eb8b4..a28c99d84 100644 --- a/app/assets/seeder.py +++ b/app/assets/seeder.py @@ -360,7 +360,9 @@ class _AssetSeeder: """ with self._lock: if self._state != State.IDLE: - raise ScanInProgressError("Cannot mark missing assets while scan is running") + raise ScanInProgressError( + "Cannot mark missing assets while scan is running" + ) self._state = State.RUNNING try: @@ -378,6 +380,7 @@ class _AssetSeeder: finally: with self._lock: self._state = State.IDLE + self._progress = None def _is_cancelled(self) -> bool: """Check if cancellation has been requested.""" @@ -543,7 +546,11 @@ class _AssetSeeder: elapsed = time.perf_counter() - t_start logging.info( "Scan(%s, %s) done %.3fs: created=%d enriched=%d skipped=%d", - roots, phase.value, elapsed, total_created, total_enriched, + roots, + phase.value, + elapsed, + total_created, + total_enriched, skipped_existing, ) @@ -575,6 +582,7 @@ class _AssetSeeder: ) with self._lock: self._state = State.IDLE + self._progress = None def _run_fast_phase(self, roots: tuple[RootType, ...]) -> tuple[int, int, int]: """Run phase 1: fast scan to create stub records. @@ -605,7 +613,10 @@ class _AssetSeeder: # Use stub specs (no metadata extraction, no hashing) specs, tag_pool, skipped_existing = build_asset_specs( - paths, existing_paths, enable_metadata_extraction=False, compute_hashes=False, + paths, + existing_paths, + enable_metadata_extraction=False, + compute_hashes=False, ) self._update_progress(skipped=skipped_existing) diff --git a/app/assets/services/bulk_ingest.py b/app/assets/services/bulk_ingest.py index 8762e3ade..54e72730c 100644 --- a/app/assets/services/bulk_ingest.py +++ b/app/assets/services/bulk_ingest.py @@ -187,16 +187,18 @@ def batch_insert_seed_assets( inserted_asset_ids = get_existing_asset_ids( session, [r["asset_id"] for r in reference_rows] ) - reference_rows = [ - r for r in reference_rows if r["asset_id"] in inserted_asset_ids - ] + reference_rows = [r for r in reference_rows if r["asset_id"] in inserted_asset_ids] bulk_insert_references_ignore_conflicts(session, reference_rows) restore_references_by_paths(session, absolute_path_list) winning_paths = get_references_by_paths_and_asset_ids(session, path_to_asset_id) - all_paths_set = set(absolute_path_list) - losing_paths = all_paths_set - winning_paths + inserted_paths = { + path + for path in absolute_path_list + if path_to_asset_id[path] in inserted_asset_ids + } + losing_paths = inserted_paths - winning_paths lost_asset_ids = [path_to_asset_id[path] for path in losing_paths] if lost_asset_ids: diff --git a/app/assets/services/hashing.py b/app/assets/services/hashing.py index bcd7645c3..27bae32fe 100644 --- a/app/assets/services/hashing.py +++ b/app/assets/services/hashing.py @@ -1,3 +1,4 @@ +import io import os from typing import IO @@ -21,12 +22,19 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str: if chunk_size <= 0: chunk_size = DEFAULT_CHUNK - orig_pos = file_obj.tell() + seekable = getattr(file_obj, "seekable", lambda: False)() + orig_pos = None + + if seekable: + try: + orig_pos = file_obj.tell() + if orig_pos != 0: + file_obj.seek(0) + except io.UnsupportedOperation: + seekable = False + orig_pos = None try: - if orig_pos != 0: - file_obj.seek(0) - h = blake3() while True: chunk = file_obj.read(chunk_size) @@ -35,4 +43,5 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str: h.update(chunk) return h.hexdigest() finally: - file_obj.seek(orig_pos) + if seekable and orig_pos is not None: + file_obj.seek(orig_pos)