Fix five code review issues

1. Seeder pause/resume: only resume after prompt execution if pause() returned True, preventing undo of user-initiated pauses. 2. Missing rollback in enrich_assets_batch: add sess.rollback() in exception handler to prevent broken session state for subsequent batch operations. 3. Hash checkpoint validation: store mtime_ns/file_size in HashCheckpoint and re-stat on resume instead of comparing the same stat result to itself. 4. Scan progress preserved: save _last_progress before clearing _progress in finally blocks so wait=true endpoint returns final stats instead of zeros. 5. Download XSS hardening: block dangerous MIME types (matching server.py) and add X-Content-Type-Options: nosniff header to asset content endpoint. Amp-Thread-ID: https://ampcode.com/threads/T-019cbb6b-e97b-776d-8c43-2de8acd0d09e Co-authored-by: Amp <amp@ampcode.com>
2026-03-05 17:27:42 +08:00 · 2026-03-04 16:49:44 -08:00 · 2026-03-04 16:49:44 -08:00 · 7f3415549e
commit 7f3415549e
parent b754c50e92
5 changed files with 52 additions and 17 deletions
--- a/app/assets/api/routes.py
+++ b/app/assets/api/routes.py
@ -257,6 +257,13 @@ async def download_asset_content(request: web.Request) -> web.Response:
            404, "FILE_NOT_FOUND", "Underlying file not found on disk."
        )

+    _DANGEROUS_MIME_TYPES = {
+        "text/html", "text/html-sandboxed", "application/xhtml+xml",
+        "text/javascript", "text/css",
+    }
+    if content_type in _DANGEROUS_MIME_TYPES:
+        content_type = "application/octet-stream"
+
    safe_name = (filename or "").replace("\r", "").replace("\n", "")
    encoded = urllib.parse.quote(safe_name)
    cd = f"{disposition}; filename*=UTF-8''{encoded}"
@ -287,6 +294,7 @@ async def download_asset_content(request: web.Request) -> web.Response:
        headers={
            "Content-Disposition": cd,
            "Content-Length": str(file_size),
+            "X-Content-Type-Options": "nosniff",
        },
    )

--- a/app/assets/scanner.py
+++ b/app/assets/scanner.py
@ -412,8 +412,8 @@ def enrich_asset(
        asset_id: ID of the asset to update (for mime_type and hash)
        extract_metadata: If True, extract safetensors header and mime type
        compute_hash: If True, compute blake3 hash
-        interrupt_check: Optional callable that may block (e.g. while paused)
-            and returns True if the operation should be cancelled
+        interrupt_check: Optional non-blocking callable that returns True if
+            the operation should be interrupted (e.g. paused or cancelled)
        hash_checkpoints: Optional dict for saving/restoring hash progress
            across interruptions, keyed by file path

@ -446,14 +446,20 @@ def enrich_asset(
    if compute_hash:
        try:
            mtime_before = get_mtime_ns(stat_p)
+            size_before = stat_p.st_size

            # Restore checkpoint if available and file unchanged
            checkpoint = None
            if hash_checkpoints is not None:
                checkpoint = hash_checkpoints.get(file_path)
-                if checkpoint is not None and mtime_before != get_mtime_ns(stat_p):
-                    checkpoint = None
-                    hash_checkpoints.pop(file_path, None)
+                if checkpoint is not None:
+                    cur_stat = os.stat(file_path, follow_symlinks=True)
+                    if (checkpoint.mtime_ns != get_mtime_ns(cur_stat)
+                            or checkpoint.file_size != cur_stat.st_size):
+                        checkpoint = None
+                        hash_checkpoints.pop(file_path, None)
+                    else:
+                        mtime_before = get_mtime_ns(cur_stat)

            digest, new_checkpoint = compute_blake3_hash(
                file_path,
@ -464,6 +470,8 @@ def enrich_asset(
            if digest is None:
                # Interrupted — save checkpoint for later resumption
                if hash_checkpoints is not None and new_checkpoint is not None:
+                    new_checkpoint.mtime_ns = mtime_before
+                    new_checkpoint.file_size = size_before
                    hash_checkpoints[file_path] = new_checkpoint
                return new_level

@ -522,8 +530,8 @@ def enrich_assets_batch(
        rows: List of UnenrichedReferenceRow from get_unenriched_assets_for_roots
        extract_metadata: If True, extract metadata for each asset
        compute_hash: If True, compute hash for each asset
-        interrupt_check: Optional callable that may block (e.g. while paused)
-            and returns True if the operation should be cancelled
+        interrupt_check: Optional non-blocking callable that returns True if
+            the operation should be interrupted (e.g. paused or cancelled)
        hash_checkpoints: Optional dict for saving/restoring hash progress
            across interruptions, keyed by file path

@ -555,6 +563,7 @@ def enrich_assets_batch(
                    failed_ids.append(row.reference_id)
            except Exception as e:
                logging.warning("Failed to enrich %s: %s", row.file_path, e)
+                sess.rollback()
                failed_ids.append(row.reference_id)

    return enriched, failed_ids
--- a/app/assets/seeder.py
+++ b/app/assets/seeder.py
@ -80,6 +80,7 @@ class _AssetSeeder:
        self._lock = threading.Lock()
        self._state = State.IDLE
        self._progress: Progress | None = None
+        self._last_progress: Progress | None = None
        self._errors: list[str] = []
        self._thread: threading.Thread | None = None
        self._cancel_event = threading.Event()
@ -315,15 +316,16 @@ class _AssetSeeder:
    def get_status(self) -> ScanStatus:
        """Get the current status and progress of the seeder."""
        with self._lock:
+            src = self._progress or self._last_progress
            return ScanStatus(
                state=self._state,
                progress=Progress(
-                    scanned=self._progress.scanned,
-                    total=self._progress.total,
-                    created=self._progress.created,
-                    skipped=self._progress.skipped,
+                    scanned=src.scanned,
+                    total=src.total,
+                    created=src.created,
+                    skipped=src.skipped,
                )
-                if self._progress
+                if src
                else None,
                errors=list(self._errors),
            )
@ -379,6 +381,7 @@ class _AssetSeeder:
            return marked
        finally:
            with self._lock:
+                self._last_progress = self._progress
                self._state = State.IDLE
                self._progress = None

@ -386,6 +389,16 @@ class _AssetSeeder:
        """Check if cancellation has been requested."""
        return self._cancel_event.is_set()

+    def _is_paused_or_cancelled(self) -> bool:
+        """Non-blocking check: True if paused or cancelled.
+
+        Use as interrupt_check for I/O-bound work (e.g. hashing) so that
+        file handles are released immediately on pause rather than held
+        open while blocked. The caller is responsible for blocking on
+        _check_pause_and_cancel() afterward.
+        """
+        return not self._run_gate.is_set() or self._cancel_event.is_set()
+
    def _check_pause_and_cancel(self) -> bool:
        """Block while paused, then check if cancelled.

@ -581,6 +594,7 @@ class _AssetSeeder:
                    },
                )
            with self._lock:
+                self._last_progress = self._progress
                self._state = State.IDLE
                self._progress = None

@ -745,7 +759,7 @@ class _AssetSeeder:
                unenriched,
                extract_metadata=True,
                compute_hash=self._compute_hashes,
-                interrupt_check=self._check_pause_and_cancel,
+                interrupt_check=self._is_paused_or_cancelled,
                hash_checkpoints=hash_checkpoints,
            )
            total_enriched += enriched
--- a/app/assets/services/hashing.py
+++ b/app/assets/services/hashing.py
@ -16,6 +16,8 @@ class HashCheckpoint:

    bytes_processed: int
    hasher: Any  # blake3 hasher instance
+    mtime_ns: int = 0
+    file_size: int = 0


 def compute_blake3_hash(
@ -29,8 +31,9 @@ def compute_blake3_hash(
    Args:
        fp: File path or file-like object
        chunk_size: Size of chunks to read at a time
-        interrupt_check: Optional callable that may block (e.g. while paused)
-            and returns True if the operation should be cancelled. Checked
+        interrupt_check: Optional callable that returns True if the operation
+            should be interrupted (e.g. paused or cancelled). Must be
+            non-blocking so file handles are released immediately. Checked
            between chunk reads.
        checkpoint: Optional checkpoint to resume from (file paths only)

--- a/main.py
+++ b/main.py
@ -261,11 +261,12 @@ def prompt_worker(q, server_instance):
            for k in sensitive:
                extra_data[k] = sensitive[k]

-            asset_seeder.pause()
+            was_paused = asset_seeder.pause()
            try:
                e.execute(item[2], prompt_id, extra_data, item[4])
            finally:
-                asset_seeder.resume()
+                if was_paused:
+                    asset_seeder.resume()
            need_gc = True

            remove_sensitive = lambda prompt: prompt[:5] + prompt[6:]