mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 13:19:23 +08:00
Fix sweep deleting FAILED partials and fix segmented resume path trusting offsets blindly.
This commit is contained in:
parent
1bbd4a57db
commit
abc0b728ab
@ -222,12 +222,34 @@ class DownloadJob:
|
||||
pr.total_bytes, pr.accept_ranges, max(1, args.download_segments)
|
||||
)
|
||||
existing = await asyncio.to_thread(queries.list_segments, self.spec.download_id)
|
||||
if (
|
||||
can_resume_segmented = (
|
||||
seg_count > 1
|
||||
and existing
|
||||
and pr.total_bytes is not None
|
||||
and existing[-1].end_offset == pr.total_bytes - 1
|
||||
):
|
||||
)
|
||||
if can_resume_segmented and not self._segmented_part_valid(pr.total_bytes):
|
||||
# The persisted per-segment offsets describe bytes in a preallocated
|
||||
# .part that is now gone or the wrong size (e.g. the partial of a
|
||||
# failed download was swept on restart, or removed by a fatal
|
||||
# error). Trusting them would skip already-"complete" segments and
|
||||
# leave zero-filled holes. Discard the offsets and re-plan fresh.
|
||||
logging.info(
|
||||
"[model_downloader] %s discarding segmented resume offsets "
|
||||
"(preallocated .part missing or wrong size); restarting",
|
||||
self.spec.model_id,
|
||||
)
|
||||
self._remove_temp()
|
||||
await asyncio.to_thread(
|
||||
queries.replace_segments, self.spec.download_id, []
|
||||
)
|
||||
await asyncio.to_thread(
|
||||
queries.update_download, self.spec.download_id, bytes_done=0
|
||||
)
|
||||
existing = []
|
||||
can_resume_segmented = False
|
||||
|
||||
if can_resume_segmented:
|
||||
# Resume an existing segmented plan.
|
||||
self.state.segments = [
|
||||
SegmentRuntime(s.idx, s.start_offset, s.end_offset, s.bytes_done)
|
||||
@ -537,6 +559,23 @@ class DownloadJob:
|
||||
logging.debug("[model_downloader] writer close error", exc_info=True)
|
||||
self._writer = None
|
||||
|
||||
def _segmented_part_valid(self, total_bytes: int) -> bool:
|
||||
"""True when the temp file is the preallocated segmented ``.part``.
|
||||
|
||||
A segmented transfer preallocates the .part to ``total_bytes`` up front
|
||||
and tracks how much of each range landed via per-segment offsets. Those
|
||||
offsets are only trustworthy when the file they describe is still on
|
||||
disk at its full preallocated size. A missing file (swept after a
|
||||
failure, removed on a fatal error, deleted by hand) or a wrong-sized one
|
||||
means the persisted offsets no longer correspond to real bytes and must
|
||||
not be resumed over. Doing so would skip "complete" segments and leave
|
||||
zero-filled holes that pass the size-only verification gate.
|
||||
"""
|
||||
try:
|
||||
return os.path.getsize(self.spec.temp_path) == total_bytes
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
def _contiguous_prefix_valid(self, prefix_len: int) -> bool:
|
||||
"""True when the temp file is exactly ``prefix_len`` contiguous bytes.
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""Priority scheduler + lifecycle (PRD sections 4, 6, 12).
|
||||
"""Priority scheduler + lifecycle.
|
||||
|
||||
Owns the set of running jobs and admits queued downloads up to a global
|
||||
concurrency limit (K), highest priority first, FIFO within a priority. Runs
|
||||
@ -25,7 +25,7 @@ from app.model_downloader.database import queries
|
||||
from app.model_downloader.engine.job import DownloadJob, JobSpec
|
||||
from app.model_downloader.security import paths
|
||||
|
||||
# Backoff for retryable failures (PRD section 12).
|
||||
# Backoff for retryable failures
|
||||
_BACKOFF_BASE = 2.0
|
||||
_BACKOFF_CAP = 300.0
|
||||
_MAX_ATTEMPTS = 6
|
||||
@ -70,13 +70,23 @@ class Scheduler:
|
||||
def _sweep_orphan_temp_files() -> None:
|
||||
"""Remove ``.part`` files not referenced by a resumable download row.
|
||||
|
||||
Resumable partials (queued/paused rows) are preserved; only truly
|
||||
orphaned temp files from crashed runs are deleted.
|
||||
Resumable partials are preserved; only truly orphaned temp files from
|
||||
crashed runs are deleted. ``FAILED`` is included because
|
||||
:meth:`DownloadManager.resume` explicitly permits resuming a
|
||||
retry-exhausted failed row: deleting its partial here while the
|
||||
per-segment offsets survive in the DB would make the next resume
|
||||
preallocate a fresh sparse file, skip every "complete" segment, and
|
||||
leave zero-filled holes that pass the size-only verification gate.
|
||||
"""
|
||||
live = {
|
||||
row.temp_path
|
||||
for row in queries.list_downloads()
|
||||
if row.status in (DownloadStatus.QUEUED, DownloadStatus.PAUSED)
|
||||
if row.status
|
||||
in (
|
||||
DownloadStatus.QUEUED,
|
||||
DownloadStatus.PAUSED,
|
||||
DownloadStatus.FAILED,
|
||||
)
|
||||
}
|
||||
for path in paths.iter_all_tmp_paths():
|
||||
if path in live:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user