From 6ceba8c9f8e90b99b1c753fa51aa779f32f13710 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 27 Oct 2025 18:54:32 +1000 Subject: [PATCH] mm: sync the last stream in the queue, not the next Currently this peeks ahead to sync the next stream in the queue of streams with the compute stream. This doesnt allow a lot of parallelization, as then end result is you can only get one weight load ahead regardless of how many streams you have. Rotate the loop logic here to synchronize the end of the queue before returning the next stream. This allows weights to be loaded ahead of the compute streams position. --- comfy/model_management.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index cd8326f5f..79c0dfdb4 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1031,18 +1031,17 @@ def get_offload_stream(device): if device in STREAMS: ss = STREAMS[device] - s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) + #Sync the oldest stream in the queue with the current ss[stream_counter].wait_stream(current_stream(device)) + stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter - return s + return ss[stream_counter] elif is_device_cuda(device): ss = [] for k in range(NUM_STREAMS): ss.append(torch.cuda.Stream(device=device, priority=0)) STREAMS[device] = ss s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s elif is_device_xpu(device): @@ -1051,7 +1050,6 @@ def get_offload_stream(device): ss.append(torch.xpu.Stream(device=device, priority=0)) STREAMS[device] = ss s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s return None