fix syncs

Fix these syncs so that they are properly conditionalized for CPU and always run, even in exception flows.
2026-02-28 06:47:35 +08:00 · 2026-01-26 01:06:17 +10:00 · 2026-01-26 01:06:17 +10:00 · 362d1d845e
commit 362d1d845e
parent 878c5156d6
2 changed files with 7 additions and 3 deletions
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1120,7 +1120,8 @@ def get_cast_buffer(offload_stream, device, size, ref):
 def reset_cast_buffers():
    global LARGEST_CASTED_WEIGHT
    LARGEST_CASTED_WEIGHT = (None, 0)
-    torch.cuda.synchronize()
+    for offload_stream in STREAM_CAST_BUFFERS:
+        offload_stream.synchronize()
    STREAM_CAST_BUFFERS.clear()
    torch.cuda.empty_cache()

--- a/execution.py
+++ b/execution.py
@@ -523,8 +523,11 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
            #that we just want to cull out each model run.
            allocator = comfy.memory_management.aimdo_allocator
            with nullcontext() if allocator is None else torch.cuda.use_mem_pool(torch.cuda.MemPool(allocator.allocator())):
-                output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
-                torch.cuda.synchronize()
+                try:
+                    output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
+                finally:
+                    if allocator is not None:
+                        torch.cuda.synchronize()
            if allocator is not None:
                #FIXME: this is probably a little zealous
                # Torch code comments says some stuff about not actually freeing tensors on mempool