From 48deb15c0e2b3336de4ca27b3e920954dfde453b Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski <kosinkadink1@gmail.com>
Date: Wed, 8 Apr 2026 22:15:57 -1000
Subject: [PATCH] Simplify multigpu dispatch: run all devices on pool threads
 (#13340)

Benchmarked hybrid (main thread + pool) vs all-pool dispatch on 2x RTX
4090 with SD1.5 and NetaYume models: there was no meaningful
performance difference (results were within noise). All-pool is
simpler: it eliminates the main_device special case, the deferred
main_batch_tuple execution, and the 3-way branch in the dispatch loop.
---
 comfy/samplers.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/comfy/samplers.py b/comfy/samplers.py
index 68f093749..8ebf1c496 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -516,25 +516,17 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
 
     results: list[thread_result] = []
     thread_pool: comfy.multigpu.MultiGPUThreadPool = model_options.get("multigpu_thread_pool")
-    main_device = output_device
-    main_batch_tuple = None
 
-    # Submit extra GPU work to pool first, then run main device on this thread
+    # Submit all GPU work to pool threads
     pool_devices = []
     for device, batch_tuple in device_batched_hooked_to_run.items():
-        if device == main_device and thread_pool is not None:
-            main_batch_tuple = batch_tuple
-        elif thread_pool is not None:
+        if thread_pool is not None:
             thread_pool.submit(device, _handle_batch_pooled, device, batch_tuple)
             pool_devices.append(device)
         else:
             # Fallback: no pool, run everything on main thread
             _handle_batch(device, batch_tuple, results)
 
-    # Run main device batch on this thread (parallel with pool workers)
-    if main_batch_tuple is not None:
-        _handle_batch(main_device, main_batch_tuple, results)
-
     # Collect results from pool workers
     for device in pool_devices:
         worker_results, error = thread_pool.get_result(device)
@@ -1210,10 +1202,11 @@ class CFGGuider:
 
         multigpu_patchers = comfy.sampler_helpers.prepare_model_patcher_multigpu_clones(self.model_patcher, self.loaded_models, self.model_options)
 
-        # Create persistent thread pool for extra GPU devices
+        # Create persistent thread pool for all GPU devices (main + extras)
         if multigpu_patchers:
             extra_devices = [p.load_device for p in multigpu_patchers]
-            self.model_options["multigpu_thread_pool"] = comfy.multigpu.MultiGPUThreadPool(extra_devices)
+            all_devices = [device] + extra_devices
+            self.model_options["multigpu_thread_pool"] = comfy.multigpu.MultiGPUThreadPool(all_devices)
 
         try:
             noise = noise.to(device=device, dtype=torch.float32)