From 9e3ede14062ac42d85885eae20e75b754c85e030 Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski <kosinkadink1@gmail.com>
Date: Tue, 19 May 2026 20:11:53 -0700
Subject: [PATCH] Fix MultiGPU scheduler capacity accounting (#14000)

Fixes _calc_cond_batch_multigpu so that:

1. conds_per_device uses true division (/) before math.ceil. The
   previous expression math.ceil(total_conds // len(devices)) applied
   floor division first, making ceil a no-op. For 3 conds across 2
   devices this produced conds_per_device=1 instead of 2.

2. The scheduling loop skips devices that have already reached
   capacity instead of appending empty batch groups. Without this
   guard, the loop could repeatedly emit zero-length groups for a
   full device, leaving sampling stuck at 0/N until timeout.

The bug reproduces with an Omnigen2 image workflow that produces three
condition entries scheduled across two CUDA devices. With the fix,
the scheduler assigns conds_per_device=2 and splits the batches as
2 + 1 across the two devices, allowing sampling to complete.

Original fix authored and validated by @pollockjj in
pollockjj/ComfyUI#64.

Co-authored-by: John Pollock <pollockjj@gmail.com>
---
 comfy/samplers.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/comfy/samplers.py b/comfy/samplers.py
index 88393e367..83fa2e609 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -394,7 +394,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
     total_conds = 0
     for to_run in hooked_to_run.values():
         total_conds += len(to_run)
-    conds_per_device = max(1, math.ceil(total_conds//len(devices)))
+    conds_per_device = max(1, math.ceil(total_conds / len(devices)))
     index_device = 0
     current_device = devices[index_device]
     # run every hooked_to_run separately
@@ -406,13 +406,17 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
             batched_to_run_length = 0
             for btr in batched_to_run:
                 batched_to_run_length += len(btr[1])
+            remaining_capacity = conds_per_device - batched_to_run_length
+            if remaining_capacity <= 0:
+                index_device += 1
+                continue
 
             first = to_run[0]
             first_shape = first[0][0].shape
             to_batch_temp = []
             # make sure not over conds_per_device limit when creating temp batch
             for x in range(len(to_run)):
-                if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < (conds_per_device - batched_to_run_length):
+                if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < remaining_capacity:
                     to_batch_temp += [x]
 
             to_batch_temp.reverse()