From 9e3ede14062ac42d85885eae20e75b754c85e030 Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski
Date: Tue, 19 May 2026 20:11:53 -0700
Subject: [PATCH] Fix MultiGPU scheduler capacity accounting (#14000)

Fixes _calc_cond_batch_multigpu so that:

1. conds_per_device uses real division before math.ceil. The previous
   expression math.ceil(total_conds // len(devices)) applied integer
   floor division first, making ceil a no-op. For 3 conds across 2
   devices this produced conds_per_device=1 instead of 2.

2. The scheduling loop skips devices that have already reached capacity
   instead of appending empty batch groups. Without this guard, the loop
   could repeatedly emit zero-length groups for a full device, leaving
   sampling stuck at 0/N until timeout.

Reproduces with an Omnigen2 image workflow that produces three condition
entries scheduled across two CUDA devices. With the fix the scheduler
assigns conds_per_device=2 and splits the batches as 2 + 1 across the
two devices, allowing sampling to complete.

Original fix authored and validated by @pollockjj in
pollockjj/ComfyUI#64.

Co-authored-by: John Pollock --- comfy/samplers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/comfy/samplers.py b/comfy/samplers.py index 88393e367..83fa2e609 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -394,7 +394,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t total_conds = 0 for to_run in hooked_to_run.values(): total_conds += len(to_run) - conds_per_device = max(1, math.ceil(total_conds//len(devices))) + conds_per_device = max(1, math.ceil(total_conds / len(devices))) index_device = 0 current_device = devices[index_device] # run every hooked_to_run separately @@ -406,13 +406,17 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t batched_to_run_length = 0 for btr in batched_to_run: batched_to_run_length += len(btr[1]) + remaining_capacity = conds_per_device - batched_to_run_length + if remaining_capacity <= 0: + index_device += 1 + continue first = to_run[0] first_shape = first[0][0].shape to_batch_temp = [] # make sure not over conds_per_device limit when creating temp batch for x in range(len(to_run)): - if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < (conds_per_device - batched_to_run_length): + if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < remaining_capacity: to_batch_temp += [x] to_batch_temp.reverse()