From b319c8088b8f125332071d3fbfadcf5c26e830c3 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Fri, 22 May 2026 22:29:45 -0700 Subject: [PATCH] SelectXDevice: address code-review follow-ups True reset semantics for "default": - On first selector application, cache the loader's original load_device / offload_device on the underlying model object (which is shared across patcher clones) and restore those base values when the user picks "default". Previously "default" meant "passthrough" so SelectXDevice(gpu:1) -> SelectXDevice(default) silently kept the gpu:1 routing. CPU + dynamic VRAM: - When SelectModelDevice / SelectCLIPDevice resolves to CPU on a ModelPatcherDynamic, also call clone(disable_dynamic=True) so the result is a plain ModelPatcher, matching ModelPatcherDynamic.__new__'s intent that CPU loads never run through the dynamic path. Fallback to the regular dynamic clone if disable_dynamic is unsupported on that patcher. MultiGPU collision pruning: - After SelectModelDevice retargets the primary patcher, drop any multigpu clone (from a prior MultiGPU CFG Split) whose load_device now matches the primary; otherwise two patchers would be bound to the same device. Logs the prune at info level. SelectVAEDevice: reject CPU at runtime: - The UI uses get_gpu_device_options_no_cpu(), but a workflow opened from another machine could still pass "cpu" through validate_inputs. Detect that case explicitly, log a "CPU is not a supported choice" passthrough message, and leave the VAE unchanged. Cosmetic: - Update VAE node docstring to accurately reflect the runtime CPU rejection rather than the older "intentionally not offered" claim. - Demote the fallback warnings inside resolve_gpu_device_option to no log at all; the Select*Device nodes now own a single context-rich info-level message per failed lookup, so there is no double logging. Amp-Thread-ID: https://ampcode.com/threads/T-019e52b4-31ee-72cd-996b-64ecd9420e13 Co-authored-by: Amp --- comfy/model_management.py | 19 ++--- comfy_extras/nodes_multigpu.py | 142 ++++++++++++++++++++++++++------- 2 files changed, 119 insertions(+), 42 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d744f1745..3bce128b2 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -268,8 +268,10 @@ def resolve_gpu_device_option(option: str): Returns None for "default" (let the caller use its normal default). Returns torch.device("cpu") for "cpu". - For "gpu:N", returns the Nth torch device. Falls back to None if - the index is out of range (caller should use default). + For "gpu:N", returns the Nth torch device. Returns None if the + index is out of range, the option string is malformed, or + unrecognized (callers are expected to log their own context-rich + message before falling back to the default device). """ if option is None or option == "default": return None @@ -278,16 +280,11 @@ def resolve_gpu_device_option(option: str): if option.startswith("gpu:"): try: idx = int(option[4:]) - devices = get_all_torch_devices() - if 0 <= idx < len(devices): - return devices[idx] - else: - logging.warning(f"Device '{option}' not available (only {len(devices)} GPU(s)), using default.") - return None - except (ValueError, IndexError): - logging.warning(f"Invalid device option '{option}', using default.") + except ValueError: return None - logging.warning(f"Unrecognized device option '{option}', using default.") + devices = get_all_torch_devices() + if 0 <= idx < len(devices): + return devices[idx] return None @contextmanager diff --git a/comfy_extras/nodes_multigpu.py b/comfy_extras/nodes_multigpu.py index 1fb134eca..0e109f426 100644 --- a/comfy_extras/nodes_multigpu.py +++ b/comfy_extras/nodes_multigpu.py @@ -46,15 +46,90 @@ class MultiGPUCFGSplitNode(io.ComfyNode): return io.NodeOutput(model) +def _remember_base_devices(patcher: ModelPatcher): + """Stash the original load/offload device on the underlying model. + + Stored on patcher.model (which is shared across patcher clones), so + repeated selector applications can recover the loader's original + routing when the user picks "default". + """ + if not hasattr(patcher.model, "_select_base_load_device"): + patcher.model._select_base_load_device = patcher.load_device + patcher.model._select_base_offload_device = patcher.offload_device + + +def _apply_patcher_device(patcher: ModelPatcher, resolved, base_offload_override=None): + """Apply *resolved* to a freshly-cloned patcher; respect base devices on default. + + Returns the (possibly newly-replaced) patcher. For CPU on a dynamic + patcher, also tries to downgrade to a plain ModelPatcher so the + dynamic-only code paths are bypassed (best-effort: silently keeps + the dynamic patcher if downgrade is not supported). + """ + _remember_base_devices(patcher) + base_load = patcher.model._select_base_load_device + base_offload = base_offload_override if base_offload_override is not None else patcher.model._select_base_offload_device + + if resolved is None: + # "default" -> reset routing to whatever the loader produced + patcher.load_device = base_load + patcher.offload_device = base_offload + elif resolved.type == "cpu": + if patcher.is_dynamic(): + try: + patcher = patcher.clone(disable_dynamic=True) + except Exception: + # Downgrade unavailable (no cached_patcher_init); fall + # back to the existing dynamic patcher. + pass + patcher.load_device = resolved + patcher.offload_device = resolved + else: + patcher.load_device = resolved + patcher.offload_device = base_offload + + if hasattr(patcher, "register_load_device"): + patcher.register_load_device(patcher.load_device) + return patcher + + +def _prune_multigpu_collision(model: ModelPatcher, primary_device): + """Drop any multigpu clone whose load_device matches *primary_device*. + + Without pruning, MultiGPU CFG Split would have stacked a clone on + the same device the primary now occupies (i.e. the workflow places + MultiGPU CFG Split before Select Model Device). Keeps the clone set + consistent with the new primary placement. + """ + multigpu_models = model.get_additional_models_with_key("multigpu") + if not multigpu_models: + return + filtered = [m for m in multigpu_models if m.load_device != primary_device] + if len(filtered) != len(multigpu_models): + logging.info(f"Select Model Device: pruning MultiGPU clone on {primary_device} that now collides with the primary model.") + model.set_additional_models("multigpu", filtered) + if hasattr(model, "match_multigpu_clones"): + model.match_multigpu_clones() + + class SelectModelDeviceNode(io.ComfyNode): """ Place the diffusion model on a specific device (default / cpu / gpu:N). + - "default" restores the device assigned by the loader (even after a + prior Select Model Device call). + - "cpu" pins both the load and offload device to CPU. + - "gpu:N" pins the load device to the Nth available GPU; the offload + device is restored to the loader's original choice. + + If the workflow already has MultiGPU CFG Split applied and the chosen + GPU collides with one of the existing multigpu clones, that clone is + dropped so two patchers don't end up bound to the same device. + When the selected device does not exist on the current machine (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box), the node passes the model through unchanged and logs a message - instead of failing. This keeps workflows portable across machines - with different GPU counts. + instead of failing. """ @classmethod @@ -83,15 +158,12 @@ class SelectModelDeviceNode(io.ComfyNode): def execute(cls, model: ModelPatcher, device: str = "default") -> io.NodeOutput: model = model.clone() resolved = comfy.model_management.resolve_gpu_device_option(device) - if resolved is None: - if device not in (None, "default"): - logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.") + if resolved is None and device not in (None, "default"): + logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.") return io.NodeOutput(model) - model.load_device = resolved - if resolved.type == "cpu": - model.offload_device = resolved - if hasattr(model, "register_load_device"): - model.register_load_device(resolved) + model = _apply_patcher_device(model, resolved) + if resolved is not None: + _prune_multigpu_collision(model, model.load_device) return io.NodeOutput(model) @@ -99,11 +171,14 @@ class SelectCLIPDeviceNode(io.ComfyNode): """ Place the CLIP text encoder on a specific device (default / cpu / gpu:N). + - "default" restores the device assigned by the loader. + - "cpu" pins both the load and offload device to CPU. + - "gpu:N" pins the load device to the Nth available GPU. + When the selected device does not exist on the current machine (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box), the node passes the CLIP through unchanged and logs a message - instead of failing. This keeps workflows portable across machines - with different GPU counts. + instead of failing. """ @classmethod @@ -130,15 +205,10 @@ class SelectCLIPDeviceNode(io.ComfyNode): def execute(cls, clip: CLIP, device: str = "default") -> io.NodeOutput: clip = clip.clone() resolved = comfy.model_management.resolve_gpu_device_option(device) - if resolved is None: - if device not in (None, "default"): - logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.") + if resolved is None and device not in (None, "default"): + logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.") return io.NodeOutput(clip) - clip.patcher.load_device = resolved - if resolved.type == "cpu": - clip.patcher.offload_device = resolved - if hasattr(clip.patcher, "register_load_device"): - clip.patcher.register_load_device(resolved) + clip.patcher = _apply_patcher_device(clip.patcher, resolved) return io.NodeOutput(clip) @@ -146,13 +216,18 @@ class SelectVAEDeviceNode(io.ComfyNode): """ Place the VAE on a specific device (default / gpu:N). - CPU is intentionally not offered as a choice; VAE on CPU is impractical. + - "default" restores the device assigned by the loader. + - "gpu:N" pins the load device to the Nth available GPU; the offload + device is set to the standard VAE offload device. + + CPU is intentionally not exposed in the UI for the VAE; if a workflow + supplies "cpu" anyway (e.g. opened from another machine), the request + is dropped with a log message and the VAE is passed through unchanged. When the selected device does not exist on the current machine (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box), the node passes the VAE through unchanged and logs a message - instead of failing. This keeps workflows portable across machines - with different GPU counts. + instead of failing. """ @classmethod @@ -182,15 +257,20 @@ class SelectVAEDeviceNode(io.ComfyNode): vae = copy.copy(vae) vae.patcher = vae.patcher.clone() resolved = comfy.model_management.resolve_gpu_device_option(device) - if resolved is None: - if device not in (None, "default"): - logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.") + if resolved is None and device not in (None, "default"): + logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.") return io.NodeOutput(vae) - vae.device = resolved - vae.patcher.load_device = resolved - vae.patcher.offload_device = comfy.model_management.vae_offload_device() - if hasattr(vae.patcher, "register_load_device"): - vae.patcher.register_load_device(resolved) + if resolved is not None and resolved.type == "cpu": + logging.info("Select VAE Device: CPU is not a supported choice, passing through unchanged.") + return io.NodeOutput(vae) + vae.patcher = _apply_patcher_device( + vae.patcher, resolved, + base_offload_override=comfy.model_management.vae_offload_device(), + ) + # VAE caches the working device separately from its patcher. + if not hasattr(vae, "_select_base_device"): + vae._select_base_device = vae.device + vae.device = vae._select_base_device if resolved is None else resolved return io.NodeOutput(vae)