SelectXDevice: address code-review follow-ups

True reset semantics for "default": - On first selector application, cache the loader's original load_device / offload_device on the underlying model object (which is shared across patcher clones) and restore those base values when the user picks "default". Previously "default" meant "passthrough" so SelectXDevice(gpu:1) -> SelectXDevice(default) silently kept the gpu:1 routing. CPU + dynamic VRAM: - When SelectModelDevice / SelectCLIPDevice resolves to CPU on a ModelPatcherDynamic, also call clone(disable_dynamic=True) so the result is a plain ModelPatcher, matching ModelPatcherDynamic.__new__'s intent that CPU loads never run through the dynamic path. Fallback to the regular dynamic clone if disable_dynamic is unsupported on that patcher. MultiGPU collision pruning: - After SelectModelDevice retargets the primary patcher, drop any multigpu clone (from a prior MultiGPU CFG Split) whose load_device now matches the primary; otherwise two patchers would be bound to the same device. Logs the prune at info level. SelectVAEDevice: reject CPU at runtime: - The UI uses get_gpu_device_options_no_cpu(), but a workflow opened from another machine could still pass "cpu" through validate_inputs. Detect that case explicitly, log a "CPU is not a supported choice" passthrough message, and leave the VAE unchanged. Cosmetic: - Update VAE node docstring to accurately reflect the runtime CPU rejection rather than the older "intentionally not offered" claim. - Demote the fallback warnings inside resolve_gpu_device_option to no log at all; the Select*Device nodes now own a single context-rich info-level message per failed lookup, so there is no double logging. Amp-Thread-ID: https://ampcode.com/threads/T-019e52b4-31ee-72cd-996b-64ecd9420e13 Co-authored-by: Amp <amp@ampcode.com>
2026-06-14 11:59:21 +08:00 · 2026-05-22 22:29:45 -07:00 · 2026-05-22 22:29:45 -07:00 · b319c8088b
commit b319c8088b
parent 9ee1540882
2 changed files with 119 additions and 42 deletions
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -268,8 +268,10 @@ def resolve_gpu_device_option(option: str):

    Returns None for "default" (let the caller use its normal default).
    Returns torch.device("cpu") for "cpu".
-    For "gpu:N", returns the Nth torch device. Falls back to None if
-    the index is out of range (caller should use default).
+    For "gpu:N", returns the Nth torch device. Returns None if the
+    index is out of range, the option string is malformed, or
+    unrecognized (callers are expected to log their own context-rich
+    message before falling back to the default device).
    """
    if option is None or option == "default":
        return None
@ -278,16 +280,11 @@ def resolve_gpu_device_option(option: str):
    if option.startswith("gpu:"):
        try:
            idx = int(option[4:])
-            devices = get_all_torch_devices()
-            if 0 <= idx < len(devices):
-                return devices[idx]
-            else:
-                logging.warning(f"Device '{option}' not available (only {len(devices)} GPU(s)), using default.")
-                return None
-        except (ValueError, IndexError):
-            logging.warning(f"Invalid device option '{option}', using default.")
+        except ValueError:
            return None
-    logging.warning(f"Unrecognized device option '{option}', using default.")
+        devices = get_all_torch_devices()
+        if 0 <= idx < len(devices):
+            return devices[idx]
    return None

@contextmanager
--- a/comfy_extras/nodes_multigpu.py
+++ b/comfy_extras/nodes_multigpu.py
@ -46,15 +46,90 @@ class MultiGPUCFGSplitNode(io.ComfyNode):
        return io.NodeOutput(model)


+def _remember_base_devices(patcher: ModelPatcher):
+    """Stash the original load/offload device on the underlying model.
+
+    Stored on patcher.model (which is shared across patcher clones), so
+    repeated selector applications can recover the loader's original
+    routing when the user picks "default".
+    """
+    if not hasattr(patcher.model, "_select_base_load_device"):
+        patcher.model._select_base_load_device = patcher.load_device
+        patcher.model._select_base_offload_device = patcher.offload_device
+
+
+def _apply_patcher_device(patcher: ModelPatcher, resolved, base_offload_override=None):
+    """Apply *resolved* to a freshly-cloned patcher; respect base devices on default.
+
+    Returns the (possibly newly-replaced) patcher. For CPU on a dynamic
+    patcher, also tries to downgrade to a plain ModelPatcher so the
+    dynamic-only code paths are bypassed (best-effort: silently keeps
+    the dynamic patcher if downgrade is not supported).
+    """
+    _remember_base_devices(patcher)
+    base_load = patcher.model._select_base_load_device
+    base_offload = base_offload_override if base_offload_override is not None else patcher.model._select_base_offload_device
+
+    if resolved is None:
+        # "default" -> reset routing to whatever the loader produced
+        patcher.load_device = base_load
+        patcher.offload_device = base_offload
+    elif resolved.type == "cpu":
+        if patcher.is_dynamic():
+            try:
+                patcher = patcher.clone(disable_dynamic=True)
+            except Exception:
+                # Downgrade unavailable (no cached_patcher_init); fall
+                # back to the existing dynamic patcher.
+                pass
+        patcher.load_device = resolved
+        patcher.offload_device = resolved
+    else:
+        patcher.load_device = resolved
+        patcher.offload_device = base_offload
+
+    if hasattr(patcher, "register_load_device"):
+        patcher.register_load_device(patcher.load_device)
+    return patcher
+
+
+def _prune_multigpu_collision(model: ModelPatcher, primary_device):
+    """Drop any multigpu clone whose load_device matches *primary_device*.
+
+    Without pruning, MultiGPU CFG Split would have stacked a clone on
+    the same device the primary now occupies (i.e. the workflow places
+    MultiGPU CFG Split before Select Model Device). Keeps the clone set
+    consistent with the new primary placement.
+    """
+    multigpu_models = model.get_additional_models_with_key("multigpu")
+    if not multigpu_models:
+        return
+    filtered = [m for m in multigpu_models if m.load_device != primary_device]
+    if len(filtered) != len(multigpu_models):
+        logging.info(f"Select Model Device: pruning MultiGPU clone on {primary_device} that now collides with the primary model.")
+        model.set_additional_models("multigpu", filtered)
+        if hasattr(model, "match_multigpu_clones"):
+            model.match_multigpu_clones()
+
+
 class SelectModelDeviceNode(io.ComfyNode):
    """
    Place the diffusion model on a specific device (default / cpu / gpu:N).

+    - "default" restores the device assigned by the loader (even after a
+      prior Select Model Device call).
+    - "cpu" pins both the load and offload device to CPU.
+    - "gpu:N" pins the load device to the Nth available GPU; the offload
+      device is restored to the loader's original choice.
+
+    If the workflow already has MultiGPU CFG Split applied and the chosen
+    GPU collides with one of the existing multigpu clones, that clone is
+    dropped so two patchers don't end up bound to the same device.
+
    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the model through unchanged and logs a message
-    instead of failing. This keeps workflows portable across machines
-    with different GPU counts.
+    instead of failing.
    """

    @classmethod
@ -83,15 +158,12 @@ class SelectModelDeviceNode(io.ComfyNode):
    def execute(cls, model: ModelPatcher, device: str = "default") -> io.NodeOutput:
        model = model.clone()
        resolved = comfy.model_management.resolve_gpu_device_option(device)
-        if resolved is None:
-            if device not in (None, "default"):
-                logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.")
+        if resolved is None and device not in (None, "default"):
+            logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.")
            return io.NodeOutput(model)
-        model.load_device = resolved
-        if resolved.type == "cpu":
-            model.offload_device = resolved
-        if hasattr(model, "register_load_device"):
-            model.register_load_device(resolved)
+        model = _apply_patcher_device(model, resolved)
+        if resolved is not None:
+            _prune_multigpu_collision(model, model.load_device)
        return io.NodeOutput(model)


@ -99,11 +171,14 @@ class SelectCLIPDeviceNode(io.ComfyNode):
    """
    Place the CLIP text encoder on a specific device (default / cpu / gpu:N).

+    - "default" restores the device assigned by the loader.
+    - "cpu" pins both the load and offload device to CPU.
+    - "gpu:N" pins the load device to the Nth available GPU.
+
    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the CLIP through unchanged and logs a message
-    instead of failing. This keeps workflows portable across machines
-    with different GPU counts.
+    instead of failing.
    """

    @classmethod
@ -130,15 +205,10 @@ class SelectCLIPDeviceNode(io.ComfyNode):
    def execute(cls, clip: CLIP, device: str = "default") -> io.NodeOutput:
        clip = clip.clone()
        resolved = comfy.model_management.resolve_gpu_device_option(device)
-        if resolved is None:
-            if device not in (None, "default"):
-                logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.")
+        if resolved is None and device not in (None, "default"):
+            logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.")
            return io.NodeOutput(clip)
-        clip.patcher.load_device = resolved
-        if resolved.type == "cpu":
-            clip.patcher.offload_device = resolved
-        if hasattr(clip.patcher, "register_load_device"):
-            clip.patcher.register_load_device(resolved)
+        clip.patcher = _apply_patcher_device(clip.patcher, resolved)
        return io.NodeOutput(clip)


@ -146,13 +216,18 @@ class SelectVAEDeviceNode(io.ComfyNode):
    """
    Place the VAE on a specific device (default / gpu:N).

-    CPU is intentionally not offered as a choice; VAE on CPU is impractical.
+    - "default" restores the device assigned by the loader.
+    - "gpu:N" pins the load device to the Nth available GPU; the offload
+      device is set to the standard VAE offload device.
+
+    CPU is intentionally not exposed in the UI for the VAE; if a workflow
+    supplies "cpu" anyway (e.g. opened from another machine), the request
+    is dropped with a log message and the VAE is passed through unchanged.

    When the selected device does not exist on the current machine
    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
    the node passes the VAE through unchanged and logs a message
-    instead of failing. This keeps workflows portable across machines
-    with different GPU counts.
+    instead of failing.
    """

    @classmethod
@ -182,15 +257,20 @@ class SelectVAEDeviceNode(io.ComfyNode):
        vae = copy.copy(vae)
        vae.patcher = vae.patcher.clone()
        resolved = comfy.model_management.resolve_gpu_device_option(device)
-        if resolved is None:
-            if device not in (None, "default"):
-                logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.")
+        if resolved is None and device not in (None, "default"):
+            logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.")
            return io.NodeOutput(vae)
-        vae.device = resolved
-        vae.patcher.load_device = resolved
-        vae.patcher.offload_device = comfy.model_management.vae_offload_device()
-        if hasattr(vae.patcher, "register_load_device"):
-            vae.patcher.register_load_device(resolved)
+        if resolved is not None and resolved.type == "cpu":
+            logging.info("Select VAE Device: CPU is not a supported choice, passing through unchanged.")
+            return io.NodeOutput(vae)
+        vae.patcher = _apply_patcher_device(
+            vae.patcher, resolved,
+            base_offload_override=comfy.model_management.vae_offload_device(),
+        )
+        # VAE caches the working device separately from its patcher.
+        if not hasattr(vae, "_select_base_device"):
+            vae._select_base_device = vae.device
+        vae.device = vae._select_base_device if resolved is None else resolved
        return io.NodeOutput(vae)