SelectXDevice: address code-review follow-ups

True reset semantics for "default":
- On first selector application, cache the loader's original
  load_device / offload_device on the underlying model object (which
  is shared across patcher clones) and restore those base values when
  the user picks "default". Previously "default" meant "passthrough"
  so SelectXDevice(gpu:1) -> SelectXDevice(default) silently kept the
  gpu:1 routing.

CPU + dynamic VRAM:
- When SelectModelDevice / SelectCLIPDevice resolves to CPU on a
  ModelPatcherDynamic, also call clone(disable_dynamic=True) so the
  result is a plain ModelPatcher, matching ModelPatcherDynamic.__new__'s
  intent that CPU loads never run through the dynamic path. Fallback to
  the regular dynamic clone if disable_dynamic is unsupported on that
  patcher.

MultiGPU collision pruning:
- After SelectModelDevice retargets the primary patcher, drop any
  multigpu clone (from a prior MultiGPU CFG Split) whose load_device
  now matches the primary; otherwise two patchers would be bound to
  the same device. Logs the prune at info level.

SelectVAEDevice: reject CPU at runtime:
- The UI uses get_gpu_device_options_no_cpu(), but a workflow opened
  from another machine could still pass "cpu" through validate_inputs.
  Detect that case explicitly, log a "CPU is not a supported choice"
  passthrough message, and leave the VAE unchanged.

Cosmetic:
- Update VAE node docstring to accurately reflect the runtime CPU
  rejection rather than the older "intentionally not offered" claim.
- Demote the fallback warnings inside resolve_gpu_device_option to no
  log at all; the Select*Device nodes now own a single context-rich
  info-level message per failed lookup, so there is no double logging.

Amp-Thread-ID: https://ampcode.com/threads/T-019e52b4-31ee-72cd-996b-64ecd9420e13
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Jedrzej Kosinski 2026-05-22 22:29:45 -07:00
parent 9ee1540882
commit b319c8088b
2 changed files with 119 additions and 42 deletions

View File

@ -268,8 +268,10 @@ def resolve_gpu_device_option(option: str):
Returns None for "default" (let the caller use its normal default).
Returns torch.device("cpu") for "cpu".
For "gpu:N", returns the Nth torch device. Falls back to None if
the index is out of range (caller should use default).
For "gpu:N", returns the Nth torch device. Returns None if the
index is out of range, the option string is malformed, or
unrecognized (callers are expected to log their own context-rich
message before falling back to the default device).
"""
if option is None or option == "default":
return None
@ -278,16 +280,11 @@ def resolve_gpu_device_option(option: str):
if option.startswith("gpu:"):
try:
idx = int(option[4:])
devices = get_all_torch_devices()
if 0 <= idx < len(devices):
return devices[idx]
else:
logging.warning(f"Device '{option}' not available (only {len(devices)} GPU(s)), using default.")
return None
except (ValueError, IndexError):
logging.warning(f"Invalid device option '{option}', using default.")
except ValueError:
return None
logging.warning(f"Unrecognized device option '{option}', using default.")
devices = get_all_torch_devices()
if 0 <= idx < len(devices):
return devices[idx]
return None
@contextmanager

View File

@ -46,15 +46,90 @@ class MultiGPUCFGSplitNode(io.ComfyNode):
return io.NodeOutput(model)
def _remember_base_devices(patcher: ModelPatcher):
"""Stash the original load/offload device on the underlying model.
Stored on patcher.model (which is shared across patcher clones), so
repeated selector applications can recover the loader's original
routing when the user picks "default".
"""
if not hasattr(patcher.model, "_select_base_load_device"):
patcher.model._select_base_load_device = patcher.load_device
patcher.model._select_base_offload_device = patcher.offload_device
def _apply_patcher_device(patcher: ModelPatcher, resolved, base_offload_override=None):
"""Apply *resolved* to a freshly-cloned patcher; respect base devices on default.
Returns the (possibly newly-replaced) patcher. For CPU on a dynamic
patcher, also tries to downgrade to a plain ModelPatcher so the
dynamic-only code paths are bypassed (best-effort: silently keeps
the dynamic patcher if downgrade is not supported).
"""
_remember_base_devices(patcher)
base_load = patcher.model._select_base_load_device
base_offload = base_offload_override if base_offload_override is not None else patcher.model._select_base_offload_device
if resolved is None:
# "default" -> reset routing to whatever the loader produced
patcher.load_device = base_load
patcher.offload_device = base_offload
elif resolved.type == "cpu":
if patcher.is_dynamic():
try:
patcher = patcher.clone(disable_dynamic=True)
except Exception:
# Downgrade unavailable (no cached_patcher_init); fall
# back to the existing dynamic patcher.
pass
patcher.load_device = resolved
patcher.offload_device = resolved
else:
patcher.load_device = resolved
patcher.offload_device = base_offload
if hasattr(patcher, "register_load_device"):
patcher.register_load_device(patcher.load_device)
return patcher
def _prune_multigpu_collision(model: ModelPatcher, primary_device):
"""Drop any multigpu clone whose load_device matches *primary_device*.
Without pruning, MultiGPU CFG Split would have stacked a clone on
the same device the primary now occupies (i.e. the workflow places
MultiGPU CFG Split before Select Model Device). Keeps the clone set
consistent with the new primary placement.
"""
multigpu_models = model.get_additional_models_with_key("multigpu")
if not multigpu_models:
return
filtered = [m for m in multigpu_models if m.load_device != primary_device]
if len(filtered) != len(multigpu_models):
logging.info(f"Select Model Device: pruning MultiGPU clone on {primary_device} that now collides with the primary model.")
model.set_additional_models("multigpu", filtered)
if hasattr(model, "match_multigpu_clones"):
model.match_multigpu_clones()
class SelectModelDeviceNode(io.ComfyNode):
"""
Place the diffusion model on a specific device (default / cpu / gpu:N).
- "default" restores the device assigned by the loader (even after a
prior Select Model Device call).
- "cpu" pins both the load and offload device to CPU.
- "gpu:N" pins the load device to the Nth available GPU; the offload
device is restored to the loader's original choice.
If the workflow already has MultiGPU CFG Split applied and the chosen
GPU collides with one of the existing multigpu clones, that clone is
dropped so two patchers don't end up bound to the same device.
When the selected device does not exist on the current machine
(e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
the node passes the model through unchanged and logs a message
instead of failing. This keeps workflows portable across machines
with different GPU counts.
instead of failing.
"""
@classmethod
@ -83,15 +158,12 @@ class SelectModelDeviceNode(io.ComfyNode):
def execute(cls, model: ModelPatcher, device: str = "default") -> io.NodeOutput:
model = model.clone()
resolved = comfy.model_management.resolve_gpu_device_option(device)
if resolved is None:
if device not in (None, "default"):
logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.")
if resolved is None and device not in (None, "default"):
logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.")
return io.NodeOutput(model)
model.load_device = resolved
if resolved.type == "cpu":
model.offload_device = resolved
if hasattr(model, "register_load_device"):
model.register_load_device(resolved)
model = _apply_patcher_device(model, resolved)
if resolved is not None:
_prune_multigpu_collision(model, model.load_device)
return io.NodeOutput(model)
@ -99,11 +171,14 @@ class SelectCLIPDeviceNode(io.ComfyNode):
"""
Place the CLIP text encoder on a specific device (default / cpu / gpu:N).
- "default" restores the device assigned by the loader.
- "cpu" pins both the load and offload device to CPU.
- "gpu:N" pins the load device to the Nth available GPU.
When the selected device does not exist on the current machine
(e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
the node passes the CLIP through unchanged and logs a message
instead of failing. This keeps workflows portable across machines
with different GPU counts.
instead of failing.
"""
@classmethod
@ -130,15 +205,10 @@ class SelectCLIPDeviceNode(io.ComfyNode):
def execute(cls, clip: CLIP, device: str = "default") -> io.NodeOutput:
clip = clip.clone()
resolved = comfy.model_management.resolve_gpu_device_option(device)
if resolved is None:
if device not in (None, "default"):
logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.")
if resolved is None and device not in (None, "default"):
logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.")
return io.NodeOutput(clip)
clip.patcher.load_device = resolved
if resolved.type == "cpu":
clip.patcher.offload_device = resolved
if hasattr(clip.patcher, "register_load_device"):
clip.patcher.register_load_device(resolved)
clip.patcher = _apply_patcher_device(clip.patcher, resolved)
return io.NodeOutput(clip)
@ -146,13 +216,18 @@ class SelectVAEDeviceNode(io.ComfyNode):
"""
Place the VAE on a specific device (default / gpu:N).
CPU is intentionally not offered as a choice; VAE on CPU is impractical.
- "default" restores the device assigned by the loader.
- "gpu:N" pins the load device to the Nth available GPU; the offload
device is set to the standard VAE offload device.
CPU is intentionally not exposed in the UI for the VAE; if a workflow
supplies "cpu" anyway (e.g. opened from another machine), the request
is dropped with a log message and the VAE is passed through unchanged.
When the selected device does not exist on the current machine
(e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
the node passes the VAE through unchanged and logs a message
instead of failing. This keeps workflows portable across machines
with different GPU counts.
instead of failing.
"""
@classmethod
@ -182,15 +257,20 @@ class SelectVAEDeviceNode(io.ComfyNode):
vae = copy.copy(vae)
vae.patcher = vae.patcher.clone()
resolved = comfy.model_management.resolve_gpu_device_option(device)
if resolved is None:
if device not in (None, "default"):
logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.")
if resolved is None and device not in (None, "default"):
logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.")
return io.NodeOutput(vae)
vae.device = resolved
vae.patcher.load_device = resolved
vae.patcher.offload_device = comfy.model_management.vae_offload_device()
if hasattr(vae.patcher, "register_load_device"):
vae.patcher.register_load_device(resolved)
if resolved is not None and resolved.type == "cpu":
logging.info("Select VAE Device: CPU is not a supported choice, passing through unchanged.")
return io.NodeOutput(vae)
vae.patcher = _apply_patcher_device(
vae.patcher, resolved,
base_offload_override=comfy.model_management.vae_offload_device(),
)
# VAE caches the working device separately from its patcher.
if not hasattr(vae, "_select_base_device"):
vae._select_base_device = vae.device
vae.device = vae._select_base_device if resolved is None else resolved
return io.NodeOutput(vae)