diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index df3841871..3a14a470d 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
 parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use. All other devices will not be visible.")
+parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). All other devices will not be visible.")
 parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
index 837aa907a..6dbbaa959 100644
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -357,6 +357,10 @@ class QwenFunControlNet(ControlNet):
         super().pre_run(model, percent_to_timestep_function)
         self.set_extra_arg("base_model", model.diffusion_model)
 
+    def cleanup(self):
+        self.extra_args.pop("base_model", None)
+        super().cleanup()
+
     def copy(self):
         c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
         c.control_model = self.control_model
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 2e168f363..10b982868 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -215,17 +215,19 @@ def get_all_torch_devices(exclude_current=False):
     if cpu_state == CPUState.GPU:
         if is_nvidia():
             for i in range(torch.cuda.device_count()):
-                devices.append(torch.device(i))
+                devices.append(torch.device("cuda", i))
         elif is_intel_xpu():
             for i in range(torch.xpu.device_count()):
-                devices.append(torch.device(i))
+                devices.append(torch.device("xpu", i))
         elif is_ascend_npu():
             for i in range(torch.npu.device_count()):
-                devices.append(torch.device(i))
+                devices.append(torch.device("npu", i))
     else:
         devices.append(get_torch_device())
     if exclude_current:
-        devices.remove(get_torch_device())
+        current = get_torch_device()
+        if current in devices:
+            devices.remove(current)
     return devices
 
 def get_gpu_device_options():
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 00d60ff72..b680de058 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1361,13 +1361,18 @@ class ModelPatcher:
         for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN):
             callback(self)
 
-    def prepare_state(self, timestep, model_options, ignore_multigpu=False):
+    def prepare_state(self, timestep, model_options):
+        ignore_multigpu = model_options.get("ignore_multigpu", False)
         for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE):
-            callback(self, timestep, model_options, ignore_multigpu)
+            callback(self, timestep, model_options)
         if not ignore_multigpu and "multigpu_clones" in model_options:
-            for p in model_options["multigpu_clones"].values():
-                p: ModelPatcher
-                p.prepare_state(timestep, model_options, ignore_multigpu=True)
+            model_options["ignore_multigpu"] = True
+            try:
+                for p in model_options["multigpu_clones"].values():
+                    p: ModelPatcher
+                    p.prepare_state(timestep, model_options)
+            finally:
+                model_options.pop("ignore_multigpu", None)
 
     def restore_hook_patches(self):
         if self.hook_patches_backup is not None:
diff --git a/comfy/multigpu.py b/comfy/multigpu.py
index 096270c12..eff7d0649 100644
--- a/comfy/multigpu.py
+++ b/comfy/multigpu.py
@@ -162,16 +162,16 @@ def create_multigpu_deepclones(model: ModelPatcher, max_gpus: int, gpu_options:
         gpu_options.register(model)
     else:
         logging.info("No extra torch devices need initialization, skipping initializing MultiGPU Work Units.")
-    # TODO: only keep model clones that don't go 'past' the intended max_gpu count
-    # multigpu_models = model.get_additional_models_with_key("multigpu")
-    # new_multigpu_models = []
-    # for m in multigpu_models:
-    #     if m.load_device in limit_extra_devices:
-    #         new_multigpu_models.append(m)
-    # model.set_additional_models("multigpu", new_multigpu_models)
-    # persist skip_devices for use in sampling code
-    # if len(skip_devices) > 0 or "multigpu_skip_devices" in model.model_options:
-    #     model.model_options["multigpu_skip_devices"] = skip_devices
+    # only keep model clones that don't go 'past' the intended max_gpu count;
+    # this prunes any inherited multigpu clones whose load_device is no longer allowed
+    # when max_gpus is lowered between runs.
+    allowed_devices = set(limit_extra_devices)
+    allowed_devices.add(model.load_device)
+    multigpu_models = model.get_additional_models_with_key("multigpu")
+    new_multigpu_models = [m for m in multigpu_models if m.load_device in allowed_devices]
+    if len(new_multigpu_models) != len(multigpu_models):
+        model.set_additional_models("multigpu", new_multigpu_models)
+        model.match_multigpu_clones()
     return model
 
 
diff --git a/comfy/samplers.py b/comfy/samplers.py
index f0d67cb7e..8bfc42bdb 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -433,7 +433,11 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
             for i in range(1, len(to_batch_temp) + 1):
                 batch_amount = to_batch_temp[:len(to_batch_temp)//i]
                 input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
-                if model.memory_required(input_shape) * 1.5 < free_memory:
+                cond_shapes = collections.defaultdict(list)
+                for tt in batch_amount:
+                    for k, v in to_run[tt][0].conditioning.items():
+                        cond_shapes[k].append(v.size())
+                if model.memory_required(input_shape, cond_shapes=cond_shapes) * 1.5 < free_memory:
                     to_batch = batch_amount
                     break
 
@@ -866,14 +870,21 @@ def calculate_start_end_timesteps(model, conds):
 
 def pre_run_control(model, conds):
     s = model.model_sampling
+    # Per-device model lookup so multigpu control clones get the matching
+    # diffusion_model (e.g. QwenFunControlNet stashes it into extra_args).
+    device_models: dict = {}
+    patcher = getattr(model, "current_patcher", None)
+    if patcher is not None:
+        for p in patcher.get_additional_models_with_key("multigpu"):
+            device_models[p.load_device] = p.model
     for t in range(len(conds)):
         x = conds[t]
 
         percent_to_timestep_function = lambda a: s.percent_to_sigma(a)
         if 'control' in x:
             x['control'].pre_run(model, percent_to_timestep_function)
-            for device_cnet in x['control'].multigpu_clones.values():
-                device_cnet.pre_run(model, percent_to_timestep_function)
+            for device, device_cnet in x['control'].multigpu_clones.items():
+                device_cnet.pre_run(device_models.get(device, model), percent_to_timestep_function)
 
 def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func):
     cond_cnets = []
diff --git a/comfy/sd.py b/comfy/sd.py
index e7857bf0a..481c87cb1 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1688,7 +1688,8 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata, disable_dynamic=disable_dynamic)
     if out is None:
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
-    out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
+    if out[0] is not None:
+        out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
     return out
 
 def load_checkpoint_guess_config_model_only(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
diff --git a/comfy_extras/nodes_multigpu.py b/comfy_extras/nodes_multigpu.py
index 5d24952bf..fedafef71 100644
--- a/comfy_extras/nodes_multigpu.py
+++ b/comfy_extras/nodes_multigpu.py
@@ -45,6 +45,16 @@ class MultiGPUCFGSplitNode(io.ComfyNode):
 class MultiGPUOptionsNode(io.ComfyNode):
     """
     Select the relative speed of GPUs in the special case they have significantly different performance from one another.
+
+    NOTE (not registered yet, see MultiGPUExtension.get_node_list below):
+    The output GPUOptionsGroup is plumbed through create_multigpu_deepclones() and stored on
+    model.model_options['multigpu_options'] via GPUOptionsGroup.register(), but the cond
+    scheduler in comfy/samplers.py (calc_cond_batch_outer_multigpu) does NOT yet consult
+    relative_speed when distributing conds across devices; it uses a uniform conds_per_device
+    round-robin via next_available_device(). Before re-enabling this node, wire its
+    relative_speed into the scheduler (e.g. via comfy.multigpu.load_balance_devices(),
+    which already implements the proportional split) so the input actually affects work
+    distribution.
     """
 
     @classmethod
@@ -68,7 +78,8 @@ class MultiGPUOptionsNode(io.ComfyNode):
     def execute(cls, device_index: int, relative_speed: float, gpu_options: comfy.multigpu.GPUOptionsGroup = None) -> io.NodeOutput:
         if not gpu_options:
             gpu_options = comfy.multigpu.GPUOptionsGroup()
-        gpu_options.clone()
+        else:
+            gpu_options = gpu_options.clone()
 
         opt = comfy.multigpu.GPUOptions(device_index=device_index, relative_speed=relative_speed)
         gpu_options.add(opt)