From 80f07952d25227213c72941824401ef432584a2a Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sun, 22 Dec 2024 23:20:17 -0500 Subject: [PATCH 01/23] Fix lowvram issue with ltxv vae. --- comfy/ldm/lightricks/vae/causal_video_autoencoder.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py index 4d43feb22..e0344deec 100644 --- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py @@ -378,7 +378,7 @@ class Decoder(nn.Module): assert ( timestep is not None ), "should pass timestep with timestep_conditioning=True" - scaled_timestep = timestep * self.timestep_scale_multiplier + scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device) for up_block in self.up_blocks: if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D): @@ -403,7 +403,7 @@ class Decoder(nn.Module): ) ada_values = self.last_scale_shift_table[ None, ..., None, None, None - ] + embedded_timestep.reshape( + ].to(device=sample.device, dtype=sample.dtype) + embedded_timestep.reshape( batch_size, 2, -1, @@ -697,7 +697,7 @@ class ResnetBlock3D(nn.Module): ), "should pass timestep with timestep_conditioning=True" ada_values = self.scale_shift_table[ None, ..., None, None, None - ] + timestep.reshape( + ].to(device=hidden_states.device, dtype=hidden_states.dtype) + timestep.reshape( batch_size, 4, -1, @@ -715,7 +715,7 @@ class ResnetBlock3D(nn.Module): if self.inject_noise: hidden_states = self._feed_spatial_noise( - hidden_states, self.per_channel_scale1 + hidden_states, self.per_channel_scale1.to(device=hidden_states.device, dtype=hidden_states.dtype) ) hidden_states = self.norm2(hidden_states) @@ -731,7 +731,7 @@ class ResnetBlock3D(nn.Module): if self.inject_noise: hidden_states = self._feed_spatial_noise( - hidden_states, self.per_channel_scale2 + hidden_states, self.per_channel_scale2.to(device=hidden_states.device, dtype=hidden_states.dtype) ) input_tensor = self.norm3(input_tensor) From f7d83b72e0d4dd27ce6e54ef77dfb2ae4cb0edcd Mon Sep 17 00:00:00 2001 From: zhangp365 <144313702+zhangp365@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:44:20 +0800 Subject: [PATCH 02/23] fixed a bug in ldm/pixart/blocks.py (#6158) --- comfy/ldm/pixart/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/pixart/blocks.py b/comfy/ldm/pixart/blocks.py index 48b27008c..967a224a3 100644 --- a/comfy/ldm/pixart/blocks.py +++ b/comfy/ldm/pixart/blocks.py @@ -12,7 +12,7 @@ from comfy.ldm.modules.attention import optimized_attention if model_management.xformers_enabled(): import xformers.ops - if int((xformers.__version__).split(".")[2]) >= 28: + if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28: block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens else: block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens From 56bc64f3514bc61bdafb8e8f7986c7ebc86d5e9d Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sun, 22 Dec 2024 23:51:14 -0500 Subject: [PATCH 03/23] Comment out some useless code. --- comfy/ldm/pixart/blocks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/pixart/blocks.py b/comfy/ldm/pixart/blocks.py index 967a224a3..40b0663e5 100644 --- a/comfy/ldm/pixart/blocks.py +++ b/comfy/ldm/pixart/blocks.py @@ -10,12 +10,12 @@ from comfy import model_management from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, Mlp, timestep_embedding from comfy.ldm.modules.attention import optimized_attention -if model_management.xformers_enabled(): - import xformers.ops - if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28: - block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens - else: - block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens +# if model_management.xformers_enabled(): +# import xformers.ops +# if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28: +# block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens +# else: +# block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens def modulate(x, shift, scale): return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) From e44d0ac7f77820e8339d20fe3c0698bf8a5e9347 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 23 Dec 2024 01:50:11 -0500 Subject: [PATCH 04/23] Make --novram completely offload weights. This flag is mainly used for testing the weight offloading, it shouldn't actually be used in practice. Remove useless import. --- comfy/ldm/pixart/blocks.py | 1 - comfy/model_management.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/comfy/ldm/pixart/blocks.py b/comfy/ldm/pixart/blocks.py index 40b0663e5..2225076e5 100644 --- a/comfy/ldm/pixart/blocks.py +++ b/comfy/ldm/pixart/blocks.py @@ -6,7 +6,6 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from comfy import model_management from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, Mlp, timestep_embedding from comfy.ldm.modules.attention import optimized_attention diff --git a/comfy/model_management.py b/comfy/model_management.py index b480aaaa4..d77ae8c06 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -521,7 +521,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu lowvram_model_memory = 0 if vram_set_state == VRAMState.NO_VRAM: - lowvram_model_memory = 64 * 1024 * 1024 + lowvram_model_memory = 0.1 loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights) current_loaded_models.insert(0, loaded_model) From c6b9c11ef6e90e47ed6db9520fffd4f5c37e9dca Mon Sep 17 00:00:00 2001 From: Simon Lui <502929+simonlui@users.noreply.github.com> Date: Mon, 23 Dec 2024 00:18:32 -0800 Subject: [PATCH 05/23] Add oneAPI device selector for xpu and some other changes. (#6112) * Add oneAPI device selector and some other minor changes. * Fix device selector variable name. * Flip minor version check sign. * Undo changes to README.md. --- comfy/cli_args.py | 3 ++- comfy/model_management.py | 6 ++++-- main.py | 4 ++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 4c6545011..224c075f0 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -84,7 +84,8 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.") -parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.") +parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.") +parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.") class LatentPreviewMethod(enum.Enum): NoPreviews = "none" diff --git a/comfy/model_management.py b/comfy/model_management.py index d77ae8c06..244fa5c73 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -75,7 +75,7 @@ if args.directml is not None: try: import intel_extension_for_pytorch as ipex _ = torch.xpu.device_count() - xpu_available = torch.xpu.is_available() + xpu_available = xpu_available or torch.xpu.is_available() except: xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available()) @@ -219,12 +219,14 @@ if is_intel_xpu(): if args.cpu_vae: VAE_DTYPES = [torch.float32] - if ENABLE_PYTORCH_ATTENTION: torch.backends.cuda.enable_math_sdp(True) torch.backends.cuda.enable_flash_sdp(True) torch.backends.cuda.enable_mem_efficient_sdp(True) +if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5: + torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) + if args.lowvram: set_vram_to = VRAMState.LOW_VRAM lowvram_available = True diff --git a/main.py b/main.py index b65046535..151b264cd 100644 --- a/main.py +++ b/main.py @@ -114,6 +114,10 @@ if __name__ == "__main__": os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device) logging.info("Set cuda device to: {}".format(args.cuda_device)) + if args.oneapi_device_selector is not None: + os.environ['ONEAPI_DEVICE_SELECTOR'] = args.oneapi_device_selector + logging.info("Set oneapi device selector to: {}".format(args.oneapi_device_selector)) + if args.deterministic: if 'CUBLAS_WORKSPACE_CONFIG' not in os.environ: os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8" From 15564688edb9252ffe0b4f284a814ef2cd546446 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 23 Dec 2024 03:22:48 -0500 Subject: [PATCH 06/23] Add a try except block so if torch version is weird it won't crash. --- comfy/model_management.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 244fa5c73..33891b929 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -224,8 +224,11 @@ if ENABLE_PYTORCH_ATTENTION: torch.backends.cuda.enable_flash_sdp(True) torch.backends.cuda.enable_mem_efficient_sdp(True) -if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5: - torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) +try: + if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5: + torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) +except: + logging.warning("Warning, could not set allow_fp16_bf16_reduction_math_sdp") if args.lowvram: set_vram_to = VRAMState.LOW_VRAM From f18ebbd31645437afaa9738fcf2b5ed8b48cb021 Mon Sep 17 00:00:00 2001 From: Chenlei Hu Date: Mon, 23 Dec 2024 03:29:42 -0500 Subject: [PATCH 07/23] Use raw dir name to serve static web content (#6107) --- server.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server.py b/server.py index ddd71e062..22525507a 100644 --- a/server.py +++ b/server.py @@ -714,9 +714,7 @@ class PromptServer(): self.app.add_routes(self.routes) for name, dir in nodes.EXTENSION_WEB_DIRS.items(): - self.app.add_routes([ - web.static('/extensions/' + urllib.parse.quote(name), dir), - ]) + self.app.add_routes([web.static('/extensions/' + name, dir)]) self.app.add_routes([ web.static('/', self.web_root), From bc6dac4327a838f8583f6272cc3cc612b9b16134 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 23 Dec 2024 20:03:37 -0500 Subject: [PATCH 08/23] Add temporal tiling to VAE Decode (Tiled) node. You can now do tiled VAE decoding on the temporal direction for videos. --- comfy/sd.py | 22 ++++++++++++++++++++-- comfy/utils.py | 26 ++++++++++++++++++++++++-- nodes.py | 14 ++++++++++++-- 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index f79eacc24..e85f2ed77 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -259,6 +259,9 @@ class VAE: self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0) self.working_dtypes = [torch.bfloat16, torch.float32] + self.downscale_index_formula = None + self.upscale_index_formula = None + if config is None: if "decoder.mid.block_1.mix_factor" in sd: encoder_config = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} @@ -338,6 +341,7 @@ class VAE: self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype) self.memory_used_encode = lambda shape, dtype: (1.5 * max(shape[2], 7) * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype) self.upscale_ratio = (lambda a: max(0, a * 6 - 5), 8, 8) + self.upscale_index_formula = (lambda a: max(0, a * 6), 8, 8) self.downscale_ratio = (lambda a: max(0, math.floor((a + 5) / 6)), 8, 8) self.working_dtypes = [torch.float16, torch.float32] elif "decoder.up_blocks.0.res_blocks.0.conv1.conv.weight" in sd: #lightricks ltxv @@ -353,6 +357,7 @@ class VAE: self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype) self.memory_used_encode = lambda shape, dtype: (70 * max(shape[2], 7) * shape[3] * shape[4]) * model_management.dtype_size(dtype) self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 32, 32) + self.upscale_index_formula = (lambda a: max(0, a * 8), 32, 32) self.downscale_ratio = (lambda a: max(0, math.floor((a + 7) / 8)), 32, 32) self.working_dtypes = [torch.bfloat16, torch.float32] elif "decoder.conv_in.conv.weight" in sd: @@ -360,6 +365,7 @@ class VAE: ddconfig["conv3d"] = True ddconfig["time_compress"] = 4 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) + self.upscale_index_formula = (lambda a: max(0, a * 4), 8, 8) self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) self.latent_dim = 3 self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] @@ -426,7 +432,7 @@ class VAE: def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)): decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float() - return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device)) + return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, index_formulas=self.upscale_index_formula, output_device=self.output_device)) def encode_tiled_(self, pixel_samples, tile_x=512, tile_y=512, overlap = 64): steps = pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x, tile_y, overlap) @@ -479,7 +485,7 @@ class VAE: pixel_samples = pixel_samples.to(self.output_device).movedim(1,-1) return pixel_samples - def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None): + def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None): memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile model_management.load_models_gpu([self.patcher], memory_required=memory_used) dims = samples.ndim - 2 @@ -497,6 +503,12 @@ class VAE: elif dims == 2: output = self.decode_tiled_(samples, **args) elif dims == 3: + if overlap_t is None: + args["overlap"] = (1, overlap, overlap) + else: + args["overlap"] = (overlap_t, overlap, overlap) + if tile_t is not None: + args["tile_t"] = tile_t output = self.decode_tiled_3d(samples, **args) return output.movedim(1, -1) @@ -575,6 +587,12 @@ class VAE: except: return self.downscale_ratio + def temporal_compression_decode(self): + try: + return round(self.upscale_ratio[0](8192) / 8192) + except: + return None + class StyleModel: def __init__(self, model, device="cpu"): self.model = model diff --git a/comfy/utils.py b/comfy/utils.py index 5fb5418b5..7de659337 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -822,7 +822,7 @@ def get_tiled_scale_steps(width, height, tile_x, tile_y, overlap): return rows * cols @torch.inference_mode() -def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_amount=4, out_channels=3, output_device="cpu", downscale=False, pbar=None): +def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_amount=4, out_channels=3, output_device="cpu", downscale=False, index_formulas=None, pbar=None): dims = len(tile) if not (isinstance(upscale_amount, (tuple, list))): @@ -831,6 +831,12 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am if not (isinstance(overlap, (tuple, list))): overlap = [overlap] * dims + if index_formulas is None: + index_formulas = upscale_amount + + if not (isinstance(index_formulas, (tuple, list))): + index_formulas = [index_formulas] * dims + def get_upscale(dim, val): up = upscale_amount[dim] if callable(up): @@ -845,10 +851,26 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am else: return val / up + def get_upscale_pos(dim, val): + up = index_formulas[dim] + if callable(up): + return up(val) + else: + return up * val + + def get_downscale_pos(dim, val): + up = index_formulas[dim] + if callable(up): + return up(val) + else: + return val / up + if downscale: get_scale = get_downscale + get_pos = get_downscale_pos else: get_scale = get_upscale + get_pos = get_upscale_pos def mult_list_upscale(a): out = [] @@ -881,7 +903,7 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am pos = max(0, min(s.shape[d + 2] - overlap[d], it[d])) l = min(tile[d], s.shape[d + 2] - pos) s_in = s_in.narrow(d + 2, pos, l) - upscaled.append(round(get_scale(d, pos))) + upscaled.append(round(get_pos(d, pos))) ps = function(s_in).to(output_device) mask = torch.ones_like(ps) diff --git a/nodes.py b/nodes.py index bdea7564b..d6777df4f 100644 --- a/nodes.py +++ b/nodes.py @@ -293,17 +293,27 @@ class VAEDecodeTiled: return {"required": {"samples": ("LATENT", ), "vae": ("VAE", ), "tile_size": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 32}), "overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32}), + "temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to decode at a time."}), + "temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap."}), }} RETURN_TYPES = ("IMAGE",) FUNCTION = "decode" CATEGORY = "_for_testing" - def decode(self, vae, samples, tile_size, overlap=64): + def decode(self, vae, samples, tile_size, overlap=64, temporal_size=64, temporal_overlap=8): if tile_size < overlap * 4: overlap = tile_size // 4 + temporal_compression = vae.temporal_compression_decode() + if temporal_compression is not None: + temporal_size = max(2, temporal_size // temporal_compression) + temporal_overlap = min(1, temporal_size // 2, temporal_overlap // temporal_compression) + else: + temporal_size = None + temporal_overlap = None + compression = vae.spacial_compression_decode() - images = vae.decode_tiled(samples["samples"], tile_x=tile_size // compression, tile_y=tile_size // compression, overlap=overlap // compression) + images = vae.decode_tiled(samples["samples"], tile_x=tile_size // compression, tile_y=tile_size // compression, overlap=overlap // compression, tile_t=temporal_size, overlap_t=temporal_overlap) if len(images.shape) == 5: #Combine batches images = images.reshape(-1, images.shape[-3], images.shape[-2], images.shape[-1]) return (images, ) From 26e0ba8f8cf786575fc1324acb858ad81f3ef9d6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 24 Dec 2024 14:38:52 +0300 Subject: [PATCH 09/23] Enable External Event Loop Integration for ComfyUI [refactor] (#6114) * Refactor main.py to support external event loop integration * added optional "asyncio_loop" argument to allow using existing event loop --------- Signed-off-by: bigcat88 --- main.py | 60 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index 151b264cd..ccc99fdc4 100644 --- a/main.py +++ b/main.py @@ -150,9 +150,10 @@ def cuda_malloc_warning(): if cuda_malloc_warning: logging.warning("\nWARNING: this card most likely does not support cuda-malloc, if you get \"CUDA error\" please run ComfyUI with: --disable-cuda-malloc\n") -def prompt_worker(q, server): + +def prompt_worker(q, server_instance): current_time: float = 0.0 - e = execution.PromptExecutor(server, lru_size=args.cache_lru) + e = execution.PromptExecutor(server_instance, lru_size=args.cache_lru) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 @@ -167,7 +168,7 @@ def prompt_worker(q, server): item, item_id = queue_item execution_start_time = time.perf_counter() prompt_id = item[1] - server.last_prompt_id = prompt_id + server_instance.last_prompt_id = prompt_id e.execute(item[2], prompt_id, item[3], item[4]) need_gc = True @@ -177,8 +178,8 @@ def prompt_worker(q, server): status_str='success' if e.success else 'error', completed=e.success, messages=e.status_messages)) - if server.client_id is not None: - server.send_sync("executing", { "node": None, "prompt_id": prompt_id }, server.client_id) + if server_instance.client_id is not None: + server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) current_time = time.perf_counter() execution_time = current_time - execution_start_time @@ -205,21 +206,23 @@ def prompt_worker(q, server): last_gc_collect = current_time need_gc = False -async def run(server, address='', port=8188, verbose=True, call_on_start=None): + +async def run(server_instance, address='', port=8188, verbose=True, call_on_start=None): addresses = [] for addr in address.split(","): addresses.append((addr, port)) - await asyncio.gather(server.start_multi_address(addresses, call_on_start), server.publish_loop()) + await asyncio.gather(server_instance.start_multi_address(addresses, call_on_start), server_instance.publish_loop()) -def hijack_progress(server): +def hijack_progress(server_instance): def hook(value, total, preview_image): comfy.model_management.throw_exception_if_processing_interrupted() - progress = {"value": value, "max": total, "prompt_id": server.last_prompt_id, "node": server.last_node_id} + progress = {"value": value, "max": total, "prompt_id": server_instance.last_prompt_id, "node": server_instance.last_node_id} - server.send_sync("progress", progress, server.client_id) + server_instance.send_sync("progress", progress, server_instance.client_id) if preview_image is not None: - server.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server.client_id) + server_instance.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server_instance.client_id) + comfy.utils.set_progress_bar_global_hook(hook) @@ -229,7 +232,11 @@ def cleanup_temp(): shutil.rmtree(temp_dir, ignore_errors=True) -if __name__ == "__main__": +def start_comfyui(asyncio_loop=None): + """ + Starts the ComfyUI server using the provided asyncio event loop or creates a new one. + Returns the event loop, server instance, and a function to start the server asynchronously. + """ if args.temp_directory: temp_dir = os.path.join(os.path.abspath(args.temp_directory), "temp") logging.info(f"Setting temp directory to: {temp_dir}") @@ -243,19 +250,20 @@ if __name__ == "__main__": except: pass - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - server = server.PromptServer(loop) - q = execution.PromptQueue(server) + if not asyncio_loop: + asyncio_loop = asyncio.new_event_loop() + asyncio.set_event_loop(asyncio_loop) + prompt_server = server.PromptServer(asyncio_loop) + q = execution.PromptQueue(prompt_server) nodes.init_extra_nodes(init_custom_nodes=not args.disable_all_custom_nodes) cuda_malloc_warning() - server.add_routes() - hijack_progress(server) + prompt_server.add_routes() + hijack_progress(prompt_server) - threading.Thread(target=prompt_worker, daemon=True, args=(q, server,)).start() + threading.Thread(target=prompt_worker, daemon=True, args=(q, prompt_server,)).start() if args.quick_test_for_ci: exit(0) @@ -272,9 +280,19 @@ if __name__ == "__main__": webbrowser.open(f"{scheme}://{address}:{port}") call_on_start = startup_server + async def start_all(): + await prompt_server.setup() + await run(prompt_server, address=args.listen, port=args.port, verbose=not args.dont_print_server, call_on_start=call_on_start) + + # Returning these so that other code can integrate with the ComfyUI loop and server + return asyncio_loop, prompt_server, start_all + + +if __name__ == "__main__": + # Running directly, just start ComfyUI. + event_loop, _, start_all_func = start_comfyui() try: - loop.run_until_complete(server.setup()) - loop.run_until_complete(run(server, address=args.listen, port=args.port, verbose=not args.dont_print_server, call_on_start=call_on_start)) + event_loop.run_until_complete(start_all_func()) except KeyboardInterrupt: logging.info("\nStopped server") From 5388df784acc0f42da1d54fb379b25ad079864cd Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 24 Dec 2024 07:10:09 -0500 Subject: [PATCH 10/23] Add temporal tiling to VAE Encode (Tiled) node. --- comfy/sd.py | 19 ++++++++++++++----- nodes.py | 8 +++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index e85f2ed77..2db00fa44 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -341,8 +341,9 @@ class VAE: self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype) self.memory_used_encode = lambda shape, dtype: (1.5 * max(shape[2], 7) * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype) self.upscale_ratio = (lambda a: max(0, a * 6 - 5), 8, 8) - self.upscale_index_formula = (lambda a: max(0, a * 6), 8, 8) + self.upscale_index_formula = (6, 8, 8) self.downscale_ratio = (lambda a: max(0, math.floor((a + 5) / 6)), 8, 8) + self.downscale_index_formula = (6, 8, 8) self.working_dtypes = [torch.float16, torch.float32] elif "decoder.up_blocks.0.res_blocks.0.conv1.conv.weight" in sd: #lightricks ltxv tensor_conv1 = sd["decoder.up_blocks.0.res_blocks.0.conv1.conv.weight"] @@ -357,16 +358,18 @@ class VAE: self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype) self.memory_used_encode = lambda shape, dtype: (70 * max(shape[2], 7) * shape[3] * shape[4]) * model_management.dtype_size(dtype) self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 32, 32) - self.upscale_index_formula = (lambda a: max(0, a * 8), 32, 32) + self.upscale_index_formula = (8, 32, 32) self.downscale_ratio = (lambda a: max(0, math.floor((a + 7) / 8)), 32, 32) + self.downscale_index_formula = (8, 32, 32) self.working_dtypes = [torch.bfloat16, torch.float32] elif "decoder.conv_in.conv.weight" in sd: ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} ddconfig["conv3d"] = True ddconfig["time_compress"] = 4 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) - self.upscale_index_formula = (lambda a: max(0, a * 4), 8, 8) + self.upscale_index_formula = (4, 8, 8) self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) + self.downscale_index_formula = (4, 8, 8) self.latent_dim = 3 self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) @@ -453,7 +456,7 @@ class VAE: def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)): encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float() - return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, output_device=self.output_device) + return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device) def decode(self, samples_in): pixel_samples = None @@ -544,7 +547,7 @@ class VAE: return samples - def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None): + def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None): pixel_samples = self.vae_encode_crop_pixels(pixel_samples) dims = self.latent_dim pixel_samples = pixel_samples.movedim(-1, 1) @@ -568,6 +571,12 @@ class VAE: elif dims == 2: samples = self.encode_tiled_(pixel_samples, **args) elif dims == 3: + if overlap_t is None: + args["overlap"] = (1, overlap, overlap) + else: + args["overlap"] = (overlap_t, overlap, overlap) + if tile_t is not None: + args["tile_t"] = tile_t samples = self.encode_tiled_3d(pixel_samples, **args) return samples diff --git a/nodes.py b/nodes.py index d6777df4f..e95abc40b 100644 --- a/nodes.py +++ b/nodes.py @@ -337,15 +337,17 @@ class VAEEncodeTiled: return {"required": {"pixels": ("IMAGE", ), "vae": ("VAE", ), "tile_size": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}), "overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32}), + "temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to encode at a time."}), + "temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap."}), }} RETURN_TYPES = ("LATENT",) FUNCTION = "encode" CATEGORY = "_for_testing" - def encode(self, vae, pixels, tile_size, overlap): - t = vae.encode_tiled(pixels[:,:,:,:3], tile_x=tile_size, tile_y=tile_size, overlap=overlap) - return ({"samples":t}, ) + def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8): + t = vae.encode_tiled(pixels[:,:,:,:3], tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap) + return ({"samples": t}, ) class VAEEncodeForInpaint: @classmethod From 73e04987f7e0f14bdee9baa0aafe61cf7f42a8b2 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 24 Dec 2024 07:36:30 -0500 Subject: [PATCH 11/23] Prevent black images in VAE Decode (Tiled) node. Overlap should be minimum 1 with tiling 2 for tiled temporal VAE decoding. --- comfy/sd.py | 5 +++-- nodes.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 2db00fa44..de3ce677c 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -509,9 +509,10 @@ class VAE: if overlap_t is None: args["overlap"] = (1, overlap, overlap) else: - args["overlap"] = (overlap_t, overlap, overlap) + args["overlap"] = (max(1, overlap_t), overlap, overlap) if tile_t is not None: - args["tile_t"] = tile_t + args["tile_t"] = max(2, tile_t) + output = self.decode_tiled_3d(samples, **args) return output.movedim(1, -1) diff --git a/nodes.py b/nodes.py index e95abc40b..a135a6120 100644 --- a/nodes.py +++ b/nodes.py @@ -304,6 +304,8 @@ class VAEDecodeTiled: def decode(self, vae, samples, tile_size, overlap=64, temporal_size=64, temporal_overlap=8): if tile_size < overlap * 4: overlap = tile_size // 4 + if temporal_size < temporal_overlap * 2: + temporal_overlap = temporal_overlap // 2 temporal_compression = vae.temporal_compression_decode() if temporal_compression is not None: temporal_size = max(2, temporal_size // temporal_compression) From 99a1fb6027b7163592a83669b0b1c5aa4657c2b6 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 24 Dec 2024 18:05:19 -0500 Subject: [PATCH 12/23] Make fast fp8 take a bit less peak memory. --- comfy/ops.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 8e0694232..06be6b48b 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -255,9 +255,10 @@ def fp8_linear(self, input): tensor_2d = True input = input.unsqueeze(1) - + input_shape = input.shape + input_dtype = input.dtype if len(input.shape) == 3: - w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input.dtype) + w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype) w = w.t() scale_weight = self.scale_weight @@ -269,23 +270,24 @@ def fp8_linear(self, input): if scale_input is None: scale_input = torch.ones((), device=input.device, dtype=torch.float32) - inn = torch.clamp(input, min=-448, max=448).reshape(-1, input.shape[2]).to(dtype) + input = torch.clamp(input, min=-448, max=448, out=input) + input = input.reshape(-1, input_shape[2]).to(dtype) else: scale_input = scale_input.to(input.device) - inn = (input * (1.0 / scale_input).to(input.dtype)).reshape(-1, input.shape[2]).to(dtype) + input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype) if bias is not None: - o = torch._scaled_mm(inn, w, out_dtype=input.dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) + o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) else: - o = torch._scaled_mm(inn, w, out_dtype=input.dtype, scale_a=scale_input, scale_b=scale_weight) + o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight) if isinstance(o, tuple): o = o[0] if tensor_2d: - return o.reshape(input.shape[0], -1) + return o.reshape(input_shape[0], -1) - return o.reshape((-1, input.shape[1], self.weight.shape[0])) + return o.reshape((-1, input_shape[1], self.weight.shape[0])) return None From 1ed75ab30ee2fdef6b3b41ad3061583a0fede723 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 25 Dec 2024 03:29:03 -0500 Subject: [PATCH 13/23] Update nightly pytorch instructions in readme for nvidia. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8311b7b7c..371421617 100644 --- a/README.md +++ b/README.md @@ -189,7 +189,7 @@ Nvidia users should install stable pytorch using this command: This is the command to install pytorch nightly instead which might have performance improvements: -```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124``` +```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126``` #### Troubleshooting From 0229228f3f75fc4b0d0d4cf3658138eedc2cc2eb Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 25 Dec 2024 04:50:34 -0500 Subject: [PATCH 14/23] Clean up the VAE dtypes code. --- comfy/model_management.py | 27 ++++++++++++--------------- comfy/sd.py | 4 ++-- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 33891b929..8320c6ece 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -188,6 +188,12 @@ def is_nvidia(): return True return False +def is_amd(): + global cpu_state + if cpu_state == CPUState.GPU: + if torch.version.hip: + return True + return False MIN_WEIGHT_MEMORY_RATIO = 0.4 if is_nvidia(): @@ -198,27 +204,17 @@ if args.use_pytorch_cross_attention: ENABLE_PYTORCH_ATTENTION = True XFORMERS_IS_AVAILABLE = False -VAE_DTYPES = [torch.float32] - try: if is_nvidia(): if int(torch_version[0]) >= 2: if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True - if torch.cuda.is_bf16_supported() and torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8: - VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES if is_intel_xpu(): if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True except: pass -if is_intel_xpu(): - VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES - -if args.cpu_vae: - VAE_DTYPES = [torch.float32] - if ENABLE_PYTORCH_ATTENTION: torch.backends.cuda.enable_math_sdp(True) torch.backends.cuda.enable_flash_sdp(True) @@ -754,7 +750,6 @@ def vae_offload_device(): return torch.device("cpu") def vae_dtype(device=None, allowed_dtypes=[]): - global VAE_DTYPES if args.fp16_vae: return torch.float16 elif args.bf16_vae: @@ -763,12 +758,14 @@ def vae_dtype(device=None, allowed_dtypes=[]): return torch.float32 for d in allowed_dtypes: - if d == torch.float16 and should_use_fp16(device, prioritize_performance=False): - return d - if d in VAE_DTYPES: + if d == torch.float16 and should_use_fp16(device): return d - return VAE_DTYPES[0] + # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 + if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device): + return d + + return torch.float32 def get_autocast_device(dev): if hasattr(dev, 'type'): diff --git a/comfy/sd.py b/comfy/sd.py index de3ce677c..55f91116f 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -111,7 +111,7 @@ class CLIP: model_management.load_models_gpu([self.patcher], force_full_load=True) self.layer_idx = None self.use_clip_schedule = False - logging.debug("CLIP model load device: {}, offload device: {}, current: {}".format(load_device, offload_device, params['device'])) + logging.info("CLIP model load device: {}, offload device: {}, current: {}, dtype: {}".format(load_device, offload_device, params['device'], dtype)) def clone(self): n = CLIP(no_init=True) @@ -402,7 +402,7 @@ class VAE: self.output_device = model_management.intermediate_device() self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device) - logging.debug("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype)) + logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype)) def vae_encode_crop_pixels(self, pixels): downscale_ratio = self.spacial_compression_encode() From b486885e0866b1fc37b767a7ff04c1f40acb5ac4 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 25 Dec 2024 05:18:50 -0500 Subject: [PATCH 15/23] Disable bfloat16 on older mac. --- comfy/model_management.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 8320c6ece..ce241e17f 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -886,14 +886,19 @@ def pytorch_attention_flash_attention(): return True return False +def mac_version(): + try: + return tuple(int(n) for n in platform.mac_ver()[0].split(".")) + except: + return None + def force_upcast_attention_dtype(): upcast = args.force_upcast_attention - try: - macos_version = tuple(int(n) for n in platform.mac_ver()[0].split(".")) - if (14, 5) <= macos_version <= (15, 2): # black image bug on recent versions of macOS - upcast = True - except: - pass + + macos_version = mac_version() + if macos_version is not None and ((14, 5) <= macos_version <= (15, 2)): # black image bug on recent versions of macOS + upcast = True + if upcast: return torch.float32 else: @@ -1034,6 +1039,8 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma return False if mps_mode(): + if mac_version() < (14,): + return False return True if cpu_mode(): From 19a64d62918c68b800de7277472c3b039beaa126 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 25 Dec 2024 05:32:51 -0500 Subject: [PATCH 16/23] Cleanup some mac related code. --- comfy/model_management.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ce241e17f..db2a61395 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -969,17 +969,13 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma if FORCE_FP16: return True - if device is not None: - if is_device_mps(device): - return True - if FORCE_FP32: return False if directml_enabled: return False - if mps_mode(): + if (device is not None and is_device_mps(device)) or mps_mode(): return True if cpu_mode(): @@ -1028,17 +1024,13 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow return False - if device is not None: - if is_device_mps(device): - return True - if FORCE_FP32: return False if directml_enabled: return False - if mps_mode(): + if (device is not None and is_device_mps(device)) or mps_mode(): if mac_version() < (14,): return False return True From ee9547ba31f5f2c1de0211a09c3fb829bd8e25e6 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 26 Dec 2024 07:18:49 -0500 Subject: [PATCH 17/23] Improve temporal VAE Encode (Tiled) math. --- comfy/sd.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 55f91116f..c6d6236b1 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -572,13 +572,20 @@ class VAE: elif dims == 2: samples = self.encode_tiled_(pixel_samples, **args) elif dims == 3: + if tile_t is not None: + tile_t_latent = max(2, self.downscale_ratio[0](tile_t)) + else: + tile_t_latent = 9999 + args["tile_t"] = self.upscale_ratio[0](tile_t_latent) + if overlap_t is None: args["overlap"] = (1, overlap, overlap) else: - args["overlap"] = (overlap_t, overlap, overlap) - if tile_t is not None: - args["tile_t"] = tile_t - samples = self.encode_tiled_3d(pixel_samples, **args) + args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap) + maximum = pixel_samples.shape[2] + maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum)) + + samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], **args) return samples From c4bfdba3301eb8dd2000b1b22e4752a662d4c856 Mon Sep 17 00:00:00 2001 From: Huazhong Ji Date: Fri, 27 Dec 2024 08:36:50 +0800 Subject: [PATCH 18/23] Support ascend npu (#5436) * support ascend npu Co-authored-by: YukMingLaw Co-authored-by: starmountain1997 Co-authored-by: Ginray --- README.md | 10 ++++++++++ comfy/model_management.py | 41 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 371421617..000d76801 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,16 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml``` +#### Ascend NPUs + +For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method: + +1. Begin by installing the recommended or newer kernel version for Linux as specified in the Installation page of torch-npu, if necessary. +2. Proceed with the installation of Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions provided for your specific platform. +3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page. +4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier. + + # Running ```python main.py``` diff --git a/comfy/model_management.py b/comfy/model_management.py index db2a61395..c36c52ffd 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -86,6 +86,13 @@ try: except: pass +try: + import torch_npu + _ = torch.npu.device_count() + npu_available = torch.npu.is_available() +except: + npu_available = False + if args.cpu: cpu_state = CPUState.CPU @@ -97,6 +104,12 @@ def is_intel_xpu(): return True return False +def is_ascend_npu(): + global npu_available + if npu_available: + return True + return False + def get_torch_device(): global directml_enabled global cpu_state @@ -110,6 +123,8 @@ def get_torch_device(): else: if is_intel_xpu(): return torch.device("xpu", torch.xpu.current_device()) + elif is_ascend_npu(): + return torch.device("npu", torch.npu.current_device()) else: return torch.device(torch.cuda.current_device()) @@ -130,6 +145,12 @@ def get_total_memory(dev=None, torch_total_too=False): mem_reserved = stats['reserved_bytes.all.current'] mem_total_torch = mem_reserved mem_total = torch.xpu.get_device_properties(dev).total_memory + elif is_ascend_npu(): + stats = torch.npu.memory_stats(dev) + mem_reserved = stats['reserved_bytes.all.current'] + _, mem_total_npu = torch.npu.mem_get_info(dev) + mem_total_torch = mem_reserved + mem_total = mem_total_npu else: stats = torch.cuda.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] @@ -209,7 +230,7 @@ try: if int(torch_version[0]) >= 2: if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True - if is_intel_xpu(): + if is_intel_xpu() or is_ascend_npu(): if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True except: @@ -274,6 +295,8 @@ def get_torch_device_name(device): return "{}".format(device.type) elif is_intel_xpu(): return "{} {}".format(device, torch.xpu.get_device_name(device)) + elif is_ascend_npu(): + return "{} {}".format(device, torch.npu.get_device_name(device)) else: return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device)) @@ -860,6 +883,8 @@ def xformers_enabled(): return False if is_intel_xpu(): return False + if is_ascend_npu(): + return False if directml_enabled: return False return XFORMERS_IS_AVAILABLE @@ -884,6 +909,8 @@ def pytorch_attention_flash_attention(): return True if is_intel_xpu(): return True + if is_ascend_npu(): + return True return False def mac_version(): @@ -923,6 +950,13 @@ def get_free_memory(dev=None, torch_free_too=False): mem_free_torch = mem_reserved - mem_active mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_total = mem_free_xpu + mem_free_torch + elif is_ascend_npu(): + stats = torch.npu.memory_stats(dev) + mem_active = stats['active_bytes.all.current'] + mem_reserved = stats['reserved_bytes.all.current'] + mem_free_npu, _ = torch.npu.mem_get_info(dev) + mem_free_torch = mem_reserved - mem_active + mem_free_total = mem_free_npu + mem_free_torch else: stats = torch.cuda.memory_stats(dev) mem_active = stats['active_bytes.all.current'] @@ -984,6 +1018,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma if is_intel_xpu(): return True + if is_ascend_npu(): + return True + if torch.version.hip: return True @@ -1081,6 +1118,8 @@ def soft_empty_cache(force=False): torch.mps.empty_cache() elif is_intel_xpu(): torch.xpu.empty_cache() + elif is_ascend_npu(): + torch.npu.empty_cache() elif torch.cuda.is_available(): if force or is_nvidia(): #This seems to make things worse on ROCm so I only do it for cuda torch.cuda.empty_cache() From 160ca081387e6d871487a6caedeb9bbacf073665 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 26 Dec 2024 20:05:54 -0500 Subject: [PATCH 19/23] Use python 3.9 in launch test instead of 3.8 Fix ruff check. --- .github/workflows/test-launch.yml | 2 +- comfy/model_management.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-launch.yml b/.github/workflows/test-launch.yml index 5d665d6af..c56283c2d 100644 --- a/.github/workflows/test-launch.yml +++ b/.github/workflows/test-launch.yml @@ -17,7 +17,7 @@ jobs: path: "ComfyUI" - uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.9' - name: Install requirements run: | python -m pip install --upgrade pip diff --git a/comfy/model_management.py b/comfy/model_management.py index c36c52ffd..731fb5845 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -87,7 +87,7 @@ except: pass try: - import torch_npu + import torch_npu # noqa: F401 _ = torch.npu.device_count() npu_available = torch.npu.is_available() except: From ceb50b2cbfb166b84786d12d313a624273590fab Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 27 Dec 2024 07:30:09 -0500 Subject: [PATCH 20/23] Closer memory estimation for pixart models. --- comfy/supported_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index a5f38b5ed..512515aeb 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -608,6 +608,8 @@ class PixArtAlpha(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.SD15 + memory_usage_factor = 0.5 + vae_key_prefix = ["vae."] text_encoder_key_prefix = ["text_encoders."] From 4b5bcd8ac4e221681e2541c2aa2f665a56ef72de Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 27 Dec 2024 07:37:00 -0500 Subject: [PATCH 21/23] Closer memory estimation for hunyuan dit model. --- comfy/supported_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 512515aeb..6a2cc75ae 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -642,6 +642,8 @@ class HunyuanDiT(supported_models_base.BASE): latent_format = latent_formats.SDXL + memory_usage_factor = 1.3 + vae_key_prefix = ["vae."] text_encoder_key_prefix = ["text_encoders."] From 9cfd185676e1bd0d5642c43bb9ee1f857ecd1be4 Mon Sep 17 00:00:00 2001 From: filtered <176114999+webfiltered@users.noreply.github.com> Date: Sat, 28 Dec 2024 06:40:05 +1100 Subject: [PATCH 22/23] Add option to log non-error output to stdout (#6243) * nit * Add option to log non-error output to stdout - No change to default behaviour - Adds CLI argument: --log-stdout - With this arg present, any logging of a level below logging.ERROR will be sent to stdout instead of stderr --- app/logger.py | 13 ++++++++++++- comfy/cli_args.py | 3 ++- main.py | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/app/logger.py b/app/logger.py index 527be9fe7..9e9f84ccf 100644 --- a/app/logger.py +++ b/app/logger.py @@ -51,7 +51,7 @@ def on_flush(callback): if stderr_interceptor is not None: stderr_interceptor.on_flush(callback) -def setup_logger(log_level: str = 'INFO', capacity: int = 300): +def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool = False): global logs if logs: return @@ -70,4 +70,15 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300): stream_handler = logging.StreamHandler() stream_handler.setFormatter(logging.Formatter("%(message)s")) + + if use_stdout: + # Only errors and critical to stderr + stream_handler.addFilter(lambda record: not record.levelno < logging.ERROR) + + # Lesser to stdout + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setFormatter(logging.Formatter("%(message)s")) + stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR) + logger.addHandler(stdout_handler) + logger.addHandler(stream_handler) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 224c075f0..812798bf8 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -122,7 +122,7 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.") vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).") -parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reverved depending on your OS.") +parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.") parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.") @@ -141,6 +141,7 @@ parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Dis parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.") parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level') +parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).") # The default built-in provider hosted under web/ DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest" diff --git a/main.py b/main.py index ccc99fdc4..95972f73b 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ if __name__ == "__main__": os.environ['DO_NOT_TRACK'] = '1' -setup_logger(log_level=args.verbose) +setup_logger(log_level=args.verbose, use_stdout=args.log_stdout) def apply_custom_paths(): # extra model paths From d170292594770377d9e0442078ef43668e2331b6 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 27 Dec 2024 18:02:21 -0500 Subject: [PATCH 23/23] Remove some trailing white space. --- comfy/controlnet.py | 2 +- comfy/extra_samplers/uni_pc.py | 10 +++++----- comfy/hooks.py | 2 +- comfy/k_diffusion/sampling.py | 2 +- comfy/ldm/modules/sub_quadratic_attention.py | 2 +- comfy/ldm/pixart/pixartms.py | 2 +- comfy/model_detection.py | 4 ++-- comfy/sample.py | 2 +- comfy/sampler_helpers.py | 2 +- comfy/utils.py | 10 +++++----- comfy_extras/nodes_mask.py | 4 ++-- comfy_extras/nodes_perpneg.py | 2 +- comfy_extras/nodes_rebatch.py | 4 ++-- comfy_extras/nodes_tomesd.py | 7 +++---- nodes.py | 20 ++++++++++---------- 15 files changed, 37 insertions(+), 38 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 7f5988377..ee29251b9 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -120,7 +120,7 @@ class ControlBase: if self.previous_controlnet is not None: out += self.previous_controlnet.get_models() return out - + def get_extra_hooks(self): out = [] if self.extra_hooks is not None: diff --git a/comfy/extra_samplers/uni_pc.py b/comfy/extra_samplers/uni_pc.py index b61baaa8e..77d20bbf5 100644 --- a/comfy/extra_samplers/uni_pc.py +++ b/comfy/extra_samplers/uni_pc.py @@ -80,7 +80,7 @@ class NoiseScheduleVP: 'linear' or 'cosine' for continuous-time DPMs. Returns: A wrapper object of the forward SDE (VP type). - + =============================================================== Example: @@ -208,7 +208,7 @@ def model_wrapper( arXiv preprint arXiv:2202.00512 (2022). [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models." arXiv preprint arXiv:2210.02303 (2022). - + 4. "score": marginal score function. (Trained by denoising score matching). Note that the score function and the noise prediction model follows a simple relationship: ``` @@ -245,7 +245,7 @@ def model_wrapper( [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance." arXiv preprint arXiv:2207.12598 (2022). - + The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999) or continuous-time labels (i.e. epsilon to T). @@ -621,7 +621,7 @@ class UniPC: B_h = torch.expm1(hh) else: raise NotImplementedError() - + for i in range(1, order + 1): R.append(torch.pow(rks, i - 1)) b.append(h_phi_k * factorial_i / B_h) @@ -870,4 +870,4 @@ def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=F return x def sample_unipc_bh2(model, noise, sigmas, extra_args=None, callback=None, disable=False): - return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2') \ No newline at end of file + return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2') diff --git a/comfy/hooks.py b/comfy/hooks.py index b6f0ac213..7f9eee301 100644 --- a/comfy/hooks.py +++ b/comfy/hooks.py @@ -101,7 +101,7 @@ class WeightHook(Hook): self.need_weight_init = True self._strength_model = strength_model self._strength_clip = strength_clip - + @property def strength_model(self): return self._strength_model * self.strength diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index f08370f83..0f7cc4ca9 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1230,7 +1230,7 @@ def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, dis nonlocal uncond_denoised uncond_denoised = args["uncond_denoised"] return args["denoised"] - + model_options = extra_args.get("model_options", {}).copy() extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True) diff --git a/comfy/ldm/modules/sub_quadratic_attention.py b/comfy/ldm/modules/sub_quadratic_attention.py index 7c5f1d9f9..fca8d1178 100644 --- a/comfy/ldm/modules/sub_quadratic_attention.py +++ b/comfy/ldm/modules/sub_quadratic_attention.py @@ -261,7 +261,7 @@ def efficient_dot_product_attention( value=value, mask=mask, ) - + # TODO: maybe we should use torch.empty_like(query) to allocate storage in-advance, # and pass slices to be mutated, instead of torch.cat()ing the returned slices res = torch.cat([ diff --git a/comfy/ldm/pixart/pixartms.py b/comfy/ldm/pixart/pixartms.py index 50dc58c23..7d4eebdce 100644 --- a/comfy/ldm/pixart/pixartms.py +++ b/comfy/ldm/pixart/pixartms.py @@ -223,7 +223,7 @@ class PixArtMS(nn.Module): if self.micro_conditioning: if c_size is None: c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1) - + if c_ar is None: c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index c53bef5bb..de00f773e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -229,7 +229,7 @@ def detect_unet_config(state_dict, key_prefix): if pe_key in state_dict_keys: dit_config["input_size"] = int(math.sqrt(state_dict[pe_key].shape[1])) * patch_size dit_config["pe_interpolation"] = dit_config["input_size"] // (512//8) # guess - + ar_key = "{}ar_embedder.mlp.0.weight".format(key_prefix) if ar_key in state_dict_keys: dit_config["image_model"] = "pixart_alpha" @@ -571,7 +571,7 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None): 'transformer_depth': [0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 1, 1, 1, 1], 'use_temporal_attention': False, 'use_temporal_resblock': False} - + SD15_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None, 'dtype': dtype, 'in_channels': 9, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8, diff --git a/comfy/sample.py b/comfy/sample.py index 98dcaca7f..9974e0657 100644 --- a/comfy/sample.py +++ b/comfy/sample.py @@ -13,7 +13,7 @@ def prepare_noise(latent_image, seed, noise_inds=None): generator = torch.manual_seed(seed) if noise_inds is None: return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu") - + unique_inds, inverse = np.unique(noise_inds, return_inverse=True) noises = [] for i in range(unique_inds[-1]+1): diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py index 0691de63f..ac9735369 100644 --- a/comfy/sampler_helpers.py +++ b/comfy/sampler_helpers.py @@ -42,7 +42,7 @@ def get_hooks_from_cond(cond, hooks_dict: dict[comfy.hooks.EnumHookType, dict[co if cnet.previous_controlnet is None: return _list return get_extra_hooks_from_cnet(cnet.previous_controlnet, _list) - + hooks_list = [] cnets = set(cnets) for base_cnet in cnets: diff --git a/comfy/utils.py b/comfy/utils.py index 7de659337..de64b91df 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -455,7 +455,7 @@ def pixart_to_diffusers(mmdit_config, output_prefix=""): for k in PIXART_MAP_BASIC: key_map[k[1]] = "{}{}".format(output_prefix, k[0]) - + return key_map def auraflow_to_diffusers(mmdit_config, output_prefix=""): @@ -702,7 +702,7 @@ def get_attr(obj, attr): def bislerp(samples, width, height): def slerp(b1, b2, r): '''slerps batches b1, b2 according to ratio r, batches should be flat e.g. NxC''' - + c = b1.shape[-1] #norms @@ -730,13 +730,13 @@ def bislerp(samples, width, height): res[dot > 1 - 1e-5] = b1[dot > 1 - 1e-5] res[dot < 1e-5 - 1] = (b1 * (1.0-r) + b2 * r)[dot < 1e-5 - 1] return res - + def generate_bilinear_data(length_old, length_new, device): coords_1 = torch.arange(length_old, dtype=torch.float32, device=device).reshape((1,1,1,-1)) coords_1 = torch.nn.functional.interpolate(coords_1, size=(1, length_new), mode="bilinear") ratios = coords_1 - coords_1.floor() coords_1 = coords_1.to(torch.int64) - + coords_2 = torch.arange(length_old, dtype=torch.float32, device=device).reshape((1,1,1,-1)) + 1 coords_2[:,:,:,-1] -= 1 coords_2 = torch.nn.functional.interpolate(coords_2, size=(1, length_new), mode="bilinear") @@ -747,7 +747,7 @@ def bislerp(samples, width, height): samples = samples.float() n,c,h,w = samples.shape h_new, w_new = (height, width) - + #linear w ratios, coords_1, coords_2 = generate_bilinear_data(w, w_new, samples.device) coords_1 = coords_1.expand((n, c, h, -1)) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 29589b4ab..63fd13b9a 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -305,7 +305,7 @@ class FeatherMask: output[:, -y, :] *= feather_rate return (output,) - + class GrowMask: @classmethod def INPUT_TYPES(cls): @@ -316,7 +316,7 @@ class GrowMask: "tapered_corners": ("BOOLEAN", {"default": True}), }, } - + CATEGORY = "mask" RETURN_TYPES = ("MASK",) diff --git a/comfy_extras/nodes_perpneg.py b/comfy_extras/nodes_perpneg.py index 762c40220..290bc4a51 100644 --- a/comfy_extras/nodes_perpneg.py +++ b/comfy_extras/nodes_perpneg.py @@ -64,7 +64,7 @@ class Guider_PerpNeg(comfy.samplers.CFGGuider): def predict_noise(self, x, timestep, model_options={}, seed=None): # in CFGGuider.predict_noise, we call sampling_function(), which uses cfg_function() to compute pos & neg # but we'd rather do a single batch of sampling pos, neg, and empty, so we call calc_cond_batch([pos,neg,empty]) directly - + positive_cond = self.conds.get("positive", None) negative_cond = self.conds.get("negative", None) empty_cond = self.conds.get("empty_negative_prompt", None) diff --git a/comfy_extras/nodes_rebatch.py b/comfy_extras/nodes_rebatch.py index 3010fbd4b..e29cb9ed1 100644 --- a/comfy_extras/nodes_rebatch.py +++ b/comfy_extras/nodes_rebatch.py @@ -40,7 +40,7 @@ class LatentRebatch: return slices, indexable[num * batch_size:] else: return slices, None - + @staticmethod def slice_batch(batch, num, batch_size): result = [LatentRebatch.get_slices(x, num, batch_size) for x in batch] @@ -81,7 +81,7 @@ class LatentRebatch: if current_batch[0].shape[0] > batch_size: num = current_batch[0].shape[0] // batch_size sliced, remainder = self.slice_batch(current_batch, num, batch_size) - + for i in range(num): output_list.append({'samples': sliced[0][i], 'noise_mask': sliced[1][i], 'batch_index': sliced[2][i]}) diff --git a/comfy_extras/nodes_tomesd.py b/comfy_extras/nodes_tomesd.py index ce7b32c77..9f77c06fc 100644 --- a/comfy_extras/nodes_tomesd.py +++ b/comfy_extras/nodes_tomesd.py @@ -40,9 +40,8 @@ def bipartite_soft_matching_random2d(metric: torch.Tensor, return do_nothing, do_nothing gather = mps_gather_workaround if metric.device.type == "mps" else torch.gather - + with torch.no_grad(): - hsy, wsx = h // sy, w // sx # For each sy by sx kernel, randomly assign one token to be dst and the rest src @@ -50,7 +49,7 @@ def bipartite_soft_matching_random2d(metric: torch.Tensor, rand_idx = torch.zeros(hsy, wsx, 1, device=metric.device, dtype=torch.int64) else: rand_idx = torch.randint(sy*sx, size=(hsy, wsx, 1), device=metric.device) - + # The image might not divide sx and sy, so we need to work on a view of the top left if the idx buffer instead idx_buffer_view = torch.zeros(hsy, wsx, sy*sx, device=metric.device, dtype=torch.int64) idx_buffer_view.scatter_(dim=2, index=rand_idx, src=-torch.ones_like(rand_idx, dtype=rand_idx.dtype)) @@ -99,7 +98,7 @@ def bipartite_soft_matching_random2d(metric: torch.Tensor, def merge(x: torch.Tensor, mode="mean") -> torch.Tensor: src, dst = split(x) n, t1, c = src.shape - + unm = gather(src, dim=-2, index=unm_idx.expand(n, t1 - r, c)) src = gather(src, dim=-2, index=src_idx.expand(n, r, c)) dst = dst.scatter_reduce(-2, dst_idx.expand(n, r, c), src, reduce=mode) diff --git a/nodes.py b/nodes.py index a135a6120..89cecc480 100644 --- a/nodes.py +++ b/nodes.py @@ -65,7 +65,7 @@ class CLIPTextEncode(ComfyNodeABC): def encode(self, clip, text): tokens = clip.tokenize(text) return (clip.encode_from_tokens_scheduled(tokens), ) - + class ConditioningCombine: @classmethod @@ -641,7 +641,7 @@ class LoraLoader: "strength_clip": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the CLIP model. This value can be negative."}), } } - + RETURN_TYPES = ("MODEL", "CLIP") OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.") FUNCTION = "load_lora" @@ -1211,7 +1211,7 @@ class LatentFromBatch: else: s["batch_index"] = samples["batch_index"][batch_index:batch_index + length] return (s,) - + class RepeatLatentBatch: @classmethod def INPUT_TYPES(s): @@ -1226,7 +1226,7 @@ class RepeatLatentBatch: def repeat(self, samples, amount): s = samples.copy() s_in = samples["samples"] - + s["samples"] = s_in.repeat((amount, 1,1,1)) if "noise_mask" in samples and samples["noise_mask"].shape[0] > 1: masks = samples["noise_mask"] @@ -1636,15 +1636,15 @@ class LoadImage: FUNCTION = "load_image" def load_image(self, image): image_path = folder_paths.get_annotated_filepath(image) - + img = node_helpers.pillow(Image.open, image_path) - + output_images = [] output_masks = [] w, h = None, None excluded_formats = ['MPO'] - + for i in ImageSequence.Iterator(img): i = node_helpers.pillow(ImageOps.exif_transpose, i) @@ -1655,10 +1655,10 @@ class LoadImage: if len(output_images) == 0: w = image.size[0] h = image.size[1] - + if image.size[0] != w or image.size[1] != h: continue - + image = np.array(image).astype(np.float32) / 255.0 image = torch.from_numpy(image)[None,] if 'A' in i.getbands(): @@ -2234,5 +2234,5 @@ def init_extra_nodes(init_custom_nodes=True): else: logging.warning("Please do a: pip install -r requirements.txt") logging.warning("") - + return import_failed