From 80f07952d25227213c72941824401ef432584a2a Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Sun, 22 Dec 2024 23:20:17 -0500
Subject: [PATCH 01/23] Fix lowvram issue with ltxv vae.

---
 comfy/ldm/lightricks/vae/causal_video_autoencoder.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
index 4d43feb22..e0344deec 100644
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@@ -378,7 +378,7 @@ class Decoder(nn.Module):
             assert (
                 timestep is not None
             ), "should pass timestep with timestep_conditioning=True"
-            scaled_timestep = timestep * self.timestep_scale_multiplier
+            scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device)
 
         for up_block in self.up_blocks:
             if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
@@ -403,7 +403,7 @@ class Decoder(nn.Module):
             )
             ada_values = self.last_scale_shift_table[
                 None, ..., None, None, None
-            ] + embedded_timestep.reshape(
+            ].to(device=sample.device, dtype=sample.dtype) + embedded_timestep.reshape(
                 batch_size,
                 2,
                 -1,
@@ -697,7 +697,7 @@ class ResnetBlock3D(nn.Module):
             ), "should pass timestep with timestep_conditioning=True"
             ada_values = self.scale_shift_table[
                 None, ..., None, None, None
-            ] + timestep.reshape(
+            ].to(device=hidden_states.device, dtype=hidden_states.dtype) + timestep.reshape(
                 batch_size,
                 4,
                 -1,
@@ -715,7 +715,7 @@ class ResnetBlock3D(nn.Module):
 
         if self.inject_noise:
             hidden_states = self._feed_spatial_noise(
-                hidden_states, self.per_channel_scale1
+                hidden_states, self.per_channel_scale1.to(device=hidden_states.device, dtype=hidden_states.dtype)
             )
 
         hidden_states = self.norm2(hidden_states)
@@ -731,7 +731,7 @@ class ResnetBlock3D(nn.Module):
 
         if self.inject_noise:
             hidden_states = self._feed_spatial_noise(
-                hidden_states, self.per_channel_scale2
+                hidden_states, self.per_channel_scale2.to(device=hidden_states.device, dtype=hidden_states.dtype)
             )
 
         input_tensor = self.norm3(input_tensor)

From f7d83b72e0d4dd27ce6e54ef77dfb2ae4cb0edcd Mon Sep 17 00:00:00 2001
From: zhangp365 <144313702+zhangp365@users.noreply.github.com>
Date: Mon, 23 Dec 2024 12:44:20 +0800
Subject: [PATCH 02/23] Fix xformers version parsing with a local "+" suffix in ldm/pixart/blocks.py (#6158)

---
 comfy/ldm/pixart/blocks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy/ldm/pixart/blocks.py b/comfy/ldm/pixart/blocks.py
index 48b27008c..967a224a3 100644
--- a/comfy/ldm/pixart/blocks.py
+++ b/comfy/ldm/pixart/blocks.py
@@ -12,7 +12,7 @@ from comfy.ldm.modules.attention import optimized_attention
 
 if model_management.xformers_enabled():
     import xformers.ops
-    if int((xformers.__version__).split(".")[2]) >= 28:
+    if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28:
         block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens
     else:
         block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens

From 56bc64f3514bc61bdafb8e8f7986c7ebc86d5e9d Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Sun, 22 Dec 2024 23:51:14 -0500
Subject: [PATCH 03/23] Comment out some useless code.

---
 comfy/ldm/pixart/blocks.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/comfy/ldm/pixart/blocks.py b/comfy/ldm/pixart/blocks.py
index 967a224a3..40b0663e5 100644
--- a/comfy/ldm/pixart/blocks.py
+++ b/comfy/ldm/pixart/blocks.py
@@ -10,12 +10,12 @@ from comfy import model_management
 from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, Mlp, timestep_embedding
 from comfy.ldm.modules.attention import optimized_attention
 
-if model_management.xformers_enabled():
-    import xformers.ops
-    if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28:
-        block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens
-    else:
-        block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens
+# if model_management.xformers_enabled():
+#     import xformers.ops
+#     if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28:
+#         block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens
+#     else:
+#         block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens
 
 def modulate(x, shift, scale):
     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

From e44d0ac7f77820e8339d20fe3c0698bf8a5e9347 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Mon, 23 Dec 2024 01:50:11 -0500
Subject: [PATCH 04/23] Make --novram completely offload weights.

This flag is mainly used for testing the weight offloading, it shouldn't
actually be used in practice.

Remove useless import.
---
 comfy/ldm/pixart/blocks.py | 1 -
 comfy/model_management.py  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/comfy/ldm/pixart/blocks.py b/comfy/ldm/pixart/blocks.py
index 40b0663e5..2225076e5 100644
--- a/comfy/ldm/pixart/blocks.py
+++ b/comfy/ldm/pixart/blocks.py
@@ -6,7 +6,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 
-from comfy import model_management
 from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, Mlp, timestep_embedding
 from comfy.ldm.modules.attention import optimized_attention
 
diff --git a/comfy/model_management.py b/comfy/model_management.py
index b480aaaa4..d77ae8c06 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -521,7 +521,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
                 lowvram_model_memory = 0
 
         if vram_set_state == VRAMState.NO_VRAM:
-            lowvram_model_memory = 64 * 1024 * 1024
+            lowvram_model_memory = 0.1
 
         loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights)
         current_loaded_models.insert(0, loaded_model)

From c6b9c11ef6e90e47ed6db9520fffd4f5c37e9dca Mon Sep 17 00:00:00 2001
From: Simon Lui <502929+simonlui@users.noreply.github.com>
Date: Mon, 23 Dec 2024 00:18:32 -0800
Subject: [PATCH 05/23] Add oneAPI device selector for xpu and some other
 changes. (#6112)

* Add oneAPI device selector and some other minor changes.

* Fix device selector variable name.

* Flip minor version check sign.

* Undo changes to README.md.
---
 comfy/cli_args.py         | 3 ++-
 comfy/model_management.py | 6 ++++--
 main.py                   | 4 ++++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 4c6545011..224c075f0 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -84,7 +84,8 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
 
 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
 
-parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")
+parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
+parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
 
 class LatentPreviewMethod(enum.Enum):
     NoPreviews = "none"
diff --git a/comfy/model_management.py b/comfy/model_management.py
index d77ae8c06..244fa5c73 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -75,7 +75,7 @@ if args.directml is not None:
 try:
     import intel_extension_for_pytorch as ipex
     _ = torch.xpu.device_count()
-    xpu_available = torch.xpu.is_available()
+    xpu_available = xpu_available or torch.xpu.is_available()
 except:
     xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available())
 
@@ -219,12 +219,14 @@ if is_intel_xpu():
 if args.cpu_vae:
     VAE_DTYPES = [torch.float32]
 
-
 if ENABLE_PYTORCH_ATTENTION:
     torch.backends.cuda.enable_math_sdp(True)
     torch.backends.cuda.enable_flash_sdp(True)
     torch.backends.cuda.enable_mem_efficient_sdp(True)
 
+if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5:
+    torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
+
 if args.lowvram:
     set_vram_to = VRAMState.LOW_VRAM
     lowvram_available = True
diff --git a/main.py b/main.py
index b65046535..151b264cd 100644
--- a/main.py
+++ b/main.py
@@ -114,6 +114,10 @@ if __name__ == "__main__":
         os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device)
         logging.info("Set cuda device to: {}".format(args.cuda_device))
 
+    if args.oneapi_device_selector is not None:
+        os.environ['ONEAPI_DEVICE_SELECTOR'] = args.oneapi_device_selector
+        logging.info("Set oneapi device selector to: {}".format(args.oneapi_device_selector))
+
     if args.deterministic:
         if 'CUBLAS_WORKSPACE_CONFIG' not in os.environ:
             os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"

From 15564688edb9252ffe0b4f284a814ef2cd546446 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Mon, 23 Dec 2024 03:22:48 -0500
Subject: [PATCH 06/23] Add a try except block so if torch version is weird it
 won't crash.

---
 comfy/model_management.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 244fa5c73..33891b929 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -224,8 +224,11 @@ if ENABLE_PYTORCH_ATTENTION:
     torch.backends.cuda.enable_flash_sdp(True)
     torch.backends.cuda.enable_mem_efficient_sdp(True)
 
-if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5:
-    torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
+try:
+    if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5:
+        torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
+except:
+    logging.warning("Warning, could not set allow_fp16_bf16_reduction_math_sdp")
 
 if args.lowvram:
     set_vram_to = VRAMState.LOW_VRAM

From f18ebbd31645437afaa9738fcf2b5ed8b48cb021 Mon Sep 17 00:00:00 2001
From: Chenlei Hu <chenlei.hu@mail.utoronto.ca>
Date: Mon, 23 Dec 2024 03:29:42 -0500
Subject: [PATCH 07/23] Use raw dir name to serve static web content (#6107)

---
 server.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/server.py b/server.py
index ddd71e062..22525507a 100644
--- a/server.py
+++ b/server.py
@@ -714,9 +714,7 @@ class PromptServer():
         self.app.add_routes(self.routes)
 
         for name, dir in nodes.EXTENSION_WEB_DIRS.items():
-            self.app.add_routes([
-                web.static('/extensions/' + urllib.parse.quote(name), dir),
-            ])
+            self.app.add_routes([web.static('/extensions/' + name, dir)])
 
         self.app.add_routes([
             web.static('/', self.web_root),

From bc6dac4327a838f8583f6272cc3cc612b9b16134 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Mon, 23 Dec 2024 20:03:37 -0500
Subject: [PATCH 08/23] Add temporal tiling to VAE Decode (Tiled) node.

You can now do tiled VAE decoding on the temporal direction for videos.
---
 comfy/sd.py    | 22 ++++++++++++++++++++--
 comfy/utils.py | 26 ++++++++++++++++++++++++--
 nodes.py       | 14 ++++++++++++--
 3 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index f79eacc24..e85f2ed77 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -259,6 +259,9 @@ class VAE:
         self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
         self.working_dtypes = [torch.bfloat16, torch.float32]
 
+        self.downscale_index_formula = None
+        self.upscale_index_formula = None
+
         if config is None:
             if "decoder.mid.block_1.mix_factor" in sd:
                 encoder_config = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
@@ -338,6 +341,7 @@ class VAE:
                 self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
                 self.memory_used_encode = lambda shape, dtype: (1.5 * max(shape[2], 7) * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
                 self.upscale_ratio = (lambda a: max(0, a * 6 - 5), 8, 8)
+                self.upscale_index_formula = (lambda a: max(0, a * 6), 8, 8)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 5) / 6)), 8, 8)
                 self.working_dtypes = [torch.float16, torch.float32]
             elif "decoder.up_blocks.0.res_blocks.0.conv1.conv.weight" in sd: #lightricks ltxv
@@ -353,6 +357,7 @@ class VAE:
                 self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
                 self.memory_used_encode = lambda shape, dtype: (70 * max(shape[2], 7) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
                 self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 32, 32)
+                self.upscale_index_formula = (lambda a: max(0, a * 8), 32, 32)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 7) / 8)), 32, 32)
                 self.working_dtypes = [torch.bfloat16, torch.float32]
             elif "decoder.conv_in.conv.weight" in sd:
@@ -360,6 +365,7 @@ class VAE:
                 ddconfig["conv3d"] = True
                 ddconfig["time_compress"] = 4
                 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
+                self.upscale_index_formula = (lambda a: max(0, a * 4), 8, 8)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
                 self.latent_dim = 3
                 self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
@@ -426,7 +432,7 @@ class VAE:
 
     def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
         decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
-        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))
+        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, index_formulas=self.upscale_index_formula, output_device=self.output_device))
 
     def encode_tiled_(self, pixel_samples, tile_x=512, tile_y=512, overlap = 64):
         steps = pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x, tile_y, overlap)
@@ -479,7 +485,7 @@ class VAE:
         pixel_samples = pixel_samples.to(self.output_device).movedim(1,-1)
         return pixel_samples
 
-    def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None):
+    def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
         memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile
         model_management.load_models_gpu([self.patcher], memory_required=memory_used)
         dims = samples.ndim - 2
@@ -497,6 +503,12 @@ class VAE:
         elif dims == 2:
             output = self.decode_tiled_(samples, **args)
         elif dims == 3:
+            if overlap_t is None:
+                args["overlap"] = (1, overlap, overlap)
+            else:
+                args["overlap"] = (overlap_t, overlap, overlap)
+            if tile_t is not None:
+                args["tile_t"] = tile_t
             output = self.decode_tiled_3d(samples, **args)
         return output.movedim(1, -1)
 
@@ -575,6 +587,12 @@ class VAE:
         except:
             return self.downscale_ratio
 
+    def temporal_compression_decode(self):
+        try:
+            return round(self.upscale_ratio[0](8192) / 8192)
+        except:
+            return None
+
 class StyleModel:
     def __init__(self, model, device="cpu"):
         self.model = model
diff --git a/comfy/utils.py b/comfy/utils.py
index 5fb5418b5..7de659337 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -822,7 +822,7 @@ def get_tiled_scale_steps(width, height, tile_x, tile_y, overlap):
     return rows * cols
 
 @torch.inference_mode()
-def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_amount=4, out_channels=3, output_device="cpu", downscale=False, pbar=None):
+def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_amount=4, out_channels=3, output_device="cpu", downscale=False, index_formulas=None, pbar=None):
     dims = len(tile)
 
     if not (isinstance(upscale_amount, (tuple, list))):
@@ -831,6 +831,12 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am
     if not (isinstance(overlap, (tuple, list))):
         overlap = [overlap] * dims
 
+    if index_formulas is None:
+        index_formulas = upscale_amount
+
+    if not (isinstance(index_formulas, (tuple, list))):
+        index_formulas = [index_formulas] * dims
+
     def get_upscale(dim, val):
         up = upscale_amount[dim]
         if callable(up):
@@ -845,10 +851,26 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am
         else:
             return val / up
 
+    def get_upscale_pos(dim, val):
+        up = index_formulas[dim]
+        if callable(up):
+            return up(val)
+        else:
+            return up * val
+
+    def get_downscale_pos(dim, val):
+        up = index_formulas[dim]
+        if callable(up):
+            return up(val)
+        else:
+            return val / up
+
     if downscale:
         get_scale = get_downscale
+        get_pos = get_downscale_pos
     else:
         get_scale = get_upscale
+        get_pos = get_upscale_pos
 
     def mult_list_upscale(a):
         out = []
@@ -881,7 +903,7 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am
                 pos = max(0, min(s.shape[d + 2] - overlap[d], it[d]))
                 l = min(tile[d], s.shape[d + 2] - pos)
                 s_in = s_in.narrow(d + 2, pos, l)
-                upscaled.append(round(get_scale(d, pos)))
+                upscaled.append(round(get_pos(d, pos)))
 
             ps = function(s_in).to(output_device)
             mask = torch.ones_like(ps)
diff --git a/nodes.py b/nodes.py
index bdea7564b..d6777df4f 100644
--- a/nodes.py
+++ b/nodes.py
@@ -293,17 +293,27 @@ class VAEDecodeTiled:
         return {"required": {"samples": ("LATENT", ), "vae": ("VAE", ),
                              "tile_size": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 32}),
                              "overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32}),
+                             "temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to decode at a time."}),
+                             "temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap."}),
                             }}
     RETURN_TYPES = ("IMAGE",)
     FUNCTION = "decode"
 
     CATEGORY = "_for_testing"
 
-    def decode(self, vae, samples, tile_size, overlap=64):
+    def decode(self, vae, samples, tile_size, overlap=64, temporal_size=64, temporal_overlap=8):
         if tile_size < overlap * 4:
             overlap = tile_size // 4
+        temporal_compression = vae.temporal_compression_decode()
+        if temporal_compression is not None:
+            temporal_size = max(2, temporal_size // temporal_compression)
+            temporal_overlap = max(1, min(temporal_size // 2, temporal_overlap // temporal_compression))  # clamp to [1, half a temporal tile] in latent frames
+        else:
+            temporal_size = None
+            temporal_overlap = None
+
         compression = vae.spacial_compression_decode()
-        images = vae.decode_tiled(samples["samples"], tile_x=tile_size // compression, tile_y=tile_size // compression, overlap=overlap // compression)
+        images = vae.decode_tiled(samples["samples"], tile_x=tile_size // compression, tile_y=tile_size // compression, overlap=overlap // compression, tile_t=temporal_size, overlap_t=temporal_overlap)
         if len(images.shape) == 5: #Combine batches
             images = images.reshape(-1, images.shape[-3], images.shape[-2], images.shape[-1])
         return (images, )

From 26e0ba8f8cf786575fc1324acb858ad81f3ef9d6 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Tue, 24 Dec 2024 14:38:52 +0300
Subject: [PATCH 09/23] Enable External Event Loop Integration for ComfyUI
 [refactor] (#6114)

* Refactor main.py to support external event loop integration

* added optional "asyncio_loop" argument to allow using existing event loop

---------

Signed-off-by: bigcat88 <bigcat88@icloud.com>
---
 main.py | 60 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/main.py b/main.py
index 151b264cd..ccc99fdc4 100644
--- a/main.py
+++ b/main.py
@@ -150,9 +150,10 @@ def cuda_malloc_warning():
         if cuda_malloc_warning:
             logging.warning("\nWARNING: this card most likely does not support cuda-malloc, if you get \"CUDA error\" please run ComfyUI with: --disable-cuda-malloc\n")
 
-def prompt_worker(q, server):
+
+def prompt_worker(q, server_instance):
     current_time: float = 0.0
-    e = execution.PromptExecutor(server, lru_size=args.cache_lru)
+    e = execution.PromptExecutor(server_instance, lru_size=args.cache_lru)
     last_gc_collect = 0
     need_gc = False
     gc_collect_interval = 10.0
@@ -167,7 +168,7 @@ def prompt_worker(q, server):
             item, item_id = queue_item
             execution_start_time = time.perf_counter()
             prompt_id = item[1]
-            server.last_prompt_id = prompt_id
+            server_instance.last_prompt_id = prompt_id
 
             e.execute(item[2], prompt_id, item[3], item[4])
             need_gc = True
@@ -177,8 +178,8 @@ def prompt_worker(q, server):
                             status_str='success' if e.success else 'error',
                             completed=e.success,
                             messages=e.status_messages))
-            if server.client_id is not None:
-                server.send_sync("executing", { "node": None, "prompt_id": prompt_id }, server.client_id)
+            if server_instance.client_id is not None:
+                server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id)
 
             current_time = time.perf_counter()
             execution_time = current_time - execution_start_time
@@ -205,21 +206,23 @@ def prompt_worker(q, server):
                 last_gc_collect = current_time
                 need_gc = False
 
-async def run(server, address='', port=8188, verbose=True, call_on_start=None):
+
+async def run(server_instance, address='', port=8188, verbose=True, call_on_start=None):
     addresses = []
     for addr in address.split(","):
         addresses.append((addr, port))
-    await asyncio.gather(server.start_multi_address(addresses, call_on_start), server.publish_loop())
+    await asyncio.gather(server_instance.start_multi_address(addresses, call_on_start), server_instance.publish_loop())
 
 
-def hijack_progress(server):
+def hijack_progress(server_instance):
     def hook(value, total, preview_image):
         comfy.model_management.throw_exception_if_processing_interrupted()
-        progress = {"value": value, "max": total, "prompt_id": server.last_prompt_id, "node": server.last_node_id}
+        progress = {"value": value, "max": total, "prompt_id": server_instance.last_prompt_id, "node": server_instance.last_node_id}
 
-        server.send_sync("progress", progress, server.client_id)
+        server_instance.send_sync("progress", progress, server_instance.client_id)
         if preview_image is not None:
-            server.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server.client_id)
+            server_instance.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server_instance.client_id)
+
     comfy.utils.set_progress_bar_global_hook(hook)
 
 
@@ -229,7 +232,11 @@ def cleanup_temp():
         shutil.rmtree(temp_dir, ignore_errors=True)
 
 
-if __name__ == "__main__":
+def start_comfyui(asyncio_loop=None):
+    """
+    Starts the ComfyUI server using the provided asyncio event loop or creates a new one.
+    Returns the event loop, server instance, and a function to start the server asynchronously.
+    """
     if args.temp_directory:
         temp_dir = os.path.join(os.path.abspath(args.temp_directory), "temp")
         logging.info(f"Setting temp directory to: {temp_dir}")
@@ -243,19 +250,20 @@ if __name__ == "__main__":
         except:
             pass
 
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    server = server.PromptServer(loop)
-    q = execution.PromptQueue(server)
+    if not asyncio_loop:
+        asyncio_loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(asyncio_loop)
+    prompt_server = server.PromptServer(asyncio_loop)
+    q = execution.PromptQueue(prompt_server)
 
     nodes.init_extra_nodes(init_custom_nodes=not args.disable_all_custom_nodes)
 
     cuda_malloc_warning()
 
-    server.add_routes()
-    hijack_progress(server)
+    prompt_server.add_routes()
+    hijack_progress(prompt_server)
 
-    threading.Thread(target=prompt_worker, daemon=True, args=(q, server,)).start()
+    threading.Thread(target=prompt_worker, daemon=True, args=(q, prompt_server,)).start()
 
     if args.quick_test_for_ci:
         exit(0)
@@ -272,9 +280,19 @@ if __name__ == "__main__":
             webbrowser.open(f"{scheme}://{address}:{port}")
         call_on_start = startup_server
 
+    async def start_all():
+        await prompt_server.setup()
+        await run(prompt_server, address=args.listen, port=args.port, verbose=not args.dont_print_server, call_on_start=call_on_start)
+
+    # Returning these so that other code can integrate with the ComfyUI loop and server
+    return asyncio_loop, prompt_server, start_all
+
+
+if __name__ == "__main__":
+    # Running directly, just start ComfyUI.
+    event_loop, _, start_all_func = start_comfyui()
     try:
-        loop.run_until_complete(server.setup())
-        loop.run_until_complete(run(server, address=args.listen, port=args.port, verbose=not args.dont_print_server, call_on_start=call_on_start))
+        event_loop.run_until_complete(start_all_func())
     except KeyboardInterrupt:
         logging.info("\nStopped server")
 

From 5388df784acc0f42da1d54fb379b25ad079864cd Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Tue, 24 Dec 2024 07:10:09 -0500
Subject: [PATCH 10/23] Add temporal tiling to VAE Encode (Tiled) node.

---
 comfy/sd.py | 19 ++++++++++++++-----
 nodes.py    |  8 +++++---
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index e85f2ed77..2db00fa44 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -341,8 +341,9 @@ class VAE:
                 self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
                 self.memory_used_encode = lambda shape, dtype: (1.5 * max(shape[2], 7) * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
                 self.upscale_ratio = (lambda a: max(0, a * 6 - 5), 8, 8)
-                self.upscale_index_formula = (lambda a: max(0, a * 6), 8, 8)
+                self.upscale_index_formula = (6, 8, 8)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 5) / 6)), 8, 8)
+                self.downscale_index_formula = (6, 8, 8)
                 self.working_dtypes = [torch.float16, torch.float32]
             elif "decoder.up_blocks.0.res_blocks.0.conv1.conv.weight" in sd: #lightricks ltxv
                 tensor_conv1 = sd["decoder.up_blocks.0.res_blocks.0.conv1.conv.weight"]
@@ -357,16 +358,18 @@ class VAE:
                 self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
                 self.memory_used_encode = lambda shape, dtype: (70 * max(shape[2], 7) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
                 self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 32, 32)
-                self.upscale_index_formula = (lambda a: max(0, a * 8), 32, 32)
+                self.upscale_index_formula = (8, 32, 32)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 7) / 8)), 32, 32)
+                self.downscale_index_formula = (8, 32, 32)
                 self.working_dtypes = [torch.bfloat16, torch.float32]
             elif "decoder.conv_in.conv.weight" in sd:
                 ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                 ddconfig["conv3d"] = True
                 ddconfig["time_compress"] = 4
                 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
-                self.upscale_index_formula = (lambda a: max(0, a * 4), 8, 8)
+                self.upscale_index_formula = (4, 8, 8)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
+                self.downscale_index_formula = (4, 8, 8)
                 self.latent_dim = 3
                 self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
                 self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
@@ -453,7 +456,7 @@ class VAE:
 
     def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)):
         encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
-        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, output_device=self.output_device)
+        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device)
 
     def decode(self, samples_in):
         pixel_samples = None
@@ -544,7 +547,7 @@ class VAE:
 
         return samples
 
-    def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None):
+    def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
         pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
         dims = self.latent_dim
         pixel_samples = pixel_samples.movedim(-1, 1)
@@ -568,6 +571,12 @@ class VAE:
         elif dims == 2:
             samples = self.encode_tiled_(pixel_samples, **args)
         elif dims == 3:
+            if overlap_t is None:
+                args["overlap"] = (1, overlap, overlap)
+            else:
+                args["overlap"] = (overlap_t, overlap, overlap)
+            if tile_t is not None:
+                args["tile_t"] = tile_t
             samples = self.encode_tiled_3d(pixel_samples, **args)
 
         return samples
diff --git a/nodes.py b/nodes.py
index d6777df4f..e95abc40b 100644
--- a/nodes.py
+++ b/nodes.py
@@ -337,15 +337,17 @@ class VAEEncodeTiled:
         return {"required": {"pixels": ("IMAGE", ), "vae": ("VAE", ),
                              "tile_size": ("INT", {"default": 512, "min": 64, "max": 4096, "step": 64}),
                              "overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32}),
+                             "temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to encode at a time."}),
+                             "temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap."}),
                             }}
     RETURN_TYPES = ("LATENT",)
     FUNCTION = "encode"
 
     CATEGORY = "_for_testing"
 
-    def encode(self, vae, pixels, tile_size, overlap):
-        t = vae.encode_tiled(pixels[:,:,:,:3], tile_x=tile_size, tile_y=tile_size, overlap=overlap)
-        return ({"samples":t}, )
+    def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8):
+        t = vae.encode_tiled(pixels[:,:,:,:3], tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap)
+        return ({"samples": t}, )
 
 class VAEEncodeForInpaint:
     @classmethod

From 73e04987f7e0f14bdee9baa0aafe61cf7f42a8b2 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Tue, 24 Dec 2024 07:36:30 -0500
Subject: [PATCH 11/23] Prevent black images in VAE Decode (Tiled) node.

Temporal overlap must be at least 1 and the temporal tile size at least 2 for tiled temporal VAE decoding.
---
 comfy/sd.py | 5 +++--
 nodes.py    | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index 2db00fa44..de3ce677c 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -509,9 +509,10 @@ class VAE:
             if overlap_t is None:
                 args["overlap"] = (1, overlap, overlap)
             else:
-                args["overlap"] = (overlap_t, overlap, overlap)
+                args["overlap"] = (max(1, overlap_t), overlap, overlap)
             if tile_t is not None:
-                args["tile_t"] = tile_t
+                args["tile_t"] = max(2, tile_t)
+
             output = self.decode_tiled_3d(samples, **args)
         return output.movedim(1, -1)
 
diff --git a/nodes.py b/nodes.py
index e95abc40b..a135a6120 100644
--- a/nodes.py
+++ b/nodes.py
@@ -304,6 +304,8 @@ class VAEDecodeTiled:
     def decode(self, vae, samples, tile_size, overlap=64, temporal_size=64, temporal_overlap=8):
         if tile_size < overlap * 4:
             overlap = tile_size // 4
+        if temporal_size < temporal_overlap * 2:
+            temporal_overlap = temporal_overlap // 2
         temporal_compression = vae.temporal_compression_decode()
         if temporal_compression is not None:
             temporal_size = max(2, temporal_size // temporal_compression)

From 99a1fb6027b7163592a83669b0b1c5aa4657c2b6 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Tue, 24 Dec 2024 18:05:19 -0500
Subject: [PATCH 12/23] Make fast fp8 take a bit less peak memory.

---
 comfy/ops.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/comfy/ops.py b/comfy/ops.py
index 8e0694232..06be6b48b 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -255,9 +255,10 @@ def fp8_linear(self, input):
         tensor_2d = True
         input = input.unsqueeze(1)
 
-
+    input_shape = input.shape
+    input_dtype = input.dtype
     if len(input.shape) == 3:
-        w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input.dtype)
+        w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype)
         w = w.t()
 
         scale_weight = self.scale_weight
@@ -269,23 +270,24 @@ def fp8_linear(self, input):
 
         if scale_input is None:
             scale_input = torch.ones((), device=input.device, dtype=torch.float32)
-            inn = torch.clamp(input, min=-448, max=448).reshape(-1, input.shape[2]).to(dtype)
+            input = torch.clamp(input, min=-448, max=448, out=input)
+            input = input.reshape(-1, input_shape[2]).to(dtype)
         else:
             scale_input = scale_input.to(input.device)
-            inn = (input * (1.0 / scale_input).to(input.dtype)).reshape(-1, input.shape[2]).to(dtype)
+            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype)
 
         if bias is not None:
-            o = torch._scaled_mm(inn, w, out_dtype=input.dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
+            o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
         else:
-            o = torch._scaled_mm(inn, w, out_dtype=input.dtype, scale_a=scale_input, scale_b=scale_weight)
+            o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight)
 
         if isinstance(o, tuple):
             o = o[0]
 
         if tensor_2d:
-            return o.reshape(input.shape[0], -1)
+            return o.reshape(input_shape[0], -1)
 
-        return o.reshape((-1, input.shape[1], self.weight.shape[0]))
+        return o.reshape((-1, input_shape[1], self.weight.shape[0]))
 
     return None
 

From 1ed75ab30ee2fdef6b3b41ad3061583a0fede723 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Wed, 25 Dec 2024 03:29:03 -0500
Subject: [PATCH 13/23] Update nightly pytorch instructions in readme for
 nvidia.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8311b7b7c..371421617 100644
--- a/README.md
+++ b/README.md
@@ -189,7 +189,7 @@ Nvidia users should install stable pytorch using this command:
 
 This is the command to install pytorch nightly instead which might have performance improvements:
 
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126```
 
 #### Troubleshooting
 

From 0229228f3f75fc4b0d0d4cf3658138eedc2cc2eb Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Wed, 25 Dec 2024 04:50:34 -0500
Subject: [PATCH 14/23] Clean up the VAE dtypes code.

---
 comfy/model_management.py | 27 ++++++++++++---------------
 comfy/sd.py               |  4 ++--
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 33891b929..8320c6ece 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -188,6 +188,12 @@ def is_nvidia():
             return True
     return False
 
+def is_amd():
+    global cpu_state
+    if cpu_state == CPUState.GPU:
+        if torch.version.hip:
+            return True
+    return False
 
 MIN_WEIGHT_MEMORY_RATIO = 0.4
 if is_nvidia():
@@ -198,27 +204,17 @@ if args.use_pytorch_cross_attention:
     ENABLE_PYTORCH_ATTENTION = True
     XFORMERS_IS_AVAILABLE = False
 
-VAE_DTYPES = [torch.float32]
-
 try:
     if is_nvidia():
         if int(torch_version[0]) >= 2:
             if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
                 ENABLE_PYTORCH_ATTENTION = True
-            if torch.cuda.is_bf16_supported() and torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8:
-                VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES
     if is_intel_xpu():
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
             ENABLE_PYTORCH_ATTENTION = True
 except:
     pass
 
-if is_intel_xpu():
-    VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES
-
-if args.cpu_vae:
-    VAE_DTYPES = [torch.float32]
-
 if ENABLE_PYTORCH_ATTENTION:
     torch.backends.cuda.enable_math_sdp(True)
     torch.backends.cuda.enable_flash_sdp(True)
@@ -754,7 +750,6 @@ def vae_offload_device():
         return torch.device("cpu")
 
 def vae_dtype(device=None, allowed_dtypes=[]):
-    global VAE_DTYPES
     if args.fp16_vae:
         return torch.float16
     elif args.bf16_vae:
@@ -763,12 +758,14 @@ def vae_dtype(device=None, allowed_dtypes=[]):
         return torch.float32
 
     for d in allowed_dtypes:
-        if d == torch.float16 and should_use_fp16(device, prioritize_performance=False):
-            return d
-        if d in VAE_DTYPES:
+        if d == torch.float16 and should_use_fp16(device):
             return d
 
-    return VAE_DTYPES[0]
+        # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32
+        if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device):
+            return d
+
+    return torch.float32
 
 def get_autocast_device(dev):
     if hasattr(dev, 'type'):
diff --git a/comfy/sd.py b/comfy/sd.py
index de3ce677c..55f91116f 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -111,7 +111,7 @@ class CLIP:
             model_management.load_models_gpu([self.patcher], force_full_load=True)
         self.layer_idx = None
         self.use_clip_schedule = False
-        logging.debug("CLIP model load device: {}, offload device: {}, current: {}".format(load_device, offload_device, params['device']))
+        logging.info("CLIP model load device: {}, offload device: {}, current: {}, dtype: {}".format(load_device, offload_device, params['device'], dtype))
 
     def clone(self):
         n = CLIP(no_init=True)
@@ -402,7 +402,7 @@ class VAE:
         self.output_device = model_management.intermediate_device()
 
         self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
-        logging.debug("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))
+        logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))
 
     def vae_encode_crop_pixels(self, pixels):
         downscale_ratio = self.spacial_compression_encode()

From b486885e0866b1fc37b767a7ff04c1f40acb5ac4 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Wed, 25 Dec 2024 05:18:50 -0500
Subject: [PATCH 15/23] Disable bfloat16 on older mac.

---
 comfy/model_management.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 8320c6ece..ce241e17f 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -886,14 +886,19 @@ def pytorch_attention_flash_attention():
             return True
     return False
 
+def mac_version():
+    try:
+        return tuple(int(n) for n in platform.mac_ver()[0].split("."))
+    except:
+        return None
+
 def force_upcast_attention_dtype():
     upcast = args.force_upcast_attention
-    try:
-        macos_version = tuple(int(n) for n in platform.mac_ver()[0].split("."))
-        if (14, 5) <= macos_version <= (15, 2):  # black image bug on recent versions of macOS
-            upcast = True
-    except:
-        pass
+
+    macos_version = mac_version()
+    if macos_version is not None and ((14, 5) <= macos_version <= (15, 2)):  # black image bug on recent versions of macOS
+        upcast = True
+
     if upcast:
         return torch.float32
     else:
@@ -1034,6 +1039,8 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
         return False
 
     if mps_mode():
+        if mac_version() < (14,):
+            return False
         return True
 
     if cpu_mode():

From 19a64d62918c68b800de7277472c3b039beaa126 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Wed, 25 Dec 2024 05:32:51 -0500
Subject: [PATCH 16/23] Cleanup some mac related code.

---
 comfy/model_management.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index ce241e17f..db2a61395 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -969,17 +969,13 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
     if FORCE_FP16:
         return True
 
-    if device is not None:
-        if is_device_mps(device):
-            return True
-
     if FORCE_FP32:
         return False
 
     if directml_enabled:
         return False
 
-    if mps_mode():
+    if (device is not None and is_device_mps(device)) or mps_mode():
         return True
 
     if cpu_mode():
@@ -1028,17 +1024,13 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
         if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
             return False
 
-    if device is not None:
-        if is_device_mps(device):
-            return True
-
     if FORCE_FP32:
         return False
 
     if directml_enabled:
         return False
 
-    if mps_mode():
+    if (device is not None and is_device_mps(device)) or mps_mode():
         if mac_version() < (14,):
             return False
         return True

From ee9547ba31f5f2c1de0211a09c3fb829bd8e25e6 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Thu, 26 Dec 2024 07:18:49 -0500
Subject: [PATCH 17/23] Improve temporal VAE Encode (Tiled) math.

---
 comfy/sd.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index 55f91116f..c6d6236b1 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -572,13 +572,20 @@ class VAE:
         elif dims == 2:
             samples = self.encode_tiled_(pixel_samples, **args)
         elif dims == 3:
+            if tile_t is not None:
+                tile_t_latent = max(2, self.downscale_ratio[0](tile_t))
+            else:
+                tile_t_latent = 9999
+            args["tile_t"] = self.upscale_ratio[0](tile_t_latent)
+
             if overlap_t is None:
                 args["overlap"] = (1, overlap, overlap)
             else:
-                args["overlap"] = (overlap_t, overlap, overlap)
-            if tile_t is not None:
-                args["tile_t"] = tile_t
-            samples = self.encode_tiled_3d(pixel_samples, **args)
+                args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap)
+            maximum = pixel_samples.shape[2]
+            maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum))
+
+            samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], **args)
 
         return samples
 

From c4bfdba3301eb8dd2000b1b22e4752a662d4c856 Mon Sep 17 00:00:00 2001
From: Huazhong Ji <hzji210@gmail.com>
Date: Fri, 27 Dec 2024 08:36:50 +0800
Subject: [PATCH 18/23] Support ascend npu (#5436)

* support ascend npu

Co-authored-by: YukMingLaw <lymmm2@163.com>
Co-authored-by: starmountain1997 <guozr1997@hotmail.com>
Co-authored-by: Ginray <ginray0215@gmail.com>
---
 README.md                 | 10 ++++++++++
 comfy/model_management.py | 41 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 371421617..000d76801 100644
--- a/README.md
+++ b/README.md
@@ -224,6 +224,16 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
 
 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
 
+#### Ascend NPUs
+
+ComfyUI can run on Ascend NPUs for models compatible with the Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
+
+1. Begin by installing the recommended or newer kernel version for Linux as specified in the Installation page of torch-npu, if necessary.
+2. Proceed with the installation of Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions provided for your specific platform.
+3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
+4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.
+
+
 # Running
 
 ```python main.py```
diff --git a/comfy/model_management.py b/comfy/model_management.py
index db2a61395..c36c52ffd 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -86,6 +86,13 @@ try:
 except:
     pass
 
+try:
+    import torch_npu
+    _ = torch.npu.device_count()
+    npu_available = torch.npu.is_available()
+except:
+    npu_available = False
+
 if args.cpu:
     cpu_state = CPUState.CPU
 
@@ -97,6 +104,12 @@ def is_intel_xpu():
             return True
     return False
 
+def is_ascend_npu():
+    global npu_available
+    if npu_available:
+        return True
+    return False
+
 def get_torch_device():
     global directml_enabled
     global cpu_state
@@ -110,6 +123,8 @@ def get_torch_device():
     else:
         if is_intel_xpu():
             return torch.device("xpu", torch.xpu.current_device())
+        elif is_ascend_npu():
+            return torch.device("npu", torch.npu.current_device())
         else:
             return torch.device(torch.cuda.current_device())
 
@@ -130,6 +145,12 @@ def get_total_memory(dev=None, torch_total_too=False):
             mem_reserved = stats['reserved_bytes.all.current']
             mem_total_torch = mem_reserved
             mem_total = torch.xpu.get_device_properties(dev).total_memory
+        elif is_ascend_npu():
+            stats = torch.npu.memory_stats(dev)
+            mem_reserved = stats['reserved_bytes.all.current']
+            _, mem_total_npu = torch.npu.mem_get_info(dev)
+            mem_total_torch = mem_reserved
+            mem_total = mem_total_npu
         else:
             stats = torch.cuda.memory_stats(dev)
             mem_reserved = stats['reserved_bytes.all.current']
@@ -209,7 +230,7 @@ try:
         if int(torch_version[0]) >= 2:
             if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
                 ENABLE_PYTORCH_ATTENTION = True
-    if is_intel_xpu():
+    if is_intel_xpu() or is_ascend_npu():
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
             ENABLE_PYTORCH_ATTENTION = True
 except:
@@ -274,6 +295,8 @@ def get_torch_device_name(device):
             return "{}".format(device.type)
     elif is_intel_xpu():
         return "{} {}".format(device, torch.xpu.get_device_name(device))
+    elif is_ascend_npu():
+        return "{} {}".format(device, torch.npu.get_device_name(device))
     else:
         return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device))
 
@@ -860,6 +883,8 @@ def xformers_enabled():
         return False
     if is_intel_xpu():
         return False
+    if is_ascend_npu():
+        return False
     if directml_enabled:
         return False
     return XFORMERS_IS_AVAILABLE
@@ -884,6 +909,8 @@ def pytorch_attention_flash_attention():
             return True
         if is_intel_xpu():
             return True
+        if is_ascend_npu():
+            return True
     return False
 
 def mac_version():
@@ -923,6 +950,13 @@ def get_free_memory(dev=None, torch_free_too=False):
             mem_free_torch = mem_reserved - mem_active
             mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
             mem_free_total = mem_free_xpu + mem_free_torch
+        elif is_ascend_npu():
+            stats = torch.npu.memory_stats(dev)
+            mem_active = stats['active_bytes.all.current']
+            mem_reserved = stats['reserved_bytes.all.current']
+            mem_free_npu, _ = torch.npu.mem_get_info(dev)
+            mem_free_torch = mem_reserved - mem_active
+            mem_free_total = mem_free_npu + mem_free_torch
         else:
             stats = torch.cuda.memory_stats(dev)
             mem_active = stats['active_bytes.all.current']
@@ -984,6 +1018,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
     if is_intel_xpu():
         return True
 
+    if is_ascend_npu():
+        return True
+
     if torch.version.hip:
         return True
 
@@ -1081,6 +1118,8 @@ def soft_empty_cache(force=False):
         torch.mps.empty_cache()
     elif is_intel_xpu():
         torch.xpu.empty_cache()
+    elif is_ascend_npu():
+        torch.npu.empty_cache()
     elif torch.cuda.is_available():
         if force or is_nvidia(): #This seems to make things worse on ROCm so I only do it for cuda
             torch.cuda.empty_cache()

From 160ca081387e6d871487a6caedeb9bbacf073665 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Thu, 26 Dec 2024 20:05:54 -0500
Subject: [PATCH 19/23] Use python 3.9 in launch test instead of 3.8

Fix ruff check.
---
 .github/workflows/test-launch.yml | 2 +-
 comfy/model_management.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-launch.yml b/.github/workflows/test-launch.yml
index 5d665d6af..c56283c2d 100644
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@@ -17,7 +17,7 @@ jobs:
         path: "ComfyUI"
     - uses: actions/setup-python@v4
       with:
-        python-version: '3.8'
+        python-version: '3.9'
     - name: Install requirements
       run: |
         python -m pip install --upgrade pip
diff --git a/comfy/model_management.py b/comfy/model_management.py
index c36c52ffd..731fb5845 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -87,7 +87,7 @@ except:
     pass
 
 try:
-    import torch_npu
+    import torch_npu  # noqa: F401
     _ = torch.npu.device_count()
     npu_available = torch.npu.is_available()
 except:

From ceb50b2cbfb166b84786d12d313a624273590fab Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Fri, 27 Dec 2024 07:30:09 -0500
Subject: [PATCH 20/23] Closer memory estimation for pixart models.

---
 comfy/supported_models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index a5f38b5ed..512515aeb 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -608,6 +608,8 @@ class PixArtAlpha(supported_models_base.BASE):
     unet_extra_config = {}
     latent_format = latent_formats.SD15
 
+    memory_usage_factor = 0.5
+
     vae_key_prefix = ["vae."]
     text_encoder_key_prefix = ["text_encoders."]
 

From 4b5bcd8ac4e221681e2541c2aa2f665a56ef72de Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Fri, 27 Dec 2024 07:37:00 -0500
Subject: [PATCH 21/23] Closer memory estimation for hunyuan dit model.

---
 comfy/supported_models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 512515aeb..6a2cc75ae 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -642,6 +642,8 @@ class HunyuanDiT(supported_models_base.BASE):
 
     latent_format = latent_formats.SDXL
 
+    memory_usage_factor = 1.3
+
     vae_key_prefix = ["vae."]
     text_encoder_key_prefix = ["text_encoders."]
 

From 9cfd185676e1bd0d5642c43bb9ee1f857ecd1be4 Mon Sep 17 00:00:00 2001
From: filtered <176114999+webfiltered@users.noreply.github.com>
Date: Sat, 28 Dec 2024 06:40:05 +1100
Subject: [PATCH 22/23] Add option to log non-error output to stdout (#6243)

* nit

* Add option to log non-error output to stdout

- No change to default behaviour
- Adds CLI argument: --log-stdout
- With this arg present, any logging of a level below logging.ERROR will be sent to stdout instead of stderr
---
 app/logger.py     | 13 ++++++++++++-
 comfy/cli_args.py |  3 ++-
 main.py           |  2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/app/logger.py b/app/logger.py
index 527be9fe7..9e9f84ccf 100644
--- a/app/logger.py
+++ b/app/logger.py
@@ -51,7 +51,7 @@ def on_flush(callback):
     if stderr_interceptor is not None:
         stderr_interceptor.on_flush(callback)
 
-def setup_logger(log_level: str = 'INFO', capacity: int = 300):
+def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool = False):
     global logs
     if logs:
         return
@@ -70,4 +70,15 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300):
 
     stream_handler = logging.StreamHandler()
     stream_handler.setFormatter(logging.Formatter("%(message)s"))
+
+    if use_stdout:
+        # Only errors and critical to stderr
+        stream_handler.addFilter(lambda record: not record.levelno < logging.ERROR)
+
+        # Lesser to stdout
+        stdout_handler = logging.StreamHandler(sys.stdout)
+        stdout_handler.setFormatter(logging.Formatter("%(message)s"))
+        stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
+        logger.addHandler(stdout_handler)
+
     logger.addHandler(stream_handler)
diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 224c075f0..812798bf8 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -122,7 +122,7 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
 
-parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reverved depending on your OS.")
+parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
 
 
 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
@@ -141,6 +141,7 @@ parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Dis
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
 
 parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
+parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")
 
 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
diff --git a/main.py b/main.py
index ccc99fdc4..95972f73b 100644
--- a/main.py
+++ b/main.py
@@ -17,7 +17,7 @@ if __name__ == "__main__":
     os.environ['DO_NOT_TRACK'] = '1'
 
 
-setup_logger(log_level=args.verbose)
+setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
 
 def apply_custom_paths():
     # extra model paths

From d170292594770377d9e0442078ef43668e2331b6 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Fri, 27 Dec 2024 18:02:21 -0500
Subject: [PATCH 23/23] Remove some trailing white space.

---
 comfy/controlnet.py                          |  2 +-
 comfy/extra_samplers/uni_pc.py               | 10 +++++-----
 comfy/hooks.py                               |  2 +-
 comfy/k_diffusion/sampling.py                |  2 +-
 comfy/ldm/modules/sub_quadratic_attention.py |  2 +-
 comfy/ldm/pixart/pixartms.py                 |  2 +-
 comfy/model_detection.py                     |  4 ++--
 comfy/sample.py                              |  2 +-
 comfy/sampler_helpers.py                     |  2 +-
 comfy/utils.py                               | 10 +++++-----
 comfy_extras/nodes_mask.py                   |  4 ++--
 comfy_extras/nodes_perpneg.py                |  2 +-
 comfy_extras/nodes_rebatch.py                |  4 ++--
 comfy_extras/nodes_tomesd.py                 |  7 +++----
 nodes.py                                     | 20 ++++++++++----------
 15 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/comfy/controlnet.py b/comfy/controlnet.py
index 7f5988377..ee29251b9 100644
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -120,7 +120,7 @@ class ControlBase:
         if self.previous_controlnet is not None:
             out += self.previous_controlnet.get_models()
         return out
-    
+
     def get_extra_hooks(self):
         out = []
         if self.extra_hooks is not None:
diff --git a/comfy/extra_samplers/uni_pc.py b/comfy/extra_samplers/uni_pc.py
index b61baaa8e..77d20bbf5 100644
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@@ -80,7 +80,7 @@ class NoiseScheduleVP:
                     'linear' or 'cosine' for continuous-time DPMs.
         Returns:
             A wrapper object of the forward SDE (VP type).
-        
+
         ===============================================================
 
         Example:
@@ -208,7 +208,7 @@ def model_wrapper(
                 arXiv preprint arXiv:2202.00512 (2022).
             [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                 arXiv preprint arXiv:2210.02303 (2022).
-    
+
         4. "score": marginal score function. (Trained by denoising score matching).
             Note that the score function and the noise prediction model follows a simple relationship:
             ```
@@ -245,7 +245,7 @@ def model_wrapper(
 
             [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                 arXiv preprint arXiv:2207.12598 (2022).
-        
+
 
     The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
     or continuous-time labels (i.e. epsilon to T).
@@ -621,7 +621,7 @@ class UniPC:
             B_h = torch.expm1(hh)
         else:
             raise NotImplementedError()
-            
+
         for i in range(1, order + 1):
             R.append(torch.pow(rks, i - 1))
             b.append(h_phi_k * factorial_i / B_h)
@@ -870,4 +870,4 @@ def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=F
         return x
 
 def sample_unipc_bh2(model, noise, sigmas, extra_args=None, callback=None, disable=False):
-    return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2')
\ No newline at end of file
+    return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2')
diff --git a/comfy/hooks.py b/comfy/hooks.py
index b6f0ac213..7f9eee301 100644
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
@@ -101,7 +101,7 @@ class WeightHook(Hook):
         self.need_weight_init = True
         self._strength_model = strength_model
         self._strength_clip = strength_clip
-    
+
     @property
     def strength_model(self):
         return self._strength_model * self.strength
diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py
index f08370f83..0f7cc4ca9 100644
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -1230,7 +1230,7 @@ def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, dis
         nonlocal uncond_denoised
         uncond_denoised = args["uncond_denoised"]
         return args["denoised"]
-    
+
     model_options = extra_args.get("model_options", {}).copy()
     extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
 
diff --git a/comfy/ldm/modules/sub_quadratic_attention.py b/comfy/ldm/modules/sub_quadratic_attention.py
index 7c5f1d9f9..fca8d1178 100644
--- a/comfy/ldm/modules/sub_quadratic_attention.py
+++ b/comfy/ldm/modules/sub_quadratic_attention.py
@@ -261,7 +261,7 @@ def efficient_dot_product_attention(
             value=value,
             mask=mask,
         )
-    
+
     # TODO: maybe we should use torch.empty_like(query) to allocate storage in-advance,
     # and pass slices to be mutated, instead of torch.cat()ing the returned slices
     res = torch.cat([
diff --git a/comfy/ldm/pixart/pixartms.py b/comfy/ldm/pixart/pixartms.py
index 50dc58c23..7d4eebdce 100644
--- a/comfy/ldm/pixart/pixartms.py
+++ b/comfy/ldm/pixart/pixartms.py
@@ -223,7 +223,7 @@ class PixArtMS(nn.Module):
         if self.micro_conditioning:
             if c_size is None:
                 c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1)
-            
+
             if c_ar is None:
                 c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1)
 
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index c53bef5bb..de00f773e 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -229,7 +229,7 @@ def detect_unet_config(state_dict, key_prefix):
         if pe_key in state_dict_keys:
             dit_config["input_size"] = int(math.sqrt(state_dict[pe_key].shape[1])) * patch_size
             dit_config["pe_interpolation"] = dit_config["input_size"] // (512//8) # guess
-        
+
         ar_key = "{}ar_embedder.mlp.0.weight".format(key_prefix)
         if ar_key in state_dict_keys:
             dit_config["image_model"] = "pixart_alpha"
@@ -571,7 +571,7 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
             'transformer_depth': [0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': False,
             'context_dim': 768, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 1, 1, 1, 1],
             'use_temporal_attention': False, 'use_temporal_resblock': False}
-    
+
     SD15_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
             'dtype': dtype, 'in_channels': 9, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
             'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
diff --git a/comfy/sample.py b/comfy/sample.py
index 98dcaca7f..9974e0657 100644
--- a/comfy/sample.py
+++ b/comfy/sample.py
@@ -13,7 +13,7 @@ def prepare_noise(latent_image, seed, noise_inds=None):
     generator = torch.manual_seed(seed)
     if noise_inds is None:
         return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
-    
+
     unique_inds, inverse = np.unique(noise_inds, return_inverse=True)
     noises = []
     for i in range(unique_inds[-1]+1):
diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py
index 0691de63f..ac9735369 100644
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@@ -42,7 +42,7 @@ def get_hooks_from_cond(cond, hooks_dict: dict[comfy.hooks.EnumHookType, dict[co
         if cnet.previous_controlnet is None:
             return _list
         return get_extra_hooks_from_cnet(cnet.previous_controlnet, _list)
-        
+
     hooks_list = []
     cnets = set(cnets)
     for base_cnet in cnets:
diff --git a/comfy/utils.py b/comfy/utils.py
index 7de659337..de64b91df 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -455,7 +455,7 @@ def pixart_to_diffusers(mmdit_config, output_prefix=""):
 
     for k in PIXART_MAP_BASIC:
         key_map[k[1]] = "{}{}".format(output_prefix, k[0])
-    
+
     return key_map
 
 def auraflow_to_diffusers(mmdit_config, output_prefix=""):
@@ -702,7 +702,7 @@ def get_attr(obj, attr):
 def bislerp(samples, width, height):
     def slerp(b1, b2, r):
         '''slerps batches b1, b2 according to ratio r, batches should be flat e.g. NxC'''
-        
+
         c = b1.shape[-1]
 
         #norms
@@ -730,13 +730,13 @@ def bislerp(samples, width, height):
         res[dot > 1 - 1e-5] = b1[dot > 1 - 1e-5] 
         res[dot < 1e-5 - 1] = (b1 * (1.0-r) + b2 * r)[dot < 1e-5 - 1]
         return res
-    
+
     def generate_bilinear_data(length_old, length_new, device):
         coords_1 = torch.arange(length_old, dtype=torch.float32, device=device).reshape((1,1,1,-1))
         coords_1 = torch.nn.functional.interpolate(coords_1, size=(1, length_new), mode="bilinear")
         ratios = coords_1 - coords_1.floor()
         coords_1 = coords_1.to(torch.int64)
-        
+
         coords_2 = torch.arange(length_old, dtype=torch.float32, device=device).reshape((1,1,1,-1)) + 1
         coords_2[:,:,:,-1] -= 1
         coords_2 = torch.nn.functional.interpolate(coords_2, size=(1, length_new), mode="bilinear")
@@ -747,7 +747,7 @@ def bislerp(samples, width, height):
     samples = samples.float()
     n,c,h,w = samples.shape
     h_new, w_new = (height, width)
-    
+
     #linear w
     ratios, coords_1, coords_2 = generate_bilinear_data(w, w_new, samples.device)
     coords_1 = coords_1.expand((n, c, h, -1))
diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py
index 29589b4ab..63fd13b9a 100644
--- a/comfy_extras/nodes_mask.py
+++ b/comfy_extras/nodes_mask.py
@@ -305,7 +305,7 @@ class FeatherMask:
             output[:, -y, :] *= feather_rate
 
         return (output,)
-    
+
 class GrowMask:
     @classmethod
     def INPUT_TYPES(cls):
@@ -316,7 +316,7 @@ class GrowMask:
                 "tapered_corners": ("BOOLEAN", {"default": True}),
             },
         }
-    
+
     CATEGORY = "mask"
 
     RETURN_TYPES = ("MASK",)
diff --git a/comfy_extras/nodes_perpneg.py b/comfy_extras/nodes_perpneg.py
index 762c40220..290bc4a51 100644
--- a/comfy_extras/nodes_perpneg.py
+++ b/comfy_extras/nodes_perpneg.py
@@ -64,7 +64,7 @@ class Guider_PerpNeg(comfy.samplers.CFGGuider):
     def predict_noise(self, x, timestep, model_options={}, seed=None):
         # in CFGGuider.predict_noise, we call sampling_function(), which uses cfg_function() to compute pos & neg
         # but we'd rather do a single batch of sampling pos, neg, and empty, so we call calc_cond_batch([pos,neg,empty]) directly
-        
+
         positive_cond = self.conds.get("positive", None)
         negative_cond = self.conds.get("negative", None)
         empty_cond = self.conds.get("empty_negative_prompt", None)
diff --git a/comfy_extras/nodes_rebatch.py b/comfy_extras/nodes_rebatch.py
index 3010fbd4b..e29cb9ed1 100644
--- a/comfy_extras/nodes_rebatch.py
+++ b/comfy_extras/nodes_rebatch.py
@@ -40,7 +40,7 @@ class LatentRebatch:
             return slices, indexable[num * batch_size:]
         else:
             return slices, None
-    
+
     @staticmethod
     def slice_batch(batch, num, batch_size):
         result = [LatentRebatch.get_slices(x, num, batch_size) for x in batch]
@@ -81,7 +81,7 @@ class LatentRebatch:
             if current_batch[0].shape[0] > batch_size:
                 num = current_batch[0].shape[0] // batch_size
                 sliced, remainder = self.slice_batch(current_batch, num, batch_size)
-                
+
                 for i in range(num):
                     output_list.append({'samples': sliced[0][i], 'noise_mask': sliced[1][i], 'batch_index': sliced[2][i]})
 
diff --git a/comfy_extras/nodes_tomesd.py b/comfy_extras/nodes_tomesd.py
index ce7b32c77..9f77c06fc 100644
--- a/comfy_extras/nodes_tomesd.py
+++ b/comfy_extras/nodes_tomesd.py
@@ -40,9 +40,8 @@ def bipartite_soft_matching_random2d(metric: torch.Tensor,
         return do_nothing, do_nothing
 
     gather = mps_gather_workaround if metric.device.type == "mps" else torch.gather
-    
+
     with torch.no_grad():
-        
         hsy, wsx = h // sy, w // sx
 
         # For each sy by sx kernel, randomly assign one token to be dst and the rest src
@@ -50,7 +49,7 @@ def bipartite_soft_matching_random2d(metric: torch.Tensor,
             rand_idx = torch.zeros(hsy, wsx, 1, device=metric.device, dtype=torch.int64)
         else:
             rand_idx = torch.randint(sy*sx, size=(hsy, wsx, 1), device=metric.device)
-        
+
         # The image might not divide sx and sy, so we need to work on a view of the top left if the idx buffer instead
         idx_buffer_view = torch.zeros(hsy, wsx, sy*sx, device=metric.device, dtype=torch.int64)
         idx_buffer_view.scatter_(dim=2, index=rand_idx, src=-torch.ones_like(rand_idx, dtype=rand_idx.dtype))
@@ -99,7 +98,7 @@ def bipartite_soft_matching_random2d(metric: torch.Tensor,
     def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
         src, dst = split(x)
         n, t1, c = src.shape
-        
+
         unm = gather(src, dim=-2, index=unm_idx.expand(n, t1 - r, c))
         src = gather(src, dim=-2, index=src_idx.expand(n, r, c))
         dst = dst.scatter_reduce(-2, dst_idx.expand(n, r, c), src, reduce=mode)
diff --git a/nodes.py b/nodes.py
index a135a6120..89cecc480 100644
--- a/nodes.py
+++ b/nodes.py
@@ -65,7 +65,7 @@ class CLIPTextEncode(ComfyNodeABC):
     def encode(self, clip, text):
         tokens = clip.tokenize(text)
         return (clip.encode_from_tokens_scheduled(tokens), )
-        
+
 
 class ConditioningCombine:
     @classmethod
@@ -641,7 +641,7 @@ class LoraLoader:
                 "strength_clip": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the CLIP model. This value can be negative."}),
             }
         }
-    
+
     RETURN_TYPES = ("MODEL", "CLIP")
     OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.")
     FUNCTION = "load_lora"
@@ -1211,7 +1211,7 @@ class LatentFromBatch:
         else:
             s["batch_index"] = samples["batch_index"][batch_index:batch_index + length]
         return (s,)
-    
+
 class RepeatLatentBatch:
     @classmethod
     def INPUT_TYPES(s):
@@ -1226,7 +1226,7 @@ class RepeatLatentBatch:
     def repeat(self, samples, amount):
         s = samples.copy()
         s_in = samples["samples"]
-        
+
         s["samples"] = s_in.repeat((amount, 1,1,1))
         if "noise_mask" in samples and samples["noise_mask"].shape[0] > 1:
             masks = samples["noise_mask"]
@@ -1636,15 +1636,15 @@ class LoadImage:
     FUNCTION = "load_image"
     def load_image(self, image):
         image_path = folder_paths.get_annotated_filepath(image)
-        
+
         img = node_helpers.pillow(Image.open, image_path)
-        
+
         output_images = []
         output_masks = []
         w, h = None, None
 
         excluded_formats = ['MPO']
-        
+
         for i in ImageSequence.Iterator(img):
             i = node_helpers.pillow(ImageOps.exif_transpose, i)
 
@@ -1655,10 +1655,10 @@ class LoadImage:
             if len(output_images) == 0:
                 w = image.size[0]
                 h = image.size[1]
-            
+
             if image.size[0] != w or image.size[1] != h:
                 continue
-            
+
             image = np.array(image).astype(np.float32) / 255.0
             image = torch.from_numpy(image)[None,]
             if 'A' in i.getbands():
@@ -2234,5 +2234,5 @@ def init_extra_nodes(init_custom_nodes=True):
         else:
             logging.warning("Please do a: pip install -r requirements.txt")
         logging.warning("")
-    
+
     return import_failed