From 2559dee49202365bc97218b98121e796f57dfcb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sat, 13 Sep 2025 04:52:58 +0300 Subject: [PATCH 01/33] Support wav2vec base models (#9637) * Support wav2vec base models * trim trailing whitespace * Do interpolation after --- comfy/audio_encoders/audio_encoders.py | 36 ++++++++++- comfy/audio_encoders/wav2vec2.py | 87 +++++++++++++++++++------- 2 files changed, 99 insertions(+), 24 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 538c21bd5..d1ec78f69 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -11,7 +11,13 @@ class AudioEncoderModel(): self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) - self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast) + model_config = dict(config) + model_config.update({ + "dtype": self.dtype, + "device": offload_device, + "operations": comfy.ops.manual_cast + }) + self.model = Wav2Vec2Model(**model_config) self.model.eval() self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) self.model_sample_rate = 16000 @@ -25,7 +31,7 @@ class AudioEncoderModel(): def encode_audio(self, audio, sample_rate): comfy.model_management.load_model_gpu(self.patcher) audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate) - out, all_layers = self.model(audio.to(self.load_device)) + out, all_layers = self.model(audio.to(self.load_device), sr=self.model_sample_rate) outputs = {} outputs["encoded_audio"] = out outputs["encoded_audio_all_layers"] = all_layers @@ -33,8 +39,32 @@ class AudioEncoderModel(): def load_audio_encoder_from_sd(sd, prefix=""): - audio_encoder = AudioEncoderModel(None) sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""}) + embed_dim = sd["encoder.layer_norm.bias"].shape[0] + if embed_dim == 1024:# large + config = { + "embed_dim": 1024, + "num_heads": 16, + "num_layers": 24, + "conv_norm": True, + "conv_bias": True, + "do_normalize": True, + "do_stable_layer_norm": True + } + elif embed_dim == 768: # base + config = { + "embed_dim": 768, + "num_heads": 12, + "num_layers": 12, + "conv_norm": False, + "conv_bias": False, + "do_normalize": False, # chinese-wav2vec2-base has this False + "do_stable_layer_norm": False + } + else: + raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim)) + + audio_encoder = AudioEncoderModel(config) m, u = audio_encoder.load_sd(sd) if len(m) > 0: logging.warning("missing audio encoder: {}".format(m)) diff --git a/comfy/audio_encoders/wav2vec2.py b/comfy/audio_encoders/wav2vec2.py index de906622a..ef10dcd2a 100644 --- a/comfy/audio_encoders/wav2vec2.py +++ b/comfy/audio_encoders/wav2vec2.py @@ -13,19 +13,49 @@ class LayerNormConv(nn.Module): x = self.conv(x) return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1)) +class LayerGroupNormConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None): + super().__init__() + self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype) + 
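# Illustrative usage sketch (not from the patch; file names are placeholders).
# With this change, load_audio_encoder_from_sd() reads embed_dim from
# "encoder.layer_norm.bias" and picks the base (768) or large (1024) wav2vec2
# config automatically, so a checkpoint can be loaded and used roughly like this:
import torchaudio
import comfy.utils
from comfy.audio_encoders.audio_encoders import load_audio_encoder_from_sd

sd = comfy.utils.load_torch_file("wav2vec2_base.safetensors")   # placeholder path
audio_encoder = load_audio_encoder_from_sd(sd)
waveform, sample_rate = torchaudio.load("speech.wav")           # [channels, samples], placeholder file
out = audio_encoder.encode_audio(waveform.unsqueeze(0), sample_rate)
print(out["encoded_audio"].shape)                               # [batch, frames, embed_dim]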
self.layer_norm = operations.GroupNorm(num_groups=out_channels, num_channels=out_channels, affine=True, device=device, dtype=dtype) + + def forward(self, x): + x = self.conv(x) + return torch.nn.functional.gelu(self.layer_norm(x)) + +class ConvNoNorm(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None): + super().__init__() + self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype) + + def forward(self, x): + x = self.conv(x) + return torch.nn.functional.gelu(x) + class ConvFeatureEncoder(nn.Module): - def __init__(self, conv_dim, dtype=None, device=None, operations=None): + def __init__(self, conv_dim, conv_bias=False, conv_norm=True, dtype=None, device=None, operations=None): super().__init__() - self.conv_layers = nn.ModuleList([ - LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - ]) + if conv_norm: + self.conv_layers = nn.ModuleList([ + LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ]) + else: + self.conv_layers = nn.ModuleList([ + LayerGroupNormConv(1, conv_dim, kernel_size=10, stride=5, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, 
conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ]) def forward(self, x): x = x.unsqueeze(1) @@ -76,6 +106,7 @@ class TransformerEncoder(nn.Module): num_heads=12, num_layers=12, mlp_ratio=4.0, + do_stable_layer_norm=True, dtype=None, device=None, operations=None ): super().__init__() @@ -86,20 +117,25 @@ class TransformerEncoder(nn.Module): embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, + do_stable_layer_norm=do_stable_layer_norm, device=device, dtype=dtype, operations=operations ) for _ in range(num_layers) ]) self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype) + self.do_stable_layer_norm = do_stable_layer_norm def forward(self, x, mask=None): x = x + self.pos_conv_embed(x) all_x = () + if not self.do_stable_layer_norm: + x = self.layer_norm(x) for layer in self.layers: all_x += (x,) x = layer(x, mask) - x = self.layer_norm(x) + if self.do_stable_layer_norm: + x = self.layer_norm(x) all_x += (x,) return x, all_x @@ -145,6 +181,7 @@ class TransformerEncoderLayer(nn.Module): embed_dim=768, num_heads=12, mlp_ratio=4.0, + do_stable_layer_norm=True, dtype=None, device=None, operations=None ): super().__init__() @@ -154,15 +191,19 @@ class TransformerEncoderLayer(nn.Module): self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations) self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) + self.do_stable_layer_norm = do_stable_layer_norm def forward(self, x, mask=None): residual = x - x = self.layer_norm(x) + if self.do_stable_layer_norm: + x = self.layer_norm(x) x = self.attention(x, mask=mask) x = residual + x - - x = x + self.feed_forward(self.final_layer_norm(x)) - return x + if not self.do_stable_layer_norm: + x = self.layer_norm(x) + return self.final_layer_norm(x + self.feed_forward(x)) + else: + return x + self.feed_forward(self.final_layer_norm(x)) class Wav2Vec2Model(nn.Module): @@ -174,34 +215,38 @@ class Wav2Vec2Model(nn.Module): final_dim=256, num_heads=16, num_layers=24, + conv_norm=True, + conv_bias=True, + do_normalize=True, + do_stable_layer_norm=True, dtype=None, device=None, operations=None ): super().__init__() conv_dim = 512 - self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations) + self.feature_extractor = ConvFeatureEncoder(conv_dim, conv_norm=conv_norm, conv_bias=conv_bias, device=device, dtype=dtype, operations=operations) self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations) self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype)) + self.do_normalize = do_normalize self.encoder = TransformerEncoder( embed_dim=embed_dim, num_heads=num_heads, num_layers=num_layers, + do_stable_layer_norm=do_stable_layer_norm, device=device, dtype=dtype, operations=operations ) - def forward(self, x, mask_time_indices=None, return_dict=False): - + def forward(self, x, sr=16000, mask_time_indices=None, return_dict=False): x = torch.mean(x, dim=1) - x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7) + if self.do_normalize: + x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7) features = self.feature_extractor(x) features = self.feature_projection(features) - batch_size, seq_len, _ = features.shape x, all_x = self.encoder(features) - return x, all_x From 29bf807b0e2d89402d555d08bd8e9df15e636f0c Mon 
Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 18:57:04 -0700 Subject: [PATCH 02/33] Cleanup. (#9838) --- comfy/audio_encoders/audio_encoders.py | 2 +- comfy/audio_encoders/wav2vec2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index d1ec78f69..6fb5b08e9 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -31,7 +31,7 @@ class AudioEncoderModel(): def encode_audio(self, audio, sample_rate): comfy.model_management.load_model_gpu(self.patcher) audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate) - out, all_layers = self.model(audio.to(self.load_device), sr=self.model_sample_rate) + out, all_layers = self.model(audio.to(self.load_device)) outputs = {} outputs["encoded_audio"] = out outputs["encoded_audio_all_layers"] = all_layers diff --git a/comfy/audio_encoders/wav2vec2.py b/comfy/audio_encoders/wav2vec2.py index ef10dcd2a..4e34a40a7 100644 --- a/comfy/audio_encoders/wav2vec2.py +++ b/comfy/audio_encoders/wav2vec2.py @@ -238,7 +238,7 @@ class Wav2Vec2Model(nn.Module): device=device, dtype=dtype, operations=operations ) - def forward(self, x, sr=16000, mask_time_indices=None, return_dict=False): + def forward(self, x, mask_time_indices=None, return_dict=False): x = torch.mean(x, dim=1) if self.do_normalize: From e5e70636e7b7b54695220a88ab036c1607959736 Mon Sep 17 00:00:00 2001 From: Kimbing Ng <50580578+KimbingNg@users.noreply.github.com> Date: Sun, 14 Sep 2025 04:59:19 +0800 Subject: [PATCH 03/33] Remove single quote pattern to avoid wrong matches (#9842) --- comfy/text_encoders/hunyuan_image.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py index be396cae7..699eddc33 100644 --- a/comfy/text_encoders/hunyuan_image.py +++ b/comfy/text_encoders/hunyuan_image.py @@ -22,17 +22,14 @@ class HunyuanImageTokenizer(QwenImageTokenizer): # ByT5 processing for HunyuanImage text_prompt_texts = [] - pattern_quote_single = r'\'(.*?)\'' pattern_quote_double = r'\"(.*?)\"' pattern_quote_chinese_single = r'‘(.*?)’' pattern_quote_chinese_double = r'“(.*?)”' - matches_quote_single = re.findall(pattern_quote_single, text) matches_quote_double = re.findall(pattern_quote_double, text) matches_quote_chinese_single = re.findall(pattern_quote_chinese_single, text) matches_quote_chinese_double = re.findall(pattern_quote_chinese_double, text) - text_prompt_texts.extend(matches_quote_single) text_prompt_texts.extend(matches_quote_double) text_prompt_texts.extend(matches_quote_chinese_single) text_prompt_texts.extend(matches_quote_chinese_double) From c1297f4eb38a63e2f99c9fa76e32e3a36c933b85 Mon Sep 17 00:00:00 2001 From: blepping <157360029+blepping@users.noreply.github.com> Date: Sat, 13 Sep 2025 15:58:43 -0600 Subject: [PATCH 04/33] Add support for Chroma Radiance (#9682) * Initial Chroma Radiance support * Minor Chroma Radiance cleanups * Update Radiance nodes to ensure latents/images are on the intermediate device * Fix Chroma Radiance memory estimation. 
* Increase Chroma Radiance memory usage factor * Increase Chroma Radiance memory usage factor once again * Ensure images are multiples of 16 for Chroma Radiance Add batch dimension and fix channels when necessary in ChromaRadianceImageToLatent node * Tile Chroma Radiance NeRF to reduce memory consumption, update memory usage factor * Update Radiance to support conv nerf final head type. * Allow setting NeRF embedder dtype for Radiance Bump Radiance nerf tile size to 32 Support EasyCache/LazyCache on Radiance (maybe) * Add ChromaRadianceStubVAE node * Crop Radiance image inputs to multiples of 16 instead of erroring to be in line with existing VAE behavior * Convert Chroma Radiance nodes to V3 schema. * Add ChromaRadianceOptions node and backend support. Cleanups/refactoring to reduce code duplication with Chroma. * Fix overriding the NeRF embedder dtype for Chroma Radiance * Minor Chroma Radiance cleanups * Move Chroma Radiance to its own directory in ldm Minor code cleanups and tooltip improvements * Fix Chroma Radiance embedder dtype overriding * Remove Radiance dynamic nerf_embedder dtype override feature * Unbork Radiance NeRF embedder init * Remove Chroma Radiance image conversion and stub VAE nodes Add a chroma_radiance option to the VAELoader builtin node which uses comfy.sd.PixelspaceConversionVAE Add a PixelspaceConversionVAE to comfy.sd for converting BHWC 0..1 <-> BCHW -1..1 --- comfy/latent_formats.py | 17 ++ comfy/ldm/chroma/model.py | 10 +- comfy/ldm/chroma_radiance/layers.py | 206 ++++++++++++++++ comfy/ldm/chroma_radiance/model.py | 328 ++++++++++++++++++++++++++ comfy/model_base.py | 9 +- comfy/model_detection.py | 14 +- comfy/sd.py | 60 +++++ comfy/supported_models.py | 15 +- comfy_extras/nodes_chroma_radiance.py | 114 +++++++++ nodes.py | 6 +- 10 files changed, 770 insertions(+), 9 deletions(-) create mode 100644 comfy/ldm/chroma_radiance/layers.py create mode 100644 comfy/ldm/chroma_radiance/model.py create mode 100644 comfy_extras/nodes_chroma_radiance.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 894540879..77e642a94 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -629,3 +629,20 @@ class Hunyuan3Dv2mini(LatentFormat): class ACEAudio(LatentFormat): latent_channels = 8 latent_dimensions = 2 + +class ChromaRadiance(LatentFormat): + latent_channels = 3 + + def __init__(self): + self.latent_rgb_factors = [ + # R G B + [ 1.0, 0.0, 0.0 ], + [ 0.0, 1.0, 0.0 ], + [ 0.0, 0.0, 1.0 ] + ] + + def process_in(self, latent): + return latent + + def process_out(self, latent): + return latent diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index 4f709f87d..ad1c523fe 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -151,8 +151,6 @@ class Chroma(nn.Module): attn_mask: Tensor = None, ) -> Tensor: patches_replace = transformer_options.get("patches_replace", {}) - if img.ndim != 3 or txt.ndim != 3: - raise ValueError("Input img and txt tensors must have 3 dimensions.") # running on sequences img img = self.img_in(img) @@ -254,8 +252,9 @@ class Chroma(nn.Module): img[:, txt.shape[1] :, ...] += add img = img[:, txt.shape[1] :, ...] 
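# Sketch (not from the patch): the ChromaRadiance latent format added above is
# an identity mapping. A "latent" is already a 3-channel [B, C, H, W] image
# tensor, process_in/process_out return it unchanged, and latent_rgb_factors is
# the 3x3 identity used only for previews.
import torch
from comfy.latent_formats import ChromaRadiance

fmt = ChromaRadiance()
pixels = torch.rand(1, 3, 64, 64)
assert torch.equal(fmt.process_in(pixels), pixels)
assert torch.equal(fmt.process_out(pixels), pixels)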
- final_mod = self.get_modulations(mod_vectors, "final") - img = self.final_layer(img, vec=final_mod) # (N, T, patch_size ** 2 * out_channels) + if hasattr(self, "final_layer"): + final_mod = self.get_modulations(mod_vectors, "final") + img = self.final_layer(img, vec=final_mod) # (N, T, patch_size ** 2 * out_channels) return img def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): @@ -271,6 +270,9 @@ class Chroma(nn.Module): img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size) + if img.ndim != 3 or context.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + h_len = ((h + (self.patch_size // 2)) // self.patch_size) w_len = ((w + (self.patch_size // 2)) // self.patch_size) img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) diff --git a/comfy/ldm/chroma_radiance/layers.py b/comfy/ldm/chroma_radiance/layers.py new file mode 100644 index 000000000..3c7bc9b6b --- /dev/null +++ b/comfy/ldm/chroma_radiance/layers.py @@ -0,0 +1,206 @@ +# Adapted from https://github.com/lodestone-rock/flow +from functools import lru_cache + +import torch +from torch import nn + +from comfy.ldm.flux.layers import RMSNorm + + +class NerfEmbedder(nn.Module): + """ + An embedder module that combines input features with a 2D positional + encoding that mimics the Discrete Cosine Transform (DCT). + + This module takes an input tensor of shape (B, P^2, C), where P is the + patch size, and enriches it with positional information before projecting + it to a new hidden size. + """ + def __init__( + self, + in_channels: int, + hidden_size_input: int, + max_freqs: int, + dtype=None, + device=None, + operations=None, + ): + """ + Initializes the NerfEmbedder. + + Args: + in_channels (int): The number of channels in the input tensor. + hidden_size_input (int): The desired dimension of the output embedding. + max_freqs (int): The number of frequency components to use for both + the x and y dimensions of the positional encoding. + The total number of positional features will be max_freqs^2. + """ + super().__init__() + self.dtype = dtype + self.max_freqs = max_freqs + self.hidden_size_input = hidden_size_input + + # A linear layer to project the concatenated input features and + # positional encodings to the final output dimension. + self.embedder = nn.Sequential( + operations.Linear(in_channels + max_freqs**2, hidden_size_input, dtype=dtype, device=device) + ) + + @lru_cache(maxsize=4) + def fetch_pos(self, patch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor: + """ + Generates and caches 2D DCT-like positional embeddings for a given patch size. + + The LRU cache is a performance optimization that avoids recomputing the + same positional grid on every forward pass. + + Args: + patch_size (int): The side length of the square input patch. + device: The torch device to create the tensors on. + dtype: The torch dtype for the tensors. + + Returns: + A tensor of shape (1, patch_size^2, max_freqs^2) containing the + positional embeddings. + """ + # Create normalized 1D coordinate grids from 0 to 1. + pos_x = torch.linspace(0, 1, patch_size, device=device, dtype=dtype) + pos_y = torch.linspace(0, 1, patch_size, device=device, dtype=dtype) + + # Create a 2D meshgrid of coordinates. + pos_y, pos_x = torch.meshgrid(pos_y, pos_x, indexing="ij") + + # Reshape positions to be broadcastable with frequencies. + # Shape becomes (patch_size^2, 1, 1). 
+ pos_x = pos_x.reshape(-1, 1, 1) + pos_y = pos_y.reshape(-1, 1, 1) + + # Create a 1D tensor of frequency values from 0 to max_freqs-1. + freqs = torch.linspace(0, self.max_freqs - 1, self.max_freqs, dtype=dtype, device=device) + + # Reshape frequencies to be broadcastable for creating 2D basis functions. + # freqs_x shape: (1, max_freqs, 1) + # freqs_y shape: (1, 1, max_freqs) + freqs_x = freqs[None, :, None] + freqs_y = freqs[None, None, :] + + # A custom weighting coefficient, not part of standard DCT. + # This seems to down-weight the contribution of higher-frequency interactions. + coeffs = (1 + freqs_x * freqs_y) ** -1 + + # Calculate the 1D cosine basis functions for x and y coordinates. + # This is the core of the DCT formulation. + dct_x = torch.cos(pos_x * freqs_x * torch.pi) + dct_y = torch.cos(pos_y * freqs_y * torch.pi) + + # Combine the 1D basis functions to create 2D basis functions by element-wise + # multiplication, and apply the custom coefficients. Broadcasting handles the + # combination of all (pos_x, freqs_x) with all (pos_y, freqs_y). + # The result is flattened into a feature vector for each position. + dct = (dct_x * dct_y * coeffs).view(1, -1, self.max_freqs ** 2) + + return dct + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + Forward pass for the embedder. + + Args: + inputs (Tensor): The input tensor of shape (B, P^2, C). + + Returns: + Tensor: The output tensor of shape (B, P^2, hidden_size_input). + """ + # Get the batch size, number of pixels, and number of channels. + B, P2, C = inputs.shape + + # Infer the patch side length from the number of pixels (P^2). + patch_size = int(P2 ** 0.5) + + input_dtype = inputs.dtype + inputs = inputs.to(dtype=self.dtype) + + # Fetch the pre-computed or cached positional embeddings. + dct = self.fetch_pos(patch_size, inputs.device, self.dtype) + + # Repeat the positional embeddings for each item in the batch. + dct = dct.repeat(B, 1, 1) + + # Concatenate the original input features with the positional embeddings + # along the feature dimension. + inputs = torch.cat((inputs, dct), dim=-1) + + # Project the combined tensor to the target hidden size. + return self.embedder(inputs).to(dtype=input_dtype) + + +class NerfGLUBlock(nn.Module): + """ + A NerfBlock using a Gated Linear Unit (GLU) like MLP. + """ + def __init__(self, hidden_size_s: int, hidden_size_x: int, mlp_ratio, dtype=None, device=None, operations=None): + super().__init__() + # The total number of parameters for the MLP is increased to accommodate + # the gate, value, and output projection matrices. + # We now need to generate parameters for 3 matrices. + total_params = 3 * hidden_size_x**2 * mlp_ratio + self.param_generator = operations.Linear(hidden_size_s, total_params, dtype=dtype, device=device) + self.norm = RMSNorm(hidden_size_x, dtype=dtype, device=device, operations=operations) + self.mlp_ratio = mlp_ratio + + + def forward(self, x: torch.Tensor, s: torch.Tensor) -> torch.Tensor: + batch_size, num_x, hidden_size_x = x.shape + mlp_params = self.param_generator(s) + + # Split the generated parameters into three parts for the gate, value, and output projection. + fc1_gate_params, fc1_value_params, fc2_params = mlp_params.chunk(3, dim=-1) + + # Reshape the parameters into matrices for batch matrix multiplication. 
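# Shape sketch (not from the patch), using the Radiance defaults assumed below
# (in_channels=3, nerf_hidden_size=64, max_freqs=8, patch_size=16): NerfEmbedder
# concatenates max_freqs**2 = 64 DCT-style positional features to every pixel of
# a patch and projects the result to hidden_size_input.
import torch
import comfy.ops
from comfy.ldm.chroma_radiance.layers import NerfEmbedder

embedder = NerfEmbedder(in_channels=3, hidden_size_input=64, max_freqs=8,
                        dtype=torch.float32, operations=comfy.ops.manual_cast)
patch_pixels = torch.rand(2, 16 * 16, 3)                         # (B, P*P, C)
pos = embedder.fetch_pos(16, patch_pixels.device, torch.float32)
print(pos.shape)                                                  # (1, 256, 64) positional features
print(embedder(patch_pixels).shape)                               # (2, 256, 64) projected embedding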
+ fc1_gate = fc1_gate_params.view(batch_size, hidden_size_x, hidden_size_x * self.mlp_ratio) + fc1_value = fc1_value_params.view(batch_size, hidden_size_x, hidden_size_x * self.mlp_ratio) + fc2 = fc2_params.view(batch_size, hidden_size_x * self.mlp_ratio, hidden_size_x) + + # Normalize the generated weight matrices as in the original implementation. + fc1_gate = torch.nn.functional.normalize(fc1_gate, dim=-2) + fc1_value = torch.nn.functional.normalize(fc1_value, dim=-2) + fc2 = torch.nn.functional.normalize(fc2, dim=-2) + + res_x = x + x = self.norm(x) + + # Apply the final output projection. + x = torch.bmm(torch.nn.functional.silu(torch.bmm(x, fc1_gate)) * torch.bmm(x, fc1_value), fc2) + + return x + res_x + + +class NerfFinalLayer(nn.Module): + def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None): + super().__init__() + self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations) + self.linear = operations.Linear(hidden_size, out_channels, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # RMSNorm normalizes over the last dimension, but our channel dim (C) is at dim=1. + # So we temporarily move the channel dimension to the end for the norm operation. + return self.linear(self.norm(x.movedim(1, -1))).movedim(-1, 1) + + +class NerfFinalLayerConv(nn.Module): + def __init__(self, hidden_size: int, out_channels: int, dtype=None, device=None, operations=None): + super().__init__() + self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations) + self.conv = operations.Conv2d( + in_channels=hidden_size, + out_channels=out_channels, + kernel_size=3, + padding=1, + dtype=dtype, + device=device, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # RMSNorm normalizes over the last dimension, but our channel dim (C) is at dim=1. + # So we temporarily move the channel dimension to the end for the norm operation. + return self.conv(self.norm(x.movedim(1, -1)).movedim(-1, 1)) diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py new file mode 100644 index 000000000..f7eb7a22e --- /dev/null +++ b/comfy/ldm/chroma_radiance/model.py @@ -0,0 +1,328 @@ +# Credits: +# Original Flux code can be found on: https://github.com/black-forest-labs/flux +# Chroma Radiance adaption referenced from https://github.com/lodestone-rock/flow + +from dataclasses import dataclass +from typing import Optional + +import torch +from torch import Tensor, nn +from einops import repeat +import comfy.ldm.common_dit + +from comfy.ldm.flux.layers import EmbedND + +from comfy.ldm.chroma.model import Chroma, ChromaParams +from comfy.ldm.chroma.layers import ( + DoubleStreamBlock, + SingleStreamBlock, + Approximator, +) +from .layers import ( + NerfEmbedder, + NerfGLUBlock, + NerfFinalLayer, + NerfFinalLayerConv, +) + + +@dataclass +class ChromaRadianceParams(ChromaParams): + patch_size: int + nerf_hidden_size: int + nerf_mlp_ratio: int + nerf_depth: int + nerf_max_freqs: int + # Setting nerf_tile_size to 0 disables tiling. + nerf_tile_size: int + # Currently one of linear (legacy) or conv. + nerf_final_head_type: str + # None means use the same dtype as the model. + nerf_embedder_dtype: Optional[torch.dtype] + + +class ChromaRadiance(Chroma): + """ + Transformer model for flow matching on sequences. 
+ """ + + def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs): + if operations is None: + raise RuntimeError("Attempt to create ChromaRadiance object without setting operations") + nn.Module.__init__(self) + self.dtype = dtype + params = ChromaRadianceParams(**kwargs) + self.params = params + self.patch_size = params.patch_size + self.in_channels = params.in_channels + self.out_channels = params.out_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError( + f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}" + ) + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.in_dim = params.in_dim + self.out_dim = params.out_dim + self.hidden_dim = params.hidden_dim + self.n_layers = params.n_layers + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in_patch = operations.Conv2d( + params.in_channels, + params.hidden_size, + kernel_size=params.patch_size, + stride=params.patch_size, + bias=True, + dtype=dtype, + device=device, + ) + self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device) + # set as nn identity for now, will overwrite it later. + self.distilled_guidance_layer = Approximator( + in_dim=self.in_dim, + hidden_dim=self.hidden_dim, + out_dim=self.out_dim, + n_layers=self.n_layers, + dtype=dtype, device=device, operations=operations + ) + + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + dtype=dtype, device=device, operations=operations + ) + for _ in range(params.depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + dtype=dtype, device=device, operations=operations, + ) + for _ in range(params.depth_single_blocks) + ] + ) + + # pixel channel concat with DCT + self.nerf_image_embedder = NerfEmbedder( + in_channels=params.in_channels, + hidden_size_input=params.nerf_hidden_size, + max_freqs=params.nerf_max_freqs, + dtype=params.nerf_embedder_dtype or dtype, + device=device, + operations=operations, + ) + + self.nerf_blocks = nn.ModuleList([ + NerfGLUBlock( + hidden_size_s=params.hidden_size, + hidden_size_x=params.nerf_hidden_size, + mlp_ratio=params.nerf_mlp_ratio, + dtype=dtype, + device=device, + operations=operations, + ) for _ in range(params.nerf_depth) + ]) + + if params.nerf_final_head_type == "linear": + self.nerf_final_layer = NerfFinalLayer( + params.nerf_hidden_size, + out_channels=params.in_channels, + dtype=dtype, + device=device, + operations=operations, + ) + elif params.nerf_final_head_type == "conv": + self.nerf_final_layer_conv = NerfFinalLayerConv( + params.nerf_hidden_size, + out_channels=params.in_channels, + dtype=dtype, + device=device, + operations=operations, + ) + else: + errstr = f"Unsupported nerf_final_head_type {params.nerf_final_head_type}" + raise ValueError(errstr) + + self.skip_mmdit = [] + self.skip_dit = [] + self.lite = False + + @property + def _nerf_final_layer(self) -> nn.Module: + if self.params.nerf_final_head_type == "linear": + return self.nerf_final_layer + if self.params.nerf_final_head_type == "conv": + return self.nerf_final_layer_conv + # 
Impossible to get here as we raise an error on unexpected types on initialization. + raise NotImplementedError + + def img_in(self, img: Tensor) -> Tensor: + img = self.img_in_patch(img) # -> [B, Hidden, H/P, W/P] + # flatten into a sequence for the transformer. + return img.flatten(2).transpose(1, 2) # -> [B, NumPatches, Hidden] + + def forward_nerf( + self, + img_orig: Tensor, + img_out: Tensor, + params: ChromaRadianceParams, + ) -> Tensor: + B, C, H, W = img_orig.shape + num_patches = img_out.shape[1] + patch_size = params.patch_size + + # Store the raw pixel values of each patch for the NeRF head later. + # unfold creates patches: [B, C * P * P, NumPatches] + nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size) + nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P] + + if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size: + # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than + # the tile size. + img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params) + else: + # Reshape for per-patch processing + nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size) + nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2) + + # Get DCT-encoded pixel embeddings [pixel-dct] + img_dct = self.nerf_image_embedder(nerf_pixels) + + # Pass through the dynamic MLP blocks (the NeRF) + for block in self.nerf_blocks: + img_dct = block(img_dct, nerf_hidden) + + # Reassemble the patches into the final image. + img_dct = img_dct.transpose(1, 2) # -> [B*NumPatches, C, P*P] + # Reshape to combine with batch dimension for fold + img_dct = img_dct.reshape(B, num_patches, -1) # -> [B, NumPatches, C*P*P] + img_dct = img_dct.transpose(1, 2) # -> [B, C*P*P, NumPatches] + img_dct = nn.functional.fold( + img_dct, + output_size=(H, W), + kernel_size=patch_size, + stride=patch_size, + ) + return self._nerf_final_layer(img_dct) + + def forward_tiled_nerf( + self, + nerf_hidden: Tensor, + nerf_pixels: Tensor, + batch: int, + channels: int, + num_patches: int, + patch_size: int, + params: ChromaRadianceParams, + ) -> Tensor: + """ + Processes the NeRF head in tiles to save memory. + nerf_hidden has shape [B, L, D] + nerf_pixels has shape [B, L, C * P * P] + """ + tile_size = params.nerf_tile_size + output_tiles = [] + # Iterate over the patches in tiles. The dimension L (num_patches) is at index 1. 
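# Plain-PyTorch sketch (not from the patch) of the unfold/fold round trip that
# forward_nerf() relies on: with kernel_size equal to stride the patches do not
# overlap, so fold() exactly rebuilds the image from the per-patch outputs.
# Assumed sizes: a 64x64 RGB image and the Radiance patch size of 16.
import torch
import torch.nn.functional as F

img = torch.rand(1, 3, 64, 64)
patches = F.unfold(img, kernel_size=16, stride=16)    # [B, C*P*P, NumPatches] = [1, 768, 16]
rebuilt = F.fold(patches, output_size=(64, 64), kernel_size=16, stride=16)
assert torch.equal(rebuilt, img)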
+ for i in range(0, num_patches, tile_size): + end = min(i + tile_size, num_patches) + + # Slice the current tile from the input tensors + nerf_hidden_tile = nerf_hidden[:, i:end, :] + nerf_pixels_tile = nerf_pixels[:, i:end, :] + + # Get the actual number of patches in this tile (can be smaller for the last tile) + num_patches_tile = nerf_hidden_tile.shape[1] + + # Reshape the tile for per-patch processing + # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D] + nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size) + # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C] + nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2) + + # get DCT-encoded pixel embeddings [pixel-dct] + img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile) + + # pass through the dynamic MLP blocks (the NeRF) + for block in self.nerf_blocks: + img_dct_tile = block(img_dct_tile, nerf_hidden_tile) + + output_tiles.append(img_dct_tile) + + # Concatenate the processed tiles along the patch dimension + return torch.cat(output_tiles, dim=0) + + def radiance_get_override_params(self, overrides: dict) -> ChromaRadianceParams: + params = self.params + if not overrides: + return params + params_dict = {k: getattr(params, k) for k in params.__dataclass_fields__} + nullable_keys = frozenset(("nerf_embedder_dtype",)) + bad_keys = tuple(k for k in overrides if k not in params_dict) + if bad_keys: + e = f"Unknown key(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}" + raise ValueError(e) + bad_keys = tuple( + k + for k, v in overrides.items() + if type(v) != type(getattr(params, k)) and (v is not None or k not in nullable_keys) + ) + if bad_keys: + e = f"Invalid value(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}" + raise ValueError(e) + # At this point it's all valid keys and values so we can merge with the existing params. 
+ params_dict |= overrides + return params.__class__(**params_dict) + + def _forward( + self, + x: Tensor, + timestep: Tensor, + context: Tensor, + guidance: Optional[Tensor], + control: Optional[dict]=None, + transformer_options: dict={}, + **kwargs: dict, + ) -> Tensor: + bs, c, h, w = x.shape + img = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) + + if img.ndim != 4: + raise ValueError("Input img tensor must be in [B, C, H, W] format.") + if context.ndim != 3: + raise ValueError("Input txt tensors must have 3 dimensions.") + + params = self.radiance_get_override_params(transformer_options.get("chroma_radiance_options", {})) + + h_len = ((h + (self.patch_size // 2)) // self.patch_size) + w_len = ((w + (self.patch_size // 2)) // self.patch_size) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) + + img_out = self.forward_orig( + img, + img_ids, + context, + txt_ids, + timestep, + guidance, + control, + transformer_options, + attn_mask=kwargs.get("attention_mask", None), + ) + return self.forward_nerf(img, img_out, params) diff --git a/comfy/model_base.py b/comfy/model_base.py index 324d89cff..252dfcf69 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -42,6 +42,7 @@ import comfy.ldm.wan.model import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model +import comfy.ldm.chroma_radiance.model import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model @@ -1320,8 +1321,8 @@ class HiDream(BaseModel): return out class Chroma(Flux): - def __init__(self, model_config, model_type=ModelType.FLUX, device=None): - super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma) + def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.chroma.model.Chroma): + super().__init__(model_config, model_type, device=device, unet_model=unet_model) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1331,6 +1332,10 @@ class Chroma(Flux): out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) return out +class ChromaRadiance(Chroma): + def __init__(self, model_config, model_type=ModelType.FLUX, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma_radiance.model.ChromaRadiance) + class ACEStep(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.model.ACEStepTransformer2DModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index fe983cede..03d44f65e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -174,7 +174,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["guidance_embed"] = len(guidance_keys) > 0 return dit_config - if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and '{}img_in.weight'.format(key_prefix) in state_dict_keys: #Flux + if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in 
state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) dit_config = {} dit_config["image_model"] = "flux" dit_config["in_channels"] = 16 @@ -204,6 +204,18 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["out_dim"] = 3072 dit_config["hidden_dim"] = 5120 dit_config["n_layers"] = 5 + if f"{key_prefix}nerf_blocks.0.norm.scale" in state_dict_keys: #Chroma Radiance + dit_config["image_model"] = "chroma_radiance" + dit_config["in_channels"] = 3 + dit_config["out_channels"] = 3 + dit_config["patch_size"] = 16 + dit_config["nerf_hidden_size"] = 64 + dit_config["nerf_mlp_ratio"] = 4 + dit_config["nerf_depth"] = 4 + dit_config["nerf_max_freqs"] = 8 + dit_config["nerf_tile_size"] = 32 + dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" + dit_config["nerf_embedder_dtype"] = torch.float32 else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys return dit_config diff --git a/comfy/sd.py b/comfy/sd.py index f8f1a89e8..cb92802e9 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -785,6 +785,66 @@ class VAE: except: return None +# "Fake" VAE that converts from IMAGE B, H, W, C and values on the scale of 0..1 +# to LATENT B, C, H, W and values on the scale of -1..1. +class PixelspaceConversionVAE: + def __init__(self, size_increment: int=16): + self.intermediate_device = comfy.model_management.intermediate_device() + self.size_increment = size_increment + + def vae_encode_crop_pixels(self, pixels: torch.Tensor) -> torch.Tensor: + if self.size_increment == 1: + return pixels + dims = pixels.shape[1:-1] + for d in range(len(dims)): + d_adj = (dims[d] // self.size_increment) * self.size_increment + if d_adj == d: + continue + d_offset = (dims[d] % self.size_increment) // 2 + pixels = pixels.narrow(d + 1, d_offset, d_adj) + return pixels + + def encode(self, pixels: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + if pixels.ndim == 3: + pixels = pixels.unsqueeze(0) + elif pixels.ndim != 4: + raise ValueError("Unexpected input image shape") + # Ensure the image has spatial dimensions that are multiples of 16. + pixels = self.vae_encode_crop_pixels(pixels) + h, w, c = pixels.shape[1:] + if h < self.size_increment or w < self.size_increment: + raise ValueError(f"Image inputs must have height/width of at least {self.size_increment} pixel(s).") + pixels= pixels[..., :3] + if c == 1: + pixels = pixels.expand(-1, -1, -1, 3) + elif c != 3: + raise ValueError("Unexpected number of channels in input image") + # Rescale to -1..1 and move the channel dimension to position 1. + latent = pixels.to(device=self.intermediate_device, dtype=torch.float32, copy=True) + latent = latent.clamp_(0, 1).movedim(-1, 1).contiguous() + latent -= 0.5 + latent *= 2 + return latent.clamp_(-1, 1) + + def decode(self, samples: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + # Rescale to 0..1 and move the channel dimension to the end. + img = samples.to(device=self.intermediate_device, dtype=torch.float32, copy=True) + img = img.clamp_(-1, 1).movedim(1, -1).contiguous() + img += 1.0 + img *= 0.5 + return img.clamp_(0, 1) + + encode_tiled = encode + decode_tiled = decode + + @classmethod + def spacial_compression_decode(cls) -> int: + # This just exists so the tiled VAE nodes don't crash. 
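# Standalone sketch (not from the patch) of the mapping this "fake" VAE
# performs: IMAGE tensors are [B, H, W, C] in 0..1, pixel-space LATENTs are
# [B, C, H, W] in -1..1, so encode/decode reduce to a movedim plus an affine
# rescale.
import torch

image = torch.rand(1, 64, 64, 3)                  # [B, H, W, C], 0..1
latent = image.movedim(-1, 1) * 2.0 - 1.0         # [B, C, H, W], -1..1
restored = (latent.movedim(1, -1) + 1.0) * 0.5    # back to [B, H, W, C], 0..1
assert torch.allclose(image, restored, atol=1e-6)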
+ return 1 + + spacial_compression_encode = spacial_compression_decode + temporal_compression_decode = spacial_compression_decode + class StyleModel: def __init__(self, model, device="cpu"): self.model = model diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 472ea0ae9..be36b5dfe 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1205,6 +1205,19 @@ class Chroma(supported_models_base.BASE): t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.pixart_te(**t5_detect)) +class ChromaRadiance(Chroma): + unet_config = { + "image_model": "chroma_radiance", + } + + latent_format = comfy.latent_formats.ChromaRadiance + + # Pixel-space model, no spatial compression for model input. + memory_usage_factor = 0.0325 + + def get_model(self, state_dict, prefix="", device=None): + return model_base.ChromaRadiance(self, device=device) + class ACEStep(supported_models_base.BASE): unet_config = { "audio_model": "ace", @@ -1338,6 +1351,6 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_chroma_radiance.py b/comfy_extras/nodes_chroma_radiance.py new file mode 100644 index 000000000..381989818 --- /dev/null +++ b/comfy_extras/nodes_chroma_radiance.py @@ -0,0 +1,114 @@ +from typing_extensions import override +from typing import Callable + +import torch + +import comfy.model_management +from comfy_api.latest import ComfyExtension, io + +import nodes + +class EmptyChromaRadianceLatentImage(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="EmptyChromaRadianceLatentImage", + category="latent/chroma_radiance", + inputs=[ + io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, 
step=16), + io.Int.Input(id="batch_size", default=1, min=1, max=4096), + ], + outputs=[io.Latent().Output()], + ) + + @classmethod + def execute(cls, *, width: int, height: int, batch_size: int=1) -> io.NodeOutput: + latent = torch.zeros((batch_size, 3, height, width), device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples":latent}) + + +class ChromaRadianceOptions(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ChromaRadianceOptions", + category="model_patches/chroma_radiance", + description="Allows setting advanced options for the Chroma Radiance model.", + inputs=[ + io.Model.Input(id="model"), + io.Boolean.Input( + id="preserve_wrapper", + default=True, + tooltip="When enabled, will delegate to an existing model function wrapper if it exists. Generally should be left enabled.", + ), + io.Float.Input( + id="start_sigma", + default=1.0, + min=0.0, + max=1.0, + tooltip="First sigma that these options will be in effect.", + ), + io.Float.Input( + id="end_sigma", + default=0.0, + min=0.0, + max=1.0, + tooltip="Last sigma that these options will be in effect.", + ), + io.Int.Input( + id="nerf_tile_size", + default=-1, + min=-1, + tooltip="Allows overriding the default NeRF tile size. -1 means use the default (32). 0 means use non-tiling mode (may require a lot of VRAM).", + ), + ], + outputs=[io.Model.Output()], + ) + + @classmethod + def execute( + cls, + *, + model: io.Model.Type, + preserve_wrapper: bool, + start_sigma: float, + end_sigma: float, + nerf_tile_size: int, + ) -> io.NodeOutput: + radiance_options = {} + if nerf_tile_size >= 0: + radiance_options["nerf_tile_size"] = nerf_tile_size + + if not radiance_options: + return io.NodeOutput(model) + + old_wrapper = model.model_options.get("model_function_wrapper") + + def model_function_wrapper(apply_model: Callable, args: dict) -> torch.Tensor: + c = args["c"].copy() + sigma = args["timestep"].max().detach().cpu().item() + if end_sigma <= sigma <= start_sigma: + transformer_options = c.get("transformer_options", {}).copy() + transformer_options["chroma_radiance_options"] = radiance_options.copy() + c["transformer_options"] = transformer_options + if not (preserve_wrapper and old_wrapper): + return apply_model(args["input"], args["timestep"], **c) + return old_wrapper(apply_model, args | {"c": c}) + + model = model.clone() + model.set_model_unet_function_wrapper(model_function_wrapper) + return io.NodeOutput(model) + + +class ChromaRadianceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyChromaRadianceLatentImage, + ChromaRadianceOptions, + ] + + +async def comfy_entrypoint() -> ChromaRadianceExtension: + return ChromaRadianceExtension() diff --git a/nodes.py b/nodes.py index 2befb4b75..76b8cbac8 100644 --- a/nodes.py +++ b/nodes.py @@ -730,6 +730,7 @@ class VAELoader: vaes.append("taesd3") if f1_taesd_dec and f1_taesd_enc: vaes.append("taef1") + vaes.append("chroma_radiance") return vaes @staticmethod @@ -772,7 +773,9 @@ class VAELoader: #TODO: scale factor? 
def load_vae(self, vae_name): - if vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: + if vae_name == "chroma_radiance": + return (comfy.sd.PixelspaceConversionVAE(),) + elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: sd = self.load_taesd(vae_name) else: vae_path = folder_paths.get_full_path_or_raise("vae", vae_name) @@ -2322,6 +2325,7 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", + "nodes_chroma_radiance.py", "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", From 80b7c9455bf7afba7a9e95a1eb76b172408ab56c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 13 Sep 2025 15:03:34 -0700 Subject: [PATCH 05/33] Changes to the previous radiance commit. (#9851) --- comfy/ldm/chroma_radiance/model.py | 7 +-- comfy/pixel_space_convert.py | 16 +++++++ comfy/sd.py | 69 +++++------------------------- comfy/supported_models.py | 2 +- nodes.py | 7 +-- 5 files changed, 35 insertions(+), 66 deletions(-) create mode 100644 comfy/pixel_space_convert.py diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py index f7eb7a22e..47aa11b04 100644 --- a/comfy/ldm/chroma_radiance/model.py +++ b/comfy/ldm/chroma_radiance/model.py @@ -306,8 +306,9 @@ class ChromaRadiance(Chroma): params = self.radiance_get_override_params(transformer_options.get("chroma_radiance_options", {})) - h_len = ((h + (self.patch_size // 2)) // self.patch_size) - w_len = ((w + (self.patch_size // 2)) // self.patch_size) + h_len = (img.shape[-2] // self.patch_size) + w_len = (img.shape[-1] // self.patch_size) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) @@ -325,4 +326,4 @@ class ChromaRadiance(Chroma): transformer_options, attn_mask=kwargs.get("attention_mask", None), ) - return self.forward_nerf(img, img_out, params) + return self.forward_nerf(img, img_out, params)[:, :, :h, :w] diff --git a/comfy/pixel_space_convert.py b/comfy/pixel_space_convert.py new file mode 100644 index 000000000..049bbcfb4 --- /dev/null +++ b/comfy/pixel_space_convert.py @@ -0,0 +1,16 @@ +import torch + + +# "Fake" VAE that converts from IMAGE B, H, W, C and values on the scale of 0..1 +# to LATENT B, C, H, W and values on the scale of -1..1. 
+class PixelspaceConversionVAE(torch.nn.Module): + def __init__(self): + super().__init__() + self.pixel_space_vae = torch.nn.Parameter(torch.tensor(1.0)) + + def encode(self, pixels: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + return pixels + + def decode(self, samples: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + return samples + diff --git a/comfy/sd.py b/comfy/sd.py index cb92802e9..2df340739 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -18,6 +18,7 @@ import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import comfy.ldm.hunyuan_video.vae +import comfy.pixel_space_convert import yaml import math import os @@ -516,6 +517,15 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] self.disable_offload = True self.extra_1d_channel = 16 + elif "pixel_space_vae" in sd: + self.first_stage_model = comfy.pixel_space_convert.PixelspaceConversionVAE() + self.memory_used_encode = lambda shape, dtype: (1 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (1 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.downscale_ratio = 1 + self.upscale_ratio = 1 + self.latent_channels = 3 + self.latent_dim = 2 + self.output_channels = 3 else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None @@ -785,65 +795,6 @@ class VAE: except: return None -# "Fake" VAE that converts from IMAGE B, H, W, C and values on the scale of 0..1 -# to LATENT B, C, H, W and values on the scale of -1..1. -class PixelspaceConversionVAE: - def __init__(self, size_increment: int=16): - self.intermediate_device = comfy.model_management.intermediate_device() - self.size_increment = size_increment - - def vae_encode_crop_pixels(self, pixels: torch.Tensor) -> torch.Tensor: - if self.size_increment == 1: - return pixels - dims = pixels.shape[1:-1] - for d in range(len(dims)): - d_adj = (dims[d] // self.size_increment) * self.size_increment - if d_adj == d: - continue - d_offset = (dims[d] % self.size_increment) // 2 - pixels = pixels.narrow(d + 1, d_offset, d_adj) - return pixels - - def encode(self, pixels: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: - if pixels.ndim == 3: - pixels = pixels.unsqueeze(0) - elif pixels.ndim != 4: - raise ValueError("Unexpected input image shape") - # Ensure the image has spatial dimensions that are multiples of 16. - pixels = self.vae_encode_crop_pixels(pixels) - h, w, c = pixels.shape[1:] - if h < self.size_increment or w < self.size_increment: - raise ValueError(f"Image inputs must have height/width of at least {self.size_increment} pixel(s).") - pixels= pixels[..., :3] - if c == 1: - pixels = pixels.expand(-1, -1, -1, 3) - elif c != 3: - raise ValueError("Unexpected number of channels in input image") - # Rescale to -1..1 and move the channel dimension to position 1. - latent = pixels.to(device=self.intermediate_device, dtype=torch.float32, copy=True) - latent = latent.clamp_(0, 1).movedim(-1, 1).contiguous() - latent -= 0.5 - latent *= 2 - return latent.clamp_(-1, 1) - - def decode(self, samples: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: - # Rescale to 0..1 and move the channel dimension to the end. 
- img = samples.to(device=self.intermediate_device, dtype=torch.float32, copy=True) - img = img.clamp_(-1, 1).movedim(1, -1).contiguous() - img += 1.0 - img *= 0.5 - return img.clamp_(0, 1) - - encode_tiled = encode - decode_tiled = decode - - @classmethod - def spacial_compression_decode(cls) -> int: - # This just exists so the tiled VAE nodes don't crash. - return 1 - - spacial_compression_encode = spacial_compression_decode - temporal_compression_decode = spacial_compression_decode class StyleModel: def __init__(self, model, device="cpu"): diff --git a/comfy/supported_models.py b/comfy/supported_models.py index be36b5dfe..557902d11 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1213,7 +1213,7 @@ class ChromaRadiance(Chroma): latent_format = comfy.latent_formats.ChromaRadiance # Pixel-space model, no spatial compression for model input. - memory_usage_factor = 0.0325 + memory_usage_factor = 0.038 def get_model(self, state_dict, prefix="", device=None): return model_base.ChromaRadiance(self, device=device) diff --git a/nodes.py b/nodes.py index 76b8cbac8..5a5fdcb8e 100644 --- a/nodes.py +++ b/nodes.py @@ -730,7 +730,7 @@ class VAELoader: vaes.append("taesd3") if f1_taesd_dec and f1_taesd_enc: vaes.append("taef1") - vaes.append("chroma_radiance") + vaes.append("pixel_space") return vaes @staticmethod @@ -773,8 +773,9 @@ class VAELoader: #TODO: scale factor? def load_vae(self, vae_name): - if vae_name == "chroma_radiance": - return (comfy.sd.PixelspaceConversionVAE(),) + if vae_name == "pixel_space": + sd = {} + sd["pixel_space_vae"] = torch.tensor(1.0) elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: sd = self.load_taesd(vae_name) else: From f228367c5e3906de194968fa9b6fbe7aa9987bfa Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sat, 13 Sep 2025 18:34:21 -0700 Subject: [PATCH 06/33] Make ModuleNotFoundError ImportError instead (#9850) --- comfy/ldm/modules/attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index bf2553c37..9dd1a43c1 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -22,7 +22,7 @@ SAGE_ATTENTION_IS_AVAILABLE = False try: from sageattention import sageattn SAGE_ATTENTION_IS_AVAILABLE = True -except ModuleNotFoundError as e: +except ImportError as e: if model_management.sage_attention_enabled(): if e.name == "sageattention": logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention") @@ -34,7 +34,7 @@ FLASH_ATTENTION_IS_AVAILABLE = False try: from flash_attn import flash_attn_func FLASH_ATTENTION_IS_AVAILABLE = True -except ModuleNotFoundError: +except ImportError: if model_management.flash_attention_enabled(): logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn") exit(-1) From 4f1f26ac6c11b803bbc83cb347178e2f9b5e421b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 14 Sep 2025 01:05:38 -0700 Subject: [PATCH 07/33] Add that hunyuan image is supported to readme. 
(#9857) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8024870c2..3f6cfc2ed 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/) - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) + - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) From 47a9cde5d3045c42f20baafb9855fb96959124f0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Sep 2025 15:10:55 -0700 Subject: [PATCH 08/33] Support the omnigen2 umo lora. (#9886) --- comfy/lora.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/lora.py b/comfy/lora.py index 4a44f1318..36d26293a 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -297,6 +297,12 @@ def model_lora_keys_unet(model, key_map={}): key_lora = k[len("diffusion_model."):-len(".weight")] key_map["{}".format(key_lora)] = k + if isinstance(model, comfy.model_base.Omnigen2): + for k in sdk: + if k.startswith("diffusion_model.") and k.endswith(".weight"): + key_lora = k[len("diffusion_model."):-len(".weight")] + key_map["{}".format(key_lora)] = k + if isinstance(model, comfy.model_base.QwenImage): for k in sdk: if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format From 1a85483da159f2800407ae5a8a45eb0d88ffce2d Mon Sep 17 00:00:00 2001 From: blepping <157360029+blepping@users.noreply.github.com> Date: Mon, 15 Sep 2025 18:05:03 -0600 Subject: [PATCH 09/33] Fix depending on asserts to raise an exception in BatchedBrownianTree and Flash attn module (#9884) Correctly handle the case where w0 is passed by kwargs in BatchedBrownianTree --- comfy/k_diffusion/sampling.py | 35 +++++++++++++++++----------------- comfy/ldm/modules/attention.py | 3 ++- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 2d7e09838..0e2cda291 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -86,24 +86,24 @@ class BatchedBrownianTree: """A wrapper around torchsde.BrownianTree that enables batches of entropy.""" def __init__(self, x, t0, t1, seed=None, **kwargs): - self.cpu_tree = True - if "cpu" in kwargs: - self.cpu_tree = kwargs.pop("cpu") + self.cpu_tree = kwargs.pop("cpu", True) t0, t1, self.sign = self.sort(t0, t1) - w0 = kwargs.get('w0', torch.zeros_like(x)) + w0 = kwargs.pop('w0', None) + if w0 is None: + w0 = torch.zeros_like(x) + self.batched = False if seed is None: - seed = torch.randint(0, 2 ** 63 - 1, []).item() - self.batched = True - try: - assert len(seed) == x.shape[0] + seed = (torch.randint(0, 2 ** 63 - 1, ()).item(),) + elif isinstance(seed, (tuple, list)): + if len(seed) != x.shape[0]: + raise ValueError("Passing a list or tuple of seeds to BatchedBrownianTree requires a length matching the batch size.") + self.batched = True w0 = w0[0] - except TypeError: - seed = [seed] - self.batched = False - if self.cpu_tree: - self.trees = [torchsde.BrownianTree(t0.cpu(), w0.cpu(), t1.cpu(), entropy=s, **kwargs) for s in seed] else: - self.trees = 
[torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed] + seed = (seed,) + if self.cpu_tree: + t0, w0, t1 = t0.detach().cpu(), w0.detach().cpu(), t1.detach().cpu() + self.trees = tuple(torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed) @staticmethod def sort(a, b): @@ -111,11 +111,10 @@ class BatchedBrownianTree: def __call__(self, t0, t1): t0, t1, sign = self.sort(t0, t1) + device, dtype = t0.device, t0.dtype if self.cpu_tree: - w = torch.stack([tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device) for tree in self.trees]) * (self.sign * sign) - else: - w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign) - + t0, t1 = t0.detach().cpu().float(), t1.detach().cpu().float() + w = torch.stack([tree(t0, t1) for tree in self.trees]).to(device=device, dtype=dtype) * (self.sign * sign) return w if self.batched else w[0] diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 9dd1a43c1..7437e0567 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -600,7 +600,8 @@ def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape mask = mask.unsqueeze(1) try: - assert mask is None + if mask is not None: + raise RuntimeError("Mask must not be set for Flash attention") out = flash_attn_wrapper( q.transpose(1, 2), k.transpose(1, 2), From a39ac59c3e3fddc8b278899814f0bd5371abb11f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:19:50 -0700 Subject: [PATCH 10/33] Add encoder part of whisper large v3 as an audio encoder model. (#9894) Not useful yet but some models use it. --- comfy/audio_encoders/audio_encoders.py | 58 +++++--- comfy/audio_encoders/whisper.py | 186 +++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 20 deletions(-) create mode 100755 comfy/audio_encoders/whisper.py diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 6fb5b08e9..0550b2f9b 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -1,4 +1,5 @@ from .wav2vec2 import Wav2Vec2Model +from .whisper import WhisperLargeV3 import comfy.model_management import comfy.ops import comfy.utils @@ -11,13 +12,18 @@ class AudioEncoderModel(): self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) + model_type = config.pop("model_type") model_config = dict(config) model_config.update({ "dtype": self.dtype, "device": offload_device, "operations": comfy.ops.manual_cast }) - self.model = Wav2Vec2Model(**model_config) + + if model_type == "wav2vec2": + self.model = Wav2Vec2Model(**model_config) + elif model_type == "whisper3": + self.model = WhisperLargeV3(**model_config) self.model.eval() self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) self.model_sample_rate = 16000 @@ -40,33 +46,45 @@ class AudioEncoderModel(): def load_audio_encoder_from_sd(sd, prefix=""): sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""}) - embed_dim = sd["encoder.layer_norm.bias"].shape[0] - if embed_dim == 1024:# large - config = { - "embed_dim": 1024, - "num_heads": 16, - "num_layers": 24, - "conv_norm": True, - "conv_bias": True, - "do_normalize": True, - "do_stable_layer_norm": True + if "encoder.layer_norm.bias" in sd: 
#wav2vec2 + embed_dim = sd["encoder.layer_norm.bias"].shape[0] + if embed_dim == 1024:# large + config = { + "model_type": "wav2vec2", + "embed_dim": 1024, + "num_heads": 16, + "num_layers": 24, + "conv_norm": True, + "conv_bias": True, + "do_normalize": True, + "do_stable_layer_norm": True + } + elif embed_dim == 768: # base + config = { + "model_type": "wav2vec2", + "embed_dim": 768, + "num_heads": 12, + "num_layers": 12, + "conv_norm": False, + "conv_bias": False, + "do_normalize": False, # chinese-wav2vec2-base has this False + "do_stable_layer_norm": False } - elif embed_dim == 768: # base + else: + raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim)) + elif "model.encoder.embed_positions.weight" in sd: + sd = comfy.utils.state_dict_prefix_replace(sd, {"model.": ""}) config = { - "embed_dim": 768, - "num_heads": 12, - "num_layers": 12, - "conv_norm": False, - "conv_bias": False, - "do_normalize": False, # chinese-wav2vec2-base has this False - "do_stable_layer_norm": False + "model_type": "whisper3", } else: - raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim)) + raise RuntimeError("ERROR: audio encoder not supported.") audio_encoder = AudioEncoderModel(config) m, u = audio_encoder.load_sd(sd) if len(m) > 0: logging.warning("missing audio encoder: {}".format(m)) + if len(u) > 0: + logging.warning("unexpected audio encoder: {}".format(u)) return audio_encoder diff --git a/comfy/audio_encoders/whisper.py b/comfy/audio_encoders/whisper.py new file mode 100755 index 000000000..93d3782f1 --- /dev/null +++ b/comfy/audio_encoders/whisper.py @@ -0,0 +1,186 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from typing import Optional +from comfy.ldm.modules.attention import optimized_attention_masked +import comfy.ops + +class WhisperFeatureExtractor(nn.Module): + def __init__(self, n_mels=128, device=None): + super().__init__() + self.sample_rate = 16000 + self.n_fft = 400 + self.hop_length = 160 + self.n_mels = n_mels + self.chunk_length = 30 + self.n_samples = 480000 + + self.mel_spectrogram = torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=0, + f_max=8000, + norm="slaney", + mel_scale="slaney", + ).to(device) + + def __call__(self, audio): + audio = torch.mean(audio, dim=1) + batch_size = audio.shape[0] + processed_audio = [] + + for i in range(batch_size): + aud = audio[i] + if aud.shape[0] > self.n_samples: + aud = aud[:self.n_samples] + elif aud.shape[0] < self.n_samples: + aud = F.pad(aud, (0, self.n_samples - aud.shape[0])) + processed_audio.append(aud) + + audio = torch.stack(processed_audio) + + mel_spec = self.mel_spectrogram(audio.to(self.mel_spectrogram.spectrogram.window.device))[:, :, :-1].to(audio.device) + + log_mel_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_mel_spec = torch.maximum(log_mel_spec, log_mel_spec.max() - 8.0) + log_mel_spec = (log_mel_spec + 4.0) / 4.0 + + return log_mel_spec + + +class MultiHeadAttention(nn.Module): + def __init__(self, d_model: int, n_heads: int, dtype=None, device=None, operations=None): + super().__init__() + assert d_model % n_heads == 0 + + self.d_model = d_model + self.n_heads = n_heads + self.d_k = d_model // n_heads + + self.q_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device) + self.k_proj = operations.Linear(d_model, d_model, bias=False, dtype=dtype, 
device=device) + self.v_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device) + self.out_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, seq_len, _ = query.shape + + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + + attn_output = optimized_attention_masked(q, k, v, self.n_heads, mask) + attn_output = self.out_proj(attn_output) + + return attn_output + + +class EncoderLayer(nn.Module): + def __init__(self, d_model: int, n_heads: int, d_ff: int, dtype=None, device=None, operations=None): + super().__init__() + + self.self_attn = MultiHeadAttention(d_model, n_heads, dtype=dtype, device=device, operations=operations) + self.self_attn_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device) + + self.fc1 = operations.Linear(d_model, d_ff, dtype=dtype, device=device) + self.fc2 = operations.Linear(d_ff, d_model, dtype=dtype, device=device) + self.final_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device) + + def forward( + self, + x: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + residual = x + x = self.self_attn_layer_norm(x) + x = self.self_attn(x, x, x, attention_mask) + x = residual + x + + residual = x + x = self.final_layer_norm(x) + x = self.fc1(x) + x = F.gelu(x) + x = self.fc2(x) + x = residual + x + + return x + + +class AudioEncoder(nn.Module): + def __init__( + self, + n_mels: int = 128, + n_ctx: int = 1500, + n_state: int = 1280, + n_head: int = 20, + n_layer: int = 32, + dtype=None, + device=None, + operations=None + ): + super().__init__() + + self.conv1 = operations.Conv1d(n_mels, n_state, kernel_size=3, padding=1, dtype=dtype, device=device) + self.conv2 = operations.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1, dtype=dtype, device=device) + + self.embed_positions = operations.Embedding(n_ctx, n_state, dtype=dtype, device=device) + + self.layers = nn.ModuleList([ + EncoderLayer(n_state, n_head, n_state * 4, dtype=dtype, device=device, operations=operations) + for _ in range(n_layer) + ]) + + self.layer_norm = operations.LayerNorm(n_state, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + + x = x.transpose(1, 2) + + x = x + comfy.ops.cast_to_input(self.embed_positions.weight[:, :x.shape[1]], x) + + all_x = () + for layer in self.layers: + all_x += (x,) + x = layer(x) + + x = self.layer_norm(x) + all_x += (x,) + return x, all_x + + +class WhisperLargeV3(nn.Module): + def __init__( + self, + n_mels: int = 128, + n_audio_ctx: int = 1500, + n_audio_state: int = 1280, + n_audio_head: int = 20, + n_audio_layer: int = 32, + dtype=None, + device=None, + operations=None + ): + super().__init__() + + self.feature_extractor = WhisperFeatureExtractor(n_mels=n_mels, device=device) + + self.encoder = AudioEncoder( + n_mels, n_audio_ctx, n_audio_state, n_audio_head, n_audio_layer, + dtype=dtype, device=device, operations=operations + ) + + def forward(self, audio): + mel = self.feature_extractor(audio) + x, all_x = self.encoder(mel) + return x, all_x From e42682b24ef033a93001ba27cc5c5aa461a61d8d Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:21:14 +1000 Subject: [PATCH 11/33] Reduce Peak WAN inference VRAM usage (#9898) * flux: Do 
the xq and xk ropes one at a time This was doing independendent interleaved tensor math on the q and k tensors, leading to the holding of more than the minimum intermediates in VRAM. On a bad day, it would VRAM OOM on xk intermediates. Do everything q and then everything k, so torch can garbage collect all of qs intermediates before k allocates its intermediates. This reduces peak VRAM usage for some WAN2.2 inferences (at least). * wan: Optimize qkv intermediates on attention As commented. The former logic computed independent pieces of QKV in parallel which help more inference intermediates in VRAM spiking VRAM usage. Fully roping Q and garbage collecting the intermediates before touching K reduces the peak inference VRAM usage. --- comfy/ldm/flux/math.py | 11 +++++------ comfy/ldm/wan/model.py | 22 +++++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index 4d743cda2..fb7cd7586 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -35,11 +35,10 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor: out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2) return out.to(dtype=torch.float32, device=pos.device) +def apply_rope1(x: Tensor, freqs_cis: Tensor): + x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2) + x_out = freqs_cis[..., 0] * x_[..., 0] + freqs_cis[..., 1] * x_[..., 1] + return x_out.reshape(*x.shape).type_as(x) def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): - xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2) - xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2) - xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] - xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] - return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk) - + return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 63472ada2..67dcf8f1e 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -8,7 +8,7 @@ from einops import rearrange from comfy.ldm.modules.attention import optimized_attention from comfy.ldm.flux.layers import EmbedND -from comfy.ldm.flux.math import apply_rope +from comfy.ldm.flux.math import apply_rope1 import comfy.ldm.common_dit import comfy.model_management import comfy.patcher_extension @@ -60,20 +60,24 @@ class WanSelfAttention(nn.Module): """ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim - # query, key, value function - def qkv_fn(x): + def qkv_fn_q(x): q = self.norm_q(self.q(x)).view(b, s, n, d) - k = self.norm_k(self.k(x)).view(b, s, n, d) - v = self.v(x).view(b, s, n * d) - return q, k, v + return apply_rope1(q, freqs) - q, k, v = qkv_fn(x) - q, k = apply_rope(q, k, freqs) + def qkv_fn_k(x): + k = self.norm_k(self.k(x)).view(b, s, n, d) + return apply_rope1(k, freqs) + + #These two are VRAM hogs, so we want to do all of q computation and + #have pytorch garbage collect the intermediates on the sub function + #return before we touch k + q = qkv_fn_q(x) + k = qkv_fn_k(x) x = optimized_attention( q.view(b, s, n * d), k.view(b, s, n * d), - v, + self.v(x).view(b, s, n * d), heads=self.num_heads, transformer_options=transformer_options, ) From 9288c78fc5fae74d3fa7787736dea442e996303f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:12:48 -0700 Subject: [PATCH 12/33] Support the HuMo model. 
(#9903) --- comfy/audio_encoders/audio_encoders.py | 1 + comfy/ldm/wan/model.py | 259 ++++++++++++++++++++++++- comfy/model_base.py | 17 ++ comfy/model_detection.py | 2 + comfy/supported_models.py | 12 +- comfy_extras/nodes_wan.py | 98 ++++++++++ 6 files changed, 383 insertions(+), 6 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 0550b2f9b..46ef21c95 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -41,6 +41,7 @@ class AudioEncoderModel(): outputs = {} outputs["encoded_audio"] = out outputs["encoded_audio_all_layers"] = all_layers + outputs["audio_samples"] = audio.shape[2] return outputs diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 67dcf8f1e..b3b7da5d5 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -34,7 +34,9 @@ class WanSelfAttention(nn.Module): num_heads, window_size=(-1, -1), qk_norm=True, - eps=1e-6, operation_settings={}): + eps=1e-6, + kv_dim=None, + operation_settings={}): assert dim % num_heads == 0 super().__init__() self.dim = dim @@ -43,11 +45,13 @@ class WanSelfAttention(nn.Module): self.window_size = window_size self.qk_norm = qk_norm self.eps = eps + if kv_dim is None: + kv_dim = dim # layers self.q = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) - self.k = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) - self.v = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.k = operation_settings.get("operations").Linear(kv_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.v = operation_settings.get("operations").Linear(kv_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.o = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.norm_q = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() self.norm_k = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() @@ -402,6 +406,7 @@ class WanModel(torch.nn.Module): eps=1e-6, flf_pos_embed_token_number=None, in_dim_ref_conv=None, + wan_attn_block_class=WanAttentionBlock, image_model=None, device=None, dtype=None, @@ -479,8 +484,8 @@ class WanModel(torch.nn.Module): # blocks cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn' self.blocks = nn.ModuleList([ - WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads, - window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings) + wan_attn_block_class(cross_attn_type, dim, ffn_dim, num_heads, + window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings) for _ in range(num_layers) ]) @@ -1325,3 +1330,247 @@ class WanModel_S2V(WanModel): # unpatchify x = self.unpatchify(x, grid_sizes) return x + + +class WanT2VCrossAttentionGather(WanSelfAttention): + + def forward(self, x, context, transformer_options={}, **kwargs): + r""" + Args: + x(Tensor): Shape [B, L1, 
C] - video tokens + context(Tensor): Shape [B, L2, C] - audio tokens with shape [B, frames*16, 1536] + """ + b, n, d = x.size(0), self.num_heads, self.head_dim + + q = self.norm_q(self.q(x)) + k = self.norm_k(self.k(context)) + v = self.v(context) + + # Handle audio temporal structure (16 tokens per frame) + k = k.reshape(-1, 16, n, d).transpose(1, 2) + v = v.reshape(-1, 16, n, d).transpose(1, 2) + + # Handle video spatial structure + q = q.reshape(k.shape[0], -1, n, d).transpose(1, 2) + + x = optimized_attention(q, k, v, heads=self.num_heads, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options) + + x = x.transpose(1, 2).view(b, -1, n, d).flatten(2) + x = self.o(x) + return x + + +class AudioCrossAttentionWrapper(nn.Module): + def __init__(self, dim, kv_dim, num_heads, qk_norm=True, eps=1e-6, operation_settings={}): + super().__init__() + + self.audio_cross_attn = WanT2VCrossAttentionGather(dim, num_heads, qk_norm, kv_dim, eps, operation_settings=operation_settings) + self.norm1_audio = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, x, audio, transformer_options={}): + x = x + self.audio_cross_attn(self.norm1_audio(x), audio, transformer_options=transformer_options) + return x + + +class WanAttentionBlockAudio(WanAttentionBlock): + + def __init__(self, + cross_attn_type, + dim, + ffn_dim, + num_heads, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=False, + eps=1e-6, operation_settings={}): + super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, operation_settings) + self.audio_cross_attn_wrapper = AudioCrossAttentionWrapper(dim, 1536, num_heads, qk_norm, eps, operation_settings=operation_settings) + + def forward( + self, + x, + e, + freqs, + context, + context_img_len=257, + audio=None, + transformer_options={}, + ): + r""" + Args: + x(Tensor): Shape [B, L, C] + e(Tensor): Shape [B, 6, C] + freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] + """ + # assert e.dtype == torch.float32 + + if e.ndim < 4: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) + else: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2) + # assert e[0].dtype == torch.float32 + + # self-attention + y = self.self_attn( + torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), + freqs, transformer_options=transformer_options) + + x = torch.addcmul(x, y, repeat_e(e[2], x)) + + # cross-attention & ffn + x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options) + if audio is not None: + x = self.audio_cross_attn_wrapper(x, audio, transformer_options=transformer_options) + y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x))) + x = torch.addcmul(x, y, repeat_e(e[5], x)) + return x + +class DummyAdapterLayer(nn.Module): + def __init__(self, layer): + super().__init__() + self.layer = layer + + def forward(self, *args, **kwargs): + return self.layer(*args, **kwargs) + + +class AudioProjModel(nn.Module): + def __init__( + self, + seq_len=5, + blocks=13, # add a new parameter blocks + channels=768, # add a new parameter channels + intermediate_dim=512, + output_dim=1536, + context_tokens=16, + device=None, + dtype=None, + operations=None, + ): + super().__init__() + 
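+        # One audio window of seq_len frames x blocks wav2vec layers x channels
+        # is flattened into a single vector (input_dim below) and projected to
+        # context_tokens tokens of size output_dim for each frame.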
+ self.seq_len = seq_len + self.blocks = blocks + self.channels = channels + self.input_dim = seq_len * blocks * channels # update input_dim to be the product of blocks and channels. + self.intermediate_dim = intermediate_dim + self.context_tokens = context_tokens + self.output_dim = output_dim + + # define multiple linear layers + self.audio_proj_glob_1 = DummyAdapterLayer(operations.Linear(self.input_dim, intermediate_dim, dtype=dtype, device=device)) + self.audio_proj_glob_2 = DummyAdapterLayer(operations.Linear(intermediate_dim, intermediate_dim, dtype=dtype, device=device)) + self.audio_proj_glob_3 = DummyAdapterLayer(operations.Linear(intermediate_dim, context_tokens * output_dim, dtype=dtype, device=device)) + + self.audio_proj_glob_norm = DummyAdapterLayer(operations.LayerNorm(output_dim, dtype=dtype, device=device)) + + def forward(self, audio_embeds): + video_length = audio_embeds.shape[1] + audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c") + batch_size, window_size, blocks, channels = audio_embeds.shape + audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels) + + audio_embeds = torch.relu(self.audio_proj_glob_1(audio_embeds)) + audio_embeds = torch.relu(self.audio_proj_glob_2(audio_embeds)) + + context_tokens = self.audio_proj_glob_3(audio_embeds).reshape(batch_size, self.context_tokens, self.output_dim) + + context_tokens = self.audio_proj_glob_norm(context_tokens) + context_tokens = rearrange(context_tokens, "(bz f) m c -> bz f m c", f=video_length) + + return context_tokens + + +class HumoWanModel(WanModel): + r""" + Wan diffusion backbone supporting both text-to-video and image-to-video. + """ + + def __init__(self, + model_type='humo', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6, + flf_pos_embed_token_number=None, + image_model=None, + audio_token_num=16, + device=None, + dtype=None, + operations=None, + ): + + super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations) + + self.audio_proj = AudioProjModel(seq_len=8, blocks=5, channels=1280, intermediate_dim=512, output_dim=1536, context_tokens=audio_token_num, dtype=dtype, device=device, operations=operations) + + def forward_orig( + self, + x, + t, + context, + freqs=None, + audio_embed=None, + reference_latent=None, + transformer_options={}, + **kwargs, + ): + bs, _, time, height, width = x.shape + + # embeddings + x = self.patch_embedding(x.float()).to(x.dtype) + grid_sizes = x.shape[2:] + x = x.flatten(2).transpose(1, 2) + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + + if reference_latent is not None: + ref = self.patch_embedding(reference_latent.float()).to(x.dtype) + ref = ref.flatten(2).transpose(1, 2) + freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], 
reference_latent.shape[-1], t_start=time, device=x.device, dtype=x.dtype) + x = torch.cat([x, ref], dim=1) + freqs = torch.cat([freqs, freqs_ref], dim=1) + del ref, freqs_ref + + # context + context = self.text_embedding(context) + context_img_len = None + + if audio_embed is not None: + audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2) + else: + audio = None + + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, audio=audio, transformer_options=args["transformer_options"]) + return out + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) + x = out["img"] + else: + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, audio=audio, transformer_options=transformer_options) + + # head + x = self.head(x, e) + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return x diff --git a/comfy/model_base.py b/comfy/model_base.py index 252dfcf69..cf99035da 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1213,6 +1213,23 @@ class WAN21_Camera(WAN21): out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions) return out +class WAN21_HuMo(WAN21): + def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.HumoWanModel) + self.image_to_video = image_to_video + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + + audio_embed = kwargs.get("audio_embed", None) + if audio_embed is not None: + out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) + + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + return out + class WAN22_S2V(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 03d44f65e..72621bed6 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -402,6 +402,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["model_type"] = "camera_2.2" elif '{}casual_audio_encoder.encoder.final_linear.weight'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "s2v" + elif '{}audio_proj.audio_proj_glob_1.layer.bias'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "humo" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 557902d11..213b5b92c 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1073,6 +1073,16 @@ class WAN21_Vace(WAN21_T2V): out = model_base.WAN21_Vace(self, image_to_video=False, device=device) return out +class WAN21_HuMo(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "humo", + } + + def get_model(self, state_dict, prefix="", 
device=None): + out = model_base.WAN21_HuMo(self, image_to_video=False, device=device) + return out + class WAN22_S2V(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1351,6 +1361,6 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 4f73369f5..0b8b55813 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1015,6 +1015,103 @@ class WanSoundImageToVideoExtend(io.ComfyNode): return io.NodeOutput(positive, negative, out_latent) +def get_audio_emb_window(audio_emb, frame_num, frame0_idx, audio_shift=2): + zero_audio_embed = torch.zeros((audio_emb.shape[1], audio_emb.shape[2]), dtype=audio_emb.dtype, device=audio_emb.device) + zero_audio_embed_3 = torch.zeros((3, audio_emb.shape[1], audio_emb.shape[2]), dtype=audio_emb.dtype, device=audio_emb.device) # device=audio_emb.device + iter_ = 1 + (frame_num - 1) // 4 + audio_emb_wind = [] + for lt_i in range(iter_): + if lt_i == 0: + st = frame0_idx + lt_i - 2 + ed = frame0_idx + lt_i + 3 + wind_feat = torch.stack([ + audio_emb[i] if (0 <= i < audio_emb.shape[0]) else zero_audio_embed + for i in range(st, ed) + ], dim=0) + wind_feat = torch.cat((zero_audio_embed_3, wind_feat), dim=0) + else: + st = frame0_idx + 1 + 4 * (lt_i - 1) - audio_shift + ed = frame0_idx + 1 + 4 * lt_i + audio_shift + wind_feat = torch.stack([ + audio_emb[i] if (0 <= i < audio_emb.shape[0]) else zero_audio_embed + for i in range(st, ed) + ], dim=0) + audio_emb_wind.append(wind_feat) + audio_emb_wind = torch.stack(audio_emb_wind, dim=0) + + return audio_emb_wind, ed - audio_shift + + +class WanHuMoImageToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanHuMoImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, 
max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.AudioEncoderOutput.Input("audio_encoder_output", optional=True), + io.Image.Input("ref_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None) -> io.NodeOutput: + latent_t = ((length - 1) // 4) + 1 + latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [torch.zeros_like(ref_latent)]}, append=True) + else: + zero_latent = torch.zeros([batch_size, 16, 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [zero_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [zero_latent]}, append=True) + + if audio_encoder_output is not None: + audio_emb = torch.stack(audio_encoder_output["encoded_audio_all_layers"], dim=2) + audio_len = audio_encoder_output["audio_samples"] // 640 + audio_emb = audio_emb[:, :audio_len * 2] + + feat0 = linear_interpolation(audio_emb[:, :, 0: 8].mean(dim=2), 50, 25) + feat1 = linear_interpolation(audio_emb[:, :, 8: 16].mean(dim=2), 50, 25) + feat2 = linear_interpolation(audio_emb[:, :, 16: 24].mean(dim=2), 50, 25) + feat3 = linear_interpolation(audio_emb[:, :, 24: 32].mean(dim=2), 50, 25) + feat4 = linear_interpolation(audio_emb[:, :, 32], 50, 25) + audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0] # [T, 5, 1280] + audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0) + + # pad for ref latent + zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype) + audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0) + + audio_emb = audio_emb.unsqueeze(0) + audio_emb_neg = torch.zeros_like(audio_emb) + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_emb_neg}) + else: + zero_audio = torch.zeros([batch_size, latent_t + 1, 8, 5, 1280], device=comfy.model_management.intermediate_device()) + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": zero_audio}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": zero_audio}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod def define_schema(cls): @@ -1075,6 +1172,7 @@ class WanExtension(ComfyExtension): WanPhantomSubjectToVideo, WanSoundImageToVideo, WanSoundImageToVideoExtend, + 
WanHuMoImageToVideo, Wan22ImageToVideoLatent, ] From dd611a7700956f45f393dee32fb8505de176dc66 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:39:24 -0700 Subject: [PATCH 13/33] Support the HuMo 17B model. (#9912) --- comfy/ldm/wan/model.py | 2 +- comfy/model_base.py | 29 ++++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index b3b7da5d5..9cf3c171d 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1364,7 +1364,7 @@ class AudioCrossAttentionWrapper(nn.Module): def __init__(self, dim, kv_dim, num_heads, qk_norm=True, eps=1e-6, operation_settings={}): super().__init__() - self.audio_cross_attn = WanT2VCrossAttentionGather(dim, num_heads, qk_norm, kv_dim, eps, operation_settings=operation_settings) + self.audio_cross_attn = WanT2VCrossAttentionGather(dim, num_heads, qk_norm=qk_norm, kv_dim=kv_dim, eps=eps, operation_settings=operation_settings) self.norm1_audio = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) def forward(self, x, audio, transformer_options={}): diff --git a/comfy/model_base.py b/comfy/model_base.py index cf99035da..70b67b7c1 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1220,14 +1220,37 @@ class WAN21_HuMo(WAN21): def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) + noise = kwargs.get("noise", None) audio_embed = kwargs.get("audio_embed", None) if audio_embed is not None: out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) - reference_latents = kwargs.get("reference_latents", None) - if reference_latents is not None: - out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + if "c_concat" not in out: # 1.7B model + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + else: + noise_shape = list(noise.shape) + noise_shape[1] += 4 + concat_latent = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype) + zero_vae_values_first = torch.tensor([0.8660, -0.4326, -0.0017, -0.4884, -0.5283, 0.9207, -0.9896, 0.4433, -0.5543, -0.0113, 0.5753, -0.6000, -0.8346, -0.3497, -0.1926, -0.6938]).view(1, 16, 1, 1, 1) + zero_vae_values_second = torch.tensor([1.0869, -1.2370, 0.0206, -0.4357, -0.6411, 2.0307, -1.5972, 1.2659, -0.8595, -0.4654, 0.9638, -1.6330, -1.4310, -0.1098, -0.3856, -1.4583]).view(1, 16, 1, 1, 1) + zero_vae_values = torch.tensor([0.8642, -1.8583, 0.1577, 0.1350, -0.3641, 2.5863, -1.9670, 1.6065, -1.0475, -0.8678, 1.1734, -1.8138, -1.5933, -0.7721, -0.3289, -1.3745]).view(1, 16, 1, 1, 1) + concat_latent[:, 4:] = zero_vae_values + concat_latent[:, 4:, :1] = zero_vae_values_first + concat_latent[:, 4:, 1:2] = zero_vae_values_second + out['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent) + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + ref_latent = self.process_latent_in(reference_latents[-1]) + ref_latent_shape = list(ref_latent.shape) + ref_latent_shape[1] += 4 + ref_latent_shape[1] + ref_latent_full = torch.zeros(ref_latent_shape, device=ref_latent.device, dtype=ref_latent.dtype) + ref_latent_full[:, 20:] = ref_latent + ref_latent_full[:, 16:20] = 1.0 + out['reference_latent'] = 
comfy.conds.CONDRegular(ref_latent_full) + return out class WAN22_S2V(WAN21): From 8d6653fca676a08df3e11654672fed92a183d147 Mon Sep 17 00:00:00 2001 From: DELUXA Date: Fri, 19 Sep 2025 02:50:37 +0300 Subject: [PATCH 14/33] Enable fp8 ops by default on gfx1200 (#9926) --- comfy/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index bbfc3c7a1..d880f1970 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -348,7 +348,7 @@ try: # if any((a in arch) for a in ["gfx1201"]): # ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): - if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches + if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True except: From 1ea8c540640913b247248e46c907fb9b92a9dd4b Mon Sep 17 00:00:00 2001 From: Jodh Singh Date: Thu, 18 Sep 2025 19:51:16 -0400 Subject: [PATCH 15/33] make kernel of same type as image to avoid mismatch issues (#9932) --- comfy_extras/nodes_post_processing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index cb1a0d883..ed7a07152 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -233,6 +233,7 @@ class Sharpen: kernel_size = sharpen_radius * 2 + 1 kernel = gaussian_kernel(kernel_size, sigma, device=image.device) * -(alpha*10) + kernel = kernel.to(dtype=image.dtype) center = kernel_size // 2 kernel[center, center] = kernel[center, center] - kernel.sum() + 1.0 kernel = kernel.repeat(channels, 1, 1).unsqueeze(1) From 24b0fce099c56d18ceb1f4f6b9455fee55e154ce Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:54:16 -0700 Subject: [PATCH 16/33] Do padding of audio embed in model for humo for more flexibility. 
(#9935) --- comfy/ldm/wan/model.py | 3 +++ comfy_extras/nodes_wan.py | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 9cf3c171d..2dac5980c 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1551,6 +1551,9 @@ class HumoWanModel(WanModel): context_img_len = None if audio_embed is not None: + if reference_latent is not None: + zero_audio_pad = torch.zeros(audio_embed.shape[0], reference_latent.shape[-3], *audio_embed.shape[2:], device=audio_embed.device, dtype=audio_embed.dtype) + audio_embed = torch.cat([audio_embed, zero_audio_pad], dim=1) audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2) else: audio = None diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0b8b55813..5f10edcff 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1095,10 +1095,6 @@ class WanHuMoImageToVideo(io.ComfyNode): audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0] # [T, 5, 1280] audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0) - # pad for ref latent - zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype) - audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0) - audio_emb = audio_emb.unsqueeze(0) audio_emb_neg = torch.zeros_like(audio_emb) positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb}) From 711bcf33ee505a997674f4a9125e69d2a5a3c180 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 19 Sep 2025 00:03:30 -0700 Subject: [PATCH 17/33] Bump frontend to 1.26.13 (#9933) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index de5af5fac..79187efaa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.26.11 +comfyui-frontend-package==1.26.13 comfyui-workflow-templates==0.1.81 comfyui-embedded-docs==0.2.6 torch From dc95b6acc0ef4962460592d417db4024f7160586 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 19 Sep 2025 00:07:17 -0700 Subject: [PATCH 18/33] Basic WIP support for the wan animate model. 
(#9939) --- comfy/ldm/wan/model_animate.py | 548 +++++++++++++++++++++++++++++++++ comfy/model_base.py | 18 ++ comfy/model_detection.py | 2 + comfy/supported_models.py | 15 +- comfy_extras/nodes_wan.py | 84 +++++ 5 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 comfy/ldm/wan/model_animate.py diff --git a/comfy/ldm/wan/model_animate.py b/comfy/ldm/wan/model_animate.py new file mode 100644 index 000000000..542f54110 --- /dev/null +++ b/comfy/ldm/wan/model_animate.py @@ -0,0 +1,548 @@ +from torch import nn +import torch +from typing import Tuple, Optional +from einops import rearrange +import torch.nn.functional as F +import math +from .model import WanModel, sinusoidal_embedding_1d +from comfy.ldm.modules.attention import optimized_attention +import comfy.model_management + +class CausalConv1d(nn.Module): + + def __init__(self, chan_in, chan_out, kernel_size=3, stride=1, dilation=1, pad_mode="replicate", operations=None, **kwargs): + super().__init__() + + self.pad_mode = pad_mode + padding = (kernel_size - 1, 0) # T + self.time_causal_padding = padding + + self.conv = operations.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + x = F.pad(x, self.time_causal_padding, mode=self.pad_mode) + return self.conv(x) + + +class FaceEncoder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int, num_heads=int, dtype=None, device=None, operations=None): + factory_kwargs = {"dtype": dtype, "device": device} + super().__init__() + + self.num_heads = num_heads + self.conv1_local = CausalConv1d(in_dim, 1024 * num_heads, 3, stride=1, operations=operations, **factory_kwargs) + self.norm1 = operations.LayerNorm(hidden_dim // 8, elementwise_affine=False, eps=1e-6, **factory_kwargs) + self.act = nn.SiLU() + self.conv2 = CausalConv1d(1024, 1024, 3, stride=2, operations=operations, **factory_kwargs) + self.conv3 = CausalConv1d(1024, 1024, 3, stride=2, operations=operations, **factory_kwargs) + + self.out_proj = operations.Linear(1024, hidden_dim, **factory_kwargs) + self.norm1 = operations.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.norm2 = operations.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.norm3 = operations.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.padding_tokens = nn.Parameter(torch.empty(1, 1, 1, hidden_dim, **factory_kwargs)) + + def forward(self, x): + + x = rearrange(x, "b t c -> b c t") + b, c, t = x.shape + + x = self.conv1_local(x) + x = rearrange(x, "b (n c) t -> (b n) t c", n=self.num_heads) + + x = self.norm1(x) + x = self.act(x) + x = rearrange(x, "b t c -> b c t") + x = self.conv2(x) + x = rearrange(x, "b c t -> b t c") + x = self.norm2(x) + x = self.act(x) + x = rearrange(x, "b t c -> b c t") + x = self.conv3(x) + x = rearrange(x, "b c t -> b t c") + x = self.norm3(x) + x = self.act(x) + x = self.out_proj(x) + x = rearrange(x, "(b n) t c -> b t n c", b=b) + padding = comfy.model_management.cast_to(self.padding_tokens, dtype=x.dtype, device=x.device).repeat(b, x.shape[1], 1, 1) + x = torch.cat([x, padding], dim=-2) + x_local = x.clone() + + return x_local + + +def get_norm_layer(norm_layer, operations=None): + """ + Get the normalization layer. + + Args: + norm_layer (str): The type of normalization layer. + + Returns: + norm_layer (nn.Module): The normalization layer. 
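+
+    Raises:
+        NotImplementedError: If norm_layer is not "layer" or "rms".
+
+    Example (illustrative sketch only, assuming comfy.ops.manual_cast as the ops backend):
+        norm = get_norm_layer("rms", operations=comfy.ops.manual_cast)(head_dim, elementwise_affine=True, eps=1e-6)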
+ """ + if norm_layer == "layer": + return operations.LayerNorm + elif norm_layer == "rms": + return operations.RMSNorm + else: + raise NotImplementedError(f"Norm layer {norm_layer} is not implemented") + + +class FaceAdapter(nn.Module): + def __init__( + self, + hidden_dim: int, + heads_num: int, + qk_norm: bool = True, + qk_norm_type: str = "rms", + num_adapter_layers: int = 1, + dtype=None, device=None, operations=None + ): + + factory_kwargs = {"dtype": dtype, "device": device} + super().__init__() + self.hidden_size = hidden_dim + self.heads_num = heads_num + self.fuser_blocks = nn.ModuleList( + [ + FaceBlock( + self.hidden_size, + self.heads_num, + qk_norm=qk_norm, + qk_norm_type=qk_norm_type, + operations=operations, + **factory_kwargs, + ) + for _ in range(num_adapter_layers) + ] + ) + + def forward( + self, + x: torch.Tensor, + motion_embed: torch.Tensor, + idx: int, + freqs_cis_q: Tuple[torch.Tensor, torch.Tensor] = None, + freqs_cis_k: Tuple[torch.Tensor, torch.Tensor] = None, + ) -> torch.Tensor: + + return self.fuser_blocks[idx](x, motion_embed, freqs_cis_q, freqs_cis_k) + + + +class FaceBlock(nn.Module): + def __init__( + self, + hidden_size: int, + heads_num: int, + qk_norm: bool = True, + qk_norm_type: str = "rms", + qk_scale: float = None, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + operations=None + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.deterministic = False + self.hidden_size = hidden_size + self.heads_num = heads_num + head_dim = hidden_size // heads_num + self.scale = qk_scale or head_dim**-0.5 + + self.linear1_kv = operations.Linear(hidden_size, hidden_size * 2, **factory_kwargs) + self.linear1_q = operations.Linear(hidden_size, hidden_size, **factory_kwargs) + + self.linear2 = operations.Linear(hidden_size, hidden_size, **factory_kwargs) + + qk_norm_layer = get_norm_layer(qk_norm_type, operations=operations) + self.q_norm = ( + qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity() + ) + self.k_norm = ( + qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity() + ) + + self.pre_norm_feat = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.pre_norm_motion = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + def forward( + self, + x: torch.Tensor, + motion_vec: torch.Tensor, + motion_mask: Optional[torch.Tensor] = None, + # use_context_parallel=False, + ) -> torch.Tensor: + + B, T, N, C = motion_vec.shape + T_comp = T + + x_motion = self.pre_norm_motion(motion_vec) + x_feat = self.pre_norm_feat(x) + + kv = self.linear1_kv(x_motion) + q = self.linear1_q(x_feat) + + k, v = rearrange(kv, "B L N (K H D) -> K B L N H D", K=2, H=self.heads_num) + q = rearrange(q, "B S (H D) -> B S H D", H=self.heads_num) + + # Apply QK-Norm if needed. 
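+        # .to(v) casts the normalized q/k back to the value dtype so the
+        # attention call below runs in a single, consistent precision.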
+ q = self.q_norm(q).to(v) + k = self.k_norm(k).to(v) + + k = rearrange(k, "B L N H D -> (B L) N H D") + v = rearrange(v, "B L N H D -> (B L) N H D") + + q = rearrange(q, "B (L S) H D -> (B L) S (H D)", L=T_comp) + + attn = optimized_attention(q, k, v, heads=self.heads_num) + + attn = rearrange(attn, "(B L) S C -> B (L S) C", L=T_comp) + + output = self.linear2(attn) + + if motion_mask is not None: + output = output * rearrange(motion_mask, "B T H W -> B (T H W)").unsqueeze(-1) + + return output + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/ops/upfirdn2d/upfirdn2d.py#L162 +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1): + _, minor, in_h, in_w = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.view(-1, minor, in_h, 1, in_w, 1) + out = F.pad(out, [0, up_x - 1, 0, 0, 0, up_y - 1, 0, 0]) + out = out.view(-1, minor, in_h * up_y, in_w * up_x) + + out = F.pad(out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) + out = out[:, :, max(-pad_y0, 0): out.shape[2] - max(-pad_y1, 0), max(-pad_x0, 0): out.shape[3] - max(-pad_x1, 0)] + + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape(-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1) + return out[:, :, ::down_y, ::down_x] + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + return upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/ops/fused_act/fused_act.py#L81 +class FusedLeakyReLU(torch.nn.Module): + def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5, dtype=None, device=None): + super().__init__() + self.bias = torch.nn.Parameter(torch.empty(1, channel, 1, 1, dtype=dtype, device=device)) + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + return fused_leaky_relu(input, comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype), self.negative_slope, self.scale) + +def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5): + return F.leaky_relu(input + bias, negative_slope) * scale + +class Blur(torch.nn.Module): + def __init__(self, kernel, pad, dtype=None, device=None): + super().__init__() + kernel = torch.tensor(kernel, dtype=dtype, device=device) + kernel = kernel[None, :] * kernel[:, None] + kernel = kernel / kernel.sum() + self.register_buffer('kernel', kernel) + self.pad = pad + + def forward(self, input): + return upfirdn2d(input, comfy.model_management.cast_to(self.kernel, dtype=input.dtype, device=input.device), pad=self.pad) + +#https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L590 +class ScaledLeakyReLU(torch.nn.Module): + def __init__(self, negative_slope=0.2): + super().__init__() + self.negative_slope = negative_slope + + def forward(self, input): + return F.leaky_relu(input, negative_slope=self.negative_slope) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L605 +class EqualConv2d(torch.nn.Module): + def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True, dtype=None, device=None, operations=None): + 
super().__init__() + self.weight = torch.nn.Parameter(torch.empty(out_channel, in_channel, kernel_size, kernel_size, device=device, dtype=dtype)) + self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2) + self.stride = stride + self.padding = padding + self.bias = torch.nn.Parameter(torch.empty(out_channel, device=device, dtype=dtype)) if bias else None + + def forward(self, input): + if self.bias is None: + bias = None + else: + bias = comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype) + + return F.conv2d(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias, stride=self.stride, padding=self.padding) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L134 +class EqualLinear(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None, dtype=None, device=None, operations=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(out_dim, in_dim, device=device, dtype=dtype)) + self.bias = torch.nn.Parameter(torch.empty(out_dim, device=device, dtype=dtype)) if bias else None + self.activation = activation + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + if self.bias is None: + bias = None + else: + bias = comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype) * self.lr_mul + + if self.activation: + out = F.linear(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale) + return fused_leaky_relu(out, bias) + return F.linear(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L654 +class ConvLayer(torch.nn.Sequential): + def __init__(self, in_channel, out_channel, kernel_size, downsample=False, blur_kernel=[1, 3, 3, 1], bias=True, activate=True, dtype=None, device=None, operations=None): + layers = [] + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + layers.append(Blur(blur_kernel, pad=((p + 1) // 2, p // 2))) + stride, padding = 2, 0 + else: + stride, padding = 1, kernel_size // 2 + + layers.append(EqualConv2d(in_channel, out_channel, kernel_size, padding=padding, stride=stride, bias=bias and not activate, dtype=dtype, device=device, operations=operations)) + + if activate: + layers.append(FusedLeakyReLU(out_channel) if bias else ScaledLeakyReLU(0.2)) + + super().__init__(*layers) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L704 +class ResBlock(torch.nn.Module): + def __init__(self, in_channel, out_channel, dtype=None, device=None, operations=None): + super().__init__() + self.conv1 = ConvLayer(in_channel, in_channel, 3, dtype=dtype, device=device, operations=operations) + self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True, dtype=dtype, device=device, operations=operations) + self.skip = ConvLayer(in_channel, out_channel, 1, downsample=True, activate=False, bias=False, dtype=dtype, device=device, operations=operations) + + def forward(self, input): + out = self.conv2(self.conv1(input)) + skip = self.skip(input) + return (out + skip) / math.sqrt(2) + + +class EncoderApp(torch.nn.Module): + def __init__(self, w_dim=512, dtype=None, 
device=None, operations=None): + super().__init__() + kwargs = {"device": device, "dtype": dtype, "operations": operations} + + self.convs = torch.nn.ModuleList([ + ConvLayer(3, 32, 1, **kwargs), ResBlock(32, 64, **kwargs), + ResBlock(64, 128, **kwargs), ResBlock(128, 256, **kwargs), + ResBlock(256, 512, **kwargs), ResBlock(512, 512, **kwargs), + ResBlock(512, 512, **kwargs), ResBlock(512, 512, **kwargs), + EqualConv2d(512, w_dim, 4, padding=0, bias=False, **kwargs) + ]) + + def forward(self, x): + h = x + for conv in self.convs: + h = conv(h) + return h.squeeze(-1).squeeze(-1) + +class Encoder(torch.nn.Module): + def __init__(self, dim=512, motion_dim=20, dtype=None, device=None, operations=None): + super().__init__() + self.net_app = EncoderApp(dim, dtype=dtype, device=device, operations=operations) + self.fc = torch.nn.Sequential(*[EqualLinear(dim, dim, dtype=dtype, device=device, operations=operations) for _ in range(4)] + [EqualLinear(dim, motion_dim, dtype=dtype, device=device, operations=operations)]) + + def encode_motion(self, x): + return self.fc(self.net_app(x)) + +class Direction(torch.nn.Module): + def __init__(self, motion_dim, dtype=None, device=None, operations=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(512, motion_dim, device=device, dtype=dtype)) + self.motion_dim = motion_dim + + def forward(self, input): + stabilized_weight = comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) + 1e-8 * torch.eye(512, self.motion_dim, device=input.device, dtype=input.dtype) + Q, _ = torch.linalg.qr(stabilized_weight.float()) + if input is None: + return Q + return torch.sum(input.unsqueeze(-1) * Q.T.to(input.dtype), dim=1) + +class Synthesis(torch.nn.Module): + def __init__(self, motion_dim, dtype=None, device=None, operations=None): + super().__init__() + self.direction = Direction(motion_dim, dtype=dtype, device=device, operations=operations) + +class Generator(torch.nn.Module): + def __init__(self, style_dim=512, motion_dim=20, dtype=None, device=None, operations=None): + super().__init__() + self.enc = Encoder(style_dim, motion_dim, dtype=dtype, device=device, operations=operations) + self.dec = Synthesis(motion_dim, dtype=dtype, device=device, operations=operations) + + def get_motion(self, img): + motion_feat = self.enc.encode_motion(img) + return self.dec.direction(motion_feat) + +class AnimateWanModel(WanModel): + r""" + Wan diffusion backbone supporting both text-to-video and image-to-video. 
+ """ + + def __init__(self, + model_type='animate', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6, + flf_pos_embed_token_number=None, + motion_encoder_dim=512, + image_model=None, + device=None, + dtype=None, + operations=None, + ): + + super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) + + self.pose_patch_embedding = operations.Conv3d( + 16, dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype + ) + + self.motion_encoder = Generator(style_dim=512, motion_dim=20, device=device, dtype=dtype, operations=operations) + + self.face_adapter = FaceAdapter( + heads_num=self.num_heads, + hidden_dim=self.dim, + num_adapter_layers=self.num_layers // 5, + device=device, dtype=dtype, operations=operations + ) + + self.face_encoder = FaceEncoder( + in_dim=motion_encoder_dim, + hidden_dim=self.dim, + num_heads=4, + device=device, dtype=dtype, operations=operations + ) + + def after_patch_embedding(self, x, pose_latents, face_pixel_values): + if pose_latents is not None: + pose_latents = self.pose_patch_embedding(pose_latents) + x[:, :, 1:] += pose_latents + + if face_pixel_values is None: + return x, None + + b, c, T, h, w = face_pixel_values.shape + face_pixel_values = rearrange(face_pixel_values, "b c t h w -> (b t) c h w") + encode_bs = 8 + face_pixel_values_tmp = [] + for i in range(math.ceil(face_pixel_values.shape[0] / encode_bs)): + face_pixel_values_tmp.append(self.motion_encoder.get_motion(face_pixel_values[i * encode_bs: (i + 1) * encode_bs])) + + motion_vec = torch.cat(face_pixel_values_tmp) + + motion_vec = rearrange(motion_vec, "(b t) c -> b t c", t=T) + motion_vec = self.face_encoder(motion_vec) + + B, L, H, C = motion_vec.shape + pad_face = torch.zeros(B, 1, H, C).type_as(motion_vec) + motion_vec = torch.cat([pad_face, motion_vec], dim=1) + + if motion_vec.shape[1] < x.shape[2]: + B, L, H, C = motion_vec.shape + pad = torch.zeros(B, x.shape[2] - motion_vec.shape[1], H, C).type_as(motion_vec) + motion_vec = torch.cat([motion_vec, pad], dim=1) + else: + motion_vec = motion_vec[:, :x.shape[2]] + return x, motion_vec + + def forward_orig( + self, + x, + t, + context, + clip_fea=None, + pose_latents=None, + face_pixel_values=None, + freqs=None, + transformer_options={}, + **kwargs, + ): + # embeddings + x = self.patch_embedding(x.float()).to(x.dtype) + x, motion_vec = self.after_patch_embedding(x, pose_latents, face_pixel_values) + grid_sizes = x.shape[2:] + x = x.flatten(2).transpose(1, 2) + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + + full_ref = None + if self.ref_conv is not None: + full_ref = kwargs.get("reference_latent", None) + if full_ref is not None: + full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2) + x = torch.concat((full_ref, x), dim=1) + + # context + context = self.text_embedding(context) + + 
context_img_len = None + if clip_fea is not None: + if self.img_emb is not None: + context_clip = self.img_emb(clip_fea) # bs x 257 x dim + context = torch.concat([context_clip, context], dim=1) + context_img_len = clip_fea.shape[-2] + + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"]) + return out + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) + x = out["img"] + else: + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options) + + if i % 5 == 0 and motion_vec is not None: + x = x + self.face_adapter.fuser_blocks[i // 5](x, motion_vec) + + # head + x = self.head(x, e) + + if full_ref is not None: + x = x[:, full_ref.shape[1]:] + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return x diff --git a/comfy/model_base.py b/comfy/model_base.py index 70b67b7c1..b0b9cde7d 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -39,6 +39,7 @@ import comfy.ldm.cosmos.model import comfy.ldm.cosmos.predict2 import comfy.ldm.lumina.model import comfy.ldm.wan.model +import comfy.ldm.wan.model_animate import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model @@ -1253,6 +1254,23 @@ class WAN21_HuMo(WAN21): return out +class WAN22_Animate(WAN21): + def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model_animate.AnimateWanModel) + self.image_to_video = image_to_video + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + + face_video_pixels = kwargs.get("face_video_pixels", None) + if face_video_pixels is not None: + out['face_pixel_values'] = comfy.conds.CONDRegular(face_video_pixels) + + pose_latents = kwargs.get("pose_video_latent", None) + if pose_latents is not None: + out['pose_latents'] = comfy.conds.CONDRegular(self.process_latent_in(pose_latents)) + return out + class WAN22_S2V(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 72621bed6..46415c17a 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -404,6 +404,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["model_type"] = "s2v" elif '{}audio_proj.audio_proj_glob_1.layer.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "humo" + elif '{}face_adapter.fuser_blocks.0.k_norm.weight'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "animate" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 213b5b92c..1fbb6aef4 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1096,6 +1096,19 @@ class WAN22_S2V(WAN21_T2V): out = model_base.WAN22_S2V(self, device=device) 
return out +class WAN22_Animate(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "animate", + } + + def __init__(self, unet_config): + super().__init__(unet_config) + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN22_Animate(self, device=device) + return out + class WAN22_T2V(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1361,6 +1374,6 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 5f10edcff..4187a5619 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1108,6 +1108,89 @@ class WanHuMoImageToVideo(io.ComfyNode): out_latent["samples"] = latent return io.NodeOutput(positive, negative, out_latent) +class WanAnimateToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanAnimateToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("reference_image", optional=True), + io.Image.Input("face_video", optional=True), + io.Image.Input("pose_video", optional=True), + io.Int.Input("continue_motion_max_frames", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Image.Input("continue_motion", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + io.Int.Output(display_name="trim_latent"), + 
], + is_experimental=True, + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + latent_length = ((length - 1) // 4) + 1 + latent_width = width // 8 + latent_height = height // 8 + trim_latent = 0 + + if reference_image is None: + reference_image = torch.zeros((1, height, width, 3)) + + image = comfy.utils.common_upscale(reference_image[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + concat_latent_image = vae.encode(image[:, :, :, :3]) + mask = torch.zeros((1, 1, concat_latent_image.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) + trim_latent += concat_latent_image.shape[2] + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + if face_video is not None: + face_video = comfy.utils.common_upscale(face_video[:length].movedim(-1, 1), 512, 512, "area", "center") * 2.0 - 1.0 + face_video = face_video.movedim(0, 1).unsqueeze(0) + positive = node_helpers.conditioning_set_values(positive, {"face_video_pixels": face_video}) + negative = node_helpers.conditioning_set_values(negative, {"face_video_pixels": face_video * 0.0 - 1.0}) + + if pose_video is not None: + pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + pose_video_latent = vae.encode(pose_video[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"pose_video_latent": pose_video_latent}) + negative = node_helpers.conditioning_set_values(negative, {"pose_video_latent": pose_video_latent}) + + if continue_motion is None: + image = torch.ones((length, height, width, 3)) * 0.5 + else: + continue_motion = continue_motion[-continue_motion_max_frames:] + continue_motion = comfy.utils.common_upscale(continue_motion[-length:].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + image = torch.ones((length, height, width, continue_motion.shape[-1]), device=continue_motion.device, dtype=continue_motion.dtype) * 0.5 + image[:continue_motion.shape[0]] = continue_motion + + concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) + mask_refmotion = torch.ones((1, 1, latent_length, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) + if continue_motion is not None: + mask_refmotion[:, :, :((continue_motion.shape[0] - 1) // 4) + 1] = 0.0 + + mask = torch.cat((mask, mask_refmotion), dim=2) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + + latent = torch.zeros([batch_size, 16, latent_length + trim_latent, latent_height, latent_width], device=comfy.model_management.intermediate_device()) + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent, trim_latent) + class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod def define_schema(cls): @@ -1169,6 +1252,7 @@ class WanExtension(ComfyExtension): 
WanSoundImageToVideo, WanSoundImageToVideoExtend, WanHuMoImageToVideo, + WanAnimateToVideo, Wan22ImageToVideoLatent, ] From 9fdf8c25abb2133803063a9be395cac774fce611 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 19 Sep 2025 23:02:43 +0300 Subject: [PATCH 19/33] api_nodes: reduce default timeout from 7 days to 2 hours (#9918) --- comfy_api_nodes/apis/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 4ad0b783b..0aed906fb 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -683,7 +683,7 @@ class SynchronousOperation(Generic[T, R]): auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, auth_kwargs: Optional[Dict[str, str]] = None, - timeout: float = 604800.0, + timeout: float = 7200.0, verify_ssl: bool = True, content_type: str = "application/json", multipart_parser: Callable | None = None, From 852704c81a652cc53fbe53c5f47dea0e50d0534e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 19 Sep 2025 23:04:51 +0300 Subject: [PATCH 20/33] fix(seedream4): add flag to ignore error on partial success (#9952) --- comfy_api_nodes/nodes_bytedance.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 369a3a4fe..a7eeaf15a 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -567,6 +567,12 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): tooltip="Whether to add an \"AI generated\" watermark to the image.", optional=True, ), + comfy_io.Boolean.Input( + "fail_on_partial", + default=True, + tooltip="If enabled, abort execution if any requested images are missing or return an error.", + optional=True, + ), ], outputs=[ comfy_io.Image.Output(), @@ -592,6 +598,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): max_images: int = 1, seed: int = 0, watermark: bool = True, + fail_on_partial: bool = True, ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) w = h = None @@ -651,9 +658,10 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): if len(response.data) == 1: return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) - return comfy_io.NodeOutput( - torch.cat([await download_url_to_image_tensor(str(i["url"])) for i in response.data]) - ) + urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d] + if fail_on_partial and len(urls) < len(response.data): + raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.") + return comfy_io.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls])) class ByteDanceTextToVideoNode(comfy_io.ComfyNode): @@ -1171,7 +1179,7 @@ async def process_video_task( payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], auth_kwargs: dict, node_id: str, - estimated_duration: int | None, + estimated_duration: Optional[int], ) -> comfy_io.NodeOutput: initial_response = await SynchronousOperation( endpoint=ApiEndpoint( From e8df53b764c7dfce1a9235f6ee70a17cfdece3ff Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:48:56 -0700 Subject: [PATCH 21/33] Update WanAnimateToVideo to more easily extend videos. 
(#9959) --- comfy/ldm/wan/model_animate.py | 2 +- comfy_extras/nodes_wan.py | 63 +++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/comfy/ldm/wan/model_animate.py b/comfy/ldm/wan/model_animate.py index 542f54110..7c87835d4 100644 --- a/comfy/ldm/wan/model_animate.py +++ b/comfy/ldm/wan/model_animate.py @@ -451,7 +451,7 @@ class AnimateWanModel(WanModel): def after_patch_embedding(self, x, pose_latents, face_pixel_values): if pose_latents is not None: pose_latents = self.pose_patch_embedding(pose_latents) - x[:, :, 1:] += pose_latents + x[:, :, 1:pose_latents.shape[2] + 1] += pose_latents[:, :, :x.shape[2] - 1] if face_pixel_values is None: return x, None diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 4187a5619..3e5fef535 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1128,18 +1128,22 @@ class WanAnimateToVideo(io.ComfyNode): io.Image.Input("pose_video", optional=True), io.Int.Input("continue_motion_max_frames", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4), io.Image.Input("continue_motion", optional=True), + io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="The amount of frames to seek in all the input videos. Used for generating longer videos by chunk. Connect to the video_frame_offset output of the previous node for extending a video."), ], outputs=[ io.Conditioning.Output(display_name="positive"), io.Conditioning.Output(display_name="negative"), io.Latent.Output(display_name="latent"), io.Int.Output(display_name="trim_latent"), + io.Int.Output(display_name="trim_image"), + io.Int.Output(display_name="video_frame_offset"), ], is_experimental=True, ) @classmethod - def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, video_frame_offset, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + trim_to_pose_video = False latent_length = ((length - 1) // 4) + 1 latent_width = width // 8 latent_height = height // 8 @@ -1152,35 +1156,60 @@ class WanAnimateToVideo(io.ComfyNode): concat_latent_image = vae.encode(image[:, :, :, :3]) mask = torch.zeros((1, 1, concat_latent_image.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) trim_latent += concat_latent_image.shape[2] + ref_motion_latent_length = 0 + + if continue_motion is None: + image = torch.ones((length, height, width, 3)) * 0.5 + else: + continue_motion = continue_motion[-continue_motion_max_frames:] + video_frame_offset -= continue_motion.shape[0] + video_frame_offset = max(0, video_frame_offset) + continue_motion = comfy.utils.common_upscale(continue_motion[-length:].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + image = torch.ones((length, height, width, continue_motion.shape[-1]), device=continue_motion.device, dtype=continue_motion.dtype) * 0.5 + image[:continue_motion.shape[0]] = continue_motion + ref_motion_latent_length += ((continue_motion.shape[0] - 1) // 4) + 1 if clip_vision_output is not None: positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) negative = 
node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + if pose_video is not None: + if pose_video.shape[0] <= video_frame_offset: + pose_video = None + else: + pose_video = pose_video[video_frame_offset:] + + if pose_video is not None: + pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + if not trim_to_pose_video: + if pose_video.shape[0] < length: + pose_video = torch.cat((pose_video,) + (pose_video[-1:],) * (length - pose_video.shape[0]), dim=0) + + pose_video_latent = vae.encode(pose_video[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"pose_video_latent": pose_video_latent}) + negative = node_helpers.conditioning_set_values(negative, {"pose_video_latent": pose_video_latent}) + + if trim_to_pose_video: + latent_length = pose_video_latent.shape[2] + length = latent_length * 4 - 3 + image = image[:length] + + if face_video is not None: + if face_video.shape[0] <= video_frame_offset: + face_video = None + else: + face_video = face_video[video_frame_offset:] + if face_video is not None: face_video = comfy.utils.common_upscale(face_video[:length].movedim(-1, 1), 512, 512, "area", "center") * 2.0 - 1.0 face_video = face_video.movedim(0, 1).unsqueeze(0) positive = node_helpers.conditioning_set_values(positive, {"face_video_pixels": face_video}) negative = node_helpers.conditioning_set_values(negative, {"face_video_pixels": face_video * 0.0 - 1.0}) - if pose_video is not None: - pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) - pose_video_latent = vae.encode(pose_video[:, :, :, :3]) - positive = node_helpers.conditioning_set_values(positive, {"pose_video_latent": pose_video_latent}) - negative = node_helpers.conditioning_set_values(negative, {"pose_video_latent": pose_video_latent}) - - if continue_motion is None: - image = torch.ones((length, height, width, 3)) * 0.5 - else: - continue_motion = continue_motion[-continue_motion_max_frames:] - continue_motion = comfy.utils.common_upscale(continue_motion[-length:].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) - image = torch.ones((length, height, width, continue_motion.shape[-1]), device=continue_motion.device, dtype=continue_motion.dtype) * 0.5 - image[:continue_motion.shape[0]] = continue_motion - concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) mask_refmotion = torch.ones((1, 1, latent_length, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) if continue_motion is not None: - mask_refmotion[:, :, :((continue_motion.shape[0] - 1) // 4) + 1] = 0.0 + mask_refmotion[:, :, :ref_motion_latent_length] = 0.0 mask = torch.cat((mask, mask_refmotion), dim=2) positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) @@ -1189,7 +1218,7 @@ class WanAnimateToVideo(io.ComfyNode): latent = torch.zeros([batch_size, 16, latent_length + trim_latent, latent_height, latent_width], device=comfy.model_management.intermediate_device()) out_latent = {} out_latent["samples"] = latent - return io.NodeOutput(positive, negative, out_latent, trim_latent) + return io.NodeOutput(positive, negative, out_latent, trim_latent, max(0, ref_motion_latent_length * 4 - 3), video_frame_offset + length) class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod From 
66241cef31f21247ec8b450d699250fd83b3ff7c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 19 Sep 2025 23:24:10 -0700 Subject: [PATCH 22/33] Add inputs for character replacement to the WanAnimateToVideo node. (#9960) --- comfy_extras/nodes_wan.py | 40 +++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 3e5fef535..9cca6fb2e 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1127,6 +1127,8 @@ class WanAnimateToVideo(io.ComfyNode): io.Image.Input("face_video", optional=True), io.Image.Input("pose_video", optional=True), io.Int.Input("continue_motion_max_frames", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Image.Input("background_video", optional=True), + io.Mask.Input("character_mask", optional=True), io.Image.Input("continue_motion", optional=True), io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="The amount of frames to seek in all the input videos. Used for generating longer videos by chunk. Connect to the video_frame_offset output of the previous node for extending a video."), ], @@ -1142,7 +1144,7 @@ class WanAnimateToVideo(io.ComfyNode): ) @classmethod - def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, video_frame_offset, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, video_frame_offset, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None, background_video=None, character_mask=None) -> io.NodeOutput: trim_to_pose_video = False latent_length = ((length - 1) // 4) + 1 latent_width = width // 8 @@ -1154,7 +1156,7 @@ class WanAnimateToVideo(io.ComfyNode): image = comfy.utils.common_upscale(reference_image[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) concat_latent_image = vae.encode(image[:, :, :, :3]) - mask = torch.zeros((1, 1, concat_latent_image.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) + mask = torch.zeros((1, 4, concat_latent_image.shape[-3], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) trim_latent += concat_latent_image.shape[2] ref_motion_latent_length = 0 @@ -1206,11 +1208,37 @@ class WanAnimateToVideo(io.ComfyNode): positive = node_helpers.conditioning_set_values(positive, {"face_video_pixels": face_video}) negative = node_helpers.conditioning_set_values(negative, {"face_video_pixels": face_video * 0.0 - 1.0}) - concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) - mask_refmotion = torch.ones((1, 1, latent_length, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) - if continue_motion is not None: - mask_refmotion[:, :, :ref_motion_latent_length] = 0.0 + ref_images_num = max(0, ref_motion_latent_length * 4 - 3) + if background_video is not None: + if background_video.shape[0] > video_frame_offset: + background_video = background_video[video_frame_offset:] + background_video = comfy.utils.common_upscale(background_video[:length].movedim(-1, 1), width, 
height, "area", "center").movedim(1, -1) + if background_video.shape[0] > ref_images_num: + image[ref_images_num:background_video.shape[0] - ref_images_num] = background_video[ref_images_num:] + mask_refmotion = torch.ones((1, 1, latent_length * 4, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) + if continue_motion is not None: + mask_refmotion[:, :, :ref_motion_latent_length * 4] = 0.0 + + if character_mask is not None: + if character_mask.shape[0] > video_frame_offset or character_mask.shape[0] == 1: + if character_mask.shape[0] == 1: + character_mask = character_mask.repeat((length,) + (1,) * (character_mask.ndim - 1)) + else: + character_mask = character_mask[video_frame_offset:] + if character_mask.ndim == 3: + character_mask = character_mask.unsqueeze(1) + character_mask = character_mask.movedim(0, 1) + if character_mask.ndim == 4: + character_mask = character_mask.unsqueeze(1) + character_mask = comfy.utils.common_upscale(character_mask[:, :, :length], concat_latent_image.shape[-1], concat_latent_image.shape[-2], "nearest-exact", "center") + if character_mask.shape[2] > ref_images_num: + mask_refmotion[:, :, ref_images_num:character_mask.shape[2] + ref_images_num] = character_mask[:, :, ref_images_num:] + + concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) + + + mask_refmotion = mask_refmotion.view(1, mask_refmotion.shape[2] // 4, 4, mask_refmotion.shape[3], mask_refmotion.shape[4]).transpose(1, 2) mask = torch.cat((mask, mask_refmotion), dim=2) positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) From 9ed3c5cc09c55d2fffa67b59d9d21e3b44d7653e Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sat, 20 Sep 2025 18:10:39 -0700 Subject: [PATCH 23/33] [Reviving #5709] Add strength input to Differential Diffusion (#9957) * Update nodes_differential_diffusion.py * Update nodes_differential_diffusion.py * Make strength optional to avoid validation errors when loading old workflows, adjust step --------- Co-authored-by: ThereforeGames --- comfy_extras/nodes_differential_diffusion.py | 33 +++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py index 98dbbf102..255ac420d 100644 --- a/comfy_extras/nodes_differential_diffusion.py +++ b/comfy_extras/nodes_differential_diffusion.py @@ -5,19 +5,30 @@ import torch class DifferentialDiffusion(): @classmethod def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - }} + return { + "required": { + "model": ("MODEL", ), + }, + "optional": { + "strength": ("FLOAT", { + "default": 1.0, + "min": 0.0, + "max": 1.0, + "step": 0.01, + }), + } + } RETURN_TYPES = ("MODEL",) FUNCTION = "apply" CATEGORY = "_for_testing" INIT = False - def apply(self, model): + def apply(self, model, strength=1.0): model = model.clone() - model.set_model_denoise_mask_function(self.forward) - return (model,) + model.set_model_denoise_mask_function(lambda *args, **kwargs: self.forward(*args, **kwargs, strength=strength)) + return (model, ) - def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict): + def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float): model = 
extra_options["model"] step_sigmas = extra_options["sigmas"] sigma_to = model.inner_model.model_sampling.sigma_min @@ -31,7 +42,15 @@ class DifferentialDiffusion(): threshold = (current_ts - ts_to) / (ts_from - ts_to) - return (denoise_mask >= threshold).to(denoise_mask.dtype) + # Generate the binary mask based on the threshold + binary_mask = (denoise_mask >= threshold).to(denoise_mask.dtype) + + # Blend binary mask with the original denoise_mask using strength + if strength and strength < 1: + blended_mask = strength * binary_mask + (1 - strength) * denoise_mask + return blended_mask + else: + return binary_mask NODE_CLASS_MAPPINGS = { From 7be2b49b6b3430783555bc6bc8fcb3f46d5392e7 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Sun, 21 Sep 2025 09:24:48 +0800 Subject: [PATCH 24/33] Fix LoRA Trainer bugs with FP8 models. (#9854) * Fix adapter weight init * Fix fp8 model training * Avoid inference tensor --- comfy/ops.py | 13 +++++++------ comfy/weight_adapter/loha.py | 8 ++++---- comfy/weight_adapter/lokr.py | 4 ++-- comfy/weight_adapter/lora.py | 4 ++-- comfy/weight_adapter/oft.py | 2 +- comfy_extras/nodes_train.py | 18 ++++++++++++++++++ 6 files changed, 34 insertions(+), 15 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 55e958adb..9d7dedd37 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -365,12 +365,13 @@ class fp8_ops(manual_cast): return None def forward_comfy_cast_weights(self, input): - try: - out = fp8_linear(self, input) - if out is not None: - return out - except Exception as e: - logging.info("Exception during fp8 op: {}".format(e)) + if not self.training: + try: + out = fp8_linear(self, input) + if out is not None: + return out + except Exception as e: + logging.info("Exception during fp8 op: {}".format(e)) weight, bias = cast_bias_weight(self, input) return torch.nn.functional.linear(input, weight, bias) diff --git a/comfy/weight_adapter/loha.py b/comfy/weight_adapter/loha.py index 55c97a3af..0abb2d403 100644 --- a/comfy/weight_adapter/loha.py +++ b/comfy/weight_adapter/loha.py @@ -130,12 +130,12 @@ class LoHaAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] in_dim = weight.shape[1:].numel() - mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) - mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=torch.float32) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.normal_(mat1, 0.1) torch.nn.init.constant_(mat2, 0.0) - mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) - mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=torch.float32) + mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.normal_(mat3, 0.1) torch.nn.init.normal_(mat4, 0.01) return LohaDiff( diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 563c835f5..9b2aff2d7 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -89,8 +89,8 @@ class LoKrAdapter(WeightAdapterBase): in_dim = weight.shape[1:].numel() out1, out2 = factorization(out_dim, rank) in1, in2 = factorization(in_dim, rank) - mat1 = torch.empty(out1, in1, device=weight.device, dtype=weight.dtype) - mat2 = torch.empty(out2, in2, device=weight.device, dtype=weight.dtype) + mat1 = 
torch.empty(out1, in1, device=weight.device, dtype=torch.float32) + mat2 = torch.empty(out2, in2, device=weight.device, dtype=torch.float32) torch.nn.init.kaiming_uniform_(mat2, a=5**0.5) torch.nn.init.constant_(mat1, 0.0) return LokrDiff( diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py index 47aa17d13..4db004e50 100644 --- a/comfy/weight_adapter/lora.py +++ b/comfy/weight_adapter/lora.py @@ -66,8 +66,8 @@ class LoRAAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] in_dim = weight.shape[1:].numel() - mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) - mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=torch.float32) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.kaiming_uniform_(mat1, a=5**0.5) torch.nn.init.constant_(mat2, 0.0) return LoraDiff( diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index 9d4982083..c0aab9635 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -68,7 +68,7 @@ class OFTAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] block_size, block_num = factorization(out_dim, rank) - block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=weight.dtype) + block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=torch.float32) return OFTDiff( (block, None, alpha, None) ) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index c3aaaee9b..9e6ec6780 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -38,6 +38,23 @@ def make_batch_extra_option_dict(d, indicies, full_size=None): return new_dict +def process_cond_list(d, prefix=""): + if hasattr(d, "__iter__") and not hasattr(d, "items"): + for index, item in enumerate(d): + process_cond_list(item, f"{prefix}.{index}") + return d + elif hasattr(d, "items"): + for k, v in list(d.items()): + if isinstance(v, dict): + process_cond_list(v, f"{prefix}.{k}") + elif isinstance(v, torch.Tensor): + d[k] = v.clone() + elif isinstance(v, (list, tuple)): + for index, item in enumerate(v): + process_cond_list(item, f"{prefix}.{k}.{index}") + return d + + class TrainSampler(comfy.samplers.Sampler): def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn @@ -50,6 +67,7 @@ class TrainSampler(comfy.samplers.Sampler): self.training_dtype = training_dtype def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): + model_wrap.conds = process_cond_list(model_wrap.conds) cond = model_wrap.conds["positive"] dataset_size = sigmas.size(0) torch.cuda.empty_cache() From d1d9eb94b1096c9b3f963bf152bd6b9cd330c3a4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 20 Sep 2025 19:09:35 -0700 Subject: [PATCH 25/33] Lower wan memory estimation value a bit. (#9964) Previous pr reduced the peak memory requirement. 
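A rough illustration of the revised estimate, assuming hypothetical model widths; only the dim / 2222 scaling itself comes from the change below:

    # Sketch: the memory usage factor scales linearly with the transformer width (dim).
    # Moving the denominator from 2000 to 2222 lowers the estimate by roughly 10%.
    for dim in (1536, 3072, 5120):  # hypothetical example widths, not values from this patch
        print(dim, round(dim / 2222, 3))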
--- comfy/supported_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 1fbb6aef4..4064bdae1 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -995,7 +995,7 @@ class WAN21_T2V(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.Wan21 - memory_usage_factor = 1.0 + memory_usage_factor = 0.9 supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32] @@ -1004,7 +1004,7 @@ class WAN21_T2V(supported_models_base.BASE): def __init__(self, unet_config): super().__init__(unet_config) - self.memory_usage_factor = self.unet_config.get("dim", 2000) / 2000 + self.memory_usage_factor = self.unet_config.get("dim", 2000) / 2222 def get_model(self, state_dict, prefix="", device=None): out = model_base.WAN21(self, device=device) From 27bc181c49249f11da2d8a14f84f3bdb58a0615f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 21 Sep 2025 16:48:31 -0700 Subject: [PATCH 26/33] Set some wan nodes as no longer experimental. (#9976) --- comfy_extras/nodes_wan.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 9cca6fb2e..b1e9babb5 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -287,7 +287,6 @@ class WanVaceToVideo(io.ComfyNode): return io.Schema( node_id="WanVaceToVideo", category="conditioning/video_models", - is_experimental=True, inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -375,7 +374,6 @@ class TrimVideoLatent(io.ComfyNode): return io.Schema( node_id="TrimVideoLatent", category="latent/video", - is_experimental=True, inputs=[ io.Latent.Input("samples"), io.Int.Input("trim_amount", default=0, min=0, max=99999), @@ -969,7 +967,6 @@ class WanSoundImageToVideo(io.ComfyNode): io.Conditioning.Output(display_name="negative"), io.Latent.Output(display_name="latent"), ], - is_experimental=True, ) @classmethod @@ -1000,7 +997,6 @@ class WanSoundImageToVideoExtend(io.ComfyNode): io.Conditioning.Output(display_name="negative"), io.Latent.Output(display_name="latent"), ], - is_experimental=True, ) @classmethod From 1fee8827cb8160c85d96c375413ac590311525dc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:49:48 -0700 Subject: [PATCH 27/33] Support for qwen edit plus model. Use the new TextEncodeQwenImageEditPlus. 
(#9986) --- comfy/text_encoders/llama.py | 16 +++++++---- comfy_extras/nodes_qwen.py | 55 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 5e11956b5..c5a48ba9f 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -400,21 +400,25 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module): def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): grid = None + position_ids = None + offset = 0 for e in embeds_info: if e.get("type") == "image": grid = e.get("extra", None) - position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) start = e.get("index") - position_ids[:, :start] = torch.arange(0, start, device=embeds.device) + if position_ids is None: + position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) + position_ids[:, :start] = torch.arange(0, start, device=embeds.device) end = e.get("size") + start len_max = int(grid.max()) // 2 start_next = len_max + start - position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device) - position_ids[0, start:end] = start + position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device) + position_ids[0, start:end] = start + offset max_d = int(grid[0][1]) // 2 - position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] + position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] max_d = int(grid[0][2]) // 2 - position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + offset += len_max - (end - start) if grid is None: position_ids = None diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index fff89556f..49747dc7a 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -43,6 +43,61 @@ class TextEncodeQwenImageEdit: return (conditioning, ) +class TextEncodeQwenImageEditPlus: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "clip": ("CLIP", ), + "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), + }, + "optional": {"vae": ("VAE", ), + "image1": ("IMAGE", ), + "image2": ("IMAGE", ), + "image3": ("IMAGE", ), + }} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + + CATEGORY = "advanced/conditioning" + + def encode(self, clip, prompt, vae=None, image1=None, image2=None, image3=None): + ref_latents = [] + images = [image1, image2, image3] + images_vl = [] + llama_template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + image_prompt = "" + + for i, image in enumerate(images): + if image is not None: + samples = image.movedim(-1, 1) + total = int(384 * 384) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + images_vl.append(s.movedim(1, -1)) + if vae is not None: + total = int(1024 * 1024) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by / 8.0) * 8 + height = round(samples.shape[2] * scale_by / 8.0) * 8 + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3])) + + image_prompt += "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1) + + tokens = clip.tokenize(image_prompt + prompt, images=images_vl, llama_template=llama_template) + conditioning = clip.encode_from_tokens_scheduled(tokens) + if len(ref_latents) > 0: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True) + return (conditioning, ) + + NODE_CLASS_MAPPINGS = { "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, + "TextEncodeQwenImageEditPlus": TextEncodeQwenImageEditPlus, } From e3206351b07852f2127a56abd898ee77f7f4c25f Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 22 Sep 2025 14:12:32 -0700 Subject: [PATCH 28/33] add offset param (#9977) --- server.py | 9 ++- tests/execution/test_execution.py | 105 +++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/server.py b/server.py index 43816a8cd..603677397 100644 --- a/server.py +++ b/server.py @@ -645,7 +645,14 @@ class PromptServer(): max_items = request.rel_url.query.get("max_items", None) if max_items is not None: max_items = int(max_items) - return web.json_response(self.prompt_queue.get_history(max_items=max_items)) + + offset = request.rel_url.query.get("offset", None) + if offset is not None: + offset = int(offset) + else: + offset = -1 + + return web.json_response(self.prompt_queue.get_history(max_items=max_items, offset=offset)) @routes.get("/history/{prompt_id}") async def get_history_prompt_id(request): diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index 8ea05fdd8..ef73ad9fd 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -84,6 +84,21 @@ class ComfyClient: with urllib.request.urlopen("http://{}/history/{}".format(self.server_address, prompt_id)) as response: return json.loads(response.read()) + def get_all_history(self, max_items=None, offset=None): + url = "http://{}/history".format(self.server_address) + params = {} + if max_items is not None: + params["max_items"] = max_items + if offset is not None: + params["offset"] = offset + + if params: + url_values = urllib.parse.urlencode(params) + url = "{}?{}".format(url, url_values) + + with urllib.request.urlopen(url) as response: + return json.loads(response.read()) + def set_test_name(self, name): self.test_name = name @@ -498,7 +513,6 @@ class TestExecution: assert len(images1) == 1, "Should have 1 image" assert len(images2) == 1, "Should have 1 image" - # This tests that only constant outputs are 
used in the call to `IS_CHANGED` def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder): g = builder @@ -762,3 +776,92 @@ class TestExecution: except urllib.error.HTTPError: pass # Expected behavior + def _create_history_item(self, client, builder): + g = GraphBuilder(prefix="offset_test") + input_node = g.node( + "StubImage", content="BLACK", height=32, width=32, batch_size=1 + ) + g.node("SaveImage", images=input_node.out(0)) + return client.run(g) + + def test_offset_returns_different_items_than_beginning_of_history( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test that offset skips items at the beginning""" + for _ in range(5): + self._create_history_item(client, builder) + + first_two = client.get_all_history(max_items=2, offset=0) + next_two = client.get_all_history(max_items=2, offset=2) + + assert set(first_two.keys()).isdisjoint( + set(next_two.keys()) + ), "Offset should skip initial items" + + def test_offset_beyond_history_length_returns_empty( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset larger than total history returns empty result""" + self._create_history_item(client, builder) + + result = client.get_all_history(offset=100) + assert len(result) == 0, "Large offset should return no items" + + def test_offset_at_exact_history_length_returns_empty( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset equal to history length returns empty""" + for _ in range(3): + self._create_history_item(client, builder) + + all_history = client.get_all_history() + result = client.get_all_history(offset=len(all_history)) + assert len(result) == 0, "Offset at history length should return empty" + + def test_offset_zero_equals_no_offset_parameter( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset=0 behaves same as omitting offset""" + self._create_history_item(client, builder) + + with_zero = client.get_all_history(offset=0) + without_offset = client.get_all_history() + + assert with_zero == without_offset, "offset=0 should equal no offset" + + def test_offset_without_max_items_skips_from_beginning( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset alone (no max_items) returns remaining items""" + for _ in range(4): + self._create_history_item(client, builder) + + all_items = client.get_all_history() + offset_items = client.get_all_history(offset=2) + + assert ( + len(offset_items) == len(all_items) - 2 + ), "Offset should skip specified number of items" + + def test_offset_with_max_items_returns_correct_window( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset + max_items returns correct slice of history""" + for _ in range(6): + self._create_history_item(client, builder) + + window = client.get_all_history(max_items=2, offset=1) + assert len(window) <= 2, "Should respect max_items limit" + + def test_offset_near_end_returns_remaining_items_only( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset near end of history returns only remaining items""" + for _ in range(3): + self._create_history_item(client, builder) + + all_history = client.get_all_history() + # Offset to near the end + result = client.get_all_history(max_items=5, offset=len(all_history) - 1) + + assert len(result) <= 1, "Should return at most 1 item when offset is near end" From 8a5ac527e60fcd48ec228d309d49ab28ac79def8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:26:58 
-0700 Subject: [PATCH 29/33] Fix bug with WanAnimateToVideo node. (#9988) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index b1e9babb5..6c16a2673 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1210,7 +1210,7 @@ class WanAnimateToVideo(io.ComfyNode): background_video = background_video[video_frame_offset:] background_video = comfy.utils.common_upscale(background_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) if background_video.shape[0] > ref_images_num: - image[ref_images_num:background_video.shape[0] - ref_images_num] = background_video[ref_images_num:] + image[ref_images_num:background_video.shape[0]] = background_video[ref_images_num:] mask_refmotion = torch.ones((1, 1, latent_length * 4, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) if continue_motion is not None: From 707b2638ecd82360c0a67e1d86cc4fdeae218d03 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:34:33 -0700 Subject: [PATCH 30/33] Fix bug with WanAnimateToVideo. (#9990) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 6c16a2673..b0bd471bf 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1229,7 +1229,7 @@ class WanAnimateToVideo(io.ComfyNode): character_mask = character_mask.unsqueeze(1) character_mask = comfy.utils.common_upscale(character_mask[:, :, :length], concat_latent_image.shape[-1], concat_latent_image.shape[-2], "nearest-exact", "center") if character_mask.shape[2] > ref_images_num: - mask_refmotion[:, :, ref_images_num:character_mask.shape[2] + ref_images_num] = character_mask[:, :, ref_images_num:] + mask_refmotion[:, :, ref_images_num:character_mask.shape[2]] = character_mask[:, :, ref_images_num:] concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) From 145b0e4f79b5d9e815bb781ba29ccd057bb52dab Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 23 Sep 2025 23:22:35 +0800 Subject: [PATCH 31/33] update template to 0.1.86 (#9998) * update template to 0.1.84 * update template to 0.1.85 * Update template to 0.1.86 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 79187efaa..2980bebdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.26.13 -comfyui-workflow-templates==0.1.81 +comfyui-workflow-templates==0.1.86 comfyui-embedded-docs==0.2.6 torch torchsde From e8087907995497c6971ee64bd5fa02cb49c1eda6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:36:47 +0300 Subject: [PATCH 32/33] feat(api-nodes): add wan t2i, t2v, i2v nodes (#9996) --- comfy_api_nodes/nodes_wan.py | 602 +++++++++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 603 insertions(+) create mode 100644 comfy_api_nodes/nodes_wan.py diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py new file mode 100644 index 000000000..db5bd41c1 --- /dev/null +++ b/comfy_api_nodes/nodes_wan.py @@ -0,0 +1,602 @@ +import re +from typing import Optional, Type, Union +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field +from comfy_api.latest import 
ComfyExtension, Input, io as comfy_io +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, + R, + T, +) +from comfy_api_nodes.util.validation_utils import get_number_of_images, validate_audio_duration + +from comfy_api_nodes.apinode_utils import ( + download_url_to_image_tensor, + download_url_to_video_output, + tensor_to_base64_string, + audio_to_base64_string, +) + +class Text2ImageInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + + +class Text2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + audio_url: Optional[str] = Field(None) + + +class Image2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + img_url: str = Field(...) + audio_url: Optional[str] = Field(None) + + +class Txt2ImageParametersField(BaseModel): + size: str = Field(...) + n: int = Field(1, description="Number of images to generate.") # we support only value=1 + seed: int = Field(..., ge=0, le=2147483647) + prompt_extend: bool = Field(True) + watermark: bool = Field(True) + + +class Text2VideoParametersField(BaseModel): + size: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + duration: int = Field(5, ge=5, le=10) + prompt_extend: bool = Field(True) + watermark: bool = Field(True) + audio: bool = Field(False, description="Should be audio generated automatically") + + +class Image2VideoParametersField(BaseModel): + resolution: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + duration: int = Field(5, ge=5, le=10) + prompt_extend: bool = Field(True) + watermark: bool = Field(True) + audio: bool = Field(False, description="Should be audio generated automatically") + + +class Text2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2ImageInputField = Field(...) + parameters: Txt2ImageParametersField = Field(...) + + +class Text2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2VideoInputField = Field(...) + parameters: Text2VideoParametersField = Field(...) + + +class Image2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Image2VideoInputField = Field(...) + parameters: Image2VideoParametersField = Field(...) + + +class TaskCreationOutputField(BaseModel): + task_id: str = Field(...) + task_status: str = Field(...) + + +class TaskCreationResponse(BaseModel): + output: Optional[TaskCreationOutputField] = Field(None) + request_id: str = Field(...) + code: Optional[str] = Field(None, description="The error code of the failed request.") + message: Optional[str] = Field(None, description="Details of the failed request.") + + +class TaskResult(BaseModel): + url: Optional[str] = Field(None) + code: Optional[str] = Field(None) + message: Optional[str] = Field(None) + + +class ImageTaskStatusOutputField(TaskCreationOutputField): + task_id: str = Field(...) + task_status: str = Field(...) + results: Optional[list[TaskResult]] = Field(None) + + +class VideoTaskStatusOutputField(TaskCreationOutputField): + task_id: str = Field(...) + task_status: str = Field(...) + video_url: Optional[str] = Field(None) + code: Optional[str] = Field(None) + message: Optional[str] = Field(None) + + +class ImageTaskStatusResponse(BaseModel): + output: Optional[ImageTaskStatusOutputField] = Field(None) + request_id: str = Field(...) 
+ + +class VideoTaskStatusResponse(BaseModel): + output: Optional[VideoTaskStatusOutputField] = Field(None) + request_id: str = Field(...) + + +RES_IN_PARENS = re.compile(r'\((\d+)\s*[x×]\s*(\d+)\)') + + +async def process_task( + auth_kwargs: dict[str, str], + url: str, + request_model: Type[T], + response_model: Type[R], + payload: Union[Text2ImageTaskCreationRequest, Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], + node_id: str, + estimated_duration: int, + poll_interval: int, +) -> Type[R]: + initial_response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=url, + method=HttpMethod.POST, + request_model=request_model, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + + return await PollingOperation( + poll_endpoint=ApiEndpoint( + path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=response_model, + ), + completed_statuses=["SUCCEEDED"], + failed_statuses=["FAILED", "CANCELED", "UNKNOWN"], + status_extractor=lambda x: x.output.task_status, + estimated_duration=estimated_duration, + poll_interval=poll_interval, + node_id=node_id, + auth_kwargs=auth_kwargs, + ).execute() + + +class WanTextToImageApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanTextToImageApi", + display_name="Wan Text to Image", + category="api node/image/Wan", + description="Generates image based on text prompt.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-t2i-preview"], + default="wan2.5-t2i-preview", + tooltip="Model to use.", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + comfy_io.Int.Input( + "width", + default=1024, + min=768, + max=1440, + step=32, + optional=True, + ), + comfy_io.Int.Input( + "height", + default=1024, + min=768, + max=1440, + step=32, + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + negative_prompt: str = "", + width: int = 1024, + height: int = 1024, + seed: int = 0, + prompt_extend: bool = True, + watermark: bool = True, + ): + payload = Text2ImageTaskCreationRequest( + model=model, + input=Text2ImageInputField(prompt=prompt, negative_prompt=negative_prompt), + parameters=Txt2ImageParametersField( + size=f"{width}*{height}", + seed=seed, + prompt_extend=prompt_extend, + 
watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/text2image/image-synthesis", + request_model=Text2ImageTaskCreationRequest, + response_model=ImageTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=9, + poll_interval=3, + ) + return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + + +class WanTextToVideoApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanTextToVideoApi", + display_name="Wan Text to Video", + category="api node/video/Wan", + description="Generates video based on text prompt.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-t2v-preview"], + default="wan2.5-t2v-preview", + tooltip="Model to use.", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + comfy_io.Combo.Input( + "size", + options=[ + "480p: 1:1 (624x624)", + "480p: 16:9 (832x480)", + "480p: 9:16 (480x832)", + "720p: 1:1 (960x960)", + "720p: 16:9 (1280x720)", + "720p: 9:16 (720x1280)", + "720p: 4:3 (1088x832)", + "720p: 3:4 (832x1088)", + "1080p: 1:1 (1440x1440)", + "1080p: 16:9 (1920x1080)", + "1080p: 9:16 (1080x1920)", + "1080p: 4:3 (1632x1248)", + "1080p: 3:4 (1248x1632)", + ], + default="480p: 1:1 (624x624)", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=10, + step=5, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Available durations: 5 and 10 seconds", + optional=True, + ), + comfy_io.Audio.Input( + "audio", + optional=True, + tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="If there is no audio input, generate audio automatically.", + ), + comfy_io.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + negative_prompt: str = "", + size: str = "480p: 1:1 (624x624)", + duration: int = 5, + audio: Optional[Input.Audio] = None, + seed: int = 0, + generate_audio: bool = False, + prompt_extend: bool = True, + watermark: bool = True, + ): + width, height = RES_IN_PARENS.search(size).groups() + audio_url = None + if audio is not None: + validate_audio_duration(audio, 3.0, 29.0) + audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") + payload = Text2VideoTaskCreationRequest( + model=model, + 
input=Text2VideoInputField(prompt=prompt, negative_prompt=negative_prompt, audio_url=audio_url), + parameters=Text2VideoParametersField( + size=f"{width}*{height}", + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + request_model=Text2VideoTaskCreationRequest, + response_model=VideoTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=120 * int(duration / 5), + poll_interval=6, + ) + return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class WanImageToVideoApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanImageToVideoApi", + display_name="Wan Image to Video", + category="api node/video/Wan", + description="Generates video based on the first frame and text prompt.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-i2v-preview"], + default="wan2.5-i2v-preview", + tooltip="Model to use.", + ), + comfy_io.Image.Input( + "image", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[ + "480P", + "720P", + "1080P", + ], + default="480P", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=10, + step=5, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Available durations: 5 and 10 seconds", + optional=True, + ), + comfy_io.Audio.Input( + "audio", + optional=True, + tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="If there is no audio input, generate audio automatically.", + ), + comfy_io.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + negative_prompt: str = "", + resolution: str = "480P", + duration: int = 5, + audio: Optional[Input.Audio] = None, + seed: int = 0, + generate_audio: bool = False, + prompt_extend: bool = True, + watermark: bool = True, + ): + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000*2000) + audio_url = None + if audio is not None: + validate_audio_duration(audio, 3.0, 
29.0) + audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") + payload = Image2VideoTaskCreationRequest( + model=model, + input=Image2VideoInputField( + prompt=prompt, negative_prompt=negative_prompt, img_url=image_url, audio_url=audio_url + ), + parameters=Image2VideoParametersField( + resolution=resolution, + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + request_model=Image2VideoTaskCreationRequest, + response_model=VideoTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=120 * int(duration / 5), + poll_interval=6, + ) + return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class WanApiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + WanTextToImageApi, + WanTextToVideoApi, + WanImageToVideoApi, + ] + + +async def comfy_entrypoint() -> WanApiExtension: + return WanApiExtension() diff --git a/nodes.py b/nodes.py index 5a5fdcb8e..1a6784b68 100644 --- a/nodes.py +++ b/nodes.py @@ -2361,6 +2361,7 @@ async def init_builtin_api_nodes(): "nodes_rodin.py", "nodes_gemini.py", "nodes_vidu.py", + "nodes_wan.py", ] if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): From b8730510db30c8858e1e5d8e126ef19eac395560 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 23 Sep 2025 11:50:33 -0400 Subject: [PATCH 33/33] ComfyUI version 0.3.60 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index ee58205f5..d469a8194 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.59" +__version__ = "0.3.60" diff --git a/pyproject.toml b/pyproject.toml index a7fc1a5a6..7340c320b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.59" +version = "0.3.60" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9"
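Note (not part of the patches above): in patch 32, WanTextToVideoApi.execute derives the Wan API "size" parameter by regex-matching the pixel dimensions embedded in the combo label, so e.g. "480p: 1:1 (624x624)" becomes "624*624". Below is a minimal standalone sketch of that parsing step, assuming the same RES_IN_PARENS pattern from nodes_wan.py; the size_to_api_string helper is hypothetical and exists only for illustration.

import re

# Same pattern as RES_IN_PARENS in comfy_api_nodes/nodes_wan.py: it captures the
# pixel dimensions inside parentheses, accepting either "x" or "×" as the separator.
RES_IN_PARENS = re.compile(r'\((\d+)\s*[x×]\s*(\d+)\)')

def size_to_api_string(size_option: str) -> str:
    # Hypothetical helper: turn a combo label such as "720p: 16:9 (1280x720)"
    # into the "width*height" string that the node passes to the API payload.
    match = RES_IN_PARENS.search(size_option)
    if match is None:
        raise ValueError(f"unrecognized size option: {size_option!r}")
    width, height = match.groups()
    return f"{width}*{height}"

assert size_to_api_string("480p: 1:1 (624x624)") == "624*624"
assert size_to_api_string("1080p: 16:9 (1920x1080)") == "1920*1080"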