From 027c63f63a7f5f380a4df1057c548410b0a87606 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 15 Aug 2025 21:57:47 +0300 Subject: [PATCH 01/30] fix(OpenAIGPTImage1): set correct MIME type for multipart uploads to OpenAI edits (#9348) --- comfy_api_nodes/nodes_openai.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index ab3c5363b..cbff2b2d2 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -464,8 +464,6 @@ class OpenAIGPTImage1(ComfyNodeABC): path = "/proxy/openai/images/generations" content_type = "application/json" request_class = OpenAIImageGenerationRequest - img_binaries = [] - mask_binary = None files = [] if image is not None: @@ -484,14 +482,11 @@ class OpenAIGPTImage1(ComfyNodeABC): img_byte_arr = io.BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) - img_binary = img_byte_arr - img_binary.name = f"image_{i}.png" - img_binaries.append(img_binary) if batch_size == 1: - files.append(("image", img_binary)) + files.append(("image", (f"image_{i}.png", img_byte_arr, "image/png"))) else: - files.append(("image[]", img_binary)) + files.append(("image[]", (f"image_{i}.png", img_byte_arr, "image/png"))) if mask is not None: if image is None: @@ -511,9 +506,7 @@ class OpenAIGPTImage1(ComfyNodeABC): mask_img_byte_arr = io.BytesIO() mask_img.save(mask_img_byte_arr, format="PNG") mask_img_byte_arr.seek(0) - mask_binary = mask_img_byte_arr - mask_binary.name = "mask.png" - files.append(("mask", mask_binary)) + files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png"))) # Build the operation operation = SynchronousOperation( From c308a8840aebf06649364e8e175862250a2d8823 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 15 Aug 2025 12:50:39 -0700 Subject: [PATCH 02/30] Add FluxKontextMultiReferenceLatentMethod node. (#9356) This node is only useful if someone trains the kontext model to properly use multiple reference images via the index method. The default is the offset method, which feeds the multiple images as if they were stitched together as one. This method works with the current flux kontext model.
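For illustration, a minimal standalone sketch of how the two methods place each reference latent, mirroring the hunk in comfy/ldm/flux/model.py below. The helper name and ref_shapes, a list of (height, width) latent sizes standing in for the real ref_latents tensors, are hypothetical:

    def plan_reference_placement(ref_shapes, method="offset"):
        # Illustrative sketch, not part of the patch.
        h = w = 0
        index = 0
        plan = []
        for rh, rw in ref_shapes:
            if method == "index":
                index += 1               # each reference gets its own index
                h_offset = w_offset = 0  # and sits at the spatial origin
            else:
                index = 1                # offset: all references share index 1
                h_offset = w_offset = 0  # and are laid out spatially
                if rh + h > rw + w:
                    w_offset = w         # place this one to the right
                else:
                    h_offset = h         # place this one below
                h = max(h, rh + h_offset)
                w = max(w, rw + w_offset)
            plan.append((index, h_offset, w_offset))
        return plan

With two 64x64 references the offset method yields [(1, 0, 0), (1, 64, 0)], stacking the second below the first as if stitched into one canvas, while the index method yields [(1, 0, 0), (2, 0, 0)], keeping both at the origin under distinct indexes.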
--- comfy/ldm/flux/model.py | 24 ++++++++++++++++-------- comfy/model_base.py | 4 ++++ comfy_extras/nodes_flux.py | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 8f4d99f54..c4de82795 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -224,19 +224,27 @@ class Flux(nn.Module): if ref_latents is not None: h = 0 w = 0 + index = 0 + index_ref_method = kwargs.get("ref_latents_method", "offset") == "index" for ref in ref_latents: - h_offset = 0 - w_offset = 0 - if ref.shape[-2] + h > ref.shape[-1] + w: - w_offset = w + if index_ref_method: + index += 1 + h_offset = 0 + w_offset = 0 else: - h_offset = h + index = 1 + h_offset = 0 + w_offset = 0 + if ref.shape[-2] + h > ref.shape[-1] + w: + w_offset = w + else: + h_offset = h + h = max(h, ref.shape[-2] + h_offset) + w = max(w, ref.shape[-1] + w_offset) - kontext, kontext_ids = self.process_img(ref, index=1, h_offset=h_offset, w_offset=w_offset) + kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) img = torch.cat([img, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - h = max(h, ref.shape[-2] + h_offset) - w = max(w, ref.shape[-1] + w_offset) txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) diff --git a/comfy/model_base.py b/comfy/model_base.py index cde61df7c..bf874b875 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -890,6 +890,10 @@ class Flux(BaseModel): for lat in ref_latents: latents.append(self.process_latent_in(lat)) out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_latents_method = kwargs.get("reference_latents_method", None) + if ref_latents_method is not None: + out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out def extra_conds_shapes(self, **kwargs): diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 8a8a17698..c8db75bb3 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -100,9 +100,28 @@ class FluxKontextImageScale: return (image, ) +class FluxKontextMultiReferenceLatentMethod: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "conditioning": ("CONDITIONING", ), + "reference_latents_method": (("offset", "index"), ), + }} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "append" + EXPERIMENTAL = True + + CATEGORY = "advanced/conditioning/flux" + + def append(self, conditioning, reference_latents_method): + c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) + return (c, ) + NODE_CLASS_MAPPINGS = { "CLIPTextEncodeFlux": CLIPTextEncodeFlux, "FluxGuidance": FluxGuidance, "FluxDisableGuidance": FluxDisableGuidance, "FluxKontextImageScale": FluxKontextImageScale, + "FluxKontextMultiReferenceLatentMethod": FluxKontextMultiReferenceLatentMethod, } From 1702e6df16b0a52e147f19e3d5c5548c25a64339 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:29:58 -0700 Subject: [PATCH 03/30] Implement wan2.2 camera model. (#9357) Use the old WanCameraImageToVideo node. 
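A condensed sketch of the detection rule this adds, mirroring the branch in comfy/model_detection.py below: the control adapter weight marks a camera model, and the presence of the CLIP image-embedding projection separates the wan2.1 variant (which keeps the i2v backbone) from the wan2.2 one (which uses the t2v backbone). The helper name is hypothetical:

    def wan_camera_variant(state_dict_keys, key_prefix=""):
        # Without the control adapter this is not a camera model at all.
        if '{}control_adapter.conv.weight'.format(key_prefix) not in state_dict_keys:
            return None
        if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
            return "camera"      # wan2.1 camera (i2v backbone)
        return "camera_2.2"      # wan2.2 camera (t2v backbone)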
--- comfy/ldm/wan/model.py | 7 ++++++- comfy/model_detection.py | 5 ++++- comfy/supported_models.py | 14 +++++++++++++- comfy_extras/nodes_wan.py | 7 +++++-- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 4e2d99566..9d3741be3 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -768,7 +768,12 @@ class CameraWanModel(WanModel): operations=None, ): - super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) + if model_type == 'camera': + model_type = 'i2v' + else: + model_type = 't2v' + + super().__init__(model_type=model_type, patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) operation_settings = {"operations": operations, "device": device, "dtype": dtype} self.control_adapter = WanCamAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:], operation_settings=operation_settings) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 8acc51e20..2bec0541e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -364,7 +364,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1] dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.') elif '{}control_adapter.conv.weight'.format(key_prefix) in state_dict_keys: - dit_config["model_type"] = "camera" + if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "camera" + else: + dit_config["model_type"] = "camera_2.2" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 156ff9e26..7ed6dfd69 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1046,6 +1046,18 @@ class WAN21_Camera(WAN21_T2V): def get_model(self, state_dict, prefix="", device=None): out = model_base.WAN21_Camera(self, image_to_video=False, device=device) return out + +class WAN22_Camera(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "camera_2.2", + "in_dim": 36, + } + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN21_Camera(self, image_to_video=False, device=device) + return out + class WAN21_Vace(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1260,6 +1272,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, 
SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 694a183f6..83a990688 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -422,9 +422,12 @@ class WanCameraImageToVideo(io.ComfyNode): start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(start_image[:, :, :, :3]) concat_latent[:,:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) + mask[:, :, :start_image.shape[0] + 3] = 0.0 + mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) - positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent}) - negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent}) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask}) if camera_conditions is not None: positive = node_helpers.conditioning_set_values(positive, {'camera_conditions': camera_conditions}) From ed2e33c69a291094c4fcc13d8426c49844a6363c Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 15 Aug 2025 20:32:58 -0700 Subject: [PATCH 04/30] bump frontend version to 1.25.8 (#9361) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 551002b5b..2ae44ebe1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.24.4 +comfyui-frontend-package==1.25.8 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From 20a84166d0d37dd6833caa6cadf3bfac8c241b48 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Sat, 16 Aug 2025 02:07:12 -0400 Subject: [PATCH 05/30] record audio node (#8716) * record audio node * sf --- comfy_extras/nodes_audio.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index a90b31779..3b23f65d8 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -346,6 +346,24 @@ class LoadAudio: return "Invalid 
audio file: {}".format(audio) return True +class RecordAudio: + @classmethod + def INPUT_TYPES(s): + return {"required": {"audio": ("AUDIO_RECORD", {})}} + + CATEGORY = "audio" + + RETURN_TYPES = ("AUDIO", ) + FUNCTION = "load" + + def load(self, audio): + audio_path = folder_paths.get_annotated_filepath(audio) + + waveform, sample_rate = torchaudio.load(audio_path) + audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} + return (audio, ) + + NODE_CLASS_MAPPINGS = { "EmptyLatentAudio": EmptyLatentAudio, "VAEEncodeAudio": VAEEncodeAudio, @@ -356,6 +374,7 @@ NODE_CLASS_MAPPINGS = { "LoadAudio": LoadAudio, "PreviewAudio": PreviewAudio, "ConditioningStableAudio": ConditioningStableAudio, + "RecordAudio": RecordAudio, } NODE_DISPLAY_NAME_MAPPINGS = { @@ -367,4 +386,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "SaveAudio": "Save Audio (FLAC)", "SaveAudioMP3": "Save Audio (MP3)", "SaveAudioOpus": "Save Audio (Opus)", + "RecordAudio": "Record Audio", } From 0f2b8525bcafe213e8421a49564a90f926e81f2e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 16 Aug 2025 14:51:28 -0700 Subject: [PATCH 06/30] Qwen image model refactor. (#9375) --- comfy/ldm/qwen_image/model.py | 36 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 99843f88d..40d8fd979 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -333,21 +333,25 @@ class QwenImageTransformer2DModel(nn.Module): self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) self.gradient_checkpointing = False - def pos_embeds(self, x, context): + def process_img(self, x, index=0, h_offset=0, w_offset=0): bs, c, t, h, w = x.shape patch_size = self.patch_size + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) h_len = ((h + (patch_size // 2)) // patch_size) w_len = ((w + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + h_offset = ((h_offset + (patch_size // 2)) // patch_size) + w_offset = ((w_offset + (patch_size // 2)) // patch_size) - txt_start = round(max(h_len, w_len)) - txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(bs, 1, 3) - ids = torch.cat((txt_ids, img_ids), dim=1) - return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 0] = img_ids[:, :, 1] + index + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + 
torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape def forward( self, @@ -363,13 +367,13 @@ class QwenImageTransformer2DModel(nn.Module): encoder_hidden_states = context encoder_hidden_states_mask = attention_mask - image_rotary_emb = self.pos_embeds(x, context) + hidden_states, img_ids, orig_shape = self.process_img(x) + num_embeds = hidden_states.shape[1] - hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) - orig_shape = hidden_states.shape - hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) - hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) - hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) + txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) hidden_states = self.img_in(hidden_states) encoder_hidden_states = self.txt_norm(encoder_hidden_states) @@ -408,6 +412,6 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) - hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) + hidden_states = hidden_states[:, :num_embeds].view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]] From ed43784b0d04e5b8e8ff2c057fa84b9c5132aaf2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:45:39 -0700 Subject: [PATCH 07/30] WIP Qwen edit model: The diffusion model part. 
(#9383) --- comfy/ldm/qwen_image/model.py | 26 ++++++++++++++++++++++++++ comfy/model_base.py | 10 ++++++++++ 2 files changed, 36 insertions(+) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 40d8fd979..a3c726299 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -360,6 +360,7 @@ class QwenImageTransformer2DModel(nn.Module): context, attention_mask=None, guidance: torch.Tensor = None, + ref_latents=None, transformer_options={}, **kwargs ): @@ -370,6 +371,31 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states, img_ids, orig_shape = self.process_img(x) num_embeds = hidden_states.shape[1] + if ref_latents is not None: + h = 0 + w = 0 + index = 0 + index_ref_method = kwargs.get("ref_latents_method", "index") == "index" + for ref in ref_latents: + if index_ref_method: + index += 1 + h_offset = 0 + w_offset = 0 + else: + index = 1 + h_offset = 0 + w_offset = 0 + if ref.shape[-2] + h > ref.shape[-1] + w: + w_offset = w + else: + h_offset = h + h = max(h, ref.shape[-2] + h_offset) + w = max(w, ref.shape[-1] + w_offset) + + kontext, kontext_ids, _ = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) + hidden_states = torch.cat([hidden_states, kontext], dim=1) + img_ids = torch.cat([img_ids, kontext_ids], dim=1) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) diff --git a/comfy/model_base.py b/comfy/model_base.py index bf874b875..15bd7abef 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1331,4 +1331,14 @@ class QwenImage(BaseModel): cross_attn = kwargs.get("cross_attn", None) if cross_attn is not None: out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + latents = [] + for lat in ref_latents: + latents.append(self.process_latent_in(lat)) + out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_latents_method = kwargs.get("reference_latents_method", None) + if ref_latents_method is not None: + out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out From d4e353a94ec5a8cb15ed151990a9518b890e5d4f Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 18 Aug 2025 05:38:40 +0800 Subject: [PATCH 08/30] Update template to 0.1.60 (#9377) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2ae44ebe1..72a700028 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.8 -comfyui-workflow-templates==0.1.59 +comfyui-workflow-templates==0.1.60 comfyui-embedded-docs==0.2.6 torch torchsde From 7f3b9b16c6636cb1201213574892d33c2a35e4ba Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sun, 17 Aug 2025 15:54:07 -0700 Subject: [PATCH 09/30] Make step index detection much more robust (#9392) --- comfy/context_windows.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/context_windows.py b/comfy/context_windows.py index 928b111df..041f380f9 100644 --- a/comfy/context_windows.py +++ b/comfy/context_windows.py @@ -164,8 +164,11 @@ class IndexListContextHandler(ContextHandlerABC): return resized_cond def set_step(self, timestep: torch.Tensor, 
model_options: dict[str]): - indexes = torch.where(model_options["transformer_options"]["sample_sigmas"] == timestep[0]) - self._step = int(indexes[0]) + mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001) + matches = torch.nonzero(mask) + if torch.numel(matches) == 0: + raise Exception("No sample_sigmas matched current timestep; something went wrong.") + self._step = int(matches[0].item()) def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]: full_length = x_in.size(self.dim) # TODO: choose dim based on model From da2efeaec6609265051165bfb413a2a4c84cf4bb Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 17 Aug 2025 20:21:02 -0700 Subject: [PATCH 10/30] Bump frontend to 1.25.9 (#9394) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 72a700028..c7a5c47ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.8 +comfyui-frontend-package==1.25.9 comfyui-workflow-templates==0.1.60 comfyui-embedded-docs==0.2.6 torch From bd2ab73976a4e245db3e057795328c89bfd98a88 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:26:55 +0300 Subject: [PATCH 11/30] fix(WAN-nodes): invalid nodeid for WanTrackToVideo (#9396) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 83a990688..0fff02f76 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -699,7 +699,7 @@ class WanTrackToVideo(io.ComfyNode): @classmethod def define_schema(cls): return io.Schema( - node_id="WanPhantomSubjectToVideo", + node_id="WanTrackToVideo", category="conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), From 4977f203fa8e9e3ab22884c8ace8f9b540d48952 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 18 Aug 2025 19:38:34 -0700 Subject: [PATCH 12/30] P2 of qwen edit model. (#9412) * P2 of qwen edit model. * Typo. * Fix normal qwen. * Fix. * Make the TextEncodeQwenImageEdit also set the ref latent. If you don't want it to set the ref latent and want to use the ReferenceLatent node with your custom latent instead, just disconnect the VAE.
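Roughly, the new node behaves like this condensed sketch (the helper name is hypothetical, and the aspect-ratio snapping to PREFERRED_QWENIMAGE_RESOLUTIONS done by the full node in comfy_extras/nodes_qwen.py below is omitted):

    def encode_qwen_image_edit(clip, prompt, vae=None, image=None):
        images = [] if image is None else [image]
        # The image is routed through the new Qwen2.5-VL vision path,
        # filling the <|image_pad|> token in the edit template.
        tokens = clip.tokenize(prompt, images=images)
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        if vae is not None and image is not None:
            # A connected VAE opts you in: the encoded image is appended as
            # a reference latent. Disconnect the VAE to skip this and supply
            # your own latent via the ReferenceLatent node instead.
            ref_latent = vae.encode(image[:, :, :, :3])
            conditioning = node_helpers.conditioning_set_values(
                conditioning, {"reference_latents": [ref_latent]}, append=True)
        return conditioning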
--- comfy/clip_model.py | 2 +- comfy/model_base.py | 8 + comfy/sd1_clip.py | 11 +- comfy/text_encoders/bert.py | 2 +- comfy/text_encoders/llama.py | 43 ++- comfy/text_encoders/qwen_image.py | 20 +- comfy/text_encoders/qwen_vl.py | 428 ++++++++++++++++++++++++++++++ comfy/text_encoders/t5.py | 2 +- comfy_extras/nodes_qwen.py | 63 +++++ nodes.py | 1 + 10 files changed, 565 insertions(+), 15 deletions(-) create mode 100644 comfy/text_encoders/qwen_vl.py create mode 100644 comfy_extras/nodes_qwen.py diff --git a/comfy/clip_model.py b/comfy/clip_model.py index c8294d483..7e47d8a55 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -97,7 +97,7 @@ class CLIPTextModel_(torch.nn.Module): self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device) - def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32): + def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32, embeds_info=[]): if embeds is not None: x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device) else: diff --git a/comfy/model_base.py b/comfy/model_base.py index 15bd7abef..6c861b15e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1325,6 +1325,7 @@ class Omnigen2(BaseModel): class QwenImage(BaseModel): def __init__(self, model_config, model_type=ModelType.FLUX, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel) + self.memory_usage_factor_conds = ("ref_latents",) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1342,3 +1343,10 @@ class QwenImage(BaseModel): if ref_latents_method is not None: out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out + + def extra_conds_shapes(self, **kwargs): + out = {} + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) + return out diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index ade340fd1..1e8adbe69 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -204,17 +204,19 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32) index = 0 pad_extra = 0 + embeds_info = [] for o in other_embeds: emb = o[1] if torch.is_tensor(emb): emb = {"type": "embedding", "data": emb} + extra = None emb_type = emb.get("type", None) if emb_type == "embedding": emb = emb.get("data", None) else: if hasattr(self.transformer, "preprocess_embed"): - emb = self.transformer.preprocess_embed(emb, device=device) + emb, extra = self.transformer.preprocess_embed(emb, device=device) else: emb = None @@ -229,6 +231,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1) attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:] index += emb_shape - 1 + embeds_info.append({"type": emb_type, "index": ind, "size": emb_shape, "extra": extra}) else: index += -1 pad_extra += emb_shape @@ -243,11 +246,11 @@ 
class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): attention_masks.append(attention_mask) num_tokens.append(sum(attention_mask)) - return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens + return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info def forward(self, tokens): device = self.transformer.get_input_embeddings().weight.device - embeds, attention_mask, num_tokens = self.process_tokens(tokens, device) + embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device) attention_mask_model = None if self.enable_attention_masks: @@ -258,7 +261,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): else: intermediate_output = self.layer_idx - outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32) + outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32, embeds_info=embeds_info) if self.layer == "last": z = outputs[0].float() diff --git a/comfy/text_encoders/bert.py b/comfy/text_encoders/bert.py index 551b03162..ed4638a9a 100644 --- a/comfy/text_encoders/bert.py +++ b/comfy/text_encoders/bert.py @@ -116,7 +116,7 @@ class BertModel_(torch.nn.Module): self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations) self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations) - def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype) mask = None if attention_mask is not None: diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 1da6a0c94..9d90d5a61 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -2,12 +2,14 @@ import torch import torch.nn as nn from dataclasses import dataclass from typing import Optional, Any +import math from comfy.ldm.modules.attention import optimized_attention_for_device import comfy.model_management import comfy.ldm.common_dit import comfy.model_management +from . 
import qwen_vl @dataclass class Llama2Config: @@ -100,12 +102,10 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, seq_len, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) - position_ids = torch.arange(0, seq_len, device=device).unsqueeze(0) - inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) @@ -277,7 +277,7 @@ class Llama2_(nn.Module): self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) - def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]): if embeds is not None: x = embeds else: @@ -286,8 +286,11 @@ class Llama2_(nn.Module): if self.normalize_in: x *= self.config.hidden_size ** 0.5 + if position_ids is None: + position_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0) + freqs_cis = precompute_freqs_cis(self.config.head_dim, - x.shape[1], + position_ids, self.config.rope_theta, device=x.device) @@ -372,8 +375,38 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module): self.num_layers = config.num_hidden_layers self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.visual = qwen_vl.Qwen2VLVisionTransformer(hidden_size=1280, output_hidden_size=config.hidden_size, device=device, dtype=dtype, ops=operations) self.dtype = dtype + def preprocess_embed(self, embed, device): + if embed["type"] == "image": + image, grid = qwen_vl.process_qwen2vl_images(embed["data"]) + return self.visual(image.to(device, dtype=torch.float32), grid), grid + return None, None + + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): + grid = None + for e in embeds_info: + if e.get("type") == "image": + grid = e.get("extra", None) + position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) + start = e.get("index") + position_ids[:, :start] = torch.arange(0, start, device=embeds.device) + end = e.get("size") + start + len_max = int(grid.max()) // 2 + start_next = len_max + start + position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device) + position_ids[0, start:end] = start + max_d = int(grid[0][1]) // 2 + position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] + max_d = int(grid[0][2]) // 2 + position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + + if grid is None: + position_ids = None + + return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, 
final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids) + class Gemma2_2B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index ce5c98097..f07318d6c 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -15,13 +15,27 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image \\(color, shape, size, texture, objects, background\\), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" - def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs): + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): if llama_template is None: - llama_text = self.llama_template.format(text) + if len(images) > 0: + llama_text = self.llama_template_images.format(text) + else: + llama_text = self.llama_template.format(text) else: llama_text = llama_template.format(text) - return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + key_name = next(iter(tokens)) + embed_count = 0 + qwen_tokens = tokens[key_name] + for r in qwen_tokens: + for i in range(len(r)): + if r[i][0] == 151655: + if len(images) > embed_count: + r[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"},) + r[i][1:] + embed_count += 1 + return tokens class Qwen25_7BVLIModel(sd1_clip.SDClipModel): diff --git a/comfy/text_encoders/qwen_vl.py b/comfy/text_encoders/qwen_vl.py new file mode 100644 index 000000000..3b18ce730 --- /dev/null +++ b/comfy/text_encoders/qwen_vl.py @@ -0,0 +1,428 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional, Tuple +import math +from comfy.ldm.modules.attention import optimized_attention_for_device + + +def process_qwen2vl_images( + images: torch.Tensor, + min_pixels: int = 3136, + max_pixels: int = 12845056, + patch_size: int = 14, + temporal_patch_size: int = 2, + merge_size: int = 2, + image_mean: list = None, + image_std: list = None, +): + if image_mean is None: + image_mean = [0.48145466, 0.4578275, 0.40821073] + if image_std is None: + image_std = [0.26862954, 0.26130258, 0.27577711] + + batch_size, height, width, channels = images.shape + device = images.device + # dtype = images.dtype + + images = images.permute(0, 3, 1, 2) + + grid_thw_list = [] + img = images[0] + + factor = patch_size * merge_size + + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + + if h_bar * w_bar > max_pixels: 
+ beta = math.sqrt((height * width) / max_pixels) + h_bar = max(factor, math.floor(height / beta / factor) * factor) + w_bar = max(factor, math.floor(width / beta / factor) * factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + + img_resized = F.interpolate( + img.unsqueeze(0), + size=(h_bar, w_bar), + mode='bilinear', + align_corners=False + ).squeeze(0) + + normalized = img_resized.clone() + for c in range(3): + normalized[c] = (img_resized[c] - image_mean[c]) / image_std[c] + + grid_h = h_bar // patch_size + grid_w = w_bar // patch_size + grid_thw = torch.tensor([1, grid_h, grid_w], device=device, dtype=torch.long) + + pixel_values = normalized + grid_thw_list.append(grid_thw) + image_grid_thw = torch.stack(grid_thw_list) + + grid_t = 1 + channel = pixel_values.shape[0] + pixel_values = pixel_values.unsqueeze(0).repeat(2, 1, 1, 1) + + patches = pixel_values.reshape( + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + + patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8) + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, + channel * temporal_patch_size * patch_size * patch_size + ) + + return flatten_patches, image_grid_thw + + +class VisionPatchEmbed(nn.Module): + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + embed_dim: int = 3584, + device=None, + dtype=None, + ops=None, + ): + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = ops.Conv3d( + in_channels, + embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + device=device, + dtype=dtype + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = hidden_states.view( + -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size + ) + hidden_states = self.proj(hidden_states) + return hidden_states.view(-1, self.embed_dim) + + +def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision(q, k, cos, sin): + cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float() + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class VisionRotaryEmbedding(nn.Module): + def __init__(self, dim: int, theta: float = 10000.0): + super().__init__() + self.dim = dim + self.theta = theta + + def forward(self, seqlen: int, device) -> torch.Tensor: + inv_freq = 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float, device=device) / self.dim)) + seq = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype) + freqs = torch.outer(seq, inv_freq) + return freqs + + +class PatchMerger(nn.Module): + def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2, device=None, dtype=None, ops=None): + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size ** 2) + self.ln_q = ops.RMSNorm(context_dim, eps=1e-6, device=device, dtype=dtype) + self.mlp = nn.Sequential( + ops.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype), + nn.GELU(), + 
ops.Linear(self.hidden_size, dim, device=device, dtype=dtype), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x).reshape(-1, self.hidden_size) + x = self.mlp(x) + return x + + +class VisionAttention(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, device=None, dtype=None, ops=None): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.scaling = self.head_dim ** -0.5 + + self.qkv = ops.Linear(hidden_size, hidden_size * 3, bias=True, device=device, dtype=dtype) + self.proj = ops.Linear(hidden_size, hidden_size, bias=True, device=device, dtype=dtype) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + cu_seqlens=None, + optimized_attention=None, + ) -> torch.Tensor: + if hidden_states.dim() == 2: + seq_length, _ = hidden_states.shape + batch_size = 1 + hidden_states = hidden_states.unsqueeze(0) + else: + batch_size, seq_length, _ = hidden_states.shape + + qkv = self.qkv(hidden_states) + qkv = qkv.reshape(batch_size, seq_length, 3, self.num_heads, self.head_dim) + query_states, key_states, value_states = qkv.reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + + if position_embeddings is not None: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin) + + query_states = query_states.transpose(0, 1).unsqueeze(0) + key_states = key_states.transpose(0, 1).unsqueeze(0) + value_states = value_states.transpose(0, 1).unsqueeze(0) + + lengths = cu_seqlens[1:] - cu_seqlens[:-1] + splits = [ + torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states) + ] + + attn_outputs = [ + optimized_attention(q, k, v, self.num_heads, skip_reshape=True) + for q, k, v in zip(*splits) + ] + attn_output = torch.cat(attn_outputs, dim=1) + attn_output = attn_output.reshape(seq_length, -1) + attn_output = self.proj(attn_output) + + return attn_output + + +class VisionMLP(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, device=None, dtype=None, ops=None): + super().__init__() + self.gate_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype) + self.up_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype) + self.down_proj = ops.Linear(intermediate_size, hidden_size, bias=True, device=device, dtype=dtype) + self.act_fn = nn.SiLU() + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +class VisionBlock(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, num_heads: int, device=None, dtype=None, ops=None): + super().__init__() + self.norm1 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype) + self.norm2 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype) + self.attn = VisionAttention(hidden_size, num_heads, device=device, dtype=dtype, ops=ops) + self.mlp = VisionMLP(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + cu_seqlens=None, + optimized_attention=None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = self.attn(hidden_states, position_embeddings, 
cu_seqlens, optimized_attention) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Qwen2VLVisionTransformer(nn.Module): + def __init__( + self, + hidden_size: int = 3584, + output_hidden_size: int = 3584, + intermediate_size: int = 3420, + num_heads: int = 16, + num_layers: int = 32, + patch_size: int = 14, + temporal_patch_size: int = 2, + spatial_merge_size: int = 2, + window_size: int = 112, + device=None, + dtype=None, + ops=None + ): + super().__init__() + self.hidden_size = hidden_size + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.window_size = window_size + self.fullatt_block_indexes = [7, 15, 23, 31] + + self.patch_embed = VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=3, + embed_dim=hidden_size, + device=device, + dtype=dtype, + ops=ops, + ) + + head_dim = hidden_size // num_heads + self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + VisionBlock(hidden_size, intermediate_size, num_heads, device, dtype, ops) + for _ in range(num_layers) + ]) + + self.merger = PatchMerger( + dim=output_hidden_size, + context_dim=hidden_size, + spatial_merge_size=spatial_merge_size, + device=device, + dtype=dtype, + ops=ops, + ) + + def get_window_index(self, grid_thw): + window_index = [] + cu_window_seqlens = [0] + window_index_id = 0 + vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w) + + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + + index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100) + index_padded = index_padded.reshape( + grid_t, + num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size, + ) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, + num_windows_h * num_windows_w, + vit_merger_window_size, + vit_merger_window_size, + ) + + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + + cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_size * self.spatial_merge_size + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def get_position_embeddings(self, grid_thw, device): + pos_ids = [] + + for t, h, w in grid_thw: + hpos_ids = torch.arange(h, device=device).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten() + + wpos_ids = torch.arange(w, device=device).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h 
// self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten() + + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size, device) + return rotary_pos_emb_full[pos_ids].flatten(1) + + def forward( + self, + pixel_values: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + optimized_attention = optimized_attention_for_device(pixel_values.device, mask=False, small_input=True) + + hidden_states = self.patch_embed(pixel_values) + + window_index, cu_window_seqlens = self.get_window_index(image_grid_thw) + cu_window_seqlens = torch.tensor(cu_window_seqlens, device=hidden_states.device) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + + position_embeddings = self.get_position_embeddings(image_grid_thw, hidden_states.device) + + seq_len, _ = hidden_states.size() + spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size + + hidden_states = hidden_states.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + + position_embeddings = position_embeddings.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1) + position_embeddings = position_embeddings[window_index, :, :] + position_embeddings = position_embeddings.reshape(seq_len, -1) + position_embeddings = torch.cat((position_embeddings, position_embeddings), dim=-1) + position_embeddings = (position_embeddings.cos(), position_embeddings.sin()) + + cu_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]).cumsum( + dim=0, + dtype=torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + for i, block in enumerate(self.blocks): + if i in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention) + + hidden_states = self.merger(hidden_states) + return hidden_states diff --git a/comfy/text_encoders/t5.py b/comfy/text_encoders/t5.py index 36bf35309..e8588992a 100644 --- a/comfy/text_encoders/t5.py +++ b/comfy/text_encoders/t5.py @@ -199,7 +199,7 @@ class T5Stack(torch.nn.Module): self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations) # self.dropout = nn.Dropout(config.dropout_rate) - def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): mask = None if attention_mask is not None: mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py new file mode 100644 index 000000000..b5088fae2 --- /dev/null +++ b/comfy_extras/nodes_qwen.py @@ -0,0 +1,63 @@ +import node_helpers +import comfy.utils + +PREFERRED_QWENIMAGE_RESOLUTIONS = [ + (672, 1568), + (688, 1504), + (720, 1456), + (752, 1392), + (800, 1328), + (832, 1248), + (880, 1184), + (944, 1104), + (1024, 1024), 
+ (1104, 944), + (1184, 880), + (1248, 832), + (1328, 800), + (1392, 752), + (1456, 720), + (1504, 688), + (1568, 672), +] + + +class TextEncodeQwenImageEdit: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "clip": ("CLIP", ), + "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), + }, + "optional": {"vae": ("VAE", ), + "image": ("IMAGE", ),}} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + + CATEGORY = "advanced/conditioning" + + def encode(self, clip, prompt, vae=None, image=None): + ref_latent = None + if image is None: + images = [] + else: + images = [image] + if vae is not None: + width = image.shape[2] + height = image.shape[1] + aspect_ratio = width / height + _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) + image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) + ref_latent = vae.encode(image[:, :, :, :3]) + + tokens = clip.tokenize(prompt, images=images) + conditioning = clip.encode_from_tokens_scheduled(tokens) + if ref_latent is not None: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True) + return (conditioning, ) + + +NODE_CLASS_MAPPINGS = { + "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, +} diff --git a/nodes.py b/nodes.py index 860a236aa..b3fa9c51a 100644 --- a/nodes.py +++ b/nodes.py @@ -2321,6 +2321,7 @@ async def init_builtin_extra_nodes(): "nodes_edit_model.py", "nodes_tcfg.py", "nodes_context_windows.py", + "nodes_qwen.py", ] import_failed = [] From 36b5127fd3eee8eaf95ff7296a61269ed56d53c0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:28:07 +0300 Subject: [PATCH 13/30] api_nodes: add kling-v2-1 and v2-1-master (#9257) --- comfy_api_nodes/apis/__init__.py | 3 +++ comfy_api_nodes/nodes_kling.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 54298e8a9..c6f91e9d6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1315,6 +1315,7 @@ class KlingTaskStatus(str, Enum): class KlingTextToVideoModelName(str, Enum): kling_v1 = 'kling-v1' kling_v1_6 = 'kling-v1-6' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoGenAspectRatio(str, Enum): @@ -1347,6 +1348,8 @@ class KlingVideoGenModelName(str, Enum): kling_v1_5 = 'kling-v1-5' kling_v1_6 = 'kling-v1-6' kling_v2_master = 'kling-v2-master' + kling_v2_1 = 'kling-v2-1' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoResult(BaseModel): diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9d483bb0e..9fa390985 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -421,6 +421,8 @@ class KlingTextToVideoNode(KlingNodeBase): "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), + "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), + "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), } @classmethod From f16a70ba670e11de549af188663a87c77c5bc0c2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:28:27 +0300 Subject: [PATCH 14/30] api_nodes: add 
MinimaxHailuoVideoNode node (#9262) --- comfy_api_nodes/apis/__init__.py | 13 ++- comfy_api_nodes/nodes_minimax.py | 185 ++++++++++++++++++++++++++++++- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index c6f91e9d6..7a09df55b 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1623,13 +1623,14 @@ class MinimaxTaskResultResponse(BaseModel): task_id: str = Field(..., description='The task ID being queried.') -class Model(str, Enum): +class MiniMaxModel(str, Enum): T2V_01_Director = 'T2V-01-Director' I2V_01_Director = 'I2V-01-Director' S2V_01 = 'S2V-01' I2V_01 = 'I2V-01' I2V_01_live = 'I2V-01-live' T2V_01 = 'T2V-01' + Hailuo_02 = 'MiniMax-Hailuo-02' class SubjectReferenceItem(BaseModel): @@ -1651,7 +1652,7 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.', ) - model: Model = Field( + model: MiniMaxModel = Field( ..., description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01', ) @@ -1668,6 +1669,14 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.', ) + duration: Optional[int] = Field( + None, + description="The length of the output video in seconds." + ) + resolution: Optional[str] = Field( + None, + description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels." + ) class MinimaxVideoGenerationResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 58d2ed90c..bb3c9e710 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,3 +1,4 @@ +from inspect import cleandoc from typing import Union import logging import torch @@ -10,7 +11,7 @@ from comfy_api_nodes.apis import ( MinimaxFileRetrieveResponse, MinimaxTaskResultResponse, SubjectReferenceItem, - Model + MiniMaxModel ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -84,7 +85,6 @@ class MinimaxTextToVideoNode: FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True async def generate_video( self, @@ -121,7 +121,7 @@ class MinimaxTextToVideoNode: response_model=MinimaxVideoGenerationResponse, ), request=MinimaxVideoGenerationRequest( - model=Model(model), + model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, first_frame_image=image_url, @@ -251,7 +251,6 @@ class MinimaxImageToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): @@ -313,7 +312,181 @@ class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True + + +class MinimaxHailuoVideoNode: + """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "prompt_text": ( + "STRING", + { + "multiline": True, + "default": "", + "tooltip": "Text prompt to guide the video generation.", + }, + ), + }, + "optional": { + "seed": ( + IO.INT, + { + "default": 0, + "min": 0, + "max": 
0xFFFFFFFFFFFFFFFF, + "control_after_generate": True, + "tooltip": "The random seed used for creating the noise.", + }, + ), + "first_frame_image": ( + IO.IMAGE, + { + "tooltip": "Optional image to use as the first frame to generate a video." + }, + ), + "prompt_optimizer": ( + IO.BOOLEAN, + { + "tooltip": "Optimize prompt to improve generation quality when needed.", + "default": True, + }, + ), + "duration": ( + IO.COMBO, + { + "tooltip": "The length of the output video in seconds.", + "default": 6, + "options": [6, 10], + }, + ), + "resolution": ( + IO.COMBO, + { + "tooltip": "The dimensions of the video display. " + "1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels.", + "default": "768P", + "options": ["768P", "1080P"], + }, + ), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = ("VIDEO",) + DESCRIPTION = cleandoc(__doc__ or "") + FUNCTION = "generate_video" + CATEGORY = "api node/video/MiniMax" + API_NODE = True + + async def generate_video( + self, + prompt_text, + seed=0, + first_frame_image: torch.Tensor=None, # used for ImageToVideo + prompt_optimizer=True, + duration=6, + resolution="768P", + model="MiniMax-Hailuo-02", + unique_id: Union[str, None]=None, + **kwargs, + ): + if first_frame_image is None: + validate_string(prompt_text, field_name="prompt_text") + + if model == "MiniMax-Hailuo-02" and resolution.upper() == "1080P" and duration != 6: + raise Exception( + "When model is MiniMax-Hailuo-02 and resolution is 1080P, duration is limited to 6 seconds." + ) + + # upload image, if passed in + image_url = None + if first_frame_image is not None: + image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=kwargs))[0] + + video_generate_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/video_generation", + method=HttpMethod.POST, + request_model=MinimaxVideoGenerationRequest, + response_model=MinimaxVideoGenerationResponse, + ), + request=MinimaxVideoGenerationRequest( + model=MiniMaxModel(model), + prompt=prompt_text, + callback_url=None, + first_frame_image=image_url, + prompt_optimizer=prompt_optimizer, + duration=duration, + resolution=resolution, + ), + auth_kwargs=kwargs, + ) + response = await video_generate_operation.execute() + + task_id = response.task_id + if not task_id: + raise Exception(f"MiniMax generation failed: {response.base_resp}") + + average_duration = 120 if resolution == "768P" else 240 + video_generate_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path="/proxy/minimax/query/video_generation", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxTaskResultResponse, + query_params={"task_id": task_id}, + ), + completed_statuses=["Success"], + failed_statuses=["Fail"], + status_extractor=lambda x: x.status.value, + estimated_duration=average_duration, + node_id=unique_id, + auth_kwargs=kwargs, + ) + task_result = await video_generate_operation.execute() + + file_id = task_result.file_id + if file_id is None: + raise Exception("Request was not successful. 
Missing file ID.") + file_retrieve_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/files/retrieve", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxFileRetrieveResponse, + query_params={"file_id": int(file_id)}, + ), + request=EmptyRequest(), + auth_kwargs=kwargs, + ) + file_result = await file_retrieve_operation.execute() + + file_url = file_result.file.download_url + if file_url is None: + raise Exception( + f"No video was found in the response. Full response: {file_result.model_dump()}" + ) + logging.info(f"Generated video URL: {file_url}") + if unique_id: + if hasattr(file_result.file, "backup_download_url"): + message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" + else: + message = f"Result URL: {file_url}" + PromptServer.instance.send_progress_text(message, unique_id) + + video_io = await download_url_to_bytesio(file_url) + if video_io is None: + error_msg = f"Failed to download video from {file_url}" + logging.error(error_msg) + raise Exception(error_msg) + return (VideoFromFile(video_io),) # A dictionary that contains all nodes you want to export with their names @@ -322,6 +495,7 @@ NODE_CLASS_MAPPINGS = { "MinimaxTextToVideoNode": MinimaxTextToVideoNode, "MinimaxImageToVideoNode": MinimaxImageToVideoNode, # "MinimaxSubjectToVideoNode": MinimaxSubjectToVideoNode, + "MinimaxHailuoVideoNode": MinimaxHailuoVideoNode, } # A dictionary that contains the friendly/humanly readable titles for the nodes @@ -329,4 +503,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "MinimaxTextToVideoNode": "MiniMax Text to Video", "MinimaxImageToVideoNode": "MiniMax Image to Video", "MinimaxSubjectToVideoNode": "MiniMax Subject to Video", + "MinimaxHailuoVideoNode": "MiniMax Hailuo Video", } From 07a927517cfaf099fec3903e8973f758e62d65f9 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:29:01 +0300 Subject: [PATCH 15/30] api_nodes: add GPT-5 series models (#9325) --- comfy_api_nodes/nodes_openai.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index cbff2b2d2..674c9ede0 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -80,6 +80,9 @@ class SupportedOpenAIModel(str, Enum): gpt_4_1 = "gpt-4.1" gpt_4_1_mini = "gpt-4.1-mini" gpt_4_1_nano = "gpt-4.1-nano" + gpt_5 = "gpt-5" + gpt_5_mini = "gpt-5-mini" + gpt_5_nano = "gpt-5-nano" class OpenAIDalle2(ComfyNodeABC): From d844d8b13bfd6a83b0a7d0491aa2978ac44a6158 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:29:24 +0300 Subject: [PATCH 16/30] api_nodes: added release version of google's models (#9304) --- comfy_api_nodes/nodes_gemini.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 3751fb2a1..ba4167a50 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -46,6 +46,8 @@ class GeminiModel(str, Enum): gemini_2_5_pro_preview_05_06 = "gemini-2.5-pro-preview-05-06" gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17" + gemini_2_5_pro = "gemini-2.5-pro" + gemini_2_5_flash = "gemini-2.5-flash" def get_gemini_endpoint( @@ -97,7 +99,7 @@ class GeminiNode(ComfyNodeABC): { "tooltip": "The Gemini model to use for generating responses.", "options": [model.value for model in GeminiModel], - "default": 
GeminiModel.gemini_2_5_pro_preview_05_06.value, + "default": GeminiModel.gemini_2_5_pro.value, }, ), "seed": ( From 54d8fdbed0a7b171ab8cfb02e29a7e0dc5fe78fd Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:30:06 +0300 Subject: [PATCH 17/30] feat(api-nodes): add Vidu Video nodes (#9368) --- comfy_api_nodes/nodes_vidu.py | 622 +++++++++++++++++++++++ comfy_api_nodes/util/validation_utils.py | 53 ++ nodes.py | 1 + 3 files changed, 676 insertions(+) create mode 100644 comfy_api_nodes/nodes_vidu.py diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py new file mode 100644 index 000000000..2f441948c --- /dev/null +++ b/comfy_api_nodes/nodes_vidu.py @@ -0,0 +1,622 @@ +import logging +from enum import Enum +from typing import Any, Callable, Optional, Literal, TypeVar +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import ( + validate_aspect_ratio_closeness, + validate_image_dimensions, + validate_image_aspect_ratio_range, + get_number_of_images, +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi + + +VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" +VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video" +VIDU_REFERENCE_VIDEO = "/proxy/vidu/reference2video" +VIDU_START_END_VIDEO = "/proxy/vidu/start-end2video" +VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" + +R = TypeVar("R") + +class VideoModelName(str, Enum): + vidu_q1 = 'viduq1' + + +class AspectRatio(str, Enum): + r_16_9 = "16:9" + r_9_16 = "9:16" + r_1_1 = "1:1" + + +class Resolution(str, Enum): + r_1080p = "1080p" + + +class MovementAmplitude(str, Enum): + auto = "auto" + small = "small" + medium = "medium" + large = "large" + + +class TaskCreationRequest(BaseModel): + model: VideoModelName = VideoModelName.vidu_q1 + prompt: Optional[str] = Field(None, max_length=1500) + duration: Optional[Literal[5]] = 5 + seed: Optional[int] = Field(0, ge=0, le=2147483647) + aspect_ratio: Optional[AspectRatio] = AspectRatio.r_16_9 + resolution: Optional[Resolution] = Resolution.r_1080p + movement_amplitude: Optional[MovementAmplitude] = MovementAmplitude.auto + images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL") + + +class TaskStatus(str, Enum): + created = "created" + queueing = "queueing" + processing = "processing" + success = "success" + failed = "failed" + + +class TaskCreationResponse(BaseModel): + task_id: str = Field(...) + state: TaskStatus = Field(...) + created_at: str = Field(...) + code: Optional[int] = Field(None, description="Error code") + + +class TaskResult(BaseModel): + id: str = Field(..., description="Creation id") + url: str = Field(..., description="The URL of the generated results, valid for one hour") + cover_url: str = Field(..., description="The cover URL of the generated results, valid for one hour") + + +class TaskStatusResponse(BaseModel): + state: TaskStatus = Field(...) 
+ err_code: Optional[str] = Field(None) + creations: list[TaskResult] = Field(..., description="Generated results") + + +async def poll_until_finished( + auth_kwargs: dict[str, str], + api_endpoint: ApiEndpoint[Any, R], + result_url_extractor: Optional[Callable[[R], str]] = None, + estimated_duration: Optional[int] = None, + node_id: Optional[str] = None, +) -> R: + return await PollingOperation( + poll_endpoint=api_endpoint, + completed_statuses=[TaskStatus.success.value], + failed_statuses=[TaskStatus.failed.value], + status_extractor=lambda response: response.state.value, + auth_kwargs=auth_kwargs, + result_url_extractor=result_url_extractor, + estimated_duration=estimated_duration, + node_id=node_id, + poll_interval=16.0, + max_poll_attempts=256, + ).execute() + + +def get_video_url_from_response(response) -> Optional[str]: + if response.creations: + return response.creations[0].url + return None + + +def get_video_from_response(response) -> TaskResult: + if not response.creations: + error_msg = f"Vidu request does not contain results. State: {response.state}, Error Code: {response.err_code}" + logging.info(error_msg) + raise RuntimeError(error_msg) + logging.info("Vidu task %s succeeded. Video URL: %s", response.creations[0].id, response.creations[0].url) + return response.creations[0] + + +async def execute_task( + vidu_endpoint: str, + auth_kwargs: Optional[dict[str, str]], + payload: TaskCreationRequest, + estimated_duration: int, + node_id: str, +) -> R: + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=vidu_endpoint, + method=HttpMethod.POST, + request_model=TaskCreationRequest, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + if response.state == TaskStatus.failed: + error_msg = f"Vidu request failed. 
Code: {response.code}" + logging.error(error_msg) + raise RuntimeError(error_msg) + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=VIDU_GET_GENERATION_STATUS % response.task_id, + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=estimated_duration, + node_id=node_id, + ) + + +class ViduTextToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduTextToVideoNode", + display_name="Vidu Text To Video Generation", + category="api node/video/Vidu", + description="Generate video from text prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduImageToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduImageToVideoNode", + display_name="Vidu Image To Video Generation", + category="api node/video/Vidu", + description="Generate video from image and optional prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "image", + 
tooltip="An image to be used as the start frame of the generated video", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if get_number_of_images(image) > 1: + raise ValueError("Only one input image is allowed.") + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + image, + max_images=1, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduReferenceVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduReferenceVideoNode", + display_name="Vidu Reference To Video Generation", + category="api node/video/Vidu", + description="Generate video from multiple images and prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "images", + tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + 
tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + images: torch.Tensor, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + a = get_number_of_images(images) + if a > 7: + raise ValueError("Too many images, maximum allowed is 7.") + for image in images: + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_dimensions(image, min_width=128, min_height=128) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + images, + max_images=7, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduStartEndToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduStartEndToVideoNode", + display_name="Vidu Start End To Video Generation", + category="api node/video/Vidu", + description="Generate a video from start and end frames and a prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "first_frame", + tooltip="Start frame", + ), + comfy_io.Image.Input( + "end_frame", + tooltip="End frame", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + 
comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + first_frame: torch.Tensor, + end_frame: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = [ + (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0] + for frame in (first_frame, end_frame) + ] + results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + ViduTextToVideoNode, + ViduImageToVideoNode, + ViduReferenceVideoNode, + ViduStartEndToVideoNode, + ] + +async def comfy_entrypoint() -> ViduExtension: + return ViduExtension() diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index 031b9fbd3..606b794bf 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -53,6 +53,53 @@ def validate_image_aspect_ratio( ) +def validate_image_aspect_ratio_range( + image: torch.Tensor, + min_ratio: tuple[float, float], # e.g. (1, 4) + max_ratio: tuple[float, float], # e.g. 
(4, 1) + *, + strict: bool = True, # True -> (min, max); False -> [min, max] +) -> float: + a1, b1 = min_ratio + a2, b2 = max_ratio + if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0: + raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).") + lo, hi = (a1 / b1), (a2 / b2) + if lo > hi: + lo, hi = hi, lo + a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text + w, h = get_image_dimensions(image) + if w <= 0 or h <= 0: + raise ValueError(f"Invalid image dimensions: {w}x{h}") + ar = w / h + ok = (lo < ar < hi) if strict else (lo <= ar <= hi) + if not ok: + op = "<" if strict else "≤" + raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}") + return ar + + +def validate_aspect_ratio_closeness( + start_img, + end_img, + min_rel: float, + max_rel: float, + *, + strict: bool = False, # True => exclusive, False => inclusive +) -> None: + w1, h1 = get_image_dimensions(start_img) + w2, h2 = get_image_dimensions(end_img) + if min(w1, h1, w2, h2) <= 0: + raise ValueError("Invalid image dimensions") + ar1 = w1 / h1 + ar2 = w2 / h2 + # Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1) + closeness = max(ar1, ar2) / min(ar1, ar2) + limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25 + if (closeness >= limit) if strict else (closeness > limit): + raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}–{max_rel}.") + + def validate_video_dimensions( video: VideoInput, min_width: Optional[int] = None, @@ -98,3 +145,9 @@ def validate_video_duration( raise ValueError( f"Video duration must be at most {max_duration}s, got {duration}s" ) + + +def get_number_of_images(images): + if isinstance(images, torch.Tensor): + return images.shape[0] if images.ndim >= 4 else 1 + return len(images) diff --git a/nodes.py b/nodes.py index b3fa9c51a..35dda1b19 100644 --- a/nodes.py +++ b/nodes.py @@ -2351,6 +2351,7 @@ async def init_builtin_api_nodes(): "nodes_moonvalley.py", "nodes_rodin.py", "nodes_gemini.py", + "nodes_vidu.py", ] if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): From bddd69618bf4463209c3681babfcbebd9b9aed85 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:49:01 -0700 Subject: [PATCH 18/30] Change the TextEncodeQwenImageEdit node to use logic closer to reference. 
(#9432) --- comfy_extras/nodes_qwen.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index b5088fae2..fff89556f 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -1,25 +1,6 @@ import node_helpers import comfy.utils - -PREFERRED_QWENIMAGE_RESOLUTIONS = [ - (672, 1568), - (688, 1504), - (720, 1456), - (752, 1392), - (800, 1328), - (832, 1248), - (880, 1184), - (944, 1104), - (1024, 1024), - (1104, 944), - (1184, 880), - (1248, 832), - (1328, 800), - (1392, 752), - (1456, 720), - (1504, 688), - (1568, 672), -] +import math class TextEncodeQwenImageEdit: @@ -42,13 +23,17 @@ class TextEncodeQwenImageEdit: if image is None: images = [] else: - images = [image] + samples = image.movedim(-1, 1) + total = int(1024 * 1024) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + image = s.movedim(1, -1) + images = [image[:, :, :, :3]] if vae is not None: - width = image.shape[2] - height = image.shape[1] - aspect_ratio = width / height - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) - image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) ref_latent = vae.encode(image[:, :, :, :3]) tokens = clip.tokenize(prompt, images=images) From dfa791eb4bfcaac3eb9b2b33fa15ae5a25589bb8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 17:47:42 -0700 Subject: [PATCH 19/30] Rope fix for qwen vl. 
(#9435) --- comfy/text_encoders/llama.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 9d90d5a61..4c976058f 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -27,6 +27,7 @@ class Llama2Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = False + rope_dims = None @dataclass class Qwen25_3BConfig: @@ -44,6 +45,7 @@ class Qwen25_3BConfig: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = None @dataclass class Qwen25_7BVLI_Config: @@ -61,6 +63,7 @@ class Qwen25_7BVLI_Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = [16, 24, 24] @dataclass class Gemma2_2B_Config: @@ -78,6 +81,7 @@ class Gemma2_2B_Config: rms_norm_add = True mlp_activation = "gelu_pytorch_tanh" qkv_bias = False + rope_dims = None class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -102,7 +106,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, position_ids, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) @@ -112,12 +116,20 @@ def precompute_freqs_cis(head_dim, position_ids, theta, device=None): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + if rope_dims is not None and position_ids.shape[0] > 1: + mrope_section = rope_dims * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + else: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + return (cos, sin) def apply_rope(xq, xk, freqs_cis): - cos = freqs_cis[0].unsqueeze(1) - sin = freqs_cis[1].unsqueeze(1) + cos = freqs_cis[0] + sin = freqs_cis[1] q_embed = (xq * cos) + (rotate_half(xq) * sin) k_embed = (xk * cos) + (rotate_half(xk) * sin) return q_embed, k_embed @@ -292,6 +304,7 @@ class Llama2_(nn.Module): freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, + self.config.rope_dims, device=x.device) mask = None From 7cd2c4bd6ab20f35a6bb1b1f2252c3ea16da4777 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 21:45:27 -0700 Subject: [PATCH 20/30] Qwen rotary embeddings should now match reference code. 
(#9437) --- comfy/ldm/qwen_image/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index a3c726299..bf3940313 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -349,8 +349,8 @@ class QwenImageTransformer2DModel(nn.Module): img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 0] = img_ids[:, :, 1] + index - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape def forward( @@ -396,7 +396,7 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = torch.cat([hidden_states, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) From 5a8f502db5889873ffa13132b603b7b6daac605a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 22:08:11 -0700 Subject: [PATCH 21/30] Disable prompt weights for qwen. 
(#9438) --- comfy/sd1_clip.py | 5 ++++- comfy/text_encoders/qwen_image.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 1e8adbe69..f8a7c2a1b 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -534,7 +534,10 @@ class SDTokenizer: min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding) text = escape_important(text) - parsed_weights = token_weights(text, 1.0) + if kwargs.get("disable_weights", False): + parsed_weights = [(text, 1.0)] + else: + parsed_weights = token_weights(text, 1.0) # tokenize words tokens = [] diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index f07318d6c..6646b1003 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -15,7 +15,7 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" - self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image \\(color, shape, size, texture, objects, background\\), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" + self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): if llama_template is None: @@ -25,7 +25,7 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): llama_text = self.llama_template.format(text) else: llama_text = llama_template.format(text) - tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) key_name = next(iter(tokens)) embed_count = 0 qwen_tokens = tokens[key_name] From 8d38ea3bbf7e77ed7e7aee401b157dab211c5307 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:58:54 -0700 Subject: [PATCH 22/30] Fix bf16 precision issue with qwen image embeddings. 
(#9441) --- comfy/ldm/qwen_image/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index bf3940313..49f66b90a 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -347,7 +347,7 @@ class QwenImageTransformer2DModel(nn.Module): h_offset = ((h_offset + (patch_size // 2)) // patch_size) w_offset = ((w_offset + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device) img_ids[:, :, 0] = img_ids[:, :, 1] + index img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) @@ -397,9 +397,10 @@ class QwenImageTransformer2DModel(nn.Module): img_ids = torch.cat([img_ids, kontext_ids], dim=1) txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) - txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) encoder_hidden_states = self.txt_norm(encoder_hidden_states) From 2f52e8f05f2039dd67e9b9783b8397350a548b95 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 20 Aug 2025 15:15:09 +0800 Subject: [PATCH 23/30] Bump template to 0.1.62 (#9419) * Bump template to 0.1.61 * Bump template to 0.1.62 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c7a5c47ab..8d928d826 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.9 -comfyui-workflow-templates==0.1.60 +comfyui-workflow-templates==0.1.62 comfyui-embedded-docs==0.2.6 torch torchsde From 7139d6d93fc7b5481a69b687080bd36f7b531c46 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 20 Aug 2025 03:15:30 -0400 Subject: [PATCH 24/30] ComfyUI version 0.3.51 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 29ec07ca6..65f06cf37 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. 
-__version__ = "0.3.50" +__version__ = "0.3.51" diff --git a/pyproject.toml b/pyproject.toml index 659b5730a..ecbf04303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.50" +version = "0.3.51" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From fe01885acf892de636b1b2743903812099bd42e3 Mon Sep 17 00:00:00 2001 From: Harel Cain Date: Wed, 20 Aug 2025 09:33:10 +0200 Subject: [PATCH 25/30] LTXV: fix key frame noise mask dimensions for when real noise mask exists (#9425) --- comfy_extras/nodes_lt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index b5058667a..f82337a67 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -166,7 +166,7 @@ class LTXVAddGuide: negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) mask = torch.full( - (noise_mask.shape[0], 1, guiding_latent.shape[2], 1, 1), + (noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]), 1.0 - strength, dtype=noise_mask.dtype, device=noise_mask.device, From e73a9dbe30434280c69d852ea78cc4bf88bfd501 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:34:13 -0700 Subject: [PATCH 26/30] Add that qwen edit model is supported to readme. (#9463) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index fa99a8cbe..79a8a8c79 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11) + - [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model) - Video Models - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) From 0963493a9c3b6565f8537288a0fb90991391ec41 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 19:26:37 -0700 Subject: [PATCH 27/30] Support for Qwen Diffsynth Controlnets canny and depth. (#9465) These are not real controlnets but actually a patch on the model so they will be treated as such. Put them in the models/model_patches/ folder. Use the new ModelPatchLoader and QwenImageDiffsynthControlnet nodes. 
--- comfy/ldm/qwen_image/model.py | 7 + comfy/model_management.py | 8 +- comfy/model_patcher.py | 27 ++++ comfy_api/latest/_io.py | 4 + comfy_extras/nodes_model_patch.py | 138 ++++++++++++++++++++ models/model_patches/put_model_patches_here | 0 nodes.py | 1 + 7 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 comfy_extras/nodes_model_patch.py create mode 100644 models/model_patches/put_model_patches_here diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 49f66b90a..2503583cb 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -416,6 +416,7 @@ class QwenImageTransformer2DModel(nn.Module): ) patches_replace = transformer_options.get("patches_replace", {}) + patches = transformer_options.get("patches", {}) blocks_replace = patches_replace.get("dit", {}) for i, block in enumerate(self.transformer_blocks): @@ -436,6 +437,12 @@ class QwenImageTransformer2DModel(nn.Module): image_rotary_emb=image_rotary_emb, ) + if "double_block" in patches: + for p in patches["double_block"]: + out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i}) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy/model_management.py b/comfy/model_management.py index 2a9f18068..d08aee1fe 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -593,7 +593,13 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu else: minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory()) - models = set(models) + models_temp = set() + for m in models: + models_temp.add(m) + for mm in m.model_patches_models(): + models_temp.add(mm) + + models = models_temp models_to_load = [] diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 52e76b5f3..a944cb421 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -430,6 +430,9 @@ class ModelPatcher: def set_model_forward_timestep_embed_patch(self, patch): self.set_model_patch(patch, "forward_timestep_embed_patch") + def set_model_double_block_patch(self, patch): + self.set_model_patch(patch, "double_block") + def add_object_patch(self, name, obj): self.object_patches[name] = obj @@ -486,6 +489,30 @@ class ModelPatcher: if hasattr(wrap_func, "to"): self.model_options["model_function_wrapper"] = wrap_func.to(device) + def model_patches_models(self): + to = self.model_options["transformer_options"] + models = [] + if "patches" in to: + patches = to["patches"] + for name in patches: + patch_list = patches[name] + for i in range(len(patch_list)): + if hasattr(patch_list[i], "models"): + models += patch_list[i].models() + if "patches_replace" in to: + patches = to["patches_replace"] + for name in patches: + patch_list = patches[name] + for k in patch_list: + if hasattr(patch_list[k], "models"): + models += patch_list[k].models() + if "model_function_wrapper" in self.model_options: + wrap_func = self.model_options["model_function_wrapper"] + if hasattr(wrap_func, "models"): + models += wrap_func.models() + + return models + def model_dtype(self): if hasattr(self.model, "get_dtype"): return self.model.get_dtype() diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index ec1efb51d..a3a21facc 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -726,6 +726,10 @@ class SEGS(ComfyTypeIO): class AnyType(ComfyTypeIO): Type = Any 
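+# Opaque IO type for loadable model-patch modules: produced by the
+# ModelPatchLoader node and consumed by QwenImageDiffsynthControlnet
+# (both added in this commit in comfy_extras/nodes_model_patch.py).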
+@comfytype(io_type="MODEL_PATCH") +class MODEL_PATCH(ComfyTypeIO): + Type = Any + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py new file mode 100644 index 000000000..bb239bc45 --- /dev/null +++ b/comfy_extras/nodes_model_patch.py @@ -0,0 +1,138 @@ +import torch +import folder_paths +import comfy.utils +import comfy.ops +import comfy.model_management +import comfy.ldm.common_dit +import comfy.latent_formats + + +class BlockWiseControlBlock(torch.nn.Module): + # [linear, gelu, linear] + def __init__(self, dim: int = 3072, device=None, dtype=None, operations=None): + super().__init__() + self.x_rms = operations.RMSNorm(dim, eps=1e-6) + self.y_rms = operations.RMSNorm(dim, eps=1e-6) + self.input_proj = operations.Linear(dim, dim) + self.act = torch.nn.GELU() + self.output_proj = operations.Linear(dim, dim) + + def forward(self, x, y): + x, y = self.x_rms(x), self.y_rms(y) + x = self.input_proj(x + y) + x = self.act(x) + x = self.output_proj(x) + return x + + +class QwenImageBlockWiseControlNet(torch.nn.Module): + def __init__( + self, + num_layers: int = 60, + in_dim: int = 64, + additional_in_dim: int = 0, + dim: int = 3072, + device=None, dtype=None, operations=None + ): + super().__init__() + self.img_in = operations.Linear(in_dim + additional_in_dim, dim, device=device, dtype=dtype) + self.controlnet_blocks = torch.nn.ModuleList( + [ + BlockWiseControlBlock(dim, device=device, dtype=dtype, operations=operations) + for _ in range(num_layers) + ] + ) + + def process_input_latent_image(self, latent_image): + latent_image = comfy.latent_formats.Wan21().process_in(latent_image) + patch_size = 2 + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(latent_image, (1, patch_size, patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + return self.img_in(hidden_states) + + def control_block(self, img, controlnet_conditioning, block_id): + return self.controlnet_blocks[block_id](img, controlnet_conditioning) + + +class ModelPatchLoader: + @classmethod + def INPUT_TYPES(s): + return {"required": { "name": (folder_paths.get_filename_list("model_patches"), ), + }} + RETURN_TYPES = ("MODEL_PATCH",) + FUNCTION = "load_model_patch" + EXPERIMENTAL = True + + CATEGORY = "advanced/loaders" + + def load_model_patch(self, name): + model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name) + sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) + dtype = comfy.utils.weight_dtype(sd) + # TODO: this node will work with more types of model patches + model = QwenImageBlockWiseControlNet(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + model.load_state_dict(sd) + model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) + return (model,) + + +class DiffSynthCnetPatch: + def __init__(self, model_patch, vae, image, strength): + self.encoded_image = model_patch.model.process_input_latent_image(vae.encode(image)) + self.model_patch = model_patch + self.vae = vae + self.image = image + self.strength = strength + + def __call__(self, kwargs): 
+ x = kwargs.get("x") + img = kwargs.get("img") + block_index = kwargs.get("block_index") + if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]: + spacial_compression = self.vae.spacial_compression_encode() + image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + loaded_models = comfy.model_management.loaded_models(only_currently_used=True) + self.encoded_image = self.model_patch.model.process_input_latent_image(self.vae.encode(image_scaled.movedim(1, -1))) + comfy.model_management.load_models_gpu(loaded_models) + + img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) + kwargs['img'] = img + return kwargs + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.encoded_image = self.encoded_image.to(device_or_dtype) + return self + + def models(self): + return [self.model_patch] + +class QwenImageDiffsynthControlnet: + @classmethod + def INPUT_TYPES(s): + return {"required": { "model": ("MODEL",), + "model_patch": ("MODEL_PATCH",), + "vae": ("VAE",), + "image": ("IMAGE",), + "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "diffsynth_controlnet" + EXPERIMENTAL = True + + CATEGORY = "advanced/loaders/qwen" + + def diffsynth_controlnet(self, model, model_patch, vae, image, strength): + model_patched = model.clone() + image = image[:, :, :, :3] + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength)) + return (model_patched,) + + +NODE_CLASS_MAPPINGS = { + "ModelPatchLoader": ModelPatchLoader, + "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, +} diff --git a/models/model_patches/put_model_patches_here b/models/model_patches/put_model_patches_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index 35dda1b19..9681750d3 100644 --- a/nodes.py +++ b/nodes.py @@ -2322,6 +2322,7 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", + "nodes_model_patch.py" ] import_failed = [] From 0737b7e0d245de20192064da4888debbef3241c2 Mon Sep 17 00:00:00 2001 From: saurabh-pingale Date: Thu, 21 Aug 2025 07:57:57 +0530 Subject: [PATCH 28/30] fix(userdata): catch invalid workflow filenames (#9434) (#9445) --- app/user_manager.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/app/user_manager.py b/app/user_manager.py index 0ec3e46ea..a2d376c0c 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -363,10 +363,17 @@ class UserManager(): if not overwrite and os.path.exists(path): return web.Response(status=409, text="File already exists") - body = await request.read() + try: + body = await request.read() - with open(path, "wb") as f: - f.write(body) + with open(path, "wb") as f: + f.write(body) + except OSError as e: + logging.warning(f"Error saving file '{path}': {e}") + return web.Response( + status=400, + reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|" + ) user_path = self.get_request_user_filepath(request, None) if full_info: From 9fa1036f60b5264302072453be524aa55928bbaf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:09:35 -0700 Subject: [PATCH 29/30] Forgot this. 
(#9470) --- folder_paths.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/folder_paths.py b/folder_paths.py index 9ec952940..b34af39e8 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -46,6 +46,8 @@ folder_names_and_paths["photomaker"] = ([os.path.join(models_dir, "photomaker")] folder_names_and_paths["classifiers"] = ([os.path.join(models_dir, "classifiers")], {""}) +folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patches")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") From 1b2de2642d38099acdde7c460d133d93e91074f0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:33:49 -0700 Subject: [PATCH 30/30] Support diffsynth inpaint controlnet (model patch). (#9471) --- comfy_extras/nodes_model_patch.py | 39 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index bb239bc45..3eaada9bc 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -35,6 +35,7 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): device=None, dtype=None, operations=None ): super().__init__() + self.additional_in_dim = additional_in_dim self.img_in = operations.Linear(in_dim + additional_in_dim, dim, device=device, dtype=dtype) self.controlnet_blocks = torch.nn.ModuleList( [ @@ -44,7 +45,7 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): ) def process_input_latent_image(self, latent_image): - latent_image = comfy.latent_formats.Wan21().process_in(latent_image) + latent_image[:, :16] = comfy.latent_formats.Wan21().process_in(latent_image[:, :16]) patch_size = 2 hidden_states = comfy.ldm.common_dit.pad_to_patch_size(latent_image, (1, patch_size, patch_size)) orig_shape = hidden_states.shape @@ -73,19 +74,33 @@ class ModelPatchLoader: sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) dtype = comfy.utils.weight_dtype(sd) # TODO: this node will work with more types of model patches - model = QwenImageBlockWiseControlNet(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + additional_in_dim = sd["img_in.weight"].shape[1] - 64 + model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) return (model,) class DiffSynthCnetPatch: - def __init__(self, model_patch, vae, image, strength): - self.encoded_image = model_patch.model.process_input_latent_image(vae.encode(image)) + def __init__(self, model_patch, vae, image, strength, mask=None): self.model_patch = model_patch self.vae = vae self.image = image self.strength = strength + self.mask = mask + self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image)) + + def encode_latent_cond(self, image): + latent_image = self.vae.encode(image) + if self.model_patch.model.additional_in_dim > 0: + if self.mask is None: + mask_ = torch.ones_like(latent_image)[:, :self.model_patch.model.additional_in_dim // 4] + else: + mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), 
latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none") + + return torch.cat([latent_image, mask_], dim=1) + else: + return latent_image def __call__(self, kwargs): x = kwargs.get("x") @@ -95,7 +110,7 @@ class DiffSynthCnetPatch: spacial_compression = self.vae.spacial_compression_encode() image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) - self.encoded_image = self.model_patch.model.process_input_latent_image(self.vae.encode(image_scaled.movedim(1, -1))) + self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) comfy.model_management.load_models_gpu(loaded_models) img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) @@ -118,17 +133,25 @@ class QwenImageDiffsynthControlnet: "vae": ("VAE",), "image": ("IMAGE",), "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} + }, + "optional": {"mask": ("MASK",)}} RETURN_TYPES = ("MODEL",) FUNCTION = "diffsynth_controlnet" EXPERIMENTAL = True CATEGORY = "advanced/loaders/qwen" - def diffsynth_controlnet(self, model, model_patch, vae, image, strength): + def diffsynth_controlnet(self, model, model_patch, vae, image, strength, mask=None): model_patched = model.clone() image = image[:, :, :, :3] - model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength)) + if mask is not None: + if mask.ndim == 3: + mask = mask.unsqueeze(1) + if mask.ndim == 4: + mask = mask.unsqueeze(2) + mask = 1.0 - mask + + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,)
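
A quick shape walk-through of the mask normalization above, assuming
ComfyUI's usual MASK convention (a (B, H, W) float tensor where 1.0 marks
the region to repaint); the tensor values are illustrative only:

    import torch

    mask = torch.ones(1, 512, 512)   # (B, H, W) as produced by mask nodes
    if mask.ndim == 3:
        mask = mask.unsqueeze(1)     # (B, 1, H, W)
    if mask.ndim == 4:
        mask = mask.unsqueeze(2)     # (B, 1, 1, H, W), aligning with the
                                     # (B, C, T, H, W) video-style latent
    mask = 1.0 - mask                # inverted: 1.0 now marks pixels to keep

    # Inside encode_latent_cond the channel mean is then taken, the mask is
    # resized to the latent grid, and the result is concatenated to the
    # latent as the extra input channels expected by the inpaint patch.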