Merge branch 'master' of github.com:comfyanonymous/ComfyUI

2026-03-07 10:17:31 +08:00 · 2025-08-07 13:23:38 -07:00 · 2025-08-07 13:23:38 -07:00 · d8dbff9226
commit d8dbff9226
parent b42b2322fa 05df2df489
18 changed files with 3272 additions and 109 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,2 +1,3 @@
 /comfy/web/assets/** linguist-generated
 /comfy/web/** linguist-vendored
 comfy_api_nodes/apis/__init__.py linguist-generated
--- a/README.md
+++ b/README.md
@ -39,6 +39,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
    - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
   - [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
   - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
 - Image Editing Models
   - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
   - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
--- a/comfy/init.py
+++ b/comfy/init.py
@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.48"
+__version__ = "0.3.49"
--- a/comfy/ldm/qwen_image/model.py
+++ b/comfy/ldm/qwen_image/model.py
@ -0,0 +1,400 @@
 # https://github.com/QwenLM/Qwen-Image (Apache 2.0)
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Optional, Tuple
 from einops import repeat
 from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
 from comfy.ldm.modules.attention import optimized_attention_masked
 from comfy.ldm.flux.layers import EmbedND
 import comfy.ldm.common_dit
 class GELU(nn.Module):
    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None):
        super().__init__()
        self.proj = operations.Linear(dim_in, dim_out, bias=bias, dtype=dtype, device=device)
        self.approximate = approximate
    def forward(self, hidden_states):
        hidden_states = self.proj(hidden_states)
        hidden_states = F.gelu(hidden_states, approximate=self.approximate)
        return hidden_states
 class FeedForward(nn.Module):
    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        inner_dim=None,
        bias: bool = True,
        dtype=None, device=None, operations=None
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim
        self.net = nn.ModuleList([])
        self.net.append(GELU(dim, inner_dim, approximate="tanh", bias=bias, dtype=dtype, device=device, operations=operations))
        self.net.append(nn.Dropout(dropout))
        self.net.append(operations.Linear(inner_dim, dim_out, bias=bias, dtype=dtype, device=device))
    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states
 def apply_rotary_emb(x, freqs_cis):
    if x.shape[1] == 0:
        return x
    t_ = x.reshape(*x.shape[:-1], -1, 1, 2)
    t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
    return t_out.reshape(*x.shape)
 class QwenTimestepProjEmbeddings(nn.Module):
    def __init__(self, embedding_dim, pooled_projection_dim, dtype=None, device=None, operations=None):
        super().__init__()
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
        self.timestep_embedder = TimestepEmbedding(
            in_channels=256,
            time_embed_dim=embedding_dim,
            dtype=dtype,
            device=device,
            operations=operations
        )
    def forward(self, timestep, hidden_states):
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))
        return timesteps_emb
 class Attention(nn.Module):
    def __init__(
        self,
        query_dim: int,
        dim_head: int = 64,
        heads: int = 8,
        dropout: float = 0.0,
        bias: bool = False,
        eps: float = 1e-5,
        out_bias: bool = True,
        out_dim: int = None,
        out_context_dim: int = None,
        dtype=None,
        device=None,
        operations=None
    ):
        super().__init__()
        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
        self.inner_kv_dim = self.inner_dim
        self.heads = heads
        self.dim_head = dim_head
        self.out_dim = out_dim if out_dim is not None else query_dim
        self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim
        self.dropout = dropout
        # Q/K normalization
        self.norm_q = operations.RMSNorm(dim_head, eps=eps, elementwise_affine=True, dtype=dtype, device=device)
        self.norm_k = operations.RMSNorm(dim_head, eps=eps, elementwise_affine=True, dtype=dtype, device=device)
        self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
        self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)
        # Image stream projections
        self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device)
        self.to_k = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
        self.to_v = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
        # Text stream projections
        self.add_q_proj = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device)
        self.add_k_proj = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
        self.add_v_proj = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
        # Output projections
        self.to_out = nn.ModuleList([
            operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device),
            nn.Dropout(dropout)
        ])
        self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device)
    def forward(
        self,
        hidden_states: torch.FloatTensor,  # Image stream
        encoder_hidden_states: torch.FloatTensor = None,  # Text stream
        encoder_hidden_states_mask: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        seq_txt = encoder_hidden_states.shape[1]
        img_query = self.to_q(hidden_states).unflatten(-1, (self.heads, -1))
        img_key = self.to_k(hidden_states).unflatten(-1, (self.heads, -1))
        img_value = self.to_v(hidden_states).unflatten(-1, (self.heads, -1))
        txt_query = self.add_q_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1))
        txt_key = self.add_k_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1))
        txt_value = self.add_v_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1))
        img_query = self.norm_q(img_query)
        img_key = self.norm_k(img_key)
        txt_query = self.norm_added_q(txt_query)
        txt_key = self.norm_added_k(txt_key)
        joint_query = torch.cat([txt_query, img_query], dim=1)
        joint_key = torch.cat([txt_key, img_key], dim=1)
        joint_value = torch.cat([txt_value, img_value], dim=1)
        joint_query = apply_rotary_emb(joint_query, image_rotary_emb)
        joint_key = apply_rotary_emb(joint_key, image_rotary_emb)
        joint_query = joint_query.flatten(start_dim=2)
        joint_key = joint_key.flatten(start_dim=2)
        joint_value = joint_value.flatten(start_dim=2)
        joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask)
        txt_attn_output = joint_hidden_states[:, :seq_txt, :]
        img_attn_output = joint_hidden_states[:, seq_txt:, :]
        img_attn_output = self.to_out[0](img_attn_output)
        img_attn_output = self.to_out[1](img_attn_output)
        txt_attn_output = self.to_add_out(txt_attn_output)
        return img_attn_output, txt_attn_output
 class QwenImageTransformerBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        eps: float = 1e-6,
        dtype=None,
        device=None,
        operations=None
    ):
        super().__init__()
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.img_mod = nn.Sequential(
            nn.SiLU(),
            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),
        )
        self.img_norm1 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device)
        self.img_norm2 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device)
        self.img_mlp = FeedForward(dim=dim, dim_out=dim, dtype=dtype, device=device, operations=operations)
        self.txt_mod = nn.Sequential(
            nn.SiLU(),
            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),
        )
        self.txt_norm1 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device)
        self.txt_norm2 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device)
        self.txt_mlp = FeedForward(dim=dim, dim_out=dim, dtype=dtype, device=device, operations=operations)
        self.attn = Attention(
            query_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            eps=eps,
            dtype=dtype,
            device=device,
            operations=operations,
        )
    def _modulate(self, x, mod_params):
        shift, scale, gate = mod_params.chunk(3, dim=-1)
        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_hidden_states_mask: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        img_mod_params = self.img_mod(temb)
        txt_mod_params = self.txt_mod(temb)
        img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)
        txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)
        img_normed = self.img_norm1(hidden_states)
        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
        txt_normed = self.txt_norm1(encoder_hidden_states)
        txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
        img_attn_output, txt_attn_output = self.attn(
            hidden_states=img_modulated,
            encoder_hidden_states=txt_modulated,
            encoder_hidden_states_mask=encoder_hidden_states_mask,
            image_rotary_emb=image_rotary_emb,
        )
        hidden_states = hidden_states + img_gate1 * img_attn_output
        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
        img_normed2 = self.img_norm2(hidden_states)
        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
        hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2)
        txt_normed2 = self.txt_norm2(encoder_hidden_states)
        txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
        encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2)
        return encoder_hidden_states, hidden_states
 class LastLayer(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        conditioning_embedding_dim: int,
        elementwise_affine=False,
        eps=1e-6,
        bias=True,
        dtype=None, device=None, operations=None
    ):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = operations.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias, dtype=dtype, device=device)
        self.norm = operations.LayerNorm(embedding_dim, eps, elementwise_affine=False, bias=bias, dtype=dtype, device=device)
    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
        emb = self.linear(self.silu(conditioning_embedding))
        scale, shift = torch.chunk(emb, 2, dim=1)
        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
        return x
 class QwenImageTransformer2DModel(nn.Module):
    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 64,
        out_channels: Optional[int] = 16,
        num_layers: int = 60,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 3584,
        pooled_projection_dim: int = 768,
        guidance_embeds: bool = False,
        axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
        image_model=None,
        dtype=None,
        device=None,
        operations=None,
    ):
        super().__init__()
        self.dtype = dtype
        self.patch_size = patch_size
        self.out_channels = out_channels or in_channels
        self.inner_dim = num_attention_heads * attention_head_dim
        self.pe_embedder = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope))
        self.time_text_embed = QwenTimestepProjEmbeddings(
            embedding_dim=self.inner_dim,
            pooled_projection_dim=pooled_projection_dim,
            dtype=dtype,
            device=device,
            operations=operations
        )
        self.txt_norm = operations.RMSNorm(joint_attention_dim, eps=1e-6, dtype=dtype, device=device)
        self.img_in = operations.Linear(in_channels, self.inner_dim, dtype=dtype, device=device)
        self.txt_in = operations.Linear(joint_attention_dim, self.inner_dim, dtype=dtype, device=device)
        self.transformer_blocks = nn.ModuleList([
            QwenImageTransformerBlock(
                dim=self.inner_dim,
                num_attention_heads=num_attention_heads,
                attention_head_dim=attention_head_dim,
                dtype=dtype,
                device=device,
                operations=operations
            )
            for _ in range(num_layers)
        ])
        self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
        self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device)
        self.gradient_checkpointing = False
    def pos_embeds(self, x, context):
        bs, c, t, h, w = x.shape
        patch_size = self.patch_size
        h_len = ((h + (patch_size // 2)) // patch_size)
        w_len = ((w + (patch_size // 2)) // patch_size)
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
        txt_start = round(max(h_len, w_len))
        txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(bs, 1, 3)
        ids = torch.cat((txt_ids, img_ids), dim=1)
        return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype)
    def forward(
        self,
        x,
        timesteps,
        context,
        attention_mask=None,
        guidance: torch.Tensor = None,
        **kwargs
    ):
        timestep = timesteps
        encoder_hidden_states = context
        encoder_hidden_states_mask = attention_mask
        image_rotary_emb = self.pos_embeds(x, context)
        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size))
        orig_shape = hidden_states.shape
        hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2)
        hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5)
        hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4)
        hidden_states = self.img_in(hidden_states)
        encoder_hidden_states = self.txt_norm(encoder_hidden_states)
        encoder_hidden_states = self.txt_in(encoder_hidden_states)
        if guidance is not None:
            guidance = guidance * 1000
        temb = (
            self.time_text_embed(timestep, hidden_states)
            if guidance is None
            else self.time_text_embed(timestep, guidance, hidden_states)
        )
        for block in self.transformer_blocks:
            encoder_hidden_states, hidden_states = block(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                encoder_hidden_states_mask=encoder_hidden_states_mask,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
            )
        hidden_states = self.norm_out(hidden_states, temb)
        hidden_states = self.proj_out(hidden_states)
        hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2)
        hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5)
        return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]]
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -305,6 +305,15 @@ def model_lora_keys_unet(model, key_map=None):
                key_lora = k[len("diffusion_model."):-len(".weight")]
                key_map["{}".format(key_lora)] = k
    if isinstance(model, comfy.model_base.QwenImage):
        for k in sdk:
            if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format
                key_lora = k[len("diffusion_model."):-len(".weight")]
                # Direct mapping for transformer_blocks format (QwenImage LoRA format)
                key_map["{}".format(key_lora)] = k
                # Support transformer prefix format
                key_map["transformer.{}".format(key_lora)] = k
    return key_map
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -18,11 +18,10 @@
 import logging
 import math
 import torch
 from enum import Enum
 from typing import TypeVar, Type, Protocol, Any, Optional
 import torch
 from . import conds
 from . import latent_formats
 from . import model_management
@ -52,6 +51,7 @@ from .ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmenta
 from .ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
 from .ldm.omnigen.omnigen2 import OmniGen2Transformer2DModel
 from .ldm.pixart.pixartms import PixArtMS
 from .ldm.qwen_image.model import QwenImageTransformer2DModel
 from .ldm.wan.model import WanModel, VaceWanModel, CameraWanModel
 from .model_management_types import ModelManageable
 from .model_sampling import CONST, ModelSamplingDiscreteFlow, ModelSamplingFlux, IMG_TO_IMG
@ -1360,3 +1360,15 @@ class Omnigen2(BaseModel):
        if ref_latents is not None:
            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
        return out
 class QwenImage(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=QwenImageTransformer2DModel)
    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        return out
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -487,6 +487,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["timestep_scale"] = 1000.0
        return dit_config
    if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys:  # Qwen Image
        dit_config = {}
        dit_config["image_model"] = "qwen_image"
        return dit_config
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None
@ -879,7 +884,7 @@ def convert_diffusers_mmdit(state_dict, output_prefix=""):
        depth_single_blocks = count_blocks(state_dict, 'single_transformer_blocks.{}.')
        hidden_size = state_dict["x_embedder.bias"].shape[0]
        sd_map = utils.flux_to_diffusers({"depth": depth, "depth_single_blocks": depth_single_blocks, "hidden_size": hidden_size}, output_prefix=output_prefix)
-    elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict:  # SD3
+    elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict and 'pos_embed.proj.weight' in state_dict:  # SD3
        num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
        depth = state_dict["pos_embed.proj.weight"].shape[0] // 64
        sd_map = utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix)
--- a/comfy/nodes/base_nodes.py
+++ b/comfy/nodes/base_nodes.py
@ -995,7 +995,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"clip_name": (get_filename_list_with_downloadable("text_encoders", KNOWN_CLIP_MODELS),),
-                             "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2"],),
+                             "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image"],),
                             },
                "optional": {
                    "device": (["default", "cpu"], {"advanced": True}),
@ -1321,12 +1321,12 @@ class RepeatLatentBatch:
        s = samples.copy()
        s_in = samples["samples"]
-        s["samples"] = s_in.repeat((amount, 1, 1, 1))
+        s["samples"] = s_in.repeat((amount,) + ((1, ) * (s_in.ndim - 1)))
        if "noise_mask" in samples and samples["noise_mask"].shape[0] > 1:
            masks = samples["noise_mask"]
            if masks.shape[0] < s_in.shape[0]:
-                masks = masks.repeat(math.ceil(s_in.shape[0] / masks.shape[0]), 1, 1, 1)[:s_in.shape[0]]
+                masks = masks.repeat((math.ceil(s_in.shape[0] / masks.shape[0]),) + ((1,) * (masks.ndim - 1)))[:s_in.shape[0]]
-            s["noise_mask"] = samples["noise_mask"].repeat((amount, 1, 1, 1))
+            s["noise_mask"] = samples["noise_mask"].repeat((amount,) + ((1, ) * (samples["noise_mask"].ndim - 1)))
        if "batch_index" in s:
            offset = max(s["batch_index"]) - min(s["batch_index"]) + 1
            s["batch_index"] = s["batch_index"] + [x + (i * offset) for i in range(1, amount) for x in s["batch_index"]]
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -6,11 +6,10 @@ import logging
 import math
 import os
 import os.path
 from enum import Enum
 from typing import Any, Optional
 import torch
 import yaml
 from enum import Enum
 from typing import Any, Optional
 from . import clip_vision
 from . import diffusers_convert
@ -55,6 +54,7 @@ from .text_encoders import lt
 from .text_encoders import lumina2
 from .text_encoders import omnigen2
 from .text_encoders import pixart_t5
 from .text_encoders import qwen_image
 from .text_encoders import sa_t5
 from .text_encoders import sd2_clip
 from .text_encoders import sd3_clip
@ -817,6 +817,7 @@ class CLIPType(Enum):
    CHROMA = 15
    ACE = 16
    OMNIGEN2 = 17
    QWEN_IMAGE = 18
@dataclasses.dataclass
@ -847,6 +848,7 @@ class TEModel(Enum):
    T5_XXL_OLD = 8
    GEMMA_2_2B = 9
    QWEN25_3B = 10
    QWEN25_7B = 11
 def detect_te_model(sd):
@ -869,7 +871,11 @@ def detect_te_model(sd):
    if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
        return TEModel.GEMMA_2_2B
    if 'model.layers.0.self_attn.k_proj.bias' in sd:
        weight = sd['model.layers.0.self_attn.k_proj.bias']
        if weight.shape[0] == 256:
            return TEModel.QWEN25_3B
        if weight.shape[0] == 512:
            return TEModel.QWEN25_7B
    if "model.layers.0.post_attention_layernorm.weight" in sd:
        return TEModel.LLAMA3_8
    return None
@ -976,6 +982,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif te_model == TEModel.QWEN25_3B:
            clip_target.clip = omnigen2.te(**llama_detect(clip_data))
            clip_target.tokenizer = omnigen2.Omnigen2Tokenizer
        elif te_model == TEModel.QWEN25_7B:
            clip_target.clip = qwen_image.te(**llama_detect(clip_data))
            clip_target.tokenizer = qwen_image.QwenImageTokenizer
        else:
            # clip_l
            if clip_type == CLIPType.SD3:
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -23,6 +23,7 @@ from .text_encoders import sa_t5
 from .text_encoders import sd2_clip
 from .text_encoders import sd3_clip
 from .text_encoders import wan
 from .text_encoders import qwen_image
 class SD15(supported_models_base.BASE):
@ -1326,7 +1327,36 @@ class Omnigen2(supported_models_base.BASE):
        hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
        return supported_models_base.ClipTarget(omnigen2.Omnigen2Tokenizer, omnigen2.te(**hunyuan_detect))
 class QwenImage(supported_models_base.BASE):
    unet_config = {
        "image_model": "qwen_image",
    }
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2]
+    sampling_settings = {
        "multiplier": 1.0,
        "shift": 1.15,
    }
    memory_usage_factor = 1.8 #TODO
    unet_extra_config = {}
    latent_format = latent_formats.Wan21
    supported_inference_dtypes = [torch.bfloat16, torch.float32]
    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]
    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.QwenImage(self, device=device)
        return out
    def clip_target(self, state_dict={}):
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(qwen_image.QwenImageTokenizer, qwen_image.te(**hunyuan_detect))
 models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage]
 models += [SVD_img2vid]
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@ -43,6 +43,23 @@ class Qwen25_3BConfig:
    qkv_bias = True
@dataclass
 class Qwen25_7BVLI_Config:
    vocab_size: int = 152064
    hidden_size: int = 3584
    intermediate_size: int = 18944
    num_hidden_layers: int = 28
    num_attention_heads: int = 28
    num_key_value_heads: int = 4
    max_position_embeddings: int = 128000
    rms_norm_eps: float = 1e-6
    rope_theta: float = 1000000.0
    transformer_type: str = "llama"
    head_dim = 128
    rms_norm_add = False
    mlp_activation = "silu"
    qkv_bias = True
@dataclass
 class Gemma2_2B_Config:
    vocab_size: int = 256000
@ -355,6 +372,15 @@ class Qwen25_3B(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
 class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        config = Qwen25_7BVLI_Config(**config_dict)
        self.num_layers = config.num_hidden_layers
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
 class Gemma2_2B(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
--- a/comfy/text_encoders/qwen_image.py
+++ b/comfy/text_encoders/qwen_image.py
@ -0,0 +1,71 @@
 from transformers import Qwen2Tokenizer
 from comfy import sd1_clip
 import comfy.text_encoders.llama
 import os
 import torch
 import numbers
 class Qwen25_7BVLITokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=3584, embedding_key='qwen25_7b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
 class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer)
        self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs):
        if llama_template is None:
            llama_text = self.llama_template.format(text)
        else:
            llama_text = llama_template.format(text)
        return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs)
 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 class QwenImageTEModel(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
    def encode_token_weights(self, token_weight_pairs):
        out, pooled, extra = super().encode_token_weights(token_weight_pairs)
        tok_pairs = token_weight_pairs["qwen25_7b"][0]
        count_im_start = 0
        for i, v in enumerate(tok_pairs):
            elem = v[0]
            if not torch.is_tensor(elem):
                if isinstance(elem, numbers.Integral):
                    if elem == 151644 and count_im_start < 2:
                        template_end = i
                        count_im_start += 1
        if out.shape[1] > (template_end + 3):
            if tok_pairs[template_end + 1][0] == 872:
                if tok_pairs[template_end + 2][0] == 198:
                    template_end += 3
        out = out[:, template_end:]
        extra["attention_mask"] = extra["attention_mask"][:, template_end:]
        if extra["attention_mask"].sum() == torch.numel(extra["attention_mask"]):
            extra.pop("attention_mask")  # attention mask is useless if no masked elements
        return out, pooled, extra
 def te(dtype_llama=None, llama_scaled_fp8=None):
    class QwenImageTEModel_(QwenImageTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
                model_options = model_options.copy()
                model_options["scaled_fp8"] = llama_scaled_fp8
            if dtype_llama is not None:
                dtype = dtype_llama
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return QwenImageTEModel_
--- a/comfy/weight_adapter/lora.py
+++ b/comfy/weight_adapter/lora.py
@ -97,6 +97,7 @@ class LoRAAdapter(WeightAdapterBase):
        diffusers3_lora = "{}.lora.up.weight".format(x)
        mochi_lora = "{}.lora_B".format(x)
        transformers_lora = "{}.lora_linear_layer.up.weight".format(x)
        qwen_default_lora = "{}.lora_B.default.weight".format(x)
        A_name = None
        B_name = None
        mid_name = None
@ -125,6 +126,10 @@ class LoRAAdapter(WeightAdapterBase):
            A_name = transformers_lora
            B_name = "{}.lora_linear_layer.down.weight".format(x)
            mid_name = None
        elif qwen_default_lora in lora.keys():
            A_name = qwen_default_lora
            B_name = "{}.lora_A.default.weight".format(x)
            mid_name = None
        if A_name is not None:
            mid = None
--- a/comfy_api_nodes/apis/init.py
+++ b/comfy_api_nodes/apis/init.py
--- a/comfy_api_nodes/apis/tripo_api.py
+++ b/comfy_api_nodes/apis/tripo_api.py
@ -127,7 +127,7 @@ class TripoTextToModelRequest(BaseModel):
    type: TripoTaskType = Field(TripoTaskType.TEXT_TO_MODEL, description='Type of task')
    prompt: str = Field(..., description='The text prompt describing the model to generate', max_length=1024)
    negative_prompt: Optional[str] = Field(None, description='The negative text prompt', max_length=1024)
-    model_version: Optional[TripoModelVersion] = TripoModelVersion.V2_5
+    model_version: Optional[TripoModelVersion] = TripoModelVersion.v2_5_20250123
    face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to')
    texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model')
    pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model')
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@ -8,10 +8,10 @@ from typing import Optional
 from comfy.comfy_types.node_typing import IO, ComfyNodeABC
 from comfy_api.input_impl.video_types import VideoFromFile
 from comfy_api_nodes.apis import (
-    Veo2GenVidRequest,
+    VeoGenVidRequest,
-    Veo2GenVidResponse,
+    VeoGenVidResponse,
-    Veo2GenVidPollRequest,
+    VeoGenVidPollRequest,
-    Veo2GenVidPollResponse
+    VeoGenVidPollResponse
 )
 from comfy_api_nodes.apis.client import (
    ApiEndpoint,
@ -35,7 +35,7 @@ def convert_image_to_base64(image: torch.Tensor):
    return tensor_to_base64_string(scaled_image)
-def get_video_url_from_response(poll_response: Veo2GenVidPollResponse) -> Optional[str]:
+def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optional[str]:
    if (
        poll_response.response
        and hasattr(poll_response.response, "videos")
@ -130,6 +130,14 @@ class VeoVideoGenerationNode(ComfyNodeABC):
                    "default": None,
                    "tooltip": "Optional reference image to guide video generation",
                }),
                "model": (
                    IO.COMBO,
                    {
                        "options": ["veo-2.0-generate-001"],
                        "default": "veo-2.0-generate-001",
                        "tooltip": "Veo 2 model to use for video generation",
                    },
                ),
            },
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
@ -141,7 +149,7 @@ class VeoVideoGenerationNode(ComfyNodeABC):
    RETURN_TYPES = (IO.VIDEO,)
    FUNCTION = "generate_video"
    CATEGORY = "api node/video/Veo"
-    DESCRIPTION = "Generates videos from text prompts using Google's Veo API"
+    DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API"
    API_NODE = True
    def generate_video(
@ -154,6 +162,8 @@ class VeoVideoGenerationNode(ComfyNodeABC):
        person_generation="ALLOW",
        seed=0,
        image=None,
        model="veo-2.0-generate-001",
        generate_audio=False,
        unique_id: Optional[str] = None,
        **kwargs,
    ):
@ -188,16 +198,19 @@ class VeoVideoGenerationNode(ComfyNodeABC):
            parameters["negativePrompt"] = negative_prompt
        if seed > 0:
            parameters["seed"] = seed
        # Only add generateAudio for Veo 3 models
        if "veo-3.0" in model:
            parameters["generateAudio"] = generate_audio
        # Initial request to start video generation
        initial_operation = SynchronousOperation(
            endpoint=ApiEndpoint(
-                path="/proxy/veo/generate",
+                path=f"/proxy/veo/{model}/generate",
                method=HttpMethod.POST,
-                request_model=Veo2GenVidRequest,
+                request_model=VeoGenVidRequest,
-                response_model=Veo2GenVidResponse
+                response_model=VeoGenVidResponse
            ),
-            request=Veo2GenVidRequest(
+            request=VeoGenVidRequest(
                instances=instances,
                parameters=parameters
            ),
@ -223,16 +236,16 @@ class VeoVideoGenerationNode(ComfyNodeABC):
        # Define the polling operation
        poll_operation = PollingOperation(
            poll_endpoint=ApiEndpoint(
-                path="/proxy/veo/poll",
+                path=f"/proxy/veo/{model}/poll",
                method=HttpMethod.POST,
-                request_model=Veo2GenVidPollRequest,
+                request_model=VeoGenVidPollRequest,
-                response_model=Veo2GenVidPollResponse
+                response_model=VeoGenVidPollResponse
            ),
            completed_statuses=["completed"],
            failed_statuses=[],  # No failed statuses, we'll handle errors after polling
            status_extractor=status_extractor,
            progress_extractor=progress_extractor,
-            request=Veo2GenVidPollRequest(
+            request=VeoGenVidPollRequest(
                operationName=operation_name
            ),
            auth_kwargs=kwargs,
@ -298,11 +311,64 @@ class VeoVideoGenerationNode(ComfyNodeABC):
        return (VideoFromFile(video_io),)
-# Register the node
+class Veo3VideoGenerationNode(VeoVideoGenerationNode):
    """
    Generates videos from text prompts using Google's Veo 3 API.
    Supported models:
    - veo-3.0-generate-001
    - veo-3.0-fast-generate-001
    This node extends the base Veo node with Veo 3 specific features including
    audio generation and fixed 8-second duration.
    """
    @classmethod
    def INPUT_TYPES(s):
        parent_input = super().INPUT_TYPES()
        # Update model options for Veo 3
        parent_input["optional"]["model"] = (
            IO.COMBO,
            {
                "options": ["veo-3.0-generate-001", "veo-3.0-fast-generate-001"],
                "default": "veo-3.0-generate-001",
                "tooltip": "Veo 3 model to use for video generation",
            },
        )
        # Add generateAudio parameter
        parent_input["optional"]["generate_audio"] = (
            IO.BOOLEAN,
            {
                "default": False,
                "tooltip": "Generate audio for the video. Supported by all Veo 3 models.",
            }
        )
        # Update duration constraints for Veo 3 (only 8 seconds supported)
        parent_input["optional"]["duration_seconds"] = (
            IO.INT,
            {
                "default": 8,
                "min": 8,
                "max": 8,
                "step": 1,
                "display": "number",
                "tooltip": "Duration of the output video in seconds (Veo 3 only supports 8 seconds)",
            },
        )
        return parent_input
 # Register the nodes
 NODE_CLASS_MAPPINGS = {
    "VeoVideoGenerationNode": VeoVideoGenerationNode,
    "Veo3VideoGenerationNode": Veo3VideoGenerationNode,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "VeoVideoGenerationNode": "Google Veo2 Video Generation",
+    "VeoVideoGenerationNode": "Google Veo 2 Video Generation",
    "Veo3VideoGenerationNode": "Google Veo 3 Video Generation",
 }
--- a/comfy_extras/nodes/nodes_model_merging_model_specific.py
+++ b/comfy_extras/nodes/nodes_model_merging_model_specific.py
@ -322,6 +322,29 @@ class ModelMergeCosmosPredict2_14B(nodes_model_merging.ModelMergeBlocks):
        return {"required": arg_dict}
 class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks):
    CATEGORY = "advanced/model_merging/model_specific"
    @classmethod
    def INPUT_TYPES(s):
        arg_dict = { "model1": ("MODEL",),
                              "model2": ("MODEL",)}
        argument = ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01})
        arg_dict["pos_embeds."] = argument
        arg_dict["img_in."] = argument
        arg_dict["txt_norm."] = argument
        arg_dict["txt_in."] = argument
        arg_dict["time_text_embed."] = argument
        for i in range(60):
            arg_dict["transformer_blocks.{}.".format(i)] = argument
        arg_dict["proj_out."] = argument
        return {"required": arg_dict}
 NODE_CLASS_MAPPINGS = {
    "ModelMergeSD1": ModelMergeSD1,
    "ModelMergeSD2": ModelMergeSD1,  # SD1 and SD2 have the same blocks
@ -337,4 +360,5 @@ NODE_CLASS_MAPPINGS = {
    "ModelMergeWAN2_1": ModelMergeWAN2_1,
    "ModelMergeCosmosPredict2_2B": ModelMergeCosmosPredict2_2B,
    "ModelMergeCosmosPredict2_14B": ModelMergeCosmosPredict2_14B,
    "ModelMergeQwenImage": ModelMergeQwenImage,
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "comfyui"
-version = "0.3.48"
+version = "0.3.49"
 description = "An installable version of ComfyUI"
 readme = "README.md"
 authors = [
@ -18,8 +18,8 @@ classifiers = [
 ]
 dependencies = [
-    "comfyui-frontend-package>=1.23.4",
+    "comfyui-frontend-package>=1.24.4",
-    "comfyui-workflow-templates>=0.1.47",
+    "comfyui-workflow-templates>=0.1.51",
    "comfyui-embedded-docs>=0.2.4",
    "torch",
    "torchvision",