Mirror of https://github.com/comfyanonymous/ComfyUI.git, synced 2026-02-10 13:32:36 +08:00.

Commit: 4653b9008d ("final changes")
Parent: 4908e7412e
@@ -246,7 +246,7 @@ class CLIPVision(torch.nn.Module):
            x = self.post_layernorm(x)
            if self.use_head:
                pooled_output = self.head(x)
            else:
                pooled_output = x
        else:
            pooled_output = self.post_layernorm(x[:, 0, :])
@@ -19,6 +19,8 @@ class Output:

def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True, resize_mode="bicubic"):
    image = image[:, :, :, :3] if image.shape[3] > 3 else image
    if image.dtype == torch.uint8:
        image = image.float() / 255.0
    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)
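The added uint8 branch lets 8-bit frames flow straight into the CLIP-style preprocessing. A minimal sketch of that normalization path, assuming an NHWC batch and the usual (x - mean) / std finish (the tail of the function is not shown in this hunk, so the last line is an assumption):

import torch

def normalize_nhwc(image, mean, std):
    # Drop any alpha channel, scale 8-bit input to [0, 1], then standardize.
    image = image[:, :, :, :3] if image.shape[3] > 3 else image
    if image.dtype == torch.uint8:
        image = image.float() / 255.0
    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)  # NHWC -> NCHW
    return (image - mean.view(1, -1, 1, 1)) / std.view(1, -1, 1, 1)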
@@ -55,7 +55,7 @@ class TimestepEmbedder(TimestepEmbedderParent):
    def forward(self, t):
        t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class SwiGLU(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, device, dtype, operations):
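timestep_embedding here is the usual sinusoidal embedding fed into a small MLP. A self-contained sketch of that idea (illustrative, not the exact helper used above):

import math
import torch

def sinusoidal_timestep_embedding(t, dim, max_period=10000.0):
    # Half the channels get cos, half get sin, at geometrically spaced frequencies.
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32, device=t.device) / half)
    args = t[:, None].float() * freqs[None]
    emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2 == 1:
        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
    return emb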
@@ -150,9 +150,9 @@ class ChannelLastConv1d(nn.Module):
            self.register_parameter("bias", underlying.bias)
        else:
            self.register_parameter("bias", None)

        object.__setattr__(self, "_underlying", underlying)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        self._underlying = self._underlying.to(x.dtype)
        x = self._underlying(x.permute(0, 2, 1))
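ChannelLastConv1d exposes the underlying Conv1d's parameters on the wrapper while accepting (B, T, C) input; the permute handles the layout change. A minimal sketch of that wrapping idea (an illustrative class, not the module above):

import torch
import torch.nn as nn

class ChannelsLastConv1d(nn.Module):
    """Accepts (B, T, C) input, runs nn.Conv1d over (B, C, T), returns (B, T, C)."""
    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, **kwargs)

    def forward(self, x):
        return self.conv(x.permute(0, 2, 1)).permute(0, 2, 1)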
@@ -204,7 +204,7 @@ class ModulateDiT(nn.Module):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(self.act(x))


class FinalLayer1D(nn.Module):
    def __init__(self, hidden_size, patch_size, out_channels, device=None, dtype=None, operations = None):
        factory_kwargs = {"device": device, "dtype": dtype}
@@ -223,7 +223,7 @@ class FinalLayer1D(nn.Module):
        self.linear = self.linear.to(x.dtype)
        x = self.linear(x)
        return x


class MLP(nn.Module):
    def __init__(
        self,
@@ -254,7 +254,7 @@ class MLP(nn.Module):
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        return self.drop2(self.fc2(self.norm(self.drop1(self.act(self.fc1(x))))))


def _to_tuple(x, dim=2):
@@ -297,7 +297,7 @@ def get_meshgrid_nd(start, *args, dim=2):
def get_nd_rotary_pos_embed(
    rope_dim_list, start, *args, theta=10000.0, use_real=False, theta_rescale_factor=1.0, freq_scaling=1.0
):

    grid = get_meshgrid_nd(start, *args, dim=len(rope_dim_list))

    embs = []
@@ -411,14 +411,14 @@ class TwoStreamCABlock(nn.Module):

        self.max_text_len = 100
        self.rope_dim_list = None

        self.audio_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.v_cond_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)

        self.audio_cross_q = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.v_cond_cross_q = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.text_cross_kv = operations.Linear(hidden_size, hidden_size * 2, bias=qkv_bias, **factory_kwargs)

        self.audio_cross_proj = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.v_cond_cross_proj = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
@@ -435,11 +435,11 @@ class TwoStreamCABlock(nn.Module):
    def build_rope_for_text(self, text_len, head_dim, rope_dim_list=None):
        target_ndim = 1  # n-d RoPE
        rope_sizes = [text_len]

        if rope_dim_list is None:
            rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
        assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"

        text_freqs_cos, text_freqs_sin = get_nd_rotary_pos_embed(
            rope_dim_list=rope_dim_list,
            start=rope_sizes,
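For the 1-D case used here, the rotary tables are just cos/sin of position times an inverse frequency. A minimal sketch of that construction (a generic RoPE table in "real" cos/sin form, not the exact get_nd_rotary_pos_embed helper):

import torch

def rope_1d(seq_len, head_dim, theta=10000.0):
    # Pairs of channels share one rotation frequency, so we need head_dim // 2 of them.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    pos = torch.arange(seq_len).float()
    angles = torch.outer(pos, inv_freq)          # (seq_len, head_dim // 2)
    return torch.cos(angles), torch.sin(angles)  # cos table, sin table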
@@ -461,7 +461,7 @@ class TwoStreamCABlock(nn.Module):
        sync_vec: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

        (audio_mod1_shift, audio_mod1_scale, audio_mod1_gate,
         audio_mod2_shift, audio_mod2_scale, audio_mod2_gate,
         audio_mod3_shift, audio_mod3_scale, audio_mod3_gate,
        ) = self.audio_mod(sync_vec if sync_vec is not None else vec).chunk(9, dim=-1)
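The nine chunks are three (shift, scale, gate) triplets, one per sub-block, in the usual adaLN-Zero style. A minimal sketch of how such a triplet is applied, assuming the common modulate/apply_gate convention (illustrative helpers, not the exact ones in this file):

import torch

def modulate(x, shift, scale):
    # Per-channel scale and shift, broadcast over the sequence dimension.
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

def apply_gate(x, gate):
    # Gate the residual branch before it is added back to the stream.
    return x * gate.unsqueeze(1)

# usage sketch: x = x + apply_gate(block(modulate(norm(x), shift, scale)), gate)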
@@ -477,19 +477,19 @@ class TwoStreamCABlock(nn.Module):
            v_cond_mod3_scale,
            v_cond_mod3_gate,
        ) = self.v_cond_mod(vec).chunk(9, dim=-1)

        audio_q, audio_k, audio_v = prepare_self_attn_qkv(
            audio, self.audio_norm1, self.audio_self_attn_qkv,
            self.audio_self_q_norm, self.audio_self_k_norm,
            audio_mod1_shift, audio_mod1_scale, self.num_heads
        )

        v_cond_q, v_cond_k, v_cond_v = prepare_self_attn_qkv(
            v_cond, self.v_cond_norm1, self.v_cond_attn_qkv,
            self.v_cond_attn_q_norm, self.v_cond_attn_k_norm,
            v_cond_mod1_shift, v_cond_mod1_scale, self.num_heads
        )

        # Apply RoPE if needed for audio and visual
        if freqs_cis is not None:
            if not self.interleaved_audio_visual_rope:
@@ -515,18 +515,18 @@ class TwoStreamCABlock(nn.Module):
            if v_freqs_cis is not None and not self.interleaved_audio_visual_rope:
                v_cond_qq, v_cond_kk = apply_rotary_emb(v_cond_q, v_cond_k, v_freqs_cis, head_first=False)
                v_cond_q, v_cond_k = v_cond_qq, v_cond_kk

        q = torch.cat((v_cond_q, audio_q), dim=1)
        k = torch.cat((v_cond_k, audio_k), dim=1)
        v = torch.cat((v_cond_v, audio_v), dim=1)

        # TODO: look further into here
        if attention.__name__ == "attention_pytorch":
            q, k, v = [t.transpose(1, 2) for t in (q, k, v)]

        attn = attention(q, k, v, heads = self.num_heads, mask=attn_mask, skip_reshape=True)
        v_cond_attn, audio_attn = torch.split(attn, [v_cond.shape[1], audio.shape[1]], dim=1)

        audio = audio + apply_gate(self.audio_self_proj(audio_attn), gate=audio_mod1_gate)
        v_cond = v_cond + apply_gate(self.v_cond_self_proj(v_cond_attn), gate=v_cond_mod1_gate)
        head_dim = self.hidden_size // self.num_heads
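Visual-conditioning tokens and audio tokens are concatenated along the sequence axis so a single attention call lets the two streams attend to each other, and the result is split back by the original lengths. A minimal sketch of that pattern with plain scaled-dot-product attention (illustrative, not the wrapped attention used here):

import torch
import torch.nn.functional as F

def joint_attention(q_a, k_a, v_a, q_b, k_b, v_b, len_a, len_b):
    # q/k/v are (batch, seq, heads, head_dim); concatenate the two token streams.
    q = torch.cat((q_a, q_b), dim=1).transpose(1, 2)
    k = torch.cat((k_a, k_b), dim=1).transpose(1, 2)
    v = torch.cat((v_a, v_b), dim=1).transpose(1, 2)
    out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
    return torch.split(out, [len_a, len_b], dim=1)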
@@ -544,12 +544,12 @@ class TwoStreamCABlock(nn.Module):
        text_k = self.text_cross_k_norm(text_k).to(text_v)

        text_len = text_k.shape[1]
        text_freqs_cos, text_freqs_sin = self.build_rope_for_text(text_len, head_dim,
                                                                  rope_dim_list=self.rope_dim_list)
        text_freqs_cis = (text_freqs_cos.to(text_k.device), text_freqs_sin.to(text_k.device))
        text_k = apply_rotary_emb(text_k, text_k, text_freqs_cis, head_first=False)[1]

        v_cond_audio_q = torch.cat([v_cond_q, audio_q], dim=1)

        if attention.__name__ == "attention_pytorch":
@@ -557,7 +557,7 @@ class TwoStreamCABlock(nn.Module):

        cross_attn = attention(v_cond_audio_q, text_k, text_v, self.num_heads, skip_reshape = True)
        v_cond_cross_attn, audio_cross_attn = torch.split(cross_attn, [v_cond.shape[1], audio.shape[1]], dim=1)

        audio = audio + apply_gate(self.audio_cross_proj(audio_cross_attn), gate=audio_mod2_gate)
        v_cond = v_cond + apply_gate(self.v_cond_cross_proj(v_cond_cross_attn), gate=v_cond_mod2_gate)
@@ -565,7 +565,7 @@ class TwoStreamCABlock(nn.Module):
        v_cond = apply_modulated_block(v_cond, self.v_cond_norm3, v_cond_mod3_shift, v_cond_mod3_scale, self.v_cond_mlp, v_cond_mod3_gate)

        return audio, cond, v_cond


    def prepare_modulated_query(self, x, norm_layer, q_layer, q_norm_layer, shift, scale, num_heads, rope_dim_list):

        x_mod = modulate(norm_layer(x), shift=shift, scale=scale)
@@ -577,9 +577,9 @@ class TwoStreamCABlock(nn.Module):
        head_dim = q.shape[-1]
        freqs_cos, freqs_sin = self.build_rope_for_text(q.shape[1], head_dim, rope_dim_list)
        freqs_cis = (freqs_cos.to(q.device), freqs_sin.to(q.device))

        q = apply_rotary_emb(q, q, freqs_cis, head_first=False)[0]

        return q


class SingleStreamBlock(nn.Module):
@@ -697,7 +697,7 @@ class HunyuanVideoFoley(nn.Module):
        self.patch_size = model_args.get("patch_size", 1)
        self.visual_in_channels = model_args.get("clip_dim", 768)
        self.audio_vae_latent_dim = model_args.get("audio_vae_latent_dim", 128)
        self.out_channels = self.audio_vae_latent_dim
        self.unpatchify_channels = self.out_channels

        self.num_heads = model_args.get("num_heads", 12)
@@ -873,7 +873,7 @@ class HunyuanVideoFoley(nn.Module):

        uncond_1 = uncond_1[:, :clip_feat.size(1), :clip_feat.size(2)]
        uncond_2 = uncond_2[:, :sync_feat.size(1), :sync_feat.size(2)]

        uncond_1, uncond_2, cond_neg, clip_feat, sync_feat, cond_pos = [unlock_cpu_tensor(t, device) for t in (uncond_1, uncond_2, cond_neg, clip_feat, sync_feat, cond_pos)]

        diff = cond_pos.shape[1] - cond_neg.shape[1]
@@ -885,6 +885,8 @@ class HunyuanVideoFoley(nn.Module):
            clip_feat, sync_feat, cond = torch.cat([uncond_1, clip_feat]), torch.cat([uncond_2, sync_feat]), torch.cat([cond_neg, cond_pos])
            clip_feat = clip_feat.view(2, -1, 768)

            self.conditions = (clip_feat, sync_feat, cond)

        else:
            clip_feat, sync_feat, cond = self.conditions
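Unconditional and conditional features are stacked along the batch axis so one forward pass produces both branches of classifier-free guidance. A minimal sketch of that batching-and-mixing pattern (illustrative; the model call signature and guidance weight are assumptions, not this model's API):

import torch

def cfg_forward(model, x, t, cond, uncond, guidance_scale=4.5):
    # One batched call: first half unconditional, second half conditional.
    cond_in = torch.cat([uncond, cond], dim=0)
    out_uncond, out_cond = model(x.repeat(2, 1, 1), t.repeat(2), cond_in).chunk(2, dim=0)
    return out_uncond + guidance_scale * (out_cond - out_uncond)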
@@ -944,7 +946,7 @@ class HunyuanVideoFoley(nn.Module):
            else:
                audio, cond, v_cond = block(*triple_block_args)

        x = audio
        if sync_vec is not None:
            vec = vec.unsqueeze(1).repeat(1, cond_seq_len + v_cond_seq_len, 1)
            vec = torch.cat((vec, sync_vec), dim=1)
@@ -160,7 +160,7 @@ class MotionFormer(nn.Module):
    def __init__(self, device = None, dtype = None, operations = None):
        super().__init__()
        self.APPROX_ATTN_TYPE = "none"
        self.APPROX_ATTN_DIM = 64
        self.img_size = 224
        self.patch_size = 16
        self.in_chans = 3
@@ -224,7 +224,7 @@ class MotionFormer(nn.Module):
        self.norm = norm_layer(self.embed_dim)

        self.pre_logits = nn.Identity()

        transf_enc_layer_kwargs = dict(
            d_model=self.embed_dim,
            nhead=self.num_heads,
@@ -273,7 +273,7 @@ class MotionFormer(nn.Module):
        )

        return x, tok_mask


    def forward(self, x):
        B, S, C, T, H, W = x.shape
@@ -322,7 +322,7 @@ class BaseEncoderLayer(TransformerEncoderComfyv):
        device = None,
        dtype = None, operations = None,
        *args, **kwargs
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(operations = operations, *args, **kwargs, **factory_kwargs)
@@ -382,7 +382,7 @@ class SpatialTransformerEncoderLayer(BaseEncoderLayer):
        x = rearrange(x, "(BS t) D -> BS t D", BS=BS, t=t)

        return x


class AST(torch.nn.Module):
    def __init__(
        self,
@@ -391,7 +391,7 @@ class AST(torch.nn.Module):
        max_segments: int = None,
        device = None, dtype = None, operations = None
    ) -> None:

        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        self.extract_features = True
@@ -518,7 +518,7 @@ class FrequencyTransformerEncoderLayer(BaseEncoderLayer):
        x = x.view(BS, t, D)

        return x


class ASTEmbeddings(nn.Module):

    def __init__(self, config: ASTConfig, device = None, dtype = None, operations = None) -> None:
@@ -789,7 +789,7 @@ class ASTModel(nn.Module):
            ),
            tok_mask,
        )


class ASTMLPHead(nn.Module):
    def __init__(self, config: ASTConfig, device, dtype, operations):
        super().__init__()
@@ -957,6 +957,7 @@ class Synchformer(nn.Module):
        )

    def forward(self, vis):
        vis = vis.to(next(self.parameters()).dtype)
        vis = vis.permute(0, 1, 3, 2, 4, 5) # (B, S, C, Tv, H, W)
        vis = self.vfeat_extractor(vis)
        return vis
@@ -221,10 +221,11 @@ class FoleyVae(torch.nn.Module):
    def encode(self, x):
        x = x.to(next(self.parameters()).device)
        return self.synchformer(x)

    def video_encoding(self, video, step):
        video = video.to(torch.uint8)
        video = torch.stack([self.syncformer_preprocess(t) for t in video])

        t, c, h, w = video.shape
        seg_len = 16
        t = video.size(0)
@@ -233,12 +234,13 @@ class FoleyVae(torch.nn.Module):
        video = video.contiguous()
        stride_t, stride_c, stride_h, stride_w = video.stride()

        # no copies
        data = video.as_strided(
            size=(nseg, seg_len, c, h, w),
            stride=(stride_t * step, stride_t, stride_c, stride_h, stride_w),
-       )
+       ).contiguous()
        data = data.unsqueeze(0) # b
        data = rearrange(data, "b s t c h w -> (b s) 1 t c h w")
        data = data.float()

        return data, nseg, lambda x: rearrange(x, "(b s) 1 t d -> b (s t) d", b=1)
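as_strided builds the (nseg, seg_len, ...) view of overlapping 16-frame windows without copying; the added .contiguous() then materializes it so the later rearrange is safe. A minimal sketch of the same windowing on a 1-D stand-in, with torch.Tensor.unfold shown as the equivalent, simpler spelling:

import torch

frames = torch.arange(10)  # stand-in for a (T, C, H, W) video with T = 10
seg_len, step = 4, 2
nseg = (frames.size(0) - seg_len) // step + 1

# Strided view: window i starts at i * step and reuses the underlying storage.
windows = frames.as_strided(size=(nseg, seg_len), stride=(frames.stride(0) * step, frames.stride(0)))

# Same result with unfold, which handles the stride bookkeeping for you.
assert torch.equal(windows, frames.unfold(0, seg_len, step))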
@@ -1121,7 +1121,7 @@ class MultiheadAttentionComfyv(nn.Module):
        self.batch_first = batch_first
        self.head_dim = embed_dim // num_heads
        self.embed_dim = embed_dim

    # overwriting state dict loading to convert in_proj_weight/bias -> self._q_proj/_k_proj/_v_proj
    def _load_from_state_dict(
        self,
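The comment above refers to remapping PyTorch's fused in_proj_weight/in_proj_bias into three separate projection layers. A minimal sketch of that split, assuming the standard nn.MultiheadAttention layout where the q/k/v weights are stacked along dim 0 (an illustrative helper, not the method below):

import torch
import torch.nn as nn

def split_in_proj(in_proj_weight, in_proj_bias, embed_dim):
    # nn.MultiheadAttention stores q/k/v stacked as a (3 * embed_dim, embed_dim) matrix.
    w_q, w_k, w_v = in_proj_weight.split(embed_dim, dim=0)
    b_q, b_k, b_v = in_proj_bias.split(embed_dim, dim=0)
    projs = []
    for w, b in ((w_q, b_q), (w_k, b_k), (w_v, b_v)):
        lin = nn.Linear(embed_dim, embed_dim)
        lin.weight = nn.Parameter(w.clone())
        lin.bias = nn.Parameter(b.clone())
        projs.append(lin)
    return projs  # q_proj, k_proj, v_proj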
@@ -1164,26 +1164,17 @@ class MultiheadAttentionComfyv(nn.Module):
            error_msgs,
        )

-   def forward(self, src, k = None, v = None, attn_mask = None, key_padding_mask = None):
+   def forward(self, src, k=None, v=None, attn_mask=None, key_padding_mask=None):
-       self._q_proj, self._k_proj, self._v_proj = [
-           t.to(src.device).to(src.dtype)
-           for t in (self._q_proj, self._k_proj, self._v_proj)
-       ]
+       self._q_proj, self._k_proj, self._v_proj = [t.to(src.device).to(src.dtype) for t in (self._q_proj, self._k_proj, self._v_proj)]
        q = self._q_proj(src)
-       if k is None:
-           k = self._k_proj(src)
-       if v is None:
-           v = self._v_proj(src)
-       k, v = k.to(src.device).to(src.dtype), v.to(src.device).to(src.dtype)
+       k = self._k_proj(src if k is None else k.to(src.device).to(src.dtype))
+       v = self._v_proj(src if v is None else v.to(src.device).to(src.dtype))

        if k is v:
            if q is k:
                q = k = v = q.transpose(1, 0)
            else:
                q, k = (x.transpose(1, 0) for x in (q, k))
                v = k
        else:
            q, k, v = (x.transpose(1, 0) for x in (q, k, v))

-       output = optimized_attention(q, k, v, self.num_heads, mask = attn_mask)
+       output = optimized_attention(q, k, v, self.num_heads, mask=attn_mask)
        return self.out_proj(output)

# comfyui implementation of nn.TransformerEncoderLayer
@@ -1413,7 +1413,7 @@ class ACEStep(BaseModel):
        out['speaker_embeds'] = comfy.conds.CONDRegular(torch.zeros(noise.shape[0], 512, device=noise.device, dtype=noise.dtype))
        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
        return out


class HunyuanFoley(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None, unet_model=comfy.ldm.hunyuan_foley.model.HunyuanVideoFoley):
        super().__init__(model_config, model_type, device, unet_model)
@@ -420,7 +420,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1]

        return dit_config

    if '{}triple_blocks.17.audio_cross_q.weight'.format(key_prefix) in state_dict_keys: # Hunyuan Foley
        dit_config = {}
        dit_config["image_model"] = "hunyuan_foley"
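detect_unet_config picks an architecture by probing for tensor names that only one model family produces, here a key unique to the Foley triple blocks. A minimal sketch of that pattern (the second marker key and the returned configs are hypothetical, for illustration only):

def detect_model_family(state_dict, key_prefix=""):
    # Probe for keys that only one architecture produces (hypothetical markers).
    keys = set(state_dict.keys())
    if "{}triple_blocks.17.audio_cross_q.weight".format(key_prefix) in keys:
        return {"image_model": "hunyuan_foley"}
    if "{}double_blocks.0.img_attn.qkv.weight".format(key_prefix) in keys:  # hypothetical marker
        return {"image_model": "some_other_dit"}
    return None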
@@ -1303,7 +1303,7 @@ class Omnigen2(supported_models_base.BASE):
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))


class HunyuanFoley(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan_foley",
|
||||
return model_base.HunyuanFoley(self, device=device)
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.clap_model.ClapLargeTokenizer, comfy.text_encoders.clap_model.ClapTextEncoderModel)
|
||||
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
|
||||
state_dict["logit_scale"] = torch.tensor(1.0)
|
||||
|
||||
@@ -92,7 +92,7 @@ class HunyuanFoleyConditioning(io.ComfyNode):

    @classmethod
    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative):

        text_encoding_positive = text_encoding_positive[0][0]
        text_encoding_negative = text_encoding_negative[0][0]
        all_ = (siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative)
@@ -108,7 +108,7 @@ class HunyuanFoleyConditioning(io.ComfyNode):
            # temporary repeat values on the cpu
            factor_pos, remainder = divmod(max_value, input.shape[dim])

            positions = [1] * input.ndim
            positions[dim] = factor_pos
            input = input.cpu().repeat(*positions)
@@ -120,7 +120,7 @@ class HunyuanFoleyConditioning(io.ComfyNode):
                input = torch.cat([input, pad], dim = dim)

            return input

        siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative = [repeat_shapes(max_l, t) for t in all_]
        siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative = [repeat_shapes(max_d, t, dim = 2) for t in
                                                                                                     (siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative)]
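The repeat-and-pad helper grows a tensor along one dimension to a target length: divmod splits the target into whole-copy repeats plus a partial pad. A minimal sketch of that idea (an illustrative helper, not the node's exact code):

import torch

def repeat_to_length(x, target, dim=1):
    # Whole repeats first, then pad with a leading slice of x to cover the remainder.
    full, remainder = divmod(target, x.shape[dim])
    reps = [1] * x.ndim
    reps[dim] = full
    out = x.repeat(*reps)
    if remainder:
        pad = x.narrow(dim, 0, remainder)
        out = torch.cat([out, pad], dim=dim)
    return out

# e.g. repeat_to_length(torch.ones(1, 3, 8), target=10, dim=1).shape == (1, 10, 8)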
@@ -48,7 +48,7 @@ class EncodeVideo(io.ComfyNode):
                io.Conditioning.Output(display_name="encoded_video"),
            ],
        )

    @classmethod
    def execute(cls, video, processing_batch_size, step_size, vae = None, clip_vision = None):
@@ -94,13 +94,15 @@ class EncodeVideo(io.ComfyNode):
            chunk = chunk.to(model_dtype)
            if hasattr(vae, "encode"):
                try:
                    if chunk.ndim > 5:
                        raise ValueError("chunk.ndim > 5")
                    chunk = chunk.movedim(1, -1)
                    out = vae.encode(chunk)
-               except:
+               except Exception:
                    out = model.encode(chunk)
            else:
                chunk = chunk.movedim(1, -1)
-               out = vae.encode_image(chunk, crop=False, resize_mode="bilinear")
+               out = vae.encode_image(chunk.to(torch.uint8), crop=False, resize_mode="bilinear")
                out = out["image_embeds"]

            out_cpu = out.cpu()
@@ -133,14 +135,14 @@ class ResampleVideo(io.ComfyNode):
        )

    @classmethod
    def execute(cls, video, target_fps: int):
        # doesn't support upsampling
        with av.open(video.get_stream_source(), mode="r") as container:
            stream = container.streams.video[0]
            frames = []

            src_rate = stream.average_rate or stream.guessed_rate
            src_fps = float(src_rate) if src_rate else None

            if src_fps is None:
                logging.warning("src_fps for video resampling is None.")
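Downsampling to target_fps amounts to keeping only the frames whose timestamps cross each 1/target_fps boundary. A minimal sketch of that selection loop with PyAV (illustrative; error handling and the node's output format are omitted, and the path-based entry point is an assumption):

import av

def resample_frames(path, target_fps):
    # Keep a frame whenever its timestamp enters a new 1/target_fps slot.
    frames = []
    with av.open(path, mode="r") as container:
        stream = container.streams.video[0]
        next_t = 0.0
        for frame in container.decode(stream):
            if frame.pts is None:
                continue
            t = float(frame.pts * frame.time_base)
            if t >= next_t:
                frames.append(frame.to_ndarray(format="rgb24"))
                next_t += 1.0 / target_fps
    return frames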