Pad the HuMo audio embed inside the model instead of in the node, for more flexibility. (#9935)

2026-03-16 22:58:19 +08:00 · 2025-09-18 16:54:16 -07:00 · 2025-09-18 16:54:16 -07:00 · 24b0fce099
commit 24b0fce099
parent 1ea8c54064
2 changed files with 3 additions and 4 deletions
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1551,6 +1551,9 @@ class HumoWanModel(WanModel):
        context_img_len = None

        if audio_embed is not None:
+            if reference_latent is not None:
+                zero_audio_pad = torch.zeros(audio_embed.shape[0], reference_latent.shape[-3], *audio_embed.shape[2:], device=audio_embed.device, dtype=audio_embed.dtype)
+                audio_embed = torch.cat([audio_embed, zero_audio_pad], dim=1)
            audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2)
        else:
            audio = None
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -1095,10 +1095,6 @@ class WanHuMoImageToVideo(io.ComfyNode):
            audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0]  # [T, 5, 1280]
            audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0)

-            # pad for ref latent
-            zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype)
-            audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0)
-
            audio_emb = audio_emb.unsqueeze(0)
            audio_emb_neg = torch.zeros_like(audio_emb)
            positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb})