Support Wan-Dancer (#13813)
* initial WanDancer support
* nodes_wandancer: Add list form of chunker. Create an alternate list form of the node so the chunk generations can be trivially looped by the comfy executor.
* Closer match to original soxr resampling
* Remove librosa node
* Cleanup

Co-authored-by: Rattus <rattus128@gmail.com>
commit 3200f28e3a
parent a4b7e3beed
comfy/ldm/wan/model.py
@@ -1135,7 +1135,7 @@ class AudioInjector_WAN(nn.Module):
         self.injector_adain_output_layers = nn.ModuleList(
             [operations.Linear(dim, dim, dtype=dtype, device=device) for _ in range(audio_injector_id)])
 
-    def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len):
+    def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len, scale=1.0):
         audio_attn_id = self.injected_block_id.get(block_id, None)
         if audio_attn_id is None:
             return x
@@ -1148,12 +1148,15 @@ class AudioInjector_WAN(nn.Module):
             attn_hidden_states = adain_hidden_states
         else:
             attn_hidden_states = self.injector_pre_norm_feat[audio_attn_id](input_hidden_states)
-        audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
-        attn_audio_emb = audio_emb
+
+        if audio_emb.dim() == 3:  # WanDancer case
+            attn_audio_emb = rearrange(audio_emb, "b t c -> (b t) 1 c", t=num_frames)
+        else:  # S2V case
+            attn_audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
+
         residual_out = self.injector[audio_attn_id](x=attn_hidden_states, context=attn_audio_emb)
-        residual_out = rearrange(
-            residual_out, "(b t) n c -> b (t n) c", t=num_frames)
-        x[:, :seq_len] = x[:, :seq_len] + residual_out
+        residual_out = rearrange(residual_out, "(b t) n c -> b (t n) c", t=num_frames)
+        x[:, :seq_len] = x[:, :seq_len] + residual_out * scale
         return x
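The behavior change here is twofold: WanDancer passes a 3-dim audio embedding (one vector per frame), which gets a singleton token axis, and the injected residual is now blended by a caller-supplied scale instead of being added at full strength. A standalone shape check with dummy tensors (plain torch + einops with toy sizes, not ComfyUI's actual modules):

    import torch
    from einops import rearrange

    b, t, n, c = 1, 4, 5, 64             # batch, latent frames, tokens per frame, channels
    seq_len = t * n
    x = torch.zeros(b, seq_len, c)

    music_emb = torch.randn(b, t, c)     # WanDancer: one embedding per frame (dim() == 3)
    s2v_emb = torch.randn(b, t, 7, c)    # S2V: several audio tokens per frame (dim() == 4)

    attn_music = rearrange(music_emb, "b t c -> (b t) 1 c", t=t)  # [4, 1, 64]
    attn_s2v = rearrange(s2v_emb, "b t n c -> (b t) n c", t=t)    # [4, 7, 64]

    # after cross-attention, the per-frame residual folds back into the
    # sequence layout and is blended in with the new scale knob
    residual_out = torch.randn(b * t, n, c)
    residual_out = rearrange(residual_out, "(b t) n c -> b (t n) c", t=t)  # [1, 20, 64]
    x[:, :seq_len] = x[:, :seq_len] + residual_out * 0.5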
comfy/ldm/wan/model_wandancer.py (new file, 251 lines)
@@ -0,0 +1,251 @@
import torch
import torch.nn as nn
import comfy
from comfy.ldm.modules.attention import optimized_attention
from comfy.ldm.flux.math import apply_rope1
from comfy.ldm.flux.layers import EmbedND

from .model import AudioInjector_WAN, WanModel, MLPProj, Head, sinusoidal_embedding_1d


class MusicSelfAttention(nn.Module):
    def __init__(self, dim, num_heads, device=None, dtype=None, operations=None):
        assert dim % num_heads == 0
        super().__init__()
        self.embed_dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.q_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
        self.k_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
        self.v_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
        self.out_proj = operations.Linear(dim, dim, device=device, dtype=dtype)

    def forward(self, x, freqs):
        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim

        q = self.q_proj(x).view(b, s, n, d)
        q = apply_rope1(q, freqs)

        k = self.k_proj(x).view(b, s, n, d)
        k = apply_rope1(k, freqs)

        x = optimized_attention(
            q.view(b, s, n * d),
            k.view(b, s, n * d),
            self.v_proj(x).view(b, s, n * d),
            heads=self.num_heads,
        )

        return self.out_proj(x)


class MusicEncoderLayer(nn.Module):
    def __init__(self, dim: int, num_heads: int, ffn_dim: int, device=None, dtype=None, operations=None):
        super().__init__()
        self.self_attn = MusicSelfAttention(dim, num_heads, device=device, dtype=dtype, operations=operations)

        self.linear1 = operations.Linear(dim, ffn_dim, device=device, dtype=dtype)
        self.linear2 = operations.Linear(ffn_dim, dim, device=device, dtype=dtype)

        self.norm1 = operations.LayerNorm(dim, device=device, dtype=dtype)
        self.norm2 = operations.LayerNorm(dim, device=device, dtype=dtype)

    def forward(self, x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
        x = x + self.self_attn(self.norm1(x), freqs=freqs)
        x = x + self.linear2(torch.nn.functional.gelu(self.linear1(self.norm2(x))))  # ffn
        return x


class WanDancerModel(WanModel):
    def __init__(self,
                 model_type='wandancer',
                 patch_size=(1, 2, 2),
                 text_len=512,
                 in_dim=16,
                 dim=5120,
                 ffn_dim=8192,
                 freq_dim=256,
                 text_dim=4096,
                 out_dim=16,
                 num_heads=16,
                 num_layers=40,
                 window_size=(-1, -1),
                 qk_norm=True,
                 cross_attn_norm=True,
                 eps=1e-6,
                 in_dim_ref_conv=None,
                 image_model=None,
                 device=None, dtype=None, operations=None,
                 audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27],
                 music_dim=256,
                 music_heads=4,
                 music_feature_dim=35,
                 music_latent_dim=256
                 ):

        super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim,
                         num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, image_model=image_model, in_dim_ref_conv=in_dim_ref_conv,
                         device=device, dtype=dtype, operations=operations)

        self.dtype = dtype
        operation_settings = {"operations": operations, "device": device, "dtype": dtype}

        self.patch_embedding_global = operations.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size, device=operation_settings.get("device"), dtype=torch.float32)
        self.img_emb_refimage = MLPProj(1280, dim, operation_settings=operation_settings)
        self.head_global = Head(dim, out_dim, patch_size, eps, operation_settings=operation_settings)

        self.music_injector = AudioInjector_WAN(
            dim=self.dim,
            num_heads=self.num_heads,
            inject_layer=audio_inject_layers,
            root_net=self,
            enable_adain=False,
            dtype=dtype, device=device, operations=operations
        )

        self.music_projection = operations.Linear(music_feature_dim, music_latent_dim, device=device, dtype=dtype)
        self.music_encoder = nn.ModuleList([MusicEncoderLayer(dim=music_dim, num_heads=music_heads, ffn_dim=1024, device=device, dtype=dtype, operations=operations) for _ in range(2)])
        music_head_dim = music_dim // music_heads
        self.music_rope_embedder = EmbedND(dim=music_head_dim, theta=10000.0, axes_dim=[music_head_dim])
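MusicEncoderLayer above is a standard pre-norm transformer block: a residual around RoPE'd self-attention, then a residual around a GELU MLP. A pure-PyTorch stand-in that shows the same structure (hypothetical names, a fused nn.MultiheadAttention in place of comfy's split q/k/v projections, and RoPE omitted):

    import torch
    import torch.nn as nn

    class TinyPreNormLayer(nn.Module):
        def __init__(self, dim=256, heads=4, ffn_dim=1024):
            super().__init__()
            self.norm1 = nn.LayerNorm(dim)
            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
            self.norm2 = nn.LayerNorm(dim)
            self.ffn = nn.Sequential(nn.Linear(dim, ffn_dim), nn.GELU(), nn.Linear(ffn_dim, dim))

        def forward(self, x):
            h = self.norm1(x)
            x = x + self.attn(h, h, h)[0]    # residual self-attention
            x = x + self.ffn(self.norm2(x))  # residual feed-forward
            return x

    feats = torch.randn(1, 120, 256)        # 120 music frames after music_projection
    print(TinyPreNormLayer()(feats).shape)  # torch.Size([1, 120, 256])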
    def forward_orig(self, x, t, context, clip_fea=None, clip_fea_ref=None, freqs=None, audio_embed=None, fps=30, audio_inject_scale=1.0, transformer_options={}, **kwargs):
        # embeddings
        if int(fps + 0.5) != 30:
            x = self.patch_embedding_global(x.float()).to(x.dtype)
        else:
            x = self.patch_embedding(x.float()).to(x.dtype)

        grid_sizes = x.shape[2:]
        latent_frames = grid_sizes[0]
        transformer_options["grid_sizes"] = grid_sizes
        x = x.flatten(2).transpose(1, 2)
        seq_len = x.size(1)

        # time embeddings
        e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype))
        e = e.reshape(t.shape[0], -1, e.shape[-1])
        e0 = self.time_projection(e).unflatten(2, (6, self.dim))

        full_ref = None
        if self.ref_conv is not None:  # model has the weight, but this wasn't used in the original pipeline
            full_ref = kwargs.get("reference_latent", None)
            if full_ref is not None:
                full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
                x = torch.concat((full_ref, x), dim=1)

        # context
        context = self.text_embedding(context)

        audio_emb = None
        if audio_embed is not None:  # encode music feature, [1, frame_num, 35] -> [1, F*8, dim]
            music_feature = self.music_projection(audio_embed)

            music_seq_len = music_feature.shape[1]
            music_ids = torch.arange(music_seq_len, device=music_feature.device, dtype=music_feature.dtype).reshape(1, -1, 1)  # create 1D position IDs
            music_freqs = self.music_rope_embedder(music_ids).movedim(1, 2)

            # apply encoder layers
            for layer in self.music_encoder:
                music_feature = layer(music_feature, music_freqs)

            # interpolate
            audio_emb = torch.nn.functional.interpolate(music_feature.unsqueeze(1), size=(latent_frames * 8, self.dim), mode='bilinear').squeeze(1)

        context_img_len = 0
        if self.img_emb is not None and clip_fea is not None:
            context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
            context = torch.cat([context_clip, context], dim=1)
            context_img_len += clip_fea.shape[-2]
        if self.img_emb_refimage is not None and clip_fea_ref is not None:
            context_clip_ref = self.img_emb_refimage(clip_fea_ref)
            context = torch.cat([context_clip_ref, context], dim=1)
            context_img_len += clip_fea_ref.shape[-2]

        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
        transformer_options["total_blocks"] = len(self.blocks)
        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.blocks):
            transformer_options["block_index"] = i
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
                    return out
                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
                x = out["img"]
            else:
                x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
            if audio_emb is not None:
                x = self.music_injector(x, i, audio_emb, audio_emb_global=None, seq_len=seq_len, scale=audio_inject_scale)

        # head
        if int(fps + 0.5) != 30:
            x = self.head_global(x, e)
        else:
            x = self.head(x, e)

        if full_ref is not None:
            x = x[:, full_ref.shape[1]:]

        # unpatchify
        x = self.unpatchify(x, grid_sizes)
        return x

    def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, clip_fea_ref=None, fps=30, audio_inject_scale=1.0, **kwargs):
        bs, c, t, h, w = x.shape
        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)

        t_len = t
        if time_dim_concat is not None:
            time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
            x = torch.cat([x, time_dim_concat], dim=2)
            t_len = x.shape[2]

        freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, fps=fps, transformer_options=transformer_options)
        return self.forward_orig(x, timestep, context, clip_fea=clip_fea, clip_fea_ref=clip_fea_ref, freqs=freqs, fps=fps, audio_inject_scale=audio_inject_scale, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]
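One subtlety in forward_orig's music path: the single F.interpolate call resizes the encoded music along both axes at once, stretching time from the raw music frame count to latent_frames * 8 and widening the 256 encoder channels to the transformer width dim, by treating [B, T, C] as a one-channel image. A toy reproduction (dim shrunk from 5120 to 64 to keep it cheap):

    import torch
    import torch.nn.functional as F

    music_feature = torch.randn(1, 120, 256)  # [B, music_frames, music_latent_dim]
    latent_frames, dim = 21, 64               # toy stand-ins for the real model sizes

    audio_emb = F.interpolate(
        music_feature.unsqueeze(1),           # [1, 1, 120, 256], a one-channel "image"
        size=(latent_frames * 8, dim),
        mode='bilinear',
    ).squeeze(1)
    print(audio_emb.shape)                    # torch.Size([1, 168, 64])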
    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, fps=30, device=None, dtype=None, transformer_options={}):
        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])

        if steps_t is None:
            steps_t = t_len
        if steps_h is None:
            steps_h = h_len
        if steps_w is None:
            steps_w = w_len

        h_start = 0
        w_start = 0
        rope_options = transformer_options.get("rope_options", None)
        if rope_options is not None:
            t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0

            t_start += rope_options.get("shift_t", 0.0)
            h_start += rope_options.get("shift_y", 0.0)
            w_start += rope_options.get("shift_x", 0.0)

        img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)

        if int(fps + 0.5) != 30:
            time_scale = 30.0 / fps  # how many time units each frame represents relative to 30fps
            positions_new = torch.arange(steps_t, device=device, dtype=dtype) * time_scale + t_start
            total_frames_at_30fps = int(time_scale * steps_t + 0.5)
            positions_new[-1] = t_start + (total_frames_at_30fps - 1)

            img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + positions_new.reshape(-1, 1, 1)
        else:
            img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)

        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
        img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])

        freqs = self.rope_embedder(img_ids).movedim(1, 2)
        return freqs
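For non-30fps inputs, rope_encode spaces the temporal positions by 30/fps so that motion stays calibrated to the 30fps grid the base model expects, and pins the final frame onto the equivalent integer 30fps position. The arithmetic in isolation (dummy fps and frame count, no model needed):

    import torch

    fps, steps_t, t_start = 16.0, 8, 0
    time_scale = 30.0 / fps                                  # 1.875 time units per frame
    positions = torch.arange(steps_t, dtype=torch.float32) * time_scale + t_start
    total_frames_at_30fps = int(time_scale * steps_t + 0.5)  # 15
    positions[-1] = t_start + (total_frames_at_30fps - 1)    # last frame pinned to 14.0
    print(positions)  # tensor([ 0.0000,  1.8750,  3.7500, ..., 11.2500, 14.0000])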
comfy/model_base.py
@@ -43,6 +43,7 @@ import comfy.ldm.lumina.model
 import comfy.ldm.wan.model
 import comfy.ldm.wan.model_animate
 import comfy.ldm.wan.ar_model
+import comfy.ldm.wan.model_wandancer
 import comfy.ldm.hunyuan3d.model
 import comfy.ldm.hidream.model
 import comfy.ldm.chroma.model
@@ -1599,6 +1600,30 @@ class WAN21_SCAIL(WAN21):
 
         return out
 
+class WAN22_WanDancer(WAN21):
+    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=True, device=None):
+        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model_wandancer.WanDancerModel)
+        self.image_to_video = image_to_video
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        audio_embed = kwargs.get("audio_embed", None)
+        if audio_embed is not None:
+            out['audio_embed'] = comfy.conds.CONDRegular(audio_embed)
+
+        clip_vision_output_ref = kwargs.get("clip_vision_output_ref", None)
+        if clip_vision_output_ref is not None:
+            out['clip_fea_ref'] = comfy.conds.CONDRegular(clip_vision_output_ref.penultimate_hidden_states)
+
+        fps = kwargs.get("fps", None)
+        if fps is not None:
+            out['fps'] = comfy.conds.CONDRegular(torch.FloatTensor([fps]))
+
+        audio_inject_scale = kwargs.get("audio_inject_scale", None)
+        if audio_inject_scale is not None:
+            out['audio_inject_scale'] = comfy.conds.CONDRegular(torch.FloatTensor([audio_inject_scale]))
+        return out
+
 class Hunyuan3Dv2(BaseModel):
     def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
         super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
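extra_conds wraps the scalar fps and audio_inject_scale as one-element tensors so they batch and broadcast like any other conditioning value; note that forward_orig's int(fps + 0.5) comparison works unchanged on such tensors. A torch-only check:

    import torch

    fps = torch.FloatTensor([25.0])  # how extra_conds packages the fps kwarg
    print(int(fps + 0.5) != 30)      # True -> the patch_embedding_global / head_global path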
comfy/model_detection.py
@@ -572,6 +572,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
             dit_config["model_type"] = "animate"
         elif '{}patch_embedding_pose.weight'.format(key_prefix) in state_dict_keys:
             dit_config["model_type"] = "scail"
+        elif '{}patch_embedding_global.weight'.format(key_prefix) in state_dict_keys:
+            dit_config["model_type"] = "wandancer"
         else:
             if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
                 dit_config["model_type"] = "i2v"
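Detection is purely key-based: a WanDancer checkpoint is recognized by its extra patch_embedding_global weight before falling through to the generic i2v checks. A toy illustration (the prefix shown is the usual one for full checkpoints; the key set here is made up):

    key_prefix = "model.diffusion_model."
    state_dict_keys = {key_prefix + "patch_embedding_global.weight"}  # dummy state dict

    is_wandancer = '{}patch_embedding_global.weight'.format(key_prefix) in state_dict_keys
    print(is_wandancer)  # True -> dit_config["model_type"] = "wandancer"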
comfy/supported_models.py
@@ -1313,6 +1313,37 @@ class WAN21_SCAIL(WAN21_T2V):
         out = model_base.WAN21_SCAIL(self, image_to_video=False, device=device)
         return out
 
+class WAN22_WanDancer(WAN21_T2V):
+    unet_config = {
+        "image_model": "wan2.1",
+        "model_type": "wandancer",
+        "in_dim": 36,
+    }
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+        self.memory_usage_factor = 1.8
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.WAN22_WanDancer(self, image_to_video=True, device=device)
+        return out
+
+    def process_unet_state_dict(self, state_dict):
+        out_sd = {}
+        for k in list(state_dict.keys()):
+            # split music_encoder in_proj into q_proj, k_proj, v_proj
+            if "music_encoder" in k and "self_attn.in_proj" in k:
+                suffix = "weight" if k.endswith("weight") else "bias"
+                tensor = state_dict[k]
+                d = tensor.shape[0] // 3
+                prefix = k.replace(f"in_proj_{suffix}", "")
+                out_sd[f"{prefix}q_proj.{suffix}"] = tensor[:d]
+                out_sd[f"{prefix}k_proj.{suffix}"] = tensor[d:2*d]
+                out_sd[f"{prefix}v_proj.{suffix}"] = tensor[2*d:]
+            else:
+                out_sd[k] = state_dict[k]
+        return out_sd
+
 class Hunyuan3Dv2(supported_models_base.BASE):
     unet_config = {
         "image_model": "hunyuan3d2",
@@ -1982,6 +2013,7 @@ models = [
     WAN22_Animate,
     WAN21_FlowRVS,
     WAN21_SCAIL,
+    WAN22_WanDancer,
     Hunyuan3Dv2mini,
     Hunyuan3Dv2,
     Hunyuan3Dv2_1,
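process_unet_state_dict exists because the original checkpoint stores the music encoder's attention as a fused in_proj parameter (the layout nn.MultiheadAttention uses), while the ComfyUI module defines separate q/k/v projections. The split is just three equal slices along dim 0, equivalent to torch.chunk; a quick check on a dummy tensor:

    import torch

    in_proj_weight = torch.randn(3 * 256, 256)  # fused [3*dim, dim] q/k/v weight
    d = in_proj_weight.shape[0] // 3
    q_w, k_w, v_w = in_proj_weight[:d], in_proj_weight[d:2*d], in_proj_weight[2*d:]

    for a, b in zip((q_w, k_w, v_w), torch.chunk(in_proj_weight, 3, dim=0)):
        assert torch.equal(a, b)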
comfy_extras/nodes_wandancer.py (new file, 1002 lines)
(File diff suppressed because it is too large.)