Mirror of https://github.com/comfyanonymous/ComfyUI.git, synced 2026-02-10 13:32:36 +08:00.

Commit: 4653b9008d ("final changes")
Parent: 4908e7412e
@@ -246,7 +246,7 @@ class CLIPVision(torch.nn.Module):
            x = self.post_layernorm(x)
            if self.use_head:
                pooled_output = self.head(x)
            else:
                pooled_output = x
        else:
            pooled_output = self.post_layernorm(x[:, 0, :])
@@ -19,6 +19,8 @@ class Output:

def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True, resize_mode="bicubic"):
    image = image[:, :, :, :3] if image.shape[3] > 3 else image
    if image.dtype == torch.uint8:
        image = image.float() / 255.0
    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)
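The added uint8 branch lets 8-bit frames flow straight into the CLIP-style preprocessing. A minimal sketch of that normalization path, assuming an NHWC batch and the usual (x - mean) / std finish (the tail of the function is not shown in this hunk, so the last line is an assumption):

import torch

def normalize_nhwc(image, mean, std):
    # Drop any alpha channel, scale 8-bit input to [0, 1], then standardize.
    image = image[:, :, :, :3] if image.shape[3] > 3 else image
    if image.dtype == torch.uint8:
        image = image.float() / 255.0
    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)  # NHWC -> NCHW
    return (image - mean.view(1, -1, 1, 1)) / std.view(1, -1, 1, 1)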
@@ -55,7 +55,7 @@ class TimestepEmbedder(TimestepEmbedderParent):
    def forward(self, t):
        t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class SwiGLU(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, device, dtype, operations):
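timestep_embedding here is the usual sinusoidal embedding fed into a small MLP. A self-contained sketch of that idea (illustrative, not the exact helper used above):

import math
import torch

def sinusoidal_timestep_embedding(t, dim, max_period=10000.0):
    # Half the channels get cos, half get sin, at geometrically spaced frequencies.
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32, device=t.device) / half)
    args = t[:, None].float() * freqs[None]
    emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2 == 1:
        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
    return emb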
@@ -150,9 +150,9 @@ class ChannelLastConv1d(nn.Module):
            self.register_parameter("bias", underlying.bias)
        else:
            self.register_parameter("bias", None)

        object.__setattr__(self, "_underlying", underlying)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        self._underlying = self._underlying.to(x.dtype)
        x = self._underlying(x.permute(0, 2, 1))
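ChannelLastConv1d exposes the underlying Conv1d's parameters on the wrapper while accepting (B, T, C) input; the permute handles the layout change. A minimal sketch of that wrapping idea (an illustrative class, not the module above):

import torch
import torch.nn as nn

class ChannelsLastConv1d(nn.Module):
    """Accepts (B, T, C) input, runs nn.Conv1d over (B, C, T), returns (B, T, C)."""
    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, **kwargs)

    def forward(self, x):
        return self.conv(x.permute(0, 2, 1)).permute(0, 2, 1)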
@@ -204,7 +204,7 @@ class ModulateDiT(nn.Module):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(self.act(x))


class FinalLayer1D(nn.Module):
    def __init__(self, hidden_size, patch_size, out_channels, device=None, dtype=None, operations = None):
        factory_kwargs = {"device": device, "dtype": dtype}
@@ -223,7 +223,7 @@ class FinalLayer1D(nn.Module):
        self.linear = self.linear.to(x.dtype)
        x = self.linear(x)
        return x


class MLP(nn.Module):
    def __init__(
        self,
@@ -254,7 +254,7 @@ class MLP(nn.Module):
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        return self.drop2(self.fc2(self.norm(self.drop1(self.act(self.fc1(x))))))


def _to_tuple(x, dim=2):
@@ -297,7 +297,7 @@ def get_meshgrid_nd(start, *args, dim=2):
def get_nd_rotary_pos_embed(
    rope_dim_list, start, *args, theta=10000.0, use_real=False, theta_rescale_factor=1.0, freq_scaling=1.0
):

    grid = get_meshgrid_nd(start, *args, dim=len(rope_dim_list))

    embs = []
@@ -411,14 +411,14 @@ class TwoStreamCABlock(nn.Module):

        self.max_text_len = 100
        self.rope_dim_list = None

        self.audio_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
        self.v_cond_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)

        self.audio_cross_q = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.v_cond_cross_q = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.text_cross_kv = operations.Linear(hidden_size, hidden_size * 2, bias=qkv_bias, **factory_kwargs)

        self.audio_cross_proj = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
        self.v_cond_cross_proj = operations.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
@@ -435,11 +435,11 @@ class TwoStreamCABlock(nn.Module):
    def build_rope_for_text(self, text_len, head_dim, rope_dim_list=None):
        target_ndim = 1  # n-d RoPE
        rope_sizes = [text_len]

        if rope_dim_list is None:
            rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
        assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"

        text_freqs_cos, text_freqs_sin = get_nd_rotary_pos_embed(
            rope_dim_list=rope_dim_list,
            start=rope_sizes,
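For the 1-D case used here, the rotary tables are just cos/sin of position times an inverse frequency. A minimal sketch of that construction (a generic RoPE table in "real" cos/sin form, not the exact get_nd_rotary_pos_embed helper):

import torch

def rope_1d(seq_len, head_dim, theta=10000.0):
    # Pairs of channels share one rotation frequency, so we need head_dim // 2 of them.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    pos = torch.arange(seq_len).float()
    angles = torch.outer(pos, inv_freq)          # (seq_len, head_dim // 2)
    return torch.cos(angles), torch.sin(angles)  # cos table, sin table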
@@ -461,7 +461,7 @@ class TwoStreamCABlock(nn.Module):
        sync_vec: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

        (audio_mod1_shift, audio_mod1_scale, audio_mod1_gate,
         audio_mod2_shift, audio_mod2_scale, audio_mod2_gate,
         audio_mod3_shift, audio_mod3_scale, audio_mod3_gate,
        ) = self.audio_mod(sync_vec if sync_vec is not None else vec).chunk(9, dim=-1)
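The nine chunks are three (shift, scale, gate) triplets, one per sub-block, in the usual adaLN-Zero style. A minimal sketch of how such a triplet is applied, assuming the common modulate/apply_gate convention (illustrative helpers, not the exact ones in this file):

import torch

def modulate(x, shift, scale):
    # Per-channel scale and shift, broadcast over the sequence dimension.
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

def apply_gate(x, gate):
    # Gate the residual branch before it is added back to the stream.
    return x * gate.unsqueeze(1)

# usage sketch: x = x + apply_gate(block(modulate(norm(x), shift, scale)), gate)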
@@ -477,19 +477,19 @@ class TwoStreamCABlock(nn.Module):
            v_cond_mod3_scale,
            v_cond_mod3_gate,
        ) = self.v_cond_mod(vec).chunk(9, dim=-1)

        audio_q, audio_k, audio_v = prepare_self_attn_qkv(
            audio, self.audio_norm1, self.audio_self_attn_qkv,
            self.audio_self_q_norm, self.audio_self_k_norm,
            audio_mod1_shift, audio_mod1_scale, self.num_heads
        )

        v_cond_q, v_cond_k, v_cond_v = prepare_self_attn_qkv(
            v_cond, self.v_cond_norm1, self.v_cond_attn_qkv,
            self.v_cond_attn_q_norm, self.v_cond_attn_k_norm,
            v_cond_mod1_shift, v_cond_mod1_scale, self.num_heads
        )

        # Apply RoPE if needed for audio and visual
        if freqs_cis is not None:
            if not self.interleaved_audio_visual_rope:
@@ -515,18 +515,18 @@ class TwoStreamCABlock(nn.Module):
            if v_freqs_cis is not None and not self.interleaved_audio_visual_rope:
                v_cond_qq, v_cond_kk = apply_rotary_emb(v_cond_q, v_cond_k, v_freqs_cis, head_first=False)
                v_cond_q, v_cond_k = v_cond_qq, v_cond_kk

        q = torch.cat((v_cond_q, audio_q), dim=1)
        k = torch.cat((v_cond_k, audio_k), dim=1)
        v = torch.cat((v_cond_v, audio_v), dim=1)

        # TODO: look further into here
        if attention.__name__ == "attention_pytorch":
            q, k, v = [t.transpose(1, 2) for t in (q, k, v)]

        attn = attention(q, k, v, heads = self.num_heads, mask=attn_mask, skip_reshape=True)
        v_cond_attn, audio_attn = torch.split(attn, [v_cond.shape[1], audio.shape[1]], dim=1)

        audio = audio + apply_gate(self.audio_self_proj(audio_attn), gate=audio_mod1_gate)
        v_cond = v_cond + apply_gate(self.v_cond_self_proj(v_cond_attn), gate=v_cond_mod1_gate)
        head_dim = self.hidden_size // self.num_heads
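Visual-conditioning tokens and audio tokens are concatenated along the sequence axis so a single attention call lets the two streams attend to each other, and the result is split back by the original lengths. A minimal sketch of that pattern with plain scaled-dot-product attention (illustrative, not the wrapped attention used here):

import torch
import torch.nn.functional as F

def joint_attention(q_a, k_a, v_a, q_b, k_b, v_b, len_a, len_b):
    # q/k/v are (batch, seq, heads, head_dim); concatenate the two token streams.
    q = torch.cat((q_a, q_b), dim=1).transpose(1, 2)
    k = torch.cat((k_a, k_b), dim=1).transpose(1, 2)
    v = torch.cat((v_a, v_b), dim=1).transpose(1, 2)
    out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
    return torch.split(out, [len_a, len_b], dim=1)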
@@ -544,12 +544,12 @@ class TwoStreamCABlock(nn.Module):
        text_k = self.text_cross_k_norm(text_k).to(text_v)

        text_len = text_k.shape[1]
        text_freqs_cos, text_freqs_sin = self.build_rope_for_text(text_len, head_dim,
                                                                  rope_dim_list=self.rope_dim_list)
        text_freqs_cis = (text_freqs_cos.to(text_k.device), text_freqs_sin.to(text_k.device))
        text_k = apply_rotary_emb(text_k, text_k, text_freqs_cis, head_first=False)[1]

        v_cond_audio_q = torch.cat([v_cond_q, audio_q], dim=1)

        if attention.__name__ == "attention_pytorch":
@@ -557,7 +557,7 @@ class TwoStreamCABlock(nn.Module):

        cross_attn = attention(v_cond_audio_q, text_k, text_v, self.num_heads, skip_reshape = True)
        v_cond_cross_attn, audio_cross_attn = torch.split(cross_attn, [v_cond.shape[1], audio.shape[1]], dim=1)

        audio = audio + apply_gate(self.audio_cross_proj(audio_cross_attn), gate=audio_mod2_gate)
        v_cond = v_cond + apply_gate(self.v_cond_cross_proj(v_cond_cross_attn), gate=v_cond_mod2_gate)
@@ -565,7 +565,7 @@ class TwoStreamCABlock(nn.Module):
        v_cond = apply_modulated_block(v_cond, self.v_cond_norm3, v_cond_mod3_shift, v_cond_mod3_scale, self.v_cond_mlp, v_cond_mod3_gate)

        return audio, cond, v_cond


    def prepare_modulated_query(self, x, norm_layer, q_layer, q_norm_layer, shift, scale, num_heads, rope_dim_list):

        x_mod = modulate(norm_layer(x), shift=shift, scale=scale)
@@ -577,9 +577,9 @@ class TwoStreamCABlock(nn.Module):
        head_dim = q.shape[-1]
        freqs_cos, freqs_sin = self.build_rope_for_text(q.shape[1], head_dim, rope_dim_list)
        freqs_cis = (freqs_cos.to(q.device), freqs_sin.to(q.device))

        q = apply_rotary_emb(q, q, freqs_cis, head_first=False)[0]

        return q


class SingleStreamBlock(nn.Module):
@@ -697,7 +697,7 @@ class HunyuanVideoFoley(nn.Module):
        self.patch_size = model_args.get("patch_size", 1)
        self.visual_in_channels = model_args.get("clip_dim", 768)
        self.audio_vae_latent_dim = model_args.get("audio_vae_latent_dim", 128)
        self.out_channels = self.audio_vae_latent_dim
        self.unpatchify_channels = self.out_channels

        self.num_heads = model_args.get("num_heads", 12)
@@ -873,7 +873,7 @@ class HunyuanVideoFoley(nn.Module):

        uncond_1 = uncond_1[:, :clip_feat.size(1), :clip_feat.size(2)]
        uncond_2 = uncond_2[:, :sync_feat.size(1), :sync_feat.size(2)]

        uncond_1, uncond_2, cond_neg, clip_feat, sync_feat, cond_pos = [unlock_cpu_tensor(t, device) for t in (uncond_1, uncond_2, cond_neg, clip_feat, sync_feat, cond_pos)]

        diff = cond_pos.shape[1] - cond_neg.shape[1]
@@ -885,6 +885,8 @@ class HunyuanVideoFoley(nn.Module):
            clip_feat, sync_feat, cond = torch.cat([uncond_1, clip_feat]), torch.cat([uncond_2, sync_feat]), torch.cat([cond_neg, cond_pos])
            clip_feat = clip_feat.view(2, -1, 768)

            self.conditions = (clip_feat, sync_feat, cond)

        else:
            clip_feat, sync_feat, cond = self.conditions
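Unconditional and conditional features are stacked along the batch axis so one forward pass produces both branches of classifier-free guidance. A minimal sketch of that batching-and-mixing pattern (illustrative; the model call signature and guidance weight are assumptions, not this model's API):

import torch

def cfg_forward(model, x, t, cond, uncond, guidance_scale=4.5):
    # One batched call: first half unconditional, second half conditional.
    cond_in = torch.cat([uncond, cond], dim=0)
    out_uncond, out_cond = model(x.repeat(2, 1, 1), t.repeat(2), cond_in).chunk(2, dim=0)
    return out_uncond + guidance_scale * (out_cond - out_uncond)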
@@ -944,7 +946,7 @@ class HunyuanVideoFoley(nn.Module):
            else:
                audio, cond, v_cond = block(*triple_block_args)

        x = audio
        if sync_vec is not None:
            vec = vec.unsqueeze(1).repeat(1, cond_seq_len + v_cond_seq_len, 1)
            vec = torch.cat((vec, sync_vec), dim=1)
@@ -160,7 +160,7 @@ class MotionFormer(nn.Module):
    def __init__(self, device = None, dtype = None, operations = None):
        super().__init__()
        self.APPROX_ATTN_TYPE = "none"
        self.APPROX_ATTN_DIM = 64
        self.img_size = 224
        self.patch_size = 16
        self.in_chans = 3
@@ -224,7 +224,7 @@ class MotionFormer(nn.Module):
        self.norm = norm_layer(self.embed_dim)

        self.pre_logits = nn.Identity()

        transf_enc_layer_kwargs = dict(
            d_model=self.embed_dim,
            nhead=self.num_heads,
@@ -273,7 +273,7 @@ class MotionFormer(nn.Module):
        )

        return x, tok_mask


    def forward(self, x):
        B, S, C, T, H, W = x.shape
@@ -322,7 +322,7 @@ class BaseEncoderLayer(TransformerEncoderComfyv):
        device = None,
        dtype = None, operations = None,
        *args, **kwargs
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(operations = operations, *args, **kwargs, **factory_kwargs)
@@ -382,7 +382,7 @@ class SpatialTransformerEncoderLayer(BaseEncoderLayer):
        x = rearrange(x, "(BS t) D -> BS t D", BS=BS, t=t)

        return x


class AST(torch.nn.Module):
    def __init__(
        self,
@@ -391,7 +391,7 @@ class AST(torch.nn.Module):
        max_segments: int = None,
        device = None, dtype = None, operations = None
    ) -> None:

        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        self.extract_features = True
@@ -518,7 +518,7 @@ class FrequencyTransformerEncoderLayer(BaseEncoderLayer):
        x = x.view(BS, t, D)

        return x


class ASTEmbeddings(nn.Module):

    def __init__(self, config: ASTConfig, device = None, dtype = None, operations = None) -> None:
@@ -789,7 +789,7 @@ class ASTModel(nn.Module):
            ),
            tok_mask,
        )


class ASTMLPHead(nn.Module):
    def __init__(self, config: ASTConfig, device, dtype, operations):
        super().__init__()
@@ -957,6 +957,7 @@ class Synchformer(nn.Module):
        )

    def forward(self, vis):
        vis = vis.to(next(self.parameters()).dtype)
        vis = vis.permute(0, 1, 3, 2, 4, 5) # (B, S, C, Tv, H, W)
        vis = self.vfeat_extractor(vis)
        return vis
@@ -221,10 +221,11 @@ class FoleyVae(torch.nn.Module):
    def encode(self, x):
        x = x.to(next(self.parameters()).device)
        return self.synchformer(x)

    def video_encoding(self, video, step):
        video = video.to(torch.uint8)
        video = torch.stack([self.syncformer_preprocess(t) for t in video])

        t, c, h, w = video.shape
        seg_len = 16
        t = video.size(0)
@@ -233,12 +234,13 @@ class FoleyVae(torch.nn.Module):
        video = video.contiguous()
        stride_t, stride_c, stride_h, stride_w = video.stride()

        # no copies
        data = video.as_strided(
            size=(nseg, seg_len, c, h, w),
            stride=(stride_t * step, stride_t, stride_c, stride_h, stride_w),
-       )
+       ).contiguous()
        data = data.unsqueeze(0) # b
        data = rearrange(data, "b s t c h w -> (b s) 1 t c h w")
        data = data.float()

        return data, nseg, lambda x: rearrange(x, "(b s) 1 t d -> b (s t) d", b=1)
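as_strided builds the (nseg, seg_len, ...) view of overlapping 16-frame windows without copying; the added .contiguous() then materializes it so the later rearrange is safe. A minimal sketch of the same windowing on a 1-D stand-in, with torch.Tensor.unfold shown as the equivalent, simpler spelling:

import torch

frames = torch.arange(10)  # stand-in for a (T, C, H, W) video with T = 10
seg_len, step = 4, 2
nseg = (frames.size(0) - seg_len) // step + 1

# Strided view: window i starts at i * step and reuses the underlying storage.
windows = frames.as_strided(size=(nseg, seg_len), stride=(frames.stride(0) * step, frames.stride(0)))

# Same result with unfold, which handles the stride bookkeeping for you.
assert torch.equal(windows, frames.unfold(0, seg_len, step))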
@@ -1121,7 +1121,7 @@ class MultiheadAttentionComfyv(nn.Module):
        self.batch_first = batch_first
        self.head_dim = embed_dim // num_heads
        self.embed_dim = embed_dim

    # overwriting state dict loading to convert in_proj_weight/bias -> self._q_proj/_k_proj/_v_proj
    def _load_from_state_dict(
        self,
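The comment above refers to remapping PyTorch's fused in_proj_weight/in_proj_bias into three separate projection layers. A minimal sketch of that split, assuming the standard nn.MultiheadAttention layout where the q/k/v weights are stacked along dim 0 (an illustrative helper, not the method below):

import torch
import torch.nn as nn

def split_in_proj(in_proj_weight, in_proj_bias, embed_dim):
    # nn.MultiheadAttention stores q/k/v stacked as a (3 * embed_dim, embed_dim) matrix.
    w_q, w_k, w_v = in_proj_weight.split(embed_dim, dim=0)
    b_q, b_k, b_v = in_proj_bias.split(embed_dim, dim=0)
    projs = []
    for w, b in ((w_q, b_q), (w_k, b_k), (w_v, b_v)):
        lin = nn.Linear(embed_dim, embed_dim)
        lin.weight = nn.Parameter(w.clone())
        lin.bias = nn.Parameter(b.clone())
        projs.append(lin)
    return projs  # q_proj, k_proj, v_proj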
@@ -1164,26 +1164,17 @@ class MultiheadAttentionComfyv(nn.Module):
            error_msgs,
        )

-   def forward(self, src, k = None, v = None, attn_mask = None, key_padding_mask = None):
+   def forward(self, src, k=None, v=None, attn_mask=None, key_padding_mask=None):
-       self._q_proj, self._k_proj, self._v_proj = [
-           t.to(src.device).to(src.dtype)
-           for t in (self._q_proj, self._k_proj, self._v_proj)
-       ]
+       self._q_proj, self._k_proj, self._v_proj = [t.to(src.device).to(src.dtype) for t in (self._q_proj, self._k_proj, self._v_proj)]
        q = self._q_proj(src)
-       if k is None:
-           k = self._k_proj(src)
-       if v is None:
-           v = self._v_proj(src)
-       k, v = k.to(src.device).to(src.dtype), v.to(src.device).to(src.dtype)
+       k = self._k_proj(src if k is None else k.to(src.device).to(src.dtype))
+       v = self._v_proj(src if v is None else v.to(src.device).to(src.dtype))

        if k is v:
            if q is k:
                q = k = v = q.transpose(1, 0)
            else:
                q, k = (x.transpose(1, 0) for x in (q, k))
                v = k
        else:
            q, k, v = (x.transpose(1, 0) for x in (q, k, v))

-       output = optimized_attention(q, k, v, self.num_heads, mask = attn_mask)
+       output = optimized_attention(q, k, v, self.num_heads, mask=attn_mask)
        return self.out_proj(output)

# comfyui implementation of nn.TransformerEncoderLayer
@@ -1413,7 +1413,7 @@ class ACEStep(BaseModel):
        out['speaker_embeds'] = comfy.conds.CONDRegular(torch.zeros(noise.shape[0], 512, device=noise.device, dtype=noise.dtype))
        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
        return out


class HunyuanFoley(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None, unet_model=comfy.ldm.hunyuan_foley.model.HunyuanVideoFoley):
        super().__init__(model_config, model_type, device, unet_model)
@@ -420,7 +420,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1]

        return dit_config

    if '{}triple_blocks.17.audio_cross_q.weight'.format(key_prefix) in state_dict_keys: # Hunyuan Foley
        dit_config = {}
        dit_config["image_model"] = "hunyuan_foley"
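detect_unet_config picks an architecture by probing for tensor names that only one model family produces, here a key unique to the Foley triple blocks. A minimal sketch of that pattern (the second marker key and the returned configs are hypothetical, for illustration only):

def detect_model_family(state_dict, key_prefix=""):
    # Probe for keys that only one architecture produces (hypothetical markers).
    keys = set(state_dict.keys())
    if "{}triple_blocks.17.audio_cross_q.weight".format(key_prefix) in keys:
        return {"image_model": "hunyuan_foley"}
    if "{}double_blocks.0.img_attn.qkv.weight".format(key_prefix) in keys:  # hypothetical marker
        return {"image_model": "some_other_dit"}
    return None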
@@ -1303,7 +1303,7 @@ class Omnigen2(supported_models_base.BASE):
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))


class HunyuanFoley(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan_foley",
|
||||
return model_base.HunyuanFoley(self, device=device)
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.clap_model.ClapLargeTokenizer, comfy.text_encoders.clap_model.ClapTextEncoderModel)
|
||||
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
|
||||
state_dict["logit_scale"] = torch.tensor(1.0)
|
||||
|
||||
@@ -92,7 +92,7 @@ class HunyuanFoleyConditioning(io.ComfyNode):

    @classmethod
    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative):

        text_encoding_positive = text_encoding_positive[0][0]
        text_encoding_negative = text_encoding_negative[0][0]
        all_ = (siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative)
@@ -108,7 +108,7 @@ class HunyuanFoleyConditioning(io.ComfyNode):
            # temporary repeat values on the cpu
            factor_pos, remainder = divmod(max_value, input.shape[dim])

            positions = [1] * input.ndim
            positions[dim] = factor_pos
            input = input.cpu().repeat(*positions)
@@ -120,7 +120,7 @@ class HunyuanFoleyConditioning(io.ComfyNode):
                input = torch.cat([input, pad], dim = dim)

            return input

        siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative = [repeat_shapes(max_l, t) for t in all_]
        siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative = [repeat_shapes(max_d, t, dim = 2) for t in
                                                                                                     (siglip_encoding_1, synchformer_encoding_2, text_encoding_positive, text_encoding_negative)]
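The repeat-and-pad helper grows a tensor along one dimension to a target length: divmod splits the target into whole-copy repeats plus a partial pad. A minimal sketch of that idea (an illustrative helper, not the node's exact code):

import torch

def repeat_to_length(x, target, dim=1):
    # Whole repeats first, then pad with a leading slice of x to cover the remainder.
    full, remainder = divmod(target, x.shape[dim])
    reps = [1] * x.ndim
    reps[dim] = full
    out = x.repeat(*reps)
    if remainder:
        pad = x.narrow(dim, 0, remainder)
        out = torch.cat([out, pad], dim=dim)
    return out

# e.g. repeat_to_length(torch.ones(1, 3, 8), target=10, dim=1).shape == (1, 10, 8)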
@@ -48,7 +48,7 @@ class EncodeVideo(io.ComfyNode):
                io.Conditioning.Output(display_name="encoded_video"),
            ],
        )

    @classmethod
    def execute(cls, video, processing_batch_size, step_size, vae = None, clip_vision = None):
@@ -94,13 +94,15 @@ class EncodeVideo(io.ComfyNode):
            chunk = chunk.to(model_dtype)
            if hasattr(vae, "encode"):
                try:
                    if chunk.ndim > 5:
                        raise ValueError("chunk.ndim > 5")
                    chunk = chunk.movedim(1, -1)
                    out = vae.encode(chunk)
-               except:
+               except Exception:
                    out = model.encode(chunk)
            else:
                chunk = chunk.movedim(1, -1)
-               out = vae.encode_image(chunk, crop=False, resize_mode="bilinear")
+               out = vae.encode_image(chunk.to(torch.uint8), crop=False, resize_mode="bilinear")
                out = out["image_embeds"]

            out_cpu = out.cpu()
@@ -133,14 +135,14 @@ class ResampleVideo(io.ComfyNode):
        )

    @classmethod
    def execute(cls, video, target_fps: int):
        # doesn't support upsampling
        with av.open(video.get_stream_source(), mode="r") as container:
            stream = container.streams.video[0]
            frames = []

            src_rate = stream.average_rate or stream.guessed_rate
            src_fps = float(src_rate) if src_rate else None

            if src_fps is None:
                logging.warning("src_fps for video resampling is None.")
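Downsampling to target_fps amounts to keeping only the frames whose timestamps cross each 1/target_fps boundary. A minimal sketch of that selection loop with PyAV (illustrative; error handling and the node's output format are omitted, and the path-based entry point is an assumption):

import av

def resample_frames(path, target_fps):
    # Keep a frame whenever its timestamp enters a new 1/target_fps slot.
    frames = []
    with av.open(path, mode="r") as container:
        stream = container.streams.video[0]
        next_t = 0.0
        for frame in container.decode(stream):
            if frame.pts is None:
                continue
            t = float(frame.pts * frame.time_base)
            if t >= next_t:
                frames.append(frame.to_ndarray(format="rgb24"))
                next_t += 1.0 / target_fps
    return frames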