From 42a265cddf5057c1cac28f0f4f8c7be2325bb409 Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Mon, 29 Sep 2025 22:44:40 +0300
Subject: [PATCH] fixed multiple errors in nodes and model loading

---
 comfy/ldm/hunyuan_foley/model.py            |  8 +-
 comfy/ldm/hunyuan_foley/syncformer.py       |  6 +-
 comfy/ldm/hunyuan_foley/vae.py              | 14 ++--
 comfy/model_detection.py                    |  4 +-
 comfy/sd.py                                 |  5 +-
 comfy/text_encoders/clap_model.py           | 29 ++++---
 comfy_api/latest/_input_impl/video_types.py |  9 +--
 comfy_extras/nodes_hunyuan_foley.py         | 83 +++++++++++---------
 comfy_extras/nodes_video.py                 | 87 +++++++++++++--------
 9 files changed, 143 insertions(+), 102 deletions(-)

diff --git a/comfy/ldm/hunyuan_foley/model.py b/comfy/ldm/hunyuan_foley/model.py
index 234415d13..e9ca258ba 100644
--- a/comfy/ldm/hunyuan_foley/model.py
+++ b/comfy/ldm/hunyuan_foley/model.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Optional, Union, Dict
+from typing import List, Tuple, Optional, Union
 from functools import partial
 
 import math
@@ -638,17 +638,19 @@ class SingleStreamBlock(nn.Module):
 class HunyuanVideoFoley(nn.Module):
     def __init__(
         self,
-        model_args,
         dtype: Optional[torch.dtype] = None,
         device: Optional[torch.device] = None,
-        operations = None
+        operations = None,
+        **kwargs
     ):
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
 
+        self.dtype = dtype
         self.depth_triple_blocks = 18
         self.depth_single_blocks = 36
 
+        model_args = {}
         self.interleaved_audio_visual_rope = model_args.get("interleaved_audio_visual_rope", True)
 
 
diff --git a/comfy/ldm/hunyuan_foley/syncformer.py b/comfy/ldm/hunyuan_foley/syncformer.py
index 9f1973bfb..089f80c92 100644
--- a/comfy/ldm/hunyuan_foley/syncformer.py
+++ b/comfy/ldm/hunyuan_foley/syncformer.py
@@ -850,8 +850,8 @@ class GlobalTransformer(torch.nn.Module):
         self.vis_in_lnorm = operations.LayerNorm(n_embd, **factory_kwargs)
         self.aud_in_lnorm = operations.LayerNorm(n_embd, **factory_kwargs)
         # aux tokens
-        self.OFF_tok = operations.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
-        self.MOD_tok = operations.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
+        self.OFF_tok = nn.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
+        self.MOD_tok = nn.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
         # whole token dropout
         self.tok_pdrop = tok_pdrop
         self.tok_drop_vis = torch.nn.Dropout1d(tok_pdrop)
@@ -863,7 +863,7 @@ class GlobalTransformer(torch.nn.Module):
         )
         # the stem
         self.drop = torch.nn.Dropout(embd_pdrop)
-        self.blocks = operations.Sequential(*[Block(self.config, operations=operations, **factory_kwargs) for _ in range(n_layer)])
+        self.blocks = nn.Sequential(*[Block(self.config, operations=operations, **factory_kwargs) for _ in range(n_layer)])
         # pre-output norm
         self.ln_f = operations.LayerNorm(n_embd)
         # maybe add a head
diff --git a/comfy/ldm/hunyuan_foley/vae.py b/comfy/ldm/hunyuan_foley/vae.py
index e691f248c..17e15f521 100644
--- a/comfy/ldm/hunyuan_foley/vae.py
+++ b/comfy/ldm/hunyuan_foley/vae.py
@@ -5,7 +5,7 @@ from typing import List
 import torch.nn as nn
 from einops import rearrange
 from torchvision.transforms import v2
-from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils import weight_norm
 
 from comfy.ldm.hunyuan_foley.syncformer import Synchformer
 
@@ -154,6 +154,7 @@ class DACDecoder(nn.Module):
         layers += [
             Snake1d(output_dim, device = device, dtype = dtype),
             WNConv1d(output_dim, d_out, kernel_size=7, padding=3, device = device, dtype = dtype, operations = operations),
+            nn.Tanh(),
         ]
 
         self.model = nn.Sequential(*layers)
@@ -164,11 +165,11 @@ class DACDecoder(nn.Module):
 class DAC(torch.nn.Module):
     def __init__(
         self,
-        encoder_dim: int = 64,
-        encoder_rates: List[int] = [2, 4, 8, 8],
-        latent_dim: int = None,
-        decoder_dim: int = 1536,
-        decoder_rates: List[int] = [8, 8, 4, 2],
+        encoder_dim: int = 128,
+        encoder_rates: List[int] = [2, 3, 4, 5],
+        latent_dim: int = 128,
+        decoder_dim: int = 2048,
+        decoder_rates: List[int] = [8, 5, 4, 3],
         sample_rate: int = 44100,
     ):
         super().__init__()
@@ -204,6 +205,7 @@ class DAC(torch.nn.Module):
 
 class FoleyVae(torch.nn.Module):
     def __init__(self):
+        super().__init__()
         self.dac = DAC()
         self.syncformer = Synchformer(None, None, operations = ops)
         self.syncformer_preprocess = v2.Compose(
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 851bb913f..d29534fb0 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -422,7 +422,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         return dit_config
 
     if '{}triple_blocks.17.audio_cross_q.weight'.format(key_prefix) in state_dict_keys: # Hunyuan Foley
-        return {}
+        dit_config = {}
+        dit_config["image_model"] = "hunyuan_foley"
+        return dit_config
 
     if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
         in_shape = state_dict['{}latent_in.weight'.format(key_prefix)].shape
diff --git a/comfy/sd.py b/comfy/sd.py
index 84215ba7f..ec44d4288 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -508,7 +508,10 @@ class VAE:
             self.latent_dim = 128
             self.first_stage_model = comfy.ldm.hunyuan_foley.vae.FoleyVae()
             # TODO
-            self.memory_used_encode = lambda shape, dtype: shape[0] * model_management.dtype_size(dtype)
+            encode_layers = 25
+            decode_layers = 4
+            self.memory_used_encode = lambda shape, dtype: torch.prod(shape) * model_management.dtype_size(dtype) * encode_layers
+            self.memory_used_decode = lambda shape, dtype: torch.prod(shape) * model_management.dtype_size(dtype) * decode_layers
 
         elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
             self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100)
diff --git a/comfy/text_encoders/clap_model.py b/comfy/text_encoders/clap_model.py
index 6f64181a7..27bebd762 100644
--- a/comfy/text_encoders/clap_model.py
+++ b/comfy/text_encoders/clap_model.py
@@ -66,7 +66,6 @@ class ClapTextEmbeddings(nn.Module):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=device), persistent=True
         )
-        # End copy
         self.padding_idx = pad_token_id
         self.position_embeddings = operations.Embedding(
             max_position_embeddings, hidden_size, padding_idx=self.padding_idx, device=device, dtype=dtype
         )
@@ -145,6 +144,7 @@ class ClapTextSelfAttention(nn.Module):
         value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
 
         query_states, key_states, value_states = [t.contiguous() for t in (query_states, key_states, value_states)]
+        attention_mask = attention_mask.to(query_states.dtype)
         attn_output = optimized_attention(query_states, key_states, value_states, self.num_attention_heads, mask = attention_mask, skip_output_reshape=True, skip_reshape=True)
         attn_output = attn_output.transpose(1, 2).contiguous()
         return attn_output.reshape(*input_shape, -1).contiguous()
@@ -271,16 +271,16 @@ class ClapTextModel(nn.Module):
         attention_mask: Optional[torch.Tensor] = None,
         token_type_ids: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        embeds: Optional[torch.Tensor] = None,
     ):
         if input_ids is not None:
             input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
+        elif embeds is not None:
+            input_shape = embeds.size()[:-1]
 
         batch_size, seq_length = input_shape
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        device = input_ids.device if input_ids is not None else embeds.device
 
         if token_type_ids is None:
             if hasattr(self.embeddings, "token_type_ids"):
@@ -294,7 +294,7 @@
             input_ids=input_ids,
             position_ids=position_ids,
             token_type_ids=token_type_ids,
-            inputs_embeds=inputs_embeds,
+            inputs_embeds=embeds,
         )
         encoder_outputs = self.encoder(
             embedding_output,
@@ -308,6 +308,10 @@ class ClapTextModel(nn.Module):
 class ClapTextModelWithProjection(nn.Module):
     def __init__(
         self,
+        config,
+        dtype=None,
+        device=None,
+        operations=None,
         hidden_size: int = 768,
         intermediate_size: int = 3072,
         layer_norm_eps: float = 1e-12,
@@ -318,26 +322,30 @@ ...
         type_vocab_size: int = 1,
         vocab_size: int = 50265,
         pad_token_id: int = 1,
-        device=None,
-        dtype=None,
-        operations=None
     ):
         super().__init__()
+        self.num_layers = num_hidden_layers
         self.text_model = ClapTextModel(num_attention_heads, vocab_size, hidden_size, intermediate_size, pad_token_id, max_position_embeddings, type_vocab_size, layer_norm_eps, num_hidden_layers, device=device, dtype=dtype, operations=operations)
         self.text_projection = ClapProjectionLayer(hidden_size, projection_dim, device=device, dtype=dtype, operations=operations,)
 
+    def get_input_embeddings(self):
+        return self.text_model.embeddings.word_embeddings
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
+        embeds = None,
+        **kwargs
     ):
         text_outputs = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            embeds=embeds
         )
 
         pooled_output = text_outputs[1]
@@ -347,9 +355,10 @@ class ClapTextModelWithProjection(nn.Module):
 
 class ClapTextEncoderModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        self.dtypes = set([dtype])
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 1}, layer_norm_hidden_state=False, model_class=ClapTextModelWithProjection, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
 class ClapLargeTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clap_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='clap_l', tokenizer_class=AutoTokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='clap_l', tokenizer_class=AutoTokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=1, tokenizer_data=tokenizer_data)
diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py
index a57f5fd73..9c33f7067 100644
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -89,7 +89,7 @@ class VideoFromFile(VideoInput):
                 return stream.width, stream.height
         raise ValueError(f"No video stream found in file '{self.__file}'")
 
-    def get_duration(self, return_frames=False) -> float:
+    def get_duration(self) -> float:
         """
         Returns the duration of the video in seconds.
 
@@ -100,8 +100,7 @@ class VideoFromFile(VideoInput):
         self.__file.seek(0)
         with av.open(self.__file, mode="r") as container:
             if container.duration is not None:
-                if not return_frames:
-                    return float(container.duration / av.time_base)
+                return float(container.duration / av.time_base)
 
             # Fallback: calculate from frame count and frame rate
             video_stream = next(
@@ -109,8 +108,6 @@ class VideoFromFile(VideoInput):
             )
             if video_stream and video_stream.frames and video_stream.average_rate:
                 length = float(video_stream.frames / video_stream.average_rate)
-                if return_frames:
-                    return length, float(video_stream.frames)
                 return length
 
             # Last resort: decode frames to count them
@@ -122,8 +119,6 @@ class VideoFromFile(VideoInput):
                     frame_count += 1
             if frame_count > 0:
                 length = float(frame_count / video_stream.average_rate)
-                if return_frames:
-                    return length, float(frame_count)
                 return length
 
         raise ValueError(f"Could not determine duration for file '{self.__file}'")
diff --git a/comfy_extras/nodes_hunyuan_foley.py b/comfy_extras/nodes_hunyuan_foley.py
index b76ad4aa1..78a5d406d 100644
--- a/comfy_extras/nodes_hunyuan_foley.py
+++ b/comfy_extras/nodes_hunyuan_foley.py
@@ -1,53 +1,60 @@
 import torch
 import comfy.model_management
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
 
-class EmptyLatentHunyuanFoley:
+class EmptyLatentHunyuanFoley(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "length": ("INT", {"default": 12, "min": 1, "max": 15, "tooltip": "The length of the audio. The same length as the video."}),
-                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent audios in the batch."}),
-            },
-            "optional": {"video": ("VIDEO")}
-        }
-
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
-
-    CATEGORY = "latent/audio"
-
-    def generate(self, length, batch_size, video = None):
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyLatentHunyuanFoley",
+            display_name="EmptyLatentHunyuanFoley",
+            category="audio/latent",
+            inputs = [
+                io.Int.Input("length", min = 1, max = 15, default = 12),
+                io.Int.Input("batch_size", min = 1, max = 48_000, default = 1),
+                io.Video.Input("video", optional=True),
+            ],
+            outputs=[io.Latent.Output(display_name="latent")]
+        )
+    @classmethod
+    def execute(cls, length, batch_size, video = None):
         if video is not None:
-            _, length = video.get_duration(return_frames = True)
+            length = video.get_duration() * 25  # get_duration() returns seconds; scale by the assumed 25 fps frame rate
             length /= 25
         shape = (batch_size, 128, int(50 * length))
         latent = torch.randn(shape, device=comfy.model_management.intermediate_device())
-        return ({"samples": latent, "type": "hunyuan_foley"}, )
+        return io.NodeOutput({"samples": latent, "type": "hunyuan_foley"}, )
 
-class HunyuanFoleyConditioning:
+class HunyuanFoleyConditioning(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"video_encoding_siglip": ("CONDITIONING",),
-                             "video_encoding_synchformer": ("CONDITIONING",),
-                             "text_encoding": ("CONDITIONING",)
-                             },
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanFoleyConditioning",
+            display_name="HunyuanFoleyConditioning",
+            category="conditioning/video_models",
+            inputs = [
+                io.Conditioning.Input("video_encoding_1"),
+                io.Conditioning.Input("video_encoding_2"),
+                io.Conditioning.Input("text_encoding"),
+            ],
+            outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
+        )
 
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
-    RETURN_NAMES = ("positive", "negative")
-
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/video_models"
-
-    def encode(self, video_encoding_1, video_encoding_2, text_encoding):
+    @classmethod
+    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
         embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
-        return (positive, negative)
+        return io.NodeOutput(positive, negative)
 
-NODE_CLASS_MAPPINGS = {
-    "HunyuanFoleyConditioning": HunyuanFoleyConditioning,
-    "EmptyLatentHunyuanFoley": EmptyLatentHunyuanFoley,
-}
+class FoleyExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            HunyuanFoleyConditioning,
+            EmptyLatentHunyuanFoley
+        ]
+
+async def comfy_entrypoint() -> FoleyExtension:
+    return FoleyExtension()
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index a7c0f19e9..cde36e141 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -6,7 +6,6 @@ import av
 import torch
 import folder_paths
 import json
-import numpy as np
 from typing import Optional
 from typing_extensions import override
 from fractions import Fraction
@@ -50,15 +49,18 @@ class EncodeVideo(io.ComfyNode):
     @classmethod
     def execute(cls, video, processing_batch_size, step_size, vae = None, clip_vision = None):
-        b, t, c, h, w = video.shape
+        t, c, h, w = video.shape
+        b = 1
         batch_size = b * t
 
         if vae is None and clip_vision is None:
             raise ValueError("Must either have vae or clip_vision.")
+        elif vae is not None and clip_vision is not None:
+            raise ValueError("Can't have VAE and Clip Vision passed at the same time!")
 
         vae = vae if vae is not None else clip_vision
 
         if hasattr(vae.first_stage_model, "video_encoding"):
-            data, num_segments, output_fn = vae.video_encoding(video, step_size)
+            data, num_segments, output_fn = vae.first_stage_model.video_encoding(video, step_size)
             batch_size = b * num_segments
         else:
             data = video.view(batch_size, c, h, w)
@@ -76,7 +78,7 @@
 
         output = torch.cat(outputs)
 
-        return output_fn(output)
+        return io.NodeOutput(output_fn(output))
 
 class ResampleVideo(io.ComfyNode):
     @classmethod
@@ -87,44 +89,62 @@
             category="image/video",
             inputs = [
                 io.Video.Input("video"),
-                io.Int.Input("target_fps")
+                io.Int.Input("target_fps", min=1, default=25)
             ],
-            outputs=[io.Image.Output(display_name="images")]
+            outputs=[io.Video.Output(display_name="video")]
         )
     @classmethod
-    def execute(cls, container: av.container.InputContainer, target_fps: int):
+    def execute(cls, video, target_fps: int):
         # doesn't support upsampling
-
-        stream = container.streams.video[0]
-        frames = []
+        with av.open(video.get_stream_source(), mode="r") as container:
+            stream = container.streams.video[0]
+            frames = []
 
-        src_rate = stream.average_rate or stream.guessed_rate
-        src_fps = float(src_rate) if src_rate else None
+            src_rate = stream.average_rate or stream.guessed_rate
+            src_fps = float(src_rate) if src_rate else None
+
+            # yield original frames if asked for upsampling or src is unknown
+            if src_fps is None or target_fps > src_fps:
+                for packet in container.demux(stream):
+                    for frame in packet.decode():
+                        arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
+                        frames.append(arr)
+                return io.NodeOutput(torch.stack(frames))
+
+            stream.thread_type = "AUTO"
+
+            next_time = 0.0
+            step = 1.0 / target_fps
 
-        # yield original frames if asked for upsampling or src is unknown
-        if src_fps is None or target_fps > src_fps:
             for packet in container.demux(stream):
                 for frame in packet.decode():
-                    arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
-                    frames.append(arr)
-            return torch.stack(frames)
+                    if frame.time is None:
+                        continue
+                    t = frame.time
+                    while t >= next_time:
+                        arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
+                        frames.append(arr)
+                        next_time += step
 
-        stream.thread_type = "AUTO"
+        return io.NodeOutput(torch.stack(frames))
 
-        next_time = 0.0
-        step = 1.0 / target_fps
+class VideoToImage(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="VideoToImage",
+            category="image/video",
+            display_name = "Video To Images",
+            inputs=[io.Video.Input("video")],
+            outputs=[io.Image.Output("images")]
+        )
+    @classmethod
+    def execute(cls, video):
+        with av.open(video.get_stream_source(), mode="r") as container:
+            components = video.get_components_internal(container)
 
-        for packet in container.demux(stream):
-            for frame in packet.decode():
-                if frame.time is None:
-                    continue
-                t = frame.time
-                while t >= next_time:
-                    arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
-                    frames.append(arr)
-                    next_time += step
-
-        return torch.stack(frames)
+            images = components.images
+            return io.NodeOutput(images)
 
 class SaveWEBM(io.ComfyNode):
     @classmethod
@@ -325,7 +345,8 @@ class VideoExtension(ComfyExtension):
             GetVideoComponents,
             LoadVideo,
             EncodeVideo,
-            ResampleVideo
+            ResampleVideo,
+            VideoToImage
         ]
 
 async def comfy_entrypoint() -> VideoExtension: