diff --git a/comfy/ldm/hunyuan_foley/vae.py b/comfy/ldm/hunyuan_foley/vae.py
index 17e15f521..e8567c11b 100644
--- a/comfy/ldm/hunyuan_foley/vae.py
+++ b/comfy/ldm/hunyuan_foley/vae.py
@@ -5,7 +5,7 @@ from typing import List
 import torch.nn as nn
 from einops import rearrange
 from torchvision.transforms import v2
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
 
 from comfy.ldm.hunyuan_foley.syncformer import Synchformer
 
@@ -96,12 +96,6 @@ class DACEncoder(nn.Module):
             d_model *= 2
             self.block += [DACEncoderBlock(d_model, stride=stride, device = device, dtype = dtype, operations = operations)]
 
-        # Create last convolution
-        self.block += [
-            Snake1d(d_model),
-            WNConv1d(d_model, d_latent, kernel_size=3, padding=1, device = device, dtype = dtype, operations = operations),
-        ]
-
         # Wrap black into nn.Sequential
         self.block = nn.Sequential(*self.block)
         self.enc_dim = d_model
@@ -151,12 +145,6 @@ class DACDecoder(nn.Module):
             output_dim = channels // 2 ** (i + 1)
             layers += [DACDecoderBlock(input_dim, output_dim, stride, device = device, dtype = dtype, operations = operations)]
 
-        layers += [
-            Snake1d(output_dim, device = device, dtype = dtype),
-            WNConv1d(output_dim, d_out, kernel_size=7, padding=3, device = device, dtype = dtype, operations = operations),
-            nn.Tanh(),
-        ]
-
         self.model = nn.Sequential(*layers)
 
     def forward(self, x):
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 1fbd6f2ad..5a0bbc68c 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1318,6 +1318,10 @@ class HunyuanFoley(supported_models_base.BASE):
         return model_base.HunyuanFoley(self, device=device)
     def clip_target(self, state_dict={}):
         return supported_models_base.ClipTarget(comfy.text_encoders.clap_model.ClapLargeTokenizer, comfy.text_encoders.clap_model.ClapTextEncoderModel)
+
+    def process_clip_state_dict(self, state_dict):
+        state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        return state_dict
 
 class QwenImage(supported_models_base.BASE):
     unet_config = {
diff --git a/comfy_extras/nodes_hunyuan_foley.py b/comfy_extras/nodes_hunyuan_foley.py
index 78a5d406d..649d79718 100644
--- a/comfy_extras/nodes_hunyuan_foley.py
+++ b/comfy_extras/nodes_hunyuan_foley.py
@@ -34,8 +34,8 @@ class HunyuanFoleyConditioning(io.ComfyNode):
             display_name="HunyuanFoleyConditioning",
             category="conditioning/video_models",
             inputs = [
-                io.Conditioning.Input("video_encoding_1"),
-                io.Conditioning.Input("video_encoding_2"),
+                io.Conditioning.Input("siglip_encoding_1"),
+                io.Conditioning.Input("synchformer_encoding_2"),
                 io.Conditioning.Input("text_encoding"),
             ],
             outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index cde36e141..485d38804 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -14,6 +14,7 @@ from comfy_api.input_impl import VideoFromComponents, VideoFromFile
 from comfy_api.util import VideoCodec, VideoComponents, VideoContainer
 from comfy_api.latest import ComfyExtension, io, ui
 from comfy.cli_args import args
+import comfy.utils
 
 class EncodeVideo(io.ComfyNode):
     @classmethod
@@ -49,6 +50,7 @@ class EncodeVideo(io.ComfyNode):
 
     @classmethod
     def execute(cls, video, processing_batch_size, step_size, vae = None, clip_vision = None):
+        t, c, h, w = video.shape
         b = 1
         batch_size = b * t
 
@@ -71,10 +73,15 @@ class EncodeVideo(io.ComfyNode):
         outputs = []
         total = data.shape[0]
 
-        for i in range(0, total, batch_size):
-            chunk = data[i : i + batch_size]
-            out = vae.encode(chunk)
-            outputs.append(out)
+        pbar = comfy.utils.ProgressBar(total/batch_size)
+        with torch.inference_mode():
+            for i in range(0, total, batch_size):
+                chunk = data[i : i + batch_size]
+                out = vae.encode(chunk)
+                outputs.append(out)
+                del out, chunk
+                torch.cuda.empty_cache()
+                pbar.update(1)
 
         output = torch.cat(outputs)
 
@@ -109,7 +116,7 @@ class ResampleVideo(io.ComfyNode):
             for frame in packet.decode():
                 arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
                 frames.append(arr)
-            return torch.stack(frames)
+            return io.NodeOutput(torch.stack(frames))
 
         stream.thread_type = "AUTO"