fixed multiple errors in nodes and model loading

This commit is contained in:
Yousef Rafat 2025-09-29 22:44:40 +03:00
parent a6dabd2855
commit 42a265cddf
9 changed files with 143 additions and 102 deletions

View File

@@ -1,4 +1,4 @@
-from typing import List, Tuple, Optional, Union, Dict
+from typing import List, Tuple, Optional, Union
 from functools import partial
 import math
@@ -638,17 +638,19 @@ class SingleStreamBlock(nn.Module):
 class HunyuanVideoFoley(nn.Module):
     def __init__(
         self,
+        model_args,
         dtype: Optional[torch.dtype] = None,
         device: Optional[torch.device] = None,
-        operations = None
+        operations = None,
+        **kwargs
     ):
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
+        self.dtype = dtype
         self.depth_triple_blocks = 18
         self.depth_single_blocks = 36
-        model_args = {}
         self.interleaved_audio_visual_rope = model_args.get("interleaved_audio_visual_rope", True)

View File

@@ -850,8 +850,8 @@ class GlobalTransformer(torch.nn.Module):
         self.vis_in_lnorm = operations.LayerNorm(n_embd, **factory_kwargs)
         self.aud_in_lnorm = operations.LayerNorm(n_embd, **factory_kwargs)
         # aux tokens
-        self.OFF_tok = operations.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
-        self.MOD_tok = operations.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
+        self.OFF_tok = nn.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
+        self.MOD_tok = nn.Parameter(torch.randn(1, 1, n_embd, **factory_kwargs))
         # whole token dropout
         self.tok_pdrop = tok_pdrop
         self.tok_drop_vis = torch.nn.Dropout1d(tok_pdrop)
@@ -863,7 +863,7 @@ class GlobalTransformer(torch.nn.Module):
         )
         # the stem
         self.drop = torch.nn.Dropout(embd_pdrop)
-        self.blocks = operations.Sequential(*[Block(self.config, operations=operations, **factory_kwargs) for _ in range(n_layer)])
+        self.blocks = nn.Sequential(*[Block(self.config, operations=operations, **factory_kwargs) for _ in range(n_layer)])
         # pre-output norm
         self.ln_f = operations.LayerNorm(n_embd)
         # maybe add a head

View File

@@ -5,7 +5,7 @@ from typing import List
 import torch.nn as nn
 from einops import rearrange
 from torchvision.transforms import v2
-from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils import weight_norm
 from comfy.ldm.hunyuan_foley.syncformer import Synchformer
@@ -154,6 +154,7 @@ class DACDecoder(nn.Module):
         layers += [
             Snake1d(output_dim, device = device, dtype = dtype),
             WNConv1d(output_dim, d_out, kernel_size=7, padding=3, device = device, dtype = dtype, operations = operations),
+            nn.Tanh(),
         ]
         self.model = nn.Sequential(*layers)
@@ -164,11 +165,11 @@ class DACDecoder(nn.Module):
 class DAC(torch.nn.Module):
     def __init__(
         self,
-        encoder_dim: int = 64,
-        encoder_rates: List[int] = [2, 4, 8, 8],
-        latent_dim: int = None,
-        decoder_dim: int = 1536,
-        decoder_rates: List[int] = [8, 8, 4, 2],
+        encoder_dim: int = 128,
+        encoder_rates: List[int] = [2, 3, 4, 5],
+        latent_dim: int = 128,
+        decoder_dim: int = 2048,
+        decoder_rates: List[int] = [8, 5, 4, 3],
         sample_rate: int = 44100,
     ):
         super().__init__()
@@ -204,6 +205,7 @@ class DAC(torch.nn.Module):
 class FoleyVae(torch.nn.Module):
     def __init__(self):
+        super().__init__()
         self.dac = DAC()
         self.syncformer = Synchformer(None, None, operations = ops)
         self.syncformer_preprocess = v2.Compose(

View File

@@ -422,7 +422,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         return dit_config
     if '{}triple_blocks.17.audio_cross_q.weight'.format(key_prefix) in state_dict_keys: # Hunyuan Foley
-        return {}
+        dit_config = {}
+        dit_config["image_model"] = "hunyuan_foley"
+        return dit_config
     if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
         in_shape = state_dict['{}latent_in.weight'.format(key_prefix)].shape

View File

@@ -508,7 +508,10 @@ class VAE:
             self.latent_dim = 128
             self.first_stage_model = comfy.ldm.hunyuan_foley.vae.FoleyVae()
             # TODO
-            self.memory_used_encode = lambda shape, dtype: shape[0] * model_management.dtype_size(dtype)
+            encode_layers = 25
+            decode_layers = 4
+            self.memory_used_encode = lambda shape, dtype: torch.prod(shape) * model_management.dtype_size(dtype) * encode_layers
+            self.memory_used_decode = lambda shape, dtype: torch.prod(shape) * model_management.dtype_size(dtype) * decode_layers
         elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
             self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100)
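A rough sketch of what a per-tensor memory estimate like the lambdas above computes, assuming the element count is taken over the full latent shape and the layer factor is a tuning constant (the names below are illustrative, not the repository's API):

import math
import torch

def estimate_vae_memory(shape, dtype, layer_factor):
    # bytes per element for the given dtype, e.g. 2 for float16, 4 for float32
    bytes_per_element = torch.tensor([], dtype=dtype).element_size()
    # total elements in the tensor times a per-layer activation factor
    return math.prod(shape) * bytes_per_element * layer_factor

# hypothetical usage: a (1, 128, 600) Hunyuan Foley latent encoded in float16
print(estimate_vae_memory((1, 128, 600), torch.float16, layer_factor=25))  # 3840000 bytes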

View File

@@ -66,7 +66,6 @@ class ClapTextEmbeddings(nn.Module):
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=device), persistent=True
         )
 
-        # End copy
         self.padding_idx = pad_token_id
         self.position_embeddings = operations.Embedding(
             max_position_embeddings, hidden_size, padding_idx=self.padding_idx, device=device, dtype=dtype
@@ -145,6 +144,7 @@ class ClapTextSelfAttention(nn.Module):
         value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
 
         query_states, key_states, value_states = [t.contiguous() for t in (query_states, key_states, value_states)]
+        attention_mask = attention_mask.to(query_states.dtype)
         attn_output = optimized_attention(query_states, key_states, value_states, self.num_attention_heads, mask = attention_mask, skip_output_reshape=True, skip_reshape=True)
         attn_output = attn_output.transpose(1, 2).contiguous()
         return attn_output.reshape(*input_shape, -1).contiguous()
@@ -271,16 +271,16 @@ class ClapTextModel(nn.Module):
         attention_mask: Optional[torch.Tensor] = None,
         token_type_ids: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        embeds: Optional[torch.Tensor] = None,
     ):
         if input_ids is not None:
             input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
+        elif embeds is not None:
+            input_shape = embeds.size()[:-1]
 
         batch_size, seq_length = input_shape
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        device = input_ids.device if input_ids is not None else embeds.device
 
         if token_type_ids is None:
             if hasattr(self.embeddings, "token_type_ids"):
@@ -294,7 +294,7 @@ class ClapTextModel(nn.Module):
             input_ids=input_ids,
             position_ids=position_ids,
             token_type_ids=token_type_ids,
-            inputs_embeds=inputs_embeds,
+            inputs_embeds=embeds,
         )
         encoder_outputs = self.encoder(
             embedding_output,
@@ -308,6 +308,10 @@ class ClapTextModel(nn.Module):
 class ClapTextModelWithProjection(nn.Module):
     def __init__(
         self,
+        config,
+        dtype=None,
+        device=None,
+        operations=None,
         hidden_size: int = 768,
         intermediate_size: int = 3072,
         layer_norm_eps: float = 1e-12,
@@ -318,26 +322,30 @@ class ClapTextModelWithProjection(nn.Module):
         type_vocab_size: int = 1,
         vocab_size: int = 50265,
         pad_token_id: int = 1,
-        device=None,
-        dtype=None,
-        operations=None
     ):
         super().__init__()
+        self.num_layers = num_hidden_layers
         self.text_model = ClapTextModel(num_attention_heads, vocab_size, hidden_size, intermediate_size, pad_token_id, max_position_embeddings,
                                         type_vocab_size, layer_norm_eps, num_hidden_layers, device=device, dtype=dtype, operations=operations)
         self.text_projection = ClapProjectionLayer(hidden_size, projection_dim, device=device, dtype=dtype, operations=operations,)
 
+    def get_input_embeddings(self):
+        return self.text_model.embeddings.word_embeddings
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
+        embeds = None,
+        **kwargs
     ):
         text_outputs = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            embeds=embeds
         )
 
         pooled_output = text_outputs[1]
@@ -347,9 +355,10 @@ class ClapTextModelWithProjection(nn.Module):
 class ClapTextEncoderModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        self.dtypes = set([dtype])
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 1}, layer_norm_hidden_state=False, model_class=ClapTextModelWithProjection, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
 class ClapLargeTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clap_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='clap_l', tokenizer_class=AutoTokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='clap_l', tokenizer_class=AutoTokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=1, tokenizer_data=tokenizer_data)

View File

@@ -89,7 +89,7 @@ class VideoFromFile(VideoInput):
                 return stream.width, stream.height
             raise ValueError(f"No video stream found in file '{self.__file}'")
 
-    def get_duration(self, return_frames=False) -> float:
+    def get_duration(self) -> float:
         """
         Returns the duration of the video in seconds.
@@ -100,8 +100,7 @@ class VideoFromFile(VideoInput):
             self.__file.seek(0)
         with av.open(self.__file, mode="r") as container:
             if container.duration is not None:
-                if not return_frames:
-                    return float(container.duration / av.time_base)
+                return float(container.duration / av.time_base)
 
             # Fallback: calculate from frame count and frame rate
             video_stream = next(
@@ -109,8 +108,6 @@ class VideoFromFile(VideoInput):
             )
             if video_stream and video_stream.frames and video_stream.average_rate:
                 length = float(video_stream.frames / video_stream.average_rate)
-                if return_frames:
-                    return length, float(video_stream.frames)
                 return length
 
             # Last resort: decode frames to count them
@@ -122,8 +119,6 @@ class VideoFromFile(VideoInput):
                     frame_count += 1
             if frame_count > 0:
                 length = float(frame_count / video_stream.average_rate)
-                if return_frames:
-                    return length, float(frame_count)
                 return length
 
         raise ValueError(f"Could not determine duration for file '{self.__file}'")

View File

@@ -1,53 +1,60 @@
 import torch
 import comfy.model_management
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
 
-class EmptyLatentHunyuanFoley:
+class EmptyLatentHunyuanFoley(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "length": ("INT", {"default": 12, "min": 1, "max": 15, "tooltip": "The length of the audio. The same length as the video."}),
-                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent audios in the batch."}),
-            },
-            "optional": {"video": ("VIDEO")}
-        }
-
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
-
-    CATEGORY = "latent/audio"
-
-    def generate(self, length, batch_size, video = None):
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyLatentHunyuanFoley",
+            display_name="EmptyLatentHunyuanFoley",
+            category="audio/latent",
+            inputs = [
+                io.Int.Input("length", min = 1, max = 15, default = 12),
+                io.Int.Input("batch_size", min = 1, max = 48_000, default = 1),
+                io.Video.Input("video", optional=True),
+            ],
+            outputs=[io.Latent.Output(display_name="latent")]
+        )
+
+    @classmethod
+    def execute(cls, length, batch_size, video = None):
         if video is not None:
-            _, length = video.get_duration(return_frames = True)
+            length = video.size(0)
             length /= 25
         shape = (batch_size, 128, int(50 * length))
         latent = torch.randn(shape, device=comfy.model_management.intermediate_device())
-        return ({"samples": latent, "type": "hunyuan_foley"}, )
+        return io.NodeOutput({"samples": latent, "type": "hunyuan_foley"}, )
 
-class HunyuanFoleyConditioning:
+class HunyuanFoleyConditioning(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"video_encoding_siglip": ("CONDITIONING",),
-                             "video_encoding_synchformer": ("CONDITIONING",),
-                             "text_encoding": ("CONDITIONING",)
-                             },
-                }
-
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
-    RETURN_NAMES = ("positive", "negative")
-    FUNCTION = "encode"
-    CATEGORY = "conditioning/video_models"
-
-    def encode(self, video_encoding_1, video_encoding_2, text_encoding):
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanFoleyConditioning",
+            display_name="HunyuanFoleyConditioning",
+            category="conditioning/video_models",
+            inputs = [
+                io.Conditioning.Input("video_encoding_1"),
+                io.Conditioning.Input("video_encoding_2"),
+                io.Conditioning.Input("text_encoding"),
+            ],
+            outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
+        )
+
+    @classmethod
+    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
         embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
-        return (positive, negative)
+        return io.NodeOutput(positive, negative)
 
-NODE_CLASS_MAPPINGS = {
-    "HunyuanFoleyConditioning": HunyuanFoleyConditioning,
-    "EmptyLatentHunyuanFoley": EmptyLatentHunyuanFoley,
-}
+class FoleyExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            HunyuanFoleyConditioning,
+            EmptyLatentHunyuanFoley
+        ]
+
+async def comfy_entrypoint() -> FoleyExtension:
+    return FoleyExtension()
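As a quick illustration of the latent-shape arithmetic in EmptyLatentHunyuanFoley above, assuming 25 fps input video and 50 latent frames per second of audio (the constants used in the node); the helper below is a hypothetical standalone version, not ComfyUI API:

import torch

def empty_foley_latent(batch_size, seconds, video_frame_count=None):
    # when a video is supplied, derive the duration from its frame count at 25 fps
    if video_frame_count is not None:
        seconds = video_frame_count / 25
    # 128 latent channels, 50 latent frames per second of audio
    return torch.randn(batch_size, 128, int(50 * seconds))

print(empty_foley_latent(1, 12).shape)                        # torch.Size([1, 128, 600])
print(empty_foley_latent(1, 0, video_frame_count=250).shape)  # 10 s of video -> torch.Size([1, 128, 500])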

View File

@@ -6,7 +6,6 @@ import av
 import torch
 import folder_paths
 import json
-import numpy as np
 from typing import Optional
 from typing_extensions import override
 from fractions import Fraction
@@ -50,15 +49,18 @@ class EncodeVideo(io.ComfyNode):
     @classmethod
     def execute(cls, video, processing_batch_size, step_size, vae = None, clip_vision = None):
-        b, t, c, h, w = video.shape
+        t, c, h, w = video.shape
+        b = 1
         batch_size = b * t
 
-        if vae is None and clip_vision is None:
+        if vae is not None and clip_vision is not None:
             raise ValueError("Must either have vae or clip_vision.")
+        elif vae is None and clip_vision is None:
+            raise ValueError("Can't have VAE and Clip Vision passed at the same time!")
 
         vae = vae if vae is not None else clip_vision
         if hasattr(vae.first_stage_model, "video_encoding"):
-            data, num_segments, output_fn = vae.video_encoding(video, step_size)
+            data, num_segments, output_fn = vae.first_stage_model.video_encoding(video, step_size)
             batch_size = b * num_segments
         else:
            data = video.view(batch_size, c, h, w)
@@ -76,7 +78,7 @@ class EncodeVideo(io.ComfyNode):
         output = torch.cat(outputs)
-        return output_fn(output)
+        return io.NodeOutput(output_fn(output))
 
 class ResampleVideo(io.ComfyNode):
     @classmethod
@@ -87,44 +89,62 @@ class ResampleVideo(io.ComfyNode):
             category="image/video",
             inputs = [
                 io.Video.Input("video"),
-                io.Int.Input("target_fps")
+                io.Int.Input("target_fps", min=1, default=25)
             ],
-            outputs=[io.Image.Output(display_name="images")]
+            outputs=[io.Video.Output(display_name="video")]
         )
 
     @classmethod
-    def execute(cls, container: av.container.InputContainer, target_fps: int):
+    def execute(cls, video, target_fps: int):
         # doesn't support upsampling
-        stream = container.streams.video[0]
-        frames = []
-        src_rate = stream.average_rate or stream.guessed_rate
-        src_fps = float(src_rate) if src_rate else None
-
-        # yield original frames if asked for upsampling or src is unknown
-        if src_fps is None or target_fps > src_fps:
-            for packet in container.demux(stream):
-                for frame in packet.decode():
-                    arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
-                    frames.append(arr)
-            return torch.stack(frames)
-
-        stream.thread_type = "AUTO"
-        next_time = 0.0
-        step = 1.0 / target_fps
-
-        for packet in container.demux(stream):
-            for frame in packet.decode():
-                if frame.time is None:
-                    continue
-                t = frame.time
-                while t >= next_time:
-                    arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
-                    frames.append(arr)
-                    next_time += step
-
-        return torch.stack(frames)
+        with av.open(video.get_stream_source(), mode="r") as container:
+            stream = container.streams.video[0]
+            frames = []
+            src_rate = stream.average_rate or stream.guessed_rate
+            src_fps = float(src_rate) if src_rate else None
+
+            # yield original frames if asked for upsampling or src is unknown
+            if src_fps is None or target_fps > src_fps:
+                for packet in container.demux(stream):
+                    for frame in packet.decode():
+                        arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
+                        frames.append(arr)
+                return torch.stack(frames)
+
+            stream.thread_type = "AUTO"
+            next_time = 0.0
+            step = 1.0 / target_fps
+
+            for packet in container.demux(stream):
+                for frame in packet.decode():
+                    if frame.time is None:
+                        continue
+                    t = frame.time
+                    while t >= next_time:
+                        arr = torch.from_numpy(frame.to_ndarray(format="rgb24")).float() / 255.0
+                        frames.append(arr)
+                        next_time += step
+
+        return io.NodeOutput(torch.stack(frames))
+
+class VideoToImage(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="VideoToImage",
+            category="image/video",
+            display_name = "Video To Images",
+            inputs=[io.Video.Input("video")],
+            outputs=[io.Image.Output("images")]
+        )
+
+    @classmethod
+    def execute(cls, video):
+        with av.open(video.get_stream_source(), mode="r") as container:
+            components = video.get_components_internal(container)
+            images = components.images
+        return io.NodeOutput(images)
 
 class SaveWEBM(io.ComfyNode):
     @classmethod
@@ -325,7 +345,8 @@ class VideoExtension(ComfyExtension):
             GetVideoComponents,
             LoadVideo,
             EncodeVideo,
-            ResampleVideo
+            ResampleVideo,
+            VideoToImage
         ]
 
 async def comfy_entrypoint() -> VideoExtension:
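For reference, a standalone sketch of the frame-skipping resampling loop used by ResampleVideo above, applied to an in-memory (T, H, W, C) frame tensor rather than a PyAV container; the function and argument names are illustrative, not the repository's API:

import torch

def resample_frames(frames, src_fps, target_fps):
    # upsampling is not supported: return the original frames unchanged
    if target_fps >= src_fps:
        return frames
    kept = []
    next_time = 0.0
    step = 1.0 / target_fps
    for i in range(frames.shape[0]):
        t = i / src_fps  # presentation time of this frame
        # keep a frame each time its timestamp crosses the next output tick
        while t >= next_time:
            kept.append(frames[i])
            next_time += step
    return torch.stack(kept)

# e.g. 100 frames at 50 fps resampled to 25 fps -> 50 frames
video = torch.rand(100, 64, 64, 3)
print(resample_frames(video, src_fps=50.0, target_fps=25.0).shape)  # torch.Size([50, 64, 64, 3])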