split EncodeVideo into two nodes

Yousef Rafat 2025-11-20 16:28:48 +02:00
parent 9a732a0226
commit 07cd971992

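The commit replaces the single EncodeVideo node, which took optional `vae` and `clip_vision` inputs and validated the combination at runtime, with a shared `encode_video` helper and two dedicated nodes, EncodeVideoVAE and EncodeVideoCLIP. Both nodes keep the `step_size` and `processing_batch_size` inputs. As an illustration of the stride semantics described in the `step_size` tooltip (a minimal sketch; the actual segmentation lives in each model's `video_encoding` method, and the segment window length is not part of this diff, so `segment_starts` and `window` here are hypothetical):

    def segment_starts(num_frames: int, window: int, step_size: int) -> list[int]:
        # Hypothetical helper, not from the commit: start indices of the
        # overlapping segments implied by the step_size tooltip.
        return list(range(0, max(num_frames - window, 0) + 1, step_size))

    # With step_size=8 on a 32-frame clip and an assumed 16-frame window:
    # segment_starts(32, 16, 8) -> [0, 8, 16], i.e. consecutive segments
    # overlap by 8 frames; a larger step_size gives fewer, less overlapping segments.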

@@ -16,41 +16,7 @@ from comfy_api.latest import ComfyExtension, io, ui
 from comfy.cli_args import args
 import comfy.utils

-class EncodeVideo(io.ComfyNode):
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="EncodeVideo",
-            display_name="Encode Video",
-            category="image/video",
-            description="Encode a video using an image encoder.",
-            inputs=[
-                io.Video.Input("video", tooltip="The video to be encoded."),
-                io.Int.Input(
-                    "processing_batch_size", default=-1, min=-1,
-                    tooltip=(
-                        "Number of frames/segments to process at a time during encoding.\n"
-                        "-1 means process all at once. Smaller values reduce GPU memory usage."
-                    ),
-                ),
-                io.Int.Input("step_size", default=8, min=1, max=32,
-                    tooltip=(
-                        "Stride (in frames) between the start of consecutive segments.\n"
-                        "Smaller step = more overlap and smoother temporal coverage "
-                        "but higher compute cost. Larger step = faster but may miss detail."
-                    ),
-                ),
-                io.Vae.Input("vae", optional=True),
-                io.ClipVision.Input("clip_vision", optional=True),
-            ],
-            outputs=[
-                io.Conditioning.Output(display_name="encoded_video"),
-            ],
-        )
-
-    @classmethod
-    def execute(cls, video, processing_batch_size, step_size, vae = None, clip_vision = None):
+def encode_video(vae, model, video, step_size, processing_batch_size):
     video = video.images
     if not isinstance(video, torch.Tensor):
         video = torch.from_numpy(video)
@@ -60,20 +26,9 @@ class EncodeVideo(io.ComfyNode):
     # channel last
     if rest[-1] in (1, 3, 4) and rest[0] not in (1, 3, 4):
         video = video.permute(0, 3, 1, 2)
-    t, c, h, w = video.shape
-    device = video.device
     b = 1
-    batch_size = b * t
-    if vae is not None and clip_vision is not None:
-        raise ValueError("Must either have vae or clip_vision.")
-    elif vae is None and clip_vision is None:
-        raise ValueError("Can't have VAE and Clip Vision passed at the same time!")
-    model = vae.first_stage_model if vae is not None else clip_vision.model
-    vae = vae if vae is not None else clip_vision
+    t, c, h, w = video.shape
+    batch_size = video.shape[0]
     if hasattr(model, "video_encoding"):
         data, num_segments, output_fn = model.video_encoding(video, step_size)
         batch_size = b * num_segments
@@ -83,14 +38,13 @@ class EncodeVideo(io.ComfyNode):
     if processing_batch_size != -1:
         batch_size = processing_batch_size
     outputs = None
     total = data.shape[0]
     pbar = comfy.utils.ProgressBar(total/batch_size)
     model_dtype = next(model.parameters()).dtype
     with torch.inference_mode():
         for i in range(0, total, batch_size):
-            chunk = data[i : i + batch_size].to(device, non_blocking = True)
+            chunk = data[i : i + batch_size].to(next(model.parameters()).device, non_blocking = True)
             chunk = chunk.to(model_dtype)
             if hasattr(vae, "encode"):
                 try:
@@ -118,7 +72,69 @@ class EncodeVideo(io.ComfyNode):
             torch.cuda.empty_cache()
             pbar.update(1)

-        return io.NodeOutput(output_fn(outputs))
+    return output_fn(outputs)
+
+encode_video_inputs = [
+    io.Video.Input("video", tooltip="The video to be encoded."),
+    io.Int.Input(
+        "processing_batch_size", default=-1, min=-1,
+        tooltip=(
+            "Number of frames/segments to process at a time during encoding.\n"
+            "-1 means process all at once. Smaller values reduce GPU memory usage."
+        ),
+    ),
+    io.Int.Input("step_size", default=8, min=1, max=32,
+        tooltip=(
+            "Stride (in frames) between the start of consecutive segments.\n"
+            "Smaller step = more overlap and smoother temporal coverage "
+            "but higher compute cost. Larger step = faster but may miss detail."
+        ),
+    ),
+]
+
+class EncodeVideoVAE(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EncodeVideoVAE",
+            display_name="Encode Video VAE",
+            category="image/video",
+            description="Encode a video using a VAE.",
+            inputs=[
+                *encode_video_inputs,
+                io.Vae.Input("vae"),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="encoded_video"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video, processing_batch_size, step_size, vae):
+        model = vae.first_stage_model
+        model = model.to(vae.device)
+        return io.NodeOutput(encode_video(vae, model, video, step_size, processing_batch_size))
+
+class EncodeVideoCLIP(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EncodeVideoCLIP",
+            display_name="Encode Video CLIP",
+            category="image/video",
+            description="Encode a video using a CLIP Vision Model.",
+            inputs=[
+                *encode_video_inputs,
+                io.ClipVision.Input("clip_vision"),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="encoded_video"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video, processing_batch_size, step_size, clip_vision):
+        model = clip_vision.model
+        return io.NodeOutput(encode_video(clip_vision, model, video, step_size, processing_batch_size))
+
 class ResampleVideo(io.ComfyNode):
     @classmethod
@@ -373,8 +389,9 @@ class VideoExtension(ComfyExtension):
             CreateVideo,
             GetVideoComponents,
             LoadVideo,
-            EncodeVideo,
             ResampleVideo,
+            EncodeVideoVAE,
+            EncodeVideoCLIP
         ]

 async def comfy_entrypoint() -> VideoExtension:
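A minimal usage sketch of the two nodes after the split (hypothetical wiring; node inputs are normally supplied by the graph executor, and `video`, `vae`, and `clip_vision` below stand in for outputs of the usual loader nodes):

    # Hypothetical direct calls; both execute methods are classmethods per the diff.
    latents = EncodeVideoVAE.execute(
        video=video, processing_batch_size=4, step_size=8, vae=vae)
    embeds = EncodeVideoCLIP.execute(
        video=video, processing_batch_size=4, step_size=8, clip_vision=clip_vision)
    # Each call returns io.NodeOutput wrapping output_fn(outputs); supplying both
    # a VAE and a CLIP Vision model to one node is now impossible by construction,
    # which is what let the old runtime ValueError checks be deleted.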