ComfyUI/comfy_extras/nodes_ar_video.py

"""
ComfyUI nodes for autoregressive video generation (Causal Forcing, Self-Forcing, etc.).
  - EmptyARVideoLatent: create 5D [B, C, T, H, W] video latent tensors
  - SamplerARVideo: SAMPLER for the block-by-block autoregressive denoising loop
  - ARVideoI2V: image-to-video conditioning for AR models (seeds KV cache with start image)
"""

import torch
from typing_extensions import override

import comfy.model_management
import comfy.samplers
import comfy.utils
from comfy_api.latest import ComfyExtension, io


class EmptyARVideoLatent(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="EmptyARVideoLatent",
            category="latent/video",
            inputs=[
                io.Int.Input("width", default=832, min=16, max=8192, step=16),
                io.Int.Input("height", default=480, min=16, max=8192, step=16),
                io.Int.Input("length", default=81, min=1, max=1024, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=64),
            ],
            outputs=[
                io.Latent.Output(display_name="LATENT"),
            ],
        )

    @classmethod
    def execute(cls, width, height, length, batch_size) -> io.NodeOutput:
        lat_t = ((length - 1) // 4) + 1
        latent = torch.zeros(
            [batch_size, 16, lat_t, height // 8, width // 8],
            device=comfy.model_management.intermediate_device(),
        )
        return io.NodeOutput({"samples": latent})


class SamplerARVideo(io.ComfyNode):
    """Sampler for autoregressive video models (Causal Forcing, Self-Forcing).

    All AR-loop parameters are owned by this node so they live in the workflow.
    Add new widgets here as the AR sampler grows new options.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SamplerARVideo",
            display_name="Sampler AR Video",
            category="sampling/custom_sampling/samplers",
            inputs=[
                io.Int.Input(
                    "num_frame_per_block",
                    default=1, min=1, max=64,
                    tooltip="Frames per autoregressive block. 1 = framewise, "
                            "3 = chunkwise. Must match the checkpoint's training mode.",
                ),
            ],
            outputs=[io.Sampler.Output()],
        )

    @classmethod
    def execute(cls, num_frame_per_block) -> io.NodeOutput:
        extra_options = {
            "num_frame_per_block": num_frame_per_block,
        }
        return io.NodeOutput(comfy.samplers.ksampler("ar_video", extra_options))


class ARVideoI2V(io.ComfyNode):
    """Image-to-video setup for AR video models (Causal Forcing, Self-Forcing).

    VAE-encodes the start image and stores it in the model's transformer_options
    so that sample_ar_video can seed the KV cache before denoising.
    Uses the same T2V model checkpoint -- no separate I2V architecture needed.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="ARVideoI2V",
            category="conditioning/video_models",
            inputs=[
                io.Model.Input("model"),
                io.Vae.Input("vae"),
                io.Image.Input("start_image"),
                io.Int.Input("width", default=832, min=16, max=8192, step=16),
                io.Int.Input("height", default=480, min=16, max=8192, step=16),
                io.Int.Input("length", default=81, min=1, max=1024, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=64),
            ],
            outputs=[
                io.Model.Output(display_name="MODEL"),
                io.Latent.Output(display_name="LATENT"),
            ],
        )

    @classmethod
    def execute(cls, model, vae, start_image, width, height, length, batch_size) -> io.NodeOutput:
        start_image = comfy.utils.common_upscale(
            start_image[:1].movedim(-1, 1), width, height, "bilinear", "center"
        ).movedim(1, -1)

        initial_latent = vae.encode(start_image[:, :, :, :3])

        m = model.clone()
        to = m.model_options.setdefault("transformer_options", {})
        ar_cfg = to.setdefault("ar_config", {})
        ar_cfg["initial_latent"] = initial_latent

        lat_t = ((length - 1) // 4) + 1
        latent = torch.zeros(
            [batch_size, 16, lat_t, height // 8, width // 8],
            device=comfy.model_management.intermediate_device(),
        )
        return io.NodeOutput(m, {"samples": latent})


class ARVideoExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            EmptyARVideoLatent,
            SamplerARVideo,
            ARVideoI2V,
        ]


async def comfy_entrypoint() -> ARVideoExtension:
    return ARVideoExtension()