convert nodes_hunyuan.py to V3 schema (#10136)

2026-07-14 02:17:13 +08:00 · 2025-10-13 22:36:26 +03:00 · 2025-10-13 22:36:26 +03:00 · 3dfdcf66b6
commit 3dfdcf66b6
parent 95ca2e56c8
1 changed files with 150 additions and 91 deletions
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@ -2,42 +2,60 @@ import nodes
 import node_helpers
 import torch
 import comfy.model_management
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io


-class CLIPTextEncodeHunyuanDiT:
+class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {
-            "clip": ("CLIP", ),
-            "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
-            "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
-            }}
-    RETURN_TYPES = ("CONDITIONING",)
-    FUNCTION = "encode"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="CLIPTextEncodeHunyuanDiT",
+            category="advanced/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("bert", multiline=True, dynamic_prompts=True),
+                io.String.Input("mt5xl", multiline=True, dynamic_prompts=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(),
+            ],
+        )

-    CATEGORY = "advanced/conditioning"
-
-    def encode(self, clip, bert, mt5xl):
+    @classmethod
+    def execute(cls, clip, bert, mt5xl) -> io.NodeOutput:
        tokens = clip.tokenize(bert)
        tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]

-        return (clip.encode_from_tokens_scheduled(tokens), )
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))

-class EmptyHunyuanLatentVideo:
+    encode = execute  # TODO: remove
+
+
+class EmptyHunyuanLatentVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                              "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                              "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyHunyuanLatentVideo",
+            category="latent/video",
+            inputs=[
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=25, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )

-    CATEGORY = "latent/video"
-
-    def generate(self, width, height, length, batch_size=1):
+    @classmethod
+    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
-        return ({"samples":latent}, )
+        return io.NodeOutput({"samples":latent})
+
+    generate = execute  # TODO: remove
+

 PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
@ -50,45 +68,61 @@ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
 )

-class TextEncodeHunyuanVideo_ImageToVideo:
+class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {
-            "clip": ("CLIP", ),
-            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
-            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
-            "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
-            }}
-    RETURN_TYPES = ("CONDITIONING",)
-    FUNCTION = "encode"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TextEncodeHunyuanVideo_ImageToVideo",
+            category="advanced/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.ClipVisionOutput.Input("clip_vision_output"),
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                io.Int.Input(
+                    "image_interleave",
+                    default=2,
+                    min=1,
+                    max=512,
+                    tooltip="How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.",
+                ),
+            ],
+            outputs=[
+                io.Conditioning.Output(),
+            ],
+        )

-    CATEGORY = "advanced/conditioning"
-
-    def encode(self, clip, clip_vision_output, prompt, image_interleave):
+    @classmethod
+    def execute(cls, clip, clip_vision_output, prompt, image_interleave) -> io.NodeOutput:
        tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
-        return (clip.encode_from_tokens_scheduled(tokens), )
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))

-class HunyuanImageToVideo:
+    encode = execute  # TODO: remove
+
+
+class HunyuanImageToVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "vae": ("VAE", ),
-                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
-                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
-                             "guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], )
-                },
-                "optional": {"start_image": ("IMAGE", ),
-                }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanImageToVideo",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=53, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Combo.Input("guidance_type", options=["v1 (concat)", "v2 (replace)", "custom"]),
+                io.Image.Input("start_image", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )

-    RETURN_TYPES = ("CONDITIONING", "LATENT")
-    RETURN_NAMES = ("positive", "latent")
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/video_models"
-
-    def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
+    @classmethod
+    def execute(cls, positive, vae, width, height, length, batch_size, guidance_type, start_image=None) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}

@ -111,51 +145,76 @@ class HunyuanImageToVideo:
            positive = node_helpers.conditioning_set_values(positive, cond)

        out_latent["samples"] = latent
-        return (positive, out_latent)
+        return io.NodeOutput(positive, out_latent)

-class EmptyHunyuanImageLatent:
+    encode = execute  # TODO: remove
+
+
+class EmptyHunyuanImageLatent(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
-                              "height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyHunyuanImageLatent",
+            category="latent",
+            inputs=[
+                io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
+                io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )

-    CATEGORY = "latent"
-
-    def generate(self, width, height, batch_size=1):
+    @classmethod
+    def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
-        return ({"samples":latent}, )
+        return io.NodeOutput({"samples":latent})

-class HunyuanRefinerLatent:
+    generate = execute  # TODO: remove
+
+
+class HunyuanRefinerLatent(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "latent": ("LATENT", ),
-                             "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
-                             }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanRefinerLatent",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Latent.Input("latent"),
+                io.Float.Input("noise_augmentation", default=0.10, min=0.0, max=1.0, step=0.01),

-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
-    RETURN_NAMES = ("positive", "negative", "latent")
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )

-    FUNCTION = "execute"
-
-    def execute(self, positive, negative, latent, noise_augmentation):
+    @classmethod
+    def execute(cls, positive, negative, latent, noise_augmentation) -> io.NodeOutput:
        latent = latent["samples"]
        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
        out_latent = {}
        out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
-        return (positive, negative, out_latent)
+        return io.NodeOutput(positive, negative, out_latent)


-NODE_CLASS_MAPPINGS = {
-    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
-    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
-    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
-    "HunyuanImageToVideo": HunyuanImageToVideo,
-    "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
-    "HunyuanRefinerLatent": HunyuanRefinerLatent,
-}
+class HunyuanExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            CLIPTextEncodeHunyuanDiT,
+            TextEncodeHunyuanVideo_ImageToVideo,
+            EmptyHunyuanLatentVideo,
+            HunyuanImageToVideo,
+            EmptyHunyuanImageLatent,
+            HunyuanRefinerLatent,
+        ]
+
+
+async def comfy_entrypoint() -> HunyuanExtension:
+    return HunyuanExtension()