mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2025-12-17 01:52:59 +08:00
convert nodes_hunyuan.py to V3 schema (#10136)
This commit is contained in:
parent
95ca2e56c8
commit
3dfdcf66b6
@ -2,42 +2,60 @@ import nodes
|
|||||||
import node_helpers
|
import node_helpers
|
||||||
import torch
|
import torch
|
||||||
import comfy.model_management
|
import comfy.model_management
|
||||||
|
from typing_extensions import override
|
||||||
|
from comfy_api.latest import ComfyExtension, io
|
||||||
|
|
||||||
|
|
||||||
class CLIPTextEncodeHunyuanDiT:
|
class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def define_schema(cls):
|
||||||
return {"required": {
|
return io.Schema(
|
||||||
"clip": ("CLIP", ),
|
node_id="CLIPTextEncodeHunyuanDiT",
|
||||||
"bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
category="advanced/conditioning",
|
||||||
"mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
inputs=[
|
||||||
}}
|
io.Clip.Input("clip"),
|
||||||
RETURN_TYPES = ("CONDITIONING",)
|
io.String.Input("bert", multiline=True, dynamic_prompts=True),
|
||||||
FUNCTION = "encode"
|
io.String.Input("mt5xl", multiline=True, dynamic_prompts=True),
|
||||||
|
],
|
||||||
|
outputs=[
|
||||||
|
io.Conditioning.Output(),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
CATEGORY = "advanced/conditioning"
|
@classmethod
|
||||||
|
def execute(cls, clip, bert, mt5xl) -> io.NodeOutput:
|
||||||
def encode(self, clip, bert, mt5xl):
|
|
||||||
tokens = clip.tokenize(bert)
|
tokens = clip.tokenize(bert)
|
||||||
tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
|
tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
|
||||||
|
|
||||||
return (clip.encode_from_tokens_scheduled(tokens), )
|
return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
|
||||||
|
|
||||||
class EmptyHunyuanLatentVideo:
|
encode = execute # TODO: remove
|
||||||
|
|
||||||
|
|
||||||
|
class EmptyHunyuanLatentVideo(io.ComfyNode):
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def define_schema(cls):
|
||||||
return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
return io.Schema(
|
||||||
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
node_id="EmptyHunyuanLatentVideo",
|
||||||
"length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
|
category="latent/video",
|
||||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
|
inputs=[
|
||||||
RETURN_TYPES = ("LATENT",)
|
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||||
FUNCTION = "generate"
|
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||||
|
io.Int.Input("length", default=25, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||||
|
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||||
|
],
|
||||||
|
outputs=[
|
||||||
|
io.Latent.Output(),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
CATEGORY = "latent/video"
|
@classmethod
|
||||||
|
def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
|
||||||
def generate(self, width, height, length, batch_size=1):
|
|
||||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||||
return ({"samples":latent}, )
|
return io.NodeOutput({"samples":latent})
|
||||||
|
|
||||||
|
generate = execute # TODO: remove
|
||||||
|
|
||||||
|
|
||||||
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
|
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
|
||||||
"<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
|
"<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
|
||||||
@ -50,45 +68,61 @@ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
|
|||||||
"<|start_header_id|>assistant<|end_header_id|>\n\n"
|
"<|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
class TextEncodeHunyuanVideo_ImageToVideo:
|
class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def define_schema(cls):
|
||||||
return {"required": {
|
return io.Schema(
|
||||||
"clip": ("CLIP", ),
|
node_id="TextEncodeHunyuanVideo_ImageToVideo",
|
||||||
"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
|
category="advanced/conditioning",
|
||||||
"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
inputs=[
|
||||||
"image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
|
io.Clip.Input("clip"),
|
||||||
}}
|
io.ClipVisionOutput.Input("clip_vision_output"),
|
||||||
RETURN_TYPES = ("CONDITIONING",)
|
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
|
||||||
FUNCTION = "encode"
|
io.Int.Input(
|
||||||
|
"image_interleave",
|
||||||
|
default=2,
|
||||||
|
min=1,
|
||||||
|
max=512,
|
||||||
|
tooltip="How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
outputs=[
|
||||||
|
io.Conditioning.Output(),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
CATEGORY = "advanced/conditioning"
|
@classmethod
|
||||||
|
def execute(cls, clip, clip_vision_output, prompt, image_interleave) -> io.NodeOutput:
|
||||||
def encode(self, clip, clip_vision_output, prompt, image_interleave):
|
|
||||||
tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
|
tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
|
||||||
return (clip.encode_from_tokens_scheduled(tokens), )
|
return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
|
||||||
|
|
||||||
class HunyuanImageToVideo:
|
encode = execute # TODO: remove
|
||||||
|
|
||||||
|
|
||||||
|
class HunyuanImageToVideo(io.ComfyNode):
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def define_schema(cls):
|
||||||
return {"required": {"positive": ("CONDITIONING", ),
|
return io.Schema(
|
||||||
"vae": ("VAE", ),
|
node_id="HunyuanImageToVideo",
|
||||||
"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
category="conditioning/video_models",
|
||||||
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
inputs=[
|
||||||
"length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
|
io.Conditioning.Input("positive"),
|
||||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
|
io.Vae.Input("vae"),
|
||||||
"guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], )
|
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||||
},
|
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||||
"optional": {"start_image": ("IMAGE", ),
|
io.Int.Input("length", default=53, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||||
}}
|
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||||
|
io.Combo.Input("guidance_type", options=["v1 (concat)", "v2 (replace)", "custom"]),
|
||||||
|
io.Image.Input("start_image", optional=True),
|
||||||
|
],
|
||||||
|
outputs=[
|
||||||
|
io.Conditioning.Output(display_name="positive"),
|
||||||
|
io.Latent.Output(display_name="latent"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
RETURN_TYPES = ("CONDITIONING", "LATENT")
|
@classmethod
|
||||||
RETURN_NAMES = ("positive", "latent")
|
def execute(cls, positive, vae, width, height, length, batch_size, guidance_type, start_image=None) -> io.NodeOutput:
|
||||||
FUNCTION = "encode"
|
|
||||||
|
|
||||||
CATEGORY = "conditioning/video_models"
|
|
||||||
|
|
||||||
def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
|
|
||||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||||
out_latent = {}
|
out_latent = {}
|
||||||
|
|
||||||
@ -111,51 +145,76 @@ class HunyuanImageToVideo:
|
|||||||
positive = node_helpers.conditioning_set_values(positive, cond)
|
positive = node_helpers.conditioning_set_values(positive, cond)
|
||||||
|
|
||||||
out_latent["samples"] = latent
|
out_latent["samples"] = latent
|
||||||
return (positive, out_latent)
|
return io.NodeOutput(positive, out_latent)
|
||||||
|
|
||||||
class EmptyHunyuanImageLatent:
|
encode = execute # TODO: remove
|
||||||
|
|
||||||
|
|
||||||
|
class EmptyHunyuanImageLatent(io.ComfyNode):
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def define_schema(cls):
|
||||||
return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
|
return io.Schema(
|
||||||
"height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
|
node_id="EmptyHunyuanImageLatent",
|
||||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
|
category="latent",
|
||||||
RETURN_TYPES = ("LATENT",)
|
inputs=[
|
||||||
FUNCTION = "generate"
|
io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||||
|
io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||||
|
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||||
|
],
|
||||||
|
outputs=[
|
||||||
|
io.Latent.Output(),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
CATEGORY = "latent"
|
@classmethod
|
||||||
|
def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
|
||||||
def generate(self, width, height, batch_size=1):
|
|
||||||
latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
||||||
return ({"samples":latent}, )
|
return io.NodeOutput({"samples":latent})
|
||||||
|
|
||||||
class HunyuanRefinerLatent:
|
generate = execute # TODO: remove
|
||||||
|
|
||||||
|
|
||||||
|
class HunyuanRefinerLatent(io.ComfyNode):
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def define_schema(cls):
|
||||||
return {"required": {"positive": ("CONDITIONING", ),
|
return io.Schema(
|
||||||
"negative": ("CONDITIONING", ),
|
node_id="HunyuanRefinerLatent",
|
||||||
"latent": ("LATENT", ),
|
inputs=[
|
||||||
"noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
|
io.Conditioning.Input("positive"),
|
||||||
}}
|
io.Conditioning.Input("negative"),
|
||||||
|
io.Latent.Input("latent"),
|
||||||
|
io.Float.Input("noise_augmentation", default=0.10, min=0.0, max=1.0, step=0.01),
|
||||||
|
|
||||||
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
|
],
|
||||||
RETURN_NAMES = ("positive", "negative", "latent")
|
outputs=[
|
||||||
|
io.Conditioning.Output(display_name="positive"),
|
||||||
|
io.Conditioning.Output(display_name="negative"),
|
||||||
|
io.Latent.Output(display_name="latent"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
FUNCTION = "execute"
|
@classmethod
|
||||||
|
def execute(cls, positive, negative, latent, noise_augmentation) -> io.NodeOutput:
|
||||||
def execute(self, positive, negative, latent, noise_augmentation):
|
|
||||||
latent = latent["samples"]
|
latent = latent["samples"]
|
||||||
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
|
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
|
||||||
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
|
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
|
||||||
out_latent = {}
|
out_latent = {}
|
||||||
out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
|
out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
|
||||||
return (positive, negative, out_latent)
|
return io.NodeOutput(positive, negative, out_latent)
|
||||||
|
|
||||||
|
|
||||||
NODE_CLASS_MAPPINGS = {
|
class HunyuanExtension(ComfyExtension):
|
||||||
"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
|
@override
|
||||||
"TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
|
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||||
"EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
|
return [
|
||||||
"HunyuanImageToVideo": HunyuanImageToVideo,
|
CLIPTextEncodeHunyuanDiT,
|
||||||
"EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
|
TextEncodeHunyuanVideo_ImageToVideo,
|
||||||
"HunyuanRefinerLatent": HunyuanRefinerLatent,
|
EmptyHunyuanLatentVideo,
|
||||||
}
|
HunyuanImageToVideo,
|
||||||
|
EmptyHunyuanImageLatent,
|
||||||
|
HunyuanRefinerLatent,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
async def comfy_entrypoint() -> HunyuanExtension:
|
||||||
|
return HunyuanExtension()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user