mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-20 07:22:34 +08:00
Add HY-OmniWeave support for HunyuanVideo 1.5
This commit is contained in:
parent
f21f6b2212
commit
6447250bd6
64
comfy/sd.py
64
comfy/sd.py
@ -1267,6 +1267,13 @@ def detect_te_model(sd):
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
# Qwen-VL checkpoints can be saved under model.language_model.* (e.g. HY-OmniWeave text encoder).
|
||||
if 'model.language_model.layers.0.self_attn.k_proj.bias' in sd:
|
||||
weight = sd['model.language_model.layers.0.self_attn.k_proj.bias']
|
||||
if weight.shape[0] == 256:
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
if "model.language_model.layers.0.linear_attn.A_log" in sd and "model.language_model.layers.0.input_layernorm.weight" in sd:
|
||||
weight = sd['model.language_model.layers.0.input_layernorm.weight']
|
||||
if weight.shape[0] == 1024:
|
||||
@ -1310,7 +1317,11 @@ def t5xxl_detect(clip_data):
|
||||
return {}
|
||||
|
||||
def llama_detect(clip_data):
|
||||
weight_names = ["model.layers.0.self_attn.k_proj.weight", "model.layers.0.linear_attn.in_proj_a.weight"]
|
||||
weight_names = [
|
||||
"model.layers.0.self_attn.k_proj.weight",
|
||||
"model.layers.0.linear_attn.in_proj_a.weight",
|
||||
"model.language_model.layers.0.self_attn.k_proj.weight",
|
||||
]
|
||||
|
||||
for sd in clip_data:
|
||||
for weight_name in weight_names:
|
||||
@ -1414,7 +1425,23 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer
|
||||
elif te_model == TEModel.QWEN25_7B:
|
||||
if clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
# Some Qwen2.5-VL checkpoints (including HY-OmniWeave's text encoder)
|
||||
# are saved with "model.language_model.*" and "model.visual.*" prefixes.
|
||||
# Normalize keys to the layout expected by Comfy text encoder wrappers.
|
||||
for i, sd in enumerate(clip_data):
|
||||
if "model.language_model.layers.0.self_attn.k_proj.weight" in sd:
|
||||
clip_data[i] = comfy.utils.state_dict_prefix_replace(
|
||||
sd,
|
||||
{
|
||||
"model.language_model.": "model.",
|
||||
"model.visual.": "visual.",
|
||||
"final_layer_norm.": "model.norm.",
|
||||
},
|
||||
)
|
||||
if clip_type == CLIPType.HUNYUAN_VIDEO_15:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
|
||||
elif clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
|
||||
elif clip_type == CLIPType.LONGCAT_IMAGE:
|
||||
@ -1748,6 +1775,39 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
# HY-OmniWeave checkpoints store double-block attention as split q/k/v tensors
|
||||
# while Comfy's HunyuanVideo implementation expects merged qkv tensors.
|
||||
if "double_blocks.0.img_attn_q.weight" in sd and "double_blocks.0.img_attn.qkv.weight" not in sd:
|
||||
converted_qkv = 0
|
||||
block_indices = set()
|
||||
for k in list(sd.keys()):
|
||||
if not k.startswith("double_blocks."):
|
||||
continue
|
||||
parts = k.split(".")
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
if parts[2] == "img_attn_q":
|
||||
try:
|
||||
block_indices.add(int(parts[1]))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for idx in sorted(block_indices):
|
||||
for attn_prefix in ("img_attn", "txt_attn"):
|
||||
for end in ("weight", "bias"):
|
||||
q_key = f"double_blocks.{idx}.{attn_prefix}_q.{end}"
|
||||
k_key = f"double_blocks.{idx}.{attn_prefix}_k.{end}"
|
||||
v_key = f"double_blocks.{idx}.{attn_prefix}_v.{end}"
|
||||
qkv_key = f"double_blocks.{idx}.{attn_prefix}.qkv.{end}"
|
||||
if qkv_key in sd:
|
||||
continue
|
||||
if q_key in sd and k_key in sd and v_key in sd:
|
||||
sd[qkv_key] = torch.cat((sd.pop(q_key), sd.pop(k_key), sd.pop(v_key)), dim=0)
|
||||
converted_qkv += 1
|
||||
|
||||
if converted_qkv > 0:
|
||||
logging.info(f"Converted {converted_qkv} split HunyuanVideo attention tensors to qkv format.")
|
||||
|
||||
parameters = comfy.utils.calculate_parameters(sd)
|
||||
weight_dtype = comfy.utils.weight_dtype(sd)
|
||||
|
||||
|
||||
@ -2,6 +2,8 @@ import nodes
|
||||
import node_helpers
|
||||
import torch
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
import comfy.clip_vision
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel
|
||||
@ -301,6 +303,246 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
|
||||
encode = execute # TODO: remove
|
||||
|
||||
|
||||
class TextEncodeHunyuanVideo15Omni(io.ComfyNode):
    """Encode a prompt for HunyuanVideo 1.5 omni tasks, optionally injecting projected image embeds.

    The node builds a task-specific ChatML-style template, pulls per-image
    `mm_projected` embeddings from an optional CLIP vision output, and encodes
    the tokens with the provided CLIP model.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="TextEncodeHunyuanVideo15Omni",
            display_name="Text Encode HunyuanVideo 15 Omni",
            category="advanced/conditioning",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
                io.Boolean.Input("use_visual_inputs", default=True, advanced=True),
                io.Int.Input("max_visual_inputs", default=8, min=1, max=64, advanced=True),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(),
            ],
        )

    @staticmethod
    def _task_system_prompt(task: str) -> str:
        """Return the system prompt for `task`; unknown tasks fall back to t2v."""
        prompts = {
            "t2v": "Describe a high-quality target video from the user's request with concrete scene details, motion, camera behavior, and style.",
            "i2v": "Describe a target video that should stay consistent with the provided reference image while following the user's request.",
            "interpolation": "Describe a target video that smoothly transitions between the provided keyframe images while following the user's request.",
            "reference2v": "Describe a target video that composes the provided reference subjects into a coherent scene following the user's request.",
            "editing": "Describe an edited output video that follows the user's instruction while preserving relevant source video content.",
            "tiv2v": "Describe an edited output video using both the provided source video and reference image guidance according to the user's instruction.",
        }
        if task in prompts:
            return prompts[task]
        return prompts["t2v"]

    @classmethod
    def _build_template(cls, task: str, image_count: int) -> str:
        """Assemble the llama chat template; "{}" is later filled with the user prompt."""
        vision_block = "<|vision_start|><|image_pad|><|vision_end|>\n" * image_count
        pieces = [
            "<|im_start|>system\n",
            cls._task_system_prompt(task),
            "<|im_end|>\n",
            "<|im_start|>user\n",
            vision_block,
            "{}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ]
        return "".join(pieces)

    @staticmethod
    def _extract_image_embeds(clip_vision_output, max_visual_inputs: int):
        """Return a (possibly empty) list of per-image `mm_projected` embeddings."""
        if clip_vision_output is None:
            return []
        projected = getattr(clip_vision_output, "mm_projected", None)
        if projected is None:
            return []
        if projected.ndim == 2:
            # Single image stored without a leading batch dimension.
            return [projected]
        embeds = []
        for i in range(min(projected.shape[0], max_visual_inputs)):
            embeds.append(projected[i])
        return embeds

    @classmethod
    def execute(cls, clip, prompt, task, use_visual_inputs, max_visual_inputs, clip_vision_output=None) -> io.NodeOutput:
        visual_embeds = []
        if use_visual_inputs:
            visual_embeds = cls._extract_image_embeds(clip_vision_output, max_visual_inputs)
        template = cls._build_template(task, len(visual_embeds))

        # HunyuanVideo 1.5 tokenizers use `images=...`; HunyuanVideo 1.0 uses `image_embeds=...`.
        try:
            tokens = clip.tokenize(prompt, llama_template=template, images=visual_embeds)
        except TypeError:
            stacked = torch.stack(visual_embeds, dim=0) if len(visual_embeds) > 0 else None
            tokens = clip.tokenize(prompt, llama_template=template, image_embeds=stacked, image_interleave=1)
        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))

    encode = execute  # TODO: remove
|
||||
|
||||
|
||||
class HunyuanClipVisionOutputConcat(io.ComfyNode):
    """Merge up to four CLIP vision outputs into one.

    Tensor attributes are concatenated along dim 0; `image_sizes` metadata
    lists are extended. Missing inputs are simply skipped.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="HunyuanClipVisionOutputConcat",
            display_name="Hunyuan CLIP Vision Output Concat",
            category="conditioning/video_models",
            inputs=[
                io.ClipVisionOutput.Input("clip_vision_output_1"),
                io.ClipVisionOutput.Input("clip_vision_output_2", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output_3", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output_4", optional=True),
            ],
            outputs=[
                io.ClipVisionOutput.Output(),
            ],
        )

    @classmethod
    def execute(cls, clip_vision_output_1, clip_vision_output_2=None, clip_vision_output_3=None, clip_vision_output_4=None) -> io.NodeOutput:
        outputs = [o for o in (clip_vision_output_1, clip_vision_output_2, clip_vision_output_3, clip_vision_output_4) if o is not None]
        merged = comfy.clip_vision.Output()
        tensor_attrs = ["last_hidden_state", "image_embeds", "penultimate_hidden_states", "all_hidden_states", "mm_projected"]
        for attr in tensor_attrs:
            # Keep only actual tensors: an output may carry the attribute set to None
            # (or another non-tensor value), which previously ended up in the list and
            # crashed torch.cat when mixed with real tensors from the other outputs.
            values = [v for v in (getattr(o, attr, None) for o in outputs) if torch.is_tensor(v)]
            if len(values) > 0:
                setattr(merged, attr, torch.cat(values, dim=0))

        # image_sizes is list-like metadata, so extend rather than concatenate.
        image_sizes = []
        for o in outputs:
            if hasattr(o, "image_sizes"):
                image_sizes.extend(getattr(o, "image_sizes"))
        if len(image_sizes) > 0:
            merged.image_sizes = image_sizes
        return io.NodeOutput(merged)
|
||||
|
||||
|
||||
class HunyuanVideo15OmniConditioning(io.ComfyNode):
    """Build concat-latent conditioning for HunyuanVideo 1.5 omni tasks.

    Depending on `task`, reference images and/or a condition video are
    VAE-encoded into a conditioning latent plus a per-latent-frame mask,
    which are attached to both the positive and negative conditioning.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="HunyuanVideo15OmniConditioning",
            display_name="HunyuanVideo 15 Omni Conditioning",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Image.Input("reference_images", optional=True, tooltip="For i2v/interpolation/reference2v/tiv2v."),
                io.Image.Input("condition_video", optional=True, tooltip="For editing/tiv2v."),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @staticmethod
    def _latent_length(length: int) -> int:
        """Pixel-frame count -> latent-frame count (4x temporal compression plus the first frame)."""
        return ((length - 1) // 4) + 1

    @staticmethod
    def _upscale_frames(frames: torch.Tensor, width: int, height: int):
        """Bilinear/center resize; common_upscale wants NCHW, node images are NHWC."""
        nchw = frames.movedim(-1, 1)
        resized = comfy.utils.common_upscale(nchw, width, height, "bilinear", "center")
        return resized.movedim(1, -1)

    @classmethod
    def _encode_single_image(cls, vae, image: torch.Tensor, width: int, height: int):
        """VAE-encode only the first frame of `image`, dropping any alpha channel."""
        frame = cls._upscale_frames(image[:1], width, height)
        return vae.encode(frame[:, :, :, :3])

    @classmethod
    def _encode_video(cls, vae, video: torch.Tensor, width: int, height: int, length: int):
        """VAE-encode at most `length` frames of `video`, dropping any alpha channel."""
        frames = cls._upscale_frames(video[:length], width, height)
        return vae.encode(frames[:, :, :, :3])

    @staticmethod
    def _assign_frame(target: torch.Tensor, source: torch.Tensor, frame_idx: int):
        """Copy the first latent frame of `source` into `target` at `frame_idx`; out-of-range indices are ignored."""
        if 0 <= frame_idx < target.shape[2]:
            target[:, :, frame_idx:frame_idx + 1] = source[:, :, :1]

    @classmethod
    def execute(cls, positive, negative, vae, task, width, height, length, batch_size, reference_images=None, condition_video=None, clip_vision_output=None) -> io.NodeOutput:
        latent_length = cls._latent_length(length)
        latent = torch.zeros([batch_size, 32, latent_length, height // 16, width // 16], device=comfy.model_management.intermediate_device())

        if task == "t2v":
            # Pure text-to-video: no concat latent; just forward the optional CLIP vision output.
            if clip_vision_output is not None:
                positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
                negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
            return io.NodeOutput(positive, negative, {"samples": latent})

        cond_latent = torch.zeros_like(latent[:1])
        # omni_mask marks which latent frames carry conditioning content (1.0 = conditioned).
        omni_mask = torch.zeros((latent_length,), device=cond_latent.device, dtype=cond_latent.dtype)

        if task == "i2v":
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task i2v requires at least one reference image.")
            first_latent = cls._encode_single_image(vae, reference_images, width, height)
            cls._assign_frame(cond_latent, first_latent, 0)
            omni_mask[0] = 1.0

        elif task == "interpolation":
            if reference_images is None or reference_images.shape[0] < 2:
                raise ValueError("Task interpolation requires at least two reference images.")
            start_latent = cls._encode_single_image(vae, reference_images[:1], width, height)
            end_latent = cls._encode_single_image(vae, reference_images[-1:], width, height)
            cls._assign_frame(cond_latent, start_latent, 0)
            cls._assign_frame(cond_latent, end_latent, latent_length - 1)
            omni_mask[0] = 1.0
            omni_mask[-1] = 1.0

        elif task == "reference2v":
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task reference2v requires at least one reference image.")
            # References occupy frames 1..latent_length-1; frame 0 is left free for generation.
            num_refs = min(reference_images.shape[0], max(1, latent_length - 1))
            for ref in range(num_refs):
                ref_latent = cls._encode_single_image(vae, reference_images[ref:ref + 1], width, height)
                slot = min(ref + 1, latent_length - 1)
                cls._assign_frame(cond_latent, ref_latent, slot)
                omni_mask[slot] = 1.0

        elif task == "editing":
            if condition_video is None or condition_video.shape[0] < 1:
                raise ValueError("Task editing requires condition_video.")
            video_latent = cls._encode_video(vae, condition_video, width, height, length)
            valid = min(latent_length, video_latent.shape[2])
            cond_latent[:, :, :valid] = video_latent[:, :, :valid]
            omni_mask[:valid] = 1.0

        elif task == "tiv2v":
            if condition_video is None or condition_video.shape[0] < 1:
                raise ValueError("Task tiv2v requires condition_video.")
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task tiv2v requires at least one reference image.")
            video_latent = cls._encode_video(vae, condition_video, width, height, length)
            valid = min(latent_length, video_latent.shape[2])
            cond_latent[:, :, :valid] = video_latent[:, :, :valid]
            omni_mask[:valid] = 1.0

            # Add (not overwrite) the reference latent on top of the video latent at the chosen slot.
            ref_latent = cls._encode_single_image(vae, reference_images[:1], width, height)
            slot = 1 if latent_length > 1 else 0
            cond_latent[:, :, slot:slot + 1] += ref_latent[:, :, :1]
            omni_mask[slot] += 1.0

        cond_latent = comfy.utils.resize_to_batch_size(cond_latent, batch_size)
        # BaseModel/HunyuanVideo15 inverts concat_mask (mask = 1 - concat_mask), so pass the pre-inverted mask.
        concat_mask = (1.0 - omni_mask).view(1, 1, latent_length, 1, 1).expand(cond_latent.shape[0], 1, latent_length, cond_latent.shape[-2], cond_latent.shape[-1]).to(cond_latent.dtype)

        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        return io.NodeOutput(positive, negative, {"samples": latent})
|
||||
|
||||
|
||||
class HunyuanImageToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@ -411,9 +653,12 @@ class HunyuanExtension(ComfyExtension):
|
||||
return [
|
||||
CLIPTextEncodeHunyuanDiT,
|
||||
TextEncodeHunyuanVideo_ImageToVideo,
|
||||
TextEncodeHunyuanVideo15Omni,
|
||||
HunyuanClipVisionOutputConcat,
|
||||
EmptyHunyuanLatentVideo,
|
||||
EmptyHunyuanVideo15Latent,
|
||||
HunyuanVideo15ImageToVideo,
|
||||
HunyuanVideo15OmniConditioning,
|
||||
HunyuanVideo15SuperResolution,
|
||||
HunyuanVideo15LatentUpscaleWithModel,
|
||||
LatentUpscaleModelLoader,
|
||||
|
||||
4
nodes.py
4
nodes.py
@ -977,7 +977,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "hunyuan_video_15", "flux2", "ovis", "longcat_image"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -987,7 +987,7 @@ class CLIPLoader:
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\nhidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nhunyuan_video_15: qwen2.5-vl (single-file fallback without byT5)"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user