mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-20 07:22:34 +08:00
Add HY-OmniWeave support for HunyuanVideo 1.5
This commit is contained in:
parent
f21f6b2212
commit
6447250bd6
64
comfy/sd.py
64
comfy/sd.py
@ -1267,6 +1267,13 @@ def detect_te_model(sd):
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
# Qwen-VL checkpoints can be saved under model.language_model.* (e.g. HY-OmniWeave text encoder).
|
||||
if 'model.language_model.layers.0.self_attn.k_proj.bias' in sd:
|
||||
weight = sd['model.language_model.layers.0.self_attn.k_proj.bias']
|
||||
if weight.shape[0] == 256:
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
if "model.language_model.layers.0.linear_attn.A_log" in sd and "model.language_model.layers.0.input_layernorm.weight" in sd:
|
||||
weight = sd['model.language_model.layers.0.input_layernorm.weight']
|
||||
if weight.shape[0] == 1024:
|
||||
@ -1310,7 +1317,11 @@ def t5xxl_detect(clip_data):
|
||||
return {}
|
||||
|
||||
def llama_detect(clip_data):
|
||||
weight_names = ["model.layers.0.self_attn.k_proj.weight", "model.layers.0.linear_attn.in_proj_a.weight"]
|
||||
weight_names = [
|
||||
"model.layers.0.self_attn.k_proj.weight",
|
||||
"model.layers.0.linear_attn.in_proj_a.weight",
|
||||
"model.language_model.layers.0.self_attn.k_proj.weight",
|
||||
]
|
||||
|
||||
for sd in clip_data:
|
||||
for weight_name in weight_names:
|
||||
@ -1414,7 +1425,23 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer
|
||||
elif te_model == TEModel.QWEN25_7B:
|
||||
if clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
# Some Qwen2.5-VL checkpoints (including HY-OmniWeave's text encoder)
|
||||
# are saved with "model.language_model.*" and "model.visual.*" prefixes.
|
||||
# Normalize keys to the layout expected by Comfy text encoder wrappers.
|
||||
for i, sd in enumerate(clip_data):
|
||||
if "model.language_model.layers.0.self_attn.k_proj.weight" in sd:
|
||||
clip_data[i] = comfy.utils.state_dict_prefix_replace(
|
||||
sd,
|
||||
{
|
||||
"model.language_model.": "model.",
|
||||
"model.visual.": "visual.",
|
||||
"final_layer_norm.": "model.norm.",
|
||||
},
|
||||
)
|
||||
if clip_type == CLIPType.HUNYUAN_VIDEO_15:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
|
||||
elif clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
|
||||
elif clip_type == CLIPType.LONGCAT_IMAGE:
|
||||
@ -1748,6 +1775,39 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
# HY-OmniWeave checkpoints store double-block attention as split q/k/v tensors
|
||||
# while Comfy's HunyuanVideo implementation expects merged qkv tensors.
|
||||
if "double_blocks.0.img_attn_q.weight" in sd and "double_blocks.0.img_attn.qkv.weight" not in sd:
|
||||
converted_qkv = 0
|
||||
block_indices = set()
|
||||
for k in list(sd.keys()):
|
||||
if not k.startswith("double_blocks."):
|
||||
continue
|
||||
parts = k.split(".")
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
if parts[2] == "img_attn_q":
|
||||
try:
|
||||
block_indices.add(int(parts[1]))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for idx in sorted(block_indices):
|
||||
for attn_prefix in ("img_attn", "txt_attn"):
|
||||
for end in ("weight", "bias"):
|
||||
q_key = f"double_blocks.{idx}.{attn_prefix}_q.{end}"
|
||||
k_key = f"double_blocks.{idx}.{attn_prefix}_k.{end}"
|
||||
v_key = f"double_blocks.{idx}.{attn_prefix}_v.{end}"
|
||||
qkv_key = f"double_blocks.{idx}.{attn_prefix}.qkv.{end}"
|
||||
if qkv_key in sd:
|
||||
continue
|
||||
if q_key in sd and k_key in sd and v_key in sd:
|
||||
sd[qkv_key] = torch.cat((sd.pop(q_key), sd.pop(k_key), sd.pop(v_key)), dim=0)
|
||||
converted_qkv += 1
|
||||
|
||||
if converted_qkv > 0:
|
||||
logging.info(f"Converted {converted_qkv} split HunyuanVideo attention tensors to qkv format.")
|
||||
|
||||
parameters = comfy.utils.calculate_parameters(sd)
|
||||
weight_dtype = comfy.utils.weight_dtype(sd)
|
||||
|
||||
|
||||
@ -2,6 +2,8 @@ import nodes
|
||||
import node_helpers
|
||||
import torch
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
import comfy.clip_vision
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel
|
||||
@ -301,6 +303,246 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
|
||||
encode = execute # TODO: remove
|
||||
|
||||
|
||||
class TextEncodeHunyuanVideo15Omni(io.ComfyNode):
    """Encode a prompt for HunyuanVideo 1.5 omni tasks, optionally injecting projected image embeds.

    The node builds a task-specific ChatML-style template, pulls per-image
    `mm_projected` embeddings from an optional CLIP vision output, and encodes
    the tokens with the provided CLIP model.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="TextEncodeHunyuanVideo15Omni",
            display_name="Text Encode HunyuanVideo 15 Omni",
            category="advanced/conditioning",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
                io.Boolean.Input("use_visual_inputs", default=True, advanced=True),
                io.Int.Input("max_visual_inputs", default=8, min=1, max=64, advanced=True),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(),
            ],
        )

    @staticmethod
    def _task_system_prompt(task: str) -> str:
        """Return the system prompt for `task`; unknown tasks fall back to t2v."""
        prompts = {
            "t2v": "Describe a high-quality target video from the user's request with concrete scene details, motion, camera behavior, and style.",
            "i2v": "Describe a target video that should stay consistent with the provided reference image while following the user's request.",
            "interpolation": "Describe a target video that smoothly transitions between the provided keyframe images while following the user's request.",
            "reference2v": "Describe a target video that composes the provided reference subjects into a coherent scene following the user's request.",
            "editing": "Describe an edited output video that follows the user's instruction while preserving relevant source video content.",
            "tiv2v": "Describe an edited output video using both the provided source video and reference image guidance according to the user's instruction.",
        }
        if task in prompts:
            return prompts[task]
        return prompts["t2v"]

    @classmethod
    def _build_template(cls, task: str, image_count: int) -> str:
        """Assemble the llama chat template; "{}" is later filled with the user prompt."""
        vision_block = "<|vision_start|><|image_pad|><|vision_end|>\n" * image_count
        pieces = [
            "<|im_start|>system\n",
            cls._task_system_prompt(task),
            "<|im_end|>\n",
            "<|im_start|>user\n",
            vision_block,
            "{}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ]
        return "".join(pieces)

    @staticmethod
    def _extract_image_embeds(clip_vision_output, max_visual_inputs: int):
        """Return a (possibly empty) list of per-image `mm_projected` embeddings."""
        if clip_vision_output is None:
            return []
        projected = getattr(clip_vision_output, "mm_projected", None)
        if projected is None:
            return []
        if projected.ndim == 2:
            # Single image stored without a leading batch dimension.
            return [projected]
        embeds = []
        for i in range(min(projected.shape[0], max_visual_inputs)):
            embeds.append(projected[i])
        return embeds

    @classmethod
    def execute(cls, clip, prompt, task, use_visual_inputs, max_visual_inputs, clip_vision_output=None) -> io.NodeOutput:
        visual_embeds = []
        if use_visual_inputs:
            visual_embeds = cls._extract_image_embeds(clip_vision_output, max_visual_inputs)
        template = cls._build_template(task, len(visual_embeds))

        # HunyuanVideo 1.5 tokenizers use `images=...`; HunyuanVideo 1.0 uses `image_embeds=...`.
        try:
            tokens = clip.tokenize(prompt, llama_template=template, images=visual_embeds)
        except TypeError:
            stacked = torch.stack(visual_embeds, dim=0) if len(visual_embeds) > 0 else None
            tokens = clip.tokenize(prompt, llama_template=template, image_embeds=stacked, image_interleave=1)
        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))

    encode = execute  # TODO: remove
|
||||
|
||||
|
||||
class HunyuanClipVisionOutputConcat(io.ComfyNode):
    """Merge up to four CLIP vision outputs into one.

    Tensor attributes are concatenated along dim 0; `image_sizes` metadata
    lists are extended. Missing inputs are simply skipped.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="HunyuanClipVisionOutputConcat",
            display_name="Hunyuan CLIP Vision Output Concat",
            category="conditioning/video_models",
            inputs=[
                io.ClipVisionOutput.Input("clip_vision_output_1"),
                io.ClipVisionOutput.Input("clip_vision_output_2", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output_3", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output_4", optional=True),
            ],
            outputs=[
                io.ClipVisionOutput.Output(),
            ],
        )

    @classmethod
    def execute(cls, clip_vision_output_1, clip_vision_output_2=None, clip_vision_output_3=None, clip_vision_output_4=None) -> io.NodeOutput:
        outputs = [o for o in (clip_vision_output_1, clip_vision_output_2, clip_vision_output_3, clip_vision_output_4) if o is not None]
        merged = comfy.clip_vision.Output()
        tensor_attrs = ["last_hidden_state", "image_embeds", "penultimate_hidden_states", "all_hidden_states", "mm_projected"]
        for attr in tensor_attrs:
            # Keep only actual tensors: an output may carry the attribute set to None
            # (or another non-tensor value), which previously ended up in the list and
            # crashed torch.cat when mixed with real tensors from the other outputs.
            values = [v for v in (getattr(o, attr, None) for o in outputs) if torch.is_tensor(v)]
            if len(values) > 0:
                setattr(merged, attr, torch.cat(values, dim=0))

        # image_sizes is list-like metadata, so extend rather than concatenate.
        image_sizes = []
        for o in outputs:
            if hasattr(o, "image_sizes"):
                image_sizes.extend(getattr(o, "image_sizes"))
        if len(image_sizes) > 0:
            merged.image_sizes = image_sizes
        return io.NodeOutput(merged)
|
||||
|
||||
|
||||
class HunyuanVideo15OmniConditioning(io.ComfyNode):
    """Build concat-latent conditioning for HunyuanVideo 1.5 omni tasks.

    Depending on `task`, reference images and/or a condition video are
    VAE-encoded into a conditioning latent plus a per-latent-frame mask,
    which are attached to both the positive and negative conditioning.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="HunyuanVideo15OmniConditioning",
            display_name="HunyuanVideo 15 Omni Conditioning",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Image.Input("reference_images", optional=True, tooltip="For i2v/interpolation/reference2v/tiv2v."),
                io.Image.Input("condition_video", optional=True, tooltip="For editing/tiv2v."),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @staticmethod
    def _latent_length(length: int) -> int:
        """Pixel-frame count -> latent-frame count (4x temporal compression plus the first frame)."""
        return ((length - 1) // 4) + 1

    @staticmethod
    def _upscale_frames(frames: torch.Tensor, width: int, height: int):
        """Bilinear/center resize; common_upscale wants NCHW, node images are NHWC."""
        nchw = frames.movedim(-1, 1)
        resized = comfy.utils.common_upscale(nchw, width, height, "bilinear", "center")
        return resized.movedim(1, -1)

    @classmethod
    def _encode_single_image(cls, vae, image: torch.Tensor, width: int, height: int):
        """VAE-encode only the first frame of `image`, dropping any alpha channel."""
        frame = cls._upscale_frames(image[:1], width, height)
        return vae.encode(frame[:, :, :, :3])

    @classmethod
    def _encode_video(cls, vae, video: torch.Tensor, width: int, height: int, length: int):
        """VAE-encode at most `length` frames of `video`, dropping any alpha channel."""
        frames = cls._upscale_frames(video[:length], width, height)
        return vae.encode(frames[:, :, :, :3])

    @staticmethod
    def _assign_frame(target: torch.Tensor, source: torch.Tensor, frame_idx: int):
        """Copy the first latent frame of `source` into `target` at `frame_idx`; out-of-range indices are ignored."""
        if 0 <= frame_idx < target.shape[2]:
            target[:, :, frame_idx:frame_idx + 1] = source[:, :, :1]

    @classmethod
    def execute(cls, positive, negative, vae, task, width, height, length, batch_size, reference_images=None, condition_video=None, clip_vision_output=None) -> io.NodeOutput:
        latent_length = cls._latent_length(length)
        latent = torch.zeros([batch_size, 32, latent_length, height // 16, width // 16], device=comfy.model_management.intermediate_device())

        if task == "t2v":
            # Pure text-to-video: no concat latent; just forward the optional CLIP vision output.
            if clip_vision_output is not None:
                positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
                negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
            return io.NodeOutput(positive, negative, {"samples": latent})

        cond_latent = torch.zeros_like(latent[:1])
        # omni_mask marks which latent frames carry conditioning content (1.0 = conditioned).
        omni_mask = torch.zeros((latent_length,), device=cond_latent.device, dtype=cond_latent.dtype)

        if task == "i2v":
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task i2v requires at least one reference image.")
            first_latent = cls._encode_single_image(vae, reference_images, width, height)
            cls._assign_frame(cond_latent, first_latent, 0)
            omni_mask[0] = 1.0

        elif task == "interpolation":
            if reference_images is None or reference_images.shape[0] < 2:
                raise ValueError("Task interpolation requires at least two reference images.")
            start_latent = cls._encode_single_image(vae, reference_images[:1], width, height)
            end_latent = cls._encode_single_image(vae, reference_images[-1:], width, height)
            cls._assign_frame(cond_latent, start_latent, 0)
            cls._assign_frame(cond_latent, end_latent, latent_length - 1)
            omni_mask[0] = 1.0
            omni_mask[-1] = 1.0

        elif task == "reference2v":
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task reference2v requires at least one reference image.")
            # References occupy frames 1..latent_length-1; frame 0 is left free for generation.
            num_refs = min(reference_images.shape[0], max(1, latent_length - 1))
            for ref in range(num_refs):
                ref_latent = cls._encode_single_image(vae, reference_images[ref:ref + 1], width, height)
                slot = min(ref + 1, latent_length - 1)
                cls._assign_frame(cond_latent, ref_latent, slot)
                omni_mask[slot] = 1.0

        elif task == "editing":
            if condition_video is None or condition_video.shape[0] < 1:
                raise ValueError("Task editing requires condition_video.")
            video_latent = cls._encode_video(vae, condition_video, width, height, length)
            valid = min(latent_length, video_latent.shape[2])
            cond_latent[:, :, :valid] = video_latent[:, :, :valid]
            omni_mask[:valid] = 1.0

        elif task == "tiv2v":
            if condition_video is None or condition_video.shape[0] < 1:
                raise ValueError("Task tiv2v requires condition_video.")
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task tiv2v requires at least one reference image.")
            video_latent = cls._encode_video(vae, condition_video, width, height, length)
            valid = min(latent_length, video_latent.shape[2])
            cond_latent[:, :, :valid] = video_latent[:, :, :valid]
            omni_mask[:valid] = 1.0

            # Add (not overwrite) the reference latent on top of the video latent at the chosen slot.
            ref_latent = cls._encode_single_image(vae, reference_images[:1], width, height)
            slot = 1 if latent_length > 1 else 0
            cond_latent[:, :, slot:slot + 1] += ref_latent[:, :, :1]
            omni_mask[slot] += 1.0

        cond_latent = comfy.utils.resize_to_batch_size(cond_latent, batch_size)
        # BaseModel/HunyuanVideo15 inverts concat_mask (mask = 1 - concat_mask), so pass the pre-inverted mask.
        concat_mask = (1.0 - omni_mask).view(1, 1, latent_length, 1, 1).expand(cond_latent.shape[0], 1, latent_length, cond_latent.shape[-2], cond_latent.shape[-1]).to(cond_latent.dtype)

        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        return io.NodeOutput(positive, negative, {"samples": latent})
|
||||
|
||||
|
||||
class HunyuanImageToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@ -411,9 +653,12 @@ class HunyuanExtension(ComfyExtension):
|
||||
return [
|
||||
CLIPTextEncodeHunyuanDiT,
|
||||
TextEncodeHunyuanVideo_ImageToVideo,
|
||||
TextEncodeHunyuanVideo15Omni,
|
||||
HunyuanClipVisionOutputConcat,
|
||||
EmptyHunyuanLatentVideo,
|
||||
EmptyHunyuanVideo15Latent,
|
||||
HunyuanVideo15ImageToVideo,
|
||||
HunyuanVideo15OmniConditioning,
|
||||
HunyuanVideo15SuperResolution,
|
||||
HunyuanVideo15LatentUpscaleWithModel,
|
||||
LatentUpscaleModelLoader,
|
||||
|
||||
4
nodes.py
4
nodes.py
@ -977,7 +977,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "hunyuan_video_15", "flux2", "ovis", "longcat_image"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -987,7 +987,7 @@ class CLIPLoader:
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\nhidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nhunyuan_video_15: qwen2.5-vl (single-file fallback without byT5)"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user