diff --git a/comfy/sd.py b/comfy/sd.py
index e573804a5..186af578d 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1276,6 +1276,13 @@ def detect_te_model(sd):
             return TEModel.QWEN25_3B
         if weight.shape[0] == 512:
             return TEModel.QWEN25_7B
+    # Qwen-VL checkpoints can be saved under model.language_model.* (e.g. HY-OmniWeave text encoder).
+    if 'model.language_model.layers.0.self_attn.k_proj.bias' in sd:
+        weight = sd['model.language_model.layers.0.self_attn.k_proj.bias']
+        if weight.shape[0] == 256:
+            return TEModel.QWEN25_3B
+        if weight.shape[0] == 512:
+            return TEModel.QWEN25_7B
     if "model.language_model.layers.0.linear_attn.A_log" in sd and "model.language_model.layers.0.input_layernorm.weight" in sd:
         weight = sd['model.language_model.layers.0.input_layernorm.weight']
         if weight.shape[0] == 1024:
@@ -1321,7 +1328,11 @@
     return {}
 
 def llama_detect(clip_data):
-    weight_names = ["model.layers.0.self_attn.k_proj.weight", "model.layers.0.linear_attn.in_proj_a.weight"]
+    weight_names = [
+        "model.layers.0.self_attn.k_proj.weight",
+        "model.layers.0.linear_attn.in_proj_a.weight",
+        "model.language_model.layers.0.self_attn.k_proj.weight",
+    ]
 
     for sd in clip_data:
         for weight_name in weight_names:
@@ -1425,7 +1436,23 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer
         elif te_model == TEModel.QWEN25_7B:
-            if clip_type == CLIPType.HUNYUAN_IMAGE:
+            # Some Qwen2.5-VL checkpoints (including HY-OmniWeave's text encoder)
+            # are saved with "model.language_model.*" and "model.visual.*" prefixes.
+            # Normalize keys to the layout expected by Comfy text encoder wrappers.
+            for i, sd in enumerate(clip_data):
+                if "model.language_model.layers.0.self_attn.k_proj.weight" in sd:
+                    clip_data[i] = comfy.utils.state_dict_prefix_replace(
+                        sd,
+                        {
+                            "model.language_model.": "model.",
+                            "model.visual.": "visual.",
+                            "final_layer_norm.": "model.norm.",
+                        },
+                    )
+            if clip_type == CLIPType.HUNYUAN_VIDEO_15:
+                clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
+                clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
+            elif clip_type == CLIPType.HUNYUAN_IMAGE:
                 clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
             elif clip_type == CLIPType.LONGCAT_IMAGE:
@@ -1763,6 +1790,39 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
 
     if custom_operations is None:
         sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
+    # HY-OmniWeave checkpoints store double-block attention as split q/k/v tensors
+    # while Comfy's HunyuanVideo implementation expects merged qkv tensors.
+    if "double_blocks.0.img_attn_q.weight" in sd and "double_blocks.0.img_attn.qkv.weight" not in sd:
+        converted_qkv = 0
+        block_indices = set()
+        for k in list(sd.keys()):
+            if not k.startswith("double_blocks."):
+                continue
+            parts = k.split(".")
+            if len(parts) < 3:
+                continue
+            if parts[2] == "img_attn_q":
+                try:
+                    block_indices.add(int(parts[1]))
+                except ValueError:
+                    pass
+
+        for idx in sorted(block_indices):
+            for attn_prefix in ("img_attn", "txt_attn"):
+                for end in ("weight", "bias"):
+                    q_key = f"double_blocks.{idx}.{attn_prefix}_q.{end}"
+                    k_key = f"double_blocks.{idx}.{attn_prefix}_k.{end}"
+                    v_key = f"double_blocks.{idx}.{attn_prefix}_v.{end}"
+                    qkv_key = f"double_blocks.{idx}.{attn_prefix}.qkv.{end}"
+                    if qkv_key in sd:
+                        continue
+                    if q_key in sd and k_key in sd and v_key in sd:
+                        sd[qkv_key] = torch.cat((sd.pop(q_key), sd.pop(k_key), sd.pop(v_key)), dim=0)
+                        converted_qkv += 1
+
+        if converted_qkv > 0:
+            logging.info(f"Converted {converted_qkv} split HunyuanVideo attention tensors to qkv format.")
+
     parameters = comfy.utils.calculate_parameters(sd)
     weight_dtype = comfy.utils.weight_dtype(sd)
 
diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py
index 4ea93a499..5f5359bcb 100644
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -2,6 +2,8 @@ import nodes
 import node_helpers
 import torch
 import comfy.model_management
+import comfy.utils
+import comfy.clip_vision
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
 from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel
@@ -301,6 +303,246 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
     encode = execute # TODO: remove
 
+class TextEncodeHunyuanVideo15Omni(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TextEncodeHunyuanVideo15Omni",
+            display_name="Text Encode HunyuanVideo 15 Omni",
+            category="advanced/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
+                io.Boolean.Input("use_visual_inputs", default=True, advanced=True),
+                io.Int.Input("max_visual_inputs", default=8, min=1, max=64, advanced=True),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(),
+            ],
+        )
+
+    @staticmethod
+    def _task_system_prompt(task: str) -> str:
+        prompts = {
+            "t2v": "Describe a high-quality target video from the user's request with concrete scene details, motion, camera behavior, and style.",
+            "i2v": "Describe a target video that should stay consistent with the provided reference image while following the user's request.",
+            "interpolation": "Describe a target video that smoothly transitions between the provided keyframe images while following the user's request.",
+            "reference2v": "Describe a target video that composes the provided reference subjects into a coherent scene following the user's request.",
+            "editing": "Describe an edited output video that follows the user's instruction while preserving relevant source video content.",
+            "tiv2v": "Describe an edited output video using both the provided source video and reference image guidance according to the user's instruction.",
+        }
+        return prompts.get(task, prompts["t2v"])
+
+    @classmethod
+    def _build_template(cls, task: str, image_count: int) -> str:
+        system_prompt = cls._task_system_prompt(task)
+        visual_tokens = "<|vision_start|><|image_pad|><|vision_end|>\n" * image_count
+        return (
+            "<|im_start|>system\n"
+            f"{system_prompt}"
+            "<|im_end|>\n"
+            "<|im_start|>user\n"
+            f"{visual_tokens}"
+            "{}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+
+    @staticmethod
+    def _extract_image_embeds(clip_vision_output, max_visual_inputs: int):
+        if clip_vision_output is None:
+            return []
+        mm_projected = getattr(clip_vision_output, "mm_projected", None)
+        if mm_projected is None:
+            return []
+        if mm_projected.ndim == 2:
+            return [mm_projected]
+        count = min(mm_projected.shape[0], max_visual_inputs)
+        return [mm_projected[i] for i in range(count)]
+
+    @classmethod
+    def execute(cls, clip, prompt, task, use_visual_inputs, max_visual_inputs, clip_vision_output=None) -> io.NodeOutput:
+        image_embeds = cls._extract_image_embeds(clip_vision_output, max_visual_inputs) if use_visual_inputs else []
+        template = cls._build_template(task, len(image_embeds))
+
+        # HunyuanVideo 1.5 tokenizers use `images=...`; HunyuanVideo 1.0 uses `image_embeds=...`.
+        try:
+            tokens = clip.tokenize(prompt, llama_template=template, images=image_embeds)
+        except TypeError:
+            embeds = None
+            if len(image_embeds) > 0:
+                embeds = torch.stack(image_embeds, dim=0)
+            tokens = clip.tokenize(prompt, llama_template=template, image_embeds=embeds, image_interleave=1)
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
+
+    encode = execute # TODO: remove
+
+
+class HunyuanClipVisionOutputConcat(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanClipVisionOutputConcat",
+            display_name="Hunyuan CLIP Vision Output Concat",
+            category="conditioning/video_models",
+            inputs=[
+                io.ClipVisionOutput.Input("clip_vision_output_1"),
+                io.ClipVisionOutput.Input("clip_vision_output_2", optional=True),
+                io.ClipVisionOutput.Input("clip_vision_output_3", optional=True),
+                io.ClipVisionOutput.Input("clip_vision_output_4", optional=True),
+            ],
+            outputs=[
+                io.ClipVisionOutput.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, clip_vision_output_1, clip_vision_output_2=None, clip_vision_output_3=None, clip_vision_output_4=None) -> io.NodeOutput:
+        outputs = [o for o in (clip_vision_output_1, clip_vision_output_2, clip_vision_output_3, clip_vision_output_4) if o is not None]
+        merged = comfy.clip_vision.Output()
+        tensor_attrs = ["last_hidden_state", "image_embeds", "penultimate_hidden_states", "all_hidden_states", "mm_projected"]
+        for attr in tensor_attrs:
+            values = [getattr(o, attr) for o in outputs if hasattr(o, attr)]
+            if len(values) > 0 and torch.is_tensor(values[0]):
+                setattr(merged, attr, torch.cat(values, dim=0))
+
+        image_sizes = []
+        for o in outputs:
+            if hasattr(o, "image_sizes"):
+                image_sizes.extend(getattr(o, "image_sizes"))
+        if len(image_sizes) > 0:
+            merged.image_sizes = image_sizes
+        return io.NodeOutput(merged)
+
+
+class HunyuanVideo15OmniConditioning(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanVideo15OmniConditioning",
+            display_name="HunyuanVideo 15 Omni Conditioning",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("reference_images", optional=True, tooltip="For i2v/interpolation/reference2v/tiv2v."),
+                io.Image.Input("condition_video", optional=True, tooltip="For editing/tiv2v."),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @staticmethod
+    def _latent_length(length: int) -> int:
+        return ((length - 1) // 4) + 1
+
+    @staticmethod
+    def _upscale_frames(frames: torch.Tensor, width: int, height: int):
+        return comfy.utils.common_upscale(frames.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+
+    @classmethod
+    def _encode_single_image(cls, vae, image: torch.Tensor, width: int, height: int):
+        upscaled = cls._upscale_frames(image[:1], width, height)
+        return vae.encode(upscaled[:, :, :, :3])
+
+    @classmethod
+    def _encode_video(cls, vae, video: torch.Tensor, width: int, height: int, length: int):
+        upscaled = cls._upscale_frames(video[:length], width, height)
+        return vae.encode(upscaled[:, :, :, :3])
+
+    @staticmethod
+    def _assign_frame(target: torch.Tensor, source: torch.Tensor, frame_idx: int):
+        if frame_idx < 0 or frame_idx >= target.shape[2]:
+            return
+        target[:, :, frame_idx:frame_idx + 1] = source[:, :, :1]
+
+    @classmethod
+    def execute(cls, positive, negative, vae, task, width, height, length, batch_size, reference_images=None, condition_video=None, clip_vision_output=None) -> io.NodeOutput:
+        latent_length = cls._latent_length(length)
+        latent = torch.zeros([batch_size, 32, latent_length, height // 16, width // 16], device=comfy.model_management.intermediate_device())
+
+        if task == "t2v":
+            if clip_vision_output is not None:
+                positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+                negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+            return io.NodeOutput(positive, negative, {"samples": latent})
+
+        cond_latent = torch.zeros_like(latent[:1])
+        omni_mask = torch.zeros((latent_length,), device=cond_latent.device, dtype=cond_latent.dtype)
+
+        if task == "i2v":
+            if reference_images is None or reference_images.shape[0] < 1:
+                raise ValueError("Task i2v requires at least one reference image.")
+            encoded = cls._encode_single_image(vae, reference_images, width, height)
+            cls._assign_frame(cond_latent, encoded, 0)
+            omni_mask[0] = 1.0
+
+        elif task == "interpolation":
+            if reference_images is None or reference_images.shape[0] < 2:
+                raise ValueError("Task interpolation requires at least two reference images.")
+            encoded_first = cls._encode_single_image(vae, reference_images[:1], width, height)
+            encoded_last = cls._encode_single_image(vae, reference_images[-1:], width, height)
+            cls._assign_frame(cond_latent, encoded_first, 0)
+            cls._assign_frame(cond_latent, encoded_last, latent_length - 1)
+            omni_mask[0] = 1.0
+            omni_mask[-1] = 1.0
+
+        elif task == "reference2v":
+            if reference_images is None or reference_images.shape[0] < 1:
+                raise ValueError("Task reference2v requires at least one reference image.")
+            num_refs = min(reference_images.shape[0], max(1, latent_length - 1))
+            for idx in range(num_refs):
+                encoded = cls._encode_single_image(vae, reference_images[idx:idx + 1], width, height)
+                frame_idx = min(idx + 1, latent_length - 1)
+                cls._assign_frame(cond_latent, encoded, frame_idx)
+                omni_mask[frame_idx] = 1.0
+
+        elif task == "editing":
+            if condition_video is None or condition_video.shape[0] < 1:
+                raise ValueError("Task editing requires condition_video.")
+            encoded = cls._encode_video(vae, condition_video, width, height, length)
+            valid_frames = min(latent_length, encoded.shape[2])
+            cond_latent[:, :, :valid_frames] = encoded[:, :, :valid_frames]
+            omni_mask[:valid_frames] = 1.0
+
+        elif task == "tiv2v":
+            if condition_video is None or condition_video.shape[0] < 1:
+                raise ValueError("Task tiv2v requires condition_video.")
+            if reference_images is None or reference_images.shape[0] < 1:
+                raise ValueError("Task tiv2v requires at least one reference image.")
+            encoded_video = cls._encode_video(vae, condition_video, width, height, length)
+            valid_frames = min(latent_length, encoded_video.shape[2])
+            cond_latent[:, :, :valid_frames] = encoded_video[:, :, :valid_frames]
+            omni_mask[:valid_frames] = 1.0
+
+            encoded_ref = cls._encode_single_image(vae, reference_images[:1], width, height)
+            ref_idx = 1 if latent_length > 1 else 0
+            cond_latent[:, :, ref_idx:ref_idx + 1] = encoded_ref[:, :, :1]
+            omni_mask[ref_idx] = 1.0
+
+        cond_latent = comfy.utils.resize_to_batch_size(cond_latent, batch_size)
+        # BaseModel/HunyuanVideo15 inverts concat_mask (mask = 1 - concat_mask), so pass the pre-inverted mask.
+        concat_mask = (1.0 - omni_mask).view(1, 1, latent_length, 1, 1).expand(cond_latent.shape[0], 1, latent_length, cond_latent.shape[-2], cond_latent.shape[-1]).to(cond_latent.dtype)
+
+        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
+        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        return io.NodeOutput(positive, negative, {"samples": latent})
+
 
 class HunyuanImageToVideo(io.ComfyNode):
     @classmethod
     def define_schema(cls):
@@ -411,9 +653,12 @@ class HunyuanExtension(ComfyExtension):
         return [
             CLIPTextEncodeHunyuanDiT,
             TextEncodeHunyuanVideo_ImageToVideo,
+            TextEncodeHunyuanVideo15Omni,
+            HunyuanClipVisionOutputConcat,
             EmptyHunyuanLatentVideo,
             EmptyHunyuanVideo15Latent,
             HunyuanVideo15ImageToVideo,
+            HunyuanVideo15OmniConditioning,
             HunyuanVideo15SuperResolution,
             HunyuanVideo15LatentUpscaleWithModel,
             LatentUpscaleModelLoader,
diff --git a/nodes.py b/nodes.py
index 299b3d758..dbaaed371 100644
--- a/nodes.py
+++ b/nodes.py
@@ -977,7 +977,7 @@ class CLIPLoader:
     @classmethod
     def INPUT_TYPES(s):
         return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "hunyuan_video_15", "flux2", "ovis", "longcat_image"], ),
                               },
                 "optional": {
                               "device": (["default", "cpu"], {"advanced": True}),
@@ -987,7 +987,7 @@ class CLIPLoader:
 
     CATEGORY = "advanced/loaders"
 
-    DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
+    DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\nhidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nhunyuan_video_15: qwen2.5-vl (single-file fallback without byT5)"
 
     def load_clip(self, clip_name, type="stable_diffusion", device="default"):
         clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)