Fix text encoding

commit eaef7b764e
parent ed3d1942d0
@@ -911,6 +911,7 @@ class CLIPType(Enum):
     OMNIGEN2 = 17
     QWEN_IMAGE = 18
     HUNYUAN_IMAGE = 19
+    HUNYUAN_VIDEO_15 = 20


 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
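Note: the new enum member makes HunyuanVideo 1.5 selectable from loader code. A minimal usage sketch (the checkpoint file names below are hypothetical placeholders, not files shipped by this commit):

    from comfy import sd
    from comfy.sd import CLIPType

    # Hypothetical text-encoder checkpoints; substitute whatever files you actually have.
    clip = sd.load_clip(
        ckpt_paths=["models/text_encoders/qwen_2.5_vl_7b.safetensors",
                    "models/text_encoders/byt5_small.safetensors"],
        clip_type=CLIPType.HUNYUAN_VIDEO_15,
    )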
@@ -1126,6 +1127,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         elif clip_type == CLIPType.HUNYUAN_IMAGE:
             clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
+        elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
+            clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
         else:
             clip_target.clip = sdxl_clip.SDXLClipModel
             clip_target.tokenizer = sdxl_clip.SDXLTokenizer
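Note: the new branch reuses the HunyuanImage text-encoder constructor and only swaps in a video-specific tokenizer. In this loader convention, clip_target carries classes rather than instances; instantiation happens later in the caller. A simplified sketch of that convention (not the actual loader code):

    # Simplified illustration: the target holds classes, the caller instantiates them.
    def build_from_target(clip_target, embedding_directory=None):
        tokenizer = clip_target.tokenizer(embedding_directory=embedding_directory)
        model = clip_target.clip()  # device/dtype wiring omitted for brevity
        return tokenizer, model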
@@ -1377,10 +1377,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
 class HunyuanVideo15(HunyuanVideo):
     unet_config = {
         "image_model": "hunyuan_video",
-        "patch_size": [1, 1, 1],
-        "in_channels": 65,
-        "out_channels": 32,
-        "depth": 54,
         "vision_in_dim": 1152,
     }

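Note: dropping patch_size, in_channels, out_channels, and depth loosens this class's detection fingerprint, presumably so HunyuanVideo 1.5 variants with different sizes still match. The unet_config entries act roughly as a subset match against the config detected from the checkpoint; a minimal sketch of that idea (an assumption about the mechanism, not ComfyUI's verbatim code):

    # A model class matches a checkpoint when every key it pins down agrees
    # with the config detected from the state dict.
    def config_matches(pinned: dict, detected: dict) -> bool:
        return all(detected.get(k) == v for k, v in pinned.items())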
@@ -1388,19 +1384,18 @@ class HunyuanVideo15(HunyuanVideo):
         "shift": 7.0,
     }

     memory_usage_factor = 4.0 #TODO

-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+    supported_inference_dtypes = [torch.bfloat16, torch.float32] #TODO check if fp16 works

     latent_format = latent_formats.HunyuanVideo15

     def get_model(self, state_dict, prefix="", device=None):
-        print("HunyuanVideo15")
         out = model_base.HunyuanVideo15(self, device=device)
         return out

     def clip_target(self, state_dict={}):
         pref = self.text_encoder_key_prefix[0]
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))

 models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
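Note: this fixes the tokenizer used for checkpoint-embedded text encoders to the video variant (matching the CLIPType.HUNYUAN_VIDEO_15 branch above) and removes a stray debug print. llama_detect is pointed at the qwen25_7b weight prefix so constructor kwargs can be derived from the checkpoint itself; a rough sketch of that prefix-probing pattern (the probe key and returned kwarg are illustrative assumptions, not the real implementation):

    def detect_dtype(state_dict, prefix):
        key = prefix + "model.embed_tokens.weight"  # hypothetical probe key
        if key in state_dict:
            return {"dtype_llama": state_dict[key].dtype}
        return {}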
@@ -1,6 +1,7 @@
 from comfy import sd1_clip
 import comfy.model_management
 import comfy.text_encoders.llama
+from .hunyuan_image import HunyuanImageTokenizer
 from transformers import LlamaTokenizerFast
 import torch
 import os
@@ -73,6 +74,12 @@ class HunyuanVideoTokenizer:
         return {}


+class HunyuanVideo15Tokenizer(HunyuanImageTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a helpful assistant. Describe the video by detailing the following aspects:\n1. The main content and theme of the video.\n2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.\n3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.\n4. background environment, light, style and atmosphere.\n5. camera angles, movements, and transitions used in the video.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+
+
 class HunyuanVideoClipModel(torch.nn.Module):
     def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}):
         super().__init__()
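Note: the subclass changes nothing but the chat template, replacing the image-caption system prompt with a video-description one; the user prompt is substituted into the {} slot before tokenization. A quick illustration of that substitution (tokenizer internals omitted, template abridged):

    template = "<|im_start|>system\n...<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
    prompt = "a cat chasing a paper plane across a kitchen floor"
    text_for_llm = template.format(prompt)  # this wrapped string is what gets tokenized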
nodes.py (2 changed lines)
@@ -957,7 +957,7 @@ class DualCLIPLoader:
     def INPUT_TYPES(s):
         return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                               "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15"], ),
                               },
                 "optional": {
                               "device": (["default", "cpu"], {"advanced": True}),
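Note: with the new dropdown option, an API-format workflow can request the pairing by name. A hypothetical node entry (the file names are placeholders):

    # Hypothetical API-workflow node; substitute your actual text-encoder files.
    workflow_node = {
        "class_type": "DualCLIPLoader",
        "inputs": {
            "clip_name1": "qwen_2.5_vl_7b.safetensors",
            "clip_name2": "byt5_small.safetensors",
            "type": "hunyuan_video_15",
        },
    }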