From eaef7b764edaa1b24bbe94b418d928e5dde63342 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Mon, 17 Nov 2025 16:08:06 +0200
Subject: [PATCH] Fix text encoding

---
 comfy/sd.py                          | 4 ++++
 comfy/supported_models.py            | 9 ++-------
 comfy/text_encoders/hunyuan_video.py | 7 +++++++
 nodes.py                             | 2 +-
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index 9e5ebbf15..4755f2111 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -911,6 +911,7 @@ class CLIPType(Enum):
     OMNIGEN2 = 17
     QWEN_IMAGE = 18
     HUNYUAN_IMAGE = 19
+    HUNYUAN_VIDEO_15 = 20
 
 
 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -1126,6 +1127,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         elif clip_type == CLIPType.HUNYUAN_IMAGE:
             clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
+        elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
+            clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
         else:
             clip_target.clip = sdxl_clip.SDXLClipModel
             clip_target.tokenizer = sdxl_clip.SDXLTokenizer
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 215ebc047..b6e7e9a59 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1377,10 +1377,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
 class HunyuanVideo15(HunyuanVideo):
     unet_config = {
         "image_model": "hunyuan_video",
-        "patch_size": [1, 1, 1],
-        "in_channels": 65,
-        "out_channels": 32,
-        "depth": 54,
         "vision_in_dim": 1152,
     }
 
@@ -1388,19 +1384,18 @@ class HunyuanVideo15(HunyuanVideo):
         "shift": 7.0,
     }
     memory_usage_factor = 4.0 #TODO
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+    supported_inference_dtypes = [torch.bfloat16, torch.float32] #TODO check if fp16 works
 
     latent_format = latent_formats.HunyuanVideo15
 
     def get_model(self, state_dict, prefix="", device=None):
-        print("HunyuanVideo15")
         out = model_base.HunyuanVideo15(self, device=device)
         return out
 
     def clip_target(self, state_dict={}):
         pref = self.text_encoder_key_prefix[0]
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
 
 models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
 
diff --git a/comfy/text_encoders/hunyuan_video.py b/comfy/text_encoders/hunyuan_video.py
index b02148b33..7274dd7f9 100644
--- a/comfy/text_encoders/hunyuan_video.py
+++ b/comfy/text_encoders/hunyuan_video.py
@@ -1,6 +1,7 @@
 from comfy import sd1_clip
 import comfy.model_management
 import comfy.text_encoders.llama
+from .hunyuan_image import HunyuanImageTokenizer
 from transformers import LlamaTokenizerFast
 import torch
 import os
@@ -73,6 +74,12 @@ class HunyuanVideoTokenizer:
         return {}
 
 
+class HunyuanVideo15Tokenizer(HunyuanImageTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a helpful assistant. Describe the video by detailing the following aspects:\n1. The main content and theme of the video.\n2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.\n3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.\n4. background environment, light, style and atmosphere.\n5. camera angles, movements, and transitions used in the video.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+
+
 class HunyuanVideoClipModel(torch.nn.Module):
     def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}):
         super().__init__()
diff --git a/nodes.py b/nodes.py
index 030371633..f023ae3b6 100644
--- a/nodes.py
+++ b/nodes.py
@@ -957,7 +957,7 @@ class DualCLIPLoader:
     def INPUT_TYPES(s):
         return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                               "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15"], ),
                               },
                 "optional": {
                               "device": (["default", "cpu"], {"advanced": True}),