From 4d982e83f6e288f87b62a9b8e110881e4b75a5ae Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Fri, 21 Nov 2025 17:22:37 +0200
Subject: [PATCH] added clip encoder

---
 comfy/ldm/hunyuan_image_3/model.py   | 15 +--------------
 comfy/supported_models.py            |  7 +++++--
 comfy/text_encoders/hunyuan_image.py | 14 +++++++++++---
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/comfy/ldm/hunyuan_image_3/model.py b/comfy/ldm/hunyuan_image_3/model.py
index c82904fdc..e6dd436b9 100644
--- a/comfy/ldm/hunyuan_image_3/model.py
+++ b/comfy/ldm/hunyuan_image_3/model.py
@@ -18,16 +18,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from comfy.ldm.modules.diffusionmodules.openaimodel import ResBlock
 
 INIT_MOE = torch.cuda.device_count() != 1
-
-if not INIT_MOE:
-    MOE_LAYER_SIZE = (1024**3) * 2.65 # approx
-
-    torch.cuda.set_device(0)
-    props = torch.cuda.get_device_properties(0)
-
-    LAYERS_IN_CPU = math.floor((int((os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES'))
-                                - psutil.Process(os.getpid()).memory_info().rss
-                                - (2*1024**3)) * 0.50) / MOE_LAYER_SIZE)
+MOE_LAYER_SIZE = (1024**3) * 2.65 # approx
 
 
 class HunyuanStaticCache(StaticCache):
@@ -885,7 +876,6 @@ class HunyuanImage3Model(nn.Module):
 
     def forward(
         self,
-        input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -897,9 +887,6 @@ class HunyuanImage3Model(nn.Module):
         gen_timestep_scatter_index: Optional[torch.Tensor] = None,
     ):
 
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-
         hidden_states = inputs_embeds
 
         next_decoder_cache = None
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 5ae2baef0..a3256aa35 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1339,9 +1339,12 @@ class HunyuanImage3(supported_models_base.BASE):
     latent_format = latent_formats.HunyuanImage3
 
     def get_model(self, state_dict, prefix="", device=None):
-        return model_base.HunyuanImage3(self, device = device)
+        state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
+        state_dict.pop("model.model.wte", None)
+        model = model_base.HunyuanImage3(self, device = device)
+        return model
 
     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.HunyuanImage3)
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py
index ab3512201..732b3d80a 100644
--- a/comfy/text_encoders/hunyuan_image.py
+++ b/comfy/text_encoders/hunyuan_image.py
@@ -2,12 +2,20 @@ from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
 from transformers import ByT5Tokenizer
+import torch
 import os
 import re
 
-class DummyClip:
-    def __init__(*args, **kwargs):
-        pass
+class HunyuanImage3TextEncoder(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.wte = torch.nn.Embedding(133120, 4096, 128009)
+    def forward(self, x):
+        out = self.wte(x)
+        return out, torch.empty_like(out)
+class HunyuanImage3(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=HunyuanImage3TextEncoder, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, return_projected_pooled=False, return_attention_masks=False, model_options={}):
+        super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)
 
 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
     def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
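
Note (commentary, not part of the patch): the new "CLIP" is intentionally minimal. HunyuanImage3TextEncoder wraps only the token-embedding table (wte); get_model relocates that table in the state dict from model.model.wte to text_encoders.wte so the text-encoder side can own it, and the diffusion model's forward now consumes inputs_embeds directly instead of input_ids. Below is a rough standalone sketch of the lookup the encoder performs; the table size and padding id are taken from the patch, while the prompt token ids are invented for illustration.

import torch

# Same construction as HunyuanImage3TextEncoder.wte in the patch:
# vocab 133120, hidden size 4096, third positional arg is padding_idx (128009).
wte = torch.nn.Embedding(133120, 4096, 128009)

token_ids = torch.tensor([[9906, 1917, 128009, 128009]])  # hypothetical ids, right-padded

out = wte(token_ids)            # [1, 4, 4096] prompt embeddings handed to the diffusion model
pooled = torch.empty_like(out)  # placeholder second output, mirroring the patch's forward()

print(out.shape)                     # torch.Size([1, 4, 4096])
print(out[0, 2].abs().sum().item())  # 0.0 -- the row at padding_idx is initialized to zeros

The second return value mirrors the patch's forward(), which pairs the embeddings with an uninitialized tensor, presumably just to satisfy the (hidden states, pooled output) pair that SDClipModel-style callers expect rather than to carry meaningful data.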