added clip encoder

Yousef Rafat 2025-11-21 17:22:37 +02:00
parent ea176bb87d
commit 4d982e83f6
3 changed files with 17 additions and 19 deletions

View File

@@ -18,16 +18,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from comfy.ldm.modules.diffusionmodules.openaimodel import ResBlock

-INIT_MOE = torch.cuda.device_count() != 1
-if not INIT_MOE:
-    MOE_LAYER_SIZE = (1024**3) * 2.65  # approx
-    torch.cuda.set_device(0)
-    props = torch.cuda.get_device_properties(0)
-    LAYERS_IN_CPU = math.floor((int((os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES'))
-                                     - psutil.Process(os.getpid()).memory_info().rss
-                                     - (2*1024**3)) * 0.50) / MOE_LAYER_SIZE)
+MOE_LAYER_SIZE = (1024**3) * 2.65  # approx

 class HunyuanStaticCache(StaticCache):
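
For reference, the deleted branch sized a host-RAM staging pool for MoE layers: total physical memory, minus the process's current RSS, minus 2 GiB of headroom, half of the remainder, divided by the roughly 2.65 GiB footprint of one layer. A standalone sketch of that arithmetic (the function name is illustrative; os.sysconf makes it Unix-only):

import math
import os
import psutil

MOE_LAYER_SIZE = (1024**3) * 2.65  # approx bytes per MoE layer

def layers_that_fit_in_cpu(headroom=2 * 1024**3, fraction=0.50):
    # Total physical RAM (Unix sysconf) minus this process's resident set size.
    total_ram = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
    rss = psutil.Process(os.getpid()).memory_info().rss
    budget = (total_ram - rss - headroom) * fraction
    # Whole layers only; clamp so a tight machine yields 0, not a negative count.
    return max(0, math.floor(budget / MOE_LAYER_SIZE))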
@@ -885,7 +876,6 @@ class HunyuanImage3Model(nn.Module):
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -897,9 +887,6 @@ class HunyuanImage3Model(nn.Module):
         gen_timestep_scatter_index: Optional[torch.Tensor] = None,
     ):
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
         hidden_states = inputs_embeds
         next_decoder_cache = None
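
After this change, forward no longer tolerates a missing inputs_embeds: the lookup through self.wte is gone and hidden_states = inputs_embeds runs unconditionally, so embeddings must come from the relocated token table in the text encoder. A minimal sketch of the new calling contract (shapes illustrative, model call elided):

import torch

# The same table the encoder now owns: vocab 133120, width 4096, pad id 128009.
wte = torch.nn.Embedding(133120, 4096, 128009)
input_ids = torch.randint(0, 133120, (1, 16))
inputs_embeds = wte(input_ids)   # (1, 16, 4096); pass this to the model's forward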

View File

@@ -1339,9 +1339,12 @@ class HunyuanImage3(supported_models_base.BASE):
     latent_format = latent_formats.HunyuanImage3

     def get_model(self, state_dict, prefix="", device=None):
-        return model_base.HunyuanImage3(self, device=device)
+        state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
+        state_dict.pop("model.model.wte", None)
+        model = model_base.HunyuanImage3(self, device=device)
+        return model

     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.HunyuanImage3)

 class HunyuanImage21(HunyuanVideo):
     unet_config = {

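get_model now re-homes the token-embedding weights under the text-encoder prefix before building the diffusion model, which is what lets the HunyuanImage3 wrapper returned by clip_target load them. A toy illustration of the key move (tensor sizes shrunk for the example):

import torch

state_dict = {
    "model.model.wte": torch.zeros(8, 4),             # stand-in for the 133120x4096 table
    "model.model.layers.0.weight": torch.zeros(4, 4)  # untouched keys stay where they are
}
state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
state_dict.pop("model.model.wte", None)
assert "text_encoders.wte" in state_dict and "model.model.wte" not in state_dict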
View File

@@ -2,12 +2,20 @@ from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
 from transformers import ByT5Tokenizer
+import torch
 import os
 import re

-class DummyClip:
-    def __init__(*args, **kwargs):
-        pass
+class HunyuanImage3TextEncoder(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.wte = torch.nn.Embedding(133120, 4096, 128009)
+    def forward(self, x):
+        out = self.wte(x)
+        return out, torch.empty_like(out)
+class HunyuanImage3(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=HunyuanImage3TextEncoder, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, return_projected_pooled=False, return_attention_masks=False, model_options={}):
+        super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)

 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
     def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
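
HunyuanImage3TextEncoder is the relocated embedding table itself: vocabulary 133120, width 4096, padding index 128009. Its forward returns the embeddings plus a same-shaped tensor to fill the second slot sd1_clip callers expect; note torch.empty_like leaves that tensor uninitialized, so only the first output carries meaning. A quick shape check (the table is roughly 2 GiB of float32 weights):

import torch

enc = HunyuanImage3TextEncoder()             # the class added above
ids = torch.tensor([[9906, 1917, 128009]])   # arbitrary in-range token ids
embeds, extra = enc(ids)
print(embeds.shape, extra.shape)             # torch.Size([1, 3, 4096]) for both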