From 4d982e83f6e288f87b62a9b8e110881e4b75a5ae Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Fri, 21 Nov 2025 17:22:37 +0200
Subject: [PATCH] added clip encoder

---
 comfy/ldm/hunyuan_image_3/model.py   | 15 +--------------
 comfy/supported_models.py            |  7 +++++--
 comfy/text_encoders/hunyuan_image.py | 14 +++++++++++---
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/comfy/ldm/hunyuan_image_3/model.py b/comfy/ldm/hunyuan_image_3/model.py
index c82904fdc..e6dd436b9 100644
--- a/comfy/ldm/hunyuan_image_3/model.py
+++ b/comfy/ldm/hunyuan_image_3/model.py
@@ -18,16 +18,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from comfy.ldm.modules.diffusionmodules.openaimodel import ResBlock
 
 INIT_MOE = torch.cuda.device_count() != 1
-
-if not INIT_MOE:
-    MOE_LAYER_SIZE = (1024**3) * 2.65 # approx
-
-    torch.cuda.set_device(0)
-    props = torch.cuda.get_device_properties(0)
-
-    LAYERS_IN_CPU = math.floor((int((os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES'))
-                                - psutil.Process(os.getpid()).memory_info().rss
-                                - (2*1024**3)) * 0.50) / MOE_LAYER_SIZE)
+MOE_LAYER_SIZE = (1024**3) * 2.65 # approx
 
 
 class HunyuanStaticCache(StaticCache):
@@ -885,7 +876,6 @@ class HunyuanImage3Model(nn.Module):
 
     def forward(
         self,
-        input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -897,9 +887,6 @@ class HunyuanImage3Model(nn.Module):
         gen_timestep_scatter_index: Optional[torch.Tensor] = None,
     ):
 
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-
         hidden_states = inputs_embeds
 
         next_decoder_cache = None
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 5ae2baef0..a3256aa35 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1339,9 +1339,12 @@ class HunyuanImage3(supported_models_base.BASE):
     latent_format = latent_formats.HunyuanImage3
 
     def get_model(self, state_dict, prefix="", device=None):
-        return model_base.HunyuanImage3(self, device = device)
+        state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
+        state_dict.pop("model.model.wte", None)
+        model = model_base.HunyuanImage3(self, device = device)
+        return model
 
     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.HunyuanImage3)
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py
index ab3512201..732b3d80a 100644
--- a/comfy/text_encoders/hunyuan_image.py
+++ b/comfy/text_encoders/hunyuan_image.py
@@ -2,12 +2,20 @@ from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
 from transformers import ByT5Tokenizer
+import torch
 import os
 import re
 
-class DummyClip:
-    def __init__(*args, **kwargs):
-        pass
+class HunyuanImage3TextEncoder(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.wte = torch.nn.Embedding(133120, 4096, 128009)
+    def forward(self, x):
+        out = self.wte(x)
+        return out, torch.empty_like(out)
+class HunyuanImage3(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=HunyuanImage3TextEncoder, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, return_projected_pooled=False, return_attention_masks=False, model_options={}):
+        super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)
 
 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
     def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
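
Note (commentary, not part of the patch): the new "CLIP" is intentionally minimal. HunyuanImage3TextEncoder wraps only the token-embedding table (wte); get_model relocates that table in the state dict from model.model.wte to text_encoders.wte so the text-encoder side can own it, and the diffusion model's forward now consumes inputs_embeds directly instead of input_ids. Below is a rough standalone sketch of the lookup the encoder performs; the table size and padding id are taken from the patch, while the prompt token ids are invented for illustration.

import torch

# Same construction as HunyuanImage3TextEncoder.wte in the patch:
# vocab 133120, hidden size 4096, third positional arg is padding_idx (128009).
wte = torch.nn.Embedding(133120, 4096, 128009)

token_ids = torch.tensor([[9906, 1917, 128009, 128009]])  # hypothetical ids, right-padded

out = wte(token_ids)            # [1, 4, 4096] prompt embeddings handed to the diffusion model
pooled = torch.empty_like(out)  # placeholder second output, mirroring the patch's forward()

print(out.shape)                     # torch.Size([1, 4, 4096])
print(out[0, 2].abs().sum().item())  # 0.0 -- the row at padding_idx is initialized to zeros

The second return value mirrors the patch's forward(), which pairs the embeddings with an uninitialized tensor, presumably just to satisfy the (hidden states, pooled output) pair that SDClipModel-style callers expect rather than to carry meaningful data.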