added clip encoder

Yousef Rafat 2025-11-21 17:22:37 +02:00
parent ea176bb87d
commit 4d982e83f6
3 changed files with 17 additions and 19 deletions

View File

@@ -18,16 +18,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from comfy.ldm.modules.diffusionmodules.openaimodel import ResBlock

-INIT_MOE = torch.cuda.device_count() != 1
-if not INIT_MOE:
-    MOE_LAYER_SIZE = (1024**3) * 2.65  # approx
-    torch.cuda.set_device(0)
-    props = torch.cuda.get_device_properties(0)
-    LAYERS_IN_CPU = math.floor((int((os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES'))
-                                     - psutil.Process(os.getpid()).memory_info().rss
-                                     - (2*1024**3)) * 0.50) / MOE_LAYER_SIZE)
+MOE_LAYER_SIZE = (1024**3) * 2.65  # approx

 class HunyuanStaticCache(StaticCache):
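
For reference, the deleted branch sized a host-RAM staging pool for MoE layers: total physical memory, minus the process's current RSS, minus 2 GiB of headroom, half of the remainder, divided by the roughly 2.65 GiB footprint of one layer. A standalone sketch of that arithmetic (the function name is illustrative; os.sysconf makes it Unix-only):

import math
import os
import psutil

MOE_LAYER_SIZE = (1024**3) * 2.65  # approx bytes per MoE layer

def layers_that_fit_in_cpu(headroom=2 * 1024**3, fraction=0.50):
    # Total physical RAM (Unix sysconf) minus this process's resident set size.
    total_ram = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
    rss = psutil.Process(os.getpid()).memory_info().rss
    budget = (total_ram - rss - headroom) * fraction
    # Whole layers only; clamp so a tight machine yields 0, not a negative count.
    return max(0, math.floor(budget / MOE_LAYER_SIZE))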
@@ -885,7 +876,6 @@ class HunyuanImage3Model(nn.Module):
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -897,9 +887,6 @@ class HunyuanImage3Model(nn.Module):
         gen_timestep_scatter_index: Optional[torch.Tensor] = None,
     ):
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
         hidden_states = inputs_embeds
         next_decoder_cache = None
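
After this change, forward no longer tolerates a missing inputs_embeds: the lookup through self.wte is gone and hidden_states = inputs_embeds runs unconditionally, so embeddings must come from the relocated token table in the text encoder. A minimal sketch of the new calling contract (shapes illustrative, model call elided):

import torch

# The same table the encoder now owns: vocab 133120, width 4096, pad id 128009.
wte = torch.nn.Embedding(133120, 4096, 128009)
input_ids = torch.randint(0, 133120, (1, 16))
inputs_embeds = wte(input_ids)   # (1, 16, 4096); pass this to the model's forward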

View File

@@ -1339,9 +1339,12 @@ class HunyuanImage3(supported_models_base.BASE):
     latent_format = latent_formats.HunyuanImage3

     def get_model(self, state_dict, prefix="", device=None):
-        return model_base.HunyuanImage3(self, device=device)
+        state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
+        state_dict.pop("model.model.wte", None)
+        model = model_base.HunyuanImage3(self, device=device)
+        return model

     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.HunyuanImage3)

 class HunyuanImage21(HunyuanVideo):
     unet_config = {

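get_model now re-homes the token-embedding weights under the text-encoder prefix before building the diffusion model, which is what lets the HunyuanImage3 wrapper returned by clip_target load them. A toy illustration of the key move (tensor sizes shrunk for the example):

import torch

state_dict = {
    "model.model.wte": torch.zeros(8, 4),             # stand-in for the 133120x4096 table
    "model.model.layers.0.weight": torch.zeros(4, 4)  # untouched keys stay where they are
}
state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
state_dict.pop("model.model.wte", None)
assert "text_encoders.wte" in state_dict and "model.model.wte" not in state_dict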
View File

@@ -2,12 +2,20 @@ from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
 from transformers import ByT5Tokenizer
+import torch
 import os
 import re

-class DummyClip:
-    def __init__(*args, **kwargs):
-        pass
+class HunyuanImage3TextEncoder(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.wte = torch.nn.Embedding(133120, 4096, 128009)
+    def forward(self, x):
+        out = self.wte(x)
+        return out, torch.empty_like(out)
+class HunyuanImage3(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=HunyuanImage3TextEncoder, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, return_projected_pooled=False, return_attention_masks=False, model_options={}):
+        super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)

 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
     def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
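
HunyuanImage3TextEncoder is the relocated embedding table itself: vocabulary 133120, width 4096, padding index 128009. Its forward returns the embeddings plus a same-shaped tensor to fill the second slot sd1_clip callers expect; note torch.empty_like leaves that tensor uninitialized, so only the first output carries meaning. A quick shape check (the table is roughly 2 GiB of float32 weights):

import torch

enc = HunyuanImage3TextEncoder()             # the class added above
ids = torch.tensor([[9906, 1917, 128009]])   # arbitrary in-range token ids
embeds, extra = enc(ids)
print(embeds.shape, extra.shape)             # torch.Size([1, 3, 4096]) for both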