mirror of https://github.com/comfyanonymous/ComfyUI.git
commit 4d982e83f6
parent ea176bb87d

    added clip encoder
@@ -18,16 +18,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from comfy.ldm.modules.diffusionmodules.openaimodel import ResBlock
 
 INIT_MOE = torch.cuda.device_count() != 1
-
-if not INIT_MOE:
-    MOE_LAYER_SIZE = (1024**3) * 2.65 # approx
-
-    torch.cuda.set_device(0)
-    props = torch.cuda.get_device_properties(0)
-
-    LAYERS_IN_CPU = math.floor((int((os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES'))
-                                     - psutil.Process(os.getpid()).memory_info().rss
-                                     - (2*1024**3)) * 0.50) / MOE_LAYER_SIZE)
+MOE_LAYER_SIZE = (1024**3) * 2.65 # approx
 
 class HunyuanStaticCache(StaticCache):
 
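Note: the deleted branch budgeted CPU offload capacity from free system RAM: total physical memory, minus the process RSS and a fixed 2 GiB reserve, then half of the remainder divided by the ~2.65 GiB per-layer size. A standalone sketch of that removed heuristic (names and constants taken from the old code; packaging it as a function is this note's own choice):

    import math
    import os
    import psutil

    MOE_LAYER_SIZE = (1024**3) * 2.65  # approx bytes per MoE layer

    def cpu_layer_budget(safety=0.50, reserve=2 * 1024**3):
        # Total physical RAM as reported by the kernel (Unix only).
        total = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        # What this process already occupies.
        rss = psutil.Process(os.getpid()).memory_info().rss
        # Keep `safety` of what is left after the reserve, in whole layers.
        return math.floor((total - rss - reserve) * safety / MOE_LAYER_SIZE)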
@@ -885,7 +876,6 @@ class HunyuanImage3Model(nn.Module):
 
     def forward(
         self,
-        input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -897,9 +887,6 @@ class HunyuanImage3Model(nn.Module):
         gen_timestep_scatter_index: Optional[torch.Tensor] = None,
     ):
 
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-
         hidden_states = inputs_embeds
 
         next_decoder_cache = None
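Taken together, the two hunks above drop input_ids and the embedding lookup from HunyuanImage3Model.forward: the model now consumes precomputed inputs_embeds, and the wte lookup moves to the text encoder introduced below. A hedged sketch of the resulting call pattern (the wiring and variable names are illustrative, not the actual ComfyUI plumbing):

    import torch

    # Hypothetical flow after this commit: the text encoder owns the
    # wte table; the diffusion model receives embeddings directly.
    token_ids = torch.tensor([[9906, 1917]])   # from the tokenizer
    embeds, _ = text_encoder(token_ids)        # wte lookup happens here
    out = model(inputs_embeds=embeds,          # input_ids no longer accepted
                attention_mask=None)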
@@ -1339,9 +1339,12 @@ class HunyuanImage3(supported_models_base.BASE):
     latent_format = latent_formats.HunyuanImage3
 
     def get_model(self, state_dict, prefix="", device=None):
-        return model_base.HunyuanImage3(self, device = device)
+        state_dict["text_encoders.wte"] = state_dict["model.model.wte"]
+        state_dict.pop("model.model.wte", None)
+        model = model_base.HunyuanImage3(self, device = device)
+        return model
     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.HunyuanImage3)
 
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
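get_model now re-homes the shared embedding table before building the model: the checkpoint weight stored under model.model.wte is exposed as text_encoders.wte so the text-encoder loader can claim it, and the original key is popped so the diffusion model no longer loads it. Since dict assignment copies the reference rather than the tensor, the rename is free. A toy illustration (shapes match the embedding added below; not the real checkpoint):

    import torch

    sd = {"model.model.wte": torch.zeros(133120, 4096)}
    sd["text_encoders.wte"] = sd["model.model.wte"]  # same tensor, no copy
    sd.pop("model.model.wte", None)                  # retire the old key
    assert "model.model.wte" not in sd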
@@ -2,12 +2,20 @@ from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
 from transformers import ByT5Tokenizer
+import torch
 import os
 import re
 
-class DummyClip:
-    def __init__(*args, **kwargs):
-        pass
+class HunyuanImage3TextEncoder(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.wte = torch.nn.Embedding(133120, 4096, 128009)
+    def forward(self, x):
+        out = self.wte(x)
+        return out, torch.empty_like(out)
+class HunyuanImage3(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=HunyuanImage3TextEncoder, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, return_projected_pooled=False, return_attention_masks=False, model_options={}):
+        super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)
 
 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
     def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
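The new HunyuanImage3TextEncoder wraps a single torch.nn.Embedding with a 133,120-token vocabulary, 4096-dim embeddings, and token 128009 as padding_idx (its row is zero-initialized and receives no gradient). forward returns the looked-up embeddings plus a same-shaped empty placeholder, presumably to satisfy a two-output contract in the sd1_clip machinery. A quick usage sketch (token ids are arbitrary):

    import torch

    enc = HunyuanImage3TextEncoder()        # class added in the hunk above
    ids = torch.tensor([[9906, 1917, 128009]])
    embeds, placeholder = enc(ids)
    print(embeds.shape)                     # torch.Size([1, 3, 4096])
    assert embeds[0, 2].abs().sum() == 0    # padding_idx row stays zero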