mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-01-16 09:10:50 +08:00)

commit a09d1786e7 (parent 3dd39efa03)

    fixes from comfyui testing

@@ -596,7 +596,7 @@ class LazyMoELoader(nn.Module):
     def get_checkpoint(self):
         comfyui_dir = Path.home() / "ComfyUI"
-        checkpoint = comfyui_dir / "models" / "checkpoint" / "hunyuan_image_3.safetensors"
+        checkpoint = comfyui_dir / "models" / "checkpoints" / "hunyuan_image_3.safetensors"
         checkpoint = checkpoint.resolve()
         if not os.path.exists(checkpoint):
             raise ValueError(f"Hunyuan Image 3 Checkpoint on one GPU should have the path: {checkpoint}")
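
For reference, a minimal standalone version of the path check this hunk corrects; it assumes the default ~/ComfyUI install location used in the diff, and the filename is taken from the diff rather than from any documented loader behavior:

    import os
    from pathlib import Path

    # Expected location after the fix (plural "checkpoints", matching ComfyUI's models layout).
    checkpoint = (Path.home() / "ComfyUI" / "models" / "checkpoints" / "hunyuan_image_3.safetensors").resolve()
    if not os.path.exists(checkpoint):
        raise ValueError(f"Hunyuan Image 3 Checkpoint on one GPU should have the path: {checkpoint}")
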
@@ -998,6 +998,7 @@ class HunyuanImage3ForCausalMM(nn.Module):
         super().__init__()
         config = kwargs
         self.config = config
         self.dtype = dtype
         factory_kwargs = {"device": device, "dtype": dtype, "operations": operations}

         self.timestep_emb = TimestepEmbedder(hidden_size=config["hidden_size"], **factory_kwargs)
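
The constructor in this hunk bundles device, dtype, and the ComfyUI-style operations module into factory_kwargs and fans them out to submodules with **. A minimal sketch of that pattern; ToyEmbedder below is a hypothetical stand-in, since TimestepEmbedder's signature is not shown in the diff:

    import torch
    import torch.nn as nn

    class ToyEmbedder(nn.Module):
        # hypothetical stand-in for TimestepEmbedder; only the kwargs plumbing is the point
        def __init__(self, hidden_size, device=None, dtype=None, operations=None):
            super().__init__()
            ops = operations if operations is not None else nn   # fall back to plain nn layers
            self.proj = ops.Linear(hidden_size, hidden_size, device=device, dtype=dtype)

    factory_kwargs = {"device": torch.device("cpu"), "dtype": torch.float32, "operations": None}
    emb = ToyEmbedder(hidden_size=16, **factory_kwargs)   # the ** fan-out used by the constructor above
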
@@ -1065,7 +1066,7 @@ class HunyuanImage3ForCausalMM(nn.Module):
         cond_exists = (joint_image[:, 0, :] != -100.0).any(dim=1).any()

         height, width = x.size(2) * 16, x.size(3) * 16
-        gen_timestep_scatter_index = 4
+        gen_timestep_scatter_index = 3

         def fn(string, func = self.encode_tok):
             return self.model.wte(torch.tensor(func(string) if not isinstance(func, dict) else func[string], device=inputs_embeds.device))\
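
The helper fn in this hunk embeds a string either through a callable encoder or, when handed a dict such as a special-token table, through a plain key lookup. A standalone sketch of that dispatch with toy stand-ins (encode_tok, special_tok, and wte below are hypothetical):

    import torch

    encode_tok = lambda s: [101, 102]        # hypothetical callable encoder: string -> token ids
    special_tok = {"<boi>": 7}               # hypothetical special-token table: string -> single id
    wte = torch.nn.Embedding(200, 16)        # hypothetical word-token embedding table

    def fn(string, func=encode_tok):
        ids = func(string) if not isinstance(func, dict) else func[string]
        return wte(torch.tensor(ids))

    print(fn("hello").shape)                 # torch.Size([2, 16]) -- callable path
    print(fn("<boi>", special_tok).shape)    # torch.Size([16])    -- dict-lookup path
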
@@ -1175,7 +1176,7 @@ class HunyuanImage3ForCausalMM(nn.Module):

         hidden_states = hidden_states.to(inputs_embeds.device)
         img_mask = torch.zeros(hidden_states.size(1))
-        img_mask[seq_len + x.size(1)+4:] = 1; img_mask[-1] = 0
+        img_mask[seq_len + 5:seq_len + 5 + x.size(1)] = 1

         diffusion_prediction = self.ragged_final_layer(
             hidden_states, img_mask, timestep, int(token_height), int(token_width), self.first_step)
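
The rewritten mask marks a bounded span of image tokens instead of everything after an offset. A toy illustration of what the new line selects, assuming seq_len counts text tokens and the 5-token offset covers boundary/special tokens (that token layout is an inference from the diff, not something it states):

    import torch

    # hypothetical sizes: seq_len text tokens, a 5-token offset, num_img_tokens image tokens, 2 trailing tokens
    seq_len, offset, num_img_tokens = 10, 5, 6
    total = seq_len + offset + num_img_tokens + 2

    img_mask = torch.zeros(total)
    img_mask[seq_len + offset:seq_len + offset + num_img_tokens] = 1   # mark exactly the image-token span
    print(img_mask.nonzero().flatten())                                # indices 15..20
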
@@ -483,7 +483,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["text_emb_dim"] = 2048
         return dit_config

-    if "{}layers.32.mlp.gate_and_up_proj.weight".format(key_prefix) in state_dict_keys:
+    if "__SKIP__{}model.layers.0.mlp.experts.0.down_proj.weight" or "{}model.layers.0.mlp.experts.0.down_proj.weight".format(key_prefix) in state_dict_keys: # Hunyaun Image 3
         dit_config = {}
         dit_config["image_model"] = "hunyuan_image_3"
         dit_config["hidden_size"] = 4096
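
One thing to note about the new condition: Python's "or" binds looser than "in", and its left operand is a non-empty string, so the expression is always truthy no matter what is in state_dict_keys, which effectively forces this detection branch to match. Whether that is intentional testing scaffolding is an inference; the behavior itself is easy to confirm:

    state_dict_keys = []                                           # even with no keys at all
    cond = "__SKIP__whatever" or ("some.key" in state_dict_keys)
    print(bool(cond))                                              # True: the non-empty string short-circuits the "or"
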
@@ -1,7 +1,7 @@
 from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
-from transformers import ByT5Tokenizer
+from transformers import ByT5Tokenizer, AutoTokenizer
 import torch
 import os
 import re
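
The added AutoTokenizer import is consumed by the tokenizer class in the next hunk, which points it at a tokenizer directory resolved next to the module file. A minimal sketch of that loading pattern; the hunyuan3_tokenizer directory name comes from the diff, and whether SDTokenizer calls from_pretrained internally is an assumption here:

    import os
    from transformers import AutoTokenizer

    # resolve the tokenizer folder relative to this file, as the new __init__ does
    tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hunyuan3_tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)   # loads tokenizer files from the local directory
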
@@ -20,8 +20,9 @@ class HunyuanImage3(sd1_clip.SDClipModel):
         super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)

 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
-        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)
+    def __init__(self, tokenizer_path="hunyuan3_tokenizer", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=4096, embedding_key='clip_l', tokenizer_class=AutoTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=False, min_length=None, pad_token=128009, end_token=None, min_padding=None):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), tokenizer_path)
+        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding)

 class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):

@@ -40,7 +40,6 @@ class HunyuanImage3Conditioning(io.ComfyNode):
             category="conditioning/video_models",
             inputs = [
                 io.Conditioning.Input("text_encoding_positive"),
-                io.Clip.Input("clip"),
                 io.Model.Input("model"),
                 io.Conditioning.Input("vae_encoding", optional=True),
                 io.Conditioning.Input("vit_encoding", optional=True),

@@ -50,11 +49,13 @@ class HunyuanImage3Conditioning(io.ComfyNode):
         )

     @classmethod
-    def execute(cls, text_encoding, clip, model, text_encoding_negative=None, vae_encoding = None, vit_encoding = None):
-        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
-        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
+    def execute(cls, text_encoding, model, text_encoding_negative=None, vae_encoding = None, vit_encoding = None):
+        model = model.diffusion_model
+
+        encode_fn = model.encode_tok
+        special_fn = model.special_tok
+        word_embed = model.wte

-        word_embed = clip.wte
         patch_embed = model.patch_embed
         t_embed = model.time_embed

@@ -80,12 +81,7 @@ class HunyuanImage3Conditioning(io.ComfyNode):
         vae_mask = torch.empty_like(joint_image)

         ragged_tensors = torch.nested.nested_tensor([joint_image, vae_mask, text_tokens.to(joint_image.dtype)])

-        uncond_ragged_tensors = None
-        if text_encoding_negative is not None:
-            uncond_ragged_tensors, _ = cls.execute(vae_encoding, vit_encoding, text_encoding_negative, clip=clip, text_encoding_negative = None)
-        else:
-            uncond_ragged_tensors = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged_tensors.unbind()])
+        uncond_ragged_tensors = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged_tensors.unbind()])

         if uncond_ragged_tensors is not None:
             positive = [[ragged_tensors, {}]]
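
The simplification in this hunk always builds the unconditional input by zeroing each component of the nested tensor instead of re-running execute for the negative branch. A small sketch of that zeroing pattern on ragged components (the shapes below are made up):

    import torch

    # components of different lengths, which is what torch.nested exists for
    ragged = torch.nested.nested_tensor([torch.randn(3, 8), torch.randn(5, 8), torch.randn(2, 8)])

    # zero every component while keeping its shape, as the new uncond_ragged_tensors line does
    uncond = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged.unbind()])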