Hunyuan Image 3: checkpoint path, model detection, tokenizer and conditioning node updates.

Summary of the hunks below:
  * comfy/ldm/hunyuan_image_3/model.py
      - look for the checkpoint under models/checkpoints (was models/checkpoint)
      - store the constructor's dtype on HunyuanImage3ForCausalMM
      - change gen_timestep_scatter_index from 4 to 3
      - build img_mask from an explicit [seq_len + 5, seq_len + 5 + x.size(1)) slice
  * comfy/model_detection.py
      - detect the model by the key "model.layers.0.mlp.experts.0.down_proj.weight"
  * comfy/text_encoders/hunyuan_image.py
      - HunyuanImage3Tokenizer loads an AutoTokenizer from a directory next to the module
  * comfy_extras/nodes_hunyuan_image.py
      - HunyuanImage3Conditioning no longer takes a CLIP input and reads its
        helpers from model.diffusion_model; the negative branch is always zeros

Review notes:
  * FIX: the detection condition previously read
        if "__SKIP__{}..." or "{}...".format(key_prefix) in state_dict_keys:
    A non-empty string literal is truthy, so that expression was always true and
    every state dict reaching this check would be detected as hunyuan_image_3.
    The constant operand has been removed so the membership test is effective.
  * NOTE(review): in the last hunk `uncond_ragged_tensors` is now assigned
    unconditionally, so the following `if ... is not None` context line can no
    longer be false - confirm whether that guard should be removed.
  * NOTE(review): this patch was recovered from text whose line breaks and
    indentation had been flattened. Indentation and the position of blank context
    lines were reconstructed to match the hunk line counts - verify against the
    target files before applying.

diff --git a/comfy/ldm/hunyuan_image_3/model.py b/comfy/ldm/hunyuan_image_3/model.py
index aa3dce524..28a296f06 100644
--- a/comfy/ldm/hunyuan_image_3/model.py
+++ b/comfy/ldm/hunyuan_image_3/model.py
@@ -596,7 +596,7 @@ class LazyMoELoader(nn.Module):
 
     def get_checkpoint(self):
         comfyui_dir = Path.home() / "ComfyUI"
-        checkpoint = comfyui_dir / "models" / "checkpoint" / "hunyuan_image_3.safetensors"
+        checkpoint = comfyui_dir / "models" / "checkpoints" / "hunyuan_image_3.safetensors"
         checkpoint = checkpoint.resolve()
         if not os.path.exists(checkpoint):
             raise ValueError(f"Hunyuan Image 3 Checkpoint on one GPU should have the path: {checkpoint}")
@@ -998,6 +998,7 @@ class HunyuanImage3ForCausalMM(nn.Module):
         super().__init__()
         config = kwargs
         self.config = config
+        self.dtype = dtype
         factory_kwargs = {"device": device, "dtype": dtype, "operations": operations}
 
         self.timestep_emb = TimestepEmbedder(hidden_size=config["hidden_size"], **factory_kwargs)
@@ -1065,7 +1066,7 @@ class HunyuanImage3ForCausalMM(nn.Module):
 
         cond_exists = (joint_image[:, 0, :] != -100.0).any(dim=1).any()
         height, width = x.size(2) * 16, x.size(3) * 16
-        gen_timestep_scatter_index = 4
+        gen_timestep_scatter_index = 3
 
         def fn(string, func = self.encode_tok):
             return self.model.wte(torch.tensor(func(string) if not isinstance(func, dict) else func[string], device=inputs_embeds.device))\
@@ -1175,7 +1176,7 @@ class HunyuanImage3ForCausalMM(nn.Module):
         hidden_states = hidden_states.to(inputs_embeds.device)
 
         img_mask = torch.zeros(hidden_states.size(1))
-        img_mask[seq_len + x.size(1)+4:] = 1; img_mask[-1] = 0
+        img_mask[seq_len + 5:seq_len + 5 + x.size(1)] = 1
 
         diffusion_prediction = self.ragged_final_layer(
             hidden_states, img_mask, timestep, int(token_height), int(token_width), self.first_step)
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 246596167..37fbd0ad3 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -483,7 +483,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["text_emb_dim"] = 2048
         return dit_config
 
-    if "{}layers.32.mlp.gate_and_up_proj.weight".format(key_prefix) in state_dict_keys:
+    if "{}model.layers.0.mlp.experts.0.down_proj.weight".format(key_prefix) in state_dict_keys:  # Hunyuan Image 3
         dit_config = {}
         dit_config["image_model"] = "hunyuan_image_3"
         dit_config["hidden_size"] = 4096
diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py
index c67e08096..4d3e13f90 100644
--- a/comfy/text_encoders/hunyuan_image.py
+++ b/comfy/text_encoders/hunyuan_image.py
@@ -1,7 +1,7 @@
 from comfy import sd1_clip
 import comfy.text_encoders.llama
 from .qwen_image import QwenImageTokenizer, QwenImageTEModel
-from transformers import ByT5Tokenizer
+from transformers import ByT5Tokenizer, AutoTokenizer
 import torch
 import os
 import re
@@ -20,8 +20,9 @@ class HunyuanImage3(sd1_clip.SDClipModel):
         super().__init__(device, max_length, freeze, layer, layer_idx, textmodel_json_config, dtype, model_class, layer_norm_hidden_state, enable_attention_masks, zero_out_masked, return_projected_pooled, return_attention_masks, model_options)
 
 class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
-        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)
+    def __init__(self, tokenizer_path="hunyuan3_tokenizer", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=4096, embedding_key='clip_l', tokenizer_class=AutoTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=False, min_length=None, pad_token=128009, end_token=None, min_padding=None):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), tokenizer_path)
+        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding)
 
 class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
diff --git a/comfy_extras/nodes_hunyuan_image.py b/comfy_extras/nodes_hunyuan_image.py
index 8eb4d8b80..e4711ab9a 100644
--- a/comfy_extras/nodes_hunyuan_image.py
+++ b/comfy_extras/nodes_hunyuan_image.py
@@ -40,7 +40,6 @@ class HunyuanImage3Conditioning(io.ComfyNode):
             category="conditioning/video_models",
             inputs = [
                 io.Conditioning.Input("text_encoding_positive"),
-                io.Clip.Input("clip"),
                 io.Model.Input("model"),
                 io.Conditioning.Input("vae_encoding", optional=True),
                 io.Conditioning.Input("vit_encoding", optional=True),
@@ -50,11 +49,13 @@ class HunyuanImage3Conditioning(io.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, text_encoding, clip, model, text_encoding_negative=None, vae_encoding = None, vit_encoding = None):
-        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
-        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
+    def execute(cls, text_encoding, model, text_encoding_negative=None, vae_encoding = None, vit_encoding = None):
+        model = model.diffusion_model
+
+        encode_fn = model.encode_tok
+        special_fn = model.special_tok
+        word_embed = model.wte
 
-        word_embed = clip.wte
         patch_embed = model.patch_embed
         t_embed = model.time_embed
 
@@ -80,12 +81,7 @@ class HunyuanImage3Conditioning(io.ComfyNode):
         vae_mask = torch.empty_like(joint_image)
 
         ragged_tensors = torch.nested.nested_tensor([joint_image, vae_mask, text_tokens.to(joint_image.dtype)])
-
-        uncond_ragged_tensors = None
-        if text_encoding_negative is not None:
-            uncond_ragged_tensors, _ = cls.execute(vae_encoding, vit_encoding, text_encoding_negative, clip=clip, text_encoding_negative = None)
-        else:
-            uncond_ragged_tensors = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged_tensors.unbind()])
+        uncond_ragged_tensors = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged_tensors.unbind()])
 
         if uncond_ragged_tensors is not None:
             positive = [[ragged_tensors, {}]]