From ea176bb87d34baaef2fef236b091dc726e9d7d8d Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Thu, 20 Nov 2025 23:47:57 +0200
Subject: [PATCH] basic support for hunyuan model

---
 comfy/latent_formats.py              |  5 +++++
 comfy/supported_models.py            | 13 ++++++++++++-
 comfy/text_encoders/hunyuan_image.py |  8 ++++++++
 comfy_extras/nodes_hunyuan_image.py  | 22 ++++++++++++----------
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 77e642a94..a13c281dd 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
     latent_dimensions = 1
     scale_factor = 1.0188137142395404
 
+class HunyuanImage3(LatentFormat):
+    latent_channels = 32
+    scale_factor = 0.562679178327931
+    latent_dimensions = 3
+
 class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 4064bdae1..5ae2baef0 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))
 
+class HunyuanImage3(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "hunyuan_image_3",
+    }
+    latent_format = latent_formats.HunyuanImage3
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.HunyuanImage3(self, device = device)
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
         "image_model": "hunyuan_video",
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
         out = model_base.HunyuanImage21Refiner(self, device=device)
         return out
 
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
 
 models += [SVD_img2vid]
diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py
index ff04726e1..ab3512201 100644
--- a/comfy/text_encoders/hunyuan_image.py
+++ b/comfy/text_encoders/hunyuan_image.py
@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
 import os
 import re
 
+class DummyClip:
+    def __init__(*args, **kwargs):
+        pass
+
+class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
+        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)
+
 class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")
diff --git a/comfy_extras/nodes_hunyuan_image.py b/comfy_extras/nodes_hunyuan_image.py
index 012ac8a08..e02702995 100644
--- a/comfy_extras/nodes_hunyuan_image.py
+++ b/comfy_extras/nodes_hunyuan_image.py
@@ -22,19 +22,20 @@ class EmptyLatentHunyuanImage3(io.ComfyNode):
                 io.Int.Input("height", min = 1, default = 512),
                 io.Int.Input("width", min = 1, default = 512),
                 io.Int.Input("batch_size", min = 1, max = 48_000, default = 1),
-                io.Clip.Input("clip")
+                io.Clip.Input("clip"),
+                io.Model.Input("model")
             ],
             outputs=[io.Latent.Output(display_name="latent")]
         )
 
     @classmethod
-    def execute(cls, height, width, batch_size, clip):
+    def execute(cls, height, width, batch_size, clip, model):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder # may convert clip.tokenizer -> clip.
 
-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed
 
         height, width = get_target_size(height, width)
         latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
@@ -63,20 +64,21 @@ class HunyuanImage3Conditioning(io.ComfyNode):
                 io.Conditioning.Input("vae_encoding"),
                 io.Conditioning.Input("vit_encoding"),
                 io.Conditioning.Input("text_encoding_positive"),
+                io.Clip.Input("clip"),
+                io.Model.Input("model"),
                 io.Conditioning.Input("text_encoding_negative", optional = True),
-                io.Clip.Input("clip")
             ],
             outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
         )
 
     @classmethod
-    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, text_encoding_negative=None):
+    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, model, text_encoding_negative=None):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
 
-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed
 
         batch_size, _, hidden_size = vit_encoding.shape
         def fn(string, func = encode_fn):