basic support for the Hunyuan Image 3 model
parent b84af5b947
commit ea176bb87d
@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
     latent_dimensions = 1
     scale_factor = 1.0188137142395404

+class HunyuanImage3(LatentFormat):
+    latent_channels = 32
+    scale_factor = 0.562679178327931
+    latent_dimensions = 3
+
 class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
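For reference, a LatentFormat subclass in this file mostly records the latent channel count and the scaling constant for a model's VAE latents. A minimal sketch of roughly how such a scale factor is conventionally applied (illustrative only, not copied from the repository; method names are for exposition):

import torch

class LatentFormatSketch:
    # constants taken from the HunyuanImage3 entry added above
    latent_channels = 32
    latent_dimensions = 3
    scale_factor = 0.562679178327931

    def process_in(self, latent: torch.Tensor) -> torch.Tensor:
        # scale raw VAE latents into the range the diffusion model works in
        return latent * self.scale_factor

    def process_out(self, latent: torch.Tensor) -> torch.Tensor:
        # undo the scaling before handing latents back to the VAE decoder
        return latent / self.scale_factor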
@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))

+class HunyuanImage3(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "hunyuan_image_3",
+    }
+    latent_format = latent_formats.HunyuanImage3
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.HunyuanImage3(self, device = device)
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
         "image_model": "hunyuan_video",
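This follows the usual shape of a supported-model entry: unet_config is the fingerprint matched against the config detected from a checkpoint, latent_format points at the class added above, get_model wraps the weights in a model_base class, and clip_target pairs a tokenizer with a text-encoder class. Pairing the tokenizer with DummyClip (defined below in the text-encoder module) suggests that text embedding is handled by the diffusion model itself rather than by a separate CLIP/T5 encoder, which matches the node changes further down that read wte, patch_embed and time_embed from the model. A rough, hypothetical illustration of first-match detection over such entries (not the repository's actual detection code):

def find_supported_model(detected_config: dict, registry: list):
    # return the first entry whose unet_config keys all agree with the
    # config detected from the checkpoint's state dict
    for candidate in registry:
        if all(detected_config.get(k) == v for k, v in candidate.unet_config.items()):
            return candidate
    return None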
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
         out = model_base.HunyuanImage21Refiner(self, device=device)
         return out

-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]

 models += [SVD_img2vid]
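The only change here is that HunyuanImage3 is inserted into the models registry, between HunyuanImage21 and HunyuanVideoSkyreelsI2V, so it now participates in checkpoint detection. Assuming this file is comfy/supported_models.py, a quick sanity check that the entry is registered might look like this (illustrative, not part of the commit):

import comfy.supported_models as supported_models

# the new entry should now appear in the detection registry
assert any(m.__name__ == "HunyuanImage3" for m in supported_models.models)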
@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
 import os
 import re

+class DummyClip:
+    def __init__(*args, **kwargs):
+        pass
+
+class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
+        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)
+
 class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")
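Two details worth flagging in this hunk. DummyClip.__init__ omits an explicit self parameter; the call still works because self is swallowed by *args, but it reads as an oversight. The HunyuanImage3Tokenizer signature also keeps ... (Ellipsis) as the defaults for tokenizer_class, tokenizer_data and tokenizer_args, which would be forwarded straight to SDTokenizer if a caller ever relied on them, so presumably real values are always supplied. A tidier equivalent of the stub, as a sketch only:

class DummyClip:
    # placeholder text-encoder class: accepts and ignores all arguments
    def __init__(self, *args, **kwargs):
        pass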
@@ -22,19 +22,20 @@ class EmptyLatentHunyuanImage3(io.ComfyNode):
                 io.Int.Input("height", min = 1, default = 512),
                 io.Int.Input("width", min = 1, default = 512),
                 io.Int.Input("batch_size", min = 1, max = 48_000, default = 1),
-                io.Clip.Input("clip")
+                io.Clip.Input("clip"),
+                io.Model.Input("model")
             ],
             outputs=[io.Latent.Output(display_name="latent")]
         )
     @classmethod
-    def execute(cls, height, width, batch_size, clip):
+    def execute(cls, height, width, batch_size, clip, model):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder

         # may convert clip.tokenizer -> clip.
-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed

         height, width = get_target_size(height, width)
         latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
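The node now takes the diffusion model as an explicit input and reads its embedding tables (wte, patch_embed, time_embed) from there instead of from the tokenizer wrapper. The empty latent itself is drawn with 32 channels and a 16x spatial downscale, matching the HunyuanImage3 latent format added earlier. A small, self-contained sketch of that shape arithmetic:

import torch

batch_size, height, width = 1, 1024, 1024
# 32 latent channels, 16x spatial downscale (values taken from the node above)
latent = torch.randn(batch_size, 32, height // 16, width // 16)
print(latent.shape)  # torch.Size([1, 32, 64, 64])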
@@ -63,20 +64,21 @@ class HunyuanImage3Conditioning(io.ComfyNode):
                 io.Conditioning.Input("vae_encoding"),
                 io.Conditioning.Input("vit_encoding"),
                 io.Conditioning.Input("text_encoding_positive"),
+                io.Clip.Input("clip"),
+                io.Model.Input("model"),
                 io.Conditioning.Input("text_encoding_negative", optional = True),
-                io.Clip.Input("clip")
             ],
             outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
         )

     @classmethod
-    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, text_encoding_negative=None):
+    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, model, text_encoding_negative=None):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder

-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed
         batch_size, _, hidden_size = vit_encoding.shape

         def fn(string, func = encode_fn):
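Same pattern here: the conditioning node gains a Model input and reads wte, patch_embed and time_embed from it. One thing that may be worth double-checking: the schema declares inputs with ids "text_encoding_positive" and "text_encoding_negative", while execute() declares parameters text_encoding and text_encoding_negative. If the io framework delivers inputs to execute() as keyword arguments keyed by their ids (not verified here for this API version), the positive input would not match the text_encoding parameter. A tiny, self-contained illustration of that failure mode:

def execute(text_encoding, text_encoding_negative=None):
    return text_encoding

try:
    execute(text_encoding_positive="hello")  # keyword keyed by the schema id
except TypeError as exc:
    print(exc)  # unexpected keyword argument 'text_encoding_positive'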