From ea176bb87d34baaef2fef236b091dc726e9d7d8d Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Thu, 20 Nov 2025 23:47:57 +0200
Subject: [PATCH] basic support for hunyuan model

---
 comfy/latent_formats.py              |  5 +++++
 comfy/supported_models.py            | 13 ++++++++++++-
 comfy/text_encoders/hunyuan_image.py |  8 ++++++++
 comfy_extras/nodes_hunyuan_image.py  | 22 ++++++++++++----------
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 77e642a94..a13c281dd 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
     latent_dimensions = 1
     scale_factor = 1.0188137142395404
 
+class HunyuanImage3(LatentFormat):
+    latent_channels = 32
+    scale_factor = 0.562679178327931
+    latent_dimensions = 3
+
 class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 4064bdae1..5ae2baef0 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))
 
+class HunyuanImage3(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "hunyuan_image_3",
+    }
+    latent_format = latent_formats.HunyuanImage3
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.HunyuanImage3(self, device = device)
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
         "image_model": "hunyuan_video",
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
         out = model_base.HunyuanImage21Refiner(self, device=device)
         return out
 
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
 
 models += [SVD_img2vid]
diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py
index ff04726e1..ab3512201 100644
--- a/comfy/text_encoders/hunyuan_image.py
+++ b/comfy/text_encoders/hunyuan_image.py
@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
 import os
 import re
 
+class DummyClip:
+    def __init__(*args, **kwargs):
+        pass
+
+class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
+        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)
+
 class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")
diff --git a/comfy_extras/nodes_hunyuan_image.py b/comfy_extras/nodes_hunyuan_image.py
index 012ac8a08..e02702995 100644
--- a/comfy_extras/nodes_hunyuan_image.py
+++ b/comfy_extras/nodes_hunyuan_image.py
@@ -22,19 +22,20 @@ class EmptyLatentHunyuanImage3(io.ComfyNode):
                 io.Int.Input("height", min = 1, default = 512),
                 io.Int.Input("width", min = 1, default = 512),
                 io.Int.Input("batch_size", min = 1, max = 48_000, default = 1),
-                io.Clip.Input("clip")
+                io.Clip.Input("clip"),
+                io.Model.Input("model")
             ],
             outputs=[io.Latent.Output(display_name="latent")]
         )
 
     @classmethod
-    def execute(cls, height, width, batch_size, clip):
+    def execute(cls, height, width, batch_size, clip, model):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder # may convert clip.tokenizer -> clip.
 
-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed
 
         height, width = get_target_size(height, width)
         latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
@@ -63,20 +64,21 @@ class HunyuanImage3Conditioning(io.ComfyNode):
                 io.Conditioning.Input("vae_encoding"),
                 io.Conditioning.Input("vit_encoding"),
                 io.Conditioning.Input("text_encoding_positive"),
+                io.Clip.Input("clip"),
+                io.Model.Input("model"),
                 io.Conditioning.Input("text_encoding_negative", optional = True),
-                io.Clip.Input("clip")
             ],
             outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
         )
 
     @classmethod
-    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, text_encoding_negative=None):
+    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, model, text_encoding_negative=None):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
 
-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed
 
         batch_size, _, hidden_size = vit_encoding.shape
         def fn(string, func = encode_fn):