commit e3d5079d26
Author: Yousef R. Gamaleldin
Date: 2025-11-20 21:58:25 -05:00 (committed by GitHub)
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
8 changed files with 1292 additions and 1 deletions


@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
    latent_dimensions = 1
    scale_factor = 1.0188137142395404

class HunyuanImage3(LatentFormat):
    latent_channels = 32
    scale_factor = 0.562679178327931
    latent_dimensions = 3

class ACEAudio(LatentFormat):
    latent_channels = 8
    latent_dimensions = 2
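Note on the new HunyuanImage3 latent format above (annotation, not part of the diff): the scale_factor is consumed by ComfyUI's LatentFormat process_in/process_out path; a minimal sketch of the assumed default behavior follows.

# Sketch only, assuming the default LatentFormat behavior of scaling latents
# on the way into the model and unscaling them on the way out.
def process_in(latent_format, latent):
    return latent * latent_format.scale_factor   # 0.562679178327931 for HunyuanImage3

def process_out(latent_format, latent):
    return latent / latent_format.scale_factor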

File diff suppressed because it is too large.


@@ -46,6 +46,7 @@ import comfy.ldm.chroma.model
import comfy.ldm.chroma_radiance.model
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.hunyuan_image_3.model
import comfy.ldm.qwen_image.model
import comfy.model_management
@@ -1355,6 +1356,10 @@ class Hunyuan3Dv2(BaseModel):
        if guidance is not None:
            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        return out

class HunyuanImage3(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device, unet_model=comfy.ldm.hunyuan_image_3.model.HunyuanImage3ForCausalMM)
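# (Annotation, not part of the commit.) HunyuanImage3 relies on the usual BaseModel
# behavior: the unet_model class passed to super().__init__ is assumed to be
# instantiated from the detected unet config, roughly
#     self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
# so HunyuanImage3ForCausalMM receives hidden_size, num_hidden_layers, etc. from
# the detection code in the next hunk.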
class Hunyuan3Dv2_1(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):


@@ -482,6 +482,17 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["patch_size"] = 2
        dit_config["text_emb_dim"] = 2048
        return dit_config
    if "{}layers.32.mlp.gate_and_up_proj.weight".format(key_prefix) in state_dict_keys: # Hunyuan Image 3
        dit_config = {}
        dit_config["image_model"] = "hunyuan_image_3"
        dit_config["hidden_size"] = 4096
        dit_config["max_position_embeddings"] = 12800
        dit_config["num_attention_heads"] = 32
        dit_config["rms_norm_eps"] = 1e-05
        dit_config["num_hidden_layers"] = 32
        dit_config["attention_head_dim"] = 128
        return dit_config
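    # (Annotation, not part of the commit.) The probe key above is expanded with the
    # checkpoint's key prefix before the membership test, e.g. for a prefix of
    # "model.diffusion_model." the lookup becomes:
    #     "model.diffusion_model.layers.32.mlp.gate_and_up_proj.weight" in state_dict_keys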
    if '{}blocks.0.mlp.layer1.weight'.format(key_prefix) in state_dict_keys: # Cosmos predict2
        dit_config = {}


@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))

class HunyuanImage3(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan_image_3",
    }

    latent_format = latent_formats.HunyuanImage3

    def get_model(self, state_dict, prefix="", device=None):
        return model_base.HunyuanImage3(self, device=device)

    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)

class HunyuanImage21(HunyuanVideo):
    unet_config = {
        "image_model": "hunyuan_video",
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
        out = model_base.HunyuanImage21Refiner(self, device=device)
        return out
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
models += [SVD_img2vid]
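For context (an assumption about ComfyUI's loader, not something this diff changes): the entries in `models` are matched against the config returned by detect_unet_config, so declaring only {"image_model": "hunyuan_image_3"} is enough to route detected checkpoints to the HunyuanImage3 class. A rough sketch of that selection, using a hypothetical helper name:

# Hypothetical sketch of the matching step, not ComfyUI's exact implementation.
def pick_model_config(detected_unet_config, models):
    for candidate in models:
        # a candidate matches when every key it declares agrees with the detected config
        if all(detected_unet_config.get(k) == v for k, v in candidate.unet_config.items()):
            return candidate(detected_unet_config)
    return None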


@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
import os
import re

class DummyClip:
    def __init__(self, *args, **kwargs):
        pass

class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)

class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")


@@ -0,0 +1,122 @@
import torch
import comfy.model_management
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io

COMPUTED_RESO_GROUPS = ['512x2048', '512x1984', '512x1920', '512x1856', '512x1792', '512x1728', '512x1664', '512x1600', '512x1536', '576x1472', '640x1408', '704x1344', '768x1280', '832x1216', '896x1152', '960x1088', '1024x1024', '1088x960', '1152x896', '1216x832', '1280x768', '1344x704', '1408x640', '1472x576', '1536x512', '1600x512', '1664x512', '1728x512', '1792x512', '1856x512', '1920x512', '1984x512', '2048x512']
RATIOS = [torch.tensor(int(r.split("x")[0]) / int(r.split("x")[1])) for r in COMPUTED_RESO_GROUPS]

def get_target_size(height, width):
    ratio = height / width
    idx = torch.argmin(torch.abs(torch.tensor(RATIOS) - ratio))
    reso = COMPUTED_RESO_GROUPS[idx]
    return reso.split("x")
class EmptyLatentHunyuanImage3(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="EmptyLatentHunyuanImage3",
            display_name="EmptyLatentHunyuanImage3",
            category="image/latent",
            inputs=[
                io.Int.Input("height", min=1, default=512),
                io.Int.Input("width", min=1, default=512),
                io.Int.Input("batch_size", min=1, max=48_000, default=1),
                io.Clip.Input("clip"),
                io.Model.Input("model"),
            ],
            outputs=[io.Latent.Output(display_name="latent")]
        )

    @classmethod
    def execute(cls, height, width, batch_size, clip, model):
        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
        # may convert clip.tokenizer -> clip.
        word_embed = model.wte
        patch_embed = model.patch_embed
        t_embed = model.time_embed

        height, width = get_target_size(height, width)
        latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
        latent, tk_height, tk_width = patch_embed(latent, t_embed(torch.tensor([0]).repeat(batch_size)))

        def tk_fn(token):
            return torch.tensor([token], device=latent.device, dtype=latent.dtype).unsqueeze(1).expand(batch_size, 1, latent.size(-1))

        def fn(string, func=encode_fn):
            return word_embed(torch.tensor(func(string) if not isinstance(func, dict) else func[string], device=comfy.model_management.intermediate_device()))\
                .unsqueeze(0).expand(batch_size, -1, -1)

        latent = torch.cat([fn("<boi>"), fn("<img_size_1024>", func=special_fn), fn(f"<img_ratio_{int(height) // int(width)}>", special_fn), fn("<timestep>", special_fn), latent, fn("<eoi>")], dim=1)
        latent = torch.cat([latent, tk_fn(tk_height), tk_fn(tk_width)], dim=1)
        return io.NodeOutput({"samples": latent, "type": "hunyuan_image_3"})
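# (Annotation, not part of the commit.) The "latent" produced above is not a raw VAE
# latent but an already-embedded token sequence:
#   <boi>, <img_size_1024>, <img_ratio_N>, <timestep>, patch-embedded noise, <eoi>,
# followed by two trailing tokens that carry the token-grid height and width
# returned by patch_embed.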
class HunyuanImage3Conditioning(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="HunyuanImage3Conditioning",
            display_name="HunyuanImage3Conditioning",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("vae_encoding"),
                io.Conditioning.Input("vit_encoding"),
                io.Conditioning.Input("text_encoding_positive"),
                io.Clip.Input("clip"),
                io.Model.Input("model"),
                io.Conditioning.Input("text_encoding_negative", optional=True),
            ],
            outputs=[io.Conditioning.Output(display_name="positive"), io.Conditioning.Output(display_name="negative")]
        )

    @classmethod
    def execute(cls, vae_encoding, vit_encoding, text_encoding_positive, clip, model, text_encoding_negative=None):
        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
        word_embed = model.wte
        patch_embed = model.patch_embed
        t_embed = model.time_embed
        batch_size, _, hidden_size = vit_encoding.shape

        def fn(string, func=encode_fn):
            return word_embed(torch.tensor(func(string) if not isinstance(func, dict) else func[string], device=comfy.model_management.intermediate_device()))\
                .view(1, 1, hidden_size).expand(batch_size, -1, hidden_size)

        text_tokens = text_encoding_positive[0][0]
        vae_encoding, _, _ = patch_embed(vae_encoding, t_embed(torch.tensor([0]).repeat(vae_encoding.size(0))))
        # should dynamically change in model logic
        joint_image = torch.cat([fn("<boi>"), fn("<img_size_1024>", special_fn), fn("<img_ratio_3>", special_fn), fn("<timestep>", special_fn), vae_encoding, fn("<joint_img_sep>"), vit_encoding, fn("<eoi>")], dim=1)
        vae_mask = torch.ones(joint_image.size(1))
        vae_mask[:3] = torch.zeros(3)
        vae_mask[vae_encoding.size(1) + 4:] = torch.zeros(len(vae_mask[vae_encoding.size(1) + 4:]))
        ragged_tensors = torch.nested.nested_tensor([joint_image, vae_mask.unsqueeze(0).unsqueeze(-1), text_tokens.to(joint_image.dtype)])

        uncond_ragged_tensors = None
        if text_encoding_negative is not None:
            uncond_ragged_tensors, _ = cls.execute(vae_encoding, vit_encoding, text_encoding_negative, clip=clip, model=model, text_encoding_negative=None)
        else:
            uncond_ragged_tensors = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged_tensors.unbind()])

        if uncond_ragged_tensors is not None:
            positive = [[ragged_tensors, {}]]
            negative = [[uncond_ragged_tensors, {}]]
        else:
            positive = ragged_tensors
            negative = uncond_ragged_tensors
        return positive, negative
class Image3Extension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            HunyuanImage3Conditioning,
            EmptyLatentHunyuanImage3,
        ]

async def comfy_entrypoint() -> Image3Extension:
    return Image3Extension()


@@ -2326,6 +2326,7 @@ async def init_builtin_extra_nodes():
        "nodes_ace.py",
        "nodes_string.py",
        "nodes_camera_trajectory.py",
        "nodes_hunyuan_image.py",
        "nodes_edit_model.py",
        "nodes_tcfg.py",
        "nodes_context_windows.py",