basic support for hunyuan model

This commit is contained in:
Yousef Rafat 2025-11-20 23:47:57 +02:00
parent b84af5b947
commit ea176bb87d
4 changed files with 37 additions and 11 deletions

View File

@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
    latent_dimensions = 1
    scale_factor = 1.0188137142395404

class HunyuanImage3(LatentFormat):
    latent_channels = 32
    scale_factor = 0.562679178327931
    latent_dimensions = 3

class ACEAudio(LatentFormat):
    latent_channels = 8
    latent_dimensions = 2
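
For reference, a minimal sketch of how these attributes are consumed (the LatentFormat stand-in below is simplified and illustrative only; process_in/process_out mirror the stock scale-in/scale-out behaviour of comfy.latent_formats):

    import torch

    class LatentFormat:  # simplified stand-in for comfy.latent_formats.LatentFormat
        scale_factor = 1.0
        def process_in(self, latent):   # applied before the latent enters the diffusion model
            return latent * self.scale_factor
        def process_out(self, latent):  # applied before handing the latent back to the VAE
            return latent / self.scale_factor

    class HunyuanImage3(LatentFormat):
        latent_channels = 32
        scale_factor = 0.562679178327931
        latent_dimensions = 3

    fmt = HunyuanImage3()
    x = torch.randn(1, 32, 32, 32)
    assert torch.allclose(fmt.process_out(fmt.process_in(x)), x)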

View File

@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))
class HunyuanImage3(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan_image_3",
    }

    latent_format = latent_formats.HunyuanImage3

    def get_model(self, state_dict, prefix="", device=None):
        return model_base.HunyuanImage3(self, device=device)

    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)

class HunyuanImage21(HunyuanVideo):
    unet_config = {
        "image_model": "hunyuan_video",
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
        out = model_base.HunyuanImage21Refiner(self, device=device)
        return out
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
models += [SVD_img2vid]
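
Adding HunyuanImage3 to this list is what makes the architecture discoverable at load time; roughly, checkpoint detection walks the list and picks the first entry whose declared unet_config matches what was detected from the state dict. A minimal sketch of that lookup (illustrative only; the real logic lives in comfy.model_detection and also inspects state dict keys):

    def pick_model_config(detected_unet_config, models):
        # return the first supported-model class whose declared unet_config
        # is a subset of the detected architecture description
        for cls in models:
            if all(detected_unet_config.get(k) == v for k, v in cls.unet_config.items()):
                return cls
        return None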

View File

@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
import os
import re

class DummyClip:
    # placeholder text-encoder target; accepts and ignores whatever the loader passes
    def __init__(self, *args, **kwargs):
        pass

class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, tokenizer_path="hunyuan_image_3", embedding_directory=None, tokenizer_data={}):
        super().__init__(tokenizer_path, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)

class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")
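
These two classes are what the clip_target added in supported_models.py pairs up. A rough sketch of that wiring, assuming ClipTarget simply records a tokenizer class and a text-model class for the loader to instantiate later (the stand-in below is illustrative, not the real supported_models_base.ClipTarget):

    class ClipTarget:  # simplified stand-in
        def __init__(self, tokenizer, clip):
            self.tokenizer = tokenizer
            self.clip = clip

    target = ClipTarget(HunyuanImage3Tokenizer, DummyClip)
    text_model = target.clip()  # constructs fine and does nothing, which is presumably enough
                                # because the node code reads the embedding tables from the model itself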

View File

@@ -22,19 +22,20 @@ class EmptyLatentHunyuanImage3(io.ComfyNode):
                io.Int.Input("height", min=1, default=512),
                io.Int.Input("width", min=1, default=512),
                io.Int.Input("batch_size", min=1, max=48_000, default=1),
                io.Clip.Input("clip")
                io.Clip.Input("clip"),
                io.Model.Input("model")
            ],
            outputs=[io.Latent.Output(display_name="latent")]
        )
    @classmethod
    def execute(cls, height, width, batch_size, clip):
    def execute(cls, height, width, batch_size, clip, model):
        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
        # may convert clip.tokenizer -> clip.
        word_embed = clip.tokenizer.wte
        patch_embed = clip.tokenizer.patch_embed
        t_embed = clip.tokenizer.time_embed
        word_embed = model.wte
        patch_embed = model.patch_embed
        t_embed = model.time_embed
        height, width = get_target_size(height, width)
        latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
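
A quick shape check for the line above (assuming get_target_size leaves the default 512x512 request unchanged; this is only meant to illustrate the arithmetic):

    import torch
    batch_size, height, width = 1, 512, 512
    latent = torch.randn(batch_size, 32, height // 16, width // 16)
    print(latent.shape)  # torch.Size([1, 32, 32, 32]) -- 32 latent channels, 16x spatial downscale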
@@ -63,20 +64,21 @@ class HunyuanImage3Conditioning(io.ComfyNode):
                io.Conditioning.Input("vae_encoding"),
                io.Conditioning.Input("vit_encoding"),
                io.Conditioning.Input("text_encoding_positive"),
                io.Clip.Input("clip"),
                io.Model.Input("model"),
                io.Conditioning.Input("text_encoding_negative", optional=True),
                io.Clip.Input("clip")
            ],
            outputs=[io.Conditioning.Output(display_name="positive"), io.Conditioning.Output(display_name="negative")]
        )
    @classmethod
    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, text_encoding_negative=None):
    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, model, text_encoding_negative=None):
        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
        word_embed = clip.tokenizer.wte
        patch_embed = clip.tokenizer.patch_embed
        t_embed = clip.tokenizer.time_embed
        word_embed = model.wte
        patch_embed = model.patch_embed
        t_embed = model.time_embed
        batch_size, _, hidden_size = vit_encoding.shape
        def fn(string, func=encode_fn):