basic support for the Hunyuan Image 3 model
parent b84af5b947
commit ea176bb87d
@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
     latent_dimensions = 1
     scale_factor = 1.0188137142395404

+class HunyuanImage3(LatentFormat):
+    latent_channels = 32
+    scale_factor = 0.562679178327931
+    latent_dimensions = 3
+
 class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
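For reference, a LatentFormat subclass in this file mostly records the latent channel count and the scaling constant for a model's VAE latents. A minimal sketch of roughly how such a scale factor is conventionally applied (illustrative only, not copied from the repository; method names are for exposition):

import torch

class LatentFormatSketch:
    # constants taken from the HunyuanImage3 entry added above
    latent_channels = 32
    latent_dimensions = 3
    scale_factor = 0.562679178327931

    def process_in(self, latent: torch.Tensor) -> torch.Tensor:
        # scale raw VAE latents into the range the diffusion model works in
        return latent * self.scale_factor

    def process_out(self, latent: torch.Tensor) -> torch.Tensor:
        # undo the scaling before handing latents back to the VAE decoder
        return latent / self.scale_factor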
@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))

+class HunyuanImage3(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "hunyuan_image_3",
+    }
+    latent_format = latent_formats.HunyuanImage3
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.HunyuanImage3(self, device = device)
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)
+
 class HunyuanImage21(HunyuanVideo):
     unet_config = {
         "image_model": "hunyuan_video",
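This follows the usual shape of a supported-model entry: unet_config is the fingerprint matched against the config detected from a checkpoint, latent_format points at the class added above, get_model wraps the weights in a model_base class, and clip_target pairs a tokenizer with a text-encoder class. Pairing the tokenizer with DummyClip (defined below in the text-encoder module) suggests that text embedding is handled by the diffusion model itself rather than by a separate CLIP/T5 encoder, which matches the node changes further down that read wte, patch_embed and time_embed from the model. A rough, hypothetical illustration of first-match detection over such entries (not the repository's actual detection code):

def find_supported_model(detected_config: dict, registry: list):
    # return the first entry whose unet_config keys all agree with the
    # config detected from the checkpoint's state dict
    for candidate in registry:
        if all(detected_config.get(k) == v for k, v in candidate.unet_config.items()):
            return candidate
    return None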
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
         out = model_base.HunyuanImage21Refiner(self, device=device)
         return out

-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]

 models += [SVD_img2vid]
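The only change here is that HunyuanImage3 is inserted into the models registry, between HunyuanImage21 and HunyuanVideoSkyreelsI2V, so it now participates in checkpoint detection. Assuming this file is comfy/supported_models.py, a quick sanity check that the entry is registered might look like this (illustrative, not part of the commit):

import comfy.supported_models as supported_models

# the new entry should now appear in the detection registry
assert any(m.__name__ == "HunyuanImage3" for m in supported_models.models)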
@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
 import os
 import re

+class DummyClip:
+    def __init__(*args, **kwargs):
+        pass
+
+class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
+        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)
+
 class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")
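Two details worth flagging in this hunk. DummyClip.__init__ omits an explicit self parameter; the call still works because self is swallowed by *args, but it reads as an oversight. The HunyuanImage3Tokenizer signature also keeps ... (Ellipsis) as the defaults for tokenizer_class, tokenizer_data and tokenizer_args, which would be forwarded straight to SDTokenizer if a caller ever relied on them, so presumably real values are always supplied. A tidier equivalent of the stub, as a sketch only:

class DummyClip:
    # placeholder text-encoder class: accepts and ignores all arguments
    def __init__(self, *args, **kwargs):
        pass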
@@ -22,19 +22,20 @@ class EmptyLatentHunyuanImage3(io.ComfyNode):
                 io.Int.Input("height", min = 1, default = 512),
                 io.Int.Input("width", min = 1, default = 512),
                 io.Int.Input("batch_size", min = 1, max = 48_000, default = 1),
-                io.Clip.Input("clip")
+                io.Clip.Input("clip"),
+                io.Model.Input("model")
             ],
             outputs=[io.Latent.Output(display_name="latent")]
         )
     @classmethod
-    def execute(cls, height, width, batch_size, clip):
+    def execute(cls, height, width, batch_size, clip, model):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder

         # may convert clip.tokenizer -> clip.
-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed

         height, width = get_target_size(height, width)
         latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
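The node now takes the diffusion model as an explicit input and reads its embedding tables (wte, patch_embed, time_embed) from there instead of from the tokenizer wrapper. The empty latent itself is drawn with 32 channels and a 16x spatial downscale, matching the HunyuanImage3 latent format added earlier. A small, self-contained sketch of that shape arithmetic:

import torch

batch_size, height, width = 1, 1024, 1024
# 32 latent channels, 16x spatial downscale (values taken from the node above)
latent = torch.randn(batch_size, 32, height // 16, width // 16)
print(latent.shape)  # torch.Size([1, 32, 64, 64])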
@@ -63,20 +64,21 @@ class HunyuanImage3Conditioning(io.ComfyNode):
                 io.Conditioning.Input("vae_encoding"),
                 io.Conditioning.Input("vit_encoding"),
                 io.Conditioning.Input("text_encoding_positive"),
+                io.Clip.Input("clip"),
+                io.Model.Input("model"),
                 io.Conditioning.Input("text_encoding_negative", optional = True),
-                io.Clip.Input("clip")
             ],
             outputs=[io.Conditioning.Output(display_name= "positive"), io.Conditioning.Output(display_name="negative")]
         )

     @classmethod
-    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, text_encoding_negative=None):
+    def execute(cls, vae_encoding, vit_encoding, text_encoding, clip, model, text_encoding_negative=None):
         encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
         special_fn = clip.tokenizer.tokenizer.added_tokens_encoder

-        word_embed = clip.tokenizer.wte
-        patch_embed = clip.tokenizer.patch_embed
-        t_embed = clip.tokenizer.time_embed
+        word_embed = model.wte
+        patch_embed = model.patch_embed
+        t_embed = model.time_embed
         batch_size, _, hidden_size = vit_encoding.shape

         def fn(string, func = encode_fn):
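Same pattern here: the conditioning node gains a Model input and reads wte, patch_embed and time_embed from it. One thing that may be worth double-checking: the schema declares inputs with ids "text_encoding_positive" and "text_encoding_negative", while execute() declares parameters text_encoding and text_encoding_negative. If the io framework delivers inputs to execute() as keyword arguments keyed by their ids (not verified here for this API version), the positive input would not match the text_encoding parameter. A tiny, self-contained illustration of that failure mode:

def execute(text_encoding, text_encoding_negative=None):
    return text_encoding

try:
    execute(text_encoding_positive="hello")  # keyword keyed by the schema id
except TypeError as exc:
    print(exc)  # unexpected keyword argument 'text_encoding_positive'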