commit e3d5079d26
Author: Yousef R. Gamaleldin
Date: 2025-11-20 21:58:25 -05:00 (committed by GitHub)
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
8 changed files with 1292 additions and 1 deletions


@@ -626,6 +626,11 @@ class Hunyuan3Dv2mini(LatentFormat):
    latent_dimensions = 1
    scale_factor = 1.0188137142395404

class HunyuanImage3(LatentFormat):
    latent_channels = 32
    scale_factor = 0.562679178327931
    latent_dimensions = 3

class ACEAudio(LatentFormat):
    latent_channels = 8
    latent_dimensions = 2
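Note on the new HunyuanImage3 latent format above (annotation, not part of the diff): the scale_factor is consumed by ComfyUI's LatentFormat process_in/process_out path; a minimal sketch of the assumed default behavior follows.

# Sketch only, assuming the default LatentFormat behavior of scaling latents
# on the way into the model and unscaling them on the way out.
def process_in(latent_format, latent):
    return latent * latent_format.scale_factor   # 0.562679178327931 for HunyuanImage3

def process_out(latent_format, latent):
    return latent / latent_format.scale_factor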

File diff suppressed because it is too large.


@@ -46,6 +46,7 @@ import comfy.ldm.chroma.model
import comfy.ldm.chroma_radiance.model
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.hunyuan_image_3.model
import comfy.ldm.qwen_image.model
import comfy.model_management
@@ -1355,6 +1356,10 @@ class Hunyuan3Dv2(BaseModel):
        if guidance is not None:
            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        return out

class HunyuanImage3(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device, unet_model=comfy.ldm.hunyuan_image_3.model.HunyuanImage3ForCausalMM)
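# (Annotation, not part of the commit.) HunyuanImage3 relies on the usual BaseModel
# behavior: the unet_model class passed to super().__init__ is assumed to be
# instantiated from the detected unet config, roughly
#     self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
# so HunyuanImage3ForCausalMM receives hidden_size, num_hidden_layers, etc. from
# the detection code in the next hunk.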
class Hunyuan3Dv2_1(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):


@@ -482,6 +482,17 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["patch_size"] = 2
        dit_config["text_emb_dim"] = 2048
        return dit_config
    if "{}layers.32.mlp.gate_and_up_proj.weight".format(key_prefix) in state_dict_keys: # Hunyuan Image 3
        dit_config = {}
        dit_config["image_model"] = "hunyuan_image_3"
        dit_config["hidden_size"] = 4096
        dit_config["max_position_embeddings"] = 12800
        dit_config["num_attention_heads"] = 32
        dit_config["rms_norm_eps"] = 1e-05
        dit_config["num_hidden_layers"] = 32
        dit_config["attention_head_dim"] = 128
        return dit_config
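    # (Annotation, not part of the commit.) The probe key above is expanded with the
    # checkpoint's key prefix before the membership test, e.g. for a prefix of
    # "model.diffusion_model." the lookup becomes:
    #     "model.diffusion_model.layers.32.mlp.gate_and_up_proj.weight" in state_dict_keys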
    if '{}blocks.0.mlp.layer1.weight'.format(key_prefix) in state_dict_keys: # Cosmos predict2
        dit_config = {}


@@ -1332,6 +1332,17 @@ class QwenImage(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))

class HunyuanImage3(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan_image_3",
    }

    latent_format = latent_formats.HunyuanImage3

    def get_model(self, state_dict, prefix="", device=None):
        return model_base.HunyuanImage3(self, device=device)

    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImage3Tokenizer, comfy.text_encoders.hunyuan_image.DummyClip)

class HunyuanImage21(HunyuanVideo):
    unet_config = {
        "image_model": "hunyuan_video",
@@ -1374,6 +1385,6 @@ class HunyuanImage21Refiner(HunyuanVideo):
        out = model_base.HunyuanImage21Refiner(self, device=device)
        return out
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanImage3, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
models += [SVD_img2vid]
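For context (an assumption about ComfyUI's loader, not something this diff changes): the entries in `models` are matched against the config returned by detect_unet_config, so declaring only {"image_model": "hunyuan_image_3"} is enough to route detected checkpoints to the HunyuanImage3 class. A rough sketch of that selection, using a hypothetical helper name:

# Hypothetical sketch of the matching step, not ComfyUI's exact implementation.
def pick_model_config(detected_unet_config, models):
    for candidate in models:
        # a candidate matches when every key it declares agrees with the detected config
        if all(detected_unet_config.get(k) == v for k, v in candidate.unet_config.items()):
            return candidate(detected_unet_config)
    return None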


@@ -5,6 +5,14 @@ from transformers import ByT5Tokenizer
import os
import re

class DummyClip:
    def __init__(self, *args, **kwargs):
        pass

class HunyuanImage3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, tokenizer_path="hunyuan_image_3", max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=..., has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data=..., tokenizer_args=...):
        super().__init__(tokenizer_path, max_length, pad_with_end, embedding_directory, embedding_size, embedding_key, tokenizer_class, has_start_token, has_end_token, pad_to_max_length, min_length, pad_token, end_token, min_padding, tokenizer_data, tokenizer_args)

class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer")


@@ -0,0 +1,122 @@
import torch
import comfy.model_management
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io

COMPUTED_RESO_GROUPS = ['512x2048', '512x1984', '512x1920', '512x1856', '512x1792', '512x1728', '512x1664', '512x1600', '512x1536', '576x1472', '640x1408', '704x1344', '768x1280', '832x1216', '896x1152', '960x1088', '1024x1024', '1088x960', '1152x896', '1216x832', '1280x768', '1344x704', '1408x640', '1472x576', '1536x512', '1600x512', '1664x512', '1728x512', '1792x512', '1856x512', '1920x512', '1984x512', '2048x512']
RATIOS = [torch.tensor(int(r.split("x")[0]) / int(r.split("x")[1])) for r in COMPUTED_RESO_GROUPS]

def get_target_size(height, width):
    ratio = height / width
    idx = torch.argmin(torch.abs(torch.tensor(RATIOS) - ratio))
    reso = COMPUTED_RESO_GROUPS[idx]
    return reso.split("x")
class EmptyLatentHunyuanImage3(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="EmptyLatentHunyuanImage3",
            display_name="EmptyLatentHunyuanImage3",
            category="image/latent",
            inputs=[
                io.Int.Input("height", min=1, default=512),
                io.Int.Input("width", min=1, default=512),
                io.Int.Input("batch_size", min=1, max=48_000, default=1),
                io.Clip.Input("clip"),
                io.Model.Input("model"),
            ],
            outputs=[io.Latent.Output(display_name="latent")]
        )

    @classmethod
    def execute(cls, height, width, batch_size, clip, model):
        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
        # may convert clip.tokenizer -> clip.
        word_embed = model.wte
        patch_embed = model.patch_embed
        t_embed = model.time_embed

        height, width = get_target_size(height, width)
        latent = torch.randn(batch_size, 32, int(height) // 16, int(width) // 16, device=comfy.model_management.intermediate_device())
        latent, tk_height, tk_width = patch_embed(latent, t_embed(torch.tensor([0]).repeat(batch_size)))

        def tk_fn(token):
            return torch.tensor([token], device=latent.device, dtype=latent.dtype).unsqueeze(1).expand(batch_size, 1, latent.size(-1))

        def fn(string, func=encode_fn):
            return word_embed(torch.tensor(func(string) if not isinstance(func, dict) else func[string], device=comfy.model_management.intermediate_device()))\
                .unsqueeze(0).expand(batch_size, -1, -1)

        latent = torch.cat([fn("<boi>"), fn("<img_size_1024>", func=special_fn), fn(f"<img_ratio_{int(height) // int(width)}>", special_fn), fn("<timestep>", special_fn), latent, fn("<eoi>")], dim=1)
        latent = torch.cat([latent, tk_fn(tk_height), tk_fn(tk_width)], dim=1)
        return io.NodeOutput({"samples": latent, "type": "hunyuan_image_3"})
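# (Annotation, not part of the commit.) The "latent" produced above is not a raw VAE
# latent but an already-embedded token sequence:
#   <boi>, <img_size_1024>, <img_ratio_N>, <timestep>, patch-embedded noise, <eoi>,
# followed by two trailing tokens that carry the token-grid height and width
# returned by patch_embed.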
class HunyuanImage3Conditioning(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="HunyuanImage3Conditioning",
            display_name="HunyuanImage3Conditioning",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("vae_encoding"),
                io.Conditioning.Input("vit_encoding"),
                io.Conditioning.Input("text_encoding_positive"),
                io.Clip.Input("clip"),
                io.Model.Input("model"),
                io.Conditioning.Input("text_encoding_negative", optional=True),
            ],
            outputs=[io.Conditioning.Output(display_name="positive"), io.Conditioning.Output(display_name="negative")]
        )

    @classmethod
    def execute(cls, vae_encoding, vit_encoding, text_encoding_positive, clip, model, text_encoding_negative=None):
        encode_fn = clip.tokenizer.tokenizer.convert_tokens_to_ids
        special_fn = clip.tokenizer.tokenizer.added_tokens_encoder
        word_embed = model.wte
        patch_embed = model.patch_embed
        t_embed = model.time_embed
        batch_size, _, hidden_size = vit_encoding.shape

        def fn(string, func=encode_fn):
            return word_embed(torch.tensor(func(string) if not isinstance(func, dict) else func[string], device=comfy.model_management.intermediate_device()))\
                .view(1, 1, hidden_size).expand(batch_size, -1, hidden_size)

        text_tokens = text_encoding_positive[0][0]
        vae_encoding, _, _ = patch_embed(vae_encoding, t_embed(torch.tensor([0]).repeat(vae_encoding.size(0))))
        # should dynamically change in model logic
        joint_image = torch.cat([fn("<boi>"), fn("<img_size_1024>", special_fn), fn("<img_ratio_3>", special_fn), fn("<timestep>", special_fn), vae_encoding, fn("<joint_img_sep>"), vit_encoding, fn("<eoi>")], dim=1)
        vae_mask = torch.ones(joint_image.size(1))
        vae_mask[:3] = torch.zeros(3)
        vae_mask[vae_encoding.size(1) + 4:] = torch.zeros(len(vae_mask[vae_encoding.size(1) + 4:]))
        ragged_tensors = torch.nested.nested_tensor([joint_image, vae_mask.unsqueeze(0).unsqueeze(-1), text_tokens.to(joint_image.dtype)])

        uncond_ragged_tensors = None
        if text_encoding_negative is not None:
            uncond_ragged_tensors, _ = cls.execute(vae_encoding, vit_encoding, text_encoding_negative, clip=clip, model=model, text_encoding_negative=None)
        else:
            uncond_ragged_tensors = torch.nested.nested_tensor([torch.zeros_like(t) for t in ragged_tensors.unbind()])

        if uncond_ragged_tensors is not None:
            positive = [[ragged_tensors, {}]]
            negative = [[uncond_ragged_tensors, {}]]
        else:
            positive = ragged_tensors
            negative = uncond_ragged_tensors
        return positive, negative
class Image3Extension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            HunyuanImage3Conditioning,
            EmptyLatentHunyuanImage3,
        ]

async def comfy_entrypoint() -> Image3Extension:
    return Image3Extension()


@@ -2326,6 +2326,7 @@ async def init_builtin_extra_nodes():
        "nodes_ace.py",
        "nodes_string.py",
        "nodes_camera_trajectory.py",
        "nodes_hunyuan_image.py",
        "nodes_edit_model.py",
        "nodes_tcfg.py",
        "nodes_context_windows.py",