diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 9e432d5c0..959748011 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -688,6 +688,7 @@ class NextDiT(nn.Module):
             embeds += (siglip_feats,)
             freqs_cis += (self.rope_embedder(siglip_pos_ids).movedim(1, 2),)
 
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (pH, pW))
         B, C, H, W = x.shape
         x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
         x_pos_ids = pos_ids_x(cap_feats_len + 1, H // pH, W // pW, bsz, device, transformer_options=transformer_options)
diff --git a/comfy/lora.py b/comfy/lora.py
index 63ee85323..118a56ec8 100644
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -429,9 +429,11 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
             if output is None:
                 logging.warning("Calculate Weight Failed: {} {}".format(v.name, key))
             else:
-                weight = output
                 if old_weight is not None:
+                    old_weight.narrow(offset[0], offset[1], offset[2]).copy_(output)
                     weight = old_weight
+                else:
+                    weight = output
             continue
 
         if len(v) == 1:
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 35a6822e3..ab260046f 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -1094,6 +1094,12 @@ def convert_diffusers_mmdit(state_dict, output_prefix=""):
         num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
         depth = state_dict["pos_embed.proj.weight"].shape[0] // 64
         sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix)
+    elif 'all_x_embedder.2-1.weight' in state_dict: #Z-Image (HuggingFace format)
+        w = state_dict.get('cap_embedder.1.weight')
+        hidden_size = w.shape[0] if w is not None else 3840
+        n_layers = count_blocks(state_dict, 'layers.{}.')
+        n_refiner = count_blocks(state_dict, 'noise_refiner.{}.')
+        sd_map = comfy.utils.z_image_to_diffusers({"n_layers": n_layers, "n_refiner_layers": n_refiner, "dim": hidden_size}, output_prefix=output_prefix)
     else:
         return None
 
diff --git a/comfy/sd.py b/comfy/sd.py
index 4d427bb9a..2e1033d94 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1718,6 +1718,12 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
     model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata)
     if model_config is not None:
         new_sd = sd
+        # Z-Image from HuggingFace uses diffusers-style key names that need conversion
+        if model_config.unet_config.get('z_image_modulation', False) and 'all_x_embedder.2-1.weight' in new_sd:
+            sd_copy = dict(new_sd)
+            converted = model_detection.convert_diffusers_mmdit(sd_copy, "")
+            if converted is not None:
+                new_sd = {**sd_copy, **converted}
     else:
         new_sd = model_detection.convert_diffusers_mmdit(sd, "")
         if new_sd is not None: #diffusers mmdit
diff --git a/merge_zimage_models.py b/merge_zimage_models.py
new file mode 100644
index 000000000..bd839230d
--- /dev/null
+++ b/merge_zimage_models.py
@@ -0,0 +1,46 @@
+"""Merge Z-Image Turbo sharded safetensors into single files for ComfyUI."""
+import json
+import os
+from pathlib import Path
+from safetensors.torch import load_file, save_file
+
+SNAP = Path.home() / ".cache/huggingface/hub/models--Tongyi-MAI--Z-Image-Turbo/snapshots/f332072aa78be7aecdf3ee76d5c247082da564a6"
+OUT = Path.home() / "ComfyUI/models"
+
+def merge_shards(index_path, output_path, label):
+    print(f"\n=== Merging {label} ===")
+    with open(index_path) as f:
+        index = json.load(f)
+    weight_map = index["weight_map"]
+    shard_dir = Path(index_path).parent
+    shards = sorted(set(weight_map.values()))
+
+    merged = {}
+    for shard in shards:
+        print(f"  Loading {shard} ...")
+        tensors = load_file(str(shard_dir / shard))
+        merged.update(tensors)
+        print(f"    -> {len(tensors)} tensors, total so far: {len(merged)}")
+
+    print(f"  Saving to {output_path} ...")
+    os.makedirs(Path(output_path).parent, exist_ok=True)
+    save_file(merged, str(output_path))
+    size = Path(output_path).stat().st_size / 1e9
+    print(f"  Done! {size:.1f} GB")
+
+# 1. Merge transformer
+merge_shards(
+    SNAP / "transformer/diffusion_pytorch_model.safetensors.index.json",
+    OUT / "diffusion_models/z_image_turbo.safetensors",
+    "Transformer"
+)
+
+# 2. Merge text encoder (Qwen3)
+merge_shards(
+    SNAP / "text_encoder/model.safetensors.index.json",
+    OUT / "text_encoders/qwen3_z_image.safetensors",
+    "Text Encoder (Qwen3)"
+)
+
+print("\n=== All done! ===")
+print("Models ready in ComfyUI/models/")