From 7ecc3a39355db0c7e88be3272e343ec2830e8e75 Mon Sep 17 00:00:00 2001
From: bigjimmy <dajimmy@gmail.com>
Date: Sun, 1 Mar 2026 22:21:01 +0800
Subject: [PATCH 1/3] fix: Z-Image LoRA and model loading for HuggingFace
 format weights

Three fixes for Z-Image Turbo support:

1. model_detection.py: Add Z-Image to convert_diffusers_mmdit() so HF-format
   safetensors (using to_q/to_k/to_v + all_x_embedder.2-1 key names) are
   detected and converted.

2. sd.py: Apply the Z-Image key conversion when loading a ZImage model whose
   weights are in HF format (all_x_embedder.2-1 present), mapping separate
   to_q/to_k/to_v weights into the combined qkv format ComfyUI expects.

3. lora.py: Fix sliced LoRA patches being silently discarded. In
   calculate_weight(), when a LoRA targets a slice of a combined weight
   (e.g. to_q/to_k/to_v -> qkv), the computed output was never written back
   to the weight tensor. Fix by calling narrow().copy_() before restoring
   old_weight. This affected any LoRA using sliced key mappings.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 comfy/lora.py            | 4 +++-
 comfy/model_detection.py | 6 ++++++
 comfy/sd.py              | 6 ++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/comfy/lora.py b/comfy/lora.py
index f36ddb046..76f5eb0a9 100644
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -426,9 +426,11 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
             if output is None:
                 logging.warning("Calculate Weight Failed: {} {}".format(v.name, key))
             else:
-                weight = output
                 if old_weight is not None:
+                    old_weight.narrow(offset[0], offset[1], offset[2]).copy_(output)
                     weight = old_weight
+                else:
+                    weight = output
             continue
 
         if len(v) == 1:
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 3faa950ca..ebccc2b46 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -1064,6 +1064,12 @@ def convert_diffusers_mmdit(state_dict, output_prefix=""):
         num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
         depth = state_dict["pos_embed.proj.weight"].shape[0] // 64
         sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix)
+    elif 'all_x_embedder.2-1.weight' in state_dict: #Z-Image (HuggingFace format)
+        w = state_dict.get('cap_embedder.1.weight')
+        hidden_size = w.shape[0] if w is not None else 3840
+        n_layers = count_blocks(state_dict, 'layers.{}.')
+        n_refiner = count_blocks(state_dict, 'noise_refiner.{}.')
+        sd_map = comfy.utils.z_image_to_diffusers({"n_layers": n_layers, "n_refiner_layers": n_refiner, "dim": hidden_size}, output_prefix=output_prefix)
     else:
         return None
 
diff --git a/comfy/sd.py b/comfy/sd.py
index a9ad7c2d2..a2e8a3704 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1713,6 +1713,12 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
 
     if model_config is not None:
         new_sd = sd
+        # Z-Image from HuggingFace uses diffusers-style key names that need conversion
+        if model_config.unet_config.get('z_image_modulation', False) and 'all_x_embedder.2-1.weight' in new_sd:
+            sd_copy = dict(new_sd)
+            converted = model_detection.convert_diffusers_mmdit(sd_copy, "")
+            if converted is not None:
+                new_sd = {**sd_copy, **converted}
     else:
         new_sd = model_detection.convert_diffusers_mmdit(sd, "")
         if new_sd is not None: #diffusers mmdit

From f5bf7ed4d32295f8b93a080341aef1d9a712b818 Mon Sep 17 00:00:00 2001
From: bigjimmy <dajimmy@gmail.com>
Date: Sun, 1 Mar 2026 22:44:47 +0800
Subject: [PATCH 2/3] fix: pad reference latents to patch size in embed_all

When a reference image is passed via TextEncodeZImageOmni, its VAE-encoded
latent may have odd height or width (e.g. from auto_resize rounding to
multiples of 8 pixels). The embed_all() function in the Lumina model tries
to reshape the latent as view(B, C, H//2, 2, W//2, 2) for patch embedding,
which fails when H or W is not divisible by the patch size (2).

Add pad_to_patch_size() before the reshape, matching what the main forward
pass already does for the primary latent.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 comfy/ldm/lumina/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 77d1abc97..e4c9ad21c 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -687,6 +687,7 @@ class NextDiT(nn.Module):
                 embeds += (siglip_feats,)
                 freqs_cis += (self.rope_embedder(siglip_pos_ids).movedim(1, 2),)
 
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (pH, pW))
         B, C, H, W = x.shape
         x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
         x_pos_ids = pos_ids_x(cap_feats_len + 1, H // pH, W // pW, bsz, device, transformer_options=transformer_options)

From 98440de8d4d122339fe17d869822ada68c89e618 Mon Sep 17 00:00:00 2001
From: bigjimmy <dajimmy@gmail.com>
Date: Sun, 1 Mar 2026 22:57:10 +0800
Subject: [PATCH 3/3] add: script to merge Z-Image Turbo sharded safetensors
 for ComfyUI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Utility script that merges the HuggingFace sharded safetensors from
Tongyi-MAI/Z-Image-Turbo into single files ready for ComfyUI:
- transformer shards → models/diffusion_models/z_image_turbo.safetensors
- text encoder shards → models/text_encoders/qwen3_z_image.safetensors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 merge_zimage_models.py | 46 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 merge_zimage_models.py

diff --git a/merge_zimage_models.py b/merge_zimage_models.py
new file mode 100644
index 000000000..bd839230d
--- /dev/null
+++ b/merge_zimage_models.py
@@ -0,0 +1,46 @@
+"""Merge Z-Image Turbo sharded safetensors into single files for ComfyUI."""
+import json
+import os
+from pathlib import Path
+from safetensors.torch import load_file, save_file
+
+SNAP = Path.home() / ".cache/huggingface/hub/models--Tongyi-MAI--Z-Image-Turbo/snapshots/f332072aa78be7aecdf3ee76d5c247082da564a6"  # Hugging Face cache snapshot dir, pinned to this one revision hash
+OUT = Path.home() / "ComfyUI/models"  # output root; assumes ComfyUI lives at ~/ComfyUI
+
+def merge_shards(index_path, output_path, label):
+    """Merge the shards listed in the index JSON at index_path into one file at output_path."""
+    print(f"\n=== Merging {label} ===")
+    index_file, target = Path(index_path), Path(output_path)
+    # JSON is UTF-8 by spec; do not depend on the platform's locale encoding.
+    weight_map = json.loads(index_file.read_text(encoding="utf-8"))["weight_map"]
+
+    # Every shard's tensors stay in this dict until the single save below.
+    merged = {}
+    for shard in sorted(set(weight_map.values())):
+        print(f"  Loading {shard} ...")
+        tensors = load_file(str(index_file.parent / shard))
+        merged.update(tensors)
+        print(f"  -> {len(tensors)} tensors, total so far: {len(merged)}")
+
+    print(f"  Saving to {output_path} ...")
+    os.makedirs(target.parent, exist_ok=True)
+    save_file(merged, str(target))
+    size_gb = target.stat().st_size / 1e9  # decimal gigabytes
+    print(f"  Done! {size_gb:.1f} GB")
+
+if __name__ == "__main__":  # run the merges only when executed as a script, not on import
+    # 1. Merge transformer
+    merge_shards(
+        SNAP / "transformer/diffusion_pytorch_model.safetensors.index.json",
+        OUT / "diffusion_models/z_image_turbo.safetensors",
+        "Transformer",
+    )
+    # 2. Merge text encoder (Qwen3)
+    merge_shards(
+        SNAP / "text_encoder/model.safetensors.index.json",
+        OUT / "text_encoders/qwen3_z_image.safetensors",
+        "Text Encoder (Qwen3)",
+    )
+
+    print("\n=== All done! ===")
+    print("Models ready in ComfyUI/models/")