From f5bf7ed4d32295f8b93a080341aef1d9a712b818 Mon Sep 17 00:00:00 2001
From: bigjimmy
Date: Sun, 1 Mar 2026 22:44:47 +0800
Subject: [PATCH] fix: pad reference latents to patch size in embed_all

When a reference image is passed via TextEncodeZImageOmni, its
VAE-encoded latent may have odd height or width (e.g. from auto_resize
rounding to multiples of 8 pixels). The embed_all() function in the
Lumina model tries to reshape the latent as
view(B, C, H//2, 2, W//2, 2) for patch embedding, which fails when H or
W is not divisible by the patch size (2).

Add pad_to_patch_size() before the reshape, matching what the main
forward pass already does for the primary latent.

Co-Authored-By: Claude Sonnet 4.6
---
 comfy/ldm/lumina/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 77d1abc97..e4c9ad21c 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -687,6 +687,7 @@ class NextDiT(nn.Module):
             embeds += (siglip_feats,)
             freqs_cis += (self.rope_embedder(siglip_pos_ids).movedim(1, 2),)
 
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (pH, pW))
         B, C, H, W = x.shape
         x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
         x_pos_ids = pos_ids_x(cap_feats_len + 1, H // pH, W // pW, bsz, device, transformer_options=transformer_options)