Merge 1f255e11d2 into 1a72bf2046

2026-01-31 00:30:21 +08:00 · 2026-01-19 12:49:11 +00:00
17 changed files with 74 additions and 708 deletions
--- a/comfy/ldm/lightricks/vae/audio_vae.py
+++ b/comfy/ldm/lightricks/vae/audio_vae.py
@@ -189,12 +189,9 @@ class AudioVAE(torch.nn.Module):
        waveform = self.device_manager.move_to_load_device(waveform)
        expected_channels = self.autoencoder.encoder.in_channels
        if waveform.shape[1] != expected_channels:
-            if waveform.shape[1] == 1:
-                waveform = waveform.expand(-1, expected_channels, *waveform.shape[2:])
-            else:
-                raise ValueError(
-                    f"Input audio must have {expected_channels} channels, got {waveform.shape[1]}"
-                )
+            raise ValueError(
+                f"Input audio must have {expected_channels} channels, got {waveform.shape[1]}"
+            )

        mel_spec = self.preprocessor.waveform_to_mel(
            waveform, waveform_sample_rate, device=self.device_manager.load_device
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -13,53 +13,10 @@ from comfy.ldm.modules.attention import optimized_attention_masked
 from comfy.ldm.flux.layers import EmbedND
 from comfy.ldm.flux.math import apply_rope
 import comfy.patcher_extension
-import comfy.utils


-def invert_slices(slices, length):
-    sorted_slices = sorted(slices)
-    result = []
-    current = 0
-
-    for start, end in sorted_slices:
-        if current < start:
-            result.append((current, start))
-        current = max(current, end)
-
-    if current < length:
-        result.append((current, length))
-
-    return result
-
-
-def modulate(x, scale, timestep_zero_index=None):
-    if timestep_zero_index is None:
-        return x * (1 + scale.unsqueeze(1))
-    else:
-        scale = (1 + scale.unsqueeze(1))
-        actual_batch = scale.size(0) // 2
-        slices = timestep_zero_index
-        invert = invert_slices(timestep_zero_index, x.shape[1])
-        for s in slices:
-            x[:, s[0]:s[1]] *= scale[actual_batch:]
-        for s in invert:
-            x[:, s[0]:s[1]] *= scale[:actual_batch]
-        return x
-
-
-def apply_gate(gate, x, timestep_zero_index=None):
-    if timestep_zero_index is None:
-        return gate * x
-    else:
-        actual_batch = gate.size(0) // 2
-
-        slices = timestep_zero_index
-        invert = invert_slices(timestep_zero_index, x.shape[1])
-        for s in slices:
-            x[:, s[0]:s[1]] *= gate[actual_batch:]
-        for s in invert:
-            x[:, s[0]:s[1]] *= gate[:actual_batch]
-        return x
+def modulate(x, scale):
+    return x * (1 + scale.unsqueeze(1))

 #############################################################################
 #                               Core NextDiT Model                              #
@@ -301,7 +258,6 @@ class JointTransformerBlock(nn.Module):
        x_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        adaln_input: Optional[torch.Tensor]=None,
-        timestep_zero_index=None,
        transformer_options={},
    ):
        """
@@ -320,18 +276,18 @@ class JointTransformerBlock(nn.Module):
            assert adaln_input is not None
            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)

-            x = x + apply_gate(gate_msa.unsqueeze(1).tanh(), self.attention_norm2(
+            x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
                clamp_fp16(self.attention(
-                    modulate(self.attention_norm1(x), scale_msa, timestep_zero_index=timestep_zero_index),
+                    modulate(self.attention_norm1(x), scale_msa),
                    x_mask,
                    freqs_cis,
                    transformer_options=transformer_options,
-                ))), timestep_zero_index=timestep_zero_index
+                ))
            )
-            x = x + apply_gate(gate_mlp.unsqueeze(1).tanh(), self.ffn_norm2(
+            x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
                clamp_fp16(self.feed_forward(
-                    modulate(self.ffn_norm1(x), scale_mlp, timestep_zero_index=timestep_zero_index),
-                ))), timestep_zero_index=timestep_zero_index
+                    modulate(self.ffn_norm1(x), scale_mlp),
+                ))
            )
        else:
            assert adaln_input is None
@@ -389,37 +345,13 @@ class FinalLayer(nn.Module):
            ),
        )

-    def forward(self, x, c, timestep_zero_index=None):
+    def forward(self, x, c):
        scale = self.adaLN_modulation(c)
-        x = modulate(self.norm_final(x), scale, timestep_zero_index=timestep_zero_index)
+        x = modulate(self.norm_final(x), scale)
        x = self.linear(x)
        return x


-def pad_zimage(feats, pad_token, pad_tokens_multiple):
-    pad_extra = (-feats.shape[1]) % pad_tokens_multiple
-    return torch.cat((feats, pad_token.to(device=feats.device, dtype=feats.dtype, copy=True).unsqueeze(0).repeat(feats.shape[0], pad_extra, 1)), dim=1), pad_extra
-
-
-def pos_ids_x(start_t, H_tokens, W_tokens, batch_size, device, transformer_options={}):
-    rope_options = transformer_options.get("rope_options", None)
-    h_scale = 1.0
-    w_scale = 1.0
-    h_start = 0
-    w_start = 0
-    if rope_options is not None:
-        h_scale = rope_options.get("scale_y", 1.0)
-        w_scale = rope_options.get("scale_x", 1.0)
-
-        h_start = rope_options.get("shift_y", 0.0)
-        w_start = rope_options.get("shift_x", 0.0)
-    x_pos_ids = torch.zeros((batch_size, H_tokens * W_tokens, 3), dtype=torch.float32, device=device)
-    x_pos_ids[:, :, 0] = start_t
-    x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
-    x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
-    return x_pos_ids
-
-
 class NextDiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.
@@ -446,7 +378,6 @@ class NextDiT(nn.Module):
        time_scale=1.0,
        pad_tokens_multiple=None,
        clip_text_dim=None,
-        siglip_feat_dim=None,
        image_model=None,
        device=None,
        dtype=None,
@@ -560,41 +491,6 @@ class NextDiT(nn.Module):
                for layer_id in range(n_layers)
            ]
        )
-
-        if siglip_feat_dim is not None:
-            self.siglip_embedder = nn.Sequential(
-                operation_settings.get("operations").RMSNorm(siglip_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
-                operation_settings.get("operations").Linear(
-                    siglip_feat_dim,
-                    dim,
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
-            self.siglip_refiner = nn.ModuleList(
-                [
-                    JointTransformerBlock(
-                        layer_id,
-                        dim,
-                        n_heads,
-                        n_kv_heads,
-                        multiple_of,
-                        ffn_dim_multiplier,
-                        norm_eps,
-                        qk_norm,
-                        modulation=False,
-                        operation_settings=operation_settings,
-                    )
-                    for layer_id in range(n_refiner_layers)
-                ]
-            )
-            self.siglip_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
-        else:
-            self.siglip_embedder = None
-            self.siglip_refiner = None
-            self.siglip_pad_token = None
-
        # This norm final is in the lumina 2.0 code but isn't actually used for anything.
        # self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings)
@@ -635,168 +531,70 @@ class NextDiT(nn.Module):
            imgs = torch.stack(imgs, dim=0)
        return imgs

-    def embed_cap(self, cap_feats=None, offset=0, bsz=1, device=None, dtype=None):
-        if cap_feats is not None:
-            cap_feats = self.cap_embedder(cap_feats)
-            cap_feats_len = cap_feats.shape[1]
-            if self.pad_tokens_multiple is not None:
-                cap_feats, _ = pad_zimage(cap_feats, self.cap_pad_token, self.pad_tokens_multiple)
-        else:
-            cap_feats_len = 0
-            cap_feats = self.cap_pad_token.to(device=device, dtype=dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1)
+    def patchify_and_embed(
+        self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={}
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
+        bsz = len(x)
+        pH = pW = self.patch_size
+        device = x[0].device
+        orig_x = x
+
+        if self.pad_tokens_multiple is not None:
+            pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
+            cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)

        cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device)
-        cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0 + offset
-        embeds = (cap_feats,)
-        freqs_cis = (self.rope_embedder(cap_pos_ids).movedim(1, 2),)
-        return embeds, freqs_cis, cap_feats_len
-
-    def embed_all(self, x, cap_feats=None, siglip_feats=None, offset=0, omni=False, transformer_options={}):
-        bsz = 1
-        pH = pW = self.patch_size
-        device = x.device
-        embeds, freqs_cis, cap_feats_len = self.embed_cap(cap_feats, offset=offset, bsz=bsz, device=device, dtype=x.dtype)
-
-        if (not omni) or self.siglip_embedder is None:
-            cap_feats_len = embeds[0].shape[1] + offset
-            embeds += (None,)
-            freqs_cis += (None,)
-        else:
-            cap_feats_len += offset
-            if siglip_feats is not None:
-                b, h, w, c = siglip_feats.shape
-                siglip_feats = siglip_feats.permute(0, 3, 1, 2).reshape(b, h * w, c)
-                siglip_feats = self.siglip_embedder(siglip_feats)
-                siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device)
-                siglip_pos_ids[:, :, 0] = cap_feats_len + 2
-                siglip_pos_ids[:, :, 1] = (torch.linspace(0, h * 8 - 1, steps=h, dtype=torch.float32, device=device).floor()).view(-1, 1).repeat(1, w).flatten()
-                siglip_pos_ids[:, :, 2] = (torch.linspace(0, w * 8 - 1, steps=w, dtype=torch.float32, device=device).floor()).view(1, -1).repeat(h, 1).flatten()
-                if self.siglip_pad_token is not None:
-                    siglip_feats, pad_extra = pad_zimage(siglip_feats, self.siglip_pad_token, self.pad_tokens_multiple)  # TODO: double check
-                    siglip_pos_ids = torch.nn.functional.pad(siglip_pos_ids, (0, 0, 0, pad_extra))
-            else:
-                if self.siglip_pad_token is not None:
-                    siglip_feats = self.siglip_pad_token.to(device=device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1)
-                    siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device)
-
-            if siglip_feats is None:
-                embeds += (None,)
-                freqs_cis += (None,)
-            else:
-                embeds += (siglip_feats,)
-                freqs_cis += (self.rope_embedder(siglip_pos_ids).movedim(1, 2),)
+        cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0

        B, C, H, W = x.shape
        x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
-        x_pos_ids = pos_ids_x(cap_feats_len + 1, H // pH, W // pW, bsz, device, transformer_options=transformer_options)
+
+        rope_options = transformer_options.get("rope_options", None)
+        h_scale = 1.0
+        w_scale = 1.0
+        h_start = 0
+        w_start = 0
+        if rope_options is not None:
+            h_scale = rope_options.get("scale_y", 1.0)
+            w_scale = rope_options.get("scale_x", 1.0)
+
+            h_start = rope_options.get("shift_y", 0.0)
+            w_start = rope_options.get("shift_x", 0.0)
+
+        H_tokens, W_tokens = H // pH, W // pW
+        x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device)
+        x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1
+        x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
+        x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
+
        if self.pad_tokens_multiple is not None:
-            x, pad_extra = pad_zimage(x, self.x_pad_token, self.pad_tokens_multiple)
+            pad_extra = (-x.shape[1]) % self.pad_tokens_multiple
+            x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
            x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra))

-        embeds += (x,)
-        freqs_cis += (self.rope_embedder(x_pos_ids).movedim(1, 2),)
-        return embeds, freqs_cis, cap_feats_len + len(freqs_cis) - 1
-
-
-    def patchify_and_embed(
-        self, x: torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, ref_latents=[], ref_contexts=[], siglip_feats=[], transformer_options={}
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
-        bsz = x.shape[0]
-        cap_mask = None  # TODO?
-        main_siglip = None
-        orig_x = x
-
-        embeds = ([], [], [])
-        freqs_cis = ([], [], [])
-        leftover_cap = []
-
-        start_t = 0
-        omni = len(ref_latents) > 0
-        if omni:
-            for i, ref in enumerate(ref_latents):
-                if i < len(ref_contexts):
-                    ref_con = ref_contexts[i]
-                else:
-                    ref_con = None
-                if i < len(siglip_feats):
-                    sig_feat = siglip_feats[i]
-                else:
-                    sig_feat = None
-
-                out = self.embed_all(ref, ref_con, sig_feat, offset=start_t, omni=omni, transformer_options=transformer_options)
-                for i, e in enumerate(out[0]):
-                    if e is not None:
-                        embeds[i].append(comfy.utils.repeat_to_batch_size(e, bsz))
-                        freqs_cis[i].append(out[1][i])
-                start_t = out[2]
-            leftover_cap = ref_contexts[len(ref_latents):]
-
-        H, W = x.shape[-2], x.shape[-1]
-        img_sizes = [(H, W)] * bsz
-        out = self.embed_all(x, cap_feats, main_siglip, offset=start_t, omni=omni, transformer_options=transformer_options)
-        img_len = out[0][-1].shape[1]
-        cap_len = out[0][0].shape[1]
-        for i, e in enumerate(out[0]):
-            if e is not None:
-                e = comfy.utils.repeat_to_batch_size(e, bsz)
-                embeds[i].append(e)
-                freqs_cis[i].append(out[1][i])
-        start_t = out[2]
-
-        for cap in leftover_cap:
-            out = self.embed_cap(cap, offset=start_t, bsz=bsz, device=x.device, dtype=x.dtype)
-            cap_len += out[0][0].shape[1]
-            embeds[0].append(comfy.utils.repeat_to_batch_size(out[0][0], bsz))
-            freqs_cis[0].append(out[1][0])
-            start_t += out[2]
+        freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)

        patches = transformer_options.get("patches", {})

        # refine context
-        cap_feats = torch.cat(embeds[0], dim=1)
-        cap_freqs_cis = torch.cat(freqs_cis[0], dim=1)
        for layer in self.context_refiner:
-            cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options)
-
-        feats = (cap_feats,)
-        fc = (cap_freqs_cis,)
-
-        if omni and len(embeds[1]) > 0:
-            siglip_mask = None
-            siglip_feats_combined = torch.cat(embeds[1], dim=1)
-            siglip_feats_freqs_cis = torch.cat(freqs_cis[1], dim=1)
-            if self.siglip_refiner is not None:
-                for layer in self.siglip_refiner:
-                    siglip_feats_combined = layer(siglip_feats_combined, siglip_mask, siglip_feats_freqs_cis, transformer_options=transformer_options)
-            feats += (siglip_feats_combined,)
-            fc += (siglip_feats_freqs_cis,)
+            cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)

        padded_img_mask = None
-        x = torch.cat(embeds[-1], dim=1)
-        fc_x = torch.cat(freqs_cis[-1], dim=1)
-        if omni:
-            timestep_zero_index = [(x.shape[1] - img_len, x.shape[1])]
-        else:
-            timestep_zero_index = None
-
        x_input = x
        for i, layer in enumerate(self.noise_refiner):
-            x = layer(x, padded_img_mask, fc_x, t, timestep_zero_index=timestep_zero_index, transformer_options=transformer_options)
+            x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
            if "noise_refiner" in patches:
                for p in patches["noise_refiner"]:
-                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": fc_x, "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
+                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
                    if "img" in out:
                        x = out["img"]

-        padded_full_embed = torch.cat(feats + (x,), dim=1)
-        if timestep_zero_index is not None:
-            ind = padded_full_embed.shape[1] - x.shape[1]
-            timestep_zero_index = [(ind + x.shape[1] - img_len, ind + x.shape[1])]
-            timestep_zero_index.append((feats[0].shape[1] - cap_len, feats[0].shape[1]))
-
+        padded_full_embed = torch.cat((cap_feats, x), dim=1)
        mask = None
-        l_effective_cap_len = [padded_full_embed.shape[1] - img_len] * bsz
-        return padded_full_embed, mask, img_sizes, l_effective_cap_len, torch.cat(fc + (fc_x,), dim=1), timestep_zero_index
+        img_sizes = [(H, W)] * bsz
+        l_effective_cap_len = [cap_feats.shape[1]] * bsz
+        return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis

    def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
@@ -806,11 +604,7 @@ class NextDiT(nn.Module):
        ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)

    # def forward(self, x, t, cap_feats, cap_mask):
-    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, ref_latents=[], ref_contexts=[], siglip_feats=[], transformer_options={}, **kwargs):
-        omni = len(ref_latents) > 0
-        if omni:
-            timesteps = torch.cat([timesteps * 0, timesteps], dim=0)
-
+    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs):
        t = 1.0 - timesteps
        cap_feats = context
        cap_mask = attention_mask
@@ -825,6 +619,8 @@ class NextDiT(nn.Module):
        t = self.t_embedder(t * self.time_scale, dtype=x.dtype)  # (N, D)
        adaln_input = t

+        cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
+
        if self.clip_text_pooled_proj is not None:
            pooled = kwargs.get("clip_text_pooled", None)
            if pooled is not None:
@@ -836,7 +632,7 @@ class NextDiT(nn.Module):

        patches = transformer_options.get("patches", {})
        x_is_tensor = isinstance(x, torch.Tensor)
-        img, mask, img_size, cap_size, freqs_cis, timestep_zero_index = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, ref_latents=ref_latents, ref_contexts=ref_contexts, siglip_feats=siglip_feats, transformer_options=transformer_options)
+        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options)
        freqs_cis = freqs_cis.to(img.device)

        transformer_options["total_blocks"] = len(self.layers)
@@ -844,7 +640,7 @@ class NextDiT(nn.Module):
        img_input = img
        for i, layer in enumerate(self.layers):
            transformer_options["block_index"] = i
-            img = layer(img, mask, freqs_cis, adaln_input, timestep_zero_index=timestep_zero_index, transformer_options=transformer_options)
+            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
            if "double_block" in patches:
                for p in patches["double_block"]:
                    out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
@@ -853,7 +649,8 @@ class NextDiT(nn.Module):
                    if "txt" in out:
                        img[:, :cap_size[0]] = out["txt"]

-        img = self.final_layer(img, adaln_input, timestep_zero_index=timestep_zero_index)
+        img = self.final_layer(img, adaln_input)
        img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
+
        return -img

--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1150,7 +1150,6 @@ class CosmosPredict2(BaseModel):
 class Lumina2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiT)
-        self.memory_usage_factor_conds = ("ref_latents",)

    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
@@ -1170,35 +1169,6 @@ class Lumina2(BaseModel):
        if clip_text_pooled is not None:
            out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)

-        clip_vision_outputs = kwargs.get("clip_vision_outputs", list(map(lambda a: a.get("clip_vision_output"), kwargs.get("unclip_conditioning", [{}]))))  # Z Image omni
-        if clip_vision_outputs is not None and len(clip_vision_outputs) > 0:
-            sigfeats = []
-            for clip_vision_output in clip_vision_outputs:
-                if clip_vision_output is not None:
-                    image_size = clip_vision_output.image_sizes[0]
-                    shape = clip_vision_output.last_hidden_state.shape
-                    sigfeats.append(clip_vision_output.last_hidden_state.reshape(shape[0], image_size[1] // 16, image_size[2] // 16, shape[-1]))
-            if len(sigfeats) > 0:
-                out['siglip_feats'] = comfy.conds.CONDList(sigfeats)
-
-        ref_latents = kwargs.get("reference_latents", None)
-        if ref_latents is not None:
-            latents = []
-            for lat in ref_latents:
-                latents.append(self.process_latent_in(lat))
-            out['ref_latents'] = comfy.conds.CONDList(latents)
-
-        ref_contexts = kwargs.get("reference_latents_text_embeds", None)
-        if ref_contexts is not None:
-            out['ref_contexts'] = comfy.conds.CONDList(ref_contexts)
-
-        return out
-
-    def extra_conds_shapes(self, **kwargs):
-        out = {}
-        ref_latents = kwargs.get("reference_latents", None)
-        if ref_latents is not None:
-            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
        return out

 class WAN21(BaseModel):
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -446,9 +446,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["time_scale"] = 1000.0
            if '{}cap_pad_token'.format(key_prefix) in state_dict_keys:
                dit_config["pad_tokens_multiple"] = 32
-            sig_weight = state_dict.get('{}siglip_embedder.0.weight'.format(key_prefix), None)
-            if sig_weight is not None:
-                dit_config["siglip_feat_dim"] = sig_weight.shape[0]

        return dit_config

--- a/comfy/text_encoders/ovis.py
+++ b/comfy/text_encoders/ovis.py
@@ -61,7 +61,6 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
            if dtype_llama is not None:
                dtype = dtype_llama
            if llama_quantization_metadata is not None:
-                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return OvisTEModel_
--- a/comfy/text_encoders/z_image.py
+++ b/comfy/text_encoders/z_image.py
@@ -40,7 +40,6 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
            if dtype_llama is not None:
                dtype = dtype_llama
            if llama_quantization_metadata is not None:
-                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return ZImageTEModel_
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -639,8 +639,6 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
                        "proj_out.bias": "linear2.bias",
                        "attn.norm_q.weight": "norm.query_norm.scale",
                        "attn.norm_k.weight": "norm.key_norm.scale",
-                        "attn.to_qkv_mlp_proj.weight": "linear1.weight", # Flux 2
-                        "attn.to_out.weight": "linear2.weight", # Flux 2
                    }

        for k in block_map:
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -1000,38 +1000,20 @@ class Autogrow(ComfyTypeI):
            names = [f"{prefix}{i}" for i in range(max)]
        # need to create a new input based on the contents of input
        template_input = None
-        template_required = True
-        for _input_type, dict_input in input.items():
-            # for now, get just the first value from dict_input; if not required, min can be ignored
-            if len(dict_input) == 0:
-                continue
+        for _, dict_input in input.items():
+            # for now, get just the first value from dict_input
            template_input = list(dict_input.values())[0]
-            template_required = _input_type == "required"
-            break
-        if template_input is None:
-            raise Exception("template_input could not be determined from required or optional; this should never happen.")
        new_dict = {}
-        new_dict_added_to = False
-        # first, add possible inputs into out_dict
        for i, name in enumerate(names):
            expected_id = finalize_prefix(curr_prefix, name)
-            # required
-            if i < min and template_required:
-                out_dict["required"][expected_id] = template_input
-                type_dict = new_dict.setdefault("required", {})
-            # optional
-            else:
-                out_dict["optional"][expected_id] = template_input
-                type_dict = new_dict.setdefault("optional", {})
            if expected_id in live_inputs:
-                # NOTE: prefix gets added in parse_class_inputs
+                # required
+                if i < min:
+                    type_dict = new_dict.setdefault("required", {})
+                # optional
+                else:
+                    type_dict = new_dict.setdefault("optional", {})
                type_dict[name] = template_input
-                new_dict_added_to = True
-        # account for the edge case that all inputs are optional and no values are received
-        if not new_dict_added_to:
-            finalized_prefix = finalize_prefix(curr_prefix)
-            out_dict["dynamic_paths"][finalized_prefix] = finalized_prefix
-            out_dict["dynamic_paths_default_value"][finalized_prefix] = DynamicPathsDefaultValue.EMPTY_DICT
        parse_class_inputs(out_dict, live_inputs, new_dict, curr_prefix)

 @comfytype(io_type="COMFY_DYNAMICCOMBO_V3")
@@ -1169,8 +1151,6 @@ class V3Data(TypedDict):
    'Dictionary where the keys are the hidden input ids and the values are the values of the hidden inputs.'
    dynamic_paths: dict[str, Any]
    'Dictionary where the keys are the input ids and the values dictate how to turn the inputs into a nested dictionary.'
-    dynamic_paths_default_value: dict[str, Any]
-    'Dictionary where the keys are the input ids and the values are a string from DynamicPathsDefaultValue for the inputs if value is None.'
    create_dynamic_tuple: bool
    'When True, the value of the dynamic input will be in the format (value, path_key).'

@@ -1524,7 +1504,6 @@ def get_finalized_class_inputs(d: dict[str, Any], live_inputs: dict[str, Any], i
        "required": {},
        "optional": {},
        "dynamic_paths": {},
-        "dynamic_paths_default_value": {},
    }
    d = d.copy()
    # ignore hidden for parsing
@@ -1534,12 +1513,8 @@ def get_finalized_class_inputs(d: dict[str, Any], live_inputs: dict[str, Any], i
        out_dict["hidden"] = hidden
    v3_data = {}
    dynamic_paths = out_dict.pop("dynamic_paths", None)
-    if dynamic_paths is not None and len(dynamic_paths) > 0:
+    if dynamic_paths is not None:
        v3_data["dynamic_paths"] = dynamic_paths
-    # this list is used for autogrow, in the case all inputs are optional and no values are passed
-    dynamic_paths_default_value = out_dict.pop("dynamic_paths_default_value", None)
-    if dynamic_paths_default_value is not None and len(dynamic_paths_default_value) > 0:
-        v3_data["dynamic_paths_default_value"] = dynamic_paths_default_value
    return out_dict, hidden, v3_data

 def parse_class_inputs(out_dict: dict[str, Any], live_inputs: dict[str, Any], curr_dict: dict[str, Any], curr_prefix: list[str] | None=None) -> None:
@@ -1576,16 +1551,11 @@ def add_to_dict_v1(i: Input, d: dict):
 def add_to_dict_v3(io: Input | Output, d: dict):
    d[io.id] = (io.get_io_type(), io.as_dict())

-class DynamicPathsDefaultValue:
-    EMPTY_DICT = "empty_dict"
-
 def build_nested_inputs(values: dict[str, Any], v3_data: V3Data):
    paths = v3_data.get("dynamic_paths", None)
-    default_value_dict = v3_data.get("dynamic_paths_default_value", {})
    if paths is None:
        return values
    values = values.copy()
-
    result = {}

    create_tuple = v3_data.get("create_dynamic_tuple", False)
@ -1599,11 +1569,6 @@ def build_nested_inputs(values: dict[str, Any], v3_data: V3Data):

            if is_last:
                value = values.pop(key, None)
-                if value is None:
-                    # see if a default value was provided for this key
-                    default_option = default_value_dict.get(key, None)
-                    if default_option == DynamicPathsDefaultValue.EMPTY_DICT:
-                        value = {}
                if create_tuple:
                    value = (value, key)
                current[p] = value
--- a/comfy_api_nodes/apis/bria.py
+++ b/comfy_api_nodes/apis/bria.py
@ -1,61 +0,0 @@
-from typing import TypedDict
-
-from pydantic import BaseModel, Field
-
-
-class InputModerationSettings(TypedDict):
-    prompt_content_moderation: bool
-    visual_input_moderation: bool
-    visual_output_moderation: bool
-
-
-class BriaEditImageRequest(BaseModel):
-    instruction: str | None = Field(...)
-    structured_instruction: str | None = Field(
-        ...,
-        description="Use this instead of instruction for precise, programmatic control.",
-    )
-    images: list[str] = Field(
-        ...,
-        description="Required. Publicly available URL or Base64-encoded. Must contain exactly one item.",
-    )
-    mask: str | None = Field(
-        None,
-        description="Mask image (black and white). Black areas will be preserved, white areas will be edited. "
-        "If omitted, the edit applies to the entire image. "
-        "The input image and the the input mask must be of the same size.",
-    )
-    negative_prompt: str | None = Field(None)
-    guidance_scale: float = Field(...)
-    model_version: str = Field(...)
-    steps_num: int = Field(...)
-    seed: int = Field(...)
-    ip_signal: bool = Field(
-        False,
-        description="If true, returns a warning for potential IP content in the instruction.",
-    )
-    prompt_content_moderation: bool = Field(
-        False, description="If true, returns 422 on instruction moderation failure."
-    )
-    visual_input_content_moderation: bool = Field(
-        False, description="If true, returns 422 on images or mask moderation failure."
-    )
-    visual_output_content_moderation: bool = Field(
-        False, description="If true, returns 422 on visual output moderation failure."
-    )
-
-
-class BriaStatusResponse(BaseModel):
-    request_id: str = Field(...)
-    status_url: str = Field(...)
-    warning: str | None = Field(None)
-
-
-class BriaResult(BaseModel):
-    structured_prompt: str = Field(...)
-    image_url: str = Field(...)
-
-
-class BriaResponse(BaseModel):
-    status: str = Field(...)
-    result: BriaResult | None = Field(None)
--- a/comfy_api_nodes/nodes_bria.py
+++ b/comfy_api_nodes/nodes_bria.py
@ -1,198 +0,0 @@
-from typing_extensions import override
-
-from comfy_api.latest import IO, ComfyExtension, Input
-from comfy_api_nodes.apis.bria import (
-    BriaEditImageRequest,
-    BriaResponse,
-    BriaStatusResponse,
-    InputModerationSettings,
-)
-from comfy_api_nodes.util import (
-    ApiEndpoint,
-    convert_mask_to_image,
-    download_url_to_image_tensor,
-    get_number_of_images,
-    poll_op,
-    sync_op,
-    upload_images_to_comfyapi,
-)
-
-
-class BriaImageEditNode(IO.ComfyNode):
-
-    @classmethod
-    def define_schema(cls):
-        return IO.Schema(
-            node_id="BriaImageEditNode",
-            display_name="Bria Image Edit",
-            category="api node/image/Bria",
-            description="Edit images using Bria latest model",
-            inputs=[
-                IO.Combo.Input("model", options=["FIBO"]),
-                IO.Image.Input("image"),
-                IO.String.Input(
-                    "prompt",
-                    multiline=True,
-                    default="",
-                    tooltip="Instruction to edit image",
-                ),
-                IO.String.Input("negative_prompt", multiline=True, default=""),
-                IO.String.Input(
-                    "structured_prompt",
-                    multiline=True,
-                    default="",
-                    tooltip="A string containing the structured edit prompt in JSON format. "
-                    "Use this instead of usual prompt for precise, programmatic control.",
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=1,
-                    min=1,
-                    max=2147483647,
-                    step=1,
-                    display_mode=IO.NumberDisplay.number,
-                    control_after_generate=True,
-                ),
-                IO.Float.Input(
-                    "guidance_scale",
-                    default=3,
-                    min=3,
-                    max=5,
-                    step=0.01,
-                    display_mode=IO.NumberDisplay.number,
-                    tooltip="Higher value makes the image follow the prompt more closely.",
-                ),
-                IO.Int.Input(
-                    "steps",
-                    default=50,
-                    min=20,
-                    max=50,
-                    step=1,
-                    display_mode=IO.NumberDisplay.number,
-                ),
-                IO.DynamicCombo.Input(
-                    "moderation",
-                    options=[
-                        IO.DynamicCombo.Option(
-                            "true",
-                            [
-                                IO.Boolean.Input(
-                                    "prompt_content_moderation", default=False
-                                ),
-                                IO.Boolean.Input(
-                                    "visual_input_moderation", default=False
-                                ),
-                                IO.Boolean.Input(
-                                    "visual_output_moderation", default=True
-                                ),
-                            ],
-                        ),
-                        IO.DynamicCombo.Option("false", []),
-                    ],
-                    tooltip="Moderation settings",
-                ),
-                IO.Mask.Input(
-                    "mask",
-                    tooltip="If omitted, the edit applies to the entire image.",
-                    optional=True,
-                ),
-            ],
-            outputs=[
-                IO.Image.Output(),
-                IO.String.Output(display_name="structured_prompt"),
-            ],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            price_badge=IO.PriceBadge(
-                expr="""{"type":"usd","usd":0.04}""",
-            ),
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        model: str,
-        image: Input.Image,
-        prompt: str,
-        negative_prompt: str,
-        structured_prompt: str,
-        seed: int,
-        guidance_scale: float,
-        steps: int,
-        moderation: InputModerationSettings,
-        mask: Input.Image | None = None,
-    ) -> IO.NodeOutput:
-        if not prompt and not structured_prompt:
-            raise ValueError(
-                "One of prompt or structured_prompt is required to be non-empty."
-            )
-        if get_number_of_images(image) != 1:
-            raise ValueError("Exactly one input image is required.")
-        mask_url = None
-        if mask is not None:
-            mask_url = (
-                await upload_images_to_comfyapi(
-                    cls,
-                    convert_mask_to_image(mask),
-                    max_images=1,
-                    mime_type="image/png",
-                    wait_label="Uploading mask",
-                )
-            )[0]
-        response = await sync_op(
-            cls,
-            ApiEndpoint(path="proxy/bria/v2/image/edit", method="POST"),
-            data=BriaEditImageRequest(
-                instruction=prompt if prompt else None,
-                structured_instruction=structured_prompt if structured_prompt else None,
-                images=await upload_images_to_comfyapi(
-                    cls,
-                    image,
-                    max_images=1,
-                    mime_type="image/png",
-                    wait_label="Uploading image",
-                ),
-                mask=mask_url,
-                negative_prompt=negative_prompt if negative_prompt else None,
-                guidance_scale=guidance_scale,
-                seed=seed,
-                model_version=model,
-                steps_num=steps,
-                prompt_content_moderation=moderation.get(
-                    "prompt_content_moderation", False
-                ),
-                visual_input_content_moderation=moderation.get(
-                    "visual_input_moderation", False
-                ),
-                visual_output_content_moderation=moderation.get(
-                    "visual_output_moderation", False
-                ),
-            ),
-            response_model=BriaStatusResponse,
-        )
-        response = await poll_op(
-            cls,
-            ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
-            status_extractor=lambda r: r.status,
-            response_model=BriaResponse,
-        )
-        return IO.NodeOutput(
-            await download_url_to_image_tensor(response.result.image_url),
-            response.result.structured_prompt,
-        )
-
-
-class BriaExtension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
-        return [
-            BriaImageEditNode,
-        ]
-
-
-async def comfy_entrypoint() -> BriaExtension:
-    return BriaExtension()
--- a/comfy_api_nodes/util/init.py
+++ b/comfy_api_nodes/util/init.py
@ -11,7 +11,6 @@ from .conversions import (
    audio_input_to_mp3,
    audio_to_base64_string,
    bytesio_to_image_tensor,
-    convert_mask_to_image,
    downscale_image_tensor,
    image_tensor_pair_to_batch,
    pil_to_bytesio,
@ -73,7 +72,6 @@ __all__ = [
    "audio_input_to_mp3",
    "audio_to_base64_string",
    "bytesio_to_image_tensor",
-    "convert_mask_to_image",
    "downscale_image_tensor",
    "image_tensor_pair_to_batch",
    "pil_to_bytesio",
--- a/comfy_api_nodes/util/conversions.py
+++ b/comfy_api_nodes/util/conversions.py
@ -451,12 +451,6 @@ def resize_mask_to_image(
    return mask


-def convert_mask_to_image(mask: Input.Image) -> torch.Tensor:
-    """Make mask have the expected amount of dims (4) and channels (3) to be recognized as an image."""
-    mask = mask.unsqueeze(-1)
-    return torch.cat([mask] * 3, dim=-1)
-
-
 def text_filepath_to_base64_string(filepath: str) -> str:
    """Converts a text file to a base64 string."""
    with open(filepath, "rb") as f:
--- a/comfy_extras/nodes_zimage.py
+++ b/comfy_extras/nodes_zimage.py
@ -1,88 +0,0 @@
-import node_helpers
-from typing_extensions import override
-from comfy_api.latest import ComfyExtension, io
-import math
-import comfy.utils
-
-
-class TextEncodeZImageOmni(io.ComfyNode):
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="TextEncodeZImageOmni",
-            category="advanced/conditioning",
-            is_experimental=True,
-            inputs=[
-                io.Clip.Input("clip"),
-                io.ClipVision.Input("image_encoder", optional=True),
-                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
-                io.Boolean.Input("auto_resize_images", default=True),
-                io.Vae.Input("vae", optional=True),
-                io.Image.Input("image1", optional=True),
-                io.Image.Input("image2", optional=True),
-                io.Image.Input("image3", optional=True),
-            ],
-            outputs=[
-                io.Conditioning.Output(),
-            ],
-        )
-
-    @classmethod
-    def execute(cls, clip, prompt, image_encoder=None, auto_resize_images=True, vae=None, image1=None, image2=None, image3=None) -> io.NodeOutput:
-        ref_latents = []
-        images = list(filter(lambda a: a is not None, [image1, image2, image3]))
-
-        prompt_list = []
-        template = None
-        if len(images) > 0:
-            prompt_list = ["<|im_start|>user\n<|vision_start|>"]
-            prompt_list += ["<|vision_end|><|vision_start|>"] * (len(images) - 1)
-            prompt_list += ["<|vision_end|><|im_end|>"]
-            template = "<|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n<|vision_start|>"
-
-        encoded_images = []
-
-        for i, image in enumerate(images):
-            if image_encoder is not None:
-                encoded_images.append(image_encoder.encode_image(image))
-
-            if vae is not None:
-                if auto_resize_images:
-                    samples = image.movedim(-1, 1)
-                    total = int(1024 * 1024)
-                    scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
-                    width = round(samples.shape[3] * scale_by / 8.0) * 8
-                    height = round(samples.shape[2] * scale_by / 8.0) * 8
-
-                    image = comfy.utils.common_upscale(samples, width, height, "area", "disabled").movedim(1, -1)
-                ref_latents.append(vae.encode(image))
-
-        tokens = clip.tokenize(prompt, llama_template=template)
-        conditioning = clip.encode_from_tokens_scheduled(tokens)
-
-        extra_text_embeds = []
-        for p in prompt_list:
-            tokens = clip.tokenize(p, llama_template="{}")
-            text_embeds = clip.encode_from_tokens_scheduled(tokens)
-            extra_text_embeds.append(text_embeds[0][0])
-
-        if len(ref_latents) > 0:
-            conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True)
-        if len(encoded_images) > 0:
-            conditioning = node_helpers.conditioning_set_values(conditioning, {"clip_vision_outputs": encoded_images}, append=True)
-        if len(extra_text_embeds) > 0:
-            conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents_text_embeds": extra_text_embeds}, append=True)
-
-        return io.NodeOutput(conditioning)
-
-
-class ZImageExtension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[io.ComfyNode]]:
-        return [
-            TextEncodeZImageOmni,
-        ]
-
-
-async def comfy_entrypoint() -> ZImageExtension:
-    return ZImageExtension()
--- a/comfyui_version.py
+++ b/comfyui_version.py
@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.10.0"
+__version__ = "0.9.2"
--- a/nodes.py
+++ b/nodes.py
@ -2373,7 +2373,6 @@ async def init_builtin_extra_nodes():
        "nodes_kandinsky5.py",
        "nodes_wanmove.py",
        "nodes_image_compare.py",
-        "nodes_zimage.py",
    ]

    import_failed = []
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.10.0"
+version = "0.9.2"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
-comfyui-frontend-package==1.37.11
-comfyui-workflow-templates==0.8.15
+comfyui-frontend-package==1.36.14
+comfyui-workflow-templates==0.8.11
 comfyui-embedded-docs==0.4.0
 torch
 torchsde