Merge branch 'master' into feat/point-cloud-gaussian-splat-nodes-v2

2026-07-25 17:48:01 +08:00 · 2026-06-04 19:55:21 -07:00
parent 30828a7466 ab0d8a9203
commit 7e8e23f910
28 changed files with 6176 additions and 11777 deletions
@@ -4,7 +4,7 @@ from torch import Tensor

 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management
-import logging
+import comfy.quant_ops


 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
@@ -44,21 +44,15 @@ def _apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
    return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)


-try:
-    import comfy.quant_ops
-    q_apply_rope = comfy.quant_ops.ck.apply_rope
-    q_apply_rope1 = comfy.quant_ops.ck.apply_rope1
-    def apply_rope(xq, xk, freqs_cis):
-        if comfy.model_management.in_training:
-            return _apply_rope(xq, xk, freqs_cis)
-        else:
-            return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
-    def apply_rope1(x, freqs_cis):
-        if comfy.model_management.in_training:
-            return _apply_rope1(x, freqs_cis)
-        else:
-            return q_apply_rope1(x, freqs_cis)
-except:
-    logging.warning("No comfy kitchen, using old apply_rope functions.")
-    apply_rope = _apply_rope
-    apply_rope1 = _apply_rope1
+def apply_rope(xq, xk, freqs_cis):
+    if comfy.model_management.in_training:
+        return _apply_rope(xq, xk, freqs_cis)
+    else:
+        return comfy.quant_ops.ck.apply_rope(xq, xk, freqs_cis)
+
+
+def apply_rope1(x, freqs_cis):
+    if comfy.model_management.in_training:
+        return _apply_rope1(x, freqs_cis)
+    else:
+        return comfy.quant_ops.ck.apply_rope1(x, freqs_cis)
@@ -0,0 +1,297 @@
+"""
+The Ideogram 4 transformer is a NextDiT/Lumina2-family single-stream model
+consumes Qwen3-VL hidden-state features (concatenated from 13 layers -> 53248 dims)
+packs ``[text tokens, image tokens]`` into one sequence with block-diagonal segment attention and 3D interleaved MRoPE.
+"""
+
+from __future__ import annotations
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.patcher_extension
+from comfy.ldm.lumina.model import FeedForward
+from comfy.ldm.modules.attention import optimized_attention_masked
+from comfy.text_encoders.llama import apply_rope, precompute_freqs_cis
+
+# Per-token role indicators
+SEQUENCE_PADDING_INDICATOR = -1
+OUTPUT_IMAGE_INDICATOR = 2
+LLM_TOKEN_INDICATOR = 3
+# Image grid coordinates are offset so they never collide with text positions
+IMAGE_POSITION_OFFSET = 65536
+
+
+class Ideogram4Attention(nn.Module):
+    def __init__(self, hidden_size, num_heads, eps=1e-5, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.hidden_size = hidden_size
+
+        self.qkv = operations.Linear(hidden_size, hidden_size * 3, bias=False, dtype=dtype, device=device)
+        self.norm_q = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, dtype=dtype, device=device)
+        self.norm_k = operations.RMSNorm(self.head_dim, eps=eps, elementwise_affine=True, dtype=dtype, device=device)
+        self.o = operations.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device)
+
+    def forward(self, x, attn_mask, freqs_cis, transformer_options={}):
+        batch_size, seq_len, _ = x.shape
+        qkv = self.qkv(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
+        q, k, v = qkv.unbind(dim=2)
+
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+
+        # (B, heads, L, head_dim)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        q, k = apply_rope(q, k, freqs_cis)
+
+        out = optimized_attention_masked(q, k, v, self.num_heads, attn_mask, skip_reshape=True, transformer_options=transformer_options)
+        return self.o(out)
+
+
+class Ideogram4TransformerBlock(nn.Module):
+    def __init__(self, hidden_size, intermediate_size, num_heads, norm_eps, adaln_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.attention = Ideogram4Attention(hidden_size, num_heads, eps=1e-5, dtype=dtype, device=device, operations=operations)
+        self.feed_forward = FeedForward(
+            dim=hidden_size, hidden_dim=intermediate_size, multiple_of=1, ffn_dim_multiplier=None,
+            operation_settings={"operations": operations, "dtype": dtype, "device": device},
+        )
+
+        self.attention_norm1 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
+        self.ffn_norm1 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
+        self.attention_norm2 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
+        self.ffn_norm2 = operations.RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=True, dtype=dtype, device=device)
+
+        self.adaln_modulation = operations.Linear(adaln_dim, 4 * hidden_size, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x, attn_mask, freqs_cis, adaln_input, transformer_options={}):
+        mod = self.adaln_modulation(adaln_input)
+        scale_msa, gate_msa, scale_mlp, gate_mlp = mod.chunk(4, dim=-1)
+        gate_msa = torch.tanh(gate_msa)
+        gate_mlp = torch.tanh(gate_mlp)
+        scale_msa = 1.0 + scale_msa
+        scale_mlp = 1.0 + scale_mlp
+
+        attn_out = self.attention(self.attention_norm1(x) * scale_msa, attn_mask, freqs_cis, transformer_options=transformer_options)
+        x = x + gate_msa * self.attention_norm2(attn_out)
+        x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp))
+        return x
+
+
+def _sinusoidal_embedding(t, dim, scale=1e4):
+    t = t.to(torch.float32)
+    half = dim // 2
+    freq = math.log(scale) / (half - 1)
+    freq = torch.exp(torch.arange(half, dtype=torch.float32, device=t.device) * -freq)
+    emb = t.unsqueeze(-1) * freq
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+    if dim % 2 == 1:
+        emb = F.pad(emb, (0, 1))
+    return emb
+
+
+class Ideogram4EmbedScalar(nn.Module):
+    def __init__(self, dim, input_range=(0.0, 1.0), dtype=None, device=None, operations=None):
+        super().__init__()
+        self.dim = dim
+        self.range_min, self.range_max = input_range
+        self.mlp_in = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
+        self.mlp_out = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x):
+        x = x.to(torch.float32)
+        scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min)
+        emb = _sinusoidal_embedding(scaled, self.dim)
+        emb = emb.to(self.mlp_in.weight.dtype)
+        emb = F.silu(self.mlp_in(emb))
+        return self.mlp_out(emb)
+
+
+class Ideogram4FinalLayer(nn.Module):
+    def __init__(self, hidden_size, out_channels, adaln_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
+        self.adaln_modulation = operations.Linear(adaln_dim, hidden_size, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x, c):
+        scale = 1.0 + self.adaln_modulation(F.silu(c))
+        return self.linear(self.norm_final(x) * scale)
+
+
+class Ideogram4Transformer(nn.Module):
+    """A single Ideogram 4 backbone operating on a packed token sequence."""
+
+    def __init__(self, emb_dim, num_layers, num_heads, intermediate_size, adaln_dim,
+                 in_channels, llm_features_dim, rope_theta, mrope_section, norm_eps,
+                 dtype=None, device=None, operations=None):
+        super().__init__()
+        self.head_dim = emb_dim // num_heads
+        self.rope_theta = rope_theta
+        self.mrope_section = tuple(mrope_section)
+
+        self.input_proj = operations.Linear(in_channels, emb_dim, bias=True, dtype=dtype, device=device)
+        self.llm_cond_norm = operations.RMSNorm(llm_features_dim, eps=1e-6, elementwise_affine=True, dtype=dtype, device=device)
+        self.llm_cond_proj = operations.Linear(llm_features_dim, emb_dim, bias=True, dtype=dtype, device=device)
+        self.t_embedding = Ideogram4EmbedScalar(emb_dim, input_range=(0.0, 1.0), dtype=dtype, device=device, operations=operations)
+        self.adaln_proj = operations.Linear(emb_dim, adaln_dim, bias=True, dtype=dtype, device=device)
+
+        self.embed_image_indicator = operations.Embedding(2, emb_dim, dtype=dtype, device=device)
+
+        self.layers = nn.ModuleList([
+            Ideogram4TransformerBlock(emb_dim, intermediate_size, num_heads, norm_eps, adaln_dim,
+                                      dtype=dtype, device=device, operations=operations)
+            for _ in range(num_layers)
+        ])
+
+        self.final_layer = Ideogram4FinalLayer(emb_dim, in_channels, adaln_dim, dtype=dtype, device=device, operations=operations)
+
+    def _backbone(self, llm_features, x, t, position_ids, attn_mask, indicator, transformer_options={}):
+        indicator = indicator.to(torch.long)
+        output_image_mask = (indicator == OUTPUT_IMAGE_INDICATOR).to(x.dtype).unsqueeze(-1)
+
+        x = x * output_image_mask
+        h = self.input_proj(x) * output_image_mask
+
+        t_cond = self.t_embedding(t)
+        if t.dim() == 1:
+            t_cond = t_cond.unsqueeze(1)
+        adaln_input = F.silu(self.adaln_proj(t_cond))
+
+        # h is zero on the text rows (content lives only on image rows), add writes the text features in place
+        if llm_features is not None:
+            L_text = llm_features.shape[1]
+            text_mask = (indicator[:, :L_text] == LLM_TOKEN_INDICATOR).to(x.dtype).unsqueeze(-1)
+            llm = self.llm_cond_norm(llm_features * text_mask)
+            llm = self.llm_cond_proj(llm) * text_mask
+            h[:, :L_text] = h[:, :L_text] + llm
+
+        h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long), out_dtype=h.dtype)
+
+        # Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch).
+        freqs_cis = precompute_freqs_cis(
+            self.head_dim, position_ids[0].transpose(0, 1), self.rope_theta,
+            rope_dims=self.mrope_section, interleaved_mrope=True, device=position_ids.device,
+        )
+
+        if attn_mask is not None and attn_mask.dtype == torch.bool:
+            attn_mask = torch.zeros_like(attn_mask, dtype=h.dtype).masked_fill_(~attn_mask, -torch.finfo(h.dtype).max)
+
+        for layer in self.layers:
+            h = layer(h, attn_mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+
+        return self.final_layer(h, adaln_input)
+
+
+class Ideogram4Transformer2DModel(Ideogram4Transformer):
+    """Ideogram 4 single-stream DiT.
+
+    Runs a packed ``[text, image]`` sequence when text context is supplied, or an image-only sequence when ``context is None``.
+    """
+
+    def __init__(self, image_model=None, in_channels=128, num_layers=34, num_attention_heads=18, attention_head_dim=256, intermediate_size=12288,
+                 adaln_dim=512, llm_features_dim=53248, rope_theta=5000000, mrope_section=(24, 20, 20), norm_eps=1e-5,
+                 dtype=None, device=None, operations=None, **kwargs):
+        emb_dim = num_attention_heads * attention_head_dim
+        super().__init__(
+            emb_dim=emb_dim, num_layers=num_layers, num_heads=num_attention_heads,
+            intermediate_size=intermediate_size, adaln_dim=adaln_dim, in_channels=in_channels,
+            llm_features_dim=llm_features_dim, rope_theta=rope_theta, mrope_section=mrope_section,
+            norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+        self.dtype = dtype
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        # 128-dim token = patch (2x2) * ae_channels (32).
+        self.patch_size = 2
+        self.ae_channels = in_channels // (self.patch_size * self.patch_size)
+
+    def _img_to_tokens(self, x):
+        B, C, gh, gw = x.shape
+        x = x.view(B, self.ae_channels, self.patch_size, self.patch_size, gh, gw)
+        x = x.permute(0, 4, 5, 2, 3, 1)  # (B, gh, gw, pi, pj, c)
+        return x.reshape(B, gh * gw, C)
+
+    def _tokens_to_img(self, tokens, gh, gw):
+        B = tokens.shape[0]
+        C = tokens.shape[-1]
+        x = tokens.reshape(B, gh, gw, self.patch_size, self.patch_size, self.ae_channels)
+        x = x.permute(0, 5, 3, 4, 1, 2)  # (B, c, pi, pj, gh, gw)
+        return x.reshape(B, C, gh, gw)
+
+    def _image_position_ids(self, gh, gw, device):
+        h_idx = torch.arange(gh, device=device).view(-1, 1).expand(gh, gw).reshape(-1)
+        w_idx = torch.arange(gw, device=device).view(1, -1).expand(gh, gw).reshape(-1)
+        t_idx = torch.zeros_like(h_idx)
+        return torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET  # (L_img, 3)
+
+    def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options):
+        B = x_chunk.shape[0]
+        device = x_chunk.device
+        img_tokens = self._img_to_tokens(x_chunk)
+        L_img = img_tokens.shape[1]
+        L_text = context_chunk.shape[1]
+        L = L_text + L_img
+        latent_dim = img_tokens.shape[-1]
+
+        x_full = torch.zeros(B, L, latent_dim, dtype=img_tokens.dtype, device=device)
+        x_full[:, L_text:] = img_tokens
+
+        text_pos = torch.arange(L_text, device=device).view(-1, 1).expand(L_text, 3)
+        img_pos = self._image_position_ids(gh, gw, device)
+        position_ids = torch.cat([text_pos, img_pos], dim=0).unsqueeze(0).expand(B, L, 3)
+
+        indicator = torch.empty(B, L, dtype=torch.long, device=device)
+        indicator[:, :L_text] = LLM_TOKEN_INDICATOR
+        indicator[:, L_text:] = OUTPUT_IMAGE_INDICATOR
+
+        attn_mask = None
+        if attn_mask_chunk is not None:
+            segment_ids = torch.ones(B, L, dtype=torch.long, device=device)
+            pad = (attn_mask_chunk == 0)
+            segment_ids[:, :L_text][pad] = SEQUENCE_PADDING_INDICATOR
+            indicator[:, :L_text][pad] = 0
+            # Block-diagonal mask from segment ids: (B, 1, L, L), True = attend.
+            attn_mask = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).unsqueeze(1)
+
+        out = self._backbone(context_chunk, x_full, t_chunk, position_ids, attn_mask, indicator,
+                             transformer_options=transformer_options)
+        return self._tokens_to_img(out[:, L_text:], gh, gw)
+
+    def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options):
+        B = x_chunk.shape[0]
+        device = x_chunk.device
+        img_tokens = self._img_to_tokens(x_chunk)
+        L_img = img_tokens.shape[1]
+
+        position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3)
+        indicator = torch.full((B, L_img), OUTPUT_IMAGE_INDICATOR, dtype=torch.long, device=device)
+
+        # Image-only sequence is a single segment -> no mask, full attention, no LLM context.
+        out = self._backbone(None, img_tokens, t_chunk, position_ids, None, indicator, transformer_options=transformer_options)
+        return self._tokens_to_img(out, gh, gw)
+
+    def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward,
+            self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
+        ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs)
+
+    def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        bs, c, gh, gw = x.shape
+
+        timesteps = 1.0 - timesteps
+
+        # unconditional pass
+        if context is None:
+            return -self._run_image_only(x, timesteps, gh, gw, transformer_options)
+
+        return -self._run_conditional(x, context, attention_mask, timesteps, gh, gw, transformer_options)
@@ -55,6 +55,7 @@ import comfy.ldm.pixeldit.pid
 import comfy.ldm.ace.model
 import comfy.ldm.omnigen.omnigen2
 import comfy.ldm.qwen_image.model
+import comfy.ldm.ideogram4.model
 import comfy.ldm.kandinsky5.model
 import comfy.ldm.anima.model
 import comfy.ldm.ace.ace_step15
@@ -2018,6 +2019,21 @@ class QwenImage(BaseModel):
            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
        return out

+class Ideogram4(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ideogram4.model.Ideogram4Transformer2DModel)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            if torch.numel(attention_mask) != attention_mask.sum():
+                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+        return out
+
 class HunyuanImage21(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo)
@@ -815,6 +815,13 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["default_ref_method"] = "negative_index"
        return dit_config

+    if '{}embed_image_indicator.weight'.format(key_prefix) in state_dict_keys:  # Ideogram 4
+        dit_config = {}
+        dit_config["image_model"] = "ideogram4"
+        dit_config["in_channels"] = state_dict['{}input_proj.weight'.format(key_prefix)].shape[1]
+        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.')
+        return dit_config
+
    if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5
        dit_config = {}
        model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
@@ -54,6 +54,8 @@ class MultiGPUThreadPool:
            try:
                result = fn(*args, **kwargs)
                result_q.put((result, None))
+            except comfy.model_management.InterruptProcessingException as e:
+                result_q.put((None, e))
            except Exception as e:
                result_q.put((None, e))

@@ -58,6 +58,7 @@ import comfy.text_encoders.omnigen2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.z_image
+import comfy.text_encoders.ideogram4
 import comfy.text_encoders.ovis
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.jina_clip_2
@@ -1298,6 +1299,7 @@ class CLIPType(Enum):
    COGVIDEOX = 27
    LENS = 28
    PIXELDIT = 29
+    IDEOGRAM4 = 30



@@ -1596,8 +1598,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer
        elif te_model == TEModel.QWEN3_8B:
-            clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_8b")
-            clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B
+            if clip_type == CLIPType.IDEOGRAM4:
+                clip_target.clip = comfy.text_encoders.ideogram4.te(**llama_detect(clip_data))
+                clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Tokenizer
+            else:
+                clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_8b")
+                clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B
        elif te_model == TEModel.JINA_CLIP_2:
            clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
            clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
@@ -24,6 +24,7 @@ import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image
+import comfy.text_encoders.ideogram4
 import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
@@ -1746,6 +1747,44 @@ class Omnigen2(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))

+class Ideogram4(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "ideogram4",
+    }
+
+    sampling_settings = {
+        "multiplier": 1.0,
+        "shift": 1.0,
+    }
+
+    memory_usage_factor = 11.6
+
+    unet_extra_config = {
+        "num_attention_heads": 18,
+        "attention_head_dim": 256,
+        "intermediate_size": 12288,
+        "adaln_dim": 512,
+        "llm_features_dim": 53248,
+        "rope_theta": 5000000,
+        "mrope_section": [24, 20, 20],
+        "norm_eps": 1e-5,
+    }
+    latent_format = latent_formats.Flux2
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Ideogram4(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3vl_8b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.ideogram4.Ideogram4Tokenizer, comfy.text_encoders.ideogram4.te(**hunyuan_detect))
+
 class QwenImage(supported_models_base.BASE):
    unet_config = {
        "image_model": "qwen_image",
@@ -2233,6 +2272,7 @@ models = [
    ACEStep15,
    Omnigen2,
    QwenImage,
+    Ideogram4,
    Flux2,
    Lens,
    Kandinsky5Image,
@@ -0,0 +1,77 @@
+"""Ideogram 4 text encoder: Qwen3-VL-8B language model, 13-layer tap.
+
+Ideogram 4 conditions on the concatenation of hidden states from 13 layers of
+Qwen3-VL (layers 0,3,...,33,35), giving a 4096*13 = 53248-dim feature per token.
+"""
+
+import os
+
+from transformers import Qwen2Tokenizer
+
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+
+# Reference taps outputs of layers (0,3,...,35); comfy captures layer inputs, offset by +1.
+IDEOGRAM4_TAP_LAYERS = [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 36]
+
+
+class Qwen3VLTokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory,
+                         embedding_size=4096, embedding_key='qwen3vl_8b', tokenizer_class=Qwen2Tokenizer,
+                         has_start_token=False, has_end_token=False, pad_to_max_length=False,
+                         max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
+
+
+class Ideogram4Tokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data,
+                         name="qwen3vl_8b", tokenizer=Qwen3VLTokenizer)
+
+        self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
+        if llama_template is None:
+            llama_text = self.llama_template.format(text)
+        else:
+            llama_text = llama_template.format(text)
+        return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
+
+
+# Qwen3-VL-8B = 5e6 (vs plain Qwen3-8B's 1e6)
+# final_norm/lm_head off -> Ideogram only reads raw tapped hidden states
+QWEN3VL_8B_CONFIG = {"rope_theta": 5000000.0, "final_norm": False, "lm_head": False}
+
+
+class Qwen3VL8BModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="hidden", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        super().__init__(device=device, layer=IDEOGRAM4_TAP_LAYERS, layer_idx=None,
+                         textmodel_json_config=dict(QWEN3VL_8B_CONFIG),
+                         dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False,
+                         model_class=comfy.text_encoders.llama.Qwen3_8B,
+                         enable_attention_masks=attention_mask, return_attention_masks=attention_mask,
+                         model_options=model_options)
+
+
+class Ideogram4TEModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=Qwen3VL8BModel, model_options=model_options)
+
+    def encode_token_weights(self, token_weight_pairs):
+        out, pooled, extra = super().encode_token_weights(token_weight_pairs)
+        b, n, seq, h = out.shape # (B, n_taps=13, seq, 4096) stacked in ascending layer order.
+        out = out.permute(0, 2, 3, 1).reshape(b, seq, h * n) # (B, seq, 4096*13). permute -> (B, seq, H, taps).
+        return out, pooled, extra
+
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    class Ideogram4TEModel_(Ideogram4TEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if dtype_llama is not None:
+                dtype = dtype_llama
+            if llama_quantization_metadata is not None:
+                model_options = model_options.copy()
+                model_options["quantization_metadata"] = llama_quantization_metadata
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return Ideogram4TEModel_
@@ -285,7 +285,7 @@ class AudioSaveHelper:
        results = []
        for batch_number, waveform in enumerate(audio["waveform"].cpu()):
            filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
-            file = f"{filename_with_batch_num}_{counter:05}_.{format}"
+            file = f"{filename_with_batch_num}_{counter:05}.{format}"
            output_path = os.path.join(full_output_folder, file)

            # Use original sample rate initially
@@ -43,6 +43,7 @@ class BFLFluxEraseRequest(BaseModel):
        "white (255) marks areas to remove, black (0) marks areas to preserve.",
    )
    dilate_pixels: int = Field(10)
+    seed: int | None = Field(None)
    output_format: str = Field("png")


@@ -97,3 +97,28 @@ class BriaRemoveVideoBackgroundResult(BaseModel):
 class BriaRemoveVideoBackgroundResponse(BaseModel):
    status: str = Field(...)
    result: BriaRemoveVideoBackgroundResult | None = Field(None)
+
+
+class BriaVideoGreenScreenRequest(BaseModel):
+    video: str = Field(..., description="Publicly accessible URL of the input video.")
+    green_shade: str = Field(
+        default="broadcast_green",
+        description="Solid chroma-key shade applied behind the foreground "
+        "(broadcast_green, chroma_green, or blue_screen).",
+    )
+    output_container_and_codec: str = Field(...)
+    preserve_audio: bool = Field(True)
+    seed: int = Field(...)
+
+
+class BriaVideoReplaceBackgroundRequest(BaseModel):
+    video: str = Field(..., description="Publicly accessible URL of the input (foreground) video.")
+    background_url: str = Field(
+        ...,
+        description="Publicly accessible URL of the background image or video to composite behind "
+        "the foreground. Stretched to the foreground frame; match its aspect ratio for "
+        "undistorted results.",
+    )
+    output_container_and_codec: str = Field(...)
+    preserve_audio: bool = Field(True)
+    seed: int = Field(...)
@@ -290,3 +290,19 @@ class IdeogramV3Request(BaseModel):
        None,
        description='Optional masks for character reference images. When provided, must match the number of character_reference_images. Each mask should be a grayscale image of the same dimensions as the corresponding character reference image. The images should be in JPEG, PNG or WebP format.'
    )
+
+
+class IdeogramV4Request(BaseModel):
+    text_prompt: str | None = Field(
+        None,
+        description="Natural-language prompt; Magic Prompt is applied automatically. "
+        "Supply exactly one of text_prompt or json_prompt.",
+    )
+    json_prompt: dict[str, Any] | None = Field(
+        None,
+        description="Structured V4 prompt object consumed directly (disables Magic Prompt). "
+        "Supply exactly one of text_prompt or json_prompt.",
+    )
+    resolution: str | None = Field(None, description="Output resolution in WIDTHxHEIGHT (e.g. '2048x2048').")
+    rendering_speed: str | None = Field(None, description="Rendering speed: 'TURBO', 'DEFAULT', or 'QUALITY'.")
+    enable_copyright_detection: bool | None = Field(None, description="Opt into post-generation copyright detection.")
@@ -534,6 +534,15 @@ class FluxEraseNode(IO.ComfyNode):
                    max=25,
                    tooltip="Expands the mask boundaries to ensure clean coverage of the object's edges.",
                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="The random seed used for creating the noise.",
+                    optional=True,
+                ),
            ],
            outputs=[IO.Image.Output()],
            hidden=[
@@ -553,6 +562,7 @@ class FluxEraseNode(IO.ComfyNode):
        image: Input.Image,
        mask: Input.Image,
        dilate_pixels: int = 10,
+        seed: int = 0,
    ) -> IO.NodeOutput:
        validate_image_dimensions(image, min_width=256, min_height=256)
        mask = resize_mask_to_image(mask, image)
@@ -565,6 +575,7 @@ class FluxEraseNode(IO.ComfyNode):
                image=tensor_to_base64_string(image[:, :, :, :3]),  # make sure image will have alpha channel removed
                mask=mask,
                dilate_pixels=dilate_pixels,
+                seed=seed,
            ),
        )

@@ -1,14 +1,19 @@
+import av
+import torch
+from av.codec import CodecContext
 from typing_extensions import override

 from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api_nodes.apis.bria import (
    BriaEditImageRequest,
+    BriaImageEditResponse,
    BriaRemoveBackgroundRequest,
    BriaRemoveBackgroundResponse,
    BriaRemoveVideoBackgroundRequest,
    BriaRemoveVideoBackgroundResponse,
-    BriaImageEditResponse,
    BriaStatusResponse,
+    BriaVideoGreenScreenRequest,
+    BriaVideoReplaceBackgroundRequest,
    InputModerationSettings,
 )
 from comfy_api_nodes.util import (
@@ -316,6 +321,248 @@ class BriaRemoveVideoBackground(IO.ComfyNode):
        return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))


+class BriaVideoGreenScreen(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="BriaVideoGreenScreen",
+            display_name="Bria Video Green Screen",
+            category="partner/video/Bria",
+            description="Replace a video's background with a solid chroma-key screen using Bria.",
+            inputs=[
+                IO.Video.Input("video"),
+                IO.Combo.Input(
+                    "green_shade",
+                    options=["broadcast_green", "chroma_green", "blue_screen"],
+                    tooltip="Solid chroma-key shade applied behind the foreground: "
+                    "broadcast_green (#00B140), chroma_green (#00FF00), or blue_screen (#0000FF).",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        green_shade: str,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_video_duration(video, max_duration=60.0)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/bria/v2/video/edit/green_screen", method="POST"),
+            data=BriaVideoGreenScreenRequest(
+                video=await upload_video_to_comfyapi(cls, video),
+                green_shade=green_shade,
+                output_container_and_codec="mp4_h264",
+                seed=seed,
+            ),
+            response_model=BriaStatusResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
+            status_extractor=lambda r: r.status,
+            response_model=BriaRemoveVideoBackgroundResponse,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))
+
+
+class BriaVideoReplaceBackground(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="BriaVideoReplaceBackground",
+            display_name="Bria Video Replace Background",
+            category="partner/video/Bria",
+            description="Replace a video's background with a supplied image or video using Bria. "
+            "The output keeps the foreground's resolution and frame rate; a background with a "
+            "different aspect ratio is stretched to fit, so match it for undistorted results.",
+            inputs=[
+                IO.Video.Input("video", tooltip="Foreground video whose background is replaced."),
+                IO.Image.Input(
+                    "background_image",
+                    optional=True,
+                    tooltip="Background image to composite behind the foreground. "
+                    "Provide either a background image or a background video, not both.",
+                ),
+                IO.Video.Input(
+                    "background_video",
+                    optional=True,
+                    tooltip="Background video to composite behind the foreground. "
+                    "Provide either a background image or a background video, not both.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        seed: int,
+        background_image: Input.Image | None = None,
+        background_video: Input.Video | None = None,
+    ) -> IO.NodeOutput:
+        if (background_image is None) == (background_video is None):
+            raise ValueError("Provide either a background image or a background video, not both.")
+        validate_video_duration(video, max_duration=60.0)
+        if background_video is not None:
+            validate_video_duration(background_video, max_duration=60.0)
+            background_url = await upload_video_to_comfyapi(cls, background_video, wait_label="Uploading background")
+        else:
+            background_url = await upload_image_to_comfyapi(cls, background_image, wait_label="Uploading background")
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/bria/v2/video/edit/replace_background", method="POST"),
+            data=BriaVideoReplaceBackgroundRequest(
+                video=await upload_video_to_comfyapi(cls, video),
+                background_url=background_url,
+                output_container_and_codec="mp4_h264",
+                seed=seed,
+            ),
+            response_model=BriaStatusResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
+            status_extractor=lambda r: r.status,
+            response_model=BriaRemoveVideoBackgroundResponse,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))
+
+
+def _video_to_images_and_mask(video: Input.Video) -> tuple[Input.Image, Input.Mask]:
+    """Decode a transparent webm (VP9 + alpha) into image frames and an alpha mask.
+
+    VP9 keeps its alpha in a side layer that PyAV's default vp9 decoder drops, so the frames
+    are decoded with libvpx-vp9. Returns RGB images [B,H,W,3] in 0..1 and a mask [B,H,W]
+    following the Load Image convention (1 = transparent) for compositing or Save WEBM.
+    """
+    rgb_frames: list[torch.Tensor] = []
+    alpha_frames: list[torch.Tensor] = []
+    with av.open(video.get_stream_source(), mode="r") as container:
+        stream = container.streams.video[0]
+        decoder = CodecContext.create("libvpx-vp9", "r") if stream.codec_context.name == "vp9" else None
+        for packet in container.demux(stream):
+            for frame in (decoder.decode(packet) if decoder is not None else packet.decode()):
+                rgba = torch.from_numpy(frame.to_ndarray(format="rgba")).float() / 255.0
+                rgb_frames.append(rgba[..., :3])
+                alpha_frames.append(rgba[..., 3])
+    images = torch.stack(rgb_frames) if rgb_frames else torch.zeros(0, 0, 0, 3)
+    mask = (1.0 - torch.stack(alpha_frames)) if alpha_frames else torch.zeros((images.shape[0], 64, 64))
+    return images, mask
+
+
+class BriaTransparentVideoBackground(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="BriaTransparentVideoBackground",
+            display_name="Bria Remove Video Background (Transparent)",
+            category="partner/video/Bria",
+            description="Remove the background from a video using Bria and return the cut-out frames "
+            "plus an alpha mask. Connect both to a compositing node, or feed them to Save WEBM to "
+            "write a transparent video.",
+            inputs=[
+                IO.Video.Input("video"),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(display_name="images"),
+                IO.Mask.Output(display_name="mask"),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_video_duration(video, max_duration=60.0)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/bria/v2/video/edit/remove_background", method="POST"),
+            data=BriaRemoveVideoBackgroundRequest(
+                video=await upload_video_to_comfyapi(cls, video),
+                background_color="Transparent",
+                output_container_and_codec="webm_vp9",
+                seed=seed,
+            ),
+            response_model=BriaStatusResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
+            status_extractor=lambda r: r.status,
+            response_model=BriaRemoveVideoBackgroundResponse,
+        )
+        video_out = await download_url_to_video_output(response.result.video_url)
+        images, mask = _video_to_images_and_mask(video_out)
+        return IO.NodeOutput(images, mask)
+
+
 class BriaExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -323,6 +570,9 @@ class BriaExtension(ComfyExtension):
            BriaImageEditNode,
            BriaRemoveImageBackground,
            BriaRemoveVideoBackground,
+            BriaVideoGreenScreen,
+            # BriaVideoReplaceBackground,  # server returns Status 500 when we pass background video
+            BriaTransparentVideoBackground,
        ]


@@ -7,6 +7,7 @@ from io import BytesIO
 import torch
 from typing_extensions import override

+from comfy.utils import common_upscale
 from comfy_api.latest import IO, ComfyExtension, Input, Types
 from comfy_api_nodes.apis.bytedance import (
    RECOMMENDED_PRESETS,
@@ -131,6 +132,44 @@ def _prepare_seedance_image(image: Input.Image) -> Input.Image:
    return image


+# Supported output aspect ratios, used to pre-size FLF frames to matching pixel pair to avoid the 1080p stretch jump.
+SEEDANCE2_RATIO_WH = {
+    "16:9": (16, 9),
+    "4:3": (4, 3),
+    "1:1": (1, 1),
+    "3:4": (3, 4),
+    "9:16": (9, 16),
+    "21:9": (21, 9),
+}
+SEEDANCE2_RES_SHORT_SIDE = {"480p": 480, "720p": 720, "1080p": 1080}
+
+
+def _seedance2_target_dims(resolution: str, ratio: str, image: torch.Tensor) -> tuple[int, int]:
+    """Exact supported output (width, height) for (resolution, ratio).
+
+    The shorter side equals the resolution number (e.g. 1080p 16:9 -> 1920x1080). For ratio
+    "adaptive" (or any unexpected value) the ratio is derived from the image's own aspect, snapped
+    to the nearest supported ratio, so the output keeps the frame's orientation.
+    """
+    short = SEEDANCE2_RES_SHORT_SIDE[resolution]
+    if ratio not in SEEDANCE2_RATIO_WH:
+        aspect = image.shape[-2] / image.shape[-3]  # W / H; tensor is (B, H, W, C)
+        ratio = min(SEEDANCE2_RATIO_WH, key=lambda k: abs(SEEDANCE2_RATIO_WH[k][0] / SEEDANCE2_RATIO_WH[k][1] - aspect))
+    rw, rh = SEEDANCE2_RATIO_WH[ratio]
+    if rw >= rh:  # landscape or square: shorter side is the height
+        out_w, out_h = round(short * rw / rh), short
+    else:  # portrait: shorter side is the width
+        out_w, out_h = short, round(short * rh / rw)
+    return out_w - out_w % 2, out_h - out_h % 2
+
+
+def _resize_to_exact(image: torch.Tensor, width: int, height: int) -> torch.Tensor:
+    """Center-crop to the target aspect and resize to exactly width x height (lanczos)."""
+    samples = image.movedim(-1, 1)  # (B, H, W, C) -> (B, C, H, W)
+    resized = common_upscale(samples, width, height, "lanczos", "center")
+    return resized.movedim(1, -1)
+
+
 async def _resolve_reference_assets(
    cls: type[IO.ComfyNode],
    asset_ids: list[str],
@@ -1790,10 +1829,28 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
        if last_frame is not None and last_frame_asset_id:
            raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.")

-        if first_frame is not None:
-            first_frame = _prepare_seedance_image(first_frame)
-        if last_frame is not None:
-            last_frame = _prepare_seedance_image(last_frame)
+        request_ratio = model["ratio"]
+        if first_frame_asset_id or last_frame_asset_id:
+            if first_frame is not None:
+                first_frame = _prepare_seedance_image(first_frame)
+            if last_frame is not None:
+                last_frame = _prepare_seedance_image(last_frame)
+        else:
+            # The 1080p FLF stretch fix (pre-size frames to a supported pixel pair + submit ratio="adaptive")
+            # only applies to local image inputs we can resize.
+            request_ratio = "adaptive"
+            target_dims: tuple[int, int] | None = None
+            if first_frame is not None:
+                validate_image_aspect_ratio(first_frame, (2, 5), (5, 2), strict=False)  # 0.4 to 2.5
+                validate_image_dimensions(first_frame, min_width=300, min_height=300)
+                target_dims = _seedance2_target_dims(model["resolution"], model["ratio"], first_frame)
+                first_frame = _resize_to_exact(first_frame, *target_dims)
+            if last_frame is not None:
+                validate_image_aspect_ratio(last_frame, (2, 5), (5, 2), strict=False)  # 0.4 to 2.5
+                validate_image_dimensions(last_frame, min_width=300, min_height=300)
+                if target_dims is None:
+                    target_dims = _seedance2_target_dims(model["resolution"], model["ratio"], last_frame)
+                last_frame = _resize_to_exact(last_frame, *target_dims)

        asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a]
        image_assets: dict[str, str] = {}
@@ -1844,7 +1901,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
                content=content,
                generate_audio=model["generate_audio"],
                resolution=model["resolution"],
-                ratio=model["ratio"],
+                ratio=request_ratio,
                duration=model["duration"],
                seed=seed,
                watermark=watermark,
@@ -10,6 +10,7 @@ from comfy_api_nodes.apis.ideogram import (
    ImageRequest,
    IdeogramV3Request,
    IdeogramV3EditRequest,
+    IdeogramV4Request,
 )
 from comfy_api_nodes.util import (
    ApiEndpoint,
@@ -17,6 +18,7 @@ from comfy_api_nodes.util import (
    download_url_as_bytesio,
    resize_mask_to_image,
    sync_op,
+    validate_string,
 )

 V1_V1_RES_MAP = {
@@ -798,6 +800,119 @@ class IdeogramV3(IO.ComfyNode):
        return IO.NodeOutput(await download_and_process_images(image_urls))


+class IdeogramV4(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="IdeogramV4",
+            display_name="Ideogram V4",
+            category="partner/image/Ideogram",
+            description="Generates images using the Ideogram 4.0 model from a text prompt.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text prompt for the image generation.",
+                ),
+                IO.Combo.Input(
+                    "resolution",
+                    options=[
+                        "Auto",
+                        "2048x2048 (1:1)",
+                        "1440x2880 (1:2)",
+                        "2880x1440 (2:1)",
+                        "1664x2496 (2:3)",
+                        "2496x1664 (3:2)",
+                        "1792x2240 (4:5)",
+                        "2240x1792 (5:4)",
+                        "1440x2560 (9:16)",
+                        "2560x1440 (16:9)",
+                        "1600x2560 (5:8)",
+                        "2560x1600 (8:5)",
+                        "1728x2304 (3:4)",
+                        "2304x1728 (4:3)",
+                        "1296x3168 (9:22)",
+                        "3168x1296 (22:9)",
+                        "1152x2944 (9:23)",
+                        "2944x1152 (23:9)",
+                        "1248x3328 (3:8)",
+                        "3328x1248 (8:3)",
+                        "1280x3072 (5:12)",
+                        "3072x1280 (12:5)",
+                    ],
+                    default="Auto",
+                ),
+                IO.Combo.Input(
+                    "rendering_speed",
+                    options=["DEFAULT", "TURBO", "QUALITY"],
+                    default="DEFAULT",
+                    tooltip="Controls the trade-off between generation speed and quality.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    control_after_generate=True,
+                    display_mode=IO.NumberDisplay.number,
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["rendering_speed"]),
+                expr="""
+                (
+                  $speed := widgets.rendering_speed;
+                  $price :=
+                    $contains($speed,"turbo") ? 0.0429 :
+                    $contains($speed,"quality") ? 0.143 :
+                    0.0858;
+                  {"type":"usd","usd": $price}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        resolution: str,
+        rendering_speed: str,
+        seed: int,
+    ):
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/ideogram/ideogram-v4/generate", method="POST"),
+            response_model=IdeogramGenerateResponse,
+            data=IdeogramV4Request(
+                text_prompt=prompt,
+                resolution=resolution.split(" ")[0] if resolution != "Auto" else None,
+                rendering_speed=rendering_speed,
+            ),
+            max_retries=1,
+        )
+
+        if not response.data or len(response.data) == 0:
+            raise Exception("No images were generated in the response")
+        image_urls = [image_data.url for image_data in response.data if image_data.url]
+        if not image_urls:
+            raise Exception("No image URLs were generated in the response")
+        return IO.NodeOutput(await download_and_process_images(image_urls))
+
+
 class IdeogramExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -805,6 +920,7 @@ class IdeogramExtension(ComfyExtension):
            IdeogramV1,
            IdeogramV2,
            IdeogramV3,
+            IdeogramV4,
        ]


@@ -42,9 +42,11 @@ async def _upload_image_to_krea_assets(cls: type[IO.ComfyNode], image: Input.Ima


 _MODEL_MEDIUM = "Krea 2 Medium"
+_MODEL_MEDIUM_TURBO = "Krea 2 Medium Turbo"
 _MODEL_LARGE = "Krea 2 Large"
 _MODEL_ENDPOINTS: dict[str, str] = {
    _MODEL_MEDIUM: "/proxy/krea/generate/image/krea/krea-2/medium",
+    _MODEL_MEDIUM_TURBO: "/proxy/krea/generate/image/krea/krea-2/medium-turbo",
    _MODEL_LARGE: "/proxy/krea/generate/image/krea/krea-2/large",
 }

@@ -57,7 +59,7 @@ _UUID_RE = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F


 def _krea_model_inputs() -> list:
-    """Nested inputs shared by both Krea 2 Medium and Large under the DynamicCombo."""
+    """Nested inputs shared by Krea 2 Medium, Medium Turbo and Large under the DynamicCombo."""
    return [
        IO.Combo.Input(
            "aspect_ratio",
@@ -123,6 +125,7 @@ class Krea2ImageNode(IO.ComfyNode):
                    "model",
                    options=[
                        IO.DynamicCombo.Option(_MODEL_MEDIUM, _krea_model_inputs()),
+                        IO.DynamicCombo.Option(_MODEL_MEDIUM_TURBO, _krea_model_inputs()),
                        IO.DynamicCombo.Option(_MODEL_LARGE, _krea_model_inputs()),
                    ],
                    tooltip="Krea 2 Medium is best for expressive illustrations; "
@@ -151,14 +154,15 @@ class Krea2ImageNode(IO.ComfyNode):
                ),
                expr="""
                (
-                  $isLarge := widgets.model = "krea 2 large";
+                  $rates := {
+                    "krea 2 medium turbo": {"text": 0.015, "style": 0.0175, "moodboard": 0.02},
+                    "krea 2 medium": {"text": 0.03, "style": 0.035, "moodboard": 0.04},
+                    "krea 2 large": {"text": 0.06, "style": 0.065, "moodboard": 0.07}
+                  };
+                  $r := $lookup($rates, widgets.model);
                  $hasMoodboard := $length($lookup(widgets, "model.moodboard_id")) > 0;
                  $hasStyle := $lookup(inputs, "model.style_reference").connected;
-                  $usd := $hasMoodboard
-                    ? ($isLarge ? 0.07 : 0.04)
-                    : ($hasStyle
-                        ? ($isLarge ? 0.065 : 0.035)
-                        : ($isLarge ? 0.06 : 0.03));
+                  $usd := $hasMoodboard ? $r.moodboard : ($hasStyle ? $r.style : $r.text);
                  {"type":"usd","usd": $usd}
                )
                """,
@@ -158,7 +158,7 @@ class SaveAudio(IO.ComfyNode):
        return IO.Schema(
            node_id="SaveAudio",
            search_aliases=["export flac"],
-            display_name="Save Audio (FLAC)",
+            display_name="Save Audio (FLAC) (Deprecated)",
            category="audio",
            essentials_category="Audio",
            inputs=[
@@ -167,6 +167,7 @@ class SaveAudio(IO.ComfyNode):
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
+            is_deprecated=True,
        )

    @classmethod
@@ -186,7 +187,7 @@ class SaveAudioMP3(IO.ComfyNode):
        return IO.Schema(
            node_id="SaveAudioMP3",
            search_aliases=["export mp3"],
-            display_name="Save Audio (MP3)",
+            display_name="Save Audio (MP3) (Deprecated)",
            category="audio",
            essentials_category="Audio",
            inputs=[
@@ -196,6 +197,7 @@ class SaveAudioMP3(IO.ComfyNode):
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
+            is_deprecated=True,
        )

    @classmethod
@@ -217,7 +219,7 @@ class SaveAudioOpus(IO.ComfyNode):
        return IO.Schema(
            node_id="SaveAudioOpus",
            search_aliases=["export opus"],
-            display_name="Save Audio (Opus)",
+            display_name="Save Audio (Opus) (Deprecated)",
            category="audio",
            inputs=[
                IO.Audio.Input("audio"),
@@ -226,6 +228,7 @@ class SaveAudioOpus(IO.ComfyNode):
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
+            is_deprecated=True,
        )

    @classmethod
@@ -241,6 +244,54 @@ class SaveAudioOpus(IO.ComfyNode):
    save_opus = execute  # TODO: remove


+class SaveAudioAdvanced(IO.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="SaveAudioAdvanced",
+            search_aliases=["save audio", "export audio", "output audio", "write audio", "flac", "mp3", "opus"],
+            display_name="Save Audio (Advanced)",
+            description="Saves the input audio to your ComfyUI output directory.",
+            category="audio",
+            inputs=[
+                IO.Audio.Input("audio", tooltip="The audio to save."),
+                IO.String.Input(
+                    "filename_prefix",
+                    default="audio/ComfyUI",
+                    tooltip=(
+                        "The prefix for the file to save. May include formatting tokens "
+                        "such as %date:yyyy-MM-dd%."
+                    ),
+                ),
+                IO.DynamicCombo.Input(
+                    "format",
+                    options=[
+                        IO.DynamicCombo.Option("flac", []),
+                        IO.DynamicCombo.Option("mp3", [
+                            IO.Combo.Input("quality", options=["V0", "128k", "320k"], default="V0"),
+                        ]),
+                        IO.DynamicCombo.Option("opus", [
+                            IO.Combo.Input("quality", options=["64k", "96k", "128k", "192k", "320k"], default="128k"),
+                        ]),
+                    ],
+                    tooltip="The file format in which to save the audio.",
+                ),
+            ],
+            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
+            is_output_node=True,
+        )
+
+    @classmethod
+    def execute(cls, audio, filename_prefix: str, format: dict) -> IO.NodeOutput:
+        file_format = format.get("format", None)
+        quality = format.get("quality", None)
+        if quality:
+            ui=UI.AudioSaveHelper.get_save_audio_ui(audio, filename_prefix=filename_prefix, cls=cls, format=file_format, quality=quality)
+        else:
+            ui=UI.AudioSaveHelper.get_save_audio_ui(audio, filename_prefix=filename_prefix, cls=cls, format=file_format)
+        return IO.NodeOutput(ui=ui)
+
+
 class PreviewAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -822,6 +873,7 @@ class AudioExtension(ComfyExtension):
            SaveAudio,
            SaveAudioMP3,
            SaveAudioOpus,
+            SaveAudioAdvanced,
            LoadAudio,
            PreviewAudio,
            ConditioningStableAudio,
@@ -1,5 +1,7 @@
 import math
 import comfy.samplers
+import comfy.sampler_helpers
+import comfy.patcher_extension
 import comfy.sample
 from comfy.k_diffusion import sampling as k_diffusion_sampling
 from comfy.k_diffusion import sa_solver
@@ -894,6 +896,84 @@ class DualCFGGuider(io.ComfyNode):

    get_guider = execute

+class Guider_DualModel(comfy.samplers.CFGGuider):
+    # Runs the positive (cond) pass on the main model and the negative (uncond) pass on a separate model
+    def __init__(self, model_patcher, uncond_model_patcher):
+        super().__init__(model_patcher)
+        self.uncond_model_patcher = uncond_model_patcher
+        self.uncond_inner = None
+
+    def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None, latent_shapes=None):
+        self.uncond_inner = None
+        self.uncond_loaded = []
+        self._uncond_neg = None
+        # skip at cfg 1.0
+        if not math.isclose(self.cfg, 1.0):
+            uc = {"negative": list(map(lambda a: a.copy(), self.conds["negative"]))}
+            self.uncond_inner, uc, self.uncond_loaded = comfy.sampler_helpers.prepare_sampling(
+                self.uncond_model_patcher, noise.shape, uc, self.uncond_model_patcher.model_options)
+            self._uncond_neg = uc["negative"]
+            self.uncond_model_patcher.pre_run()
+        try:
+            return super().outer_sample(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
+        finally:
+            if self.uncond_inner is not None:
+                self.uncond_model_patcher.cleanup()
+                comfy.sampler_helpers.cleanup_models({"negative": self._uncond_neg}, self.uncond_loaded)
+                self.uncond_inner = None
+
+    def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=None):
+        if self.uncond_inner is not None:
+            li = latent_image
+            if li is not None and torch.count_nonzero(li) > 0:
+                li = self.uncond_inner.process_latent_in(li)
+            self._uncond_conds = comfy.samplers.process_conds(
+                self.uncond_inner, noise, {"negative": self._uncond_neg}, device, li, denoise_mask, seed, latent_shapes=latent_shapes)["negative"]
+        return super().inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
+
+    def predict_noise(self, x, timestep, model_options={}, seed=None):
+        positive = self.conds.get("positive", None)
+        if self.uncond_inner is None:  # cfg == 1 or no negative -> single model, cond only
+            return comfy.samplers.calc_cond_batch(self.inner_model, [positive], x, timestep, model_options)[0]
+        cond = comfy.samplers.calc_cond_batch(self.inner_model, [positive], x, timestep, model_options)[0]
+
+        uncond_model_options = model_options
+        if "multigpu_clones" in model_options: # TODO: support multigpu instead of just running uncond on a single GPU
+            uncond_model_options = {k: v for k, v in model_options.items() if k != "multigpu_clones"}
+        uncond = comfy.samplers.calc_cond_batch(self.uncond_inner, [self._uncond_conds], x, timestep, uncond_model_options)[0]
+        return comfy.samplers.cfg_function(self.inner_model, cond, uncond, self.cfg, x, timestep,
+                                           model_options=model_options, cond=positive, uncond=self._uncond_conds)
+
+class DualModelGuider(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="DualModelGuider",
+            display_name="Dual Model CFG Guider",
+            category="model/sampling/guiders",
+            is_experimental=True,
+            inputs=[
+                io.Model.Input("model", tooltip="Model used for the positive (conditional) pass."),
+                io.Model.Input("model_negative", optional=True, tooltip="Model used for the negative (unconditional) pass. Use the same model for ordinary CFG."),
+                io.Conditioning.Input("positive"),
+                io.Float.Input("cfg", default=4.0, min=0.0, max=100.0, step=0.1, round=0.01),
+                io.Conditioning.Input("negative", optional=True, tooltip="Negative conditioning run on the negative model. Leave unconnected for a text-free (image-only) unconditional pass."),
+            ],
+            outputs=[io.Guider.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model, positive, cfg, model_negative=None, negative=None) -> io.NodeOutput:
+        if negative is None:
+            negative = [[None, {}]]  # null cond -> no cross_attn -> model runs image-only
+
+        guider = Guider_DualModel(model, model_negative) if model_negative is not None else comfy.samplers.CFGGuider(model)
+        guider.set_conds(positive, negative)
+        guider.set_cfg(cfg)
+        return io.NodeOutput(guider)
+
+    get_guider = execute
+
 class DisableNoise(io.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -1054,11 +1134,53 @@ class ManualSigmas(io.ComfyNode):
        sigmas = torch.FloatTensor(sigmas)
        return io.NodeOutput(sigmas)

+class CFGOverride(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="CFGOverride",
+            display_name="CFG Override",
+            description="Override cfg to a fixed value over a [start, end] percent slice of the steps. "
+                        "With multiple overrides, the one nearest the sampler wins on overlap.",
+            category="sampling/custom_sampling",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("cfg", default=1.0, min=0.0, max=100.0, step=0.1, round=0.01),
+                io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001),
+                io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001),
+            ],
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model, cfg, start_percent, end_percent) -> io.NodeOutput:
+        ms = model.get_model_object("model_sampling")
+        sigma_hi = ms.percent_to_sigma(start_percent)  # percent->sigma decreasing, so hi >= lo
+        sigma_lo = ms.percent_to_sigma(end_percent)
+
+        def predict_noise_wrapper(executor, *args, **kwargs):
+            sigma = float(args[1].flatten()[0])        # args = (x, timestep, model_options, seed)
+            if not (sigma_lo <= sigma <= sigma_hi):
+                return executor(*args, **kwargs)
+            guider = executor.class_obj                # guider.cfg feeds cond_scale
+            saved = guider.cfg
+            guider.cfg = cfg
+            try:
+                return executor(*args, **kwargs)
+            finally:
+                guider.cfg = saved                     # restore for other steps/overrides
+
+        m = model.clone()
+        m.add_wrapper(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, predict_noise_wrapper)
+        return io.NodeOutput(m)
+
+
 class CustomSamplersExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            SamplerCustom,
+            CFGOverride,
            BasicScheduler,
            KarrasScheduler,
            ExponentialScheduler,
@@ -1087,6 +1209,7 @@ class CustomSamplersExtension(ComfyExtension):
            SamplingPercentToSigma,
            CFGGuider,
            DualCFGGuider,
+            DualModelGuider,
            BasicGuider,
            RandomNoise,
            DisableNoise,
@@ -411,6 +411,21 @@ class ImageProcessingNode(io.ComfyNode):

        return has_group

+    @classmethod
+    def _ensure_image_list(cls, images):
+        """Normalize to a flat list of [1, H, W, C] tensors."""
+        if isinstance(images, torch.Tensor):
+            if images.ndim != 4:
+                raise ValueError(f"Expected 4D image tensor, got shape {tuple(images.shape)}")
+            return [images[i:i+1] for i in range(images.shape[0])]
+
+        flat = []
+        for item in images:
+            if not isinstance(item, torch.Tensor) or item.ndim != 4:
+                raise ValueError(f"Expected 4D image tensor, got {type(item).__name__} shape {getattr(item, 'shape', None)}")
+            flat.extend([item[i:i+1] for i in range(item.shape[0])])
+        return flat
+
    @classmethod
    def define_schema(cls):
        if cls.node_id is None:
@@ -458,6 +473,9 @@ class ImageProcessingNode(io.ComfyNode):
        """Execute the node. Routes to _process or _group_process based on mode."""
        is_group = cls._detect_processing_mode()

+        if is_group:
+            images = cls._ensure_image_list(images)
+
        # Extract scalar values from lists for parameters
        params = {}
        for k, v in kwargs.items():
@@ -0,0 +1,64 @@
+"""Ideogram 4 sampling helper
+"""
+
+import math
+
+import torch
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+_LOGSNR_MIN = -15.0
+_LOGSNR_MAX = 18.0
+
+
+def _logit_normal_schedule(u, mean, std):
+    # Reference time (0=noise..1=clean) via the probit/ndtri quantile.
+    u = torch.as_tensor(u, dtype=torch.float64)
+    t = 1.0 - torch.special.expit(mean + std * torch.special.ndtri(u))
+    t_min = 1.0 / (1.0 + math.exp(0.5 * _LOGSNR_MAX))
+    t_max = 1.0 / (1.0 + math.exp(0.5 * _LOGSNR_MIN))
+    return t.clamp(t_min, t_max)
+
+
+def ideogram4_sigmas(num_steps, width, height, mu, std):
+    """Descending sigmas (len num_steps+1) for the reference schedule.
+
+    mu + the resolution term form the logSNR shift; std is the spread.
+    """
+    mean = mu + 0.5 * math.log((width * height) / (512 * 512))
+    u = torch.linspace(0.0, 1.0, num_steps + 1, dtype=torch.float64)
+    sigmas = (1.0 - _logit_normal_schedule(u, mean, std)).flip(0)
+    sigmas[-1] = 0.0                                      # clamp leaves ~6e-4; force full denoise
+    return sigmas.to(torch.float32)
+
+
+class Ideogram4Scheduler(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="Ideogram4Scheduler",
+            display_name="Ideogram 4 Scheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=200),
+                io.Int.Input("width", default=1024, min=256, max=8192, step=16),
+                io.Int.Input("height", default=1024, min=256, max=8192, step=16),
+                io.Float.Input("mu", default=0.0, min=-10.0, max=10.0, step=0.05),
+                io.Float.Input("std", default=1.75, min=0.1, max=5.0, step=0.05),
+            ],
+            outputs=[io.Sigmas.Output()],
+        )
+
+    @classmethod
+    def execute(cls, steps, width, height, mu, std) -> io.NodeOutput:
+        return io.NodeOutput(ideogram4_sigmas(steps, width, height, mu, std))
+
+
+class Ideogram4Extension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [Ideogram4Scheduler]
+
+
+async def comfy_entrypoint() -> Ideogram4Extension:
+    return Ideogram4Extension()
@@ -21,8 +21,8 @@ class PiDConditioning(io.ComfyNode):
            inputs=[
                io.Conditioning.Input("positive"),
                io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."),
-                io.Combo.Input("latent_format", options=["flux", "sd3"], default="flux",
-                               tooltip="Flux1 and Flux2 latents auto-detected from channel dim, sd3 has to be selected manually."),
+                io.Combo.Input("latent_format", options=["flux", "sd3", "sdxl", "qwenimage"], default="flux",
+                               tooltip="Flux1 (16-ch) and Flux2 (128-ch) latents are auto-detected from channel dim under 'flux'. For SD3 (16-ch), SDXL (4-ch), or QwenImage (16-ch), select manually."),
                io.Float.Input(
                    "degrade_sigma", default=0.0, min=0.0, max=1.0, step=0.01,
                    tooltip="0 = clean latent. Increase to denoise corrupted latent outputs.",
@@ -36,9 +36,17 @@ class PiDConditioning(io.ComfyNode):
        samples = latent["samples"]
        if latent_format == "flux":
            fmt_cls = comfy.latent_formats.Flux2 if samples.shape[1] == 128 else comfy.latent_formats.Flux
-        else:
+        elif latent_format == "sd3":
            fmt_cls = comfy.latent_formats.SD3
+        elif latent_format == "sdxl":
+            fmt_cls = comfy.latent_formats.SDXL
+        elif latent_format == "qwenimage":
+            fmt_cls = comfy.latent_formats.Wan21
+        else:
+            raise ValueError(f"Unknown latent_format: {latent_format}")
        lq_latent = fmt_cls().process_in(samples)
+        if lq_latent.ndim == 5:
+            lq_latent = lq_latent[:, :, 0]
        sigma_t = torch.tensor([float(degrade_sigma)], dtype=torch.float32)
        return io.NodeOutput(node_helpers.conditioning_set_values(
            positive, {"lq_latent": lq_latent, "degrade_sigma": sigma_t},
@@ -19,7 +19,7 @@ class SaveWEBM(io.ComfyNode):
            category="video",
            is_experimental=True,
            inputs=[
-                io.Image.Input("images"),
+                io.Image.Input("images", tooltip="RGBA images are saved with their alpha channel as transparency (vp9 codec only)."),
                io.String.Input("filename_prefix", default="ComfyUI"),
                io.Combo.Input("codec", options=["vp9", "av1"]),
                io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01),
@@ -45,18 +45,25 @@ class SaveWEBM(io.ComfyNode):
            for x in cls.hidden.extra_pnginfo:
                container.metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])

+        # Save transparency when the images carry an alpha channel (RGBA) and the codec supports it.
+        # vp9 -> yuva420p; other codecs have no usable alpha path, so the alpha is ignored.
+        save_alpha = images.shape[-1] == 4 and codec == "vp9"
+
        codec_map = {"vp9": "libvpx-vp9", "av1": "libsvtav1"}
        stream = container.add_stream(codec_map[codec], rate=Fraction(round(fps * 1000), 1000))
        stream.width = images.shape[-2]
        stream.height = images.shape[-3]
-        stream.pix_fmt = "yuv420p10le" if codec == "av1" else "yuv420p"
+        stream.pix_fmt = "yuva420p" if save_alpha else ("yuv420p10le" if codec == "av1" else "yuv420p")
        stream.bit_rate = 0
        stream.options = {'crf': str(crf)}
        if codec == "av1":
            stream.options["preset"] = "6"

        for frame in images:
-            frame = av.VideoFrame.from_ndarray(torch.clamp(frame[..., :3] * 255, min=0, max=255).to(device=torch.device("cpu"), dtype=torch.uint8).numpy(), format="rgb24")
+            if save_alpha:
+                frame = av.VideoFrame.from_ndarray(torch.clamp(frame[..., :4] * 255, min=0, max=255).to(device=torch.device("cpu"), dtype=torch.uint8).numpy(), format="rgba")
+            else:
+                frame = av.VideoFrame.from_ndarray(torch.clamp(frame[..., :3] * 255, min=0, max=255).to(device=torch.device("cpu"), dtype=torch.uint8).numpy(), format="rgb24")
            for packet in stream.encode(frame):
                container.mux(packet)
        container.mux(stream.encode())
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.23.0"
+__version__ = "0.24.0"
@@ -969,7 +969,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
@@ -2362,6 +2362,7 @@ async def init_builtin_extra_nodes():
        "nodes_model_downscale.py",
        "nodes_images.py",
        "nodes_video_model.py",
+        "nodes_ideogram4.py",
        "nodes_train.py",
        "nodes_dataset.py",
        "nodes_sag.py",
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.23.0"
+version = "0.24.0"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
@@ -1,5 +1,5 @@
-comfyui-frontend-package==1.44.19
-comfyui-workflow-templates==0.9.92
+comfyui-frontend-package==1.45.15
+comfyui-workflow-templates==0.9.98
 comfyui-embedded-docs==0.5.2
 torch
 torchsde
@@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=16.0.0
 comfy-kitchen==0.2.10
-comfy-aimdo==0.4.7
+comfy-aimdo==0.4.8
 requests
 simpleeval>=1.0.0
 blake3