Merge 6d4f9e86ab into 96f1cee9f5

chore(api-nodes): always display the custom width and height in GPTImage2 node (#13651)
Signed-off-by: bigcat88 <bigcat88@icloud.com>
2026-05-30 02:47:24 +08:00 · 2026-05-01 11:01:11 +03:00 · 2026-04-30 23:15:11 -07:00 · 2026-04-30 21:49:31 -04:00 · 2026-04-30 18:14:28 -07:00 · 2026-04-30 19:33:41 -04:00
24 changed files with 2719 additions and 117 deletions
--- a/2
+++ b/2
@ -1,2 +1,2 @@
 # Admins
-* @comfyanonymous @kosinkadink @guill
+* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -224,6 +224,7 @@ class Flux2(LatentFormat):

        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
        self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
+        self.taesd_decoder_name = "taef2_decoder"

    def process_in(self, latent):
        return latent
@ -783,3 +784,10 @@ class ZImagePixelSpace(ChromaRadiance):
    No VAE encoding/decoding — the model operates directly on RGB pixels.
    """
    pass
+
+class CogVideoX(LatentFormat):
+    """Latent format description for CogVideoX latents."""
+    # Channel count of the latent tensor.
+    latent_channels = 16
+    # NOTE(review): presumably the three latent axes are (time, height, width) — confirm against LatentFormat.
+    latent_dimensions = 3
+
+    def __init__(self):
+        # Scale factor for this latent space; how it is applied is defined by the base class (not visible here).
+        self.scale_factor = 1.15258426
--- a/comfy/ldm/cogvideo/__init__.py
+++ b/comfy/ldm/cogvideo/__init__.py
--- a/comfy/ldm/cogvideo/model.py
+++ b/comfy/ldm/cogvideo/model.py
@ -0,0 +1,573 @@
+# CogVideoX 3D Transformer - ported to ComfyUI native ops
+# Architecture reference: diffusers CogVideoXTransformer3DModel
+# Style reference: comfy/ldm/wan/model.py
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from comfy.ldm.modules.attention import optimized_attention
+import comfy.patcher_extension
+import comfy.ldm.common_dit
+
+
+def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
+    """Build 1D rotary cos/sin tables for the positions in ``pos``.
+
+    One inverse frequency is generated per channel pair (``dim // 2`` of them
+    for an even ``dim``); each angle is then duplicated for both members of
+    its pair, so the returned ``(cos, sin)`` float32 tensors have shape
+    [len(pos), dim] in CogVideoX's interleaved (real, imag) layout.
+    """
+    pair_index = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)
+    inv_freq = 1.0 / (theta ** (pair_index / dim))
+    # [seq_len, n_pairs] table of position * inverse-frequency angles
+    phase = torch.outer(pos.float(), inv_freq.float())
+    cos_table = phase.cos().repeat_interleave(2, dim=-1).float()
+    sin_table = phase.sin().repeat_interleave(2, dim=-1).float()
+    return (cos_table, sin_table)
+
+
+def apply_rotary_emb(x, freqs_cos_sin):
+    """Apply CogVideoX rotary embedding to query or key tensor.
+
+    x: [B, heads, seq_len, head_dim]
+    freqs_cos_sin: (cos, sin) each [seq_len, head_dim]. Both tables are
+        multiplied elementwise with x, so they must span the full head_dim
+        (every per-pair value repeated twice), which is the layout
+        _get_1d_rotary_pos_embed produces.
+
+    Uses interleaved pair rotation (same as diffusers CogVideoX/Flux).
+    head_dim is reshaped to (-1, 2) pairs, rotated, then flattened back.
+    Returns a tensor with the shape and dtype of x; x is upcast with
+    .float() for the multiply-add and the result is cast back.
+    """
+    cos, sin = freqs_cos_sin
+    # Add broadcast axes for batch and heads.
+    cos = cos[None, None, :, :].to(x.device)
+    sin = sin[None, None, :, :].to(x.device)
+
+    # Interleaved pairs: [B, H, S, D] -> [B, H, S, D//2, 2] -> (real, imag)
+    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
+    # 90-degree rotation of every pair: (real, imag) -> (-imag, real)
+    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+
+    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+
+
+def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
+    """Return sinusoidal embeddings of shape [len(timesteps), dim].
+
+    timesteps: 1D tensor of timestep values.
+    dim: output embedding width; an odd width gets one trailing zero column.
+    flip_sin_to_cos: if True the layout is [cos | sin], otherwise [sin | cos].
+    downscale_freq_shift: subtracted from the denominator of the frequency
+        exponent (as in the diffusers reference); 0 leaves the spacing at
+        ``i / (dim // 2)``.
+    scale: multiplier applied to the angles before sin/cos.
+    max_period: controls the lowest frequency of the embedding.
+    """
+    half = dim // 2
+    exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device)
+    # Honor downscale_freq_shift instead of silently dropping it; the default 0 divides by `half` as before.
+    freqs = torch.exp(exponent / (half - downscale_freq_shift))
+    args = timesteps[:, None].float() * freqs[None] * scale
+    embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
+    if flip_sin_to_cos:
+        embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
+    if dim % 2:
+        # Pad odd widths with a zero column so the output is exactly `dim` wide.
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+
+
+def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
+    """Return a [T, H, W, 3 * (embed_dim // 3)] sin-cos position table.
+
+    The first ``embed_dim // 3`` channels encode the temporal index and the
+    remaining ``2 * (embed_dim // 3)`` channels encode the (h, w) position.
+    ``spatial_size`` is ``(width, height)`` or a single int for a square grid;
+    grid coordinates are divided by the matching interpolation scale.
+    """
+    if isinstance(spatial_size, int):
+        spatial_size = (spatial_size, spatial_size)
+    n_w, n_h = spatial_size[0], spatial_size[1]
+
+    def _axis(length, interpolation_scale):
+        # Scaled float32 coordinates 0..length-1 for one axis.
+        return torch.arange(length, dtype=torch.float32, device=device) / interpolation_scale
+
+    mesh_t, mesh_h, mesh_w = torch.meshgrid(
+        _axis(temporal_size, temporal_interpolation_scale),
+        _axis(n_h, spatial_interpolation_scale),
+        _axis(n_w, spatial_interpolation_scale),
+        indexing="ij",
+    )
+
+    temporal_dim = embed_dim // 3
+    spatial_dim = 2 * temporal_dim
+
+    spatial_part = _get_2d_sincos_pos_embed(spatial_dim, mesh_h, mesh_w, device=device)
+    # The temporal table only depends on the frame index, so embed one column and broadcast it.
+    temporal_part = _get_1d_sincos_pos_embed(temporal_dim, mesh_t[:, 0, 0], device=device)
+
+    _, rows, cols = mesh_t.shape
+    temporal_part = temporal_part[:, None, None, :].expand(-1, rows, cols, -1)
+    return torch.cat([temporal_part, spatial_part], dim=-1)
+
+
+def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
+    """Embed 3D-shaped (T, H, W) height and width grids and concatenate them.
+
+    Each axis gets ``embed_dim // 2`` channels; the height half comes first.
+    Returns a [T, H, W, 2 * (embed_dim // 2)] tensor.
+    """
+    frames, rows, cols = grid_h.shape
+    per_axis_dim = embed_dim // 2
+    halves = [
+        _get_1d_sincos_pos_embed(per_axis_dim, axis_grid.reshape(-1), device=device).reshape(
+            frames, rows, cols, per_axis_dim
+        )
+        for axis_grid in (grid_h, grid_w)
+    ]
+    return torch.cat(halves, dim=-1)
+
+
+def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
+    """Return a [pos.numel(), embed_dim] table laid out as [cos | sin].
+
+    ``pos`` is flattened first. An odd ``embed_dim`` is padded with one
+    trailing zero column.
+    """
+    n_freq = embed_dim // 2
+    steps = torch.arange(start=0, end=n_freq, dtype=torch.float32, device=device)
+    freqs = torch.exp(-math.log(10000.0) * steps / n_freq)
+    flat_pos = pos.float().reshape(-1)
+    angles = flat_pos[:, None] * freqs[None]
+    table = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
+    if embed_dim % 2:
+        zero_col = torch.zeros_like(table[:, :1])
+        table = torch.cat([table, zero_col], dim=-1)
+    return table
+
+
+
+class CogVideoXPatchEmbed(nn.Module):
+    """Project text embeddings and video latents to `dim` and join them into one token sequence.
+
+    Text tokens come first, followed by the image patch tokens. When positional
+    embeddings are enabled, a 3D sin-cos table is added to the image tokens only
+    (the text positions receive zeros).
+    """
+    def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
+                 text_dim=4096, bias=True, sample_width=90, sample_height=60,
+                 sample_frames=49, temporal_compression_ratio=4,
+                 max_text_seq_length=226, spatial_interpolation_scale=1.875,
+                 temporal_interpolation_scale=1.0, use_positional_embeddings=True,
+                 use_learned_positional_embeddings=True,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t
+        self.dim = dim
+        self.sample_height = sample_height
+        self.sample_width = sample_width
+        self.sample_frames = sample_frames
+        self.temporal_compression_ratio = temporal_compression_ratio
+        self.max_text_seq_length = max_text_seq_length
+        self.spatial_interpolation_scale = spatial_interpolation_scale
+        self.temporal_interpolation_scale = temporal_interpolation_scale
+        self.use_positional_embeddings = use_positional_embeddings
+        self.use_learned_positional_embeddings = use_learned_positional_embeddings
+
+        # Without a temporal patch size each frame is patchified by a strided Conv2d;
+        # otherwise a Linear layer projects patches that forward() flattens itself.
+        if patch_size_t is None:
+            self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, device=device, dtype=dtype)
+        else:
+            self.proj = operations.Linear(in_channels * patch_size * patch_size * patch_size_t, dim, device=device, dtype=dtype)
+
+        self.text_proj = operations.Linear(text_dim, dim, device=device, dtype=dtype)
+
+        if use_positional_embeddings or use_learned_positional_embeddings:
+            # The buffer is only persisted in the state dict when the embeddings are flagged as learned.
+            persistent = use_learned_positional_embeddings
+            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
+            self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
+
+    def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
+        """Build a [1, max_text_seq_length + num_patches, dim] table.
+
+        The text slots are zeros; the patch slots hold the 3D sin-cos embedding
+        for the post-patch (and post temporal compression) grid.
+        """
+        post_patch_height = sample_height // self.patch_size
+        post_patch_width = sample_width // self.patch_size
+        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
+        if self.patch_size_t is not None:
+            post_time_compression_frames = post_time_compression_frames // self.patch_size_t
+        num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+
+        # get_3d_sincos_pos_embed takes the spatial size as (width, height).
+        pos_embedding = get_3d_sincos_pos_embed(
+            self.dim,
+            (post_patch_width, post_patch_height),
+            post_time_compression_frames,
+            self.spatial_interpolation_scale,
+            self.temporal_interpolation_scale,
+            device=device,
+        )
+        pos_embedding = pos_embedding.reshape(-1, self.dim)
+        joint_pos_embedding = pos_embedding.new_zeros(
+            1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
+        )
+        joint_pos_embedding.data[:, self.max_text_seq_length:].copy_(pos_embedding)
+        return joint_pos_embedding
+
+    def forward(self, text_embeds, image_embeds):
+        """Return the joint [B, text_len + num_patches, dim] token sequence.
+
+        text_embeds: text features whose dim 1 is the text length (last dim is projected by text_proj).
+        image_embeds: [B, T, C, H, W] latents (unpacked in that order below).
+        """
+        input_dtype = text_embeds.dtype
+        text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(input_dtype)
+        batch_size, num_frames, channels, height, width = image_embeds.shape
+
+        proj_dtype = self.proj.weight.dtype
+        if self.patch_size_t is None:
+            # Per-frame Conv2d patchify, then flatten (frame, patch) into a single token axis.
+            image_embeds = image_embeds.reshape(-1, channels, height, width)
+            image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
+            image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
+            image_embeds = image_embeds.flatten(3).transpose(2, 3)
+            image_embeds = image_embeds.flatten(1, 2)
+        else:
+            # Cut (p_t, p, p) patches by hand, flatten each to a vector, and project with the Linear layer.
+            p = self.patch_size
+            p_t = self.patch_size_t
+            image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
+            image_embeds = image_embeds.reshape(
+                batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
+            )
+            image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
+            image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
+
+        embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()
+
+        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+            text_seq_length = text_embeds.shape[1]
+            num_image_patches = image_embeds.shape[1]
+
+            if self.use_learned_positional_embeddings:
+                # Slice the stored buffer; it was built for the configured sample size.
+                image_pos = self.pos_embedding[
+                    :, self.max_text_seq_length:self.max_text_seq_length + num_image_patches
+                ].to(device=embeds.device, dtype=embeds.dtype)
+            else:
+                # Recompute the sin-cos table for the actual input resolution.
+                image_pos = get_3d_sincos_pos_embed(
+                    self.dim,
+                    (width // self.patch_size, height // self.patch_size),
+                    num_image_patches // ((height // self.patch_size) * (width // self.patch_size)),
+                    self.spatial_interpolation_scale,
+                    self.temporal_interpolation_scale,
+                    device=embeds.device,
+                ).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)
+
+            # Build joint: zeros for text + sincos for image
+            joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
+            joint_pos[:, text_seq_length:] = image_pos
+            embeds = embeds + joint_pos
+
+        return embeds
+
+
+class CogVideoXLayerNormZero(nn.Module):
+    """Shared LayerNorm with timestep-conditioned modulation for both streams.
+
+    The conditioning vector is projected to six chunks: shift/scale/gate for
+    the image tokens and shift/scale/gate for the text tokens. forward()
+    returns both modulated streams plus the two gates, each gate shaped for
+    broadcasting over the sequence axis.
+    """
+    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.silu = nn.SiLU()
+        self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, **factory)
+        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, **factory)
+
+    def forward(self, hidden_states, encoder_hidden_states, temb):
+        modulation = self.linear(self.silu(temb))
+        # Insert the sequence axis once so every chunk broadcasts over tokens.
+        shift, scale, gate, enc_shift, enc_scale, enc_gate = (
+            chunk.unsqueeze(1) for chunk in modulation.chunk(6, dim=1)
+        )
+        hidden_states = self.norm(hidden_states) * (1 + scale) + shift
+        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale) + enc_shift
+        return hidden_states, encoder_hidden_states, gate, enc_gate
+
+
+class CogVideoXAdaLayerNorm(nn.Module):
+    """LayerNorm followed by a timestep-conditioned scale and shift.
+
+    The conditioning vector goes through SiLU and a Linear layer that yields
+    ``shift`` and ``scale`` (in that order), both broadcast over the sequence axis.
+    """
+    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.silu = nn.SiLU()
+        self.linear = operations.Linear(time_dim, 2 * dim, **factory)
+        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, **factory)
+
+    def forward(self, x, temb):
+        modulation = self.linear(self.silu(temb))
+        shift, scale = (part.unsqueeze(1) for part in modulation.chunk(2, dim=1))
+        return self.norm(x) * (1 + scale) + shift
+
+
+class CogVideoXBlock(nn.Module):
+    """One CogVideoX transformer block.
+
+    Runs joint self-attention over the concatenated text + image tokens, then a
+    GELU feed-forward, each preceded by timestep-conditioned normalization
+    (CogVideoXLayerNormZero) and added back through a learned gate.
+    """
+    def __init__(self, dim, num_heads, head_dim, time_dim,
+                 eps=1e-5, ff_inner_dim=None, ff_bias=True,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+
+        self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
+
+        # Self-attention (joint text + latent)
+        self.q = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+        self.k = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+        self.v = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+        # Query/key normalization is applied per head (over head_dim).
+        self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
+        self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
+        self.attn_out = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+
+        self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
+
+        # Feed-forward (GELU approximate)
+        inner_dim = ff_inner_dim or dim * 4
+        self.ff_proj = operations.Linear(dim, inner_dim, bias=ff_bias, device=device, dtype=dtype)
+        self.ff_out = operations.Linear(inner_dim, dim, bias=ff_bias, device=device, dtype=dtype)
+
+    def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
+        """Return the updated (hidden_states, encoder_hidden_states) pair.
+
+        hidden_states: image tokens, [B, img_seq, dim].
+        encoder_hidden_states: text tokens, [B, text_seq, dim].
+        temb: conditioning vector consumed by the two modulated norms.
+        image_rotary_emb: optional (cos, sin) tables applied to the image part of q and k only.
+        transformer_options: forwarded to optimized_attention; defaults to an empty dict.
+        """
+        if transformer_options is None:
+            transformer_options = {}
+        text_seq_length = encoder_hidden_states.size(1)
+
+        # Norm & modulate
+        norm_hidden, norm_encoder, gate_msa, enc_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)
+
+        # Joint self-attention
+        qkv_input = torch.cat([norm_encoder, norm_hidden], dim=1)
+        b, s, _ = qkv_input.shape
+        n, d = self.num_heads, self.head_dim
+
+        q = self.q(qkv_input).view(b, s, n, d)
+        k = self.k(qkv_input).view(b, s, n, d)
+        v = self.v(qkv_input)
+
+        q = self.norm_q(q).view(b, s, n, d)
+        k = self.norm_k(k).view(b, s, n, d)
+
+        # Apply rotary embeddings to image tokens only (diffusers format: [B, heads, seq, head_dim])
+        if image_rotary_emb is not None:
+            q_img = q[:, text_seq_length:].transpose(1, 2)  # [B, heads, img_seq, head_dim]
+            k_img = k[:, text_seq_length:].transpose(1, 2)
+            q_img = apply_rotary_emb(q_img, image_rotary_emb)
+            k_img = apply_rotary_emb(k_img, image_rotary_emb)
+            q = torch.cat([q[:, :text_seq_length], q_img.transpose(1, 2)], dim=1)
+            k = torch.cat([k[:, :text_seq_length], k_img.transpose(1, 2)], dim=1)
+
+        # q and k are merged back to [B, seq, heads * head_dim] before the attention call.
+        attn_out = optimized_attention(
+            q.reshape(b, s, n * d),
+            k.reshape(b, s, n * d),
+            v,
+            heads=self.num_heads,
+            transformer_options=transformer_options,
+        )
+
+        attn_out = self.attn_out(attn_out)
+
+        # Text tokens were concatenated first, so they come first in the split.
+        attn_encoder, attn_hidden = attn_out.split([text_seq_length, s - text_seq_length], dim=1)
+
+        hidden_states = hidden_states + gate_msa * attn_hidden
+        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder
+
+        # Norm & modulate for FF
+        norm_hidden, norm_encoder, gate_ff, enc_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)
+
+        # Feed-forward (GELU on concatenated text + latent)
+        ff_input = torch.cat([norm_encoder, norm_hidden], dim=1)
+        ff_output = self.ff_out(F.gelu(self.ff_proj(ff_input), approximate="tanh"))
+
+        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
+        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
+
+        return hidden_states, encoder_hidden_states
+
+
+class CogVideoXTransformer3DModel(nn.Module):
+    """CogVideoX 3D transformer.
+
+    Embeds the text context and patchified video latents into one sequence,
+    runs it through a stack of CogVideoXBlock layers conditioned on a timestep
+    (and optional OFS) embedding, then projects the image tokens back to
+    latent patches and unpatchifies them.
+    """
+    def __init__(self,
+                 num_attention_heads=30,
+                 attention_head_dim=64,
+                 in_channels=16,
+                 out_channels=16,
+                 flip_sin_to_cos=True,
+                 freq_shift=0,
+                 time_embed_dim=512,
+                 ofs_embed_dim=None,
+                 text_embed_dim=4096,
+                 num_layers=30,
+                 dropout=0.0,
+                 attention_bias=True,
+                 sample_width=90,
+                 sample_height=60,
+                 sample_frames=49,
+                 patch_size=2,
+                 patch_size_t=None,
+                 temporal_compression_ratio=4,
+                 max_text_seq_length=226,
+                 spatial_interpolation_scale=1.875,
+                 temporal_interpolation_scale=1.0,
+                 use_rotary_positional_embeddings=False,
+                 use_learned_positional_embeddings=False,
+                 patch_bias=True,
+                 image_model=None,
+                 device=None,
+                 dtype=None,
+                 operations=None,
+                 ):
+        # NOTE(review): dropout, attention_bias and image_model are accepted but not referenced in this class body.
+        super().__init__()
+        self.dtype = dtype
+        # Model width is heads * head_dim.
+        dim = num_attention_heads * attention_head_dim
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t
+        self.max_text_seq_length = max_text_seq_length
+        self.use_rotary_positional_embeddings = use_rotary_positional_embeddings
+
+        # 1. Patch embedding
+        # NOTE: the patch embedding is always created in float32, independent of `dtype`;
+        # additive positional embeddings are disabled when rotary embeddings are used.
+        self.patch_embed = CogVideoXPatchEmbed(
+            patch_size=patch_size,
+            patch_size_t=patch_size_t,
+            in_channels=in_channels,
+            dim=dim,
+            text_dim=text_embed_dim,
+            bias=patch_bias,
+            sample_width=sample_width,
+            sample_height=sample_height,
+            sample_frames=sample_frames,
+            temporal_compression_ratio=temporal_compression_ratio,
+            max_text_seq_length=max_text_seq_length,
+            spatial_interpolation_scale=spatial_interpolation_scale,
+            temporal_interpolation_scale=temporal_interpolation_scale,
+            use_positional_embeddings=not use_rotary_positional_embeddings,
+            use_learned_positional_embeddings=use_learned_positional_embeddings,
+            device=device, dtype=torch.float32, operations=operations,
+        )
+
+        # 2. Time embedding
+        self.time_proj_dim = dim
+        self.time_proj_flip = flip_sin_to_cos
+        self.time_proj_shift = freq_shift
+        self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
+        self.time_embedding_act = nn.SiLU()
+        self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)
+
+        # Optional OFS embedding (CogVideoX 1.5 I2V)
+        self.ofs_proj_dim = ofs_embed_dim
+        if ofs_embed_dim:
+            self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
+            self.ofs_embedding_act = nn.SiLU()
+            self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
+        else:
+            # _forward() checks this attribute for None to skip the OFS path.
+            self.ofs_embedding_linear_1 = None
+
+        # 3. Transformer blocks
+        self.blocks = nn.ModuleList([
+            CogVideoXBlock(
+                dim=dim,
+                num_heads=num_attention_heads,
+                head_dim=attention_head_dim,
+                time_dim=time_embed_dim,
+                eps=1e-5,
+                device=device, dtype=dtype, operations=operations,
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)
+
+        # 4. Output
+        self.norm_out = CogVideoXAdaLayerNorm(
+            time_dim=time_embed_dim, dim=dim, eps=1e-5,
+            device=device, dtype=dtype, operations=operations,
+        )
+
+        # Each output token carries one full (optionally temporal) patch of out_channels values.
+        if patch_size_t is None:
+            output_dim = patch_size * patch_size * out_channels
+        else:
+            output_dim = patch_size * patch_size * patch_size_t * out_channels
+
+        self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)
+
+        self.spatial_interpolation_scale = spatial_interpolation_scale
+        self.temporal_interpolation_scale = temporal_interpolation_scale
+        self.temporal_compression_ratio = temporal_compression_ratio
+
+    def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
+        """Run _forward through any DIFFUSION_MODEL wrappers found in transformer_options."""
+        if transformer_options is None:
+            transformer_options = {}
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward,
+            self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
+        ).execute(x, timestep, context, ofs, transformer_options, **kwargs)
+
+    def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
+        """Denoiser forward pass.
+
+        x: latents in [B, C, T, H, W] layout.
+        timestep: 1D tensor of timesteps (indexed as timestep[:, None] by the embedding helper).
+        context: text embeddings; dim 1 is taken as the text sequence length.
+        ofs: optional values for the OFS embedding, used only when the OFS layers exist.
+        Returns a tensor in [B, C, T, H, W] layout cropped to the input T, H, W.
+        """
+        if transformer_options is None:
+            transformer_options = {}
+        # ComfyUI passes [B, C, T, H, W]
+        batch_size, channels, t, h, w = x.shape
+
+        # Pad to patch size (temporal + spatial), same pattern as WAN
+        p_t = self.patch_size_t if self.patch_size_t is not None else 1
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (p_t, self.patch_size, self.patch_size))
+
+        # CogVideoX expects [B, T, C, H, W]
+        x = x.permute(0, 2, 1, 3, 4)
+        # Re-read the sizes: they now describe the padded tensor.
+        batch_size, num_frames, channels, height, width = x.shape
+
+        # Time embedding
+        t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
+        t_emb = t_emb.to(dtype=x.dtype)
+        emb = self.time_embedding_linear_2(self.time_embedding_act(self.time_embedding_linear_1(t_emb)))
+
+        if self.ofs_embedding_linear_1 is not None and ofs is not None:
+            # The OFS embedding reuses the timestep flip/shift settings and is added to the time embedding.
+            ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
+            ofs_emb = ofs_emb.to(dtype=x.dtype)
+            ofs_emb = self.ofs_embedding_linear_2(self.ofs_embedding_act(self.ofs_embedding_linear_1(ofs_emb)))
+            emb = emb + ofs_emb
+
+        # Patch embedding
+        hidden_states = self.patch_embed(context, x)
+
+        # Split the joint sequence back into its text and image parts.
+        text_seq_length = context.shape[1]
+        encoder_hidden_states = hidden_states[:, :text_seq_length]
+        hidden_states = hidden_states[:, text_seq_length:]
+
+        # Rotary embeddings (if used)
+        image_rotary_emb = None
+        if self.use_rotary_positional_embeddings:
+            post_patch_height = height // self.patch_size
+            post_patch_width = width // self.patch_size
+            if self.patch_size_t is None:
+                post_time = num_frames
+            else:
+                post_time = num_frames // self.patch_size_t
+            image_rotary_emb = self._get_rotary_emb(post_patch_height, post_patch_width, post_time, device=x.device)
+
+        # Transformer blocks
+        for i, block in enumerate(self.blocks):
+            hidden_states, encoder_hidden_states = block(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                temb=emb,
+                image_rotary_emb=image_rotary_emb,
+                transformer_options=transformer_options,
+            )
+
+        hidden_states = self.norm_final(hidden_states)
+
+        # Output projection
+        hidden_states = self.norm_out(hidden_states, temb=emb)
+        hidden_states = self.proj_out(hidden_states)
+
+        # Unpatchify
+        p = self.patch_size
+        p_t = self.patch_size_t
+
+        if p_t is None:
+            # Tokens -> [B, T, H/p, W/p, C, p, p] -> [B, T, C, H, W]
+            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
+            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
+        else:
+            # Tokens -> [B, T/p_t, H/p, W/p, C, p_t, p, p] -> [B, T, C, H, W]
+            output = hidden_states.reshape(
+                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
+            )
+            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
+
+        # Back to ComfyUI format [B, C, T, H, W] and crop padding
+        output = output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
+        return output
+
+    def _get_rotary_emb(self, h, w, t, device):
+        """Compute CogVideoX 3D rotary positional embeddings.
+
+        For CogVideoX 1.5 (patch_size_t != None): uses "slice" mode — grid positions
+        are integer arange computed at max_size, then sliced to actual size.
+        For CogVideoX 1.0 (patch_size_t == None): uses "linspace" mode with crop coords
+        scaled by spatial_interpolation_scale.
+
+        h, w, t are the post-patch grid sizes. Returns (cos, sin), each of shape
+        [t * h * w, dim_t + dim_h + dim_w].
+        """
+        # Split the head dimension between time (1/4) and the two spatial axes (3/8 each).
+        d = self.attention_head_dim
+        dim_t = d // 4
+        dim_h = d // 8 * 3
+        dim_w = d // 8 * 3
+
+        if self.patch_size_t is not None:
+            # CogVideoX 1.5: "slice" mode — positions are simple integer indices
+            # Compute at max(sample_size, actual_size) then slice to actual
+            base_h = self.patch_embed.sample_height // self.patch_size
+            base_w = self.patch_embed.sample_width // self.patch_size
+            max_h = max(base_h, h)
+            max_w = max(base_w, w)
+
+            grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
+            grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
+            grid_t = torch.arange(t, device=device, dtype=torch.float32)
+        else:
+            # CogVideoX 1.0: "linspace" mode with interpolation scale
+            grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
+            grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
+            grid_t = torch.arange(t, device=device, dtype=torch.float32)
+
+        freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
+        freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
+        freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)
+
+        t_cos, t_sin = freqs_t
+        h_cos, h_sin = freqs_h
+        w_cos, w_sin = freqs_w
+
+        # Slice to actual size (for "slice" mode where grids may be larger)
+        t_cos, t_sin = t_cos[:t], t_sin[:t]
+        h_cos, h_sin = h_cos[:h], h_sin[:h]
+        w_cos, w_sin = w_cos[:w], w_sin[:w]
+
+        # Broadcast and concatenate into [T*H*W, head_dim]
+        t_cos = t_cos[:, None, None, :].expand(-1, h, w, -1)
+        t_sin = t_sin[:, None, None, :].expand(-1, h, w, -1)
+        h_cos = h_cos[None, :, None, :].expand(t, -1, w, -1)
+        h_sin = h_sin[None, :, None, :].expand(t, -1, w, -1)
+        w_cos = w_cos[None, None, :, :].expand(t, h, -1, -1)
+        w_sin = w_sin[None, None, :, :].expand(t, h, -1, -1)
+
+        cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(t * h * w, -1)
+        sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(t * h * w, -1)
+        return (cos, sin)
--- a/comfy/ldm/cogvideo/vae.py
+++ b/comfy/ldm/cogvideo/vae.py
@ -0,0 +1,566 @@
+# CogVideoX VAE - ported to ComfyUI native ops
+# Architecture reference: diffusers AutoencoderKLCogVideoX
+# Style reference: comfy/ldm/wan/vae.py
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+ops = comfy.ops.disable_weight_init
+
+
+class CausalConv3d(nn.Module):
+    """3D convolution that is causal along the temporal axis (dim 2).
+
+    Temporal context is supplied on the leading side only:
+
+    * ``pad_mode == "replicate"``: the input is padded (time and space) with
+      ``F.pad`` in replicate mode, the wrapped conv applies no padding of its
+      own, and no cache is produced.
+    * any other ``pad_mode``: the wrapped conv pads height/width itself and
+      the leading temporal context is ``conv_cache`` (the trailing frames of
+      the previous chunk) or, without a cache, copies of the first input
+      frame.
+
+    Fast path: a single input frame with no cache would be padded with copies
+    of itself, so the kernel's temporal taps are summed and applied as a
+    kernel of temporal size 1 instead of materialising the padded tensor.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size,) * 3
+
+        time_kernel, height_kernel, width_kernel = kernel_size
+        self.time_kernel_size = time_kernel
+        self.pad_mode = pad_mode
+
+        height_pad = (height_kernel - 1) // 2
+        width_pad = (width_kernel - 1) // 2
+        # F.pad order for a 5D tensor: (w_left, w_right, h_top, h_bottom, t_front, t_back).
+        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)
+
+        stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
+        dilation = (dilation, 1, 1)
+        # In replicate mode forward() already pads height/width through
+        # F.pad, so the conv must not pad them a second time.
+        conv_padding = (0, 0, 0) if pad_mode == "replicate" else (0, height_pad, width_pad)
+        self.conv = ops.Conv3d(
+            in_channels, out_channels, kernel_size,
+            stride=stride, dilation=dilation,
+            padding=conv_padding,
+        )
+
+    def forward(self, x, conv_cache=None):
+        """Convolve ``x`` and return ``(out, conv_cache)``.
+
+        Args:
+            x: input tensor; dim 2 is treated as time.
+            conv_cache: trailing frames kept from the previous chunk, or
+                ``None`` for the first chunk.
+
+        Returns:
+            The conv output and the frames to pass as ``conv_cache`` with the
+            next chunk (``None`` in replicate mode or when the temporal
+            kernel size is 1).
+        """
+        if self.pad_mode == "replicate":
+            x = F.pad(x, self.time_causal_padding, mode="replicate")
+            conv_cache = None
+        else:
+            kernel_t = self.time_kernel_size
+            if kernel_t > 1:
+                if conv_cache is None and x.shape[2] == 1:
+                    # Fast path: single frame, no cache. All temporal padding
+                    # frames are copies of the input (replicate-style), so the
+                    # 3D conv reduces to a conv with the summed temporal kernel.
+                    w = comfy.ops.cast_to_input(self.conv.weight, x)
+                    b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
+                    w2d = w.sum(dim=2, keepdim=True)
+                    out = F.conv3d(x, w2d, b,
+                                   self.conv.stride, self.conv.padding,
+                                   self.conv.dilation, self.conv.groups)
+                    # The general path would cache the trailing kernel_t - 1
+                    # frames of the padded input, which here are all copies of
+                    # x. Returning them (as a view, no extra allocation) keeps
+                    # the causal context for the next chunk; returning None
+                    # would make the next chunk pad with its own first frame.
+                    # NOTE: the view keeps x alive for as long as the cache is held.
+                    return out, x.expand(-1, -1, kernel_t - 1, -1, -1)
+                cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
+                x = torch.cat(cached + [x], dim=2)
+            # clone() so the cache does not keep the whole concatenated tensor alive.
+            conv_cache = x[:, :, -self.time_kernel_size + 1:].clone() if self.time_kernel_size > 1 else None
+
+        out = self.conv(x)
+        return out, conv_cache
+
+
+def _interpolate_zq(zq, target_size):
+    """Resize ``zq`` to ``target_size`` given as (T, H, W).
+
+    When T is odd and greater than 1 the first frame is resized on its own to
+    a single frame and the remaining frames are resized to T - 1 frames; the
+    two parts are then joined along dim 2. Otherwise the whole tensor is
+    resized in one call.
+    """
+    frames, height, width = target_size[0], target_size[1], target_size[2]
+    if frames <= 1 or frames % 2 == 0:
+        return F.interpolate(zq, size=target_size)
+
+    head = F.interpolate(zq[:, :, :1], size=(1, height, width))
+    tail = F.interpolate(zq[:, :, 1:], size=(frames - 1, height, width))
+    return torch.cat([head, tail], dim=2)
+
+
+class SpatialNorm3D(nn.Module):
+    """Group norm whose scale and shift are predicted from a conditioning tensor ``zq``."""
+    def __init__(self, f_channels, zq_channels, groups=32):
+        super().__init__()
+        self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
+        # 1x1x1 projections of zq producing the multiplicative and additive terms.
+        self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+        self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+
+    def forward(self, f, zq, conv_cache=None):
+        """Return ``(norm(f) * conv_y(zq) + conv_b(zq), new_cache)``.
+
+        ``zq`` is first resized to the last three dims of ``f`` when they differ.
+        """
+        previous = conv_cache if conv_cache else {}
+
+        target = f.shape[-3:]
+        if zq.shape[-3:] != target:
+            zq = _interpolate_zq(zq, target)
+
+        scale, scale_cache = self.conv_y(zq, conv_cache=previous.get("conv_y"))
+        shift, shift_cache = self.conv_b(zq, conv_cache=previous.get("conv_b"))
+
+        normed = self.norm_layer(f)
+        return normed * scale + shift, {"conv_y": scale_cache, "conv_b": shift_cache}
+
+
+class ResnetBlock3D(nn.Module):
+    """3D ResNet block: (norm, SiLU, causal conv) twice plus a residual connection.
+
+    With ``spatial_norm_dim=None`` the norms are plain group norms; otherwise
+    they are ``SpatialNorm3D`` layers conditioned on the ``zq`` tensor passed
+    to ``forward``.
+
+    ``act_fn`` is accepted for interface compatibility only: the activation is
+    always SiLU, whatever value is given.
+    """
+    def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
+                 eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
+        super().__init__()
+        out_channels = out_channels or in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.spatial_norm_dim = spatial_norm_dim
+
+        # Every act_fn value maps to SiLU ("swish" is the same function).
+        self.nonlinearity = nn.SiLU()
+
+        if spatial_norm_dim is None:
+            self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
+            self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
+        else:
+            self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
+            self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)
+
+        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
+
+        # The time-embedding projection only exists when temb_channels > 0;
+        # forward() checks for the attribute before using it.
+        if temb_channels > 0:
+            self.temb_proj = ops.Linear(temb_channels, out_channels)
+
+        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
+
+        # 1x1x1 projection of the residual when the channel count changes.
+        if in_channels != out_channels:
+            self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        else:
+            self.conv_shortcut = None
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Run the block and return ``(output, new_cache)``.
+
+        Args:
+            x: input features.
+            temb: optional time embedding, added after the first conv when the
+                block has a ``temb_proj`` layer.
+            zq: conditioning tensor; when given, the norms are called as
+                spatial norms (``norm(x, zq, ...)``), otherwise as ``norm(x)``.
+            conv_cache: dict of per-layer caches from the previous chunk.
+
+        Returns:
+            ``x + residual`` and a dict with the caches of the layers that ran.
+        """
+        new_cache = {}
+        conv_cache = conv_cache or {}
+        residual = x
+
+        if zq is not None:
+            x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
+        else:
+            x = self.norm1(x)
+
+        x = self.nonlinearity(x)
+        x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))
+
+        if temb is not None and hasattr(self, "temb_proj"):
+            # Broadcast the projected embedding over the three trailing dims.
+            x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
+
+        if zq is not None:
+            x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
+        else:
+            x = self.norm2(x)
+
+        x = self.nonlinearity(x)
+        x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))
+
+        if self.conv_shortcut is not None:
+            residual = self.conv_shortcut(residual)
+
+        return x + residual, new_cache
+
+
+class Downsample3D(nn.Module):
+    """Per-frame 2D strided conv downsampling, optionally preceded by temporal average pooling."""
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
+        super().__init__()
+        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x):
+        """Downsample a [B, C, T, H, W] tensor.
+
+        With ``compress_time`` the time axis is average-pooled in pairs; for an
+        odd T the first frame is kept as is and only the rest is pooled. Each
+        frame is then zero-padded by one pixel on the right and bottom and run
+        through the 2D conv.
+        """
+        if self.compress_time:
+            batch, channels, frames, height, width = x.shape
+            # Fold the spatial positions into the batch so time is the pooled axis.
+            seq = x.permute(0, 3, 4, 1, 2).reshape(batch * height * width, channels, frames)
+            if frames % 2 == 1:
+                head = seq[..., 0]
+                tail = seq[..., 1:]
+                if tail.shape[-1] > 0:
+                    tail = F.avg_pool1d(tail, kernel_size=2, stride=2)
+                seq = torch.cat([head[..., None], tail], dim=-1)
+            else:
+                seq = F.avg_pool1d(seq, kernel_size=2, stride=2)
+            x = seq.reshape(batch, height, width, channels, seq.shape[-1]).permute(0, 3, 4, 1, 2)
+
+        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        batch, channels, frames, height, width = x.shape
+        # Fold time into the batch so the 2D conv runs on every frame.
+        flat = x.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)
+        flat = self.conv(flat)
+        return flat.reshape(batch, frames, flat.shape[1], flat.shape[2], flat.shape[3]).permute(0, 2, 1, 3, 4)
+
+
+class Upsample3D(nn.Module):
+    """2x upsampling followed by a per-frame 2D conv, optionally also doubling time."""
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
+        super().__init__()
+        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x):
+        """Upsample a [B, C, T, H, W] tensor.
+
+        Without ``compress_time`` only height and width are doubled. With it:
+        an even T > 1 is interpolated jointly over (T, H, W); an odd T > 1
+        keeps the first frame as a single frame (spatially doubled) and
+        interpolates the rest jointly; T <= 1 is doubled spatially only.
+        """
+        frames = x.shape[2]
+        if not self.compress_time:
+            batch, channels, _, height, width = x.shape
+            flat = x.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)
+            flat = F.interpolate(flat, scale_factor=2.0)
+            x = flat.reshape(batch, frames, channels, *flat.shape[2:]).permute(0, 2, 1, 3, 4)
+        elif frames <= 1:
+            x = F.interpolate(x.squeeze(2), scale_factor=2.0)[:, :, None, :, :]
+        elif frames % 2 == 1:
+            head = F.interpolate(x[:, :, 0], scale_factor=2.0)
+            tail = F.interpolate(x[:, :, 1:], scale_factor=2.0)
+            x = torch.cat([head[:, :, None, :, :], tail], dim=2)
+        else:
+            x = F.interpolate(x, scale_factor=2.0)
+
+        batch, channels, frames, height, width = x.shape
+        # Fold time into the batch so the 2D conv runs on every frame.
+        flat = x.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)
+        flat = self.conv(flat)
+        return flat.reshape(batch, frames, *flat.shape[1:]).permute(0, 2, 1, 3, 4)
+
+
+class DownBlock3D(nn.Module):
+    """A stack of ``ResnetBlock3D`` layers followed by an optional ``Downsample3D``."""
+    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
+                 eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
+                 compress_time=False, pad_mode="first"):
+        super().__init__()
+        self.resnets = nn.ModuleList()
+        for layer_idx in range(num_layers):
+            # Only the first layer sees the block's input width.
+            self.resnets.append(ResnetBlock3D(
+                in_channels=out_channels if layer_idx else in_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
+            ))
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)])
+        else:
+            self.downsamplers = None
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Run the resnets (threading their caches) and then the downsamplers, if any."""
+        previous = conv_cache or {}
+        new_cache = {}
+        for layer_idx, resnet in enumerate(self.resnets):
+            name = f"resnet_{layer_idx}"
+            x, new_cache[name] = resnet(x, temb, zq, conv_cache=previous.get(name))
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                x = downsampler(x)
+        return x, new_cache
+
+
+class MidBlock3D(nn.Module):
+    """A stack of channel-preserving ``ResnetBlock3D`` layers."""
+    def __init__(self, in_channels, temb_channels=0, num_layers=1,
+                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
+        super().__init__()
+        self.resnets = nn.ModuleList()
+        for _ in range(num_layers):
+            self.resnets.append(ResnetBlock3D(
+                in_channels=in_channels, out_channels=in_channels,
+                temb_channels=temb_channels, groups=groups, eps=eps,
+                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
+            ))
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Run every resnet in order and return ``(x, new_cache)``."""
+        previous = conv_cache or {}
+        new_cache = {}
+        for layer_idx, resnet in enumerate(self.resnets):
+            name = f"resnet_{layer_idx}"
+            x, new_cache[name] = resnet(x, temb, zq, conv_cache=previous.get(name))
+        return x, new_cache
+
+
+class UpBlock3D(nn.Module):
+    """A stack of spatially-normed ``ResnetBlock3D`` layers followed by an optional ``Upsample3D``."""
+    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
+                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
+                 add_upsample=True, compress_time=False, pad_mode="first"):
+        super().__init__()
+        self.resnets = nn.ModuleList()
+        for layer_idx in range(num_layers):
+            # Only the first layer sees the block's input width.
+            self.resnets.append(ResnetBlock3D(
+                in_channels=out_channels if layer_idx else in_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels, groups=groups, eps=eps,
+                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
+            ))
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)])
+        else:
+            self.upsamplers = None
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Run the resnets (threading their caches) and then the upsamplers, if any."""
+        previous = conv_cache or {}
+        new_cache = {}
+        for layer_idx, resnet in enumerate(self.resnets):
+            name = f"resnet_{layer_idx}"
+            x, new_cache[name] = resnet(x, temb, zq, conv_cache=previous.get(name))
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                x = upsampler(x)
+        return x, new_cache
+
+
+class Encoder3D(nn.Module):
+    """Encoder: causal conv in, a chain of down blocks, a mid block, then norm/SiLU/causal conv out.
+
+    The output conv produces ``2 * out_channels`` channels.
+    """
+    def __init__(self, in_channels=3, out_channels=16,
+                 block_out_channels=(128, 256, 256, 512),
+                 layers_per_block=3, act_fn="silu",
+                 eps=1e-6, groups=32, pad_mode="first",
+                 temporal_compression_ratio=4):
+        super().__init__()
+        # The first `temporal_levels` down blocks also halve the time axis.
+        temporal_levels = int(np.log2(temporal_compression_ratio))
+
+        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
+
+        self.down_blocks = nn.ModuleList()
+        last_idx = len(block_out_channels) - 1
+        previous_width = block_out_channels[0]
+        for idx, width in enumerate(block_out_channels):
+            self.down_blocks.append(DownBlock3D(
+                in_channels=previous_width, out_channels=width,
+                temb_channels=0, num_layers=layers_per_block,
+                eps=eps, act_fn=act_fn, groups=groups,
+                add_downsample=idx != last_idx, compress_time=idx < temporal_levels,
+            ))
+            previous_width = width
+
+        self.mid_block = MidBlock3D(
+            in_channels=block_out_channels[-1], temb_channels=0,
+            num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
+        )
+
+        self.norm_out = ops.GroupNorm(groups, block_out_channels[-1], eps=1e-6)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CausalConv3d(block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode)
+
+    def forward(self, x, conv_cache=None):
+        """Encode ``x`` and return ``(features, new_cache)``.
+
+        ``conv_cache`` is the dict returned by the previous call when a video
+        is encoded in temporal chunks; ``None`` for the first chunk.
+        """
+        previous = conv_cache or {}
+        new_cache = {}
+
+        x, new_cache["conv_in"] = self.conv_in(x, conv_cache=previous.get("conv_in"))
+
+        for idx, block in enumerate(self.down_blocks):
+            name = f"down_block_{idx}"
+            x, new_cache[name] = block(x, None, None, previous.get(name))
+
+        x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=previous.get("mid_block"))
+
+        x = self.conv_act(self.norm_out(x))
+        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=previous.get("conv_out"))
+
+        return x, new_cache
+
+
+class Decoder3D(nn.Module):
+    """Decoder: causal conv in, a mid block, a chain of up blocks, then spatial norm/SiLU/causal conv out.
+
+    Every spatial norm is conditioned on the decoder's own input ``sample``.
+    """
+    def __init__(self, in_channels=16, out_channels=3,
+                 block_out_channels=(128, 256, 256, 512),
+                 layers_per_block=3, act_fn="silu",
+                 eps=1e-6, groups=32, pad_mode="first",
+                 temporal_compression_ratio=4):
+        super().__init__()
+        widths = list(reversed(block_out_channels))
+        # The first `temporal_levels` up blocks also upsample the time axis.
+        temporal_levels = int(np.log2(temporal_compression_ratio))
+
+        self.conv_in = CausalConv3d(in_channels, widths[0], kernel_size=3, pad_mode=pad_mode)
+
+        self.mid_block = MidBlock3D(
+            in_channels=widths[0], temb_channels=0,
+            num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
+            spatial_norm_dim=in_channels, pad_mode=pad_mode,
+        )
+
+        self.up_blocks = nn.ModuleList()
+        last_idx = len(block_out_channels) - 1
+        previous_width = widths[0]
+        for idx in range(len(block_out_channels)):
+            width = widths[idx]
+            self.up_blocks.append(UpBlock3D(
+                in_channels=previous_width, out_channels=width,
+                temb_channels=0, num_layers=layers_per_block + 1,
+                eps=eps, act_fn=act_fn, groups=groups,
+                spatial_norm_dim=in_channels,
+                add_upsample=idx != last_idx, compress_time=idx < temporal_levels,
+            ))
+            previous_width = width
+
+        self.norm_out = SpatialNorm3D(widths[-1], in_channels, groups=groups)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CausalConv3d(widths[-1], out_channels, kernel_size=3, pad_mode=pad_mode)
+
+    def forward(self, sample, conv_cache=None):
+        """Decode ``sample`` and return ``(output, new_cache)``.
+
+        ``conv_cache`` is the dict returned by the previous call when decoding
+        in temporal chunks; ``None`` for the first chunk.
+        """
+        previous = conv_cache or {}
+        new_cache = {}
+
+        x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=previous.get("conv_in"))
+        x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=previous.get("mid_block"))
+
+        for idx, block in enumerate(self.up_blocks):
+            name = f"up_block_{idx}"
+            x, new_cache[name] = block(x, None, sample, conv_cache=previous.get(name))
+
+        x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=previous.get("norm_out"))
+        x = self.conv_act(x)
+        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=previous.get("conv_out"))
+
+        return x, new_cache
+
+
+
+class AutoencoderKLCogVideoX(nn.Module):
+    """CogVideoX VAE. Spatial tiling/slicing handled by ComfyUI's VAE wrapper.
+
+    Uses rolling temporal decode: conv_in + mid_block + temporal up_blocks run
+    on the full (low-res) tensor, then the expensive spatial-only up_blocks +
+    norm_out + conv_out are processed in small temporal chunks with conv_cache
+    carrying causal state between chunks. This keeps peak VRAM proportional to
+    chunk_size rather than total frame count.
+
+    ``encode`` processes the input in temporal chunks as well and returns only
+    the first half of the encoder's output channels (the mean).
+    """
+
+    def __init__(self,
+                 in_channels=3, out_channels=3,
+                 block_out_channels=(128, 256, 256, 512),
+                 latent_channels=16, layers_per_block=3,
+                 act_fn="silu", eps=1e-6, groups=32,
+                 temporal_compression_ratio=4,
+                 ):
+        super().__init__()
+        self.latent_channels = latent_channels
+        self.temporal_compression_ratio = temporal_compression_ratio
+
+        self.encoder = Encoder3D(
+            in_channels=in_channels, out_channels=latent_channels,
+            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
+            act_fn=act_fn, eps=eps, groups=groups,
+            temporal_compression_ratio=temporal_compression_ratio,
+        )
+        self.decoder = Decoder3D(
+            in_channels=latent_channels, out_channels=out_channels,
+            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
+            act_fn=act_fn, eps=eps, groups=groups,
+            temporal_compression_ratio=temporal_compression_ratio,
+        )
+
+        # Chunk sizes along the time axis: latent frames per decoder call in
+        # _decode_batched, and input frames per encoder call in encode.
+        self.num_latent_frames_batch_size = 2
+        self.num_sample_frames_batch_size = 8
+
+    def encode(self, x):
+        """Encode ``x`` (time on dim 2) chunk by chunk and return the mean half of the output.
+
+        The encoder's conv cache is threaded from each chunk to the next.
+        """
+        t = x.shape[2]
+        frame_batch = self.num_sample_frames_batch_size
+        remainder = t % frame_batch
+        conv_cache = None
+        enc = []
+
+        # Process remainder frames first so only the first chunk can have an
+        # odd temporal dimension — where Downsample3D's first-frame-special
+        # handling in temporal compression is actually correct.
+        if remainder > 0:
+            chunk, conv_cache = self.encoder(x[:, :, :remainder], conv_cache=conv_cache)
+            enc.append(chunk.to(x.device))
+
+        # Remaining frames in full chunks of frame_batch.
+        for start in range(remainder, t, frame_batch):
+            chunk, conv_cache = self.encoder(x[:, :, start:start + frame_batch], conv_cache=conv_cache)
+            enc.append(chunk.to(x.device))
+
+        enc = torch.cat(enc, dim=2)
+        # The encoder emits 2 * latent_channels; the second half is discarded.
+        mean, _ = enc.chunk(2, dim=1)
+        return mean
+
+    def decode(self, z):
+        """Decode latents ``z`` using the rolling decoder."""
+        return self._decode_rolling(z)
+
+    def _decode_batched(self, z):
+        """Original batched decode - processes 2 latent frames through full decoder.
+
+        Not used by ``decode``. The first batch also absorbs the
+        ``t % frame_batch`` leftover frames. Each decoded chunk is moved to
+        the CPU and the concatenated result is moved back to ``z.device``.
+        """
+        t = z.shape[2]
+        frame_batch = self.num_latent_frames_batch_size
+        num_batches = max(t // frame_batch, 1)
+        conv_cache = None
+        dec = []
+        for i in range(num_batches):
+            remaining = t % frame_batch
+            # Batch 0 spans [0, frame_batch + remaining); later batches are shifted by `remaining`.
+            start = frame_batch * i + (0 if i == 0 else remaining)
+            end = frame_batch * (i + 1) + remaining
+            chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
+            dec.append(chunk.cpu())
+        return torch.cat(dec, dim=2).to(z.device)
+
+    def _decode_rolling(self, z):
+        """Rolling decode - processes low-res layers on full tensor, then rolls
+        through expensive high-res layers in temporal chunks.
+
+        Returns the decoded tensor on ``z.device``. When the time axis after
+        phase 1 fits in one chunk (or no blocks remain) everything runs in a
+        single pass without caches.
+        """
+        decoder = self.decoder
+        device = z.device
+
+        # Determine which up_blocks have temporal upsample vs spatial-only.
+        # Temporal up_blocks are cheap (low res), spatial-only are expensive.
+        temporal_compress_level = int(np.log2(self.temporal_compression_ratio))
+        split_at = temporal_compress_level  # first N up_blocks do temporal upsample
+
+        # Phase 1: conv_in + mid_block + temporal up_blocks on full tensor (low/medium res)
+        # Caches are discarded here because the whole time axis is processed at once.
+        x, _ = decoder.conv_in(z)
+        x, _ = decoder.mid_block(x, None, z)
+
+        for i in range(split_at):
+            x, _ = decoder.up_blocks[i](x, None, z)
+
+        # Phase 2: remaining spatial-only up_blocks + norm_out + conv_out in temporal chunks
+        remaining_blocks = list(range(split_at, len(decoder.up_blocks)))
+        chunk_size = 4  # pixel frames per chunk through high-res layers
+        t_expanded = x.shape[2]
+
+        if t_expanded <= chunk_size or len(remaining_blocks) == 0:
+            # Small enough to process in one go
+            for i in remaining_blocks:
+                x, _ = decoder.up_blocks[i](x, None, z)
+            x, _ = decoder.norm_out(x, z)
+            x = decoder.conv_act(x)
+            x, _ = decoder.conv_out(x)
+            return x
+
+        # Expand z temporally once to match Phase 2's time dimension.
+        # z stays at latent spatial resolution so this is small (~16 MB vs ~1.3 GB
+        # for the old approach of pre-interpolating to every pixel resolution).
+        z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))
+
+        # Process in temporal chunks, interpolating spatially per-chunk to avoid
+        # allocating full [B, C, t_expanded, H, W] tensors at each resolution.
+        dec_out = []
+        # Per-layer conv caches carried from one temporal chunk to the next.
+        conv_caches = {}
+
+        for chunk_start in range(0, t_expanded, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, t_expanded)
+            x_chunk = x[:, :, chunk_start:chunk_end]
+            z_t_chunk = z_time_expanded[:, :, chunk_start:chunk_end]
+            # Conditioning for this chunk, keyed by the (H, W) it was resized to.
+            z_spatial_cache = {}
+
+            for i in remaining_blocks:
+                block = decoder.up_blocks[i]
+                cache_key = f"up_block_{i}"
+                # Resolution of the block's input; its resnets run at this size.
+                hw_key = (x_chunk.shape[3], x_chunk.shape[4])
+                if hw_key not in z_spatial_cache:
+                    if z_t_chunk.shape[3] == hw_key[0] and z_t_chunk.shape[4] == hw_key[1]:
+                        z_spatial_cache[hw_key] = z_t_chunk
+                    else:
+                        z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
+                x_chunk, new_cache = block(x_chunk, None, z_spatial_cache[hw_key], conv_cache=conv_caches.get(cache_key))
+                conv_caches[cache_key] = new_cache
+
+            # Final norm/act/conv at the output resolution of the last up block.
+            hw_key = (x_chunk.shape[3], x_chunk.shape[4])
+            if hw_key not in z_spatial_cache:
+                z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
+            x_chunk, new_cache = decoder.norm_out(x_chunk, z_spatial_cache[hw_key], conv_cache=conv_caches.get("norm_out"))
+            conv_caches["norm_out"] = new_cache
+            x_chunk = decoder.conv_act(x_chunk)
+            x_chunk, new_cache = decoder.conv_out(x_chunk, conv_cache=conv_caches.get("conv_out"))
+            conv_caches["conv_out"] = new_cache
+
+            # Finished chunks are parked on the CPU until the final concatenation.
+            dec_out.append(x_chunk.cpu())
+            del z_spatial_cache
+
+        del x, z_time_expanded
+        return torch.cat(dec_out, dim=2).to(device)
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -342,6 +342,12 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["base_model.model.{}".format(key_lora)] = k  # Official base model loras
                key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k  # LyCORIS/LoKR format

+    if isinstance(model, comfy.model_base.ErnieImage):
+        for k in sdk:
+            if k.startswith("diffusion_model.") and k.endswith(".weight"):
+                key_lora = k[len("diffusion_model."):-len(".weight")]
+                key_map["transformer.{}".format(key_lora)] = k
+
    return key_map


--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -52,6 +52,7 @@ import comfy.ldm.qwen_image.model
 import comfy.ldm.kandinsky5.model
 import comfy.ldm.anima.model
 import comfy.ldm.ace.ace_step15
+import comfy.ldm.cogvideo.model
 import comfy.ldm.rt_detr.rtdetr_v4
 import comfy.ldm.ernie.model
 import comfy.ldm.sam3.detector
@ -81,6 +82,7 @@ class ModelType(Enum):
    IMG_TO_IMG = 9
    FLOW_COSMOS = 10
    IMG_TO_IMG_FLOW = 11
+    V_PREDICTION_DDPM = 12


 def model_sampling(model_config, model_type):
@ -115,6 +117,8 @@ def model_sampling(model_config, model_type):
        s = comfy.model_sampling.ModelSamplingCosmosRFlow
    elif model_type == ModelType.IMG_TO_IMG_FLOW:
        c = comfy.model_sampling.IMG_TO_IMG_FLOW
+    elif model_type == ModelType.V_PREDICTION_DDPM:
+        c = comfy.model_sampling.V_PREDICTION_DDPM

    class ModelSampling(s, c):
        pass
@ -1979,3 +1983,59 @@ class ErnieImage(BaseModel):
 class SAM3(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.sam3.detector.SAM3Model)
+
+class CogVideoX(BaseModel):
+    """Model wrapper around ``CogVideoXTransformer3DModel``."""
+    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_DDPM, image_to_video=False, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cogvideo.model.CogVideoXTransformer3DModel)
+        self.image_to_video = image_to_video
+
+    def concat_cond(self, **kwargs):
+        """Build the extra input channels concatenated to the noise.
+
+        Returns ``None`` when the diffusion model takes no more channels than
+        the noise has, zeros when no ``concat_latent_image`` is supplied, and
+        otherwise the reference latent resized, scaled by ``process_latent_in``
+        and cropped or tiled along dim 1 to exactly the missing channel count.
+        """
+        noise = kwargs.get("noise")
+        # Channels the model expects beyond the noise (e.g. 32 - 16 = 16 for a ref latent).
+        missing = self.diffusion_model.in_channels - noise.shape[1]
+        if missing == 0:
+            return None
+
+        reference = kwargs.get("concat_latent_image")
+        device = kwargs["device"]
+
+        if reference is None:
+            zero_shape = list(noise.shape)
+            zero_shape[1] = missing
+            return torch.zeros(zero_shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)
+
+        group = self.latent_format.latent_channels
+        reference = utils.common_upscale(reference.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+
+        if noise.ndim == 5 and reference.ndim == 5:
+            # Match dim -3 of the noise: zero-pad at the end or truncate.
+            have_frames, want_frames = reference.shape[-3], noise.shape[-3]
+            if have_frames < want_frames:
+                reference = torch.nn.functional.pad(reference, (0, 0, 0, 0, 0, want_frames - have_frames), "constant", 0)
+            elif have_frames > want_frames:
+                reference = reference[:, :, :want_frames]
+
+        # Scale each latent-sized channel group in place.
+        for start in range(0, reference.shape[1], group):
+            stop = start + group
+            reference[:, start:stop] = self.process_latent_in(reference[:, start:stop])
+        reference = utils.resize_to_batch_size(reference, noise.shape[0])
+
+        have = reference.shape[1]
+        if have > missing:
+            return reference[:, :missing]
+        if have < missing:
+            whole, leftover = divmod(missing, have)
+            pieces = [reference] * whole
+            if leftover > 0:
+                pieces.append(reference[:, :leftover])
+            return torch.cat(pieces, dim=1)
+        return reference
+
+    def extra_conds(self, **kwargs):
+        """Add the ``ofs`` conditioning when the diffusion model has an OFS projection."""
+        out = super().extra_conds(**kwargs)
+        if self.diffusion_model.ofs_proj_dim is None:
+            return out
+
+        # OFS embedding (CogVideoX 1.5 I2V), default 2.0 as used by SparkVSR
+        ofs = kwargs.get("ofs")
+        if ofs is None:
+            noise = kwargs.get("noise")
+            ofs = torch.full((noise.shape[0],), 2.0, device=noise.device, dtype=noise.dtype)
+        out['ofs'] = comfy.conds.CONDRegular(ofs)
+        return out
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -490,6 +490,54 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):

        return dit_config

+    if '{}blocks.0.norm1.linear.weight'.format(key_prefix) in state_dict_keys:  # CogVideoX
+        dit_config = {}
+        dit_config["image_model"] = "cogvideox"
+
+        # Extract config from weight shapes
+        norm1_weight = state_dict['{}blocks.0.norm1.linear.weight'.format(key_prefix)]
+        time_embed_dim = norm1_weight.shape[1]
+        dim = norm1_weight.shape[0] // 6
+
+        dit_config["num_attention_heads"] = dim // 64
+        dit_config["attention_head_dim"] = 64
+        dit_config["time_embed_dim"] = time_embed_dim
+        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
+
+        # Detect in_channels from patch_embed
+        patch_proj_key = '{}patch_embed.proj.weight'.format(key_prefix)
+        if patch_proj_key in state_dict_keys:
+            w = state_dict[patch_proj_key]
+            if w.ndim == 4:
+                # Conv2d: [out, in, kh, kw] — CogVideoX 1.0
+                dit_config["in_channels"] = w.shape[1]
+                dit_config["patch_size"] = w.shape[2]
+            elif w.ndim == 2:
+                # Linear: [out, in_channels * patch_size * patch_size * patch_size_t] — CogVideoX 1.5
+                dit_config["patch_size"] = 2
+                dit_config["patch_size_t"] = 2
+                dit_config["in_channels"] = w.shape[1] // (2 * 2 * 2)  # 256 // 8 = 32
+
+        text_proj_key = '{}patch_embed.text_proj.weight'.format(key_prefix)
+        if text_proj_key in state_dict_keys:
+            dit_config["text_embed_dim"] = state_dict[text_proj_key].shape[1]
+
+        # Detect OFS embedding
+        ofs_key = '{}ofs_embedding_linear_1.weight'.format(key_prefix)
+        if ofs_key in state_dict_keys:
+            dit_config["ofs_embed_dim"] = state_dict[ofs_key].shape[1]
+
+        # Detect positional embedding type
+        pos_key = '{}patch_embed.pos_embedding'.format(key_prefix)
+        if pos_key in state_dict_keys:
+            dit_config["use_learned_positional_embeddings"] = True
+            dit_config["use_rotary_positional_embeddings"] = False
+        else:
+            dit_config["use_learned_positional_embeddings"] = False
+            dit_config["use_rotary_positional_embeddings"] = True
+
+        return dit_config
+
    if '{}head.modulation'.format(key_prefix) in state_dict_keys:  # Wan 2.1
        dit_config = {}
        dit_config["image_model"] = "wan2.1"
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@ -54,6 +54,30 @@ class V_PREDICTION(EPS):
        sigma = reshape_sigma(sigma, model_output.ndim)
        return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5

+class V_PREDICTION_DDPM:
+    """CogVideoX v-prediction: model receives raw x_t (unscaled), predicts velocity v.
+    x_0 = sqrt(alpha) * x_t - sqrt(1-alpha) * v
+        = x_t / sqrt(sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
+    """
+    def calculate_input(self, sigma, noise):
+        # Pass the noisy sample through unchanged; `sigma` is accepted but not used.
+        return noise
+
+    def calculate_denoised(self, sigma, model_output, model_input):
+        # Evaluates the x_0 formula from the class docstring with model_input as x_t
+        # and model_output as v.
+        # NOTE(review): reshape_sigma presumably makes sigma broadcastable against
+        # model_output -- confirm against its definition.
+        sigma = reshape_sigma(sigma, model_output.ndim)
+        return model_input / (sigma ** 2 + 1.0) ** 0.5 - model_output * sigma / (sigma ** 2 + 1.0) ** 0.5
+
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        # Scale the noise (by sqrt(1 + sigma^2) when max_denoise is set, by sigma
+        # otherwise) and then add latent_image to the scaled noise.
+        sigma = reshape_sigma(sigma, noise.ndim)
+        if max_denoise:
+            noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
+        else:
+            noise = noise * sigma
+        # For tensor inputs `noise` now names the product computed above, so this
+        # in-place add modifies that result rather than the caller's argument.
+        noise += latent_image
+        return noise
+
+    def inverse_noise_scaling(self, sigma, latent):
+        # Identity: the latent is returned as-is and `sigma` is not used.
+        return latent
+
 class EDM(V_PREDICTION):
    def calculate_denoised(self, sigma, model_output, model_input):
        sigma = reshape_sigma(sigma, model_output.ndim)
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -18,6 +18,7 @@ import comfy.ldm.wan.vae
 import comfy.ldm.wan.vae2_2
 import comfy.ldm.hunyuan3d.vae
 import comfy.ldm.ace.vae.music_dcae_pipeline
+import comfy.ldm.cogvideo.vae
 import comfy.ldm.hunyuan_video.vae
 import comfy.ldm.mmaudio.vae.autoencoder
 import comfy.pixel_space_convert
@ -478,7 +479,10 @@ class VAE:
                                                            encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': encoder_config},
                                                            decoder_config={'target': "comfy.ldm.modules.temporal_ae.VideoDecoder", 'params': decoder_config})
            elif "taesd_decoder.1.weight" in sd:
-                self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
+                if isinstance(metadata, dict) and "tae_latent_channels" in metadata:
+                    self.latent_channels = metadata["tae_latent_channels"]
+                else:
+                    self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
                self.first_stage_model = comfy.taesd.taesd.TAESD(latent_channels=self.latent_channels)
            elif "vquantizer.codebook.weight" in sd: #VQGan: stage a of stable cascade
                self.first_stage_model = StageA()
@ -652,6 +656,17 @@ class VAE:

                self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
                self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+            elif "decoder.conv_in.conv.weight" in sd and "decoder.mid_block.resnets.0.norm1.norm_layer.weight" in sd:  # CogVideoX VAE
+                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
+                self.upscale_index_formula = (4, 8, 8)
+                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
+                self.downscale_index_formula = (4, 8, 8)
+                self.latent_dim = 3
+                self.latent_channels = sd["encoder.conv_out.conv.weight"].shape[0] // 2
+                self.first_stage_model = comfy.ldm.cogvideo.vae.AutoencoderKLCogVideoX(latent_channels=self.latent_channels)
+                self.memory_used_decode = lambda shape, dtype: (2800 * max(2, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (1400 * max(1, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
+                self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
            elif "decoder.conv_in.conv.weight" in sd:
                ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                ddconfig["conv3d"] = True
@ -1105,7 +1120,17 @@ class VAE:
            else:
                pixel_samples = pixel_samples.unsqueeze(2)

-        memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)  # TODO: calculate mem required for tile
+        if dims == 2:
+            default_tile_x = 512 if tile_x is None else tile_x
+            default_tile_y = 512 if tile_y is None else tile_y
+            tile_shapes = [
+                (1, pixel_samples.shape[1], min(pixel_samples.shape[2], max(1, default_tile_y)), min(pixel_samples.shape[3], max(1, default_tile_x))),
+                (1, pixel_samples.shape[1], min(pixel_samples.shape[2], max(1, default_tile_y // 2)), min(pixel_samples.shape[3], max(1, default_tile_x * 2))),
+                (1, pixel_samples.shape[1], min(pixel_samples.shape[2], max(1, default_tile_y * 2)), min(pixel_samples.shape[3], max(1, default_tile_x // 2))),
+            ]
+            memory_used = max(self.memory_used_encode(shape, self.vae_dtype) for shape in tile_shapes)
+        else:
+            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)

        args = {}
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -27,6 +27,7 @@ import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
 import comfy.text_encoders.ernie
+import comfy.text_encoders.cogvideo

 from . import supported_models_base
 from . import latent_formats
@ -1832,6 +1833,52 @@ class SAM31(SAM3):
    unet_config = {"image_model": "SAM31"}


-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, SAM3, SAM31]
+class CogVideoX_T2V(supported_models_base.BASE):
+    """Supported-model entry for CogVideoX text-to-video checkpoints.
+
+    Matches configs whose detected ``image_model`` is ``"cogvideox"`` and
+    builds a ``model_base.CogVideoX`` instance for them.
+    """
+    unet_config = {
+        "image_model": "cogvideox",
+    }
+
+    sampling_settings = {
+        "linear_start": 0.00085,
+        "linear_end": 0.012,
+        "beta_schedule": "linear",
+        "zsnr": True,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.CogVideoX
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def _apply_rope_base_defaults(self):
+        """Fill in the RoPE training base dimensions for CogVideoX 1.5 configs.
+
+        CogVideoX 1.5 (patch_size_t=2) has different training base dimensions
+        for RoPE. Values already present in ``unet_config`` are kept
+        (``setdefault``); configs without ``patch_size_t`` are left untouched.
+        Shared by the T2V and I2V variants so the defaults cannot drift apart.
+        """
+        if self.unet_config.get("patch_size_t") is not None:
+            self.unet_config.setdefault("sample_height", 96)
+            self.unet_config.setdefault("sample_width", 170)
+            self.unet_config.setdefault("sample_frames", 81)
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Return the text-to-video ``model_base.CogVideoX`` for this config.
+
+        ``state_dict`` and ``prefix`` are accepted for interface compatibility
+        and are not read here.
+        """
+        self._apply_rope_base_defaults()
+        out = model_base.CogVideoX(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        """Return the ClipTarget pairing the CogVideoX T5 tokenizer with the T5-XXL text model."""
+        return supported_models_base.ClipTarget(comfy.text_encoders.cogvideo.CogVideoXT5Tokenizer, comfy.text_encoders.sd3_clip.T5XXLModel)
+
+class CogVideoX_I2V(CogVideoX_T2V):
+    """Image-to-video variant: additionally requires ``in_channels == 32`` in the config."""
+    unet_config = {
+        "image_model": "cogvideox",
+        "in_channels": 32,
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Return ``model_base.CogVideoX`` constructed with ``image_to_video=True``."""
+        self._apply_rope_base_defaults()
+        out = model_base.CogVideoX(self, image_to_video=True, device=device)
+        return out
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, SAM3, SAM31, CogVideoX_I2V, CogVideoX_T2V]

 models += [SVD_img2vid]
--- a/comfy/taesd/taehv.py
+++ b/comfy/taesd/taehv.py
@ -7,6 +7,7 @@ from tqdm.auto import tqdm
 from collections import namedtuple, deque

 import comfy.ops
+import comfy.model_management
 operations=comfy.ops.disable_weight_init

 DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
@ -47,11 +48,14 @@ class TGrow(nn.Module):
        x = self.conv(x)
        return x.reshape(-1, C, H, W)

-def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
+def apply_model_with_memblocks(model, x, parallel, show_progress_bar, output_device=None,
+                               patch_size=1, decode=False):

    B, T, C, H, W = x.shape
    if parallel:
        x = x.reshape(B*T, C, H, W)
+        if not decode and patch_size > 1:
+            x = F.pixel_unshuffle(x, patch_size)
        # parallel over input timesteps, iterate over blocks
        for b in tqdm(model, disable=not show_progress_bar):
            if isinstance(b, MemBlock):
@ -62,20 +66,27 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
                x = b(x, mem)
            else:
                x = b(x)
-        BT, C, H, W = x.shape
-        T = BT // B
-        x = x.view(B, T, C, H, W)
+        if decode and patch_size > 1:
+            x = F.pixel_shuffle(x, patch_size)
+        x = x.view(B, x.shape[0] // B, *x.shape[1:])
+        x = x.to(output_device)
    else:
        out = []
-        work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])
+        # Chunk along the time dim directly (chunks are [B,1,C,H,W] views, squeeze to [B,C,H,W] views).
+        # Avoids forcing a contiguous copy when x is non-contiguous (e.g. after movedim in encode/decode).
+        work_queue = deque([TWorkItem(xt.squeeze(1), 0) for xt in x.chunk(T, dim=1)])
        progress_bar = tqdm(range(T), disable=not show_progress_bar)
        mem = [None] * len(model)
        while work_queue:
            xt, i = work_queue.popleft()
            if i == 0:
                progress_bar.update(1)
+                if not decode and patch_size > 1:
+                    xt = F.pixel_unshuffle(xt, patch_size)
            if i == len(model):
-                out.append(xt)
+                if decode and patch_size > 1:
+                    xt = F.pixel_shuffle(xt, patch_size)
+                out.append(xt.to(output_device))
                del xt
            else:
                b = model[i]
@ -165,24 +176,20 @@ class TAEHV(nn.Module):

    def encode(self, x, **kwargs):
+        # Encode a video tensor: move time next to batch, pad the time axis to a
+        # multiple of t_downscale, run the encoder, move time back and apply
+        # process_out. The patch_size pixel-unshuffle is no longer done here; it is
+        # delegated to apply_model_with_memblocks via its patch_size argument.
        x = x.movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
-        if self.patch_size > 1:
-            B, T, C, H, W = x.shape
-            x = x.reshape(B * T, C, H, W)
-            x = F.pixel_unshuffle(x, self.patch_size)
-            x = x.reshape(B, T, C * self.patch_size ** 2, H // self.patch_size, W // self.patch_size)
        if x.shape[1] % self.t_downscale != 0:
            # pad at end to multiple of t_downscale
            n_pad = self.t_downscale - x.shape[1] % self.t_downscale
            padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
            x = torch.cat([x, padding], 1)
+        # decode is left at its default (False), so the helper applies the unshuffle on input.
+        x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar,
+                                        patch_size=self.patch_size).movedim(2, 1)
-        x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
        return self.process_out(x)

    def decode(self, x, **kwargs):
+        # Decode latents to frames. Output frames are moved to the intermediate
+        # device by the helper, and the patch_size pixel-shuffle is applied inside
+        # apply_model_with_memblocks (decode=True) instead of after it.
        x = x.unsqueeze(0) if x.ndim == 4 else x  # [T, C, H, W] -> [1, T, C, H, W]
        x = x.movedim(1, 2) if x.shape[1] != self.latent_channels else x  # [B, T, C, H, W] or [B, C, T, H, W]
        x = self.process_in(x).movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
-        x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
-        if self.patch_size > 1:
-            x = F.pixel_shuffle(x, self.patch_size)
+        x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar,
+                                        output_device=comfy.model_management.intermediate_device(),
+                                        patch_size=self.patch_size, decode=True)
+        # Drop the first frames_to_trim frames, then restore the channel-before-time layout.
        return x[:, self.frames_to_trim:].movedim(2, 1)
--- a/comfy/taesd/taesd.py
+++ b/comfy/taesd/taesd.py
@ -17,32 +17,79 @@ class Clamp(nn.Module):
        return torch.tanh(x / 3) * 3

 class Block(nn.Module):
+    # Residual conv block: three convs with ReLUs plus a skip path, fused by a final
+    # ReLU. With use_midblock_gn=True an extra GroupNorm "pool" branch is added to
+    # the input before the main path.
-    def __init__(self, n_in, n_out):
+    def __init__(self, n_in: int, n_out: int, use_midblock_gn: bool = False):
        super().__init__()
        self.conv = nn.Sequential(conv(n_in, n_out), nn.ReLU(), conv(n_out, n_out), nn.ReLU(), conv(n_out, n_out))
        self.skip = comfy.ops.disable_weight_init.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
        self.fuse = nn.ReLU()
-    def forward(self, x):
+        # Without the GroupNorm branch, `pool` stays None and forward() skips it.
+        if not use_midblock_gn:
+            self.pool = None
+            return
+        # Pool branch: 1x1 conv up to 4 * n_in channels, GroupNorm with 4 groups,
+        # ReLU, then a 1x1 conv back down to n_in channels.
+        n_gn = n_in * 4
+        self.pool = nn.Sequential(
+            comfy.ops.disable_weight_init.Conv2d(n_in, n_gn, 1, bias=False),
+            comfy.ops.disable_weight_init.GroupNorm(4, n_gn),
+            nn.ReLU(inplace=True),
+            comfy.ops.disable_weight_init.Conv2d(n_gn, n_in, 1, bias=False),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # The pool output is added to x first, so both conv and skip see the updated x.
+        if self.pool is not None:
+            x = x + self.pool(x)
        return self.fuse(self.conv(x) + self.skip(x))

-def Encoder(latent_channels=4):
-    return nn.Sequential(
-        conv(3, 64), Block(64, 64),
-        conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
-        conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
-        conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
-        conv(64, latent_channels),
-    )
+class Encoder(nn.Sequential):
+    """Sequential image encoder: 3 input channels -> `latent_channels`, with three stride-2 convs.
+
+    `use_gn` is forwarded only to the three Blocks after the last stride-2 conv;
+    all earlier Blocks are built without the GroupNorm branch.
+    """
+    def __init__(self, latent_channels: int = 4, use_gn: bool = False):
+        super().__init__(
+            conv(3, 64), Block(64, 64),
+            conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
+            conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
+            conv(64, 64, stride=2, bias=False), Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn),
+            conv(64, latent_channels),
+        )

+class Decoder(nn.Sequential):
+    """Sequential image decoder: `latent_channels` -> 3 output channels, with three 2x upsamples.
+
+    `use_gn` is forwarded only to the first three Blocks (before the first
+    upsample); the remaining Blocks are built without the GroupNorm branch.
+    """
+    def __init__(self, latent_channels: int = 4, use_gn: bool = False):
+        super().__init__(
+            Clamp(), conv(latent_channels, 64), nn.ReLU(),
+            Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
+            Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
+            Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
+            Block(64, 64), conv(64, 3),
+        )
+
+class DecoderFlux2(Decoder):
+    """Decoder variant for 128-channel latents.
+
+    The underlying Decoder is built for 32 channels with the GroupNorm branch
+    enabled; forward() first unpacks each group of 4 channels into a 2x2
+    spatial patch.
+    """
+    def __init__(self, latent_channels: int = 128, use_gn: bool = True):
+        # Only the default configuration is accepted; anything else raises ValueError.
+        if latent_channels != 128 or not use_gn:
+            raise ValueError("Unexpected parameters for Flux2 TAE module")
+        super().__init__(latent_channels=32, use_gn=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, C, H, W = x.shape
+        # [B, 128, H, W] -> [B, 32, 2, 2, H, W] -> [B, 32, H, 2, W, 2] -> [B, 32, 2H, 2W]
+        x = (
+            x
+            .reshape(B, 32, 2, 2, H, W)
+            .permute(0, 1, 4, 2, 5, 3)
+            .reshape(B, 32, H * 2, W * 2)
+        )
+        return super().forward(x)
+
+class EncoderFlux2(Encoder):
+    """Encoder variant producing 128-channel latents.
+
+    The underlying Encoder emits 32 channels with the GroupNorm branch enabled;
+    forward() then packs every 2x2 spatial patch into the channel axis, the
+    inverse of the rearrangement done in DecoderFlux2.forward.
+    """
+    def __init__(self, latent_channels: int = 128, use_gn: bool = True):
+        # Only the default configuration is accepted; anything else raises ValueError.
+        if latent_channels != 128 or not use_gn:
+            raise ValueError("Unexpected parameters for Flux2 TAE module")
+        super().__init__(latent_channels=32, use_gn=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        result = super().forward(x)
+        B, C, H, W = result.shape
+        # [B, C, H, W] -> [B, C, H/2, 2, W/2, 2] -> [B, C, 2, 2, H/2, W/2] -> [B, 128, H/2, W/2]
+        # NOTE(review): the reshape requires even H and W and C * 4 == 128.
+        return (
+            result
+            .reshape(B, C, H // 2, 2, W // 2, 2)
+            .permute(0, 1, 3, 5, 2, 4)
+            .reshape(B, 128, H // 2, W // 2)
+        )

-def Decoder(latent_channels=4):
-    return nn.Sequential(
-        Clamp(), conv(latent_channels, 64), nn.ReLU(),
-        Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
-        Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
-        Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
-        Block(64, 64), conv(64, 3),
-    )

 class TAESD(nn.Module):
    latent_magnitude = 3
@ -51,8 +98,15 @@ class TAESD(nn.Module):
    def __init__(self, encoder_path=None, decoder_path=None, latent_channels=4):
        """Initialize pretrained TAESD on the given device from the given checkpoints."""
        super().__init__()
-        self.taesd_encoder = Encoder(latent_channels=latent_channels)
-        self.taesd_decoder = Decoder(latent_channels=latent_channels)
+        if latent_channels == 128:
+            encoder_class = EncoderFlux2
+            decoder_class = DecoderFlux2
+        else:
+            encoder_class = Encoder
+            decoder_class = Decoder
+        self.taesd_encoder = encoder_class(latent_channels=latent_channels)
+        self.taesd_decoder = decoder_class(latent_channels=latent_channels)
+
        self.vae_scale = torch.nn.Parameter(torch.tensor(1.0))
        self.vae_shift = torch.nn.Parameter(torch.tensor(0.0))
        if encoder_path is not None:
@ -61,19 +115,19 @@ class TAESD(nn.Module):
            self.taesd_decoder.load_state_dict(comfy.utils.load_torch_file(decoder_path, safe_load=True))

    @staticmethod
-    def scale_latents(x):
+    def scale_latents(x: torch.Tensor) -> torch.Tensor:
        """raw latents -> [0, 1]"""
        return x.div(2 * TAESD.latent_magnitude).add(TAESD.latent_shift).clamp(0, 1)

    @staticmethod
-    def unscale_latents(x):
+    def unscale_latents(x: torch.Tensor) -> torch.Tensor:
        """[0, 1] -> raw latents"""
        return x.sub(TAESD.latent_shift).mul(2 * TAESD.latent_magnitude)

-    def decode(self, x):
+    def decode(self, x: torch.Tensor) -> torch.Tensor:
        x_sample = self.taesd_decoder((x - self.vae_shift) * self.vae_scale)
        x_sample = x_sample.sub(0.5).mul(2)
        return x_sample

-    def encode(self, x):
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
        return (self.taesd_encoder(x * 0.5 + 0.5) / self.vae_scale) + self.vae_shift
--- a/comfy/text_encoders/cogvideo.py
+++ b/comfy/text_encoders/cogvideo.py
@ -0,0 +1,6 @@
+import comfy.text_encoders.sd3_clip
+
+
+class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
+    """T5-XXL tokenizer configured for CogVideoX: identical to the parent except min_length=226."""
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        # NOTE(review): `tokenizer_data={}` is a shared mutable default; this is only
+        # safe if the parent never mutates it -- confirm against T5XXLTokenizer.
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, min_length=226)
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@ -251,6 +251,7 @@ class VideoFromFile(VideoInput):
            container.seek(start_pts, stream=video_stream)

        image_format = 'gbrpf32le'
+        process_image_format = lambda a: a
        audio = None

        streams = [video_stream]
@ -283,11 +284,25 @@ class VideoFromFile(VideoInput):
                            break

                        if not checked_alpha:
+                            alpha_channel = False
                            for comp in frame.format.components:
                                if comp.is_alpha or frame.format.name == "pal8":
                                    alphas = []
-                                    image_format = 'gbrapf32le'
+                                    alpha_channel = True
                                    break
+                            if frame.format.name in ("yuvj420p", "yuvj422p", "yuvj444p", "rgb24", "rgba", "pal8"):
+                                process_image_format = lambda a: a.float() / 255.0
+                                if alpha_channel:
+                                    image_format = 'rgba'
+                                else:
+                                    image_format = 'rgb24'
+                            else:
+                                process_image_format = lambda a: a
+                                if alpha_channel:
+                                    image_format = 'gbrapf32le'
+                                else:
+                                    image_format = 'gbrpf32le'
+
                            checked_alpha = True

                        img = frame.to_ndarray(format=image_format)  # shape: (H, W, 4)
@ -323,9 +338,9 @@ class VideoFromFile(VideoInput):
                    else:
                        audio_frames.append(frame.to_ndarray())

-        images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 0, 0, 3)
+        images = process_image_format(torch.stack(frames)) if len(frames) > 0 else torch.zeros(0, 0, 0, 3)
        if alphas is not None:
-            alphas = torch.stack(alphas) if len(alphas) > 0 else torch.zeros(0, 0, 0, 1)
+            alphas = process_image_format(torch.stack(alphas)) if len(alphas) > 0 else torch.zeros(0, 0, 0, 1)

        # Get frame rate
        frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)
--- a/comfy_api_nodes/apis/bytedance.py
+++ b/comfy_api_nodes/apis/bytedance.py
@ -157,6 +157,11 @@ class SeedanceCreateAssetResponse(BaseModel):
    asset_id: str = Field(...)


+class SeedanceVirtualLibraryCreateAssetRequest(BaseModel):
+    """Request body for creating an image asset in the Seedance virtual library."""
+    # Both fields are required (`Field(...)`).
+    url: str = Field(..., description="Publicly accessible URL of the image asset to upload.")
+    hash: str = Field(..., description="Dedup key. Re-submitting the same hash returns the existing asset id.")
+
+
 # Dollars per 1K tokens, keyed by (model_id, has_video_input).
 SEEDANCE2_PRICE_PER_1K_TOKENS = {
    ("dreamina-seedance-2-0-260128", False): 0.007,
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@ -1,3 +1,4 @@
+import hashlib
 import logging
 import math
 import re
@ -20,6 +21,7 @@ from comfy_api_nodes.apis.bytedance import (
    SeedanceCreateAssetResponse,
    SeedanceCreateVisualValidateSessionResponse,
    SeedanceGetVisualValidateSessionResponse,
+    SeedanceVirtualLibraryCreateAssetRequest,
    Seedream4Options,
    Seedream4TaskCreationRequest,
    TaskAudioContent,
@ -271,6 +273,30 @@ async def _wait_for_asset_active(cls: type[IO.ComfyNode], asset_id: str, group_i
    )


+async def _seedance_virtual_library_upload_image_asset(
+    cls: type[IO.ComfyNode],
+    image: torch.Tensor,
+    *,
+    wait_label: str = "Uploading image",
+) -> str:
+    """Upload an image into the caller's per-customer Seedance virtual library.
+
+    Args:
+        cls: Node class forwarded to the upload, request and polling helpers.
+        image: Image tensor to upload; it is also hashed to build the dedup key.
+        wait_label: Progress label passed to the upload helper.
+
+    Returns:
+        An ``asset://<asset_id>`` reference, returned once the asset wait completes.
+    """
+    public_url = await upload_image_to_comfyapi(cls, image, wait_label=wait_label)
+    # Dedup key: SHA-256 over the tensor shape, a NUL separator and the raw float32
+    # bytes, so tensors with the same bytes but different shapes hash differently.
+    normalized = image.detach().cpu().contiguous().to(torch.float32)
+    digest = hashlib.sha256()
+    digest.update(str(tuple(normalized.shape)).encode("utf-8"))
+    digest.update(b"\0")
+    digest.update(normalized.numpy().tobytes())
+    image_hash = digest.hexdigest()
+    create_resp = await sync_op(
+        cls,
+        ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"),
+        response_model=SeedanceCreateAssetResponse,
+        data=SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=image_hash),
+    )
+    # NOTE(review): presumably blocks until the asset becomes usable -- confirm in
+    # _wait_for_asset_active.
+    await _wait_for_asset_active(cls, create_resp.asset_id, group_id="virtual-library")
+    return f"asset://{create_resp.asset_id}"
+
+
 def _seedance2_price_extractor(model_id: str, has_video_input: bool):
    """Returns a price_extractor closure for Seedance 2.0 poll_op."""
    rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
@ -1507,7 +1533,9 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
        if first_frame_asset_id:
            first_frame_url = image_assets[first_frame_asset_id]
        else:
-            first_frame_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame.")
+            first_frame_url = await _seedance_virtual_library_upload_image_asset(
+                cls, first_frame, wait_label="Uploading first frame."
+            )

        content: list[TaskTextContent | TaskImageContent] = [
            TaskTextContent(text=model["prompt"]),
@ -1527,7 +1555,9 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
            content.append(
                TaskImageContent(
                    image_url=TaskImageContentUrl(
-                        url=await upload_image_to_comfyapi(cls, last_frame, wait_label="Uploading last frame.")
+                        url=await _seedance_virtual_library_upload_image_asset(
+                            cls, last_frame, wait_label="Uploading last frame."
+                        )
                    ),
                    role="last_frame",
                ),
@ -1805,9 +1835,9 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
            content.append(
                TaskImageContent(
                    image_url=TaskImageContentUrl(
-                        url=await upload_image_to_comfyapi(
+                        url=await _seedance_virtual_library_upload_image_asset(
                            cls,
-                            image=reference_images[key],
+                            reference_images[key],
                            wait_label=f"Uploading image {i}",
                        ),
                    ),
--- a/comfy_api_nodes/nodes_openai.py
+++ b/comfy_api_nodes/nodes_openai.py
@ -415,8 +415,9 @@ class OpenAIGPTImage1(IO.ComfyNode):
                        "1152x2048",
                        "3840x2160",
                        "2160x3840",
+                        "Custom",
                    ],
-                    tooltip="Image size",
+                    tooltip="Image size. Select 'Custom' to use the custom width and height (GPT Image 2 only).",
                    optional=True,
                ),
                IO.Int.Input(
@ -445,6 +446,24 @@ class OpenAIGPTImage1(IO.ComfyNode):
                    default="gpt-image-2",
                    optional=True,
                ),
+                IO.Int.Input(
+                    "custom_width",
+                    default=1024,
+                    min=1024,
+                    max=3840,
+                    step=16,
+                    tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
+                    optional=True,
+                ),
+                IO.Int.Input(
+                    "custom_height",
+                    default=1024,
+                    min=1024,
+                    max=3840,
+                    step=16,
+                    tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
+                    optional=True,
+                ),
            ],
            outputs=[
                IO.Image.Output(),
@ -471,9 +490,9 @@ class OpenAIGPTImage1(IO.ComfyNode):
                      "high":   [0.133, 0.22]
                    },
                    "gpt-image-2": {
-                      "low":    [0.0048, 0.012],
-                      "medium": [0.041, 0.112],
-                      "high":   [0.165, 0.43]
+                      "low":    [0.0048, 0.019],
+                      "medium": [0.041, 0.168],
+                      "high":   [0.165, 0.67]
                    }
                  };
                  $range := $lookup($lookup($ranges, widgets.model), widgets.quality);
@ -503,6 +522,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
        mask: Input.Image | None = None,
        n: int = 1,
        size: str = "1024x1024",
+        custom_width: int = 1024,
+        custom_height: int = 1024,
        model: str = "gpt-image-1",
    ) -> IO.NodeOutput:
        validate_string(prompt, strip_whitespace=False)
@ -510,7 +531,25 @@ class OpenAIGPTImage1(IO.ComfyNode):
        if mask is not None and image is None:
            raise ValueError("Cannot use a mask without an input image")

-        if model in ("gpt-image-1", "gpt-image-1.5"):
+        if size == "Custom":
+            if model != "gpt-image-2":
+                raise ValueError("Custom resolution is only supported by GPT Image 2 model")
+            if custom_width % 16 != 0 or custom_height % 16 != 0:
+                raise ValueError(f"Custom width and height must be multiples of 16, got {custom_width}x{custom_height}")
+            if max(custom_width, custom_height) > 3840:
+                raise ValueError(f"Custom resolution max edge must be <= 3840, got {custom_width}x{custom_height}")
+            ratio = max(custom_width, custom_height) / min(custom_width, custom_height)
+            if ratio > 3:
+                raise ValueError(
+                    f"Custom resolution aspect ratio must not exceed 3:1, got {custom_width}x{custom_height}"
+                )
+            total_pixels = custom_width * custom_height
+            if not 655_360 <= total_pixels <= 8_294_400:
+                raise ValueError(
+                    f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
+                )
+            size = f"{custom_width}x{custom_height}"
+        elif model in ("gpt-image-1", "gpt-image-1.5"):
            if size not in ("auto", "1024x1024", "1024x1536", "1536x1024"):
                raise ValueError(f"Resolution {size} is only supported by GPT Image 2 model")

--- a/comfy_execution/caching.py
+++ b/comfy_execution/caching.py
@ -1,5 +1,6 @@
 import asyncio
 import bisect
+import gc
 import itertools
 import psutil
 import time
@ -17,6 +18,7 @@ NODE_CLASS_CONTAINS_UNIQUE_ID: Dict[str, bool] = {}


 def include_unique_id_in_input(class_type: str) -> bool:
+    """Return whether a node class includes UNIQUE_ID among its hidden inputs."""
    if class_type in NODE_CLASS_CONTAINS_UNIQUE_ID:
        return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type]
    class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
@ -24,52 +26,412 @@ def include_unique_id_in_input(class_type: str) -> bool:
    return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type]

 class CacheKeySet(ABC):
+    """Base helper for building and storing cache keys for prompt nodes.
+
+    Subclasses implement `add_keys` to fill `keys` and `subcache_keys`; the
+    accessors defined here only read those two dictionaries.
+    """
    def __init__(self, dynprompt, node_ids, is_changed_cache):
+        """Initialize cache-key storage for a dynamic prompt execution pass.
+
+        None of the three arguments is stored or used by this base initializer.
+        """
+        # Both maps are looked up by node id (see get_data_key / get_subcache_key).
        self.keys = {}
        self.subcache_keys = {}

    @abstractmethod
    async def add_keys(self, node_ids):
+        """Populate cache keys for the provided node ids."""
        raise NotImplementedError()

    def all_node_ids(self):
+        """Return a new set holding the node ids currently tracked by this key set."""
        return set(self.keys.keys())

    def get_used_keys(self):
+        """Return a live view of the computed cache keys currently in use."""
        return self.keys.values()

    def get_used_subcache_keys(self):
+        """Return a live view of the computed subcache keys currently in use."""
        return self.subcache_keys.values()

    def get_data_key(self, node_id):
+        """Return the cache key for a node, or None when the node has no key."""
        return self.keys.get(node_id, None)

    def get_subcache_key(self, node_id):
+        """Return the subcache key for a node, or None when the node has no key."""
        return self.subcache_keys.get(node_id, None)

 class Unhashable:
-    def __init__(self):
-        self.value = float("NaN")
+    """Hashable identity sentinel for values that cannot be represented safely in cache keys.
+
+    The class defines neither `__eq__` nor `__hash__`, so every instance
+    compares equal only to itself.
+    """
+    pass

-def to_hashable(obj):
-    # So that we don't infinitely recurse since frozenset and tuples
-    # are Sequences.
-    if isinstance(obj, (int, float, str, bool, bytes, type(None))):
-        return obj
-    elif isinstance(obj, Mapping):
-        return frozenset([(to_hashable(k), to_hashable(v)) for k, v in sorted(obj.items())])
-    elif isinstance(obj, Sequence):
-        return frozenset(zip(itertools.count(), [to_hashable(i) for i in obj]))
-    else:
-        # TODO - Support other objects like tensors?
+
+# Exact builtin types accepted as leaf values. The helpers in this module test
+# membership with type(x), so subclasses of these types do not qualify.
+_PRIMITIVE_SIGNATURE_TYPES = (int, float, str, bool, bytes, type(None))
+# Exact builtin container types that the canonicalizers descend into.
+_CONTAINER_SIGNATURE_TYPES = (dict, list, tuple, set, frozenset)
+# Default nesting limit for the recursive signature helpers.
+_MAX_SIGNATURE_DEPTH = 32
+# Default cap on how many containers one canonicalization pass may visit.
+_MAX_SIGNATURE_CONTAINER_VISITS = 10_000
+# Internal sentinel returned by _signature_to_hashable_impl on failure.
+_FAILED_SIGNATURE = object()
+
+
+def _shallow_is_changed_signature(value):
+    """Reduce execution-time `is_changed` values through a fail-closed builtin canonicalizer.
+
+    Exact builtin primitives are returned unchanged. A value whose exact type
+    is not a builtin container becomes `Unhashable`. Containers are
+    canonicalized with a budget of 64 container visits; list and tuple results
+    are re-tagged as `"is_changed_list"` / `"is_changed_tuple"`, while dict,
+    set and frozenset results are returned as `_signature_to_hashable`
+    produced them.
+    """
+    value_type = type(value)
+    if value_type in _PRIMITIVE_SIGNATURE_TYPES:
+        return value
+
+    if value_type not in _CONTAINER_SIGNATURE_TYPES:
        return Unhashable()

+    canonical = _signature_to_hashable(value, max_nodes=64)
+    if type(canonical) is Unhashable:
+        return canonical
+    if value_type is list or value_type is tuple:
+        container_tag = "is_changed_list" if value_type is list else "is_changed_tuple"
+        # canonical is (tag, items) here: keep the items, swap in the is_changed tag.
+        return (container_tag, canonical[1])
+
+    return canonical
+
+
+def _primitive_signature_sort_key(obj):
+    """Build a deterministic ordering key for a primitive signature value.
+
+    The key is a tuple of strings: a fixed "primitive" tag, the defining
+    module and qualified name of the value's exact type, and the value's repr.
+    """
+    kind = type(obj)
+    type_module, type_name = kind.__module__, kind.__qualname__
+    return ("primitive", type_module, type_name, repr(obj))
+
+
+def _sanitized_sort_key(obj, depth=0, max_depth=_MAX_SIGNATURE_DEPTH, active=None, memo=None):
+    """Compute a deterministic ordering key for sanitized built-in container content.
+
+    Primitives map to `(module, qualname, repr)`, `Unhashable` maps to
+    `("UNHASHABLE",)`, any other non-container object maps to an `"OPAQUE"`
+    marker, and built-in containers map to a tagged tuple of their children's
+    keys. Reaching `max_depth` yields `("MAX_DEPTH",)` and a container that
+    is its own ancestor yields `("CYCLE",)`. `memo` caches finished
+    containers by id; `active` holds ids of containers being expanded.
+    """
+    if depth >= max_depth:
+        return ("MAX_DEPTH",)
+
+    active = set() if active is None else active
+    memo = {} if memo is None else memo
+
+    kind = type(obj)
+    if kind is Unhashable:
+        return ("UNHASHABLE",)
+    if kind in _PRIMITIVE_SIGNATURE_TYPES:
+        return (kind.__module__, kind.__qualname__, repr(obj))
+    if kind not in _CONTAINER_SIGNATURE_TYPES:
+        return (kind.__module__, kind.__qualname__, "OPAQUE")
+
+    identity = id(obj)
+    if identity in memo:
+        return memo[identity]
+    if identity in active:
+        return ("CYCLE",)
+
+    child_depth = depth + 1
+    active.add(identity)
+    try:
+        if kind is dict:
+            # Order dict entries by their (key, value) sort keys.
+            pair_keys = sorted(
+                (
+                    _sanitized_sort_key(k, child_depth, max_depth, active, memo),
+                    _sanitized_sort_key(v, child_depth, max_depth, active, memo),
+                )
+                for k, v in obj.items()
+            )
+            key = ("dict", tuple(pair_keys))
+        else:
+            child_keys = (_sanitized_sort_key(i, child_depth, max_depth, active, memo) for i in obj)
+            if kind is list:
+                key = ("list", tuple(child_keys))
+            elif kind is tuple:
+                key = ("tuple", tuple(child_keys))
+            elif kind is set:
+                key = ("set", tuple(sorted(child_keys)))
+            else:
+                key = ("frozenset", tuple(sorted(child_keys)))
+    finally:
+        active.discard(identity)
+
+    memo[identity] = key
+    return key
+
+
+def _signature_to_hashable_impl(obj, depth=0, max_depth=_MAX_SIGNATURE_DEPTH, active=None, memo=None, budget=None):
+    """Canonicalize signature inputs directly into their final hashable form.
+
+    Returns a `(value, sort_key)` pair on success: `value` is the hashable
+    canonical form and `sort_key` is a nested tuple of strings used to order
+    dict items and set members deterministically. Returns the
+    `_FAILED_SIGNATURE` sentinel when the depth limit is reached, when a
+    value is `Unhashable` or not an exact builtin primitive/container, when a
+    container is its own ancestor, when the container budget is exhausted,
+    when a dict key is not an exact primitive, or when ordering is ambiguous.
+
+    `active` holds ids of containers currently being expanded, `memo` caches
+    finished containers by id, and `budget` is a one-entry dict
+    (`{"remaining": n}`) shared by the whole recursion.
+    """
+    if depth >= max_depth:
+        return _FAILED_SIGNATURE
+
+    if active is None:
+        active = set()
+    if memo is None:
+        memo = {}
+    if budget is None:
+        budget = {"remaining": _MAX_SIGNATURE_CONTAINER_VISITS}
+
+    obj_type = type(obj)
+    if obj_type in _PRIMITIVE_SIGNATURE_TYPES:
+        return obj, _primitive_signature_sort_key(obj)
+    if obj_type is Unhashable or obj_type not in _CONTAINER_SIGNATURE_TYPES:
+        return _FAILED_SIGNATURE
+
+    obj_id = id(obj)
+    if obj_id in memo:
+        return memo[obj_id]
+    if obj_id in active:
+        # The container is one of its own ancestors.
+        return _FAILED_SIGNATURE
+
+    # Only containers that are neither memoized nor cyclic consume budget.
+    budget["remaining"] -= 1
+    if budget["remaining"] < 0:
+        return _FAILED_SIGNATURE
+
+    active.add(obj_id)
+    try:
+        if obj_type is dict:
+            # Snapshot the items before recursing so later mutation is not observed.
+            try:
+                items = list(obj.items())
+            except RuntimeError:
+                return _FAILED_SIGNATURE
+
+            ordered_items = []
+            for key, value in items:
+                if type(key) not in _PRIMITIVE_SIGNATURE_TYPES:
+                    return _FAILED_SIGNATURE
+                key_result = (key, _primitive_signature_sort_key(key))
+                value_result = _signature_to_hashable_impl(value, depth + 1, max_depth, active, memo, budget)
+                if value_result is _FAILED_SIGNATURE:
+                    return _FAILED_SIGNATURE
+                key_value, key_sort = key_result
+                value_value, value_sort = value_result
+                ordered_items.append((key_sort, value_sort, key_value, value_value))
+
+            ordered_items.sort(key=lambda item: (item[0], item[1]))
+            # Two keys sharing a sort key would make the ordering ambiguous.
+            for index in range(1, len(ordered_items)):
+                previous_key_sort = ordered_items[index - 1][0]
+                current_key_sort = ordered_items[index][0]
+                if previous_key_sort == current_key_sort:
+                    return _FAILED_SIGNATURE
+
+            value = ("dict", tuple((key_value, value_value) for _, _, key_value, value_value in ordered_items))
+            sort_key = ("dict", tuple((key_sort, value_sort) for key_sort, value_sort, _, _ in ordered_items))
+        elif obj_type is list or obj_type is tuple:
+            try:
+                items = list(obj)
+            except RuntimeError:
+                return _FAILED_SIGNATURE
+
+            # Sequences keep their original element order.
+            child_results = []
+            for item in items:
+                child_result = _signature_to_hashable_impl(item, depth + 1, max_depth, active, memo, budget)
+                if child_result is _FAILED_SIGNATURE:
+                    return _FAILED_SIGNATURE
+                child_results.append(child_result)
+
+            container_tag = "list" if obj_type is list else "tuple"
+            value = (container_tag, tuple(child for child, _ in child_results))
+            sort_key = (container_tag, tuple(child_sort for _, child_sort in child_results))
+        else:
+            # Remaining container types are set and frozenset.
+            try:
+                items = list(obj)
+            except RuntimeError:
+                return _FAILED_SIGNATURE
+
+            ordered_items = []
+            for item in items:
+                child_result = _signature_to_hashable_impl(item, depth + 1, max_depth, active, memo, budget)
+                if child_result is _FAILED_SIGNATURE:
+                    return _FAILED_SIGNATURE
+                child_value, child_sort = child_result
+                ordered_items.append((child_sort, child_value))
+
+            ordered_items.sort(key=lambda item: item[0])
+            # Members with equal sort keys but different values cannot be ordered reliably.
+            for index in range(1, len(ordered_items)):
+                previous_sort_key, previous_value = ordered_items[index - 1]
+                current_sort_key, current_value = ordered_items[index]
+                if previous_sort_key == current_sort_key and previous_value != current_value:
+                    return _FAILED_SIGNATURE
+
+            container_tag = "set" if obj_type is set else "frozenset"
+            value = (container_tag, tuple(child_value for _, child_value in ordered_items))
+            sort_key = (container_tag, tuple(child_sort for child_sort, _ in ordered_items))
+    finally:
+        active.discard(obj_id)
+
+    # Only successful results are memoized; every failure path returned above.
+    memo[obj_id] = (value, sort_key)
+    return memo[obj_id]
+
+
+def _signature_to_hashable(obj, max_nodes=_MAX_SIGNATURE_CONTAINER_VISITS):
+    """Produce the final cache-signature value for `obj`, failing closed.
+
+    Delegates to `_signature_to_hashable_impl` with a budget of `max_nodes`
+    container visits. A `RuntimeError` raised during traversal, or a failed
+    canonicalization, yields a fresh `Unhashable` instance; otherwise the
+    canonical value (without its sort key) is returned.
+    """
+    visit_budget = {"remaining": max_nodes}
+    try:
+        outcome = _signature_to_hashable_impl(obj, budget=visit_budget)
+    except RuntimeError:
+        outcome = _FAILED_SIGNATURE
+    if outcome is _FAILED_SIGNATURE:
+        return Unhashable()
+    # outcome is (canonical_value, sort_key); callers only need the value.
+    return outcome[0]
+
+
+def to_hashable(obj, max_nodes=_MAX_SIGNATURE_CONTAINER_VISITS):
+    """Convert sanitized prompt inputs into a stable hashable representation.
+
+    The input is expected to already be sanitized to plain built-in containers,
+    but this function still fails safe for anything unexpected. Traversal is
+    iterative and memoized so shared built-in substructures do not trigger
+    exponential re-walks during cache-key construction.
+
+    Exact primitives and `Unhashable` instances are returned unchanged.
+    Containers become tagged tuples such as `("list", items)` or
+    `("dict", pairs)`. The result is a fresh `Unhashable` when any nested
+    value cannot be canonicalized, when a container contains itself, or when
+    more than `max_nodes` containers would have to be expanded.
+    """
+    obj_type = type(obj)
+    if obj_type in _PRIMITIVE_SIGNATURE_TYPES or obj_type is Unhashable:
+        return obj
+    if obj_type not in _CONTAINER_SIGNATURE_TYPES:
+        return Unhashable()
+
+    # memo: id -> finished result; active: ids of containers awaiting finalization;
+    # snapshots: id -> item list captured when the container was first expanded.
+    memo = {}
+    active = set()
+    snapshots = {}
+    sort_memo = {}
+    processed = 0
+    # Keep traversal state separate from container snapshots/results.
+    # Stack entries are either (value, False) for a value still to visit, or
+    # ("EXPANDED", id, type) to finalize a container once its children are done.
+    work_stack = [(obj, False)]
+
+    def resolve_value(value):
+        """Resolve a child value from the completed memo table when available."""
+        value_type = type(value)
+        if value_type in _PRIMITIVE_SIGNATURE_TYPES or value_type is Unhashable:
+            return value
+        return memo.get(id(value), Unhashable())
+
+    def is_failed(value):
+        """Return whether a resolved child value represents failed canonicalization."""
+        return type(value) is Unhashable
+
+    def resolve_unordered_values(current_items, container_tag):
+        """Resolve a set-like container or fail closed if ordering is ambiguous."""
+        try:
+            ordered_items = [
+                (_sanitized_sort_key(item, memo=sort_memo), resolve_value(item))
+                for item in current_items
+            ]
+            if any(is_failed(value) for _, value in ordered_items):
+                return Unhashable()
+            ordered_items.sort(key=lambda item: item[0])
+        except RuntimeError:
+            return Unhashable()
+
+        # Equal sort keys with different values mean the order is not well defined.
+        for index in range(1, len(ordered_items)):
+            previous_key, previous_value = ordered_items[index - 1]
+            current_key, current_value = ordered_items[index]
+            if previous_key == current_key and previous_value != current_value:
+                return Unhashable()
+
+        return (container_tag, tuple(value for _, value in ordered_items))
+
+    while work_stack:
+        entry = work_stack.pop()
+        if len(entry) == 3:
+            # Finalization marker: the container itself is not carried on the stack.
+            _, current_id, current_type = entry
+            current = None
+            expanded = True
+        else:
+            current, expanded = entry
+            current_type = type(current)
+            current_id = id(current)
+
+        if not expanded and (current_type in _PRIMITIVE_SIGNATURE_TYPES or current_type is Unhashable):
+            continue
+        if not expanded and current_type not in _CONTAINER_SIGNATURE_TYPES:
+            memo[current_id] = Unhashable()
+            continue
+
+        # Already finished: a shared reference, or a container flagged as cyclic.
+        if current_id in memo:
+            continue
+
+        if expanded:
+            active.discard(current_id)
+            try:
+                items = snapshots.pop(current_id, None)
+                if items is None:
+                    memo[current_id] = Unhashable()
+                    continue
+
+                if current_type is dict:
+                    ordered_items = [
+                        (_sanitized_sort_key(k, memo=sort_memo), k, resolve_value(v))
+                        for k, v in items
+                    ]
+                    if any(type(key) not in _PRIMITIVE_SIGNATURE_TYPES or is_failed(value) for _, key, value in ordered_items):
+                        memo[current_id] = Unhashable()
+                        continue
+                    ordered_items.sort(key=lambda item: item[0])
+                    # Duplicate key sort keys make the dict ordering ambiguous.
+                    for index in range(1, len(ordered_items)):
+                        if ordered_items[index - 1][0] == ordered_items[index][0]:
+                            memo[current_id] = Unhashable()
+                            break
+                    else:
+                        memo[current_id] = (
+                            "dict",
+                            tuple((key, value) for _, key, value in ordered_items),
+                        )
+                elif current_type is list:
+                    resolved_items = tuple(resolve_value(item) for item in items)
+                    if any(is_failed(item) for item in resolved_items):
+                        memo[current_id] = Unhashable()
+                    else:
+                        memo[current_id] = ("list", resolved_items)
+                elif current_type is tuple:
+                    resolved_items = tuple(resolve_value(item) for item in items)
+                    if any(is_failed(item) for item in resolved_items):
+                        memo[current_id] = Unhashable()
+                    else:
+                        memo[current_id] = ("tuple", resolved_items)
+                elif current_type is set:
+                    memo[current_id] = resolve_unordered_values(items, "set")
+                else:
+                    memo[current_id] = resolve_unordered_values(items, "frozenset")
+            except RuntimeError:
+                memo[current_id] = Unhashable()
+            continue
+
+        # Reaching a container that is still awaiting finalization means a cycle.
+        if current_id in active:
+            memo[current_id] = Unhashable()
+            continue
+
+        # The budget counts containers that start expansion; exceeding it fails the whole input.
+        processed += 1
+        if processed > max_nodes:
+            return Unhashable()
+
+        active.add(current_id)
+        if current_type is dict:
+            try:
+                items = list(current.items())
+                snapshots[current_id] = items
+            except RuntimeError:
+                memo[current_id] = Unhashable()
+                active.discard(current_id)
+                continue
+            for key, value in items:
+                if type(key) not in _PRIMITIVE_SIGNATURE_TYPES:
+                    snapshots.pop(current_id, None)
+                    memo[current_id] = Unhashable()
+                    active.discard(current_id)
+                    break
+            else:
+                # Push the finalization marker first so it is popped after all children.
+                work_stack.append(("EXPANDED", current_id, current_type))
+                for _, value in reversed(items):
+                    work_stack.append((value, False))
+                continue
+            continue
+        else:
+            try:
+                items = list(current)
+                snapshots[current_id] = items
+            except RuntimeError:
+                memo[current_id] = Unhashable()
+                active.discard(current_id)
+                continue
+            work_stack.append(("EXPANDED", current_id, current_type))
+            for item in reversed(items):
+                work_stack.append((item, False))
+
+    return memo.get(id(obj), Unhashable())
+
 class CacheKeySetID(CacheKeySet):
+    """Cache-key strategy that keys nodes by node id and class type."""
    def __init__(self, dynprompt, node_ids, is_changed_cache):
+        """Initialize identity-based cache keys for the supplied dynamic prompt."""
        super().__init__(dynprompt, node_ids, is_changed_cache)
        self.dynprompt = dynprompt

    async def add_keys(self, node_ids):
+        """Populate identity-based keys for nodes that exist in the dynamic prompt."""
        for node_id in node_ids:
            if node_id in self.keys:
                continue
@ -80,15 +442,19 @@ class CacheKeySetID(CacheKeySet):
            self.subcache_keys[node_id] = (node_id, node["class_type"])

 class CacheKeySetInputSignature(CacheKeySet):
+    """Cache-key strategy that hashes a node's immediate inputs plus ancestor references."""
    def __init__(self, dynprompt, node_ids, is_changed_cache):
+        """Initialize input-signature-based cache keys for the supplied dynamic prompt."""
        super().__init__(dynprompt, node_ids, is_changed_cache)
        self.dynprompt = dynprompt
        self.is_changed_cache = is_changed_cache

    def include_node_id_in_input(self) -> bool:
+        """Return whether node ids should be included in computed input signatures.
+
+        This implementation always returns False.
+        """
        return False

    async def add_keys(self, node_ids):
+        """Populate input-signature-based keys for nodes in the dynamic prompt."""
        for node_id in node_ids:
            if node_id in self.keys:
                continue
@ -99,21 +465,37 @@ class CacheKeySetInputSignature(CacheKeySet):
            self.subcache_keys[node_id] = (node_id, node["class_type"])

    async def get_node_signature(self, dynprompt, node_id):
+        """Build the full cache signature for a node and its ordered ancestors.
+
+        Returns a tuple holding the node's immediate signature followed by one
+        immediate signature per ancestor, or the `Unhashable` instance produced
+        by the first fragment that could not be canonicalized.
+        """
        signature = []
        ancestors, order_mapping = self.get_ordered_ancestry(dynprompt, node_id)
-        signature.append(await self.get_immediate_node_signature(dynprompt, node_id, order_mapping))
+        immediate = await self.get_immediate_node_signature(dynprompt, node_id, order_mapping)
+        # Stop at the first failed fragment instead of building the rest.
+        if type(immediate) is Unhashable:
+            return immediate
+        signature.append(immediate)
        for ancestor_id in ancestors:
-            signature.append(await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping))
-        return to_hashable(signature)
+            immediate = await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping)
+            if type(immediate) is Unhashable:
+                return immediate
+            signature.append(immediate)
+        return tuple(signature)

    async def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping):
+        """Build the immediate cache-signature fragment for a node.
+
+        Link inputs are reduced to ancestor references here. Non-link values
+        are canonicalized or failed closed before being appended so the final
+        node signature is assembled from already-hashable fragments.
+        """
        if not dynprompt.has_node(node_id):
            # This node doesn't exist -- we can't cache it.
-            return [float("NaN")]
+            return Unhashable()
        node = dynprompt.get_node(node_id)
        class_type = node["class_type"]
        class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
-        signature = [class_type, await self.is_changed_cache.get(node_id)]
+        is_changed_signature = _shallow_is_changed_signature(await self.is_changed_cache.get(node_id))
+        if type(is_changed_signature) is Unhashable:
+            return is_changed_signature
+        signature = [class_type, is_changed_signature]
        if self.include_node_id_in_input() or (hasattr(class_def, "NOT_IDEMPOTENT") and class_def.NOT_IDEMPOTENT) or include_unique_id_in_input(class_type):
            signature.append(node_id)
        inputs = node["inputs"]
@ -123,18 +505,23 @@ class CacheKeySetInputSignature(CacheKeySet):
                ancestor_index = ancestor_order_mapping[ancestor_id]
                signature.append((key,("ANCESTOR", ancestor_index, ancestor_socket)))
            else:
-                signature.append((key, inputs[key]))
-        return signature
+                value_signature = to_hashable(inputs[key])
+                if type(value_signature) is Unhashable:
+                    return value_signature
+                signature.append((key, value_signature))
+        return tuple(signature)

    # This function returns a list of all ancestors of the given node. The order of the list is
    # deterministic based on which specific inputs the ancestor is connected by.
    def get_ordered_ancestry(self, dynprompt, node_id):
+        """Return ancestors in deterministic traversal order and their index mapping.
+
+        Both containers are created empty here and filled in place by
+        `get_ordered_ancestry_internal`.
+        """
        ancestors = []
        order_mapping = {}
        self.get_ordered_ancestry_internal(dynprompt, node_id, ancestors, order_mapping)
        return ancestors, order_mapping

    def get_ordered_ancestry_internal(self, dynprompt, node_id, ancestors, order_mapping):
+        """Recursively collect ancestors in input order without revisiting prior nodes."""
        if not dynprompt.has_node(node_id):
            return
        inputs = dynprompt.get_node(node_id)["inputs"]
--- a/comfy_execution/graph_utils.py
+++ b/comfy_execution/graph_utils.py
@ -1,11 +1,17 @@
 def is_link(obj):
-    if not isinstance(obj, list):
+    """Return whether obj is a plain prompt link of the form [node_id, output_index].
+
+    Only an exact `list` of length two qualifies, and only when its first item
+    is an exact `str` and its second item is an exact `int`. Subclasses of
+    those types (including `bool`) and `float` indices are rejected.
+    """
+    # Prompt links produced by the frontend / GraphBuilder are plain Python
+    # lists in the form [node_id, output_index]. Some custom-node paths can
+    # inject foreign runtime objects into prompt inputs during on-prompt graph
+    # rewriting or subgraph construction. Be strict here so cache signature
+    # building never tries to treat list-like proxy objects as links.
+    if type(obj) is not list:
        return False
    if len(obj) != 2:
        return False
-    if not isinstance(obj[0], str):
+    if type(obj[0]) is not str:
        return False
-    if not isinstance(obj[1], int) and not isinstance(obj[1], float):
+    if type(obj[1]) is not int:
        return False
    return True

--- a/nodes.py
+++ b/nodes.py
@ -728,50 +728,26 @@ class LoraLoaderModelOnly(LoraLoader):

 class VAELoader:
    video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5", "taeltx_2"]
-    image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
+    image_taes = ["taesd", "taesdxl", "taesd3", "taef1", "taef2"]
+
    @staticmethod
    def vae_list(s):
+        """Return the selectable VAE names.
+
+        The list starts with the files from the "vae" folder. Files from
+        "vae_approx" whose name begins with a video TAE prefix are appended
+        as-is. An image TAE name is appended only when both a
+        `<name>_encoder.*` and a `<name>_decoder.*` file exist. The final
+        entry is always "pixel_space".
+        """
        vaes = folder_paths.get_filename_list("vae")
        approx_vaes = folder_paths.get_filename_list("vae_approx")
-        sdxl_taesd_enc = False
-        sdxl_taesd_dec = False
-        sd1_taesd_enc = False
-        sd1_taesd_dec = False
-        sd3_taesd_enc = False
-        sd3_taesd_dec = False
-        f1_taesd_enc = False
-        f1_taesd_dec = False
-
+        have_img_encoder, have_img_decoder = set(), set()
        for v in approx_vaes:
-            if v.startswith("taesd_decoder."):
-                sd1_taesd_dec = True
-            elif v.startswith("taesd_encoder."):
-                sd1_taesd_enc = True
-            elif v.startswith("taesdxl_decoder."):
-                sdxl_taesd_dec = True
-            elif v.startswith("taesdxl_encoder."):
-                sdxl_taesd_enc = True
-            elif v.startswith("taesd3_decoder."):
-                sd3_taesd_dec = True
-            elif v.startswith("taesd3_encoder."):
-                sd3_taesd_enc = True
-            elif v.startswith("taef1_encoder."):
-                f1_taesd_dec = True
-            elif v.startswith("taef1_decoder."):
-                f1_taesd_enc = True
-            else:
+            # Image TAE files are named "<tae>_encoder.*" / "<tae>_decoder.*".
+            parts = v.split("_", 1)
+            if len(parts) != 2 or parts[0] not in s.image_taes:
                for tae in s.video_taes:
                    if v.startswith(tae):
                        vaes.append(v)
-
-        if sd1_taesd_dec and sd1_taesd_enc:
-            vaes.append("taesd")
-        if sdxl_taesd_dec and sdxl_taesd_enc:
-            vaes.append("taesdxl")
-        if sd3_taesd_dec and sd3_taesd_enc:
-            vaes.append("taesd3")
-        if f1_taesd_dec and f1_taesd_enc:
-            vaes.append("taef1")
+                        break
+                continue
+            if parts[1].startswith("encoder."):
+                have_img_encoder.add(parts[0])
+            elif parts[1].startswith("decoder."):
+                have_img_decoder.add(parts[0])
+        # Walk image_taes rather than the sets: set iteration order for strings
+        # changes between process launches, which would reshuffle the options.
+        vaes += [k for k in s.image_taes if k in have_img_decoder and k in have_img_encoder]
        vaes.append("pixel_space")
        return vaes

@ -827,6 +803,11 @@ class VAELoader:
            else:
                vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
            sd, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True)
+        if vae_name == "taef2":
+            if metadata is None:
+                metadata = {"tae_latent_channels": 128}
+            else:
+                metadata["tae_latent_channels"] = 128
        vae = comfy.sd.VAE(sd=sd, metadata=metadata)
        vae.throw_exception_if_invalid()
        return (vae,)
@ -2463,7 +2444,7 @@ async def init_builtin_extra_nodes():
        "nodes_curve.py",
        "nodes_rtdetr.py",
        "nodes_frame_interpolation.py",
-        "nodes_sam3.py"
+        "nodes_sam3.py",
    ]

    import_failed = []
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 comfyui-frontend-package==1.42.15
-comfyui-workflow-templates==0.9.63
+comfyui-workflow-templates==0.9.65
 comfyui-embedded-docs==0.4.4
 torch
 torchsde
@ -19,7 +19,7 @@ scipy
 tqdm
 psutil
 alembic
-SQLAlchemy>=2.0
+SQLAlchemy>=2.0.0
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
--- a/tests-unit/execution_test/caching_test.py
+++ b/tests-unit/execution_test/caching_test.py
@ -0,0 +1,473 @@
+"""Unit tests for cache-signature canonicalization hardening."""
+
+import asyncio
+import importlib
+import sys
+import types
+
+import pytest
+
+
+class _DummyNode:
+    """Bare-bones node class registered in tests so class-type lookups succeed."""
+
+    @staticmethod
+    def INPUT_TYPES():
+        """Describe a node that declares no required inputs."""
+        schema = {"required": {}}
+        return schema
+
+
+class _FakeDynPrompt:
+    """Dictionary-backed substitute for DynamicPrompt exposing `has_node` and `get_node`."""
+
+    def __init__(self, nodes_by_id):
+        """Keep a reference to the mapping of node id to node payload."""
+        self._nodes_by_id = nodes_by_id
+
+    def has_node(self, node_id):
+        """Report whether `node_id` is a key of the stored mapping."""
+        known_nodes = self._nodes_by_id
+        return node_id in known_nodes
+
+    def get_node(self, node_id):
+        """Look up the payload stored under `node_id`."""
+        known_nodes = self._nodes_by_id
+        return known_nodes[node_id]
+
+
+class _FakeIsChangedCache:
+    """Awaitable lookup table standing in for the `is_changed` cache."""
+
+    def __init__(self, values):
+        """Keep the mapping of node id to the `is_changed` value to report."""
+        self._values = values
+
+    async def get(self, node_id):
+        """Look up the canned `is_changed` value stored under `node_id`."""
+        canned = self._values
+        return canned[node_id]
+
+
+class _OpaqueValue:
+    """Non-builtin object that stays hashable, for driving fail-closed paths in unordered containers."""
+
+
+@pytest.fixture
+def caching_module(monkeypatch):
+    """Provide `comfy_execution.caching` imported against stubbed heavy dependencies.
+
+    Returns a `(caching_module, nodes_stub)` pair. The stubs are installed
+    through `monkeypatch`, so `sys.modules` is restored after each test.
+    """
+
+    class DynamicPrompt:
+        """Placeholder graph type so the caching module can import cleanly."""
+
+    nodes_module = types.ModuleType("nodes")
+    nodes_module.NODE_CLASS_MAPPINGS = {}
+    graph_module = types.ModuleType("comfy_execution.graph")
+    graph_module.DynamicPrompt = DynamicPrompt
+
+    stub_modules = {
+        "torch": types.ModuleType("torch"),
+        "psutil": types.ModuleType("psutil"),
+        "nodes": nodes_module,
+        "comfy_execution.graph": graph_module,
+    }
+    for module_name, stub in stub_modules.items():
+        monkeypatch.setitem(sys.modules, module_name, stub)
+    # Drop any cached copy so the import below binds to the stubs.
+    monkeypatch.delitem(sys.modules, "comfy_execution.caching", raising=False)
+
+    caching = importlib.import_module("comfy_execution.caching")
+    caching = importlib.reload(caching)
+    return caching, nodes_module
+
+
+def test_signature_to_hashable_handles_shared_builtin_substructures(caching_module):
+    """A list referenced twice must canonicalize to two equal, fully expanded entries."""
+    caching = caching_module[0]
+    reused = [{"value": 1}, {"value": 2}]
+
+    outer_tag, outer_items = caching._signature_to_hashable([reused, reused])
+    first_copy, second_copy = outer_items
+    inner_tag, inner_items = first_copy
+
+    assert outer_tag == "list"
+    assert first_copy == second_copy
+    assert inner_tag == "list"
+    assert inner_items[0] == ("dict", (("value", 1),))
+    assert inner_items[1] == ("dict", (("value", 2),))
+
+
+def test_signature_to_hashable_fails_closed_on_opaque_values(caching_module):
+    """A single opaque element must turn the whole signature into Unhashable."""
+    caching = caching_module[0]
+    payload = ["safe", object()]
+
+    result = caching._signature_to_hashable(payload)
+
+    assert isinstance(result, caching.Unhashable)
+
+
+def test_signature_to_hashable_stops_descending_after_failure(caching_module, monkeypatch):
+    """After the first failing element, later siblings must not be traversed."""
+    caching = caching_module[0]
+    real_impl = caching._signature_to_hashable_impl
+    sentinel = object()
+    sentinel_visits = []
+
+    def spy(obj, *args, **kwargs):
+        """Record any visit to the sentinel, then defer to the real implementation."""
+        if obj is sentinel:
+            sentinel_visits.append(obj)
+        return real_impl(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_signature_to_hashable_impl", spy)
+
+    result = caching._signature_to_hashable([object(), [sentinel]])
+
+    assert isinstance(result, caching.Unhashable)
+    assert not sentinel_visits
+
+
+def test_signature_to_hashable_snapshots_list_before_recursing(caching_module, monkeypatch):
+    """A list mutated mid-traversal must still be canonicalized from its initial contents."""
+    caching = caching_module[0]
+    real_impl = caching._signature_to_hashable_impl
+    trigger = ("marker",)
+    live_list = [trigger, 2]
+
+    def mutate_then_delegate(obj, *args, **kwargs):
+        """Overwrite the second element when the trigger is visited, then delegate."""
+        if obj is trigger:
+            live_list[1] = 3
+        return real_impl(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_signature_to_hashable_impl", mutate_then_delegate)
+
+    result = caching._signature_to_hashable(live_list)
+
+    expected = ("list", (("tuple", ("marker",)), 2))
+    assert result == expected
+    assert live_list[1] == 3
+
+
+def test_signature_to_hashable_snapshots_dict_before_recursing(caching_module, monkeypatch):
+    """A dict mutated mid-traversal must still be canonicalized from its initial contents."""
+    caching = caching_module[0]
+    real_impl = caching._signature_to_hashable_impl
+    trigger = ("marker",)
+    live_dict = {"first": trigger, "second": 2}
+
+    def mutate_then_delegate(obj, *args, **kwargs):
+        """Overwrite the "second" entry when the trigger is visited, then delegate."""
+        if obj is trigger:
+            live_dict["second"] = 3
+        return real_impl(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_signature_to_hashable_impl", mutate_then_delegate)
+
+    result = caching._signature_to_hashable(live_dict)
+
+    expected = ("dict", (("first", ("tuple", ("marker",))), ("second", 2)))
+    assert result == expected
+    assert live_dict["second"] == 3
+
+
+@pytest.mark.parametrize(
+    "container_factory",
+    [
+        lambda marker: [marker],
+        lambda marker: (marker,),
+        lambda marker: {marker},
+        lambda marker: frozenset({marker}),
+        lambda marker: {"key": marker},
+    ],
+)
+def test_signature_to_hashable_fails_closed_on_runtimeerror(caching_module, monkeypatch, container_factory):
+    """Traversal RuntimeError should degrade canonicalization to Unhashable."""
+    caching, _ = caching_module
+    original = caching._signature_to_hashable_impl
+    marker = object()
+
+    def raising_canonicalize(obj, *args, **kwargs):
+        """Raise a traversal RuntimeError for the marker value and delegate otherwise."""
+        if obj is marker:
+            raise RuntimeError("container changed during iteration")
+        return original(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_signature_to_hashable_impl", raising_canonicalize)
+
+    signature = caching._signature_to_hashable(container_factory(marker))
+
+    assert isinstance(signature, caching.Unhashable)
+
+
+def test_to_hashable_handles_shared_builtin_substructures(caching_module):
+    """The legacy helper should still hash sanitized built-ins stably when used directly."""
+    caching, _ = caching_module
+    shared = [{"value": 1}, {"value": 2}]
+
+    sanitized = [shared, shared]
+    hashable = caching.to_hashable(sanitized)
+
+    assert hashable[0] == "list"
+    assert hashable[1][0] == hashable[1][1]
+    assert hashable[1][0][0] == "list"
+
+
+def test_to_hashable_uses_parent_snapshot_during_expanded_phase(caching_module, monkeypatch):
+    """Expanded-phase assembly should not reread a live parent container after snapshotting."""
+    caching, _ = caching_module
+    original_sort_key = caching._sanitized_sort_key
+    outer = [{"marker"}, 2]
+
+    def mutating_sort_key(obj, *args, **kwargs):
+        """Mutate the live parent while a child container is being canonicalized."""
+        if obj == "marker":
+            outer[1] = 3
+        return original_sort_key(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_sanitized_sort_key", mutating_sort_key)
+
+    hashable = caching.to_hashable(outer)
+
+    assert hashable == ("list", (("set", ("marker",)), 2))
+    assert outer[1] == 3
+
+
+def test_to_hashable_fails_closed_for_ordered_container_with_opaque_child(caching_module):
+    """Ordered containers should fail closed when a child cannot be canonicalized."""
+    caching, _ = caching_module
+
+    result = caching.to_hashable([object()])
+
+    assert isinstance(result, caching.Unhashable)
+
+
+def test_to_hashable_canonicalizes_dict_insertion_order(caching_module):
+    """Dicts with the same content should hash identically regardless of insertion order."""
+    caching, _ = caching_module
+
+    first = {"b": 2, "a": 1}
+    second = {"a": 1, "b": 2}
+
+    assert caching.to_hashable(first) == ("dict", (("a", 1), ("b", 2)))
+    assert caching.to_hashable(first) == caching.to_hashable(second)
+
+
+def test_to_hashable_fails_closed_for_opaque_dict_key(caching_module):
+    """Opaque dict keys should fail closed instead of being traversed during hashing."""
+    caching, _ = caching_module
+
+    hashable = caching.to_hashable({_OpaqueValue(): 1})
+
+    assert isinstance(hashable, caching.Unhashable)
+
+
+@pytest.mark.parametrize(
+    "container_factory",
+    [
+        set,
+        frozenset,
+    ],
+)
+def test_to_hashable_fails_closed_on_runtimeerror(caching_module, monkeypatch, container_factory):
+    """Traversal RuntimeError should degrade unordered hash conversion to Unhashable."""
+    caching, _ = caching_module
+
+    def raising_sort_key(obj, *args, **kwargs):
+        """Raise a traversal RuntimeError while unordered values are canonicalized."""
+        raise RuntimeError("container changed during iteration")
+
+    monkeypatch.setattr(caching, "_sanitized_sort_key", raising_sort_key)
+
+    hashable = caching.to_hashable(container_factory({"value"}))
+
+    assert isinstance(hashable, caching.Unhashable)
+
+
+def test_to_hashable_fails_closed_for_ambiguous_dict_ordering(caching_module, monkeypatch):
+    """Ambiguous dict key ordering should fail closed instead of using insertion order."""
+    caching, _ = caching_module
+    original_sort_key = caching._sanitized_sort_key
+    ambiguous = {"a": 1, "b": 1}
+
+    def colliding_sort_key(obj, *args, **kwargs):
+        """Force two distinct primitive keys to share the same ordering key."""
+        if obj == "a" or obj == "b":
+            return ("COLLIDE",)
+        return original_sort_key(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_sanitized_sort_key", colliding_sort_key)
+
+    hashable = caching.to_hashable(ambiguous)
+
+    assert isinstance(hashable, caching.Unhashable)
+
+
+def test_signature_to_hashable_fails_closed_for_ambiguous_dict_ordering(caching_module, monkeypatch):
+    """Ambiguous dict sort ties should fail closed instead of depending on input order."""
+    caching, _ = caching_module
+    original_sort_key = caching._primitive_signature_sort_key
+    ambiguous = {"a": 1, "b": 1}
+
+    def colliding_sort_key(obj):
+        """Force two distinct primitive keys to share the same ordering key."""
+        if obj == "a" or obj == "b":
+            return ("COLLIDE",)
+        return original_sort_key(obj)
+
+    monkeypatch.setattr(caching, "_primitive_signature_sort_key", colliding_sort_key)
+
+    sanitized = caching._signature_to_hashable(ambiguous)
+
+    assert isinstance(sanitized, caching.Unhashable)
+
+
+def test_signature_to_hashable_fails_closed_for_opaque_dict_key(caching_module):
+    """Opaque dict keys should fail closed instead of being recursively canonicalized."""
+    caching, _ = caching_module
+
+    sanitized = caching._signature_to_hashable({_OpaqueValue(): 1})
+
+    assert isinstance(sanitized, caching.Unhashable)
+
+
+def test_signature_to_hashable_fails_closed_on_dict_key_sort_collisions_even_with_distinct_values(caching_module, monkeypatch):
+    """Different values must not mask dict key-sort collisions during canonicalization."""
+    caching, _ = caching_module
+    original_sort_key = caching._primitive_signature_sort_key
+
+    def colliding_sort_key(obj):
+        """Force two distinct primitive keys to share the same ordering key."""
+        if obj == "a" or obj == "b":
+            return ("COLLIDE",)
+        return original_sort_key(obj)
+
+    monkeypatch.setattr(caching, "_primitive_signature_sort_key", colliding_sort_key)
+
+    sanitized = caching._signature_to_hashable({"a": 1, "b": 2})
+
+    assert isinstance(sanitized, caching.Unhashable)
+
+
+@pytest.mark.parametrize(
+    "container_factory",
+    [
+        set,
+        frozenset,
+    ],
+)
+def test_to_hashable_fails_closed_for_ambiguous_unordered_values(caching_module, monkeypatch, container_factory):
+    """Ambiguous unordered values should fail closed instead of depending on iteration order."""
+    caching, _ = caching_module
+    original_sort_key = caching._sanitized_sort_key
+    container = container_factory({"a", "b"})
+
+    def colliding_sort_key(obj, *args, **kwargs):
+        """Force two distinct primitive values to share the same ordering key."""
+        if obj == "a" or obj == "b":
+            return ("COLLIDE",)
+        return original_sort_key(obj, *args, **kwargs)
+
+    monkeypatch.setattr(caching, "_sanitized_sort_key", colliding_sort_key)
+
+    hashable = caching.to_hashable(container)
+
+    assert isinstance(hashable, caching.Unhashable)
+
+
+def test_get_node_signature_returns_top_level_unhashable_for_tainted_signature(caching_module, monkeypatch):
+    """Tainted full signatures should fail closed before `to_hashable()` runs."""
+    caching, nodes_module = caching_module
+    monkeypatch.setitem(nodes_module.NODE_CLASS_MAPPINGS, "UnitTestNode", _DummyNode)
+    monkeypatch.setattr(
+        caching,
+        "to_hashable",
+        lambda *_args, **_kwargs: pytest.fail("to_hashable should not run for tainted signatures"),
+    )
+
+    is_changed_value = []
+    is_changed_value.append(is_changed_value)
+
+    dynprompt = _FakeDynPrompt(
+        {
+            "node": {
+                "class_type": "UnitTestNode",
+                "inputs": {"value": 5},
+            }
+        }
+    )
+    key_set = caching.CacheKeySetInputSignature(
+        dynprompt,
+        ["node"],
+        _FakeIsChangedCache({"node": is_changed_value}),
+    )
+
+    signature = asyncio.run(key_set.get_node_signature(dynprompt, "node"))
+
+    assert isinstance(signature, caching.Unhashable)
+
+
+def test_shallow_is_changed_signature_accepts_primitive_lists(caching_module):
+    """Primitive-only `is_changed` lists should stay hashable without deep descent."""
+    caching, _ = caching_module
+
+    sanitized = caching._shallow_is_changed_signature([1, "two", None, True])
+
+    assert sanitized == ("is_changed_list", (1, "two", None, True))
+
+
+def test_shallow_is_changed_signature_accepts_structured_builtin_fingerprint_lists(caching_module):
+    """Structured built-in `is_changed` fingerprints should remain representable."""
+    caching, _ = caching_module
+
+    sanitized = caching._shallow_is_changed_signature([("seed", 42), {"cfg": 8}])
+
+    assert sanitized == (
+        "is_changed_list",
+        (
+            ("tuple", ("seed", 42)),
+            ("dict", (("cfg", 8),)),
+        ),
+    )
+
+
+def test_shallow_is_changed_signature_fails_closed_for_opaque_payload(caching_module):
+    """Opaque `is_changed` payloads should still fail closed."""
+    caching, _ = caching_module
+
+    sanitized = caching._shallow_is_changed_signature([_OpaqueValue()])
+
+    assert isinstance(sanitized, caching.Unhashable)
+
+
+def test_get_immediate_node_signature_fails_closed_for_unhashable_is_changed(caching_module, monkeypatch):
+    """Recursive `is_changed` payloads should fail the full fragment closed."""
+    caching, nodes_module = caching_module
+    monkeypatch.setitem(nodes_module.NODE_CLASS_MAPPINGS, "UnitTestNode", _DummyNode)
+
+    is_changed_value = []
+    is_changed_value.append(is_changed_value)
+    dynprompt = _FakeDynPrompt(
+        {
+            "node": {
+                "class_type": "UnitTestNode",
+                "inputs": {"value": 5},
+            }
+        }
+    )
+    key_set = caching.CacheKeySetInputSignature(
+        dynprompt,
+        ["node"],
+        _FakeIsChangedCache({"node": is_changed_value}),
+    )
+
+    signature = asyncio.run(key_set.get_immediate_node_signature(dynprompt, "node", {}))
+
+    assert isinstance(signature, caching.Unhashable)
+
+
+def test_get_immediate_node_signature_fails_closed_for_missing_node(caching_module):
+    """Missing nodes should return the fail-closed sentinel instead of a NaN tuple."""
+    caching, _ = caching_module
+    dynprompt = _FakeDynPrompt({})
+    key_set = caching.CacheKeySetInputSignature(
+        dynprompt,
+        [],
+        _FakeIsChangedCache({}),
+    )
+
+    signature = asyncio.run(key_set.get_immediate_node_signature(dynprompt, "missing", {}))
+
+    assert isinstance(signature, caching.Unhashable)
--- a/tests/execution/test_caching.py
+++ b/tests/execution/test_caching.py
@ -0,0 +1,242 @@
+import asyncio
+
+from comfy_execution import caching
+
+
+class _StubDynPrompt:
+    def __init__(self, nodes):
+        self._nodes = nodes
+
+    def has_node(self, node_id):
+        return node_id in self._nodes
+
+    def get_node(self, node_id):
+        return self._nodes[node_id]
+
+
+class _StubIsChangedCache:
+    async def get(self, node_id):
+        return None
+
+
+class _StubNode:
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {"required": {}}
+
+
+def test_shallow_is_changed_signature_keeps_primitive_only_list_shallow():
+    assert caching._shallow_is_changed_signature([1, "two", None, True]) == (
+        "is_changed_list",
+        (1, "two", None, True),
+    )
+
+
+def test_shallow_is_changed_signature_keeps_primitive_only_tuple_shallow():
+    assert caching._shallow_is_changed_signature((1, "two", None, True)) == (
+        "is_changed_tuple",
+        (1, "two", None, True),
+    )
+
+
+def test_shallow_is_changed_signature_keeps_structured_builtin_fingerprint_list():
+    assert caching._shallow_is_changed_signature([("seed", 42), {"cfg": 8}]) == (
+        "is_changed_list",
+        (
+            ("tuple", ("seed", 42)),
+            ("dict", (("cfg", 8),)),
+        ),
+    )
+
+
+def test_shallow_is_changed_signature_does_not_use_to_hashable(monkeypatch):
+    monkeypatch.setattr(
+        caching,
+        "to_hashable",
+        lambda *_args, **_kwargs: (_ for _ in ()).throw(
+            AssertionError("is_changed signature must not deep-canonicalize")
+        ),
+    )
+
+    signature = caching._shallow_is_changed_signature([("seed", 42), {"cfg": 8}])
+
+    assert signature == (
+        "is_changed_list",
+        (
+            ("tuple", ("seed", 42)),
+            ("dict", (("cfg", 8),)),
+        ),
+    )
+
+
+def test_get_immediate_node_signature_canonicalizes_non_link_inputs(monkeypatch):
+    live_value = [1, {"nested": [2, 3]}]
+    dynprompt = _StubDynPrompt(
+        {
+            "1": {
+                "class_type": "TestCacheNode",
+                "inputs": {"value": live_value},
+            }
+        }
+    )
+
+    monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
+    monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
+
+    keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
+    signature = asyncio.run(keyset.get_immediate_node_signature(dynprompt, "1", {}))
+
+    assert signature == (
+        "TestCacheNode",
+        None,
+        ("value", ("list", (1, ("dict", (("nested", ("list", (2, 3))),))))),
+    )
+
+
+def test_to_hashable_walks_dicts_without_rebinding_traversal_stack():
+    live_value = {
+        "outer": {"nested": [2, 3]},
+        "items": [{"leaf": 4}],
+    }
+
+    assert caching.to_hashable(live_value) == (
+        "dict",
+        (
+            ("items", ("list", (("dict", (("leaf", 4),)),))),
+            ("outer", ("dict", (("nested", ("list", (2, 3))),))),
+        ),
+    )
+
+
+def test_get_immediate_node_signature_fails_closed_for_opaque_non_link_input(monkeypatch):
+    class OpaqueRuntimeValue:
+        pass
+
+    live_value = OpaqueRuntimeValue()
+    dynprompt = _StubDynPrompt(
+        {
+            "1": {
+                "class_type": "TestCacheNode",
+                "inputs": {"value": live_value},
+            }
+        }
+    )
+
+    monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
+    monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
+
+    keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
+    signature = asyncio.run(keyset.get_immediate_node_signature(dynprompt, "1", {}))
+
+    assert isinstance(signature, caching.Unhashable)
+
+
+def test_get_node_signature_propagates_unhashable_immediate_fragment(monkeypatch):
+    class OpaqueRuntimeValue:
+        pass
+
+    dynprompt = _StubDynPrompt(
+        {
+            "1": {
+                "class_type": "TestCacheNode",
+                "inputs": {"value": OpaqueRuntimeValue()},
+            }
+        }
+    )
+
+    monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
+    monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
+
+    keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
+    signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
+
+    assert isinstance(signature, caching.Unhashable)
+
+
+def test_get_node_signature_never_visits_raw_non_link_input(monkeypatch):
+    live_value = [1, 2, 3]
+    dynprompt = _StubDynPrompt(
+        {
+            "1": {
+                "class_type": "TestCacheNode",
+                "inputs": {"value": live_value},
+            }
+        }
+    )
+
+    monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
+    monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
+    monkeypatch.setattr(
+        caching,
+        "_signature_to_hashable",
+        lambda *_args, **_kwargs: (_ for _ in ()).throw(
+            AssertionError("outer signature canonicalizer should not run")
+        ),
+    )
+
+    keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
+    signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
+
+    assert isinstance(signature, tuple)
+
+
+def test_get_node_signature_keeps_deep_canonicalized_input_fragment(monkeypatch):
+    live_value = 1
+    for _ in range(8):
+        live_value = [live_value]
+    expected = caching.to_hashable(live_value)
+
+    dynprompt = _StubDynPrompt(
+        {
+            "1": {
+                "class_type": "TestCacheNode",
+                "inputs": {"value": live_value},
+            }
+        }
+    )
+
+    monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
+    monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
+
+    keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
+    signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
+
+    assert isinstance(signature, tuple)
+    assert signature[0][2][0] == "value"
+    assert signature[0][2][1] == expected
+
+
+def test_get_node_signature_keeps_large_precanonicalized_fragment(monkeypatch):
+    live_value = object()
+    canonical_fragment = ("tuple", tuple(("list", (index, index + 1)) for index in range(256)))
+    dynprompt = _StubDynPrompt(
+        {
+            "1": {
+                "class_type": "TestCacheNode",
+                "inputs": {"value": live_value},
+            }
+        }
+    )
+
+    monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
+    monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
+    monkeypatch.setattr(
+        caching,
+        "to_hashable",
+        lambda value, max_nodes=caching._MAX_SIGNATURE_CONTAINER_VISITS: (
+            canonical_fragment if value is live_value else caching.Unhashable()
+        ),
+    )
+    monkeypatch.setattr(
+        caching,
+        "_signature_to_hashable",
+        lambda *_args, **_kwargs: (_ for _ in ()).throw(
+            AssertionError("outer signature canonicalizer should not run")
+        ),
+    )
+
+    keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
+    signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
+
+    assert isinstance(signature, tuple)
+    assert signature[0][2] == ("value", canonical_fragment)
Author	SHA1	Message	Date
xmarre	f9d53e5d75	Merge `6d4f9e86ab` into `96f1cee9f5`	2026-05-01 11:01:11 +03:00
Alexander Piskun	96f1cee9f5	chore(api-nodes): always display the custom width and height in GPTImage2 node (#13651 ) Signed-off-by: bigcat88 <bigcat88@icloud.com>	2026-04-30 23:15:11 -07:00
Jedrzej Kosinski	97f58baaaf	Add alexisrolland and rattus128 as code owners (#13648 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Build package / Build Test (3.10) (push) Waiting to run Details Build package / Build Test (3.11) (push) Waiting to run Details Build package / Build Test (3.12) (push) Waiting to run Details Build package / Build Test (3.13) (push) Waiting to run Details Build package / Build Test (3.14) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details	2026-04-30 21:49:31 -04:00
Daxiong (Lin)	e8e8fee224	chore: update workflow templates to v0.9.65 (#13644 )	2026-04-30 18:14:28 -07:00
Rainer	e9c311b245	OneTainer ERNIE LoRA support (#13640 )	2026-04-30 19:33:41 -04:00
comfyanonymous	e6e0936128	Load other jpeg formats without taking so much memory. (#13642 )	2026-04-30 19:33:09 -04:00
Alexander Piskun	b633244635	[Partner Nodes] ByteDance: virtual portrait library for regular images (#13638 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details Build package / Build Test (3.10) (push) Waiting to run Details Build package / Build Test (3.11) (push) Waiting to run Details Build package / Build Test (3.12) (push) Waiting to run Details Build package / Build Test (3.13) (push) Waiting to run Details Build package / Build Test (3.14) (push) Waiting to run Details * feat(api-nodes-bytedance): use the virtual portrait library for regular images Signed-off-by: bigcat88 <bigcat88@icloud.com> * fix: include shape in image dedup hash Signed-off-by: bigcat88 <bigcat88@icloud.com> --------- Signed-off-by: bigcat88 <bigcat88@icloud.com>	2026-04-30 11:49:08 -07:00
Alexander Piskun	38ecad8f8a	feat(api-nodes): allow custom resolutions for GPTImage2 node (#13631 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details Signed-off-by: bigcat88 <bigcat88@icloud.com>	2026-04-30 01:09:33 -07:00
Jedrzej Kosinski	a7d82baa06	Fix SQLAlchemy version format in requirements.txt (#13547 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Build package / Build Test (3.11) (push) Waiting to run Details Build package / Build Test (3.10) (push) Waiting to run Details Build package / Build Test (3.12) (push) Waiting to run Details Build package / Build Test (3.13) (push) Waiting to run Details Build package / Build Test (3.14) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details Change SQLAlchemy>=2.0 to SQLAlchemy>=2.0.0 to satisfy the X.Y.Z version format expected by install_util.is_valid_version().	2026-04-29 23:30:01 -04:00
comfyanonymous	d10fc2d652	Lower peak mem usage for 8 bit formats with pyav. (#13626 )	2026-04-29 23:05:31 -04:00
blepping	a164c82913	Add high quality preview support for Flux2 latents (#13496 )	2026-04-29 19:37:30 -04:00
Talmaj	5eeae3f1d8	Cogvideox (#13402 ) --------- Co-authored-by: kijai <40791699+kijai@users.noreply.github.com> Co-authored-by: Talmaj Marinc <talmaj@comfy.org>	2026-04-29 19:30:08 -04:00
Jukka Seppänen	0e25a6936e	Reduce video tiny VAE peak VRAM and decode time (CORE-127) (#13617 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details * Update taehv.py * Simplify * Simplify pixel_unshuffle dispatch	2026-04-29 12:15:10 -07:00
xmarre	6d4f9e86ab	Merge branch 'Comfy-Org:master' into master	2026-04-18 09:20:41 +02:00
xmarre	1548aee40e	Merge pull request #3 from xmarre/codex/vae-encode-tiled-admission-fix Fix 2D tiled VAE encode memory admission estimation	2026-04-16 12:59:16 +02:00
xmarre	9c210473fc	Fix tiled VAE encode memory admission estimate	2026-04-16 12:49:49 +02:00
xmarre	c1e9164c63	Merge branch 'master' into master	2026-04-16 10:07:30 +02:00
xmarre	5e9a90186f	Merge branch 'master' into master	2026-04-14 20:11:29 +02:00
xmarre	ece906328a	Merge branch 'master' into master	2026-04-02 18:55:32 +02:00
xmarre	500ca8e02a	Merge branch 'Comfy-Org:master' into master	2026-03-25 17:45:49 +01:00
xmarre	3143b7981f	Merge branch 'Comfy-Org:master' into master	2026-03-23 02:13:08 +01:00
xmarre	c9b3f81e83	Merge branch 'master' into master	2026-03-18 14:06:06 +01:00
xmarre	5e74e9b3ed	Merge pull request #1 from xmarre/codex/fix-cache-signature-shallow-check Enforce shallow is_changed signature handling	2026-03-18 13:29:34 +01:00
xmarre	c702cddf75	Fix shallow is_changed logic	2026-03-18 13:15:04 +01:00
xmarre	e13da8104c	Fix shallow is_changed handling	2026-03-18 12:26:30 +01:00
xmarre	fdcc38b9ea	Return Unhashable on missing node	2026-03-17 07:48:14 +01:00
xmarre	c1ce00287c	Stop requeueing live containers	2026-03-16 19:21:24 +01:00
xmarre	6e3bd33665	Prevent dict key canonicalization	2026-03-16 17:06:09 +01:00
xmarre	ce05e377a8	Stop canonicalizing dict keys	2026-03-16 16:48:42 +01:00
xmarre	1a00f7743f	Stop traversing dict keys	2026-03-16 16:10:01 +01:00
xmarre	a6472b1514	Fix to_hashable traversal stack handling	2026-03-16 15:34:15 +01:00
xmarre	6158cd5820	Prevent redundant signature rewalk	2026-03-16 13:31:02 +01:00
xmarre	bff714dda0	Fix non-link input cache signature	2026-03-16 10:13:04 +01:00
xmarre	fce22da313	Prevent signature traversal of raw	2026-03-16 09:29:00 +01:00
xmarre	9f9d37bd9a	Merge branch 'master' into master	2026-03-16 09:07:29 +01:00
xmarre	088778c35d	Stop canonicalizing is_changed	2026-03-15 17:06:20 +01:00
xmarre	4c5f82971e	Restrict is_changed canonicalization	2026-03-15 16:44:25 +01:00
xmarre	f1d91a4c8c	Prevent canonicalizing is_changed	2026-03-15 16:14:23 +01:00
xmarre	dbed5a1b52	Replace sanitize and hash passes	2026-03-15 07:39:10 +01:00
xmarre	24fdbb9aca	Replace sanitize hash two pass	2026-03-15 07:30:18 +01:00
xmarre	a6624a9afd	Unify signature sanitize and hash	2026-03-15 07:09:24 +01:00
xmarre	0b512198e8	Adopt single-pass signature hashing	2026-03-15 05:41:39 +01:00
xmarre	9feb26928c	Change signature cache to bail early	2026-03-15 04:31:32 +01:00
xmarre	fadd79ad48	Fix nondeterministic set signing	2026-03-15 03:29:59 +01:00
xmarre	77bc7bdd6b	Merge branch 'master' of https://github.com/xmarre/ComfyUI	2026-03-15 02:56:09 +01:00
xmarre	117afbc1d7	Add docstrings and harden signature	2026-03-15 02:55:39 +01:00
xmarre	064eec2278	Merge branch 'master' into master	2026-03-15 02:32:56 +01:00
xmarre	aceaa5e579	fail closed on ambiguous container ordering in cache signatures	2026-03-15 02:32:25 +01:00
xmarre	763089f681	Merge branch 'master' into master	2026-03-15 01:48:10 +01:00
xmarre	1693dabc8f	Merge branch 'master' into master	2026-03-15 00:28:34 +01:00
xmarre	08063d2638	Merge branch 'Comfy-Org:master' into master	2026-03-14 23:38:46 +01:00
xmarre	e069617e54	Merge branch 'Comfy-Org:master' into master	2026-03-14 21:27:17 +01:00
xmarre	2bea0ee5d7	Simplify Unhashable sentinel implementation	2026-03-14 12:42:04 +01:00
xmarre	17863f603a	Add comprehensive docstrings for cache key helpers	2026-03-14 12:26:27 +01:00
xmarre	31ba844624	Add cycle detection to signature input sanitization	2026-03-14 12:04:31 +01:00
xmarre	1451001f64	Add docstrings for cache signature hardening helpers	2026-03-14 10:57:45 +01:00
xmarre	1af99b2e81	Update caching hash recursion	2026-03-14 10:31:07 +01:00
xmarre	3568b82b76	Revert "Add missing docstrings" This reverts commit `4b431ffc27`.	2026-03-14 10:11:35 +01:00
xmarre	6728d4d439	Revert "Harden to_hashable against cycles" This reverts commit `880b51ac4f`.	2026-03-14 10:11:04 +01:00
xmarre	4b431ffc27	Add missing docstrings	2026-03-14 09:57:22 +01:00
xmarre	880b51ac4f	Harden to_hashable against cycles	2026-03-14 09:46:27 +01:00
xmarre	4d9516b909	Fix caching sanitization logic	2026-03-14 07:06:39 +01:00
xmarre	39086890e2	Fix sanitize_signature_input	2026-03-14 06:56:49 +01:00
xmarre	2adde5a0e1	Keep container types in sanitizer	2026-03-14 06:36:06 +01:00
xmarre	0c1bfad0df	Merge branch 'Comfy-Org:master' into master	2026-03-14 06:13:25 +01:00
xmarre	7d76a4447e	Sanitize execution cache inputs	2026-03-14 02:36:40 +01:00