Merge f76e3a11b5 into 85fc35e8fa

Fix mac issue. (#12250 )
llama: cast logits as a comfy-weight (#12248 )
2026-02-06 11:32:31 +08:00 · 2026-02-03 19:22:17 +02:00 · 2026-02-03 12:19:39 -05:00 · 2026-02-03 11:31:36 -05:00 · 2026-02-03 11:07:04 -05:00 · 2026-02-03 04:09:30 -05:00
22 changed files with 4156 additions and 14 deletions
--- a/comfy/image_encoders/dino3.py
+++ b/comfy/image_encoders/dino3.py
@ -0,0 +1,240 @@
+import math
+import torch
+import torch.nn as nn
+
+from comfy.ldm.modules.attention import optimized_attention_for_device
+from comfy.ldm.flux.math import apply_rope
+from dino2 import Dinov2MLP as DINOv3ViTMLP, LayerScale as DINOv3ViTLayerScale
+
+class DINOv3ViTAttention(nn.Module):
+    """Multi-head self-attention with rotary position embeddings for DINOv3 ViT.
+
+    The query, value and output projections carry a bias; the key projection
+    does not.
+    """
+
+    def __init__(self, hidden_size, num_attention_heads, device, dtype, operations):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.num_heads = num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scaling = self.head_dim**-0.5
+        self.is_causal = False
+
+        self.k_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=False, device=device, dtype=dtype) # key_bias = False
+        self.v_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
+
+        self.q_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
+        self.o_proj = operations.Linear(self.embed_dim, self.embed_dim, bias=True, device=device, dtype=dtype)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Run self-attention over ``hidden_states`` of shape (batch, tokens, embed_dim).
+
+        Args:
+            hidden_states: input token features.
+            attention_mask: optional mask forwarded to the attention kernel.
+            position_embeddings: optional ``(cos, sin)`` pair; rotary embedding
+                is skipped when it is ``None``.
+
+        Returns:
+            ``(attn_output, None)``. The second slot is kept so callers that
+            unpack two values keep working; fused attention kernels do not
+            expose attention weights.
+        """
+        batch_size, patches, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # (batch, tokens, embed) -> (batch, heads, tokens, head_dim)
+        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is not None:
+            # NOTE(review): cos/sin hold one row per image patch, while q/k here
+            # also contain the cls and register tokens, and the stacked layout is
+            # (..., head_dim, 2) - confirm this matches what apply_rope expects.
+            cos, sin = position_embeddings
+            position_embeddings = torch.stack([cos, sin], dim = -1)
+            query_states, key_states = apply_rope(query_states, key_states, position_embeddings)
+
+        # optimized_attention_for_device() only *selects* a kernel for a device;
+        # the returned callable is what takes q/k/v (plus the head count).
+        attention_fn = optimized_attention_for_device(hidden_states.device, attention_mask is not None, small_input=True)
+        # skip_reshape=True: inputs are already (batch, heads, tokens, head_dim).
+        # The default output layout is (batch, tokens, heads * head_dim), so no
+        # manual transpose/reshape is needed before the output projection.
+        attn_output = attention_fn(
+            query_states, key_states, value_states, self.num_heads, mask=attention_mask, skip_reshape=True
+        )
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None
+
+class DINOv3ViTGatedMLP(nn.Module):
+    """Gated feed-forward block: ``down(GELU(gate(x)) * up(x))``."""
+
+    def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+
+        linear_kwargs = dict(bias=mlp_bias, device=device, dtype=dtype)
+        self.gate_proj = operations.Linear(hidden_size, intermediate_size, **linear_kwargs)
+        self.up_proj = operations.Linear(hidden_size, intermediate_size, **linear_kwargs)
+        self.down_proj = operations.Linear(intermediate_size, hidden_size, **linear_kwargs)
+        self.act_fn = torch.nn.GELU()
+
+    def forward(self, x):
+        # The activated gate branch modulates the linear "up" branch element-wise.
+        gate = self.act_fn(self.gate_proj(x))
+        gated = gate * self.up_proj(x)
+        return self.down_proj(gated)
+
+def get_patches_center_coordinates(
+    num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device
+) -> torch.Tensor:
+    """Return the centre of every patch as (h, w) coordinates in [-1, 1].
+
+    The result has shape (num_patches_h * num_patches_w, 2), ordered row by row.
+    """
+
+    def _normalized_centers(count: int) -> torch.Tensor:
+        # Centres sit at 0.5, 1.5, ... in patch units; dividing maps them to (0, 1).
+        return torch.arange(0.5, count, dtype=dtype, device=device) / count
+
+    grid_h, grid_w = torch.meshgrid(
+        _normalized_centers(num_patches_h), _normalized_centers(num_patches_w), indexing="ij"
+    )
+    centers = torch.stack((grid_h, grid_w), dim=-1).flatten(0, 1)
+    # Rescale from [0, 1] to [-1, 1].
+    return 2.0 * centers - 1.0
+
+class DINOv3ViTRopePositionEmbedding(nn.Module):
+    """Builds the rotary (cos, sin) tables for the patch grid of an input image."""
+
+    inv_freq: torch.Tensor
+
+    def __init__(self, rope_theta, hidden_size, num_attention_heads, image_size, patch_size, device, dtype):
+        # NOTE: ``dtype`` is accepted for signature symmetry but not used; the
+        # frequency table is always built in float32.
+        super().__init__()
+        self.base = rope_theta
+        self.head_dim = hidden_size // num_attention_heads
+        # forward() derives the patch grid from the actual input size, so the
+        # patch size has to be kept on the module.
+        self.patch_size = patch_size
+        self.num_patches_h = image_size // patch_size
+        self.num_patches_w = image_size // patch_size
+
+        inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32, device=device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """Return ``(cos, sin)`` for every patch of ``pixel_values``.
+
+        ``pixel_values`` must be 4-D; its last two dimensions are read as height
+        and width. Both outputs have one row per patch and are cast to the
+        dtype of ``pixel_values``.
+        """
+        _, _, height, width = pixel_values.shape
+        num_patches_h = height // self.patch_size
+        num_patches_w = width // self.patch_size
+
+        device = pixel_values.device
+        device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu"
+        # Angles are computed with autocast disabled so they stay in float32.
+        with torch.amp.autocast(device_type = device_type, enabled=False):
+            patch_coords = get_patches_center_coordinates(
+                num_patches_h, num_patches_w, dtype=torch.float32, device=device
+            )
+
+            # (patches, 2, freqs) -> (patches, 2 * freqs) -> tiled twice along the last dim
+            angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
+            angles = angles.flatten(1, 2)
+            angles = angles.tile(2)
+
+            cos = torch.cos(angles)
+            sin = torch.sin(angles)
+
+        dtype = pixel_values.dtype
+        return cos.to(dtype=dtype), sin.to(dtype=dtype)
+
+
+class DINOv3ViTEmbeddings(nn.Module):
+    """Turns an image into a token sequence: [cls, register tokens, patch tokens]."""
+
+    def __init__(self, hidden_size, num_register_tokens, num_channels, patch_size, dtype, device, operations):
+        super().__init__()
+        factory = dict(device=device, dtype=dtype)
+        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_size, **factory))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, hidden_size, **factory))
+        self.register_tokens = nn.Parameter(torch.empty(1, num_register_tokens, hidden_size, **factory))
+        # Non-overlapping patches: kernel and stride are both the patch size.
+        self.patch_embeddings = operations.Conv2d(
+            num_channels, hidden_size, kernel_size=patch_size, stride=patch_size, **factory
+        )
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: torch.Tensor | None = None):
+        """Embed ``pixel_values``; patches flagged in ``bool_masked_pos`` become the mask token."""
+        num_images = pixel_values.shape[0]
+        weight_dtype = self.patch_embeddings.weight.dtype
+
+        # Conv output (B, C, H', W') -> (B, H' * W', C)
+        patches = self.patch_embeddings(pixel_values.to(dtype=weight_dtype)).flatten(2).transpose(1, 2)
+
+        if bool_masked_pos is not None:
+            replacement = self.mask_token.to(patches.dtype)
+            patches = torch.where(bool_masked_pos.unsqueeze(-1), replacement, patches)
+
+        prefix = [
+            self.cls_token.expand(num_images, -1, -1),
+            self.register_tokens.expand(num_images, -1, -1),
+        ]
+        return torch.cat(prefix + [patches], dim=1)
+
+class DINOv3ViTLayer(nn.Module):
+    """One pre-norm transformer block: attention and MLP, each with layer scale and a residual."""
+
+    def __init__(self, hidden_size, layer_norm_eps, use_gated_mlp, layerscale_value, mlp_bias, intermediate_size, num_attention_heads,
+                 device, dtype, operations):
+        super().__init__()
+
+        # Both norms are created on the requested device/dtype, like every other
+        # sub-module of the block.
+        self.norm1 = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
+        self.attention = DINOv3ViTAttention(hidden_size, num_attention_heads, device=device, dtype=dtype, operations=operations)
+        self.layer_scale1 = DINOv3ViTLayerScale(hidden_size, layerscale_value, device=device, dtype=dtype)
+
+        self.norm2 = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
+
+        if use_gated_mlp:
+            self.mlp = DINOv3ViTGatedMLP(hidden_size, intermediate_size, mlp_bias, device=device, dtype=dtype, operations=operations)
+        else:
+            self.mlp = DINOv3ViTMLP(hidden_size, device=device, dtype=dtype, operations=operations)
+        self.layer_scale2 = DINOv3ViTLayerScale(hidden_size, layerscale_value, device=device, dtype=dtype)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        """Apply the attention sub-block, then the MLP sub-block, and return the new hidden states."""
+        # Attention sub-block: norm -> attention -> layer scale -> residual add.
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states, _ = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = self.layer_scale1(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP sub-block: norm -> mlp -> layer scale -> residual add.
+        residual = hidden_states
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.layer_scale2(hidden_states)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class DINOv3ViTModel(nn.Module):
+    """DINOv3 vision transformer: patch embedding, rotary positions, a layer stack and a final norm."""
+
+    def __init__(self, config, device, dtype, operations):
+        super().__init__()
+        num_hidden_layers = config["num_hidden_layers"]
+        hidden_size = config["hidden_size"]
+        num_attention_heads = config["num_attention_heads"]
+        num_register_tokens = config["num_register_tokens"]
+        intermediate_size = config["intermediate_size"]
+        layer_norm_eps = config["layer_norm_eps"]
+        layerscale_value =  config["layerscale_value"]
+        num_channels = config["num_channels"]
+        patch_size = config["patch_size"]
+        rope_theta = config["rope_theta"]
+        # Honour the config when it specifies these; the fallbacks are the
+        # values that used to be hard-coded here.
+        use_gated_mlp = config.get("use_gated_mlp", False)
+        mlp_bias = config.get("mlp_bias", True)
+
+        self.embeddings = DINOv3ViTEmbeddings(
+            hidden_size, num_register_tokens, num_channels=num_channels, patch_size=patch_size, dtype=dtype, device=device, operations=operations
+        )
+        self.rope_embeddings = DINOv3ViTRopePositionEmbedding(
+            rope_theta, hidden_size, num_attention_heads, image_size=512, patch_size=patch_size, dtype=dtype, device=device
+        )
+        self.layer = nn.ModuleList(
+            [DINOv3ViTLayer(hidden_size, layer_norm_eps, use_gated_mlp=use_gated_mlp, layerscale_value=layerscale_value, mlp_bias=mlp_bias,
+                            intermediate_size=intermediate_size,num_attention_heads = num_attention_heads,
+                            dtype=dtype, device=device, operations=operations)
+            for _ in range(num_hidden_layers)])
+        # Built through ``operations`` like the per-layer norms, instead of a
+        # plain nn.LayerNorm.
+        self.norm = operations.LayerNorm(hidden_size, eps=layer_norm_eps, dtype=dtype, device=device)
+
+    def get_input_embeddings(self):
+        """Return the patch-embedding convolution."""
+        return self.embeddings.patch_embeddings
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: torch.Tensor | None = None,
+        **kwargs,
+    ):
+        """Encode ``pixel_values``.
+
+        Returns a 4-tuple ``(sequence_output, None, pooled_output, None)`` where
+        ``pooled_output`` is the normalised first (cls) token.
+        """
+
+        pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
+        hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        position_embeddings = self.rope_embeddings(pixel_values)
+
+        for layer_module in self.layer:
+            hidden_states = layer_module(
+                hidden_states,
+                position_embeddings=position_embeddings,
+            )
+
+        sequence_output = self.norm(hidden_states)
+        pooled_output = sequence_output[:, 0, :]
+
+        return sequence_output, None, pooled_output, None
--- a/comfy/image_encoders/dino3_large.json
+++ b/comfy/image_encoders/dino3_large.json
@ -0,0 +1,24 @@
+{
+
+  "hidden_size": 384,
+  "image_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "key_bias": false,
+  "layer_norm_eps": 1e-05,
+  "layerscale_value": 1.0,
+  "mlp_bias": true,
+  "num_attention_heads": 6,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "num_register_tokens": 4,
+  "patch_size": 16,
+  "pos_embed_rescale": 2.0,
+  "proj_bias": true,
+  "query_bias": true,
+  "rope_theta": 100.0,
+  "use_gated_mlp": false,
+  "value_bias": true,
+  "mean": [0.485, 0.456, 0.406],
+  "std": [0.229, 0.224, 0.225]
+}
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -755,6 +755,10 @@ class ACEAudio(LatentFormat):
    latent_channels = 8
    latent_dimensions = 2

+# Latent format with 64 channels laid out along a single latent dimension.
+# NOTE(review): presumably the format for the ACE-Step 1.5 audio model (guess
+# from the class name) - confirm against the model that references it.
+class ACEAudio15(LatentFormat):
+    latent_channels = 64
+    latent_dimensions = 1
+
 class ChromaRadiance(LatentFormat):
    latent_channels = 3
    spacial_downscale_ratio = 1
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
--- a/comfy/ldm/trellis2/attention.py
+++ b/comfy/ldm/trellis2/attention.py
@ -0,0 +1,194 @@
+import torch
+import math
+from comfy.ldm.modules.attention import optimized_attention
+from typing import Tuple, Union, List
+from vae import VarLenTensor
+
+def sparse_windowed_scaled_dot_product_self_attention(
+    qkv,
+    window_size: int,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+):
+    """Self-attention restricted to spatial windows of a sparse tensor.
+
+    Args:
+        qkv: sparse tensor whose ``feats`` are packed as [T, 3, H, C]. It must
+            provide ``get_spatial_cache`` / ``register_spatial_cache`` and
+            ``replace``.
+        window_size: window edge length passed to ``calc_window_partition``.
+        shift_window: per-axis shift applied before partitioning.
+
+    Returns:
+        ``qkv.replace(out)`` with ``out`` in the original token order.
+
+    Raises:
+        NotImplementedError: if the active attention backend is neither
+            xformers nor flash-attention.
+    """
+    backend = optimized_attention.__name__
+    if backend not in ('attention_xformers', 'attention_flash'):
+        # Without this guard ``out`` would be unbound further down.
+        raise NotImplementedError(
+            f"sparse windowed attention requires the xformers or flash-attention backend, got '{backend}'"
+        )
+
+    # The window partition only depends on coordinates, so it is cached on the tensor.
+    serialization_spatial_cache_name = f'windowed_attention_{window_size}_{shift_window}'
+    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
+    if serialization_spatial_cache is None:
+        fwd_indices, bwd_indices, seq_lens, attn_func_args = calc_window_partition(qkv, window_size, shift_window)
+        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, attn_func_args))
+    else:
+        fwd_indices, bwd_indices, seq_lens, attn_func_args = serialization_spatial_cache
+
+    # Gather tokens so that every window is contiguous.
+    qkv_feats = qkv.feats[fwd_indices]      # [M, 3, H, C]
+
+    if backend == 'attention_xformers':
+        # Imported lazily: the package is optional.
+        import xformers.ops as xops
+        q, k, v = qkv_feats.unbind(dim=1)
+        q = q.unsqueeze(0)                                                              # [1, M, H, C]
+        k = k.unsqueeze(0)                                                              # [1, M, H, C]
+        v = v.unsqueeze(0)                                                              # [1, M, H, C]
+        out = xops.memory_efficient_attention(q, k, v, **attn_func_args)[0]             # [M, H, C]
+    else:
+        import flash_attn
+        out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, **attn_func_args)  # [M, H, C]
+
+    # Undo the window ordering.
+    out = out[bwd_indices]      # [T, H, C]
+
+    return qkv.replace(out)
+
+def calc_window_partition(
+    tensor,
+    window_size: Union[int, Tuple[int, ...]],
+    shift_window: Union[int, Tuple[int, ...]] = 0,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]:
+    """Group the tokens of a sparse tensor into spatial windows.
+
+    Args:
+        tensor: object exposing ``coords`` ([T, 1 + DIM]; column 0 is not
+            shifted or divided), ``spatial_shape`` and ``device``.
+        window_size: window edge length, one int or one per spatial axis.
+        shift_window: shift added to the coordinates, one int or one per axis.
+
+    Returns:
+        ``(fwd_indices, bwd_indices, seq_lens, attn_func_args)``:
+        ``fwd_indices`` sorts tokens by window, ``bwd_indices`` is its inverse,
+        ``seq_lens`` holds the size of every non-empty window, and
+        ``attn_func_args`` are keyword arguments for the active backend.
+
+    Raises:
+        NotImplementedError: if the active attention backend is neither
+            xformers nor flash-attention.
+    """
+    backend = optimized_attention.__name__
+    if backend not in ('attention_xformers', 'attention_flash'):
+        # Without this guard ``attn_func_args`` would be unbound at the return.
+        raise NotImplementedError(
+            f"window partitioning requires the xformers or flash-attention backend, got '{backend}'"
+        )
+
+    DIM = tensor.coords.shape[1] - 1
+    shift_window = (shift_window,) * DIM if isinstance(shift_window, int) else shift_window
+    window_size = (window_size,) * DIM if isinstance(window_size, int) else window_size
+    shifted_coords = tensor.coords.clone().detach()
+    shifted_coords[:, 1:] += torch.tensor(shift_window, device=tensor.device, dtype=torch.int32).unsqueeze(0)
+
+    # Number of windows per axis and the strides that flatten
+    # (column 0, window coordinates) into a single integer id.
+    MAX_COORDS = [i + j for i, j in zip(tensor.spatial_shape, shift_window)]
+    NUM_WINDOWS = [math.ceil((mc + 1) / ws) for mc, ws in zip(MAX_COORDS, window_size)]
+    OFFSET = torch.cumprod(torch.tensor([1] + NUM_WINDOWS[::-1]), dim=0).tolist()[::-1]
+
+    shifted_coords[:, 1:] //= torch.tensor(window_size, device=tensor.device, dtype=torch.int32).unsqueeze(0)
+    shifted_indices = (shifted_coords * torch.tensor(OFFSET, device=tensor.device, dtype=torch.int32).unsqueeze(0)).sum(dim=1)
+    fwd_indices = torch.argsort(shifted_indices)
+    bwd_indices = torch.empty_like(fwd_indices)
+    bwd_indices[fwd_indices] = torch.arange(fwd_indices.shape[0], device=tensor.device)
+    # Tokens per window id; empty windows are dropped.
+    seq_lens = torch.bincount(shifted_indices)
+    mask = seq_lens != 0
+    seq_lens = seq_lens[mask]
+
+    if backend == 'attention_xformers':
+        # Imported lazily: the package is optional.
+        import xformers.ops as xops
+        attn_func_args = {
+            'attn_bias': xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
+        }
+    else:
+        attn_func_args = {
+            'cu_seqlens': torch.cat([torch.tensor([0], device=tensor.device), torch.cumsum(seq_lens, dim=0)], dim=0).int(),
+            'max_seqlen': torch.max(seq_lens)
+        }
+
+    return fwd_indices, bwd_indices, seq_lens, attn_func_args
+
+
+def sparse_scaled_dot_product_attention(*args, **kwargs):
+    """Scaled dot-product attention over variable-length sequences.
+
+    Accepted call forms (positional and/or keyword):
+      * ``(qkv)``     - packed self-attention; ``qkv.feats`` is [T, 3, H, C].
+      * ``(q, kv)``   - queries plus packed keys/values.
+      * ``(q, k, v)`` - separate queries, keys and values.
+
+    ``q``, ``kv``, ``k`` and ``v`` may each be a ``VarLenTensor`` or a dense
+    tensor ([N, L, H, C], or [N, L, 2, H, C] for ``kv``).
+
+    Returns:
+        ``q.replace(out)`` (``qkv.replace(out)`` in the packed form) when the
+        query is a ``VarLenTensor``, otherwise a dense tensor shaped like the
+        query, [N, L, H, -1].
+
+    Raises:
+        NotImplementedError: if the active attention backend is not supported.
+    """
+    arg_names_dict = {
+        1: ['qkv'],
+        2: ['q', 'kv'],
+        3: ['q', 'k', 'v']
+    }
+    num_all_args = len(args) + len(kwargs)
+    for key in arg_names_dict[num_all_args][len(args):]:
+        assert key in kwargs, f"Missing argument {key}"
+
+    backend = optimized_attention.__name__
+    if backend not in ('attention_xformers', 'attention_flash', 'flash_attn_3'):
+        # Without this guard ``out`` would be unbound further down.
+        raise NotImplementedError(
+            f"sparse attention requires the xformers or flash-attention backend, got '{backend}'"
+        )
+
+    # Shape of a dense query, remembered so the output can be reshaped to it.
+    # It has to be captured before the key/value branch reuses N, L, H.
+    dense_q_shape = None
+
+    if num_all_args == 1:
+        qkv = args[0] if len(args) > 0 else kwargs['qkv']
+        device = qkv.device
+
+        s = qkv
+        q_seqlen = [qkv.layout[i].stop - qkv.layout[i].start for i in range(qkv.shape[0])]
+        kv_seqlen = q_seqlen
+        qkv = qkv.feats     # [T, 3, H, C]
+
+    elif num_all_args == 2:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        kv = args[1] if len(args) > 1 else kwargs['kv']
+        device = q.device
+
+        if isinstance(q, VarLenTensor):
+            s = q
+            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
+            q = q.feats     # [T_Q, H, C]
+        else:
+            s = None
+            N, L, H, C = q.shape
+            dense_q_shape = (N, L, H)
+            q_seqlen = [L] * N
+            q = q.reshape(N * L, H, C)   # [T_Q, H, C]
+
+        if isinstance(kv, VarLenTensor):
+            kv_seqlen = [kv.layout[i].stop - kv.layout[i].start for i in range(kv.shape[0])]
+            kv = kv.feats     # [T_KV, 2, H, C]
+        else:
+            N, L, _, H, C = kv.shape
+            kv_seqlen = [L] * N
+            kv = kv.reshape(N * L, 2, H, C)   # [T_KV, 2, H, C]
+
+    elif num_all_args == 3:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        k = args[1] if len(args) > 1 else kwargs['k']
+        v = args[2] if len(args) > 2 else kwargs['v']
+        device = q.device
+
+        if isinstance(q, VarLenTensor):
+            s = q
+            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
+            q = q.feats     # [T_Q, H, Ci]
+        else:
+            s = None
+            N, L, H, CI = q.shape
+            dense_q_shape = (N, L, H)
+            q_seqlen = [L] * N
+            q = q.reshape(N * L, H, CI)  # [T_Q, H, Ci]
+
+        if isinstance(k, VarLenTensor):
+            kv_seqlen = [k.layout[i].stop - k.layout[i].start for i in range(k.shape[0])]
+            k = k.feats     # [T_KV, H, Ci]
+            v = v.feats     # [T_KV, H, Co]
+        else:
+            N, L, H, CI, CO = *k.shape, v.shape[-1]
+            kv_seqlen = [L] * N
+            k = k.reshape(N * L, H, CI)     # [T_KV, H, Ci]
+            v = v.reshape(N * L, H, CO)     # [T_KV, H, Co]
+
+    def cumulative_seqlens(seqlens):
+        # [0, l0, l0 + l1, ...] as int32 on the query device.
+        return torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seqlens), dim=0)]).int().to(device)
+
+    if backend == 'attention_xformers':
+        # Imported lazily: the package is optional.
+        import xformers.ops as xops
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=1)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=1)
+        q = q.unsqueeze(0)
+        k = k.unsqueeze(0)
+        v = v.unsqueeze(0)
+        mask = xops.fmha.BlockDiagonalMask.from_seqlens(q_seqlen, kv_seqlen)
+        out = xops.memory_efficient_attention(q, k, v, mask)[0]
+    elif backend == 'attention_flash':
+        import flash_attn
+        cu_seqlens_q = cumulative_seqlens(q_seqlen)
+        if num_all_args in [2, 3]:
+            cu_seqlens_kv = cumulative_seqlens(kv_seqlen)
+        if num_all_args == 1:
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens_q, max(q_seqlen))
+        elif num_all_args == 2:
+            out = flash_attn.flash_attn_varlen_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
+        elif num_all_args == 3:
+            out = flash_attn.flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
+    else: # 'flash_attn_3' - TODO
+        import flash_attn_interface as flash_attn_3
+        cu_seqlens_q = cumulative_seqlens(q_seqlen)
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=1)
+            cu_seqlens_kv = cu_seqlens_q.clone()
+            max_q_seqlen = max_kv_seqlen = max(q_seqlen)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=1)
+            cu_seqlens_kv = cumulative_seqlens(kv_seqlen)
+            max_q_seqlen = max(q_seqlen)
+            max_kv_seqlen = max(kv_seqlen)
+        elif num_all_args == 3:
+            cu_seqlens_kv = cumulative_seqlens(kv_seqlen)
+            max_q_seqlen = max(q_seqlen)
+            max_kv_seqlen = max(kv_seqlen)
+        out = flash_attn_3.flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_q_seqlen, max_kv_seqlen)
+
+    if s is not None:
+        return s.replace(out)
+    else:
+        # The output has one row per *query* token, so it is reshaped with the
+        # query's batch size and length, not the key/value ones.
+        q_batch, q_len, q_heads = dense_q_shape
+        return out.reshape(q_batch, q_len, q_heads, -1)
--- a/comfy/ldm/trellis2/cumesh.py
+++ b/comfy/ldm/trellis2/cumesh.py
@ -0,0 +1,149 @@
+# will contain every cuda -> pytorch operation
+
+import torch
+from typing import Dict
+
+
+class TorchHashMap:
+    """Integer key -> integer value lookup backed by a sorted tensor and binary search."""
+
+    def __init__(self, keys: torch.Tensor, values: torch.Tensor, default_value: int):
+        """Store ``values`` sorted by ``keys``; ``default_value`` is returned for missing keys."""
+        device = keys.device
+        # use long for searchsorted
+        self.sorted_keys, order = torch.sort(keys.long())
+        self.sorted_vals = values.long()[order]
+        self.default_value = torch.tensor(default_value, dtype=torch.long, device=device)
+        self._n = self.sorted_keys.numel()
+
+    def lookup_flat(self, flat_keys: torch.Tensor) -> torch.Tensor:
+        """Look up a 1-D tensor of keys.
+
+        Returns a long tensor on ``flat_keys.device`` holding the stored value
+        for every key that is present and ``default_value`` for the others.
+        """
+        flat = flat_keys.long()
+        out = self.default_value.to(device=flat.device, dtype=self.sorted_vals.dtype).expand(flat.shape[0]).clone()
+        if self._n == 0:
+            # Nothing stored: every key misses.
+            return out
+
+        idx = torch.searchsorted(self.sorted_keys, flat)
+        # searchsorted returns n for keys above the largest stored key; clamp
+        # before indexing so the equality probe below cannot go out of bounds.
+        safe_idx = idx.clamp(max=self._n - 1)
+        found = (idx < self._n) & (self.sorted_keys[safe_idx] == flat)
+        out[found] = self.sorted_vals[safe_idx[found]]
+        return out
+
+class Voxel:
+    """Sparse voxel grid: integer coordinates plus per-voxel attributes."""
+
+    def __init__(
+            self,
+            origin: list,
+            voxel_size: float,
+            coords: torch.Tensor = None,
+            attrs: torch.Tensor = None,
+            layout: Dict = None,
+            device: torch.device = 'cuda'
+        ):
+        """
+        Args:
+            origin: position of the grid origin; converted to a float32 tensor.
+            voxel_size: edge length of one voxel.
+            coords: voxel coordinates.
+            attrs: per-voxel attribute matrix, indexed along dim 1 by ``layout``.
+            layout: maps attribute names to column indexers of ``attrs``.
+                ``None`` means an empty layout.
+            device: device for the ``origin`` tensor.
+        """
+        self.origin = torch.tensor(origin, dtype=torch.float32, device=device)
+        self.voxel_size = voxel_size
+        self.coords = coords
+        self.attrs = attrs
+        # A fresh dict per instance: a ``{}`` default would be shared by all of them.
+        self.layout = {} if layout is None else layout
+        self.device = device
+
+    @property
+    def position(self):
+        """Centre position of every voxel."""
+        return (self.coords + 0.5) * self.voxel_size + self.origin[None, :]
+
+    def split_attrs(self):
+        """Return ``{name: attrs[:, layout[name]]}`` for every layout entry."""
+        return {
+            k: self.attrs[:, self.layout[k]]
+            for k in self.layout
+        }
+
+class Mesh:
+    """Triangle mesh: float vertices, int faces and optional per-vertex attributes."""
+
+    def __init__(self,
+        vertices,
+        faces,
+        vertex_attrs=None
+    ):
+        self.vertices = vertices.float()
+        self.faces = faces.int()
+        self.vertex_attrs = vertex_attrs
+
+    @property
+    def device(self):
+        """Device of the vertex tensor."""
+        return self.vertices.device
+
+    def to(self, device, non_blocking=False):
+        """Return a new ``Mesh`` with all tensors moved to ``device``."""
+        return Mesh(
+            self.vertices.to(device, non_blocking=non_blocking),
+            self.faces.to(device, non_blocking=non_blocking),
+            self.vertex_attrs.to(device, non_blocking=non_blocking) if self.vertex_attrs is not None else None,
+        )
+
+    def cuda(self, non_blocking=False):
+        """Return a copy of the mesh on the CUDA device."""
+        return self.to('cuda', non_blocking=non_blocking)
+
+    def cpu(self):
+        """Return a copy of the mesh on the CPU."""
+        return self.to('cpu')
+
+    # TODO could be an option
+    def fill_holes(self, max_hole_perimeter=3e-2):
+        """Fill boundary holes in place using the optional ``cumesh`` package.
+
+        The mesh is left untouched when ``cumesh`` reports no boundaries or no
+        boundary loops. Work is done on CUDA; results are moved back to the
+        mesh's device.
+        """
+        # Imported lazily: the package is optional.
+        import cumesh
+        vertices = self.vertices.cuda()
+        faces = self.faces.cuda()
+
+        mesh = cumesh.CuMesh()
+        mesh.init(vertices, faces)
+        mesh.get_edges()
+        mesh.get_boundary_info()
+        if mesh.num_boundaries == 0:
+            return
+        mesh.get_vertex_edge_adjacency()
+        mesh.get_vertex_boundary_adjacency()
+        mesh.get_manifold_boundary_adjacency()
+        mesh.read_manifold_boundary_adjacency()
+        mesh.get_boundary_connected_components()
+        mesh.get_boundary_loops()
+        if mesh.num_boundary_loops == 0:
+            return
+        mesh.fill_holes(max_hole_perimeter=max_hole_perimeter)
+        new_vertices, new_faces = mesh.read()
+
+        self.vertices = new_vertices.to(self.device)
+        self.faces = new_faces.to(self.device)
+
+    # TODO could be an option
+    def simplify(self, target=1000000, verbose: bool=False, options: dict=None):
+        """Simplify the mesh in place using the optional ``cumesh`` package.
+
+        ``target``, ``verbose`` and ``options`` are forwarded to
+        ``CuMesh.simplify``; ``options=None`` is passed on as an empty dict.
+        """
+        # Imported lazily: the package is optional.
+        import cumesh
+        # A fresh dict per call: a ``{}`` default would be shared across calls.
+        options = {} if options is None else options
+        vertices = self.vertices.cuda()
+        faces = self.faces.cuda()
+
+        mesh = cumesh.CuMesh()
+        mesh.init(vertices, faces)
+        mesh.simplify(target, verbose=verbose, options=options)
+        new_vertices, new_faces = mesh.read()
+
+        self.vertices = new_vertices.to(self.device)
+        self.faces = new_faces.to(self.device)
+
+class MeshWithVoxel(Mesh, Voxel):
+    """A mesh bundled with the sparse voxel grid (coords + attrs) it belongs to."""
+
+    def __init__(self,
+        vertices: torch.Tensor,
+        faces: torch.Tensor,
+        origin: list,
+        voxel_size: float,
+        coords: torch.Tensor,
+        attrs: torch.Tensor,
+        voxel_shape: torch.Size,
+        layout: Dict = None,
+    ):
+        # The parent initialisers are not called, so every attribute the
+        # inherited methods may read is set here explicitly.
+        self.vertices = vertices.float()
+        self.faces = faces.int()
+        self.vertex_attrs = None
+        # ``device`` is the inherited Mesh property (device of the vertices),
+        # so the vertices have to be assigned before this line.
+        self.origin = torch.tensor(origin, dtype=torch.float32, device=self.device)
+        self.voxel_size = voxel_size
+        self.coords = coords
+        self.attrs = attrs
+        self.voxel_shape = voxel_shape
+        # A fresh dict per instance: a ``{}`` default would be shared by all of them.
+        self.layout = {} if layout is None else layout
+
+    def to(self, device, non_blocking=False):
+        """Return a new ``MeshWithVoxel`` with all tensors moved to ``device``."""
+        return MeshWithVoxel(
+            self.vertices.to(device, non_blocking=non_blocking),
+            self.faces.to(device, non_blocking=non_blocking),
+            self.origin.tolist(),
+            self.voxel_size,
+            self.coords.to(device, non_blocking=non_blocking),
+            self.attrs.to(device, non_blocking=non_blocking),
+            self.voxel_shape,
+            self.layout,
+        )
--- a/comfy/ldm/trellis2/model.py
+++ b/comfy/ldm/trellis2/model.py
@ -0,0 +1,499 @@
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from comfy.ldm.trellis2.vae import SparseTensor, SparseLinear, sparse_cat, VarLenTensor
+from typing import Optional, Tuple, Literal, Union, List
+from comfy.ldm.trellis2.attention import sparse_windowed_scaled_dot_product_self_attention, sparse_scaled_dot_product_attention
+from comfy.ldm.genmo.joint_model.layers import TimestepEmbedder
+
+class SparseGELU(nn.GELU):
+    """GELU applied to the ``feats`` of a variable-length tensor."""
+    def forward(self, input: VarLenTensor) -> VarLenTensor:
+        # Run the dense activation on the features, then wrap them back up.
+        activated_feats = super().forward(input.feats)
+        return input.replace(activated_feats)
+
+class SparseFeedForwardNet(nn.Module):
+    """Feed-forward network: linear expansion, tanh-approximated GELU, linear projection back."""
+    def __init__(self, channels: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        # Width of the hidden layer, truncated to an integer.
+        hidden_channels = int(channels * mlp_ratio)
+        layers = [
+            SparseLinear(channels, hidden_channels),
+            SparseGELU(approximate="tanh"),
+            SparseLinear(hidden_channels, channels),
+        ]
+        self.mlp = nn.Sequential(*layers)
+
+    def forward(self, x: VarLenTensor) -> VarLenTensor:
+        out = self.mlp(x)
+        return out
+
+def manual_cast(tensor, dtype):
+    """Cast ``tensor`` to ``dtype`` unless autocast is active or no dtype is given.
+
+    Args:
+        tensor: object exposing ``.type(dtype)`` (e.g. a ``torch.Tensor``).
+        dtype: target dtype, or ``None`` to leave ``tensor`` untouched.
+
+    Returns:
+        The cast tensor, or ``tensor`` itself when autocast is enabled or
+        ``dtype`` is ``None``.
+    """
+    # Tensor.type(None) returns the type *name* as a string instead of a
+    # tensor, so a missing dtype has to be treated as "do not cast".
+    if dtype is None or torch.is_autocast_enabled():
+        return tensor
+    return tensor.type(dtype)
+
+
+class LayerNorm32(nn.LayerNorm):
+    """LayerNorm that casts its input to float32 and the result back to the input dtype."""
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        input_dtype = x.dtype
+        # manual_cast skips the conversion while autocast is enabled.
+        normed = super().forward(manual_cast(x, torch.float32))
+        return manual_cast(normed, input_dtype)
+
+
+class SparseMultiHeadRMSNorm(nn.Module):
+    """L2-normalise the last axis, then scale by a per-head learned gain and sqrt(dim)."""
+    def __init__(self, dim: int, heads: int):
+        super().__init__()
+        self.scale = dim ** 0.5
+        # One gain vector of length ``dim`` for each head.
+        self.gamma = nn.Parameter(torch.ones(heads, dim))
+
+    def forward(self, x: Union[VarLenTensor, torch.Tensor]) -> Union[VarLenTensor, torch.Tensor]:
+        original_dtype = x.dtype
+        # The normalisation itself is done in float32.
+        x32 = x.float()
+        if not isinstance(x32, VarLenTensor):
+            dense = F.normalize(x32, dim=-1) * self.gamma * self.scale
+            return dense.to(original_dtype)
+        normed_feats = F.normalize(x32.feats, dim=-1) * self.gamma * self.scale
+        return x32.replace(normed_feats).to(original_dtype)
+
+# TODO: replace with apply_rope1
+class SparseRotaryPositionEmbedder(nn.Module):
+    """Rotary position embedding driven by the integer coordinates of a sparse tensor.
+
+    Each of the ``dim`` coordinate axes gets ``head_dim // 2 // dim`` rotation
+    frequencies; the resulting complex phases are cached on the query tensor.
+    """
+    def __init__(
+        self,
+        head_dim: int,
+        dim: int = 3,
+        rope_freq: Tuple[float, float] = (1.0, 10000.0)
+    ):
+        super().__init__()
+        assert head_dim % 2 == 0, "Head dim must be divisible by 2"
+        self.head_dim = head_dim
+        self.dim = dim
+        self.rope_freq = rope_freq
+        # Number of complex frequencies available per coordinate axis.
+        self.freq_dim = head_dim // 2 // dim
+        # freqs[i] = rope_freq[0] / rope_freq[1] ** (i / freq_dim).
+        # Kept as a plain attribute (not a registered buffer), so it is moved
+        # to the right device lazily in _get_phases.
+        self.freqs = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
+        self.freqs = rope_freq[0] / (rope_freq[1] ** (self.freqs))
+
+    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
+        """Return unit-magnitude complex phases for a 1-D tensor of indices."""
+        self.freqs = self.freqs.to(indices.device)
+        # Outer product gives one angle per (index, frequency) pair.
+        phases = torch.outer(indices, self.freqs)
+        phases = torch.polar(torch.ones_like(phases), phases)
+        return phases
+
+    def _rotary_embedding(self, x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
+        """Rotate ``x`` by ``phases`` and return it in ``x``'s original dtype."""
+        # Pair up consecutive values of the last axis and view each pair as one complex number.
+        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        # unsqueeze(-2) lets the phases broadcast over the second-to-last axis of x_complex
+        # (presumably the head axis - confirm against the feats layout).
+        x_rotated = x_complex * phases.unsqueeze(-2)
+        x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
+        return x_embed
+
+    def forward(self, q: SparseTensor, k: Optional[SparseTensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply the rotary embedding to ``q`` and, if given, ``k``.
+
+        Args:
+            q (SparseTensor): [..., N, H, D] tensor of queries
+            k (SparseTensor): [..., N, H, D] tensor of keys
+
+        Returns:
+            The embedded ``q`` alone when ``k`` is None, otherwise the pair
+            ``(q_embed, k_embed)``. Both are produced with ``replace`` on the
+            inputs, so the annotated return type is only approximate.
+        """
+        assert q.coords.shape[-1] == self.dim + 1, "Last dimension of coords must be equal to dim+1"
+        # The cache key encodes every setting that affects the phases.
+        phases_cache_name = f'rope_phase_{self.dim}d_freq{self.rope_freq[0]}-{self.rope_freq[1]}_hd{self.head_dim}'
+        phases = q.get_spatial_cache(phases_cache_name)
+        if phases is None:
+            # Drop the first coords column (presumably the batch index - confirm).
+            coords = q.coords[..., 1:]
+            phases = self._get_phases(coords.reshape(-1)).reshape(*coords.shape[:-1], -1)
+            # When dim * freq_dim is smaller than head_dim // 2, pad with
+            # zero-angle phases so the leftover channels are not rotated.
+            if phases.shape[-1] < self.head_dim // 2:
+                padn = self.head_dim // 2 - phases.shape[-1]
+                phases = torch.cat([phases, torch.polar(
+                    torch.ones(*phases.shape[:-1], padn, device=phases.device),
+                    torch.zeros(*phases.shape[:-1], padn, device=phases.device)
+                )], dim=-1)
+            q.register_spatial_cache(phases_cache_name, phases)
+        q_embed = q.replace(self._rotary_embedding(q.feats, phases))
+        if k is None:
+            return q_embed
+        # The keys reuse the phases computed from q's coordinates.
+        k_embed = k.replace(self._rotary_embedding(k.feats, phases))
+        return q_embed, k_embed
+
+class SparseMultiHeadAttention(nn.Module):
+    """Multi-head self- or cross-attention over sparse / variable-length tensors.
+
+    Self-attention supports the modes "full", "windowed" and
+    "double_windowed"; cross-attention always attends fully to ``context``.
+    Optional per-head RMS normalisation of q/k and rotary position embedding
+    (self-attention only) are applied before the attention kernel.
+    """
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        ctx_channels: Optional[int] = None,
+        type: Literal["self", "cross"] = "self",
+        attn_mode: Literal["full", "windowed", "double_windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        qkv_bias: bool = True,
+        use_rope: bool = False,
+        rope_freq: Tuple[float, float] = (1.0, 10000.0),
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+
+        self.channels = channels
+        self.head_dim = channels // num_heads
+        # Cross-attention context defaults to the same width as the queries.
+        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
+        self.num_heads = num_heads
+        self._type = type
+        self.attn_mode = attn_mode
+        self.window_size = window_size
+        self.shift_window = shift_window
+        self.use_rope = use_rope
+        self.qk_rms_norm = qk_rms_norm
+
+        if self._type == "self":
+            # One fused projection producing q, k and v.
+            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
+        else:
+            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
+            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
+
+        if self.qk_rms_norm:
+            self.q_rms_norm = SparseMultiHeadRMSNorm(self.head_dim, num_heads)
+            self.k_rms_norm = SparseMultiHeadRMSNorm(self.head_dim, num_heads)
+
+        self.to_out = nn.Linear(channels, channels)
+
+        if use_rope:
+            self.rope = SparseRotaryPositionEmbedder(self.head_dim, rope_freq=rope_freq)
+
+    @staticmethod
+    def _linear(module: nn.Linear, x: Union[VarLenTensor, torch.Tensor]) -> Union[VarLenTensor, torch.Tensor]:
+        """Apply ``module`` to a dense tensor or to the feats of a VarLenTensor."""
+        if isinstance(x, VarLenTensor):
+            return x.replace(module(x.feats))
+        else:
+            return module(x)
+
+    @staticmethod
+    def _reshape_chs(x: Union[VarLenTensor, torch.Tensor], shape: Tuple[int, ...]) -> Union[VarLenTensor, torch.Tensor]:
+        """Reshape the channel axes to ``shape``; dense tensors keep their first two axes."""
+        if isinstance(x, VarLenTensor):
+            return x.reshape(*shape)
+        else:
+            return x.reshape(*x.shape[:2], *shape)
+
+    def _fused_pre(self, x: Union[VarLenTensor, torch.Tensor], num_fused: int) -> Union[VarLenTensor, torch.Tensor]:
+        """Split a fused projection into ``(num_fused, num_heads, head_channels)`` trailing axes."""
+        if isinstance(x, VarLenTensor):
+            # Add a temporary leading axis so both input kinds share one reshape.
+            x_feats = x.feats.unsqueeze(0)
+        else:
+            x_feats = x
+        x_feats = x_feats.reshape(*x_feats.shape[:2], num_fused, self.num_heads, -1)
+        return x.replace(x_feats.squeeze(0)) if isinstance(x, VarLenTensor) else x_feats
+
+    def forward(self, x: SparseTensor, context: Optional[Union[VarLenTensor, torch.Tensor]] = None) -> SparseTensor:
+        """Run attention over ``x`` (and ``context`` for cross-attention).
+
+        Args:
+            x: sparse input providing the queries (and keys/values in self mode).
+            context: key/value source, used only in cross-attention mode.
+
+        Returns:
+            The attended and output-projected tensor.
+
+        Raises:
+            ValueError: if the self-attention ``attn_mode`` is not one of
+                "full", "windowed" or "double_windowed".
+        """
+        if self._type == "self":
+            qkv = self._linear(self.to_qkv, x)
+            qkv = self._fused_pre(qkv, num_fused=3)
+            if self.qk_rms_norm or self.use_rope:
+                q, k, v = qkv.unbind(dim=-3)
+                if self.qk_rms_norm:
+                    q = self.q_rms_norm(q)
+                    k = self.k_rms_norm(k)
+                if self.use_rope:
+                    q, k = self.rope(q, k)
+                # Re-fuse the processed q/k with the untouched v.
+                qkv = qkv.replace(torch.stack([q.feats, k.feats, v.feats], dim=1))
+            if self.attn_mode == "full":
+                h = sparse_scaled_dot_product_attention(qkv)
+            elif self.attn_mode == "windowed":
+                h = sparse_windowed_scaled_dot_product_self_attention(
+                    qkv, self.window_size, shift_window=self.shift_window
+                )
+            elif self.attn_mode == "double_windowed":
+                # Half of the heads use unshifted windows, the other half use
+                # windows shifted by half a window along every axis.
+                qkv0 = qkv.replace(qkv.feats[:, :, self.num_heads//2:])
+                qkv1 = qkv.replace(qkv.feats[:, :, :self.num_heads//2])
+                h0 = sparse_windowed_scaled_dot_product_self_attention(
+                    qkv0, self.window_size, shift_window=(0, 0, 0)
+                )
+                h1 = sparse_windowed_scaled_dot_product_self_attention(
+                    qkv1, self.window_size, shift_window=tuple([self.window_size//2] * 3)
+                )
+                h = qkv.replace(torch.cat([h0.feats, h1.feats], dim=1))
+            else:
+                # Without this branch an unsupported mode would surface later
+                # as an UnboundLocalError on ``h``.
+                raise ValueError(f"Unsupported attention mode: {self.attn_mode!r}")
+        else:
+            q = self._linear(self.to_q, x)
+            q = self._reshape_chs(q, (self.num_heads, -1))
+            kv = self._linear(self.to_kv, context)
+            kv = self._fused_pre(kv, num_fused=2)
+            if self.qk_rms_norm:
+                q = self.q_rms_norm(q)
+                k, v = kv.unbind(dim=-3)
+                k = self.k_rms_norm(k)
+                h = sparse_scaled_dot_product_attention(q, k, v)
+            else:
+                h = sparse_scaled_dot_product_attention(q, kv)
+        # Merge the head axes back into one channel axis before the output projection.
+        h = self._reshape_chs(h, (-1,))
+        h = self._linear(self.to_out, h)
+        return h
+
+class ModulatedSparseTransformerBlock(nn.Module):
+    """
+    Sparse Transformer block (MSA + FFN) with adaptive layer norm conditioning.
+
+    The conditioning vector is split into six chunks: shift, scale and gate
+    for the attention branch and the same three for the MLP branch.
+    """
+    # NOTE(review): attn_mode is annotated as "full"/"swin" here, while
+    # SparseMultiHeadAttention.forward branches on "full", "windowed" and
+    # "double_windowed" - confirm which values callers actually pass.
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "swin"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        rope_freq: Tuple[float, float] = (1.0, 10000.0),
+        qk_rms_norm: bool = False,
+        qkv_bias: bool = True,
+        share_mod: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        # Both norms have no learned affine; scale and shift come from the modulation.
+        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.attn = SparseMultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_window=shift_window,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            rope_freq=rope_freq,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.mlp = SparseFeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+        if not share_mod:
+            # Per-block projection from the conditioning vector to the six modulation chunks.
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(channels, 6 * channels, bias=True)
+            )
+        else:
+            # With shared modulation, the block only learns an additive offset
+            # to a modulation vector that is supplied by the caller.
+            self.modulation = nn.Parameter(torch.randn(6 * channels) / channels ** 0.5)
+
+    def _forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
+        """Apply the modulated attention and MLP branches, each with a residual connection."""
+        if self.share_mod:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.modulation + mod).type(mod.dtype).chunk(6, dim=1)
+        else:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
+        # Attention branch: norm -> scale/shift -> attention -> gate -> residual add.
+        h = x.replace(self.norm1(x.feats))
+        h = h * (1 + scale_msa) + shift_msa
+        h = self.attn(h)
+        h = h * gate_msa
+        x = x + h
+        # MLP branch: norm -> scale/shift -> MLP -> gate -> residual add.
+        h = x.replace(self.norm2(x.feats))
+        h = h * (1 + scale_mlp) + shift_mlp
+        h = self.mlp(h)
+        h = h * gate_mlp
+        x = x + h
+        return x
+
+    def forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
+        """Run the block, through torch's checkpoint utility when ``use_checkpoint`` is set."""
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
+        else:
+            return self._forward(x, mod)
+
+
+class ModulatedSparseTransformerCrossBlock(nn.Module):
+    """
+    Sparse Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.
+
+    Only the self-attention and MLP branches are modulated; the
+    cross-attention branch uses a LayerNorm with learned affine parameters and
+    no gate.
+    """
+    # NOTE(review): attn_mode is annotated as "full"/"swin" here, while
+    # SparseMultiHeadAttention.forward branches on "full", "windowed" and
+    # "double_windowed" - confirm which values callers actually pass.
+    def __init__(
+        self,
+        channels: int,
+        ctx_channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "swin"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        rope_freq: Tuple[float, float] = (1.0, 10000.0),
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        qkv_bias: bool = True,
+        share_mod: bool = False,
+
+    ):
+        super().__init__()
+        # NOTE(review): use_checkpoint is stored but forward() below never reads it.
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        # norm2 feeds the unmodulated cross-attention, so it keeps its own affine parameters.
+        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
+        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.self_attn = SparseMultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            type="self",
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_window=shift_window,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            rope_freq=rope_freq,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.cross_attn = SparseMultiHeadAttention(
+            channels,
+            ctx_channels=ctx_channels,
+            num_heads=num_heads,
+            type="cross",
+            attn_mode="full",
+            qkv_bias=qkv_bias,
+            qk_rms_norm=qk_rms_norm_cross,
+        )
+        self.mlp = SparseFeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+        if not share_mod:
+            # Per-block projection from the conditioning vector to the six modulation chunks.
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(channels, 6 * channels, bias=True)
+            )
+        else:
+            # With shared modulation, the block only learns an additive offset
+            # to a modulation vector that is supplied by the caller.
+            self.modulation = nn.Parameter(torch.randn(6 * channels) / channels ** 0.5)
+
+    def _forward(self, x: SparseTensor, mod: torch.Tensor, context: Union[torch.Tensor, VarLenTensor]) -> SparseTensor:
+        """Apply self-attention, cross-attention and MLP, each with a residual connection."""
+        if self.share_mod:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.modulation + mod).type(mod.dtype).chunk(6, dim=1)
+        else:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
+        # Modulated self-attention branch.
+        h = x.replace(self.norm1(x.feats))
+        h = h * (1 + scale_msa) + shift_msa
+        h = self.self_attn(h)
+        h = h * gate_msa
+        x = x + h
+        # Cross-attention branch: plain norm and residual, no modulation or gate.
+        h = x.replace(self.norm2(x.feats))
+        h = self.cross_attn(h, context)
+        x = x + h
+        # Modulated MLP branch.
+        h = x.replace(self.norm3(x.feats))
+        h = h * (1 + scale_mlp) + shift_mlp
+        h = self.mlp(h)
+        h = h * gate_mlp
+        x = x + h
+        return x
+
+    def forward(self, x: SparseTensor, mod: torch.Tensor, context: Union[torch.Tensor, VarLenTensor]) -> SparseTensor:
+        """Run the block directly, without gradient checkpointing."""
+        return self._forward(x, mod, context)
+
+
+class SLatFlowModel(nn.Module):
+    """Stack of modulated sparse cross-attention transformer blocks.
+
+    The input is projected to ``model_channels``, passed through
+    ``num_blocks`` blocks conditioned on a timestep embedding and on ``cond``,
+    layer-normalised and projected to ``out_channels``.
+    """
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        pe_mode: Literal["ape", "rope"] = "rope",
+        rope_freq: Tuple[float, float] = (1.0, 10000.0),
+        use_checkpoint: bool = False,
+        share_mod: bool = False,
+        initialization: str = 'vanilla',
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        dtype = None,
+        device = None,
+        operations = None,
+    ):
+        # NOTE(review): device and operations are accepted but not used
+        # anywhere in this constructor; dtype is only stored for forward().
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        # Derive the head count from the per-head width when it is not given.
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.pe_mode = pe_mode
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.initialization = initialization
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        self.dtype = dtype
+
+        self.t_embedder = TimestepEmbedder(model_channels)
+        if share_mod:
+            # One modulation projection shared by all blocks; each block then
+            # adds its own learned offset.
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+
+        self.input_layer = SparseLinear(in_channels, model_channels)
+
+        self.blocks = nn.ModuleList([
+            ModulatedSparseTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                rope_freq=rope_freq,
+                share_mod=self.share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+            )
+            for _ in range(num_blocks)
+        ])
+
+        self.out_layer = SparseLinear(model_channels, out_channels)
+
+    @property
+    def device(self) -> torch.device:
+        """Device of the first parameter of the model."""
+        return next(self.parameters()).device
+
+    def forward(
+        self,
+        x: SparseTensor,
+        t: torch.Tensor,
+        cond: Union[torch.Tensor, List[torch.Tensor]],
+        concat_cond: Optional[SparseTensor] = None,
+        **kwargs
+    ) -> SparseTensor:
+        """Run the flow model on a sparse input.
+
+        Args:
+            x: sparse input tensor.
+            t: timesteps passed to the timestep embedder.
+            cond: conditioning tensor, or a list of tensors that is packed
+                into a VarLenTensor.
+            concat_cond: optional sparse tensor concatenated to ``x`` along
+                the last axis before the input projection.
+            **kwargs: accepted and ignored.
+
+        Returns:
+            The sparse output of the final projection.
+        """
+        if concat_cond is not None:
+            x = sparse_cat([x, concat_cond], dim=-1)
+        if isinstance(cond, list):
+            cond = VarLenTensor.from_tensor_list(cond)
+
+        h = self.input_layer(x)
+        h = manual_cast(h, self.dtype)
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = manual_cast(t_emb, self.dtype)
+        cond = manual_cast(cond, self.dtype)
+
+        # NOTE(review): self.pos_embedder is never assigned in __init__, so
+        # pe_mode == "ape" would raise AttributeError here unless it is set
+        # elsewhere - confirm.
+        if self.pe_mode == "ape":
+            pe = self.pos_embedder(h.coords[:, 1:])
+            h = h + manual_cast(pe, self.dtype)
+        for block in self.blocks:
+            h = block(h, t_emb, cond)
+
+        # Return to the input dtype before the final, parameter-free layer norm.
+        h = manual_cast(h, x.dtype)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h)
+        return h
+
+class Trellis2(nn.Module):
+    """Container holding the two SLatFlowModel instances of Trellis2.
+
+    ``img2shape`` takes ``in_channels`` input channels and ``shape2txt`` takes
+    twice as many; all other hyper-parameters are shared between them.
+    """
+    def __init__(self, resolution,
+                 in_channels = 32,
+                 out_channels = 32,
+                 model_channels = 1536,
+                 cond_channels = 1024,
+                 num_blocks = 30,
+                 num_heads = 12,
+                 mlp_ratio = 5.3334,
+                 share_mod = True,
+                 qk_rms_norm = True,
+                 qk_rms_norm_cross = True,
+                 dtype=None, device=None, operations=None):
+        # nn.Module must be initialised before submodules can be assigned.
+        super().__init__()
+        args = {
+            "out_channels":out_channels, "num_blocks":num_blocks, "cond_channels" :cond_channels,
+            "model_channels":model_channels, "num_heads":num_heads, "mlp_ratio": mlp_ratio, "share_mod": share_mod,
+            "qk_rms_norm": qk_rms_norm, "qk_rms_norm_cross": qk_rms_norm_cross, "device": device, "dtype": dtype, "operations": operations
+        }
+        # TODO: update the names/checkpoints
+        # The shared settings are passed as keyword arguments (**args); a
+        # single star would pass the dict's keys as positional arguments.
+        self.img2shape = SLatFlowModel(resolution, in_channels=in_channels, **args)
+        self.shape2txt = SLatFlowModel(resolution, in_channels=in_channels*2, **args)
+        self.shape_generation = True
+
+    def forward(self, x, timestep, context):
+        # Not implemented yet: currently returns None.
+        pass
--- a/comfy/ldm/trellis2/vae.py
+++ b/comfy/ldm/trellis2/vae.py
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -50,6 +50,7 @@ import comfy.ldm.omnigen.omnigen2
 import comfy.ldm.qwen_image.model
 import comfy.ldm.kandinsky5.model
 import comfy.ldm.anima.model
+import comfy.ldm.ace.ace_step15

 import comfy.model_management
 import comfy.patcher_extension
@ -1540,6 +1541,47 @@ class ACEStep(BaseModel):
        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
        return out

+class ACEStep15(BaseModel):
+    """BaseModel wrapper for the ACE-Step 1.5 model (AceStepConditionGenerationModel)."""
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.ace_step15.AceStepConditionGenerationModel)
+
+    def extra_conds(self, **kwargs):
+        """Collect the extra conditioning entries for this model.
+
+        Reads ``cross_attn``, ``conditioning_lyrics``,
+        ``reference_audio_timbre_latents`` and ``audio_codes`` from ``kwargs``
+        and requires ``kwargs["device"]``.
+
+        Returns:
+            The dict from the parent class extended with ``c_crossattn``,
+            ``lyric_embed``, ``refer_audio`` and ``audio_codes`` where available.
+        """
+        out = super().extra_conds(**kwargs)
+        device = kwargs["device"]
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
+        # Guard on the value that is actually wrapped, so a missing lyric
+        # conditioning never produces CONDRegular(None).
+        if conditioning_lyrics is not None:
+            out['lyric_embed'] = comfy.conds.CONDRegular(conditioning_lyrics)
+
+        refer_audio = kwargs.get("reference_audio_timbre_latents", None)
+        if refer_audio is None or len(refer_audio) == 0:
+            # No reference audio given: use a fixed 64-value vector, moved to
+            # the channel axis and repeated 750 times along the last axis.
+            refer_audio = torch.tensor([[[-1.3672e-01, -1.5820e-01,  5.8594e-01, -5.7422e-01,  3.0273e-02,
+                                        2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
+                                        -2.7710e-02, -1.8066e-01, -2.9688e-01,  1.6016e+00, -2.6719e+00,
+                                        7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
+                                        2.4316e-01,  4.7266e-01,  4.6387e-02, -6.6406e-01, -2.1973e-01,
+                                        -6.7578e-01, -1.5723e-01,  9.5312e-01, -2.0020e-01, -1.7109e+00,
+                                        5.8984e-01, -5.7422e-01,  5.1562e-01,  2.8320e-01,  1.4551e-01,
+                                        -1.8750e-01, -5.9814e-02,  3.6719e-01, -1.0059e-01, -1.5723e-01,
+                                        2.0605e-01, -4.3359e-01, -8.2812e-01,  4.5654e-02, -6.6016e-01,
+                                        1.4844e-01,  9.4727e-02,  3.8477e-01, -1.2578e+00, -3.3203e-01,
+                                        -8.5547e-01,  4.3359e-01,  4.2383e-01, -8.9453e-01, -5.0391e-01,
+                                        -5.6152e-02, -2.9219e+00, -2.4658e-02,  5.0391e-01,  9.8438e-01,
+                                        7.2754e-02, -2.1582e-01,  6.3672e-01,  1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, 750)
+        else:
+            # Several references may be supplied; only the last one is used.
+            refer_audio = refer_audio[-1]
+        out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
+
+        audio_codes = kwargs.get("audio_codes", None)
+        if audio_codes is not None:
+            out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
+
+        return out
+
 class Omnigen2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel)
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -655,6 +655,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.')
        return dit_config

+    if '{}encoder.lyric_encoder.layers.0.input_layernorm.weight'.format(key_prefix) in state_dict_keys:
+        dit_config = {}
+        dit_config["audio_model"] = "ace1.5"
+        return dit_config
+
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None

--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -59,6 +59,7 @@ import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.jina_clip_2
 import comfy.text_encoders.newbie
 import comfy.text_encoders.anima
+import comfy.text_encoders.ace15

 import comfy.model_patcher
 import comfy.lora
@ -452,6 +453,8 @@ class VAE:
        self.extra_1d_channel = None
        self.crop_input = True

+        self.audio_sample_rate = 44100
+
        if config is None:
            if "decoder.mid.block_1.mix_factor" in sd:
                encoder_config = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
@ -549,14 +552,25 @@ class VAE:
                                                                    encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
                                                                    decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
            elif "decoder.layers.1.layers.0.beta" in sd:
-                self.first_stage_model = AudioOobleckVAE()
+                config = {}
+                param_key = None
+                if "decoder.layers.2.layers.1.weight_v" in sd:
+                    param_key = "decoder.layers.2.layers.1.weight_v"
+                if "decoder.layers.2.layers.1.parametrizations.weight.original1" in sd:
+                    param_key = "decoder.layers.2.layers.1.parametrizations.weight.original1"
+                if param_key is not None:
+                    if sd[param_key].shape[-1] == 12:
+                        config["strides"] = [2, 4, 4, 6, 10]
+                        self.audio_sample_rate = 48000
+
+                self.first_stage_model = AudioOobleckVAE(**config)
                self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype)
                self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * 2048) * model_management.dtype_size(dtype)
                self.latent_channels = 64
                self.output_channels = 2
                self.pad_channel_value = "replicate"
                self.upscale_ratio = 2048
-                self.downscale_ratio =  2048
+                self.downscale_ratio = 2048
                self.latent_dim = 1
                self.process_output = lambda audio: audio
                self.process_input = lambda audio: audio
@ -1427,6 +1441,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                clip_data_jina = clip_data[0]
            tokenizer_data["gemma_spiece_model"] = clip_data_gemma.get("spiece_model", None)
            tokenizer_data["jina_spiece_model"] = clip_data_jina.get("spiece_model", None)
+        elif clip_type == CLIPType.ACE:
+            clip_target.clip = comfy.text_encoders.ace15.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.ace15.ACE15Tokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@ -155,6 +155,8 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        self.execution_device = options.get("execution_device", self.execution_device)
        if isinstance(self.layer, list) or self.layer == "all":
            pass
+        elif isinstance(layer_idx, list):
+            self.layer = layer_idx
        elif layer_idx is None or abs(layer_idx) > self.num_layers:
            self.layer = "last"
        else:
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -24,6 +24,7 @@ import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image
 import comfy.text_encoders.anima
+import comfy.text_encoders.ace15

 from . import supported_models_base
 from . import latent_formats
@ -1596,6 +1597,38 @@ class Kandinsky5Image(Kandinsky5):
        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))


-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
+class ACEStep15(supported_models_base.BASE):
+    # Model definition matched when the detected unet config has audio_model == "ace1.5".
+    unet_config = {"audio_model": "ace1.5"}
+    unet_extra_config = {}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 3.0}
+
+    latent_format = comfy.latent_formats.ACEAudio15
+    memory_usage_factor = 4.7
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        # state_dict and prefix are not needed to build this model.
+        return model_base.ACEStep15(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        # Detect the text-encoder settings from the weights under the first text encoder prefix.
+        te_prefix = self.text_encoder_key_prefix[0]
+        transformer_prefix = "{}qwen3_2b.transformer.".format(te_prefix)
+        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, transformer_prefix)
+        text_encoder = comfy.text_encoders.ace15.te(**detected)
+        return supported_models_base.ClipTarget(comfy.text_encoders.ace15.ACE15Tokenizer, text_encoder)
+
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]

 models += [SVD_img2vid]
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@ -0,0 +1,223 @@
+from .anima import Qwen3Tokenizer
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+import torch
+import math
+import comfy.utils
+
+
+def sample_manual_loop_no_classes(
+    model,
+    ids=None,
+    paddings=[],
+    execution_dtype=None,
+    cfg_scale: float = 2.0,
+    temperature: float = 0.85,
+    top_p: float = 0.9,
+    top_k: int = None,
+    seed: int = 1,
+    min_tokens: int = 1,
+    max_new_tokens: int = 2048,
+    audio_start_id: int = 151669,  # The cutoff ID for audio codes
+    eos_token_id: int = 151645,
+):
+    """Autoregressively sample audio-code tokens with classifier-free guidance.
+
+    The batch is expected to hold two rows: row 0 is the conditional prompt
+    and row 1 the unconditional one; their last-position logits are mixed
+    with ``cfg_scale`` before sampling a single token per step.
+
+    Args:
+        model: wrapper exposing ``execution_device``, ``process_tokens`` and
+            ``transformer`` (with ``logits`` and ``model.config``).
+        ids: token id lists passed to ``model.process_tokens``.
+        paddings: per-row number of leading positions to mask out. The
+            default list is only read, never mutated.
+        execution_dtype: compute dtype; when None, bf16 is used if the device
+            supports it, else fp32.
+        cfg_scale: guidance scale applied to (cond - uncond).
+        temperature: softmax temperature; ``<= 0`` selects argmax decoding.
+        top_p: nucleus threshold; skipped when None or ``>= 1.0``.
+        top_k: keep only the k best logits; skipped when None or ``<= 0``.
+        seed: seed for the sampling generator.
+        min_tokens: EOS stays masked until ``step`` exceeds this value; it
+            also sizes the initial key/value buffers.
+        max_new_tokens: upper bound on generated tokens.
+        audio_start_id: ids below this value are masked out (except EOS).
+        eos_token_id: token id that ends generation.
+
+    Returns:
+        list[int]: sampled token ids with ``audio_start_id`` subtracted.
+    """
+    device = model.execution_device
+
+    if execution_dtype is None:
+        # NOTE(review): comfy.model_management is not imported explicitly in
+        # this module's visible imports - presumably loaded transitively.
+        if comfy.model_management.should_use_bf16(device):
+            execution_dtype = torch.bfloat16
+        else:
+            execution_dtype = torch.float32
+
+    embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
+    # Rebuild each row's mask from its padding count: the first t positions
+    # are ignored, everything after them is attended to.
+    for i, t in enumerate(paddings):
+        attention_mask[i, :t] = 0
+        attention_mask[i, t:] = 1
+
+    output_audio_codes = []
+    past_key_values = []
+    generator = torch.Generator(device=device)
+    generator.manual_seed(seed)
+    model_config = model.transformer.model.config
+
+    # One (key, value, 0) entry per layer, pre-sized to prompt length + min_tokens.
+    # NOTE(review): the trailing 0 is presumably the filled length of the cache - confirm.
+    for x in range(model_config.num_hidden_layers):
+        past_key_values.append((torch.empty([embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim], device=device, dtype=execution_dtype), torch.empty([embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim], device=device, dtype=execution_dtype), 0))
+
+    progress_bar = comfy.utils.ProgressBar(max_new_tokens)
+
+    for step in range(max_new_tokens):
+        outputs = model.transformer(None, attention_mask, embeds=embeds.to(execution_dtype), num_tokens=num_tokens, intermediate_output=None, dtype=execution_dtype, embeds_info=embeds_info, past_key_values=past_key_values)
+        next_token_logits = model.transformer.logits(outputs[0])[:, -1]
+        # The transformer hands back the updated cache as its third output.
+        past_key_values = outputs[2]
+
+        # Classifier-free guidance between the two batch rows.
+        cond_logits = next_token_logits[0:1]
+        uncond_logits = next_token_logits[1:2]
+        cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
+
+        # Save the EOS score before the non-audio range is masked so it can
+        # be restored below once enough tokens have been produced.
+        if eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step:
+            eos_score = cfg_logits[:, eos_token_id].clone()
+
+        remove_logit_value = torch.finfo(cfg_logits.dtype).min
+        # Only generate audio tokens
+        cfg_logits[:, :audio_start_id] = remove_logit_value
+
+        if eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step:
+            cfg_logits[:, eos_token_id] = eos_score
+
+        if top_k is not None and top_k > 0:
+            top_k_vals, _ = torch.topk(cfg_logits, top_k)
+            min_val = top_k_vals[..., -1, None]
+            cfg_logits[cfg_logits < min_val] = remove_logit_value
+
+        if top_p is not None and top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True)
+            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+            sorted_indices_to_remove = cumulative_probs > top_p
+            # Shift right by one so the token that crosses the threshold is
+            # kept, and never drop the most likely token.
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = 0
+            # Map the removal mask from sorted order back to vocabulary order.
+            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+            cfg_logits[indices_to_remove] = remove_logit_value
+
+        if temperature > 0:
+            cfg_logits = cfg_logits / temperature
+            next_token = torch.multinomial(torch.softmax(cfg_logits, dim=-1), num_samples=1, generator=generator).squeeze(1)
+        else:
+            next_token = torch.argmax(cfg_logits, dim=-1)
+
+        token = next_token.item()
+
+        if token == eos_token_id:
+            break
+
+        # Feed only the new token next step, duplicated for both CFG rows,
+        # and extend the attention mask by one attended position.
+        embed, _, _, _ = model.process_tokens([[token]], device)
+        embeds = embed.repeat(2, 1, 1)
+        attention_mask = torch.cat([attention_mask, torch.ones((2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
+
+        output_audio_codes.append(token - audio_start_id)
+        progress_bar.update_absolute(step)
+
+    return output_audio_codes
+
+
+def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0):
+    """Sample audio codes from the positive/negative LM prompts.
+
+    Only the first token sequence of each prompt is used. The shorter of the
+    two sequences is left-padded with the model's pad token so both have the
+    same length; the pad counts are handed to the sampler as
+    ``[positive_pad, negative_pad]``.
+    """
+    pos_ids = [[tok for tok, _ in seq] for seq in positive][0]
+    neg_ids = [[tok for tok, _ in seq] for seq in negative][0]
+
+    pos_pad = 0
+    neg_pad = 0
+    length_gap = len(pos_ids) - len(neg_ids)
+    if length_gap > 0:
+        # Negative prompt is shorter: pad it on the left.
+        neg_pad = length_gap
+        neg_ids = [model.special_tokens["pad"]] * neg_pad + neg_ids
+    elif length_gap < 0:
+        # Positive prompt is shorter: pad it on the left.
+        pos_pad = -length_gap
+        pos_ids = [model.special_tokens["pad"]] * pos_pad + pos_ids
+
+    return sample_manual_loop_no_classes(
+        model,
+        [pos_ids, neg_ids],
+        [pos_pad, neg_pad],
+        cfg_scale=2.0,
+        seed=seed,
+        min_tokens=min_tokens,
+        max_new_tokens=max_tokens,
+    )
+
+
+class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
+    # Tokenizer for ACE 1.5: one Qwen3 tokenizer (registered as "qwen3_06b")
+    # is used to build every prompt consumed by the text-encoder bundle.
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_06b", tokenizer=Qwen3Tokenizer)
+
+    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
+        """Build all token sets needed for ACE 1.5 conditioning.
+
+        Optional kwargs (with defaults): lyrics (""), bpm (120),
+        duration (120), keyscale ("C major"), timesignature (2),
+        language ("en"), seed (0).
+
+        Returns a dict with the keys "lm_prompt", "lm_prompt_negative",
+        "lyrics", "qwen3_06b" and "lm_metadata".
+        """
+        out = {}
+        lyrics = kwargs.get("lyrics", "")
+        bpm = kwargs.get("bpm", 120)
+        duration = kwargs.get("duration", 120)
+        keyscale = kwargs.get("keyscale", "C major")
+        timesignature = kwargs.get("timesignature", 2)
+        language = kwargs.get("language", "en")
+        seed = kwargs.get("seed", 0)
+
+        # Fractional durations are rounded up to whole units before use.
+        duration = math.ceil(duration)
+        meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
+        lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n<think>\n{}\n</think>\n\n<|im_end|>\n"
+
+        meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
+        # The negative LM prompt is the same template with an empty <think> section.
+        out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, meta_lm), disable_weights=True)
+        out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, ""), disable_weights=True)
+
+        out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
+        out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
+        # NOTE(review): duration * 5 presumably means five audio codes per second - confirm.
+        out["lm_metadata"] = {"min_tokens": duration * 5, "seed": seed}
+        return out
+
+
+class Qwen3_06BModel(sd1_clip.SDClipModel):
+    """SDClipModel wrapper around the Qwen3 0.6B ACE 1.5 text model."""
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"pad": 151643},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Qwen3_06B_ACE15,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+class Qwen3_2B_ACE15(sd1_clip.SDClipModel):
+    """SDClipModel wrapper around the Qwen3 2B ACE 1.5 language model."""
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        # Re-expose the llama-specific quantization metadata under the generic
+        # key, working on a copy so the caller's dict is left untouched.
+        quant_meta = model_options.get("llama_quantization_metadata", None)
+        if quant_meta is not None:
+            model_options = model_options.copy()
+            model_options["quantization_metadata"] = quant_meta
+
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"pad": 151643},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Qwen3_2B_ACE15_lm,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+class ACE15TEModel(torch.nn.Module):
+    """Text-encoder bundle for ACE 1.5: a Qwen3 0.6B encoder plus a Qwen3 2B audio-code LM."""
+
+    def __init__(self, device="cpu", dtype=None, dtype_llama=None, model_options={}):
+        super().__init__()
+        # The LM uses the encoder dtype unless one is given explicitly.
+        lm_dtype = dtype if dtype_llama is None else dtype_llama
+
+        self.qwen3_06b = Qwen3_06BModel(device=device, dtype=dtype, model_options=model_options)
+        self.qwen3_2b = Qwen3_2B_ACE15(device=device, dtype=lm_dtype, model_options=model_options)
+        self.dtypes = {dtype, lm_dtype}
+
+    def encode_token_weights(self, token_weight_pairs):
+        """Encode caption and lyrics and sample audio codes from the LM.
+
+        Returns ``(caption_output, None, extras)`` where ``extras`` carries
+        "conditioning_lyrics" and "audio_codes".
+        """
+        caption_tokens = token_weight_pairs["qwen3_06b"]
+        lyric_tokens = token_weight_pairs["lyrics"]
+        encoder = self.qwen3_06b
+
+        # The caption is encoded with the "layer" option set to None, the
+        # lyrics with it set to [0].
+        encoder.set_clip_options({"layer": None})
+        base_out, _, _ = encoder.encode_token_weights(caption_tokens)
+        encoder.set_clip_options({"layer": [0]})
+        lyrics_embeds, _, _ = encoder.encode_token_weights(lyric_tokens)
+
+        lm_metadata = token_weight_pairs["lm_metadata"]
+        # min_tokens is used as both lower and upper bound for generation.
+        audio_codes = generate_audio_codes(
+            self.qwen3_2b,
+            token_weight_pairs["lm_prompt"],
+            token_weight_pairs["lm_prompt_negative"],
+            min_tokens=lm_metadata["min_tokens"],
+            max_tokens=lm_metadata["min_tokens"],
+            seed=lm_metadata["seed"],
+        )
+
+        extras = {"conditioning_lyrics": lyrics_embeds[:, 0], "audio_codes": [audio_codes]}
+        return base_out, None, extras
+
+    def set_clip_options(self, options):
+        """Forward the options to both wrapped models."""
+        for sub_model in (self.qwen3_06b, self.qwen3_2b):
+            sub_model.set_clip_options(options)
+
+    def reset_clip_options(self):
+        """Reset the options of both wrapped models."""
+        for sub_model in (self.qwen3_06b, self.qwen3_2b):
+            sub_model.reset_clip_options()
+
+    def load_sd(self, sd):
+        """Load a state dict into whichever sub-model it belongs to.
+
+        The size of the first post-attention layernorm weight picks the
+        target: 1024 goes to the 0.6B encoder, anything else to the 2B LM.
+        Returns None when that key is absent.
+        """
+        probe_key = "model.layers.0.post_attention_layernorm.weight"
+        if probe_key not in sd:
+            return None
+        if sd[probe_key].shape[0] == 1024:
+            return self.qwen3_06b.load_sd(sd)
+        return self.qwen3_2b.load_sd(sd)
+
+    def memory_estimation_function(self, token_weight_pairs, device=None):
+        """Estimate memory in bytes from LM prompt length plus tokens to generate."""
+        lm_metadata = token_weight_pairs["lm_metadata"]
+        constant = 0.4375
+        if comfy.model_management.should_use_bf16(device):
+            constant *= 0.5
+
+        prompt_sequences = token_weight_pairs.get("lm_prompt", [])
+        num_tokens = sum(len(seq) for seq in prompt_sequences)
+        num_tokens += lm_metadata['min_tokens']
+        return num_tokens * constant * 1024 * 1024
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    """Return an ACE15TEModel subclass with the LM dtype and quantization metadata bound."""
+    class ACE15TEModel_(ACE15TEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            options = model_options
+            if llama_quantization_metadata is not None:
+                # Copy first so the caller's options dict is not modified.
+                options = model_options.copy()
+                options["llama_quantization_metadata"] = llama_quantization_metadata
+            super().__init__(device=device, dtype_llama=dtype_llama, dtype=dtype, model_options=options)
+
+    return ACE15TEModel_
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@ -6,6 +6,7 @@ import math

 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.model_management
+import comfy.ops
 import comfy.ldm.common_dit
 import comfy.clip_model

@ -103,6 +104,52 @@ class Qwen3_06BConfig:
    final_norm: bool = True
    lm_head: bool = False

+# Configuration for the Qwen3 0.6B text model used by ACE 1.5.
+# Annotated names are dataclass fields and can be overridden through the
+# config dict passed to the constructor; un-annotated names are plain class
+# attributes and are not constructor parameters.
+@dataclass
+class Qwen3_06B_ACE15_Config:
+    vocab_size: int = 151669
+    hidden_size: int = 1024
+    intermediate_size: int = 3072
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 32768
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    # Class-level constants (no annotation, so not dataclass fields).
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+    lm_head: bool = False
+
+# Configuration for the Qwen3 2B language model used by ACE 1.5.
+# Annotated names are dataclass fields and can be overridden through the
+# config dict passed to the constructor; un-annotated names are plain class
+# attributes and are not constructor parameters.
+@dataclass
+class Qwen3_2B_ACE15_lm_Config:
+    vocab_size: int = 217204
+    hidden_size: int = 2048
+    intermediate_size: int = 6144
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 40960
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    # Class-level constants (no annotation, so not dataclass fields).
+    head_dim = 128
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True
+    # No separate output head: logits are computed from the embedding
+    # weights in Qwen3_2B_ACE15_lm.logits.
+    lm_head: bool = False
+
@dataclass
 class Qwen3_4BConfig:
    vocab_size: int = 151936
@ -581,10 +628,10 @@ class Llama2_(nn.Module):
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
+            mask = mask.masked_fill(mask.to(torch.bool), torch.finfo(x.dtype).min)

        if seq_len > 1:
-            causal_mask = torch.empty(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
+            causal_mask = torch.empty(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device).fill_(torch.finfo(x.dtype).min).triu_(1)
            if mask is not None:
                mask += causal_mask
            else:
@ -729,6 +776,39 @@ class Qwen3_06B(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

+class Qwen3_06B_ACE15(BaseLlama, torch.nn.Module):
+    """Qwen3 0.6B transformer built from Qwen3_06B_ACE15_Config."""
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        cfg = Qwen3_06B_ACE15_Config(**config_dict)
+
+        self.num_layers = cfg.num_hidden_layers
+        self.dtype = dtype
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+class Qwen3_2B_ACE15_lm(BaseLlama, torch.nn.Module):
+    """Qwen3 2B transformer built from Qwen3_2B_ACE15_lm_Config, with a logits helper."""
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        cfg = Qwen3_2B_ACE15_lm_Config(**config_dict)
+
+        self.num_layers = cfg.num_hidden_layers
+        self.dtype = dtype
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+    def logits(self, x):
+        """Project the last position of ``x`` onto the vocabulary.
+
+        The token-embedding matrix is reused as the output projection. When
+        the embedding module casts its weights on the fly, the weight is
+        obtained through comfy.ops and released again afterwards.
+        """
+        last_hidden = x[:, -1:]
+        embed = self.model.embed_tokens
+
+        offload_stream = None
+        if embed.comfy_cast_weights:
+            weight, _, offload_stream = comfy.ops.cast_bias_weight(embed, last_hidden, offloadable=True)
+        else:
+            weight = embed.weight.to(x)
+
+        projected = torch.nn.functional.linear(last_hidden, weight, None)
+
+        comfy.ops.uncast_bias_weight(embed, weight, None, offload_stream)
+        return projected
+
 class Qwen3_4B(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@ -28,12 +28,39 @@ class TextEncodeAceStepAudio(io.ComfyNode):
        conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
        return io.NodeOutput(conditioning)

+class TextEncodeAceStepAudio15(io.ComfyNode):
+    """Node that tokenizes tags, lyrics and music metadata and encodes them to conditioning."""
+    @classmethod
+    def define_schema(cls):
+        languages = ["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]
+        roots = ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]
+        # All major keys first, then all minor keys.
+        keyscales = [f"{root} {quality}" for quality in ["major", "minor"] for root in roots]
+
+        return io.Schema(
+            node_id="TextEncodeAceStepAudio1.5",
+            category="conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("tags", multiline=True, dynamic_prompts=True),
+                io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
+                io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
+                io.Int.Input("bpm", default=120, min=10, max=300),
+                io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
+                io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
+                io.Combo.Input("language", options=languages),
+                io.Combo.Input("keyscale", options=keyscales),
+            ],
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale) -> io.NodeOutput:
+        # The time signature arrives as a combo string and is passed on as an int.
+        tokens = clip.tokenize(
+            tags,
+            lyrics=lyrics,
+            bpm=bpm,
+            duration=duration,
+            timesignature=int(timesignature),
+            language=language,
+            keyscale=keyscale,
+            seed=seed,
+        )
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
+

 class EmptyAceStepLatentAudio(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="EmptyAceStepLatentAudio",
+            display_name="Empty Ace Step 1.0 Latent Audio",
            category="latent/audio",
            inputs=[
                io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
@ -51,12 +78,60 @@ class EmptyAceStepLatentAudio(io.ComfyNode):
        return io.NodeOutput({"samples": latent, "type": "audio"})


+class EmptyAceStep15LatentAudio(io.ComfyNode):
+    """Node producing an all-zero audio latent for ACE Step 1.5."""
+    @classmethod
+    def define_schema(cls):
+        seconds_input = io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01)
+        batch_input = io.Int.Input(
+            "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
+        )
+        return io.Schema(
+            node_id="EmptyAceStep1.5LatentAudio",
+            display_name="Empty Ace Step 1.5 Latent Audio",
+            category="latent/audio",
+            inputs=[seconds_input, batch_input],
+            outputs=[io.Latent.Output()],
+        )
+
+    @classmethod
+    def execute(cls, seconds, batch_size) -> io.NodeOutput:
+        # Latent length: seconds * 48000 / 1920, rounded to the nearest integer.
+        frames = round(seconds * 48000 / 1920)
+        target_device = comfy.model_management.intermediate_device()
+        zeros = torch.zeros([batch_size, 64, frames], device=target_device)
+        return io.NodeOutput({"samples": zeros, "type": "audio"})
+
+class ReferenceTimbreAudio(io.ComfyNode):
+    """Node attaching a reference-audio latent (for timbre) to a conditioning."""
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ReferenceTimbreAudio",
+            category="advanced/conditioning/audio",
+            is_experimental=True,
+            description="This node sets the reference audio for timbre (for ace step 1.5)",
+            inputs=[
+                io.Conditioning.Input("conditioning"),
+                io.Latent.Input("latent", optional=True),
+            ],
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, conditioning, latent=None) -> io.NodeOutput:
+        # Without a latent the conditioning passes through unchanged.
+        if latent is None:
+            return io.NodeOutput(conditioning)
+
+        extra_values = {"reference_audio_timbre_latents": [latent["samples"]]}
+        updated = node_helpers.conditioning_set_values(conditioning, extra_values, append=True)
+        return io.NodeOutput(updated)
+
 class AceExtension(ComfyExtension):
+    # Extension object listing the ACE Step node classes defined in this module.
     @override
     async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        # Returns the node classes provided by this extension.
         return [
             TextEncodeAceStepAudio,
             EmptyAceStepLatentAudio,
+            TextEncodeAceStepAudio15,
+            EmptyAceStep15LatentAudio,
+            ReferenceTimbreAudio,
         ]

 async def comfy_entrypoint() -> AceExtension:
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@ -82,13 +82,14 @@ class VAEEncodeAudio(IO.ComfyNode):
    @classmethod
    def execute(cls, vae, audio) -> IO.NodeOutput:
        sample_rate = audio["sample_rate"]
-        if 44100 != sample_rate:
-            waveform = torchaudio.functional.resample(audio["waveform"], sample_rate, 44100)
+        vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+        if vae_sample_rate != sample_rate:
+            waveform = torchaudio.functional.resample(audio["waveform"], sample_rate, vae_sample_rate)
        else:
            waveform = audio["waveform"]

        t = vae.encode(waveform.movedim(1, -1))
-        return IO.NodeOutput({"samples":t})
+        return IO.NodeOutput({"samples": t})

    encode = execute  # TODO: remove

@ -114,7 +115,8 @@ class VAEDecodeAudio(IO.ComfyNode):
        std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0
        std[std < 1.0] = 1.0
        audio /= std
-        return IO.NodeOutput({"waveform": audio, "sample_rate": 44100 if "sample_rate" not in samples else samples["sample_rate"]})
+        vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+        return IO.NodeOutput({"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]})

    decode = execute  # TODO: remove

--- a/comfy_extras/nodes_trellis2.py
+++ b/comfy_extras/nodes_trellis2.py
@ -0,0 +1,274 @@
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, IO
+import torch
+from comfy.ldm.trellis2.model import SparseTensor
+import comfy.model_management
+from PIL import Image
+import PIL
+import numpy as np
+
+# Per-channel statistics (32 values each) used by the shape decode node to
+# de-normalize latents as ``samples * std + mean``. The trailing ``[None]``
+# adds a leading axis so each tensor has shape (1, 32).
+shape_slat_normalization = {
+    "mean": torch.tensor([
+        0.781296, 0.018091, -0.495192, -0.558457, 1.060530, 0.093252, 1.518149, -0.933218,
+        -0.732996, 2.604095, -0.118341, -2.143904, 0.495076, -2.179512, -2.130751, -0.996944,
+        0.261421, -2.217463, 1.260067, -0.150213, 3.790713, 1.481266, -1.046058, -1.523667,
+        -0.059621, 2.220780, 1.621212, 0.877230, 0.567247, -3.175944, -3.186688, 1.578665
+    ])[None],
+    "std": torch.tensor([
+        5.972266, 4.706852, 5.445010, 5.209927, 5.320220, 4.547237, 5.020802, 5.444004,
+        5.226681, 5.683095, 4.831436, 5.286469, 5.652043, 5.367606, 5.525084, 4.730578,
+        4.805265, 5.124013, 5.530808, 5.619001, 5.103930, 5.417670, 5.269677, 5.547194,
+        5.634698, 5.235274, 6.110351, 5.511298, 6.237273, 4.879207, 5.347008, 5.405691
+    ])[None]
+}
+
+# Same layout as above, used by the texture decode node.
+tex_slat_normalization = {
+    "mean": torch.tensor([
+        3.501659, 2.212398, 2.226094, 0.251093, -0.026248, -0.687364, 0.439898, -0.928075,
+        0.029398, -0.339596, -0.869527, 1.038479, -0.972385, 0.126042, -1.129303, 0.455149,
+        -1.209521, 2.069067, 0.544735, 2.569128, -0.323407, 2.293000, -1.925608, -1.217717,
+        1.213905, 0.971588, -0.023631, 0.106750, 2.021786, 0.250524, -0.662387, -0.768862
+    ])[None],
+    "std": torch.tensor([
+        2.665652, 2.743913, 2.765121, 2.595319, 3.037293, 2.291316, 2.144656, 2.911822,
+        2.969419, 2.501689, 2.154811, 3.163343, 2.621215, 2.381943, 3.186697, 3.021588,
+        2.295916, 3.234985, 3.233086, 2.260140, 2.874801, 2.810596, 3.292720, 2.674999,
+        2.680878, 2.372054, 2.451546, 2.353556, 2.995195, 2.379849, 2.786195, 2.775190
+    ])[None]
+}
+
+def smart_crop_square(
+    image: torch.Tensor,
+    background_color=(128, 128, 128),
+):
+    """Center a (C, H, W) image on a square canvas of side max(H, W).
+
+    Despite the name nothing is cropped: the shorter side is padded with
+    ``background_color`` (one value per channel). The result keeps the
+    input's dtype and device.
+    """
+    channels, height, width = image.shape
+    side = max(height, width)
+
+    canvas = torch.empty((channels, side, side), dtype=image.dtype, device=image.device)
+    # Paint each channel plane with its background value.
+    for index, plane in enumerate(canvas):
+        plane.fill_(background_color[index])
+
+    y0 = (side - height) // 2
+    x0 = (side - width) // 2
+    canvas[:, y0:y0 + height, x0:x0 + width] = image
+    return canvas
+
+def run_conditioning(
+    model,
+    image: torch.Tensor,
+    include_1024: bool = True,
+    background_color: str = "black",
+):
+    """Encode one image with the vision model at 512 px and optionally 1024 px.
+
+    The image is converted to 8-bit, padded to a square with the chosen
+    background colour, resized with Lanczos filtering and fed to ``model.model``.
+
+    Args:
+        model: wrapper whose ``.model`` is callable on a list holding one
+            (1, 3, S, S) float tensor and exposes a writable ``image_size``.
+        image: float image in [0, 1]; only the first entry of a 4-D batch is
+            used. NOTE(review): assumed channel-last (H, W, C) as ComfyUI
+            IMAGE tensors are - confirm.
+        include_1024: also compute the 1024 px conditioning.
+        background_color: "black", "gray" or "white"; unknown names fall
+            back to gray.
+
+    Returns:
+        (conditioning, preprocessed) where ``conditioning`` has the keys
+        "cond_512", "neg_cond" (zeros shaped like cond_512) and, if
+        requested, "cond_1024"; ``preprocessed`` is the padded square image
+        as a (1, S, S, 3) float tensor in [0, 1].
+    """
+    # TODO: should check if normalization was applied in these steps
+    model = model.model
+    device = comfy.model_management.intermediate_device() # replaces .cpu()
+    torch_device = comfy.model_management.get_torch_device() # replaces .cuda()
+    bg_colors = {
+        "black": (0, 0, 0),
+        "gray": (128, 128, 128),
+        "white": (255, 255, 255),
+    }
+    bg_color = bg_colors.get(background_color, (128, 128, 128))
+
+    frame = image[0] if image.dim() == 4 else image
+    # torch tensors have no .astype(); .to() performs the dtype conversion.
+    frame = (frame * 255).clip(0, 255).to(torch.uint8)
+
+    # smart_crop_square works on channel-first data, so move channels to the
+    # front for padding and back again before handing the array to PIL.
+    square = smart_crop_square(frame.movedim(-1, 0), background_color=bg_color)
+    source = Image.fromarray(square.movedim(0, -1).contiguous().cpu().numpy()).convert('RGB')
+
+    def set_image_size(pil_img, image_size=512):
+        # Resize the padded source image and return a (1, 3, S, S) float tensor in [0, 1].
+        resized = pil_img.resize((image_size, image_size), Image.LANCZOS)
+        pixels = np.array(resized).astype(np.float32) / 255
+        return torch.from_numpy(pixels).permute(2, 0, 1).unsqueeze(0).to(torch_device)
+
+    model.image_size = 512
+    cond_512 = model([set_image_size(source, 512)])
+
+    cond_1024 = None
+    if include_1024:
+        model.image_size = 1024
+        # Resize from the padded source rather than from the 512 px result.
+        cond_1024 = model([set_image_size(source, 1024)])
+
+    neg_cond = torch.zeros_like(cond_512)
+
+    conditioning = {
+        'cond_512': cond_512.to(device),
+        'neg_cond': neg_cond.to(device),
+    }
+    if cond_1024 is not None:
+        conditioning['cond_1024'] = cond_1024.to(device)
+
+    preprocessed_tensor = torch.from_numpy(np.array(source).astype(np.float32) / 255.0).unsqueeze(0)
+
+    return conditioning, preprocessed_tensor
+
+class VaeDecodeShapeTrellis(IO.ComfyNode):
+    """Node decoding a shape latent into a mesh plus the sub-structures needed for texturing."""
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="VaeDecodeShapeTrellis",
+            category="latent/3d",
+            inputs=[
+                IO.Latent.Input("samples"),
+                IO.Vae.Input("vae"),
+                IO.Int.Input("resolution", tooltip="Shape Generation Resolution"),
+            ],
+            outputs=[
+                IO.Mesh.Output("mesh"),
+                IO.AnyType.Output("shape_subs"),
+            ]
+        )
+
+    @classmethod
+    def execute(cls, samples, vae, resolution):
+        # A LATENT input is a dict; the data lives under "samples", which is
+        # how the Trellis latent nodes in this module emit it.
+        latent = samples["samples"] if isinstance(samples, dict) else samples
+
+        # Undo the per-channel normalization before decoding.
+        std = shape_slat_normalization["std"]
+        mean = shape_slat_normalization["mean"]
+        latent = latent * std + mean
+
+        mesh, subs = vae.decode_shape_slat(resolution, latent)
+        # Wrap the outputs like the other nodes in this module do.
+        return IO.NodeOutput(mesh, subs)
+
+class VaeDecodeTextureTrellis(IO.ComfyNode):
+    """Node decoding a texture latent into a mesh, using the shape decoder's sub-structures."""
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="VaeDecodeTextureTrellis",
+            category="latent/3d",
+            inputs=[
+                IO.Latent.Input("samples"),
+                IO.Vae.Input("vae"),
+                IO.AnyType.Input("shape_subs"),
+            ],
+            outputs=[
+                IO.Mesh.Output("mesh"),
+            ]
+        )
+
+    @classmethod
+    def execute(cls, samples, vae, shape_subs):
+        if shape_subs is None:
+            raise ValueError("Shape subs must be provided for texture generation")
+
+        # A LATENT input is a dict; the data lives under "samples", which is
+        # how the Trellis latent nodes in this module emit it.
+        latent = samples["samples"] if isinstance(samples, dict) else samples
+
+        # Undo the per-channel normalization before decoding.
+        std = tex_slat_normalization["std"]
+        mean = tex_slat_normalization["mean"]
+        latent = latent * std + mean
+
+        mesh = vae.decode_tex_slat(latent, shape_subs)
+        # Wrap the output like the other nodes in this module do.
+        return IO.NodeOutput(mesh)
+
+class Trellis2Conditioning(IO.ComfyNode):
+    """Node turning an image into positive/negative conditioning via the vision model."""
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="Trellis2Conditioning",
+            category="conditioning/video_models",
+            inputs=[
+                IO.ClipVision.Input("clip_vision_model"),
+                IO.Image.Input("image"),
+                # Single-choice combo: run_conditioning looks the value up as
+                # one string key, which a multi-select list cannot satisfy.
+                IO.Combo.Input("background_color", options=["black", "gray", "white"], default="black")
+            ],
+            outputs=[
+                IO.Conditioning.Output(display_name="positive"),
+                IO.Conditioning.Output(display_name="negative"),
+            ]
+        )
+
+    @classmethod
+    def execute(cls, clip_vision_model, image, background_color) -> IO.NodeOutput:
+        # could make 1024 an option
+        conditioning, _ = run_conditioning(clip_vision_model, image, include_1024=True, background_color=background_color)
+        embeds = conditioning["cond_1024"] # should add that
+        # Conditioning entries are [tensor, options-dict] pairs, so the extra
+        # embedding goes into a dict rather than a set.
+        # NOTE(review): the "embeds" key name is a placeholder - align it
+        # with whatever key the consuming model reads.
+        positive = [[conditioning["cond_512"], {"embeds": embeds}]]
+        # run_conditioning stores the zero conditioning under "neg_cond".
+        negative = [[conditioning["neg_cond"], {"embeds": embeds}]]
+        return IO.NodeOutput(positive, negative)
+
+class EmptyShapeLatentTrellis2(IO.ComfyNode):
+    """Node creating a random sparse shape latent on the coordinates of a structure output."""
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            # Unique id: three nodes in this module previously shared
+            # "EmptyLatentTrellis2", so their registrations collided.
+            node_id="EmptyShapeLatentTrellis2",
+            category="latent/3d",
+            inputs=[
+                IO.Latent.Input("structure_output"),
+            ],
+            outputs=[
+                IO.Latent.Output(),
+            ]
+        )
+
+    @classmethod
+    def execute(cls, structure_output):
+        # i will see what i have to do here
+        # Unwrap the LATENT dict, then use the object's .coords when it has
+        # one and the object itself otherwise. The previous
+        # `structure_output or structure_output.coords` never reached .coords
+        # for a truthy input.
+        # NOTE(review): confirm what the structure stage actually emits.
+        source = structure_output["samples"] if isinstance(structure_output, dict) else structure_output
+        coords = getattr(source, "coords", source)
+        in_channels = 32
+        latent = SparseTensor(feats=torch.randn(coords.shape[0], in_channels), coords=coords)
+        return IO.NodeOutput({"samples": latent, "type": "trellis2"})
+
+class EmptyTextureLatentTrellis2(IO.ComfyNode):
+    """Node creating a random sparse texture latent on the coordinates of a shape latent."""
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            # Unique id: three nodes in this module previously shared
+            # "EmptyLatentTrellis2", so their registrations collided.
+            node_id="EmptyTextureLatentTrellis2",
+            category="latent/3d",
+            inputs=[
+                IO.Latent.Input("structure_output"),
+            ],
+            outputs=[
+                IO.Latent.Output(),
+            ]
+        )
+
+    @classmethod
+    def execute(cls, structure_output):
+        # TODO
+        # A LATENT input is a dict; the sparse tensor lives under "samples".
+        source = structure_output["samples"] if isinstance(structure_output, dict) else structure_output
+        in_channels = 32
+        # The new features fill the channels not already taken by the source.
+        latent = source.replace(feats=torch.randn(source.coords.shape[0], in_channels - source.feats.shape[1]))
+        return IO.NodeOutput({"samples": latent, "type": "trellis2"})
+
+class EmptyStructureLatentTrellis2(IO.ComfyNode):
+    """Node creating a random dense structure latent of shape (batch, 32, R, R, R)."""
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            # Unique id: three nodes in this module previously shared
+            # "EmptyLatentTrellis2", so their registrations collided.
+            node_id="EmptyStructureLatentTrellis2",
+            category="latent/3d",
+            inputs=[
+                # NOTE(review): the tensor below is cubic in this value, so the
+                # default of 3072 asks for an enormous allocation - confirm
+                # the intended unit/default.
+                IO.Int.Input("resolution", default=3072, min=1, max=8192),
+                IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
+            ],
+            outputs=[
+                IO.Latent.Output(),
+            ]
+        )
+    @classmethod
+    def execute(cls, resolution, batch_size):
+        # The parameter is named after the "resolution" input id; the other
+        # nodes in this module also name execute parameters after input ids.
+        in_channels = 32
+        latent = torch.randn(batch_size, in_channels, resolution, resolution, resolution)
+        return IO.NodeOutput({"samples": latent, "type": "trellis2"})
+
+
+class Trellis2Extension(ComfyExtension):
+    # Extension object listing the Trellis2 node classes defined in this module.
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        # Returns the node classes provided by this extension.
+        return [
+            Trellis2Conditioning,
+            EmptyShapeLatentTrellis2,
+            EmptyStructureLatentTrellis2,
+            EmptyTextureLatentTrellis2,
+            VaeDecodeTextureTrellis,
+            VaeDecodeShapeTrellis
+        ]
+
+
+# Module entry point: returns a fresh Trellis2Extension instance.
+async def comfy_entrypoint() -> Trellis2Extension:
+    return Trellis2Extension()
--- a/comfyui_version.py
+++ b/comfyui_version.py
@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.11.1"
+__version__ = "0.12.0"
--- a/nodes.py
+++ b/nodes.py
@ -1001,7 +1001,7 @@ class DualCLIPLoader:
    def INPUT_TYPES(s):
        return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                              "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image", "ltxv", "newbie"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image", "ltxv", "newbie", "ace"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
@ -2433,7 +2433,8 @@ async def init_builtin_extra_nodes():
        "nodes_image_compare.py",
        "nodes_zimage.py",
        "nodes_lora_debug.py",
-        "nodes_color.py"
+        "nodes_color.py",
+        "nodes_trellis2.py"
    ]

    import_failed = []
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.11.1"
+version = "0.12.0"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 comfyui-frontend-package==1.37.11
-comfyui-workflow-templates==0.8.27
+comfyui-workflow-templates==0.8.31
 comfyui-embedded-docs==0.4.0
 torch
 torchsde
Author	SHA1	Message	Date
Yousef R. Gamaleldin	2bee7d28db	Merge `f76e3a11b5` into `85fc35e8fa`	2026-02-03 19:22:17 +02:00
comfyanonymous	85fc35e8fa	Fix mac issue. (#12250 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details	2026-02-03 12:19:39 -05:00
comfyanonymous	223364743c	llama: cast logits as a comfy-weight (#12248 ) This is using a different layers weight with .to(). Change it to use the ops caster if the original layer is a comfy weight so that it picks up dynamic_vram and async_offload functionality in full. Co-authored-by: Rattus <rattus128@gmail.com>	2026-02-03 11:31:36 -05:00
comfyanonymous	affe881354	Fix some issues with mac. (#12247 )	2026-02-03 11:07:04 -05:00
comfyanonymous	f5030e26fd	Add progress bar to ace step. (#12242 ) Some checks failed Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details Build package / Build Test (3.10) (push) Has been cancelled Details Build package / Build Test (3.11) (push) Has been cancelled Details Build package / Build Test (3.12) (push) Has been cancelled Details Build package / Build Test (3.13) (push) Has been cancelled Details Build package / Build Test (3.14) (push) Has been cancelled Details	2026-02-03 04:09:30 -05:00
comfyanonymous	66e1b07402	ComfyUI v0.12.0	2026-02-03 02:20:59 -05:00
ComfyUI Wiki	be4345d1c9	chore: update workflow templates to v0.8.31 (#12239 )	2026-02-02 23:08:43 -08:00
comfyanonymous	3c1a1a2df8	Basic support for the ace step 1.5 model. (#12237 )	2026-02-03 00:06:18 -05:00
Yousef Rafat	f76e3a11b5	..	2026-02-02 21:27:15 +02:00
Yousef Rafat	614b167994	.	2026-02-02 21:23:19 +02:00
Yousef Rafat	23474ce816	updated the trellis2 nodes	2026-02-02 21:20:46 +02:00
Yousef R. Gamaleldin	f1d25a460c	Merge branch 'master' into trellis2	2026-02-02 18:13:58 +02:00
Yousef R. Gamaleldin	6ea2e5b288	init	2026-01-30 23:34:48 +02:00