model fixes

2026-07-13 18:07:08 +08:00 · 2026-02-06 19:28:49 +02:00 · 2026-02-06 19:28:49 +02:00 · cdd7ced1e8
commit cdd7ced1e8
parent f2c0320fe8
1 changed file with 21 additions and 13 deletions
--- a/comfy/ldm/trellis2/model.py
+++ b/comfy/ldm/trellis2/model.py
@@ -26,10 +26,9 @@ class SparseFeedForwardNet(nn.Module):
    def forward(self, x: VarLenTensor) -> VarLenTensor:
        return self.mlp(x)

-def manual_cast(tensor, dtype):
-    if not torch.is_autocast_enabled():
-        return tensor.type(dtype)
-    return tensor
+def manual_cast(obj, dtype):
+    return obj.to(dtype=dtype)
+
 class LayerNorm32(nn.LayerNorm):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_dtype = x.dtype
@@ -88,6 +87,12 @@ class SparseRotaryPositionEmbedder(nn.Module):

        return freqs_cis

+    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
+        self.freqs = self.freqs.to(indices.device)
+        phases = torch.outer(indices, self.freqs)
+        phases = torch.polar(torch.ones_like(phases), phases)
+        return phases
+
    def forward(self, q, k=None):
        cache_name = f'rope_cis_{self.dim}d_f{self.rope_freq[1]}_hd{self.head_dim}'
        freqs_cis = q.get_spatial_cache(cache_name)
@@ -111,11 +116,15 @@ class SparseRotaryPositionEmbedder(nn.Module):
 class RotaryPositionEmbedder(SparseRotaryPositionEmbedder):
    def forward(self, indices: torch.Tensor) -> torch.Tensor:
        phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
+        if torch.is_complex(phases):
+            phases = phases.to(torch.complex64)
+        else:
+            phases = phases.to(torch.float32)
        if phases.shape[-1] < self.head_dim // 2:
                padn = self.head_dim // 2 - phases.shape[-1]
                phases = torch.cat([phases, torch.polar(
-                    torch.ones(*phases.shape[:-1], padn, device=phases.device),
-                    torch.zeros(*phases.shape[:-1], padn, device=phases.device)
+                    torch.ones(*phases.shape[:-1], padn, device=phases.device, dtype=torch.float32),
+                    torch.zeros(*phases.shape[:-1], padn, device=phases.device, dtype=torch.float32)
                )], dim=-1)
        return phases

@@ -468,7 +477,7 @@ class SLatFlowModel(nn.Module):

        h = self.input_layer(x)
        h = manual_cast(h, self.dtype)
-        t_emb = self.t_embedder(t)
+        t_emb = self.t_embedder(t, out_dtype = t.dtype)
        if self.share_mod:
            t_emb = self.adaLN_modulation(t_emb)
        t_emb = manual_cast(t_emb, self.dtype)
@@ -687,9 +696,12 @@ class SparseStructureFlowModel(nn.Module):
        initialization: str = 'vanilla',
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
+        operations=None,
+        device = None,
        **kwargs
    ):
        super().__init__()
+        self.device = device
        self.resolution = resolution
        self.in_channels = in_channels
        self.model_channels = model_channels
@@ -706,7 +718,7 @@ class SparseStructureFlowModel(nn.Module):
        self.qk_rms_norm_cross = qk_rms_norm_cross
        self.dtype = dtype

-        self.t_embedder = TimestepEmbedder(model_channels)
+        self.t_embedder = TimestepEmbedder(model_channels, operations=operations)
        if share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
@@ -743,9 +755,6 @@ class SparseStructureFlowModel(nn.Module):

        self.out_layer = nn.Linear(model_channels, out_channels)

-        self.initialize_weights()
-        self.convert_to(self.dtype)
-
    def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
                f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
@@ -755,7 +764,7 @@ class SparseStructureFlowModel(nn.Module):
        h = self.input_layer(h)
        if self.pe_mode == "ape":
            h = h + self.pos_emb[None]
-        t_emb = self.t_embedder(t)
+        t_emb = self.t_embedder(t, out_dtype = t.dtype)
        if self.share_mod:
            t_emb = self.adaLN_modulation(t_emb)
        t_emb = manual_cast(t_emb, self.dtype)
@@ -799,7 +808,6 @@ class Trellis2(nn.Module):
        self.img2shape = SLatFlowModel(resolution=resolution, in_channels=in_channels, **args)
        self.shape2txt = SLatFlowModel(resolution=resolution, in_channels=in_channels*2, **args)
        args.pop("out_channels")
-        args.pop("in_channels")
        self.structure_model = SparseStructureFlowModel(resolution=16, in_channels=8, out_channels=8, **args)

    def forward(self, x: NestedTensor, timestep, context, **kwargs):