dinov3 fixes + other

This commit is contained in:
Yousef Rafat 2026-02-11 01:27:54 +02:00
parent 2eef826def
commit f4059c189e
3 changed files with 51 additions and 19 deletions

View File

@ -24,7 +24,6 @@ class DINOv3ViTAttention(nn.Module):
self.embed_dim = hidden_size
self.num_heads = num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
self.is_causal = False
self.scaling = self.head_dim**-0.5
self.is_causal = False
@ -53,18 +52,41 @@ class DINOv3ViTAttention(nn.Module):
key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
cos, sin = position_embeddings
position_embeddings = torch.stack([cos, sin], dim = -1)
query_states, key_states = apply_rope(query_states, key_states, position_embeddings)
if position_embeddings is not None:
cos, sin = position_embeddings
attn_output, attn_weights = optimized_attention_for_device(
query_states, key_states, value_states, attention_mask, skip_reshape=True, skip_output_reshape=True
num_tokens = query_states.shape[-2]
num_patches = cos.shape[-2]
num_prefix_tokens = num_tokens - num_patches
q_prefix, q_patches = query_states.split((num_prefix_tokens, num_patches), dim=-2)
k_prefix, k_patches = key_states.split((num_prefix_tokens, num_patches), dim=-2)
cos = cos[..., :self.head_dim // 2]
sin = sin[..., :self.head_dim // 2]
f_cis_0 = torch.stack([cos, sin], dim=-1)
f_cis_1 = torch.stack([-sin, cos], dim=-1)
freqs_cis = torch.stack([f_cis_0, f_cis_1], dim=-1)
while freqs_cis.ndim < q_patches.ndim + 1:
freqs_cis = freqs_cis.unsqueeze(0)
q_patches, k_patches = apply_rope(q_patches, k_patches, freqs_cis)
query_states = torch.cat((q_prefix, q_patches), dim=-2)
key_states = torch.cat((k_prefix, k_patches), dim=-2)
attn = optimized_attention_for_device(query_states.device, mask=False)
attn_output = attn(
query_states, key_states, value_states, self.num_heads, attention_mask, skip_reshape=True, skip_output_reshape=True
)
attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
return attn_output
class DINOv3ViTGatedMLP(nn.Module):
def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
@ -187,7 +209,7 @@ class DINOv3ViTLayer(nn.Module):
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.norm1(hidden_states)
hidden_states, _ = self.attention(
hidden_states = self.attention(
hidden_states,
attention_mask=attention_mask,
position_embeddings=position_embeddings,
@ -250,6 +272,7 @@ class DINOv3ViTModel(nn.Module):
position_embeddings=position_embeddings,
)
self.norm = self.norm.to(hidden_states.device)
sequence_output = self.norm(hidden_states)
pooled_output = sequence_output[:, 0, :]

View File

@ -113,6 +113,13 @@ class SparseRotaryPositionEmbedder(nn.Module):
q_feats, k_feats = apply_rope(q.feats, k.feats, f_cis)
return q.replace(q_feats), k.replace(k_feats)
@staticmethod
def apply_rotary_embedding(x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
    """Rotate *x* by precomputed complex RoPE phases.

    x: real tensor whose last dimension is even; adjacent value pairs are
        reinterpreted as complex numbers (real, imag).
    phases: complex tensor broadcast against the complex view of x; an extra
        dimension is inserted at position -2 (presumably the head dimension —
        TODO confirm against callers).
    Returns a real tensor with x's original shape and dtype.
    """
    # Compute in float32 for precision, then view (..., d) as (..., d/2) complex.
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    # Complex multiplication applies the rotation encoded in each phase.
    x_rotated = x_complex * phases.unsqueeze(-2)
    # Back to interleaved real layout, restoring the caller's dtype.
    x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
    return x_embed
class RotaryPositionEmbedder(SparseRotaryPositionEmbedder):
def forward(self, indices: torch.Tensor) -> torch.Tensor:
phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
@ -559,6 +566,7 @@ class MultiHeadAttention(nn.Module):
def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, phases: Optional[torch.Tensor] = None) -> torch.Tensor:
B, L, C = x.shape
if self._type == "self":
x = x.to(next(self.to_qkv.parameters()).dtype)
qkv = self.to_qkv(x)
qkv = qkv.reshape(B, L, 3, self.num_heads, -1)
@ -688,7 +696,7 @@ class SparseStructureFlowModel(nn.Module):
num_heads: Optional[int] = None,
num_head_channels: Optional[int] = 64,
mlp_ratio: float = 4,
pe_mode: Literal["ape", "rope"] = "ape",
pe_mode: Literal["ape", "rope"] = "rope",
rope_freq: Tuple[float, float] = (1.0, 10000.0),
dtype: str = 'float32',
use_checkpoint: bool = False,
@ -756,14 +764,14 @@ class SparseStructureFlowModel(nn.Module):
self.out_layer = nn.Linear(model_channels, out_channels)
def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
x = x.view(x.shape[0], self.in_channels, *[self.resolution] * 3)
assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
h = x.view(*x.shape[:2], -1).permute(0, 2, 1).contiguous()
h = h.to(next(self.input_layer.parameters()).dtype)
h = self.input_layer(h)
if self.pe_mode == "ape":
h = h + self.pos_emb[None]
t_emb = self.t_embedder(t, out_dtype = t.dtype)
if self.share_mod:
t_emb = self.adaLN_modulation(t_emb)
@ -816,7 +824,8 @@ class Trellis2(nn.Module):
self.structure_model = SparseStructureFlowModel(resolution=16, in_channels=8, out_channels=8, **args)
def forward(self, x: NestedTensor, timestep, context, **kwargs):
x = x.tensors[0]
if isinstance(x, NestedTensor):
x = x.tensors[0]
embeds = kwargs.get("embeds")
if not hasattr(x, "feats"):
mode = "structure_generation"

View File

@ -97,13 +97,13 @@ def run_conditioning(
return image.to(torch_device).float()
pil_image = set_image_size(pil_image, 512)
cond_512 = model(pil_image)
cond_512 = model(pil_image)[0]
cond_1024 = None
if include_1024:
model.image_size = 1024
pil_image = set_image_size(pil_image, 1024)
cond_1024 = model([pil_image])
cond_1024 = model(pil_image)[0]
neg_cond = torch.zeros_like(cond_512)
@ -115,7 +115,7 @@ def run_conditioning(
conditioning['cond_1024'] = cond_1024.to(device)
preprocessed_tensor = pil_image.to(torch.float32) / 255.0
preprocessed_tensor = torch.from_numpy(preprocessed_tensor).unsqueeze(0)
preprocessed_tensor = preprocessed_tensor.unsqueeze(0)
return conditioning, preprocessed_tensor
@ -217,7 +217,7 @@ class Trellis2Conditioning(IO.ComfyNode):
conditioning, _ = run_conditioning(clip_vision_model, image, include_1024=True, background_color=background_color)
embeds = conditioning["cond_1024"] # should add that
positive = [[conditioning["cond_512"], {"embeds": embeds}]]
negative = [[conditioning["cond_neg"], {"embeds": embeds}]]
negative = [[conditioning["neg_cond"], {"embeds": embeds}]]
return IO.NodeOutput(positive, negative)
class EmptyShapeLatentTrellis2(IO.ComfyNode):
@ -272,7 +272,6 @@ class EmptyStructureLatentTrellis2(IO.ComfyNode):
node_id="EmptyStructureLatentTrellis2",
category="latent/3d",
inputs=[
IO.Int.Input("resolution", default=256, min=1, max=8192),
IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
],
outputs=[
@ -280,8 +279,9 @@ class EmptyStructureLatentTrellis2(IO.ComfyNode):
]
)
@classmethod
def execute(cls, resolution, batch_size):
in_channels = 32
def execute(cls, batch_size):
in_channels = 8
resolution = 16
latent = torch.randn(batch_size, in_channels, resolution, resolution, resolution)
latent = NestedTensor([latent])
return IO.NodeOutput({"samples": latent, "type": "trellis2"})