dinov3 fixes + other

This commit is contained in:
Yousef Rafat 2026-02-11 01:27:54 +02:00
parent 2eef826def
commit f4059c189e
3 changed files with 51 additions and 19 deletions

View File

@ -24,7 +24,6 @@ class DINOv3ViTAttention(nn.Module):
self.embed_dim = hidden_size
self.num_heads = num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
self.is_causal = False
self.scaling = self.head_dim**-0.5
self.is_causal = False
@ -53,18 +52,41 @@ class DINOv3ViTAttention(nn.Module):
key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
cos, sin = position_embeddings
position_embeddings = torch.stack([cos, sin], dim = -1)
query_states, key_states = apply_rope(query_states, key_states, position_embeddings)
if position_embeddings is not None:
cos, sin = position_embeddings
attn_output, attn_weights = optimized_attention_for_device(
query_states, key_states, value_states, attention_mask, skip_reshape=True, skip_output_reshape=True
num_tokens = query_states.shape[-2]
num_patches = cos.shape[-2]
num_prefix_tokens = num_tokens - num_patches
q_prefix, q_patches = query_states.split((num_prefix_tokens, num_patches), dim=-2)
k_prefix, k_patches = key_states.split((num_prefix_tokens, num_patches), dim=-2)
cos = cos[..., :self.head_dim // 2]
sin = sin[..., :self.head_dim // 2]
f_cis_0 = torch.stack([cos, sin], dim=-1)
f_cis_1 = torch.stack([-sin, cos], dim=-1)
freqs_cis = torch.stack([f_cis_0, f_cis_1], dim=-1)
while freqs_cis.ndim < q_patches.ndim + 1:
freqs_cis = freqs_cis.unsqueeze(0)
q_patches, k_patches = apply_rope(q_patches, k_patches, freqs_cis)
query_states = torch.cat((q_prefix, q_patches), dim=-2)
key_states = torch.cat((k_prefix, k_patches), dim=-2)
attn = optimized_attention_for_device(query_states.device, mask=False)
attn_output = attn(
query_states, key_states, value_states, self.num_heads, attention_mask, skip_reshape=True, skip_output_reshape=True
)
attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
return attn_output
class DINOv3ViTGatedMLP(nn.Module):
def __init__(self, hidden_size, intermediate_size, mlp_bias, device, dtype, operations):
@ -187,7 +209,7 @@ class DINOv3ViTLayer(nn.Module):
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.norm1(hidden_states)
hidden_states, _ = self.attention(
hidden_states = self.attention(
hidden_states,
attention_mask=attention_mask,
position_embeddings=position_embeddings,
@ -250,6 +272,7 @@ class DINOv3ViTModel(nn.Module):
position_embeddings=position_embeddings,
)
self.norm = self.norm.to(hidden_states.device)
sequence_output = self.norm(hidden_states)
pooled_output = sequence_output[:, 0, :]

View File

@ -113,6 +113,13 @@ class SparseRotaryPositionEmbedder(nn.Module):
q_feats, k_feats = apply_rope(q.feats, k.feats, f_cis)
return q.replace(q_feats), k.replace(k_feats)
@staticmethod
def apply_rotary_embedding(x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
    """Rotate *x* by precomputed complex RoPE phases.

    x: real tensor whose last dimension is even; adjacent value pairs are
        reinterpreted as complex numbers (real, imag).
    phases: complex tensor broadcast against the complex view of x; an extra
        dimension is inserted at position -2 (presumably the head dimension —
        TODO confirm against callers).
    Returns a real tensor with x's original shape and dtype.
    """
    # Compute in float32 for precision, then view (..., d) as (..., d/2) complex.
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    # Complex multiplication applies the rotation encoded in each phase.
    x_rotated = x_complex * phases.unsqueeze(-2)
    # Back to interleaved real layout, restoring the caller's dtype.
    x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
    return x_embed
class RotaryPositionEmbedder(SparseRotaryPositionEmbedder):
def forward(self, indices: torch.Tensor) -> torch.Tensor:
phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
@ -559,6 +566,7 @@ class MultiHeadAttention(nn.Module):
def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, phases: Optional[torch.Tensor] = None) -> torch.Tensor:
B, L, C = x.shape
if self._type == "self":
x = x.to(next(self.to_qkv.parameters()).dtype)
qkv = self.to_qkv(x)
qkv = qkv.reshape(B, L, 3, self.num_heads, -1)
@ -688,7 +696,7 @@ class SparseStructureFlowModel(nn.Module):
num_heads: Optional[int] = None,
num_head_channels: Optional[int] = 64,
mlp_ratio: float = 4,
pe_mode: Literal["ape", "rope"] = "ape",
pe_mode: Literal["ape", "rope"] = "rope",
rope_freq: Tuple[float, float] = (1.0, 10000.0),
dtype: str = 'float32',
use_checkpoint: bool = False,
@ -756,14 +764,14 @@ class SparseStructureFlowModel(nn.Module):
self.out_layer = nn.Linear(model_channels, out_channels)
def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
x = x.view(x.shape[0], self.in_channels, *[self.resolution] * 3)
assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
h = x.view(*x.shape[:2], -1).permute(0, 2, 1).contiguous()
h = h.to(next(self.input_layer.parameters()).dtype)
h = self.input_layer(h)
if self.pe_mode == "ape":
h = h + self.pos_emb[None]
t_emb = self.t_embedder(t, out_dtype = t.dtype)
if self.share_mod:
t_emb = self.adaLN_modulation(t_emb)
@ -816,7 +824,8 @@ class Trellis2(nn.Module):
self.structure_model = SparseStructureFlowModel(resolution=16, in_channels=8, out_channels=8, **args)
def forward(self, x: NestedTensor, timestep, context, **kwargs):
x = x.tensors[0]
if isinstance(x, NestedTensor):
x = x.tensors[0]
embeds = kwargs.get("embeds")
if not hasattr(x, "feats"):
mode = "structure_generation"

View File

@ -97,13 +97,13 @@ def run_conditioning(
return image.to(torch_device).float()
pil_image = set_image_size(pil_image, 512)
cond_512 = model(pil_image)
cond_512 = model(pil_image)[0]
cond_1024 = None
if include_1024:
model.image_size = 1024
pil_image = set_image_size(pil_image, 1024)
cond_1024 = model([pil_image])
cond_1024 = model(pil_image)[0]
neg_cond = torch.zeros_like(cond_512)
@ -115,7 +115,7 @@ def run_conditioning(
conditioning['cond_1024'] = cond_1024.to(device)
preprocessed_tensor = pil_image.to(torch.float32) / 255.0
preprocessed_tensor = torch.from_numpy(preprocessed_tensor).unsqueeze(0)
preprocessed_tensor = preprocessed_tensor.unsqueeze(0)
return conditioning, preprocessed_tensor
@ -217,7 +217,7 @@ class Trellis2Conditioning(IO.ComfyNode):
conditioning, _ = run_conditioning(clip_vision_model, image, include_1024=True, background_color=background_color)
embeds = conditioning["cond_1024"] # should add that
positive = [[conditioning["cond_512"], {"embeds": embeds}]]
negative = [[conditioning["cond_neg"], {"embeds": embeds}]]
negative = [[conditioning["neg_cond"], {"embeds": embeds}]]
return IO.NodeOutput(positive, negative)
class EmptyShapeLatentTrellis2(IO.ComfyNode):
@ -272,7 +272,6 @@ class EmptyStructureLatentTrellis2(IO.ComfyNode):
node_id="EmptyStructureLatentTrellis2",
category="latent/3d",
inputs=[
IO.Int.Input("resolution", default=256, min=1, max=8192),
IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
],
outputs=[
@ -280,8 +279,9 @@ class EmptyStructureLatentTrellis2(IO.ComfyNode):
]
)
@classmethod
def execute(cls, resolution, batch_size):
in_channels = 32
def execute(cls, batch_size):
in_channels = 8
resolution = 16
latent = torch.randn(batch_size, in_channels, resolution, resolution, resolution)
latent = NestedTensor([latent])
return IO.NodeOutput({"samples": latent, "type": "trellis2"})