From aa4eef71d09c95b72766774f4c76de3509f32a01 Mon Sep 17 00:00:00 2001
From: Talmaj Marinc
Date: Tue, 19 May 2026 12:11:17 +0200
Subject: [PATCH] Update DA3 to use dino2.get_intermediate_layers_da3

---
 comfy/image_encoders/dino2.py       | 23 ++++++++++++++++++++++-
 comfy/ldm/depth_anything_3/model.py |  2 +-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/comfy/image_encoders/dino2.py b/comfy/image_encoders/dino2.py
index 59e8c0b6f..5ec29a4c2 100644
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@@ -363,6 +363,21 @@ class Dinov2Model(torch.nn.Module):
         return x, i, pooled_output, None
 
     def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
+        """Single-view multi-layer feature extraction (MoGe / vanilla DINOv2).
+
+        For the multi-view Depth Anything 3 path (RoPE, alt-attention,
+        camera-token injection, ref-view selection, cat_token), use
+        :meth:`get_intermediate_layers_da3` instead.
+
+        Args:
+            pixel_values: ``(B, 3, H, W)`` single-view input.
+            indices: layer indices to extract; supports negative indexing.
+            apply_norm: if True, apply the final layernorm to each output.
+
+        Returns:
+            list of ``(patch_tokens, cls_token)`` tuples with shapes
+            ``(B, N_patch, C)`` and ``(B, C)`` (one entry per ``indices``).
+        """
         x = self.embeddings(pixel_values)
         optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
         n_layers = len(self.encoder.layer)
@@ -415,7 +430,13 @@ class Dinov2Model(torch.nn.Module):
 
     def get_intermediate_layers_da3(self, pixel_values, out_layers, cam_token=None,
                                     ref_view_strategy="saddle_balanced", export_feat_layers=None):
-        """Multi-layer DINOv2 feature extraction used by Depth Anything 3.
+        """Multi-view multi-layer feature extraction used by Depth Anything 3.
+
+        Adds RoPE positions, alternating local/global attention across views,
+        camera-token injection, reference-view selection/reordering,
+        ``cat_token`` output and optional auxiliary feature exports on top of
+        the vanilla DINOv2 path. For the single-view MoGe / CLIP-vision use
+        case, see :meth:`get_intermediate_layers`.
 
         Args:
             pixel_values: ``(B, S, 3, H, W)`` views or ``(B, 3, H, W)``.
diff --git a/comfy/ldm/depth_anything_3/model.py b/comfy/ldm/depth_anything_3/model.py
index 782517bac..a719ef151 100644
--- a/comfy/ldm/depth_anything_3/model.py
+++ b/comfy/ldm/depth_anything_3/model.py
@@ -243,7 +243,7 @@ class DepthAnything3Net(nn.Module):
         if isinstance(self.head, DualDPT):
             self.head.enable_aux = bool(use_ray_pose)
 
-        feats, aux_feats = self.backbone.get_intermediate_layers(
+        feats, aux_feats = self.backbone.get_intermediate_layers_da3(
             image, self.out_layers, cam_token=cam_token,
             ref_view_strategy=ref_view_strategy,
             export_feat_layers=export_feat_layers,