# DepthAnything3Net: top-level wrapper that combines backbone + head.
#
# This wrapper covers the monocular forward path only (single image -> depth).
# Camera encoder/decoder, ray-pose head, 3D Gaussians and the Nested
# architecture are intentionally omitted. The HF state dict for those
# components is filtered out before loading -- see
# ``comfy.supported_models.DepthAnything3.process_unet_state_dict``.
#
# The class signature mirrors the upstream YAML config so a single dit_config
# detected from the state dict in ``comfy/model_detection.py`` is sufficient
# to construct the right variant.
#
# Backbone: ``comfy.image_encoders.dino2.Dinov2Model`` is shared with the
# CLIP-vision DINOv2 path. DA3-specific extensions (RoPE, QK-norm,
# alternating local/global attention, camera token, multi-layer feature
# extraction, pos-embed interpolation) are opt-in via the config dict and are
# all disabled for the Mono/Metric variants. The upstream DA3 weight layout
# (``backbone.pretrained.*`` with fused QKV) is converted to the
# ``Dinov2Model`` layout in
# ``comfy.supported_models.DepthAnything3.process_unet_state_dict``.
from __future__ import annotations

from typing import Dict, Optional, Sequence

import torch
import torch.nn as nn

from comfy.image_encoders.dino2 import Dinov2Model

from .dpt import DPT, DualDPT

# Maps the lowercase head-type string from the variant config to the head
# class that implements it.
_HEAD_REGISTRY = {
    "dpt": DPT,
    "dualdpt": DualDPT,
}

# Backbone presets (mirror the upstream DINOv2 ViT variants).
_BACKBONE_PRESETS = { "vits": dict(hidden_size=384, num_hidden_layers=12, num_attention_heads=6, use_swiglu_ffn=False), "vitb": dict(hidden_size=768, num_hidden_layers=12, num_attention_heads=12, use_swiglu_ffn=False), "vitl": dict(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, use_swiglu_ffn=False), "vitg": dict(hidden_size=1536, num_hidden_layers=40, num_attention_heads=24, use_swiglu_ffn=True), } def _build_backbone_config( backbone_name: str, *, alt_start: int, qknorm_start: int, rope_start: int, cat_token: bool, ) -> dict: if backbone_name not in _BACKBONE_PRESETS: raise ValueError(f"Unknown DINOv2 backbone variant: {backbone_name!r}") cfg = dict(_BACKBONE_PRESETS[backbone_name]) cfg.update(dict( layer_norm_eps=1e-6, patch_size=14, image_size=518, # DA3 weights have no mask_token; skip registering it to avoid spurious # missing-key warnings on load. use_mask_token=False, alt_start=alt_start, qknorm_start=qknorm_start, rope_start=rope_start, cat_token=cat_token, rope_freq=100.0, )) return cfg class DepthAnything3Net(nn.Module): """ComfyUI-side DepthAnything3 network (monocular path only). Parameters mirror the variant YAML configs from the upstream repo. Values are auto-detected by ``comfy/model_detection.py`` from the state dict. The kwargs ``device``, ``dtype`` and ``operations`` are injected by ``BaseModel``. 
""" PATCH_SIZE = 14 def __init__( self, # --- Backbone --- backbone_name: str = "vitl", out_layers: Sequence[int] = (4, 11, 17, 23), alt_start: int = -1, qknorm_start: int = -1, rope_start: int = -1, cat_token: bool = False, # --- Head --- head_type: str = "dpt", # "dpt" or "dualdpt" head_dim_in: int = 1024, head_output_dim: int = 1, # 1 = depth only, 2 = depth+conf head_features: int = 256, head_out_channels: Sequence[int] = (256, 512, 1024, 1024), head_use_sky_head: bool = True, # ignored by DualDPT head_pos_embed: Optional[bool] = None, # default: True for DualDPT, False for DPT # ComfyUI plumbing device=None, dtype=None, operations=None, **_ignored, ): super().__init__() head_cls = _HEAD_REGISTRY[head_type.lower()] self.head_type = head_type.lower() self.has_sky = (self.head_type == "dpt") and head_use_sky_head self.has_conf = head_output_dim > 1 self.out_layers = list(out_layers) backbone_cfg = _build_backbone_config( backbone_name, alt_start=alt_start, qknorm_start=qknorm_start, rope_start=rope_start, cat_token=cat_token, ) self.backbone = Dinov2Model(backbone_cfg, dtype, device, operations) head_kwargs = dict( dim_in=head_dim_in, patch_size=self.PATCH_SIZE, output_dim=head_output_dim, features=head_features, out_channels=tuple(head_out_channels), device=device, dtype=dtype, operations=operations, ) if self.head_type == "dpt": head_kwargs.update( use_sky_head=head_use_sky_head, pos_embed=(False if head_pos_embed is None else head_pos_embed), ) else: # dualdpt head_kwargs.update( pos_embed=(True if head_pos_embed is None else head_pos_embed), ) self.head = head_cls(**head_kwargs) self.dtype = dtype # ------------------------------------------------------------------ # Forward # ------------------------------------------------------------------ def forward(self, image: torch.Tensor, **_unused) -> Dict[str, torch.Tensor]: """Run monocular forward. 
Args: image: ``(B, 3, H, W)`` ImageNet-normalised image tensor, or ``(B, S, 3, H, W)`` if a fake "views" axis is supplied. H and W must be multiples of 14. Returns: Dict with: - ``depth``: ``(B, H, W)`` raw depth values. - ``depth_conf``: ``(B, H, W)`` confidence (DualDPT variants only). - ``sky``: ``(B, H, W)`` sky probability/logit (DPT variants only). """ if image.ndim == 4: image = image.unsqueeze(1) # (B, 1, 3, H, W) assert image.ndim == 5 and image.shape[2] == 3, \ f"image must be (B,3,H,W) or (B,S,3,H,W); got {tuple(image.shape)}" B, S, _, H, W = image.shape assert H % self.PATCH_SIZE == 0 and W % self.PATCH_SIZE == 0, \ f"image H,W must be multiples of {self.PATCH_SIZE}; got {(H, W)}" feats = self.backbone.get_intermediate_layers(image, self.out_layers) head_out = self.head(feats, H=H, W=W, patch_start_idx=0) # Flatten the views axis (S=1 in mono inference path). out: Dict[str, torch.Tensor] = {} for k, v in head_out.items(): if v.ndim >= 3 and v.shape[0] == B and v.shape[1] == S: out[k] = v.reshape(B * S, *v.shape[2:]) else: out[k] = v return out