mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-23 23:47:25 +08:00
Cleanup of comments.
This commit is contained in:
parent
2686038f94
commit
5e889e73b9
@ -106,11 +106,6 @@ class _Block(nn.Module):
|
|||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Camera encoder
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class CameraEnc(nn.Module):
|
class CameraEnc(nn.Module):
|
||||||
"""Encode per-view (extrinsics, intrinsics) into a camera token.
|
"""Encode per-view (extrinsics, intrinsics) into a camera token.
|
||||||
|
|
||||||
@ -165,11 +160,6 @@ class CameraEnc(nn.Module):
|
|||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Camera decoder
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class CameraDec(nn.Module):
|
class CameraDec(nn.Module):
|
||||||
"""Decode the final cam token into a 9-D pose encoding.
|
"""Decode the final cam token into a 9-D pose encoding.
|
||||||
|
|
||||||
|
|||||||
@ -65,8 +65,7 @@ def _build_backbone_config(
|
|||||||
layer_norm_eps=1e-6,
|
layer_norm_eps=1e-6,
|
||||||
patch_size=14,
|
patch_size=14,
|
||||||
image_size=518,
|
image_size=518,
|
||||||
# DA3 weights have no mask_token; skip registering it to avoid spurious
|
# No mask_token in DA3 weights; omit param to avoid load warnings.
|
||||||
# missing-key warnings on load.
|
|
||||||
use_mask_token=False,
|
use_mask_token=False,
|
||||||
alt_start=alt_start,
|
alt_start=alt_start,
|
||||||
qknorm_start=qknorm_start,
|
qknorm_start=qknorm_start,
|
||||||
@ -149,10 +148,7 @@ class DepthAnything3Net(nn.Module):
|
|||||||
)
|
)
|
||||||
self.head = head_cls(**head_kwargs)
|
self.head = head_cls(**head_kwargs)
|
||||||
|
|
||||||
# Camera encoder / decoder are only constructed when their weights are
|
# Built only if checkpoint has weights; cam_enc output dim == embed_dim.
|
||||||
# present in the checkpoint; the multi-view / pose forward path becomes
|
|
||||||
# available accordingly. ``cam_enc.dim_out`` matches the backbone's
|
|
||||||
# ``embed_dim`` so the cam token slots into block ``alt_start``.
|
|
||||||
embed_dim = backbone_cfg["hidden_size"]
|
embed_dim = backbone_cfg["hidden_size"]
|
||||||
if has_cam_enc:
|
if has_cam_enc:
|
||||||
self.cam_enc = CameraEnc(
|
self.cam_enc = CameraEnc(
|
||||||
@ -163,8 +159,6 @@ class DepthAnything3Net(nn.Module):
|
|||||||
else:
|
else:
|
||||||
self.cam_enc = None
|
self.cam_enc = None
|
||||||
if has_cam_dec:
|
if has_cam_dec:
|
||||||
# Default cam_dec dim_in is 2*embed_dim when cat_token is on
|
|
||||||
# (the cls/cam token in the output is the cat'd version).
|
|
||||||
default_dim = embed_dim * (2 if cat_token else 1)
|
default_dim = embed_dim * (2 if cat_token else 1)
|
||||||
self.cam_dec = CameraDec(
|
self.cam_dec = CameraDec(
|
||||||
dim_in=cam_dec_dim_in if cam_dec_dim_in is not None else default_dim,
|
dim_in=cam_dec_dim_in if cam_dec_dim_in is not None else default_dim,
|
||||||
@ -175,9 +169,6 @@ class DepthAnything3Net(nn.Module):
|
|||||||
|
|
||||||
self.dtype = dtype
|
self.dtype = dtype
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Forward
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
image: torch.Tensor,
|
image: torch.Tensor,
|
||||||
|
|||||||
@ -24,9 +24,7 @@ from typing import Optional, Tuple
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# qr/svd use fp32: CUDA often has no fp16/bf16 kernels for these ops.
|
||||||
# Linear-algebra helpers
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _ql_decomposition(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
def _ql_decomposition(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
|||||||
@ -766,7 +766,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
|||||||
dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
|
dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
|
||||||
return dit_config
|
return dit_config
|
||||||
|
|
||||||
# Depth Anything 3 (Apache-2.0 monocular variants: Small/Base/Mono-Large/Metric-Large).
|
# Depth Anything 3
|
||||||
if '{}backbone.pretrained.patch_embed.proj.weight'.format(key_prefix) in state_dict_keys:
|
if '{}backbone.pretrained.patch_embed.proj.weight'.format(key_prefix) in state_dict_keys:
|
||||||
dit_config = {}
|
dit_config = {}
|
||||||
dit_config["image_model"] = "DepthAnything3"
|
dit_config["image_model"] = "DepthAnything3"
|
||||||
|
|||||||
@ -1864,41 +1864,16 @@ class DepthAnything3(supported_models_base.BASE):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def process_unet_state_dict(self, state_dict):
|
def process_unet_state_dict(self, state_dict):
|
||||||
# Drop weights for components we do not build (3D Gaussian heads).
|
# Drop Gaussian-head weights; remap fused backbone QKV to Dinov2Model layout.
|
||||||
# ``cam_enc.*`` / ``cam_dec.*`` are kept and consumed by the multi-view
|
|
||||||
# forward path -- their layouts in our ``camera.py`` mirror the
|
|
||||||
# upstream ``cam_enc.py`` / ``cam_dec.py`` so HF safetensors load
|
|
||||||
# directly without any key remap.
|
|
||||||
drop_prefixes = ("gs_head.", "gs_adapter.")
|
drop_prefixes = ("gs_head.", "gs_adapter.")
|
||||||
for k in list(state_dict.keys()):
|
for k in list(state_dict.keys()):
|
||||||
if k.startswith(drop_prefixes):
|
if k.startswith(drop_prefixes):
|
||||||
state_dict.pop(k)
|
state_dict.pop(k)
|
||||||
# Remap upstream DA3 backbone keys (``backbone.pretrained.*`` with
|
|
||||||
# fused QKV) to the layout used by ``comfy.image_encoders.dino2.Dinov2Model``.
|
|
||||||
return _da3_remap_backbone_keys(state_dict, prefix="backbone.")
|
return _da3_remap_backbone_keys(state_dict, prefix="backbone.")
|
||||||
|
|
||||||
|
|
||||||
def _da3_remap_backbone_keys(state_dict, prefix="backbone."):
|
def _da3_remap_backbone_keys(state_dict, prefix="backbone."):
|
||||||
"""Rewrite upstream DA3 DINOv2 keys to the shared ``Dinov2Model`` layout.
|
"""Map ``backbone.pretrained.*`` (upstream DA3) keys to ``Dinov2Model`` under ``prefix``."""
|
||||||
|
|
||||||
Upstream layout (under ``{prefix}pretrained.``):
|
|
||||||
patch_embed.proj.{weight,bias}, pos_embed, cls_token, camera_token, norm.*,
|
|
||||||
blocks.{i}.norm{1,2}.*, blocks.{i}.attn.qkv.{weight,bias},
|
|
||||||
blocks.{i}.attn.q_norm.*, blocks.{i}.attn.k_norm.*,
|
|
||||||
blocks.{i}.attn.proj.*, blocks.{i}.ls{1,2}.gamma,
|
|
||||||
blocks.{i}.mlp.fc{1,2}.* (or w12/w3 for SwiGLU)
|
|
||||||
|
|
||||||
Target layout (Dinov2Model under ``{prefix}``):
|
|
||||||
embeddings.patch_embeddings.projection.*,
|
|
||||||
embeddings.position_embeddings, embeddings.cls_token, embeddings.camera_token,
|
|
||||||
layernorm.*,
|
|
||||||
encoder.layer.{i}.norm{1,2}.*,
|
|
||||||
encoder.layer.{i}.attention.attention.{query,key,value}.*,
|
|
||||||
encoder.layer.{i}.attention.q_norm.*, encoder.layer.{i}.attention.k_norm.*,
|
|
||||||
encoder.layer.{i}.attention.output.dense.*,
|
|
||||||
encoder.layer.{i}.layer_scale{1,2}.lambda1,
|
|
||||||
encoder.layer.{i}.mlp.fc{1,2}.* (or weights_in/weights_out for SwiGLU)
|
|
||||||
"""
|
|
||||||
pre = prefix + "pretrained."
|
pre = prefix + "pretrained."
|
||||||
src_keys = [k for k in state_dict.keys() if k.startswith(pre)]
|
src_keys = [k for k in state_dict.keys() if k.startswith(pre)]
|
||||||
if not src_keys:
|
if not src_keys:
|
||||||
|
|||||||
@ -72,19 +72,9 @@ class LoadDepthAnything3(io.ComfyNode):
|
|||||||
return io.NodeOutput(model)
|
return io.NodeOutput(model)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Inference helpers
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _run_da3(model_patcher, image: torch.Tensor, process_res: int,
|
def _run_da3(model_patcher, image: torch.Tensor, process_res: int,
|
||||||
method: str = "upper_bound_resize"):
|
method: str = "upper_bound_resize"):
|
||||||
"""Run the DA3 network on a (B, H, W, 3) IMAGE batch.
|
"""Run DA3 on ``(B,H,W,3)`` IMAGE; returns depth/conf/sky at original resolution (or None)."""
|
||||||
|
|
||||||
Returns ``(depth, confidence, sky)`` tensors at the original image
|
|
||||||
resolution. ``confidence`` / ``sky`` are ``None`` when the variant does
|
|
||||||
not produce them.
|
|
||||||
"""
|
|
||||||
assert image.ndim == 4 and image.shape[-1] == 3, \
|
assert image.ndim == 4 and image.shape[-1] == 3, \
|
||||||
f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
|
f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
|
||||||
|
|
||||||
@ -95,7 +85,6 @@ def _run_da3(model_patcher, image: torch.Tensor, process_res: int,
|
|||||||
dtype = diffusion.dtype if diffusion.dtype is not None else torch.float32
|
dtype = diffusion.dtype if diffusion.dtype is not None else torch.float32
|
||||||
|
|
||||||
depths, confs, skies = [], [], []
|
depths, confs, skies = [], [], []
|
||||||
# Process one image at a time to keep peak memory predictable.
|
|
||||||
for i in range(B):
|
for i in range(B):
|
||||||
single = image[i:i + 1].to(device)
|
single = image[i:i + 1].to(device)
|
||||||
x = da3_preprocess.preprocess_image(single, process_res=process_res, method=method)
|
x = da3_preprocess.preprocess_image(single, process_res=process_res, method=method)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user