ComfyUI/comfy_extras/mediapipe/face_geometry.py
Jukka Seppänen 4d6a058bf1
feat: MediaPipe face detection (CORE-235) (#14009)
* Initial mediapipe face detection support

* Update face_geometry.py

* Account for diff sized batch input

* Model folder placeholder
2026-05-20 16:07:48 -07:00

112 lines
4.6 KiB
Python

"""Pure-numpy port of MediaPipe's face_geometry (FACE_LANDMARK_PIPELINE mode)
+ weighted Procrustes solver. Computes the 4x4 facial transformation matrix.
"""
from __future__ import annotations
import math
import numpy as np
def _solve_weighted_orthogonal_problem(src: np.ndarray, tgt: np.ndarray, weights: np.ndarray) -> np.ndarray:
"""Weighted orthogonal Procrustes (similarity). Returns 4x4 M with
`target ≈ M @ homogeneous(source)` in the weighted LS sense. fp64 for
SVD stability. Port of procrustes_solver.cc."""
sqrt_w = np.sqrt(weights.astype(np.float64))
w_total = float((sqrt_w ** 2).sum())
ws = src.astype(np.float64) * sqrt_w
wt = tgt.astype(np.float64) * sqrt_w
c_w = (ws @ sqrt_w) / w_total
centered = ws - np.outer(c_w, sqrt_w)
U, _S, Vt = np.linalg.svd(wt @ centered.T, full_matrices=True)
# Disallow reflection: flip the least-significant axis when det(U)·det(V)<0.
post, pre = U.copy(), Vt.T.copy()
if np.linalg.det(post) * np.linalg.det(pre) < 0:
post[:, 2] *= -1.0
R = post @ pre.T
denom = float((centered * ws).sum())
if denom < 1e-12:
raise ValueError("Procrustes denominator collapsed (degenerate source).")
scale = float((R @ centered * wt).sum()) / denom
translation = ((wt - scale * (R @ ws)) @ sqrt_w) / w_total
M = np.eye(4, dtype=np.float64)
M[:3, :3] = scale * R
M[:3, 3] = translation
return M
def _estimate_scale(canonical: np.ndarray, runtime: np.ndarray, weights: np.ndarray) -> float:
"""scale = ‖first column of M[:3]‖ per geometry_pipeline.cc::EstimateScale."""
return float(np.linalg.norm(_solve_weighted_orthogonal_problem(canonical, runtime, weights)[:3, 0]))
def solve_facial_transformation_matrix(
landmarks_normalized: np.ndarray,
canonical_vertices: np.ndarray,
procrustes_indices: np.ndarray,
procrustes_weights: np.ndarray,
image_width: int,
image_height: int,
# face_geometry_calculator_options.pbtxt defaults
vertical_fov_degrees: float = 63.0,
near: float = 1.0,
) -> np.ndarray:
"""4x4 facial transformation matrix via two-pass scale recovery
`landmarks_normalized` is (N, 3) in MediaPipe normalized convention: x, y
in [0,1] with TOP-LEFT origin, z in width-scaled units.
"""
h_near = 2.0 * near * math.tan(0.5 * math.radians(vertical_fov_degrees))
w_near = image_width * h_near / image_height
sub = procrustes_indices.astype(np.int64)
screen = landmarks_normalized[sub].T.astype(np.float64).copy()
canon = canonical_vertices[sub].T.astype(np.float64).copy()
weights = procrustes_weights.astype(np.float64)
# ProjectXY (TOP_LEFT y-flip, then scale all 3 axes; z uses x-scale).
screen[1] = 1.0 - screen[1]
screen[0] = screen[0] * w_near - 0.5 * w_near
screen[1] = screen[1] * h_near - 0.5 * h_near
screen[2] = screen[2] * w_near
depth_offset = float(screen[2].mean())
def _unproject(s: np.ndarray, scale: float) -> np.ndarray:
s = s.copy()
s[2] = (s[2] - depth_offset + near) / scale
s[0] *= s[2] / near
s[1] *= s[2] / near
s[2] *= -1.0
return s
first = screen.copy()
first[2] *= -1.0
s1 = _estimate_scale(canon, first, weights) # 1st pass: Procrustes on projected XY
s2 = _estimate_scale(canon, _unproject(screen, s1), weights) # 2nd pass: rescale z by s1, un-project XY
return _solve_weighted_orthogonal_problem(canon, _unproject(screen, s1 * s2), weights).astype(np.float32)
def transformation_matrix_from_detection(face_dict: dict, image_width: int, image_height: int, canonical_data: dict) -> np.ndarray:
"""Adapt a FaceLandmarker face dict to MP's normalized convention and solve.
FaceMesh emits (x, y, z) in 192-canonical units; MP's geometry expects
z_norm = z_canonical * scale_x / image_width"""
lmks_xy, lmks_3d = face_dict["landmarks_xy"], face_dict["landmarks_3d"]
aug = np.concatenate([lmks_3d[:, :2].astype(np.float64), np.ones((lmks_xy.shape[0], 1))], axis=1)
M, *_ = np.linalg.lstsq(aug, lmks_xy.astype(np.float64), rcond=None)
scale_x = float(np.linalg.norm(M[0]))
z_scale = scale_x / image_width if scale_x > 1e-6 else 1.0 / image_width
normalized = np.empty((lmks_xy.shape[0], 3), dtype=np.float32)
normalized[:, 0] = lmks_xy[:, 0] / image_width
normalized[:, 1] = lmks_xy[:, 1] / image_height
normalized[:, 2] = lmks_3d[:, 2] * z_scale
return solve_facial_transformation_matrix(
normalized, canonical_data["canonical_vertices"],
canonical_data["procrustes_indices"], canonical_data["procrustes_weights"],
image_width=image_width, image_height=image_height,
)