Camera info to render nodes, fixes and tweaks

This commit is contained in:
kijai 2026-05-28 01:47:00 +03:00
parent 828016a274
commit 8acb162c31
7 changed files with 373 additions and 91 deletions

View File

@ -17,6 +17,7 @@ import folder_paths
from comfy.ldm.sam3d_body.model.model import SAM3DBody
from comfy.ldm.sam3d_body.model.dinov3 import apply_dinov3_qkv_bias_mask
from comfy_extras.sam3d_body.utils import (
apply_camera_override,
cam_int_from_fov,
cam_int_from_moge,
inputs_from_sam3_track,
@ -99,6 +100,32 @@ class SAM3DBody_Loader(io.ComfyNode):
# Predict
def _per_frame_bboxes_from_detections(bboxes, B: int):
    """Turn a detector BoundingBox payload into one xyxy box tensor per frame.

    `bboxes` may be a single dict, a flat list of dicts, or a list holding one
    list of dicts per frame; each dict is read through its "x", "y", "width"
    and "height" keys. A payload that resolves to a single frame is repeated
    for all `B` frames; otherwise the per-frame list is truncated or padded
    with empty frames to length `B`.

    Returns None when the payload is empty, else a list of `B` float32 tensors
    of shape (num_boxes, 4) laid out as (x0, y0, x1, y1). Frames without
    detections get a (0, 4) tensor.
    """
    # Normalise the three accepted payload shapes to list[list[dict]].
    if isinstance(bboxes, dict):
        per_frame = [[bboxes]]
    else:
        if not bboxes:
            return None
        # A flat list of dicts means "same detections every frame".
        per_frame = [bboxes] if isinstance(bboxes[0], dict) else list(bboxes)

    if len(per_frame) == 1:
        per_frame = per_frame * B
    # Clip to B frames, then top up any missing frames with "no detections".
    per_frame = per_frame[:B]
    per_frame += [[]] * (B - len(per_frame))

    def _to_xyxy(detections):
        if not detections:
            return torch.zeros((0, 4), dtype=torch.float32)
        rows = [
            [det["x"], det["y"], det["x"] + det["width"], det["y"] + det["height"]]
            for det in detections
        ]
        return torch.tensor(rows, dtype=torch.float32)

    return [_to_xyxy(detections) for detections in per_frame]
class SAM3DBody_Predict(io.ComfyNode):
@classmethod
def define_schema(cls):
@ -113,6 +140,14 @@ class SAM3DBody_Predict(io.ComfyNode):
"sam3_track_data", optional=True,
tooltip=("Output of SAM3 Video Track, required for multi-person detection"),
),
io.BoundingBox.Input(
"bboxes", optional=True, force_input=True,
tooltip=(
"Per-frame person boxes (e.g. RT-DETR Detect with class_name='person'). "
"Used when no SAM3 track is wired — gives the top-down model a tight, "
"person-centered crop. Multi-person supported (one box = one person)."
),
),
io.Boolean.Input(
"run_hand_refinement", default=True,
tooltip="Improves hand pose at the cost of extra inference time and memory use"),
@ -146,19 +181,22 @@ class SAM3DBody_Predict(io.ComfyNode):
)
@classmethod
def execute(cls, sam3d_body_model, image, sam3_track_data=None, run_hand_refinement=True, fov_degrees=0.0, moge_geometry=None, chunk_size=144) -> io.NodeOutput:
def execute(cls, sam3d_body_model, image, sam3_track_data=None, bboxes=None, run_hand_refinement=True, fov_degrees=0.0, moge_geometry=None, chunk_size=64) -> io.NodeOutput:
comfy.model_management.load_model_gpu(sam3d_body_model)
inner: SAM3DBody = sam3d_body_model.model
B, H, W, _ = image.shape
image_size = getattr(inner, "_sam3d_image_size", (512, 512))
# Precedence: SAM3 track (masks + boxes) > detector boxes > full-frame fallback.
per_frame_bboxes, per_frame_masks = (None, None)
if sam3_track_data is not None:
per_frame_bboxes, per_frame_masks = inputs_from_sam3_track(sam3_track_data, B, H, W)
if per_frame_bboxes is None and bboxes:
per_frame_bboxes = _per_frame_bboxes_from_detections(bboxes, B)
per_frame_masks = None
if per_frame_bboxes is None:
# No track wired (or empty / frame count mismatch) — single-person
# full-frame fallback. Multi-person scenes need SAM3 Video Track.
# No track or detector boxes — single-person full-frame fallback.
full_frame_bbox = torch.tensor([[0.0, 0.0, float(W), float(H)]], dtype=torch.float32)
per_frame_bboxes = [full_frame_bbox.clone() for _ in range(B)]
per_frame_masks = None
@ -711,6 +749,26 @@ def _render_capsules_mode_inputs():
]
def _render_openpose3d_mode_inputs():
    """Return the extra inputs offered by the 'openpose_3d' render style:
    limb radius, hand toggle, and per-person palette falloff."""
    limb_radius = io.Float.Input(
        "radius_m", default=0.015, min=0.004, max=0.1, step=0.001,
        tooltip="Limb capsule radius in meters (thin = stick-like).",
    )
    hands_toggle = io.Boolean.Input(
        "include_hands", default=True,
        tooltip="Draw 21+21 hand keypoints as 3D capsules.",
    )
    palette_falloff = io.Float.Input(
        "person_palette_falloff", default=0.6, min=0.1, max=1.0, step=0.05,
        tooltip=(
            "Per-person desaturation: track k blends toward white by "
            "1 - falloff^k. Track 0 stays vivid; 1.0 disables falloff."
        ),
    )
    # Order is the order the widgets are declared in the node schema.
    return [limb_radius, hands_toggle, palette_falloff]
def _render_openpose_mode_inputs():
return [
io.Int.Input(
@ -755,15 +813,8 @@ def _render_openpose_mode_inputs():
def _scale_pose_data(mhr_pose_data: Dict[str, Any], new_H: int, new_W: int) -> Dict[str, Any]:
"""Rescale per-person camera intrinsics + 2D coords to a new canvas size.
Pose data records focal_length in pixels of the original image; without
scaling, the FOV would change and subjects would be cropped/zoomed.
When the new aspect differs from the original, the body (3D-projected
through focal_length on a centered principal point) lands in a
letterboxed region of the new canvas. 2D-prestored coords must follow
the same uniform scale + center offset so face/hand overlays align with
the body; per-axis stretching would split them apart."""
# 2D coords must match the body's letterbox transform (uniform scale +
# center offset), else face/hand overlays drift off the body.
old_H, old_W = mhr_pose_data["image_size"]
if new_H == old_H and new_W == old_W:
return mhr_pose_data
@ -831,20 +882,38 @@ class SAM3DBody_Render(io.ComfyNode):
"other is derived preserving the original aspect."
),
),
io.Load3DCamera.Input(
"camera_info", optional=True,
tooltip=(
"Free 6DOF camera override. When wired, the pose is re-projected through this camera "
"(position/target/zoom) instead of the predicted one. "
),
),
io.Float.Input(
"camera_fov", default=0.0, min=0.0, max=170.0, step=0.5, advanced=True,
tooltip=(
"Vertical FOV for the camera_info override. 0 = keep the SAM3D "
"predicted camera's FOV (only the viewpoint changes). Any non-zero "
"value overrides the lens. Ignored when camera_info is unwired."
),
),
io.DynamicCombo.Input(
"render_style",
options=[
io.DynamicCombo.Option("mesh", _render_mesh_mode_inputs()),
io.DynamicCombo.Option("silhouette", []),
io.DynamicCombo.Option("openpose", _render_openpose_mode_inputs()),
io.DynamicCombo.Option("openpose_2d", _render_openpose_mode_inputs()),
io.DynamicCombo.Option("openpose_3d", _render_openpose3d_mode_inputs()),
io.DynamicCombo.Option("scail", _render_capsules_mode_inputs()),
],
tooltip=(
"'mesh' = 3D MHR mesh rasterized through the camera. "
"'silhouette' = binary mask of the mesh (white-on-black, "
"background ignored). 'openpose' = flat 2D skeleton "
"from pred_keypoints_2d (DWPose look). 'scail' = SCAIL "
"3D capsules via torch SDF ray-march (proper occlusion / depth)."
"background ignored). 'openpose_2d' = flat 2D skeleton "
"from pred_keypoints_2d (DWPose look, ControlNet-ready). "
"'openpose_3d' = same skeleton as flat-shaded 3D capsules "
"(camera-aware, proper depth). 'scail' = SCAIL 3D capsules "
"via torch SDF ray-march (proper occlusion / depth)."
),
),
],
@ -853,7 +922,7 @@ class SAM3DBody_Render(io.ComfyNode):
@classmethod
def execute(cls, mhr_pose_data, background=None, width=0, height=0, render_style=None) -> io.NodeOutput:
def execute(cls, mhr_pose_data, background=None, width=0, height=0, camera_info=None, camera_fov=0.0, render_style=None) -> io.NodeOutput:
render_style = render_style or {"render_style": "mesh"}
mode_key = render_style.get("render_style", "mesh")
@ -869,10 +938,11 @@ class SAM3DBody_Render(io.ComfyNode):
new_H = max(1, round(native_H * new_W / native_W))
mhr_pose_data = _scale_pose_data(mhr_pose_data, new_H, new_W)
H, W = new_H, new_W
# Marker/stick px constants are authored for native resolution —
# scale them so the openpose overlay reads at the same relative size.
px_scale = min(new_W / native_W, new_H / native_H)
if camera_info is not None:
mhr_pose_data = apply_camera_override(mhr_pose_data, camera_info, H, W, fov_deg=float(camera_fov))
B = len(mhr_pose_data["frames"])
if B == 0:
return io.NodeOutput(torch.zeros(1, H, W, 3, dtype=torch.float32))
@ -880,6 +950,8 @@ class SAM3DBody_Render(io.ComfyNode):
out_device = comfy.model_management.intermediate_device()
bg_t = None if background is None else background.to(device=out_device, dtype=torch.float32)
if bg_t is not None and tuple(bg_t.shape[1:3]) != (H, W): # Match the background to the render resolution
bg_t = comfy.utils.common_upscale(bg_t.movedim(-1, 1), W, H, "bilinear", "disabled").movedim(1, -1)
if mode_key == "silhouette":
composite = "silhouette"
@ -888,7 +960,7 @@ class SAM3DBody_Render(io.ComfyNode):
else:
composite = "mesh_only"
if mode_key == "openpose":
if mode_key == "openpose_2d":
marker_radius_px = max(1, int(round(render_style.get("marker_radius_px", 4) * px_scale)))
stick_width_px = max(1, int(round(render_style.get("stick_width_px", 4) * px_scale)))
limb_alpha = float(render_style.get("limb_alpha", 0.6))
@ -897,6 +969,10 @@ class SAM3DBody_Render(io.ComfyNode):
include_hands = hand_style != "disabled"
hand_color_style = hand_style if include_hands else "dwpose"
person_palette_falloff = float(render_style.get("person_palette_falloff", 0.6))
elif mode_key == "openpose_3d":
op3d_radius_m = float(render_style.get("radius_m", 0.015))
op3d_include_hands = bool(render_style.get("include_hands", True))
person_palette_falloff = float(render_style.get("person_palette_falloff", 0.6))
elif mode_key == "scail":
cap_radius_m = float(render_style.get("radius_m", 0.030))
cap_hand_style = str(render_style.get("hand_style", "disabled"))
@ -931,7 +1007,8 @@ class SAM3DBody_Render(io.ComfyNode):
frames_out = []
pbar = comfy.utils.ProgressBar(B)
desc = (
"SAM3D openpose-2D render" if mode_key == "openpose"
"SAM3D openpose-2D render" if mode_key == "openpose_2d"
else "SAM3D openpose-3D render" if mode_key == "openpose_3d"
else "SAM3D SCAIL-3D render" if mode_key == "scail"
else "SAM3D silhouette" if mode_key == "silhouette"
else "SAM3D render"
@ -940,7 +1017,7 @@ class SAM3DBody_Render(io.ComfyNode):
bg_f = None
if bg_t is not None:
bg_f = bg_t[min(f, bg_t.shape[0] - 1)]
if mode_key == "openpose":
if mode_key == "openpose_2d":
img = render_pose_data_openpose(
mhr_pose_data, frame_idx=f, W=W, H=H,
background=bg_f,
@ -953,6 +1030,17 @@ class SAM3DBody_Render(io.ComfyNode):
hand_color_style=hand_color_style,
person_brightness_falloff=person_palette_falloff,
)
elif mode_key == "openpose_3d":
img = render_pose_data_capsules(
mhr_pose_data, frame_idx=f, W=W, H=H,
background=bg_f,
composite=composite,
radius_m=op3d_radius_m,
include_hands=op3d_include_hands,
palette="openpose",
flat_shade=True,
person_brightness_falloff=person_palette_falloff,
)
elif mode_key == "scail":
# SCAIL renders body as 3D capsules + 2D openpose hands on top
img = render_pose_data_capsules(

View File

@ -449,7 +449,7 @@ class BuildPoseGLB(IO.ComfyNode):
IO.DynamicCombo.Option("octahedrons", [
IO.Float.Input(
"bone_vis_radius_m",
default=0.02, min=0.005, max=0.5, step=0.005,
default=0.02, min=0.005, max=0.5, step=0.005, advanced=True,
tooltip="Radius in m (sphere radius / octahedron half-width).",
),
IO.Combo.Input(
@ -527,7 +527,7 @@ class BuildPoseGLB(IO.ComfyNode):
IO.DynamicCombo.Option("octahedrons", [
IO.Float.Input(
"bone_vis_radius_m",
default=0.02, min=0.005, max=0.5, step=0.005,
default=0.02, min=0.005, max=0.5, step=0.005, advanced=True,
tooltip="Radius in m (sphere radius / octahedron half-width).",
),
IO.Combo.Input(
@ -557,12 +557,20 @@ class BuildPoseGLB(IO.ComfyNode):
),
]),
IO.DynamicCombo.Option("openpose", [
IO.Int.Input(
"bone_smooth_window",
default=0, min=0, max=51, step=2,
tooltip=(
"Gaussian window on keypoint tracks. 0 = off. "
"7-15 calms jitter where upstream Smooth misses spikes."
),
),
IO.Float.Input(
"marker_radius_m", default=0.010, min=0.005, max=0.1, step=0.001,
"marker_radius_m", default=0.010, min=0.005, max=0.1, step=0.001, advanced=True,
tooltip="Sphere radius in m.",
),
IO.Float.Input(
"stick_radius_m", default=0.008, min=0.002, max=0.05, step=0.001,
"stick_radius_m", default=0.008, min=0.002, max=0.05, step=0.001, advanced=True,
tooltip="Limb half-width in m. Auto-clamped to bone_length x 0.1.",
),
IO.Boolean.Input(
@ -573,31 +581,39 @@ class BuildPoseGLB(IO.ComfyNode):
),
),
IO.Float.Input(
"hand_marker_radius_m", default=0.005, min=0.001, max=0.1, step=0.001,
"hand_marker_radius_m", default=0.005, min=0.001, max=0.1, step=0.001, advanced=True,
tooltip="Hand sphere radius in m.",
),
IO.Float.Input(
"hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001,
"hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001, advanced=True,
tooltip="Hand limb half-width in m.",
),
IO.Combo.Input(
"face_source",
options=["off", "rig"],
default="off",
"face_style",
options=["disabled", "full", "eyes_mouth"],
default="disabled",
tooltip=(
"'rig' adds ~30 face-contour landmarks sampled from pred_vertices "
"at fixed head-mesh vertex IDs (brow/eyes/nose/mouth/jaw); needs "
"canonical_colors on pose_data."
"Face-contour landmarks sampled from pred_vertices at fixed "
"head-mesh vertex IDs (needs canonical_colors on pose_data). "
"'full' = all ~30 points; 'eyes_mouth' = eyes + outer lips only."
),
),
IO.Float.Input(
"face_marker_radius_m", default=0.0, min=0.0, max=0.05, step=0.0005,
"face_marker_radius_m", default=0.0, min=0.0, max=0.05, step=0.0005, advanced=True,
tooltip="Face dot radius. 0 = auto = 0.3 x marker_radius_m.",
),
]),
IO.DynamicCombo.Option("scail", [
IO.Int.Input(
"bone_smooth_window",
default=0, min=0, max=51, step=2,
tooltip=(
"Gaussian window on keypoint tracks. 0 = off. "
"7-15 calms jitter where upstream Smooth misses spikes."
),
),
IO.Float.Input(
"stick_radius_m", default=0.022, min=0.002, max=0.1, step=0.001,
"stick_radius_m", default=0.022, min=0.002, max=0.1, step=0.001, advanced=True,
tooltip=(
"Cylinder radius in m. Bones are open cylinders at constant "
"radius; joint spheres (auto-sized to match) cap the open ends. "
@ -605,11 +621,11 @@ class BuildPoseGLB(IO.ComfyNode):
),
),
IO.Float.Input(
"marker_radius_m", default=0.0, min=0.0, max=0.1, step=0.001,
"marker_radius_m", default=0.0, min=0.0, max=0.1, step=0.001, advanced=True,
tooltip="Joint sphere radius. 0 = auto = stick_radius_m (flush cap).",
),
IO.Float.Input(
"material_roughness", default=0.3, min=0.0, max=1.0, step=0.05,
"material_roughness", default=0.3, min=0.0, max=1.0, step=0.05, advanced=True,
tooltip="PBR roughness. SCAIL ref = 0.3. 1 = matte; 0 = chrome.",
),
IO.Boolean.Input(
@ -617,13 +633,23 @@ class BuildPoseGLB(IO.ComfyNode):
tooltip="Append 21+21 hand keypoints + capsule sticks per track.",
),
IO.Float.Input(
"hand_marker_radius_m", default=0.005, min=0.001, max=0.05, step=0.001,
"hand_marker_radius_m", default=0.005, min=0.001, max=0.05, step=0.001, advanced=True,
tooltip="Hand sphere radius in m.",
),
IO.Float.Input(
"hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001,
"hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001, advanced=True,
tooltip="Hand cylinder radius in m.",
),
IO.Combo.Input(
"face_style",
options=["disabled", "full", "eyes_mouth"],
default="disabled",
tooltip=(
"Face-contour landmarks sampled from pred_vertices (needs "
"canonical_colors on pose_data). 'full' = all ~30 points; "
"'eyes_mouth' = eyes + outer lips only."
),
),
]),
],
tooltip=(
@ -710,10 +736,11 @@ class BuildPoseGLB(IO.ComfyNode):
include_hands=bool(mesh_style.get("include_hands", False)),
hand_marker_radius_m=float(mesh_style.get("hand_marker_radius_m", 0.005)),
hand_stick_radius_m=float(mesh_style.get("hand_stick_radius_m", 0.003)),
face_source=str(mesh_style.get("face_source", "off")),
face_style=str(mesh_style.get("face_style", "disabled")),
face_marker_radius_m=float(mesh_style.get("face_marker_radius_m", 0.0)),
palette="openpose",
shape="ellipsoid",
bone_smooth_window=int(mesh_style.get("bone_smooth_window", 0)),
)
elif mode_key == "scail":
# SCAIL rig: open cylinders capped flush by joint spheres (sphere
@ -732,7 +759,7 @@ class BuildPoseGLB(IO.ComfyNode):
include_hands=bool(mesh_style.get("include_hands", False)),
hand_marker_radius_m=float(mesh_style.get("hand_marker_radius_m", 0.005)),
hand_stick_radius_m=float(mesh_style.get("hand_stick_radius_m", 0.003)),
face_source="off",
face_style=str(mesh_style.get("face_style", "disabled")),
palette="scail",
shape="capsule",
smooth_shade=True,
@ -740,6 +767,7 @@ class BuildPoseGLB(IO.ComfyNode):
# inside of the open cylinders shades sensibly at grazing angles.
material_roughness=float(mesh_style.get("material_roughness", 0.3)),
material_double_sided=True,
bone_smooth_window=int(mesh_style.get("bone_smooth_window", 0)),
)
else:
raise ValueError(f"BuildPoseGLB: unknown mesh_style {mode_key!r}")

View File

@ -41,10 +41,11 @@ def _build_specs_from_pose(
include_hands: bool,
palette: str,
person_brightness_falloff: float = 0.0,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Flatten body + optional hand limbs for one frame into
(starts, ends, colors_rgba) in camera coords (Y-down, +Z forward).
Drops endpoints that are non-finite or behind the camera.
(starts, ends, colors_rgba, is_hand) in camera coords (Y-down, +Z forward).
Drops endpoints that are non-finite or behind the camera. `is_hand` flags
the hand limbs so the renderer can draw them thinner.
`person_brightness_falloff` mixes each per-person limb color toward white
by `1 - falloff^k` for track index `k` (track 0 stays vivid). Matches the
@ -52,6 +53,7 @@ def _build_specs_from_pose(
starts: List[np.ndarray] = []
ends: List[np.ndarray] = []
colors: List[np.ndarray] = []
is_hand: List[bool] = []
body_limb_colors = _limb_palette_rgb01(palette)
hand_limb_colors = OPENPOSE_HAND_COLORS_21.astype(np.float32)
@ -109,6 +111,7 @@ def _build_specs_from_pose(
sb = sa + spine_dir * (sd_len * 0.3)
starts.append(sa)
ends.append(sb)
is_hand.append(False)
color_rgb = _tint(body_limb_colors[limb_i])
colors.append(np.array([color_rgb[0], color_rgb[1], color_rgb[2], 1.0],
dtype=np.float32))
@ -125,6 +128,7 @@ def _build_specs_from_pose(
continue
starts.append(sa)
ends.append(sb)
is_hand.append(True)
color_rgb = _tint(hand_limb_colors[(a + b) % len(hand_limb_colors)])
colors.append(np.array([color_rgb[0], color_rgb[1], color_rgb[2], 1.0],
dtype=np.float32))
@ -132,10 +136,12 @@ def _build_specs_from_pose(
if not starts:
return (np.zeros((0, 3), dtype=np.float32),
np.zeros((0, 3), dtype=np.float32),
np.zeros((0, 4), dtype=np.float32))
np.zeros((0, 4), dtype=np.float32),
np.zeros((0,), dtype=bool))
return (np.stack(starts).astype(np.float32),
np.stack(ends).astype(np.float32),
np.stack(colors).astype(np.float32))
np.stack(colors).astype(np.float32),
np.asarray(is_hand, dtype=bool))
def _ray_capsule_t(
@ -144,14 +150,14 @@ def _ray_capsule_t(
ends: torch.Tensor, # (M, 3)
ba_norm: torch.Tensor, # (M, 3) unit axis (A → B)
ba_len: torch.Tensor, # (M,) segment length
radius: float,
radius: torch.Tensor, # (M,) per-capsule radius
) -> torch.Tensor:
"""Closed-form ray-capsule intersection. Returns (K, M) tensor of ray
parameters t to the nearest valid hit per capsule, +inf where the ray
misses. A capsule is the union of (cylinder body, hemisphere at A,
hemisphere at B); each component is a quadratic root-find."""
INF = float("inf")
r_sq = float(radius) * float(radius)
r_sq = radius * radius # (M,)
# Cached dot products.
dn = ray_dirs @ ba_norm.transpose(0, 1) # (K, M) — d·n
@ -199,9 +205,10 @@ def _render_capsules_torch(
colors: torch.Tensor,
H: int, W: int,
fx: float, fy: float, cx: float, cy: float,
radius: float,
radius: torch.Tensor, # scalar or (M,) per-capsule radius
background_rgb: Optional[torch.Tensor],
device: torch.device,
flat_shade: bool = False,
) -> torch.Tensor:
"""Analytic ray-capsule renderer for a union of capsules. Camera at
origin looking down +Z; pixels in y-down screen coords."""
@ -224,12 +231,16 @@ def _render_capsules_torch(
flat_dirs = ray_dirs.view(-1, 3)
N = flat_dirs.shape[0]
radius = torch.as_tensor(radius, device=device, dtype=torch.float32)
if radius.ndim == 0:
radius = radius.expand(M)
ba = ends - starts
ba_len = torch.linalg.norm(ba, dim=1).clamp(min=1e-6)
ba_norm = ba / ba_len.unsqueeze(1)
z_min = float(min(starts[:, 2].min().item(), ends[:, 2].min().item()))
z_near = max(0.05, z_min - radius)
z_near = max(0.05, z_min - float(radius.max().item()))
# Union of per-capsule screen-space bboxes. Pixels outside this mask
# provably can't hit any capsule, so the analytic intersection only runs
@ -298,6 +309,10 @@ def _render_capsules_torch(
normals = normals / normals.norm(dim=-1, keepdim=True).clamp(min=1e-8)
col = colors[m_h, :3]
if flat_shade:
# Solid per-limb color (OpenPose look) — no lighting/depth modulation.
out[hit_idx] = col
return out.view(H, W, 3).clamp(0.0, 1.0)
# SCAIL Blinn-Phong (render_torch.py:290-331). Headlight: light = +Z.
diff = torch.clamp(-(normals[:, 2]), min=0.0)
diffuse = 0.45 + 0.55 * diff
@ -336,6 +351,8 @@ def render_pose_data_capsules(
include_hands: bool = False,
palette: str = "scail",
person_brightness_falloff: float = 0.0,
flat_shade: bool = False,
hand_radius_scale: float = 0.4,
device: Optional[torch.device] = None,
) -> torch.Tensor:
"""Render a frame's pose_data as 3D capsules projected through the per-
@ -345,7 +362,8 @@ def render_pose_data_capsules(
`composite='mesh_only'` always uses a black canvas.
`radius_m` is in METERS (matching `pred_keypoints_3d` / `pred_cam_t`).
Camera fx/fy come from each person's `focal_length` (pixels); cx/cy = center.
Hand limbs use `radius_m * hand_radius_scale` (their bones are far shorter
than body limbs). Camera fx/fy come from each person's `focal_length`.
"""
persons = pose_data["frames"][frame_idx]
if device is None:
@ -361,7 +379,7 @@ def render_pose_data_capsules(
break
cx, cy = W * 0.5, H * 0.5
starts_np, ends_np, colors_np = _build_specs_from_pose(
starts_np, ends_np, colors_np, is_hand_np = _build_specs_from_pose(
persons, include_hands=include_hands, palette=palette,
person_brightness_falloff=person_brightness_falloff,
)
@ -384,11 +402,14 @@ def render_pose_data_capsules(
starts_t = torch.from_numpy(starts_np).to(device=device, dtype=torch.float32)
ends_t = torch.from_numpy(ends_np).to(device=device, dtype=torch.float32)
colors_t = torch.from_numpy(colors_np).to(device=device, dtype=torch.float32)
radii_np = np.where(is_hand_np, radius_m * hand_radius_scale, radius_m).astype(np.float32)
radii_t = torch.from_numpy(radii_np).to(device=device, dtype=torch.float32)
return _render_capsules_torch(
starts_t, ends_t, colors_t,
H=H, W=W, fx=fx, fy=fy, cx=cx, cy=cy,
radius=float(radius_m),
radius=radii_t,
background_rgb=bg_t,
device=device,
flat_shade=flat_shade,
)

View File

@ -37,6 +37,7 @@ from .glb_shared import (
SCAIL_LIMB_COLORS_17,
collect_tracks,
flat_shade_mesh,
gaussian_smooth_positions,
make_lit_material,
quat_sign_fix_per_joint,
rotation_align,
@ -364,11 +365,14 @@ def _build_openpose_spheres(
bind_kp_m: np.ndarray, radius_m: float, kp_colors: np.ndarray,
base_joint_idx: int = 0,
smooth_shade: bool = False,
joint_indices: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""UV sphere per OpenPose keypoint, rigidly skinned to that keypoint's
joint, vertex-colored from kp_colors. `base_joint_idx` is added to the
emitted JOINTS_0 indices so callers can place this group at any offset
in the shared skin (body=0, right hand=18, etc.).
in the shared skin (body=0, right hand=18, etc.). `joint_indices` (when
given) overrides that with explicit per-sphere joint indices, so callers
can skip keypoints (e.g. SCAIL head dots).
`smooth_shade=True` keeps the indexed mesh and writes per-vertex
normals via face-normal averaging, giving round shading on the spheres.
@ -390,7 +394,7 @@ def _build_openpose_spheres(
out_v[v_off:v_off + Nv] = sv * radius_m + bind_kp_m[j]
out_n[v_off:v_off + Nv] = sv
out_f[j * Nf:(j + 1) * Nf] = sf + v_off
out_j[v_off:v_off + Nv, 0] = j + base_joint_idx
out_j[v_off:v_off + Nv, 0] = int(joint_indices[j]) if joint_indices is not None else j + base_joint_idx
out_w[v_off:v_off + Nv, 0] = 1.0
out_c[v_off:v_off + Nv] = kp_colors[j]
return _finalize_skinned_mesh(out_v, out_f, out_j, out_w, out_c, smooth_shade)
@ -579,6 +583,24 @@ def _capsule_mesh_local(
return v_arr, np.asarray(faces, dtype=np.uint32), weights
def _scail_redirect_neck_stub(body_kp: np.ndarray) -> np.ndarray:
    """Return a copy of a (..., 18, 3) keypoint array whose nose (index 0) is
    moved to the tip of a short neck stub.

    The stub starts at the neck (index 1) and points along a blend of
    0.6 x the spine direction (mid-hip to neck, hips at indices 8 and 11) and
    0.4 x the neck-to-nose direction. Its length is half the neck-to-nose
    distance. The input array is left untouched."""
    def _normalized(vec):
        # Clamp the norm so zero-length vectors divide safely.
        norm = np.linalg.norm(vec, axis=-1, keepdims=True)
        return vec / norm.clip(min=1e-6)

    neck = body_kp[..., 1, :]
    neck_to_nose = body_kp[..., 0, :] - neck
    hip_center = 0.5 * (body_kp[..., 8, :] + body_kp[..., 11, :])

    spine_dir = _normalized(neck - hip_center)
    stub_dir = _normalized(0.6 * spine_dir + 0.4 * _normalized(neck_to_nose))
    stub_len = np.linalg.norm(neck_to_nose, axis=-1, keepdims=True) * 0.5

    redirected = body_kp.copy()
    redirected[..., 0, :] = neck + stub_dir * stub_len
    return redirected
def _openpose_limb_rest_trs(
bind_kp_m: np.ndarray, pairs: Tuple[Tuple[int, int], ...],
) -> Tuple[np.ndarray, np.ndarray]:
@ -636,6 +658,7 @@ def _build_openpose_sticks(
limb_joint_base_idx: int = 0,
shape: str = "ellipsoid",
smooth_shade: bool = False,
end_width_frac: float = 0.3,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Capsule (cylinder + hemispherical caps) per limb pair (a, b).
@ -682,7 +705,7 @@ def _build_openpose_sticks(
half_width_eff = max(MIN_WIDTH, min(length * WIDTH_RATIO, half_width_m))
v_local, f_local, _weights_unused = _capsule_mesh_local(
length, half_width_eff, shape=shape,
length, half_width_eff, shape=shape, end_width_frac=end_width_frac,
)
v_world = v_local @ R.T + head
Nv = v_local.shape[0]
@ -729,13 +752,15 @@ def build_glb_openpose(
hand_marker_radius_m: float = 0.0,
hand_stick_radius_m: float = 0.0,
hand_color_style: str = "dwpose",
face_source: str = "off",
face_style: str = "disabled",
face_marker_radius_m: float = 0.0,
palette: str = "openpose",
shape: str = "ellipsoid",
smooth_shade: bool = False,
material_roughness: float = 0.85,
material_double_sided: bool = False,
stick_end_width_frac: float = 0.6,
bone_smooth_window: int = 0,
) -> bytes:
"""Build a GLB containing an OpenPose-style 3D skeleton — sphere markers
per keypoint plus rainbow-colored sticks between standard limb pairs.
@ -757,9 +782,10 @@ def build_glb_openpose(
rainbow per-finger sticks (controlnet_aux/dwpose convention);
'openpose' = rainbow per-finger dots AND sticks (matches
poseParameters.cpp::HAND_COLORS_RENDER).
face_source: 'off' (default) | 'rig'. When 'rig', adds ~30 face
contour landmarks sampled from `pred_vertices` at vertex IDs
picked from `pose_data["canonical_colors"]["positions"]`.
face_style: 'disabled' (default) | 'full' | 'eyes_mouth'. Face
landmarks sampled from `pred_vertices` at vertex IDs picked from
`pose_data["canonical_colors"]["positions"]`. 'full' = all ~30
contour points; 'eyes_mouth' = the eyes + outer-lip subset.
face_marker_radius_m: per-face landmark sphere radius. 0 = auto =
0.3 × `marker_radius_m`; face landmarks are densely packed
around the eyes/mouth/jaw and need to be much smaller than
@ -771,6 +797,12 @@ def build_glb_openpose(
SCAIL-Pose style: warm hues right side, cool hues left side,
grey neck-to-nose centerline, distinct per-limb colors.
"""
is_scail = str(palette) == "scail"
# SCAIL drops the face bones (13..16) and eye/ear spheres; keeps nose (idx 0,
# the neck-stub tip) to cap the open cylinder. Matches the capsule render.
body_pairs = OPENPOSE_18_PAIRS[:13] if is_scail else OPENPOSE_18_PAIRS
body_sphere_kp = (np.arange(14, dtype=np.int64)
if is_scail else np.arange(18, dtype=np.int64))
if str(palette) == "scail":
body_sphere_colors = SCAIL_KEYPOINT_COLORS_18
body_stick_colors = SCAIL_LIMB_COLORS_17
@ -805,25 +837,30 @@ def build_glb_openpose(
if not tracks:
raise ValueError("build_glb_openpose: no valid tracks in pose_data")
# Eyes (6..13) + outer-lip ring (19..22) from FACE_LANDMARK_TARGETS.
_EYES_MOUTH_IDX = np.array([6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22], dtype=np.int64)
face_vert_ids: Optional[np.ndarray] = None
if face_source == "rig":
face_target_idx = np.arange(len(FACE_LANDMARK_TARGETS), dtype=np.int64)
if face_style in ("full", "eyes_mouth"):
canonical_colors = pose_data.get("canonical_colors") or {}
positions = canonical_colors.get("positions")
if positions is None:
raise ValueError(
"build_glb_openpose: face_source='rig' needs "
"build_glb_openpose: face_style needs "
"pose_data['canonical_colors']['positions'] (computed at "
"model load and attached by Predict). Ensure the SAM3DBody "
"Loader+Predict ran upstream of this node."
)
if face_style == "eyes_mouth":
face_target_idx = _EYES_MOUTH_IDX
face_vert_ids = select_face_landmark_vert_ids(
np.asarray(positions),
face_mask=canonical_colors.get("face_mask"),
)
elif face_source != "off":
)[face_target_idx]
elif face_style != "disabled":
raise ValueError(
f"build_glb_openpose: unknown face_source={face_source!r} "
"(expected 'off' or 'rig')"
f"build_glb_openpose: unknown face_style={face_style!r} "
"(expected 'disabled', 'full', or 'eyes_mouth')"
)
K_body = 18
@ -833,7 +870,7 @@ def build_glb_openpose(
# Limb counts: one joint per stick pair. Limb joints carry translation +
# rotation so each capsule rotates rigidly with its limb (no LBS thinning).
K_body_limbs = len(OPENPOSE_18_PAIRS)
K_body_limbs = len(body_pairs)
K_hand_limbs = len(OPENPOSE_HAND_PAIRS) if include_hands else 0
K_limbs = K_body_limbs + 2 * K_hand_limbs # face has no sticks
@ -843,14 +880,14 @@ def build_glb_openpose(
joint_names.extend([f"openpose_R_{n}" for n in OPENPOSE_HAND21_NAMES])
joint_names.extend([f"openpose_L_{n}" for n in OPENPOSE_HAND21_NAMES])
if K_face > 0:
joint_names.extend([f"openpose_face_{name}"
for name, _ in FACE_LANDMARK_TARGETS])
joint_names.extend([f"openpose_face_{FACE_LANDMARK_TARGETS[i][0]}"
for i in face_target_idx])
# Limb joint names, stacked body → R-hand → L-hand to match the limb
# joint ordering in skin.joints (after the K keypoint joints).
limb_names: List[str] = [
f"openpose_limb_{OPENPOSE_18_NAMES[a]}_{OPENPOSE_18_NAMES[b]}"
for (a, b) in OPENPOSE_18_PAIRS
for (a, b) in body_pairs
]
if include_hands:
for side in ("R", "L"):
@ -882,6 +919,8 @@ def build_glb_openpose(
seq_chunks.append(_extract_face_landmarks_from_verts(
pose_data, frame_indices, person_k, face_vert_ids))
kp_seq = np.concatenate(seq_chunks, axis=1) # (N, K, 3)
if bone_smooth_window and bone_smooth_window > 1:
kp_seq = gaussian_smooth_positions(kp_seq, int(bone_smooth_window))
# Static-bind = rig's REST pose when available (override path); else
# fall back to frame 0 of the motion. The rest-pose bind makes the
@ -896,6 +935,10 @@ def build_glb_openpose(
bind_kp_m = (bind_kp_m_rest if bind_kp_m_rest is not None
else kp_seq[0].astype(np.float32))
if is_scail: # nose → neck stub, matching the capsule render
kp_seq[:, :K_body] = _scail_redirect_neck_stub(kp_seq[:, :K_body])
bind_kp_m[:K_body] = _scail_redirect_neck_stub(bind_kp_m[:K_body])
person_root: Dict[str, Any] = {"name": f"track{track_i:02d}", "children": []}
nodes.append(person_root)
person_root_idx = len(nodes) - 1
@ -920,8 +963,8 @@ def build_glb_openpose(
limb_rest_axes_list: List[np.ndarray] = []
limb_anim_mids_list: List[np.ndarray] = []
limb_anim_quats_list: List[np.ndarray] = []
rmid_b, raxis_b = _openpose_limb_rest_trs(bind_kp_m[:K_body], OPENPOSE_18_PAIRS)
amid_b, aquat_b = _openpose_limb_anim_trs(kp_seq[:, :K_body], OPENPOSE_18_PAIRS, raxis_b)
rmid_b, raxis_b = _openpose_limb_rest_trs(bind_kp_m[:K_body], body_pairs)
amid_b, aquat_b = _openpose_limb_anim_trs(kp_seq[:, :K_body], body_pairs, raxis_b)
limb_rest_mids_list.append(rmid_b)
limb_rest_axes_list.append(raxis_b)
limb_anim_mids_list.append(amid_b)
@ -979,15 +1022,17 @@ def build_glb_openpose(
group_meshes: List[Tuple[np.ndarray, np.ndarray, np.ndarray,
np.ndarray, np.ndarray, np.ndarray]] = []
sp = _build_openpose_spheres(
bind_kp_m[:K_body], float(marker_radius_m),
body_sphere_colors, base_joint_idx=0,
bind_kp_m[body_sphere_kp], float(marker_radius_m),
body_sphere_colors[body_sphere_kp], base_joint_idx=0,
smooth_shade=smooth_shade,
joint_indices=body_sphere_kp,
)
st = _build_openpose_sticks(
bind_kp_m[:K_body], OPENPOSE_18_PAIRS, float(stick_radius_m),
bind_kp_m[:K_body], body_pairs, float(stick_radius_m),
body_stick_colors, limb_joint_base_idx=K, # body limbs start at K
shape=shape,
smooth_shade=smooth_shade,
end_width_frac=stick_end_width_frac,
)
group_meshes.append(sp)
group_meshes.append(st)
@ -1012,6 +1057,7 @@ def build_glb_openpose(
limb_joint_base_idx=K + K_body_limbs + hand_i * K_hand_limbs,
shape=shape,
smooth_shade=smooth_shade,
end_width_frac=stick_end_width_frac,
))
if K_face > 0:

View File

@ -122,6 +122,30 @@ def gaussian_smooth_quats(q_seq: np.ndarray, window: int) -> np.ndarray:
return out.astype(np.float32)
def gaussian_smooth_positions(seq: np.ndarray, window: int) -> np.ndarray:
    """Temporally smooth a position track with a normalized Gaussian kernel.

    The sequence is filtered along axis 0 (time); the first and last frames
    are replicated as padding so the output keeps the input length. Intended
    for calming jittery keypoint tracks before the openpose rig derives
    sphere translations and limb TRS from them.

    Args:
        seq: Position sequence, time on axis 0 (e.g. ``(N, K, 3)``).
        window: Smoothing window in frames; the kernel spans
            ``2 * (window // 2) + 1`` taps with ``sigma = max(0.5, window / 4)``.

    Returns:
        The smoothed sequence as float32. When ``window <= 1`` or there are
        fewer than two frames, ``seq`` itself is returned untouched (same
        object, original dtype).
    """
    if window <= 1 or seq.shape[0] < 2:
        return seq

    track = np.asarray(seq, dtype=np.float64)  # accumulate in double precision
    num_frames = track.shape[0]
    radius = window // 2
    sigma = max(0.5, window / 4.0)

    taps = np.arange(-radius, radius + 1, dtype=np.float64)
    weights = np.exp(-(taps * taps) / (2.0 * sigma * sigma))
    weights = weights / weights.sum()

    # Edge-replicate `radius` frames on both ends of the time axis only.
    pad_spec = [(radius, radius)] + [(0, 0)] * (track.ndim - 1)
    padded = np.pad(track, pad_spec, mode="edge")

    # Weighted sum of time-shifted copies == 1-D convolution along time.
    smoothed = np.zeros_like(track)
    for shift, weight in enumerate(weights):
        smoothed += weight * padded[shift:shift + num_frames]
    return smoothed.astype(np.float32)
def quat_sign_fix_per_joint(q_seq: np.ndarray) -> np.ndarray:
"""Walk (N, NJ, 4) along time, flip sign whenever consecutive frames sit
on opposite hemispheres. Eliminates long-path slerp glitches (mid-anim
@ -900,19 +924,23 @@ def rotation_align(from_vec: np.ndarray, to_vec: np.ndarray) -> np.ndarray:
def make_lit_material(
    roughness: float = 0.85, double_sided: bool = False, opacity: float = 1.0,
) -> dict:
    """Build a lit glTF PBR material that multiplies vertex COLOR_0.

    KHR_materials_unlit is intentionally off so viewer lighting reveals surface
    form. metallic=0 keeps the surface dielectric so vertex colors stay
    readable. roughness=0.85 suits dense rainbow body meshes; 0.3 matches
    SCAIL-Pose's glossy rig look.

    Args:
        roughness: PBR roughness factor, clamped to [0, 1].
        double_sided: When True, sets ``doubleSided`` on the material.
        opacity: Base-color alpha, clamped to [0, 1]. Any value below 1
            switches the material to alpha-blend (e.g. a see-through body
            mesh over bones).

    Returns:
        A glTF material dict. ``alphaMode`` and ``doubleSided`` are only
        present when they differ from the glTF defaults.
    """
    a = float(max(0.0, min(1.0, opacity)))
    mat = {
        "pbrMetallicRoughness": {
            "baseColorFactor": [1.0, 1.0, 1.0, a],
            "metallicFactor": 0.0,
            "roughnessFactor": float(max(0.0, min(1.0, roughness))),
        },
    }
    if a < 1.0:
        # Alpha in baseColorFactor is only honored by viewers in BLEND mode.
        mat["alphaMode"] = "BLEND"
    if double_sided:
        mat["doubleSided"] = True
    return mat

View File

@ -362,8 +362,10 @@ def build_glb_skeletal(
"indices": indices_acc,
"mode": 4,
}
if color_acc is not None:
materials.append(make_lit_material())
# See-through body when bones are shown, else opaque (only when a
# vertex-color shader baked COLOR_0 — otherwise default material).
if color_acc is not None or include_bones:
materials.append(make_lit_material(opacity=0.35 if include_bones else 1.0))
primitive["material"] = len(materials) - 1
if expr_morph_accs:
primitive["targets"] = [{"POSITION": a} for a in expr_morph_accs]

View File

@ -105,16 +105,85 @@ def cam_int_from_moge(moge_geometry, height: int, width: int) -> Optional[torch.
)
def run_batched_single_chunk(
inner: SAM3DBody,
frames_rgb: List[torch.Tensor],
per_frame_boxes: List[torch.Tensor],
per_frame_masks: Optional[List[torch.Tensor]],
image_size: Tuple[int, int],
inference_type: str,
K: int,
cam_int: Optional[torch.Tensor] = None,
) -> List[List[Dict[str, Any]]]:
def apply_camera_override(mhr_pose_data: Dict[str, Any], camera_info: Dict[str, Any],
                          H: int, W: int, fov_deg: float = 0.0) -> Dict[str, Any]:
    """Re-project every frame's pose through a Load3D 6DOF camera.

    For each person that carries ``pred_cam_t``, the 3D arrays
    ``pred_keypoints_3d``, ``pred_vertices`` and ``pred_face_keypoints_3d`` are
    moved into the new camera's space, ``pred_keypoints_2d`` /
    ``pred_face_keypoints_2d`` are re-projected from the new 3D points,
    ``pred_cam_t`` is reset to zero and ``focal_length`` is set to the focal
    used for the projection. Persons without ``pred_cam_t`` are copied through
    unchanged.

    Args:
        mhr_pose_data: Pose payload; ``"frames"`` is a list of frames, each a
            list of per-person dicts.
        camera_info: Load3D camera dict with ``position`` / ``target``
            (``x``/``y``/``z``) and optional ``zoom``. Missing components
            default to position (0, 5, 0) and target (0, 0, 0) in Three.js
            coordinates.
        H: Image height in pixels.
        W: Image width in pixels.
        fov_deg: Vertical field of view in degrees. ``> 0`` overrides the lens;
            ``0`` keeps the focal length of the first person in frame 0 (or a
            50 degree FOV when that is absent).

    Returns:
        A shallow copy of ``mhr_pose_data`` with a freshly built ``"frames"``
        list; the input dicts are not mutated. The input object itself is
        returned as-is when there is nothing to do: missing/empty ``frames``,
        ``camera_info`` is None, frame 0 has no person with ``pred_cam_t``, or
        the camera position coincides with its target.
    """
    # Guard the inputs up front so the "unchanged on invalid input" contract
    # holds instead of raising KeyError / AttributeError.
    frames = mhr_pose_data.get("frames") if mhr_pose_data else None
    if not frames or camera_info is None:
        return mhr_pose_data
    first_frame = frames[0]
    if not first_frame:
        return mhr_pose_data

    # GLB exports the rig root at origin, so Load3D coords are root-relative
    roots = [np.asarray(p["pred_cam_t"], dtype=np.float32).reshape(3)
             for p in first_frame if p.get("pred_cam_t") is not None]
    if not roots:
        return mhr_pose_data
    subj_center = np.mean(np.stack(roots, axis=0), axis=0)

    # Meter-scale, so Three.js coords map 1:1 (Three.js Y-up → flip Y,Z)
    pos = camera_info.get("position") or {}
    tgt = camera_info.get("target") or {}
    pos_v = np.array([float(pos.get("x", 0.0)), -float(pos.get("y", 5.0)), -float(pos.get("z", 0.0))],
                     dtype=np.float32)
    tgt_v = np.array([float(tgt.get("x", 0.0)), -float(tgt.get("y", 0.0)), -float(tgt.get("z", 0.0))],
                     dtype=np.float32)
    offset = pos_v - tgt_v
    offset_len = float(np.linalg.norm(offset))
    if offset_len < 1e-6:  # eye on top of target → view direction undefined
        return mhr_pose_data

    # A missing, None or zero zoom all mean "no zoom".
    raw_zoom = camera_info.get("zoom")
    zoom = (float(raw_zoom) if raw_zoom is not None else 1.0) or 1.0
    target = subj_center + tgt_v
    eye = target + offset / max(0.01, zoom)

    # Look-at basis. z = -offset (already non-zero); x degenerates only when
    # looking straight along world-up, then fall back to world +X.
    z_axis = -offset / offset_len
    x_axis = np.cross(z_axis, np.array([0.0, -1.0, 0.0], dtype=np.float32))
    x_norm = float(np.linalg.norm(x_axis))
    x_axis = x_axis / x_norm if x_norm > 1e-6 else np.array([1.0, 0.0, 0.0], dtype=np.float32)
    y_axis = np.cross(z_axis, x_axis)
    R = np.stack([x_axis, y_axis, z_axis], axis=0).astype(np.float32)

    # fov_deg > 0 overrides the lens; 0 keeps the SAM3D predicted focal so only
    # the viewpoint changes. Three.js fov is vertical → focal from image height.
    if fov_deg > 0:
        new_focal = float(H) / (2.0 * float(np.tan(np.deg2rad(fov_deg) / 2.0)))
    else:
        f0 = first_frame[0].get("focal_length")
        new_focal = (float(np.asarray(f0, dtype=np.float32).reshape(-1)[0]) if f0 is not None
                     else float(H) / (2.0 * float(np.tan(np.deg2rad(50.0) / 2.0))))
    center = np.array([W * 0.5, H * 0.5], dtype=np.float32)

    # 3D keys to transform, and which of them also own a 2D projection.
    keys_3d = ("pred_keypoints_3d", "pred_vertices", "pred_face_keypoints_3d")
    reproj = {"pred_keypoints_3d": "pred_keypoints_2d", "pred_face_keypoints_3d": "pred_face_keypoints_2d"}

    new_frames: List[List[Dict[str, Any]]] = []
    for frame in frames:
        new_frame = []
        for person in frame:
            person = dict(person)  # shallow copy so the caller's dict is untouched
            cam_t = person.get("pred_cam_t")
            if cam_t is None:
                new_frame.append(person)
                continue
            cam_t = np.asarray(cam_t, dtype=np.float32).reshape(3)
            for k in keys_3d:
                v = person.get(k)
                if v is None:
                    continue
                # old-camera space (+cam_t) → eye-relative → new camera axes
                cam = (np.asarray(v, dtype=np.float32) + cam_t - eye) @ R.T
                person[k] = cam.astype(np.float32)
                if k in reproj:  # re-project the new 3D to 2D image coords
                    # Clamp depth so the divide stays finite for points at or
                    # behind the camera plane.
                    z = np.maximum(cam[..., 2:3], 1e-6)
                    person[reproj[k]] = (cam[..., :2] * new_focal / z + center).astype(np.float32)
            # The translation is now baked into the 3D arrays.
            person["pred_cam_t"] = np.zeros(3, dtype=np.float32)
            person["focal_length"] = np.array(new_focal, dtype=np.float32)
            new_frame.append(person)
        new_frames.append(new_frame)

    out = dict(mhr_pose_data)
    out["frames"] = new_frames
    return out
def run_batched_single_chunk(inner: SAM3DBody, frames_rgb: List[torch.Tensor], per_frame_boxes: List[torch.Tensor],
per_frame_masks: Optional[List[torch.Tensor]], image_size: Tuple[int, int], inference_type: str, K: int,
cam_int: Optional[torch.Tensor] = None) -> List[List[Dict[str, Any]]]:
"""Run a SINGLE chunk of frames through run_inference in one forward."""
N = len(frames_rgb)
total = N * K