diff --git a/comfy_extras/nodes_sam3d_body.py b/comfy_extras/nodes_sam3d_body.py index afa6855a5..77bad110d 100644 --- a/comfy_extras/nodes_sam3d_body.py +++ b/comfy_extras/nodes_sam3d_body.py @@ -17,6 +17,7 @@ import folder_paths from comfy.ldm.sam3d_body.model.model import SAM3DBody from comfy.ldm.sam3d_body.model.dinov3 import apply_dinov3_qkv_bias_mask from comfy_extras.sam3d_body.utils import ( + apply_camera_override, cam_int_from_fov, cam_int_from_moge, inputs_from_sam3_track, @@ -99,6 +100,32 @@ class SAM3DBody_Loader(io.ComfyNode): # Predict +def _per_frame_bboxes_from_detections(bboxes, B: int): + # BoundingBox payload (RT-DETR etc.): dict | list[dict] | list[list[dict]]. + if isinstance(bboxes, dict): + norm = [[bboxes]] + elif not bboxes: + return None + elif isinstance(bboxes[0], dict): + norm = [bboxes] # flat list → same detections every frame + else: + norm = list(bboxes) + if len(norm) == 1: + norm = norm * B + norm = (norm + [[]] * B)[:B] + out = [] + for frame in norm: + if frame: + boxes = torch.tensor( + [[d["x"], d["y"], d["x"] + d["width"], d["y"] + d["height"]] for d in frame], + dtype=torch.float32, + ) + else: + boxes = torch.zeros((0, 4), dtype=torch.float32) + out.append(boxes) + return out + + class SAM3DBody_Predict(io.ComfyNode): @classmethod def define_schema(cls): @@ -113,6 +140,14 @@ class SAM3DBody_Predict(io.ComfyNode): "sam3_track_data", optional=True, tooltip=("Output of SAM3 Video Track, required for multi-person detection"), ), + io.BoundingBox.Input( + "bboxes", optional=True, force_input=True, + tooltip=( + "Per-frame person boxes (e.g. RT-DETR Detect with class_name='person'). " + "Used when no SAM3 track is wired — gives the top-down model a tight, " + "person-centered crop. Multi-person supported (one box = one person)." 
+ ), + ), io.Boolean.Input( "run_hand_refinement", default=True, tooltip="Improves hand pose at the cost of extra inference time and memory use"), @@ -146,19 +181,22 @@ class SAM3DBody_Predict(io.ComfyNode): ) @classmethod - def execute(cls, sam3d_body_model, image, sam3_track_data=None, run_hand_refinement=True, fov_degrees=0.0, moge_geometry=None, chunk_size=144) -> io.NodeOutput: + def execute(cls, sam3d_body_model, image, sam3_track_data=None, bboxes=None, run_hand_refinement=True, fov_degrees=0.0, moge_geometry=None, chunk_size=64) -> io.NodeOutput: comfy.model_management.load_model_gpu(sam3d_body_model) inner: SAM3DBody = sam3d_body_model.model B, H, W, _ = image.shape image_size = getattr(inner, "_sam3d_image_size", (512, 512)) + # Precedence: SAM3 track (masks + boxes) > detector boxes > full-frame fallback. per_frame_bboxes, per_frame_masks = (None, None) if sam3_track_data is not None: per_frame_bboxes, per_frame_masks = inputs_from_sam3_track(sam3_track_data, B, H, W) + if per_frame_bboxes is None and bboxes: + per_frame_bboxes = _per_frame_bboxes_from_detections(bboxes, B) + per_frame_masks = None if per_frame_bboxes is None: - # No track wired (or empty / frame count mismatch) — single-person - # full-frame fallback. Multi-person scenes need SAM3 Video Track. + # No track or detector boxes — single-person full-frame fallback. 
full_frame_bbox = torch.tensor([[0.0, 0.0, float(W), float(H)]], dtype=torch.float32) per_frame_bboxes = [full_frame_bbox.clone() for _ in range(B)] per_frame_masks = None @@ -711,6 +749,26 @@ def _render_capsules_mode_inputs(): ] +def _render_openpose3d_mode_inputs(): + return [ + io.Float.Input( + "radius_m", default=0.015, min=0.004, max=0.1, step=0.001, + tooltip="Limb capsule radius in meters (thin = stick-like).", + ), + io.Boolean.Input( + "include_hands", default=True, + tooltip="Draw 21+21 hand keypoints as 3D capsules.", + ), + io.Float.Input( + "person_palette_falloff", default=0.6, min=0.1, max=1.0, step=0.05, + tooltip=( + "Per-person desaturation: track k blends toward white by " + "1 - falloff^k. Track 0 stays vivid; 1.0 disables falloff." + ), + ), + ] + + def _render_openpose_mode_inputs(): return [ io.Int.Input( @@ -755,15 +813,8 @@ def _render_openpose_mode_inputs(): def _scale_pose_data(mhr_pose_data: Dict[str, Any], new_H: int, new_W: int) -> Dict[str, Any]: - """Rescale per-person camera intrinsics + 2D coords to a new canvas size. - Pose data records focal_length in pixels of the original image; without - scaling, the FOV would change and subjects would be cropped/zoomed. - - When the new aspect differs from the original, the body (3D-projected - through focal_length on a centered principal point) lands in a - letterboxed region of the new canvas. 2D-prestored coords must follow - the same uniform scale + center offset so face/hand overlays align with - the body — per-axis stretching would split them apart.""" + # 2D coords must match the body's letterbox transform (uniform scale + + # center offset), else face/hand overlays drift off the body. old_H, old_W = mhr_pose_data["image_size"] if new_H == old_H and new_W == old_W: return mhr_pose_data @@ -831,20 +882,38 @@ class SAM3DBody_Render(io.ComfyNode): "other is derived preserving the original aspect." 
), ), + io.Load3DCamera.Input( + "camera_info", optional=True, + tooltip=( + "Free 6DOF camera override. When wired, the pose is re-projected through this camera " + "(position/target/zoom) instead of the predicted one. " + ), + ), + io.Float.Input( + "camera_fov", default=0.0, min=0.0, max=170.0, step=0.5, advanced=True, + tooltip=( + "Vertical FOV for the camera_info override. 0 = keep the SAM3D " + "predicted camera's FOV (only the viewpoint changes). Any non-zero " + "value overrides the lens. Ignored when camera_info is unwired." + ), + ), io.DynamicCombo.Input( "render_style", options=[ io.DynamicCombo.Option("mesh", _render_mesh_mode_inputs()), io.DynamicCombo.Option("silhouette", []), - io.DynamicCombo.Option("openpose", _render_openpose_mode_inputs()), + io.DynamicCombo.Option("openpose_2d", _render_openpose_mode_inputs()), + io.DynamicCombo.Option("openpose_3d", _render_openpose3d_mode_inputs()), io.DynamicCombo.Option("scail", _render_capsules_mode_inputs()), ], tooltip=( "'mesh' = 3D MHR mesh rasterized through the camera. " "'silhouette' = binary mask of the mesh (white-on-black, " - "background ignored). 'openpose' = flat 2D skeleton " - "from pred_keypoints_2d (DWPose look). 'scail' = SCAIL " - "3D capsules via torch SDF ray-march (proper occlusion / depth)." + "background ignored). 'openpose_2d' = flat 2D skeleton " + "from pred_keypoints_2d (DWPose look, ControlNet-ready). " + "'openpose_3d' = same skeleton as flat-shaded 3D capsules " + "(camera-aware, proper depth). 'scail' = SCAIL 3D capsules " + "via torch SDF ray-march (proper occlusion / depth)." 
), ), ], @@ -853,7 +922,7 @@ class SAM3DBody_Render(io.ComfyNode): @classmethod - def execute(cls, mhr_pose_data, background=None, width=0, height=0, render_style=None) -> io.NodeOutput: + def execute(cls, mhr_pose_data, background=None, width=0, height=0, camera_info=None, camera_fov=0.0, render_style=None) -> io.NodeOutput: render_style = render_style or {"render_style": "mesh"} mode_key = render_style.get("render_style", "mesh") @@ -869,10 +938,11 @@ class SAM3DBody_Render(io.ComfyNode): new_H = max(1, round(native_H * new_W / native_W)) mhr_pose_data = _scale_pose_data(mhr_pose_data, new_H, new_W) H, W = new_H, new_W - # Marker/stick px constants are authored for native resolution — - # scale them so the openpose overlay reads at the same relative size. px_scale = min(new_W / native_W, new_H / native_H) + if camera_info is not None: + mhr_pose_data = apply_camera_override(mhr_pose_data, camera_info, H, W, fov_deg=float(camera_fov)) + B = len(mhr_pose_data["frames"]) if B == 0: return io.NodeOutput(torch.zeros(1, H, W, 3, dtype=torch.float32)) @@ -880,6 +950,8 @@ class SAM3DBody_Render(io.ComfyNode): out_device = comfy.model_management.intermediate_device() bg_t = None if background is None else background.to(device=out_device, dtype=torch.float32) + if bg_t is not None and tuple(bg_t.shape[1:3]) != (H, W): # Match the background to the render resolution + bg_t = comfy.utils.common_upscale(bg_t.movedim(-1, 1), W, H, "bilinear", "disabled").movedim(1, -1) if mode_key == "silhouette": composite = "silhouette" @@ -888,7 +960,7 @@ class SAM3DBody_Render(io.ComfyNode): else: composite = "mesh_only" - if mode_key == "openpose": + if mode_key == "openpose_2d": marker_radius_px = max(1, int(round(render_style.get("marker_radius_px", 4) * px_scale))) stick_width_px = max(1, int(round(render_style.get("stick_width_px", 4) * px_scale))) limb_alpha = float(render_style.get("limb_alpha", 0.6)) @@ -897,6 +969,10 @@ class SAM3DBody_Render(io.ComfyNode): include_hands = 
hand_style != "disabled" hand_color_style = hand_style if include_hands else "dwpose" person_palette_falloff = float(render_style.get("person_palette_falloff", 0.6)) + elif mode_key == "openpose_3d": + op3d_radius_m = float(render_style.get("radius_m", 0.015)) + op3d_include_hands = bool(render_style.get("include_hands", True)) + person_palette_falloff = float(render_style.get("person_palette_falloff", 0.6)) elif mode_key == "scail": cap_radius_m = float(render_style.get("radius_m", 0.030)) cap_hand_style = str(render_style.get("hand_style", "disabled")) @@ -931,7 +1007,8 @@ class SAM3DBody_Render(io.ComfyNode): frames_out = [] pbar = comfy.utils.ProgressBar(B) desc = ( - "SAM3D openpose-2D render" if mode_key == "openpose" + "SAM3D openpose-2D render" if mode_key == "openpose_2d" + else "SAM3D openpose-3D render" if mode_key == "openpose_3d" else "SAM3D SCAIL-3D render" if mode_key == "scail" else "SAM3D silhouette" if mode_key == "silhouette" else "SAM3D render" @@ -940,7 +1017,7 @@ class SAM3DBody_Render(io.ComfyNode): bg_f = None if bg_t is not None: bg_f = bg_t[min(f, bg_t.shape[0] - 1)] - if mode_key == "openpose": + if mode_key == "openpose_2d": img = render_pose_data_openpose( mhr_pose_data, frame_idx=f, W=W, H=H, background=bg_f, @@ -953,6 +1030,17 @@ class SAM3DBody_Render(io.ComfyNode): hand_color_style=hand_color_style, person_brightness_falloff=person_palette_falloff, ) + elif mode_key == "openpose_3d": + img = render_pose_data_capsules( + mhr_pose_data, frame_idx=f, W=W, H=H, + background=bg_f, + composite=composite, + radius_m=op3d_radius_m, + include_hands=op3d_include_hands, + palette="openpose", + flat_shade=True, + person_brightness_falloff=person_palette_falloff, + ) elif mode_key == "scail": # SCAIL renders body as 3D capsules + 2D openpose hands on top img = render_pose_data_capsules( diff --git a/comfy_extras/nodes_save_3d.py b/comfy_extras/nodes_save_3d.py index f0a57a2f1..23a1c63e8 100644 --- a/comfy_extras/nodes_save_3d.py +++ 
b/comfy_extras/nodes_save_3d.py @@ -449,7 +449,7 @@ class BuildPoseGLB(IO.ComfyNode): IO.DynamicCombo.Option("octahedrons", [ IO.Float.Input( "bone_vis_radius_m", - default=0.02, min=0.005, max=0.5, step=0.005, + default=0.02, min=0.005, max=0.5, step=0.005, advanced=True, tooltip="Radius in m (sphere radius / octahedron half-width).", ), IO.Combo.Input( @@ -527,7 +527,7 @@ class BuildPoseGLB(IO.ComfyNode): IO.DynamicCombo.Option("octahedrons", [ IO.Float.Input( "bone_vis_radius_m", - default=0.02, min=0.005, max=0.5, step=0.005, + default=0.02, min=0.005, max=0.5, step=0.005, advanced=True, tooltip="Radius in m (sphere radius / octahedron half-width).", ), IO.Combo.Input( @@ -557,12 +557,20 @@ class BuildPoseGLB(IO.ComfyNode): ), ]), IO.DynamicCombo.Option("openpose", [ + IO.Int.Input( + "bone_smooth_window", + default=0, min=0, max=51, step=2, + tooltip=( + "Gaussian window on keypoint tracks. 0 = off. " + "7-15 calms jitter where upstream Smooth misses spikes." + ), + ), IO.Float.Input( - "marker_radius_m", default=0.010, min=0.005, max=0.1, step=0.001, + "marker_radius_m", default=0.010, min=0.005, max=0.1, step=0.001, advanced=True, tooltip="Sphere radius in m.", ), IO.Float.Input( - "stick_radius_m", default=0.008, min=0.002, max=0.05, step=0.001, + "stick_radius_m", default=0.008, min=0.002, max=0.05, step=0.001, advanced=True, tooltip="Limb half-width in m. 
Auto-clamped to bone_length x 0.1.", ), IO.Boolean.Input( @@ -573,31 +581,39 @@ class BuildPoseGLB(IO.ComfyNode): ), ), IO.Float.Input( - "hand_marker_radius_m", default=0.005, min=0.001, max=0.1, step=0.001, + "hand_marker_radius_m", default=0.005, min=0.001, max=0.1, step=0.001, advanced=True, tooltip="Hand sphere radius in m.", ), IO.Float.Input( - "hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001, + "hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001, advanced=True, tooltip="Hand limb half-width in m.", ), IO.Combo.Input( - "face_source", - options=["off", "rig"], - default="off", + "face_style", + options=["disabled", "full", "eyes_mouth"], + default="disabled", tooltip=( - "'rig' adds ~30 face-contour landmarks sampled from pred_vertices " - "at fixed head-mesh vertex IDs (brow/eyes/nose/mouth/jaw); needs " - "canonical_colors on pose_data." + "Face-contour landmarks sampled from pred_vertices at fixed " + "head-mesh vertex IDs (needs canonical_colors on pose_data). " + "'full' = all ~30 points; 'eyes_mouth' = eyes + outer lips only." ), ), IO.Float.Input( - "face_marker_radius_m", default=0.0, min=0.0, max=0.05, step=0.0005, + "face_marker_radius_m", default=0.0, min=0.0, max=0.05, step=0.0005, advanced=True, tooltip="Face dot radius. 0 = auto = 0.3 x marker_radius_m.", ), ]), IO.DynamicCombo.Option("scail", [ + IO.Int.Input( + "bone_smooth_window", + default=0, min=0, max=51, step=2, + tooltip=( + "Gaussian window on keypoint tracks. 0 = off. " + "7-15 calms jitter where upstream Smooth misses spikes." + ), + ), IO.Float.Input( - "stick_radius_m", default=0.022, min=0.002, max=0.1, step=0.001, + "stick_radius_m", default=0.022, min=0.002, max=0.1, step=0.001, advanced=True, tooltip=( "Cylinder radius in m. Bones are open cylinders at constant " "radius; joint spheres (auto-sized to match) cap the open ends. 
" @@ -605,11 +621,11 @@ class BuildPoseGLB(IO.ComfyNode): ), ), IO.Float.Input( - "marker_radius_m", default=0.0, min=0.0, max=0.1, step=0.001, + "marker_radius_m", default=0.0, min=0.0, max=0.1, step=0.001, advanced=True, tooltip="Joint sphere radius. 0 = auto = stick_radius_m (flush cap).", ), IO.Float.Input( - "material_roughness", default=0.3, min=0.0, max=1.0, step=0.05, + "material_roughness", default=0.3, min=0.0, max=1.0, step=0.05, advanced=True, tooltip="PBR roughness. SCAIL ref = 0.3. 1 = matte; 0 = chrome.", ), IO.Boolean.Input( @@ -617,13 +633,23 @@ class BuildPoseGLB(IO.ComfyNode): tooltip="Append 21+21 hand keypoints + capsule sticks per track.", ), IO.Float.Input( - "hand_marker_radius_m", default=0.005, min=0.001, max=0.05, step=0.001, + "hand_marker_radius_m", default=0.005, min=0.001, max=0.05, step=0.001, advanced=True, tooltip="Hand sphere radius in m.", ), IO.Float.Input( - "hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001, + "hand_stick_radius_m", default=0.003, min=0.001, max=0.05, step=0.001, advanced=True, tooltip="Hand cylinder radius in m.", ), + IO.Combo.Input( + "face_style", + options=["disabled", "full", "eyes_mouth"], + default="disabled", + tooltip=( + "Face-contour landmarks sampled from pred_vertices (needs " + "canonical_colors on pose_data). 'full' = all ~30 points; " + "'eyes_mouth' = eyes + outer lips only." 
+ ), + ), ]), ], tooltip=( @@ -710,10 +736,11 @@ class BuildPoseGLB(IO.ComfyNode): include_hands=bool(mesh_style.get("include_hands", False)), hand_marker_radius_m=float(mesh_style.get("hand_marker_radius_m", 0.005)), hand_stick_radius_m=float(mesh_style.get("hand_stick_radius_m", 0.003)), - face_source=str(mesh_style.get("face_source", "off")), + face_style=str(mesh_style.get("face_style", "disabled")), face_marker_radius_m=float(mesh_style.get("face_marker_radius_m", 0.0)), palette="openpose", shape="ellipsoid", + bone_smooth_window=int(mesh_style.get("bone_smooth_window", 0)), ) elif mode_key == "scail": # SCAIL rig: open cylinders capped flush by joint spheres (sphere @@ -732,7 +759,7 @@ class BuildPoseGLB(IO.ComfyNode): include_hands=bool(mesh_style.get("include_hands", False)), hand_marker_radius_m=float(mesh_style.get("hand_marker_radius_m", 0.005)), hand_stick_radius_m=float(mesh_style.get("hand_stick_radius_m", 0.003)), - face_source="off", + face_style=str(mesh_style.get("face_style", "disabled")), palette="scail", shape="capsule", smooth_shade=True, @@ -740,6 +767,7 @@ class BuildPoseGLB(IO.ComfyNode): # inside of the open cylinders shades sensibly at grazing angles. 
material_roughness=float(mesh_style.get("material_roughness", 0.3)), material_double_sided=True, + bone_smooth_window=int(mesh_style.get("bone_smooth_window", 0)), ) else: raise ValueError(f"BuildPoseGLB: unknown mesh_style {mode_key!r}") diff --git a/comfy_extras/sam3d_body/export/capsules.py b/comfy_extras/sam3d_body/export/capsules.py index 37b059a4c..1a3d85b32 100644 --- a/comfy_extras/sam3d_body/export/capsules.py +++ b/comfy_extras/sam3d_body/export/capsules.py @@ -41,10 +41,11 @@ def _build_specs_from_pose( include_hands: bool, palette: str, person_brightness_falloff: float = 0.0, -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Flatten body + optional hand limbs for one frame into - (starts, ends, colors_rgba) in camera coords (Y-down, +Z forward). - Drops endpoints that are non-finite or behind the camera. + (starts, ends, colors_rgba, is_hand) in camera coords (Y-down, +Z forward). + Drops endpoints that are non-finite or behind the camera. `is_hand` flags + the hand limbs so the renderer can draw them thinner. `person_brightness_falloff` mixes each per-person limb color toward white by `1 - falloff^k` for track index `k` (track 0 stays vivid). 
Matches the @@ -52,6 +53,7 @@ def _build_specs_from_pose( starts: List[np.ndarray] = [] ends: List[np.ndarray] = [] colors: List[np.ndarray] = [] + is_hand: List[bool] = [] body_limb_colors = _limb_palette_rgb01(palette) hand_limb_colors = OPENPOSE_HAND_COLORS_21.astype(np.float32) @@ -109,6 +111,7 @@ def _build_specs_from_pose( sb = sa + spine_dir * (sd_len * 0.3) starts.append(sa) ends.append(sb) + is_hand.append(False) color_rgb = _tint(body_limb_colors[limb_i]) colors.append(np.array([color_rgb[0], color_rgb[1], color_rgb[2], 1.0], dtype=np.float32)) @@ -125,6 +128,7 @@ def _build_specs_from_pose( continue starts.append(sa) ends.append(sb) + is_hand.append(True) color_rgb = _tint(hand_limb_colors[(a + b) % len(hand_limb_colors)]) colors.append(np.array([color_rgb[0], color_rgb[1], color_rgb[2], 1.0], dtype=np.float32)) @@ -132,10 +136,12 @@ def _build_specs_from_pose( if not starts: return (np.zeros((0, 3), dtype=np.float32), np.zeros((0, 3), dtype=np.float32), - np.zeros((0, 4), dtype=np.float32)) + np.zeros((0, 4), dtype=np.float32), + np.zeros((0,), dtype=bool)) return (np.stack(starts).astype(np.float32), np.stack(ends).astype(np.float32), - np.stack(colors).astype(np.float32)) + np.stack(colors).astype(np.float32), + np.asarray(is_hand, dtype=bool)) def _ray_capsule_t( @@ -144,14 +150,14 @@ def _ray_capsule_t( ends: torch.Tensor, # (M, 3) ba_norm: torch.Tensor, # (M, 3) unit axis (A → B) ba_len: torch.Tensor, # (M,) segment length - radius: float, + radius: torch.Tensor, # (M,) per-capsule radius ) -> torch.Tensor: """Closed-form ray-capsule intersection. Returns (K, M) tensor of ray parameters t to the nearest valid hit per capsule, +inf where the ray misses. A capsule is the union of (cylinder body, hemisphere at A, hemisphere at B); each component is a quadratic root-find.""" INF = float("inf") - r_sq = float(radius) * float(radius) + r_sq = radius * radius # (M,) # Cached dot products. 
dn = ray_dirs @ ba_norm.transpose(0, 1) # (K, M) — d·n @@ -199,9 +205,10 @@ def _render_capsules_torch( colors: torch.Tensor, H: int, W: int, fx: float, fy: float, cx: float, cy: float, - radius: float, + radius: torch.Tensor, # scalar or (M,) per-capsule radius background_rgb: Optional[torch.Tensor], device: torch.device, + flat_shade: bool = False, ) -> torch.Tensor: """Analytic ray-capsule renderer for a union of capsules. Camera at origin looking down +Z; pixels in y-down screen coords.""" @@ -224,12 +231,16 @@ def _render_capsules_torch( flat_dirs = ray_dirs.view(-1, 3) N = flat_dirs.shape[0] + radius = torch.as_tensor(radius, device=device, dtype=torch.float32) + if radius.ndim == 0: + radius = radius.expand(M) + ba = ends - starts ba_len = torch.linalg.norm(ba, dim=1).clamp(min=1e-6) ba_norm = ba / ba_len.unsqueeze(1) z_min = float(min(starts[:, 2].min().item(), ends[:, 2].min().item())) - z_near = max(0.05, z_min - radius) + z_near = max(0.05, z_min - float(radius.max().item())) # Union of per-capsule screen-space bboxes. Pixels outside this mask # provably can't hit any capsule, so the analytic intersection only runs @@ -298,6 +309,10 @@ def _render_capsules_torch( normals = normals / normals.norm(dim=-1, keepdim=True).clamp(min=1e-8) col = colors[m_h, :3] + if flat_shade: + # Solid per-limb color (OpenPose look) — no lighting/depth modulation. + out[hit_idx] = col + return out.view(H, W, 3).clamp(0.0, 1.0) # SCAIL Blinn-Phong (render_torch.py:290-331). Headlight: light = +Z. 
diff = torch.clamp(-(normals[:, 2]), min=0.0) diffuse = 0.45 + 0.55 * diff @@ -336,6 +351,8 @@ def render_pose_data_capsules( include_hands: bool = False, palette: str = "scail", person_brightness_falloff: float = 0.0, + flat_shade: bool = False, + hand_radius_scale: float = 0.4, device: Optional[torch.device] = None, ) -> torch.Tensor: """Render a frame's pose_data as 3D capsules projected through the per- @@ -345,7 +362,8 @@ def render_pose_data_capsules( `composite='mesh_only'` always uses a black canvas. `radius_m` is in METERS (matching `pred_keypoints_3d` / `pred_cam_t`). - Camera fx/fy come from each person's `focal_length` (pixels); cx/cy = center. + Hand limbs use `radius_m * hand_radius_scale` (their bones are far shorter + than body limbs). Camera fx/fy come from each person's `focal_length`. """ persons = pose_data["frames"][frame_idx] if device is None: @@ -361,7 +379,7 @@ def render_pose_data_capsules( break cx, cy = W * 0.5, H * 0.5 - starts_np, ends_np, colors_np = _build_specs_from_pose( + starts_np, ends_np, colors_np, is_hand_np = _build_specs_from_pose( persons, include_hands=include_hands, palette=palette, person_brightness_falloff=person_brightness_falloff, ) @@ -384,11 +402,14 @@ def render_pose_data_capsules( starts_t = torch.from_numpy(starts_np).to(device=device, dtype=torch.float32) ends_t = torch.from_numpy(ends_np).to(device=device, dtype=torch.float32) colors_t = torch.from_numpy(colors_np).to(device=device, dtype=torch.float32) + radii_np = np.where(is_hand_np, radius_m * hand_radius_scale, radius_m).astype(np.float32) + radii_t = torch.from_numpy(radii_np).to(device=device, dtype=torch.float32) return _render_capsules_torch( starts_t, ends_t, colors_t, H=H, W=W, fx=fx, fy=fy, cx=cx, cy=cy, - radius=float(radius_m), + radius=radii_t, background_rgb=bg_t, device=device, + flat_shade=flat_shade, ) diff --git a/comfy_extras/sam3d_body/export/glb_openpose.py b/comfy_extras/sam3d_body/export/glb_openpose.py index 2b1617eb6..68685c2ac 
100644 --- a/comfy_extras/sam3d_body/export/glb_openpose.py +++ b/comfy_extras/sam3d_body/export/glb_openpose.py @@ -37,6 +37,7 @@ from .glb_shared import ( SCAIL_LIMB_COLORS_17, collect_tracks, flat_shade_mesh, + gaussian_smooth_positions, make_lit_material, quat_sign_fix_per_joint, rotation_align, @@ -364,11 +365,14 @@ def _build_openpose_spheres( bind_kp_m: np.ndarray, radius_m: float, kp_colors: np.ndarray, base_joint_idx: int = 0, smooth_shade: bool = False, + joint_indices: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """UV sphere per OpenPose keypoint, rigidly skinned to that keypoint's joint, vertex-colored from kp_colors. `base_joint_idx` is added to the emitted JOINTS_0 indices so callers can place this group at any offset - in the shared skin (body=0, right hand=18, etc.). + in the shared skin (body=0, right hand=18, etc.). `joint_indices` (when + given) overrides that with explicit per-sphere joint indices, so callers + can skip keypoints (e.g. SCAIL head dots). `smooth_shade=True` keeps the indexed mesh and writes per-vertex normals via face-normal averaging — round shading on the spheres. 
@@ -390,7 +394,7 @@ def _build_openpose_spheres( out_v[v_off:v_off + Nv] = sv * radius_m + bind_kp_m[j] out_n[v_off:v_off + Nv] = sv out_f[j * Nf:(j + 1) * Nf] = sf + v_off - out_j[v_off:v_off + Nv, 0] = j + base_joint_idx + out_j[v_off:v_off + Nv, 0] = int(joint_indices[j]) if joint_indices is not None else j + base_joint_idx out_w[v_off:v_off + Nv, 0] = 1.0 out_c[v_off:v_off + Nv] = kp_colors[j] return _finalize_skinned_mesh(out_v, out_f, out_j, out_w, out_c, smooth_shade) @@ -579,6 +583,24 @@ def _capsule_mesh_local( return v_arr, np.asarray(faces, dtype=np.uint32), weights +def _scail_redirect_neck_stub(body_kp: np.ndarray) -> np.ndarray: + """Replace the nose keypoint (idx 0) of a (...,18,3) array with a short + neck stub (0.6 spine + 0.4 neck→nose), matching the capsule render.""" + out = body_kp.copy() + neck = body_kp[..., 1, :] + nose = body_kp[..., 0, :] + mid_hip = 0.5 * (body_kp[..., 8, :] + body_kp[..., 11, :]) + + def _unit(v): + return v / np.linalg.norm(v, axis=-1, keepdims=True).clip(min=1e-6) + + nose_vec = nose - neck + nose_len = np.linalg.norm(nose_vec, axis=-1, keepdims=True) + mixed = _unit(0.6 * _unit(neck - mid_hip) + 0.4 * _unit(nose_vec)) + out[..., 0, :] = neck + mixed * (nose_len * 0.5) + return out + + def _openpose_limb_rest_trs( bind_kp_m: np.ndarray, pairs: Tuple[Tuple[int, int], ...], ) -> Tuple[np.ndarray, np.ndarray]: @@ -636,6 +658,7 @@ def _build_openpose_sticks( limb_joint_base_idx: int = 0, shape: str = "ellipsoid", smooth_shade: bool = False, + end_width_frac: float = 0.3, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Capsule (cylinder + hemispherical caps) per limb pair (a, b). 
@@ -682,7 +705,7 @@ def _build_openpose_sticks( half_width_eff = max(MIN_WIDTH, min(length * WIDTH_RATIO, half_width_m)) v_local, f_local, _weights_unused = _capsule_mesh_local( - length, half_width_eff, shape=shape, + length, half_width_eff, shape=shape, end_width_frac=end_width_frac, ) v_world = v_local @ R.T + head Nv = v_local.shape[0] @@ -729,13 +752,15 @@ def build_glb_openpose( hand_marker_radius_m: float = 0.0, hand_stick_radius_m: float = 0.0, hand_color_style: str = "dwpose", - face_source: str = "off", + face_style: str = "disabled", face_marker_radius_m: float = 0.0, palette: str = "openpose", shape: str = "ellipsoid", smooth_shade: bool = False, material_roughness: float = 0.85, material_double_sided: bool = False, + stick_end_width_frac: float = 0.6, + bone_smooth_window: int = 0, ) -> bytes: """Build a GLB containing an OpenPose-style 3D skeleton — sphere markers per keypoint plus rainbow-colored sticks between standard limb pairs. @@ -757,9 +782,10 @@ def build_glb_openpose( rainbow per-finger sticks (controlnet_aux/dwpose convention); 'openpose' = rainbow per-finger dots AND sticks (matches poseParameters.cpp::HAND_COLORS_RENDER). - face_source: 'off' (default) | 'rig' — when 'rig', adds ~30 face - contour landmarks sampled from `pred_vertices` at vertex IDs - picked from `pose_data["canonical_colors"]["positions"]`. + face_style: 'disabled' (default) | 'full' | 'eyes_mouth' — face + landmarks sampled from `pred_vertices` at vertex IDs picked from + `pose_data["canonical_colors"]["positions"]`. 'full' = all ~30 + contour points; 'eyes_mouth' = the eyes + outer-lip subset. face_marker_radius_m: per-face landmark sphere radius. 0 = auto = 0.3 × `marker_radius_m` — face landmarks are densely packed around the eyes/mouth/jaw and need to be much smaller than @@ -771,6 +797,12 @@ def build_glb_openpose( SCAIL-Pose style — warm hues right side, cool hues left side, grey neck-to-nose centerline, distinct per-limb colors. 
""" + is_scail = str(palette) == "scail" + # SCAIL drops the face bones (13..16) and eye/ear spheres; keeps nose (idx 0, + # the neck-stub tip) to cap the open cylinder. Matches the capsule render. + body_pairs = OPENPOSE_18_PAIRS[:13] if is_scail else OPENPOSE_18_PAIRS + body_sphere_kp = (np.arange(14, dtype=np.int64) + if is_scail else np.arange(18, dtype=np.int64)) if str(palette) == "scail": body_sphere_colors = SCAIL_KEYPOINT_COLORS_18 body_stick_colors = SCAIL_LIMB_COLORS_17 @@ -805,25 +837,30 @@ def build_glb_openpose( if not tracks: raise ValueError("build_glb_openpose: no valid tracks in pose_data") + # Eyes (6..13) + outer-lip ring (19..22) from FACE_LANDMARK_TARGETS. + _EYES_MOUTH_IDX = np.array([6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22], dtype=np.int64) face_vert_ids: Optional[np.ndarray] = None - if face_source == "rig": + face_target_idx = np.arange(len(FACE_LANDMARK_TARGETS), dtype=np.int64) + if face_style in ("full", "eyes_mouth"): canonical_colors = pose_data.get("canonical_colors") or {} positions = canonical_colors.get("positions") if positions is None: raise ValueError( - "build_glb_openpose: face_source='rig' needs " + "build_glb_openpose: face_style needs " "pose_data['canonical_colors']['positions'] (computed at " "model load and attached by Predict). Ensure the SAM3DBody " "Loader+Predict ran upstream of this node." ) + if face_style == "eyes_mouth": + face_target_idx = _EYES_MOUTH_IDX face_vert_ids = select_face_landmark_vert_ids( np.asarray(positions), face_mask=canonical_colors.get("face_mask"), - ) - elif face_source != "off": + )[face_target_idx] + elif face_style != "disabled": raise ValueError( - f"build_glb_openpose: unknown face_source={face_source!r} " - "(expected 'off' or 'rig')" + f"build_glb_openpose: unknown face_style={face_style!r} " + "(expected 'disabled', 'full', or 'eyes_mouth')" ) K_body = 18 @@ -833,7 +870,7 @@ def build_glb_openpose( # Limb counts: one joint per stick pair. 
Limb joints carry translation + # rotation so each capsule rotates rigidly with its limb (no LBS thinning). - K_body_limbs = len(OPENPOSE_18_PAIRS) + K_body_limbs = len(body_pairs) K_hand_limbs = len(OPENPOSE_HAND_PAIRS) if include_hands else 0 K_limbs = K_body_limbs + 2 * K_hand_limbs # face has no sticks @@ -843,14 +880,14 @@ def build_glb_openpose( joint_names.extend([f"openpose_R_{n}" for n in OPENPOSE_HAND21_NAMES]) joint_names.extend([f"openpose_L_{n}" for n in OPENPOSE_HAND21_NAMES]) if K_face > 0: - joint_names.extend([f"openpose_face_{name}" - for name, _ in FACE_LANDMARK_TARGETS]) + joint_names.extend([f"openpose_face_{FACE_LANDMARK_TARGETS[i][0]}" + for i in face_target_idx]) # Limb joint names, stacked body → R-hand → L-hand to match the limb # joint ordering in skin.joints (after the K keypoint joints). limb_names: List[str] = [ f"openpose_limb_{OPENPOSE_18_NAMES[a]}_{OPENPOSE_18_NAMES[b]}" - for (a, b) in OPENPOSE_18_PAIRS + for (a, b) in body_pairs ] if include_hands: for side in ("R", "L"): @@ -882,6 +919,8 @@ def build_glb_openpose( seq_chunks.append(_extract_face_landmarks_from_verts( pose_data, frame_indices, person_k, face_vert_ids)) kp_seq = np.concatenate(seq_chunks, axis=1) # (N, K, 3) + if bone_smooth_window and bone_smooth_window > 1: + kp_seq = gaussian_smooth_positions(kp_seq, int(bone_smooth_window)) # Static-bind = rig's REST pose when available (override path); else # fall back to frame 0 of the motion. 
The rest-pose bind makes the @@ -896,6 +935,10 @@ def build_glb_openpose( bind_kp_m = (bind_kp_m_rest if bind_kp_m_rest is not None else kp_seq[0].astype(np.float32)) + if is_scail: # nose → neck stub, matching the capsule render + kp_seq[:, :K_body] = _scail_redirect_neck_stub(kp_seq[:, :K_body]) + bind_kp_m[:K_body] = _scail_redirect_neck_stub(bind_kp_m[:K_body]) + person_root: Dict[str, Any] = {"name": f"track{track_i:02d}", "children": []} nodes.append(person_root) person_root_idx = len(nodes) - 1 @@ -920,8 +963,8 @@ def build_glb_openpose( limb_rest_axes_list: List[np.ndarray] = [] limb_anim_mids_list: List[np.ndarray] = [] limb_anim_quats_list: List[np.ndarray] = [] - rmid_b, raxis_b = _openpose_limb_rest_trs(bind_kp_m[:K_body], OPENPOSE_18_PAIRS) - amid_b, aquat_b = _openpose_limb_anim_trs(kp_seq[:, :K_body], OPENPOSE_18_PAIRS, raxis_b) + rmid_b, raxis_b = _openpose_limb_rest_trs(bind_kp_m[:K_body], body_pairs) + amid_b, aquat_b = _openpose_limb_anim_trs(kp_seq[:, :K_body], body_pairs, raxis_b) limb_rest_mids_list.append(rmid_b) limb_rest_axes_list.append(raxis_b) limb_anim_mids_list.append(amid_b) @@ -979,15 +1022,17 @@ def build_glb_openpose( group_meshes: List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = [] sp = _build_openpose_spheres( - bind_kp_m[:K_body], float(marker_radius_m), - body_sphere_colors, base_joint_idx=0, + bind_kp_m[body_sphere_kp], float(marker_radius_m), + body_sphere_colors[body_sphere_kp], base_joint_idx=0, smooth_shade=smooth_shade, + joint_indices=body_sphere_kp, ) st = _build_openpose_sticks( - bind_kp_m[:K_body], OPENPOSE_18_PAIRS, float(stick_radius_m), + bind_kp_m[:K_body], body_pairs, float(stick_radius_m), body_stick_colors, limb_joint_base_idx=K, # body limbs start at K shape=shape, smooth_shade=smooth_shade, + end_width_frac=stick_end_width_frac, ) group_meshes.append(sp) group_meshes.append(st) @@ -1012,6 +1057,7 @@ def build_glb_openpose( limb_joint_base_idx=K + K_body_limbs + hand_i * 
def gaussian_smooth_positions(seq: np.ndarray, window: int) -> np.ndarray:
    """Gaussian-smooth a (N, K, 3) position sequence along time.

    Used to calm jittery keypoint tracks before the openpose rig derives
    sphere translations + limb TRS from them.

    Args:
        seq: Position sequence with time on axis 0. Anything that is not
            already an ndarray (e.g. nested lists) is converted to a float32
            array first.
        window: Smoothing window in frames; coerced with ``int()``. The
            kernel has ``2 * (window // 2) + 1`` taps, so an even window
            yields ``window + 1`` taps. ``sigma = max(0.5, window / 4)``.

    Returns:
        A new float32 array with the same shape as ``seq``. When
        ``window <= 1`` or there are fewer than two frames, the input array
        is returned as-is (same object, dtype untouched).
    """
    if not isinstance(seq, np.ndarray):
        seq = np.asarray(seq, dtype=np.float32)
    window = int(window)
    if window <= 1 or seq.shape[0] < 2:
        return seq

    # Accumulate in float64 so the weighted sum does not lose precision.
    samples = np.asarray(seq, dtype=np.float64)
    n_frames = samples.shape[0]
    half = window // 2
    sigma = max(0.5, window / 4.0)

    taps = np.arange(-half, half + 1, dtype=np.float64)
    kernel = np.exp(-taps * taps / (2.0 * sigma * sigma))
    kernel = kernel / kernel.sum()

    # Edge-replicate padding along time only, so the first/last frames are
    # averaged with copies of themselves instead of being pulled towards 0.
    pad_width = [(half, half)] + [(0, 0)] * (samples.ndim - 1)
    padded = np.pad(samples, pad_width, mode="edge")

    smoothed = np.zeros_like(samples)
    for tap, weight in enumerate(kernel):
        smoothed += weight * padded[tap:tap + n_frames]
    return smoothed.astype(np.float32)


def make_lit_material(
    roughness: float = 0.85, double_sided: bool = False, opacity: float = 1.0,
) -> dict:
    """Lit PBR material using vertex COLOR_0 multiplicatively.

    KHR_materials_unlit is intentionally off so viewer lighting reveals
    surface form. metallic=0 keeps the surface dielectric so vertex colors
    stay readable. roughness=0.85 suits dense rainbow body meshes; 0.3 matches
    SCAIL-Pose's glossy rig look.

    Args:
        roughness: Roughness factor, clamped to [0, 1].
        double_sided: When true, the material is flagged ``doubleSided``.
        opacity: Base-color alpha, clamped to [0, 1]. Any value below 1
            switches the material to alpha-blend (e.g. see-through body mesh
            over bones).

    Returns:
        A glTF material dict. ``alphaMode`` and ``doubleSided`` are only
        present when they differ from the opaque, single-sided case.
    """
    alpha = float(max(0.0, min(1.0, opacity)))
    material = {
        "pbrMetallicRoughness": {
            "baseColorFactor": [1.0, 1.0, 1.0, alpha],
            "metallicFactor": 0.0,
            "roughnessFactor": float(max(0.0, min(1.0, roughness))),
        },
    }
    if alpha < 1.0:
        material["alphaMode"] = "BLEND"
    if double_sided:
        material["doubleSided"] = True
    return material
def apply_camera_override(mhr_pose_data: Dict[str, Any], camera_info: Dict[str, Any],
                          H: int, W: int, fov_deg: float = 0.0) -> Dict[str, Any]:
    """Re-project every frame's pose through a Load3D 6DOF camera.

    The camera is described by position/target/zoom plus an optional FOV.

    Args:
        mhr_pose_data: Dict with a ``"frames"`` list; each frame is a list of
            per-person dicts. Persons without ``pred_cam_t`` are passed
            through untouched.
        camera_info: Dict with optional ``"position"`` / ``"target"``
            (``{"x", "y", "z"}``) and ``"zoom"`` entries.
        H: Image height in pixels (drives the focal length and principal point).
        W: Image width in pixels (drives the principal point).
        fov_deg: Vertical field of view in degrees. ``> 0`` overrides the
            lens; ``0`` keeps the first person's ``focal_length`` (or a 50°
            default when absent) so only the viewpoint changes.

    Returns:
        A new mhr_pose_data dict whose persons carry camera-space 3D arrays,
        re-projected 2D keypoints, a zero ``pred_cam_t`` and the new
        ``focal_length``. The input object itself is returned, unchanged,
        when there is nothing to do: missing/empty frames, ``camera_info``
        is None, no person in the first frame has ``pred_cam_t``, or the
        camera position coincides with its target.
    """
    if camera_info is None:
        return mhr_pose_data
    frames = mhr_pose_data.get("frames") or []
    first_frame = frames[0] if frames else []
    if not first_frame:
        return mhr_pose_data

    # GLB exports the rig root at origin, so Load3D coords are root-relative
    roots = [np.asarray(p["pred_cam_t"], dtype=np.float32).reshape(3)
             for p in first_frame if p.get("pred_cam_t") is not None]
    if not roots:
        return mhr_pose_data
    subj_center = np.mean(np.stack(roots, axis=0), axis=0)

    # Meter-scale, so Three.js coords map 1:1 (Three.js Y-up → flip Y,Z)
    pos = camera_info.get("position") or {}
    tgt = camera_info.get("target") or {}
    pos_v = np.array([float(pos.get("x", 0.0)), -float(pos.get("y", 5.0)), -float(pos.get("z", 0.0))],
                     dtype=np.float32)
    tgt_v = np.array([float(tgt.get("x", 0.0)), -float(tgt.get("y", 0.0)), -float(tgt.get("z", 0.0))],
                     dtype=np.float32)
    offset = pos_v - tgt_v
    offset_len = float(np.linalg.norm(offset))
    if offset_len < 1e-6:
        return mhr_pose_data

    # An explicit None zoom is treated like a missing one; 0 also means 1.0.
    zoom_raw = camera_info.get("zoom")
    zoom = float(1.0 if zoom_raw is None else zoom_raw) or 1.0
    target = subj_center + tgt_v
    eye = target + offset / max(0.01, zoom)

    # Look-at basis. z = -offset (already non-zero); x degenerates only when
    # looking straight along world-up, then fall back to world +X.
    z_axis = -offset / offset_len
    x_axis = np.cross(z_axis, np.array([0.0, -1.0, 0.0], dtype=np.float32))
    x_norm = float(np.linalg.norm(x_axis))
    x_axis = x_axis / x_norm if x_norm > 1e-6 else np.array([1.0, 0.0, 0.0], dtype=np.float32)
    y_axis = np.cross(z_axis, x_axis)
    R = np.stack([x_axis, y_axis, z_axis], axis=0).astype(np.float32)
    R_T = R.T  # loop-invariant

    # fov_deg > 0 overrides the lens; 0 keeps the SAM3D predicted focal so only
    # the viewpoint changes. Three.js fov is vertical → focal from image height.
    if fov_deg > 0:
        new_focal = float(H) / (2.0 * float(np.tan(np.deg2rad(fov_deg) / 2.0)))
    else:
        f0 = first_frame[0].get("focal_length")
        if f0 is not None:
            new_focal = float(np.asarray(f0, dtype=np.float32).reshape(-1)[0])
        else:
            new_focal = float(H) / (2.0 * float(np.tan(np.deg2rad(50.0) / 2.0)))

    center = np.array([W * 0.5, H * 0.5], dtype=np.float32)
    reproj = {"pred_keypoints_3d": "pred_keypoints_2d", "pred_face_keypoints_3d": "pred_face_keypoints_2d"}
    new_frames: List[List[Dict[str, Any]]] = []
    for frame in frames:
        persons = []
        for person in frame:
            person = dict(person)  # shallow copy: never mutate the caller's dicts
            cam_t = person.get("pred_cam_t")
            if cam_t is None:
                persons.append(person)
                continue
            cam_t = np.asarray(cam_t, dtype=np.float32).reshape(3)
            for key in ("pred_keypoints_3d", "pred_vertices", "pred_face_keypoints_3d"):
                value = person.get(key)
                if value is None:
                    continue
                cam = (np.asarray(value, dtype=np.float32) + cam_t - eye) @ R_T
                person[key] = cam.astype(np.float32)
                if key in reproj:  # re-project the new 3D to 2D image coords
                    # Clamp depth so points at/behind the camera do not divide by <= 0.
                    depth = np.maximum(cam[..., 2:3], 1e-6)
                    person[reproj[key]] = (cam[..., :2] * new_focal / depth + center).astype(np.float32)
            # The translation is now baked into the 3D arrays.
            person["pred_cam_t"] = np.zeros(3, dtype=np.float32)
            person["focal_length"] = np.array(new_focal, dtype=np.float32)
            persons.append(person)
        new_frames.append(persons)
    out = dict(mhr_pose_data)
    out["frames"] = new_frames
    return out
per_frame_masks: Optional[List[torch.Tensor]], image_size: Tuple[int, int], inference_type: str, K: int, + cam_int: Optional[torch.Tensor] = None) -> List[List[Dict[str, Any]]]: """Run a SINGLE chunk of frames through run_inference in one forward.""" N = len(frames_rgb) total = N * K