diff --git a/comfy_extras/nodes_sam3d_body.py b/comfy_extras/nodes_sam3d_body.py index 0c2b03565..cac17a7cb 100644 --- a/comfy_extras/nodes_sam3d_body.py +++ b/comfy_extras/nodes_sam3d_body.py @@ -143,7 +143,7 @@ class SAM3DBody_Predict(io.ComfyNode): "fov", tooltip=( "Vertical FoV in degrees. Affects predicted depth and absolute scale. 0 = fall back to ~53° (16:9)." - ), + ), advanced=True ), io.Int.Input( "batch_size", #TODO: automate? @@ -454,7 +454,7 @@ class SAM3DBody_Smooth(io.ComfyNode): io.Combo.Input( "method", options=["gaussian", "savgol"], - default="savgol", advanced=True, + default="savgol", tooltip=( "gaussian: symmetric weighted average, best general-purpose smoother./n" "savgol: sliding polynomial fit, preserves sharp peaks." diff --git a/comfy_extras/sam3d_body/utils.py b/comfy_extras/sam3d_body/utils.py index 5fff80953..d46337470 100644 --- a/comfy_extras/sam3d_body/utils.py +++ b/comfy_extras/sam3d_body/utils.py @@ -67,6 +67,17 @@ def inputs_from_sam3_track(track_data, B: int, H: int, W: int): BATCHED_CROPS_PER_CHUNK = 64 +def _quat_to_mat_wxyz(w: float, x: float, y: float, z: float) -> np.ndarray: + """(3,3) rotation from a wxyz quaternion; columns are the rotated axes.""" + n = math.sqrt(w * w + x * x + y * y + z * z) or 1.0 + w, x, y, z = w / n, x / n, y / n, z / n + return np.array([ + [1 - 2 * (y * y + z * z), 2 * (x * y - w * z), 2 * (x * z + w * y)], + [2 * (x * y + w * z), 1 - 2 * (x * x + z * z), 2 * (y * z - w * x)], + [2 * (x * z - w * y), 2 * (y * z + w * x), 1 - 2 * (x * x + y * y)], + ], dtype=np.float32) + + def cam_int_from_fov(height: int, width: int, fov_degrees: float) -> Optional[torch.Tensor]: """(1,3,3) intrinsic matrix from a vertical FOV in degrees. Matches MoGe2's convention (vertical focal for both axes). Returns None for fov<=0 so the @@ -83,19 +94,28 @@ def cam_int_from_fov(height: int, width: int, fov_degrees: float) -> Optional[to def apply_camera_override(mhr_pose_data: Dict[str, Any], camera_info: Dict[str, Any], - H: int, W: int, fov_deg: float = 0.0) -> Dict[str, Any]: + H: int, W: int) -> Dict[str, Any]: """Re-project every frame's pose through a Load3D 6DOF camera (position/ target/zoom + optional FOV). Returns a new mhr_pose_data; unchanged on empty/invalid input.""" first_frame = mhr_pose_data["frames"][0] if mhr_pose_data["frames"] else [] if not first_frame: return mhr_pose_data - # GLB exports the rig root at origin, so Load3D coords are root-relative - roots = [np.asarray(p["pred_cam_t"], dtype=np.float32).reshape(3) - for p in first_frame if p.get("pred_cam_t") is not None] + # Per-person rig root (pred_cam_t) and body centroid (mesh mean), in camera space. + roots, centroids = [], [] + for p in first_frame: + cam_t = p.get("pred_cam_t") + if cam_t is None: + continue + cam_t = np.asarray(cam_t, dtype=np.float32).reshape(3) + roots.append(cam_t) + v = p.get("pred_vertices") + centroids.append(np.asarray(v, dtype=np.float32).reshape(-1, 3).mean(axis=0) + cam_t + if v is not None else cam_t) if not roots: return mhr_pose_data - subj_center = np.mean(np.stack(roots, axis=0), axis=0) + subj_root = np.mean(np.stack(roots, axis=0), axis=0) + subj_centroid = np.mean(np.stack(centroids, axis=0), axis=0) # Meter-scale, so Three.js coords map 1:1 (Three.js Y-up → flip Y,Z) pos = camera_info.get("position") or {} @@ -103,26 +123,50 @@ def apply_camera_override(mhr_pose_data: Dict[str, Any], camera_info: Dict[str, pos_v = np.array([float(pos.get("x", 0.0)), -float(pos.get("y", 5.0)), -float(pos.get("z", 0.0))], dtype=np.float32) tgt_v = np.array([float(tgt.get("x", 0.0)), -float(tgt.get("y", 0.0)), -float(tgt.get("z", 0.0))], dtype=np.float32) offset = pos_v - tgt_v - if float(np.linalg.norm(offset)) < 1e-6: - return mhr_pose_data + has_offset = float(np.linalg.norm(offset)) >= 1e-6 + q = camera_info.get("quaternion") + if not has_offset and not q: + return mhr_pose_data # no viewpoint and no orientation -> nothing to apply zoom = float(camera_info.get("zoom", 1.0)) or 1.0 - target = subj_center + tgt_v - eye = target + offset / max(0.01, zoom) + # SAM3D roots near the feet. A target at the origin -> center the body centroid + if float(np.linalg.norm(tgt_v)) < 1e-6: + target = subj_centroid + else: + target = subj_root + tgt_v - # Look-at basis. z = -offset (already non-zero); x degenerates only when - # looking straight along world-up, then fall back to world +X. - z_axis = -offset / float(np.linalg.norm(offset)) - x_axis = np.cross(z_axis, np.array([0.0, -1.0, 0.0], dtype=np.float32)) - x_norm = float(np.linalg.norm(x_axis)) - x_axis = x_axis / x_norm if x_norm > 1e-6 else np.array([1.0, 0.0, 0.0], dtype=np.float32) - y_axis = np.cross(z_axis, x_axis) + if q: + mv = lambda v: np.array([v[0], -v[1], -v[2]], dtype=np.float32) + norm = lambda v: v / max(1e-6, float(np.linalg.norm(v))) + Rc = _quat_to_mat_wxyz( + float(q.get("w", 1.0)), float(q.get("x", 0.0)), + float(q.get("y", 0.0)), float(q.get("z", 0.0)), + ) # columns = camera world axes + x_axis = norm(mv(Rc[:, 0])) # camera +X -> image right + y_axis = norm(mv(-Rc[:, 1])) # image +Y is down -> negative of camera up + z_axis = norm(mv(-Rc[:, 2])) # camera looks down local -Z -> view direction + else: + # x degenerates only when looking straight along world-up -> world +X. + z_axis = -offset / float(np.linalg.norm(offset)) + x_axis = np.cross(z_axis, np.array([0.0, -1.0, 0.0], dtype=np.float32)) + x_norm = float(np.linalg.norm(x_axis)) + x_axis = x_axis / x_norm if x_norm > 1e-6 else np.array([1.0, 0.0, 0.0], dtype=np.float32) + y_axis = np.cross(z_axis, x_axis) R = np.stack([x_axis, y_axis, z_axis], axis=0).astype(np.float32) - # fov_deg > 0 overrides the lens; 0 keeps the SAM3D predicted focal so only - # the viewpoint changes. Three.js fov is vertical → focal from image height. - if fov_deg > 0: - new_focal = float(H) / (2.0 * float(np.tan(np.deg2rad(fov_deg) / 2.0))) + # Eye: dolly along the given offset; for a rotation-only camera (position == + # target) keep the predicted viewing distance so only orientation/roll changes. + if has_offset: + eye = target + offset / max(0.01, zoom) + else: + d = max(0.1, float(target[2])) + eye = target - z_axis * (d / max(0.01, zoom)) + + # Lens: use the camera's own FoV; else the SAM3D predicted focal (viewpoint- + # only change). Three.js fov is vertical → focal from image height. + cam_fov = float(camera_info.get("fov", 0.0) or 0.0) + if cam_fov > 0: + new_focal = float(H) / (2.0 * float(np.tan(np.deg2rad(cam_fov) / 2.0))) else: f0 = first_frame[0].get("focal_length") new_focal = (float(np.asarray(f0, dtype=np.float32).reshape(-1)[0]) if f0 is not None