ComfyUI/comfy_extras/sam3d_body/export/bvh.py

"""BVH export for SAM 3D Body pose_data.

BVH stores explicit bone OFFSETs per joint, so any standard importer
(Blender, Maya, MotionBuilder, etc.) reconstructs anatomical bone orientations
directly — no heuristic guessing as needed for glTF. We skip the rig's joint 0
(static world anchor) and use joint 1 as the BVH ROOT (6 channels: XYZ pos +
ZXY rot); every other joint gets 3 channels (ZXY rot only). Rotations are
intrinsic Z-X-Y Euler degrees.
"""

from __future__ import annotations

import io
from typing import Any, Dict, List

import numpy as np

from .glb_shared import (
    bind_skel_state,
    bone_locals_from_globals,
    collect_tracks,
    extract_rig_static,
    global_skel_state_from_pose_data,
    quat_sign_fix_per_joint,
    unflip,
)


def _quat_to_zxy_euler_deg(quat: np.ndarray) -> np.ndarray:
    """xyzw quat → intrinsic Z-X-Y Euler degrees, returned as (..., 3) in
    (z, x, y) order to match BVH's `CHANNELS Zrotation Xrotation Yrotation`."""
    q = np.asarray(quat, dtype=np.float64)
    x, y, z, w = q[..., 0], q[..., 1], q[..., 2], q[..., 3]
    # ZXY decomposition R = Rz(c)·Rx(a)·Ry(b):
    #   M[2][1] = 2(yz + xw)        → sin(a)
    #   M[0][1] = 2(xy - zw)        → -cos(a) sin(c)
    #   M[1][1] = 1 - 2(x² + z²)    → cos(a) cos(c)
    #   M[2][0] = 2(xz - yw)        → -cos(a) sin(b)
    #   M[2][2] = 1 - 2(x² + y²)    → cos(a) cos(b)
    M21 = np.clip(2.0 * (y * z + x * w), -1.0, 1.0)
    M01 = 2.0 * (x * y - z * w)
    M11 = 1.0 - 2.0 * (x * x + z * z)
    M20 = 2.0 * (x * z - y * w)
    M22 = 1.0 - 2.0 * (x * x + y * y)
    a = np.arcsin(M21)
    c = np.arctan2(-M01, M11)
    b = np.arctan2(-M20, M22)
    out = np.stack([np.rad2deg(c), np.rad2deg(a), np.rad2deg(b)], axis=-1)
    return out.astype(np.float32)


def _find_bvh_root(parents: np.ndarray) -> int:
    """First child of the rig's world anchor so the static origin→body stick
    bone gets left out. Falls back to the first root joint."""
    NJ = parents.shape[0]
    world_anchors = [j for j in range(NJ)
                     if not (0 <= int(parents[j]) < NJ and int(parents[j]) != j)]
    if not world_anchors:
        return 0
    children: List[List[int]] = [[] for _ in range(NJ)]
    for j in range(NJ):
        p = int(parents[j])
        if 0 <= p < NJ and p != j:
            children[p].append(j)
    wa = world_anchors[0]
    if children[wa]:
        return children[wa][0]
    return wa


def _build_children_map(parents: np.ndarray) -> List[List[int]]:
    NJ = parents.shape[0]
    out: List[List[int]] = [[] for _ in range(NJ)]
    for j in range(NJ):
        p = int(parents[j])
        if 0 <= p < NJ and p != j:
            out[p].append(j)
    return out


def build_bvh(
    pose_data: Dict[str, Any],
    model: Any,
    *,
    fps: float = 24.0,
    camera_translation: str = "off",
    track_index: int = -1,
    units: str = "cm",
) -> bytes:
    """Build a BVH file from pose_data. Returns UTF-8 encoded text bytes.

    `units` is "cm" (default, standard mocap convention) or "m". Affects the
    OFFSET and root-position values; rotations are independent of units.
    """
    if units not in ("cm", "m"):
        raise ValueError(f"build_bvh: units must be 'cm' or 'm', got {units!r}")
    unit_scale = 100.0 if units == "cm" else 1.0

    rig_static = extract_rig_static(model)
    NJ = int(rig_static["num_joints"])
    parents = rig_static["parents"]
    frames = pose_data["frames"]

    tracks = collect_tracks(pose_data, track_index)
    if not tracks:
        raise ValueError("build_bvh: no valid tracks in pose_data")
    person_k, frame_indices = tracks[0]
    n_frames = len(frame_indices)
    if n_frames == 0:
        raise ValueError("build_bvh: track has zero frames")

    body_root = _find_bvh_root(parents)
    children_map = _build_children_map(parents)

    # Bone OFFSETs come from MHR's translation_offsets (joint position
    # relative to parent in parent's local-bind frame). For the BVH root,
    # we use its bind world position so the skeleton sits at the right
    # spot when imported.
    bind_global = bind_skel_state(model)                     # (NJ, 8) cm
    bind_pos_m = bind_global[:, :3].astype(np.float64) * 0.01  # (NJ, 3) m
    offset_m = rig_static["joint_translation_offsets"].astype(np.float64) * 0.01

    # DFS order rooted at body_root — matches per-frame channel order.
    bvh_order: List[int] = []
    def _visit(j: int) -> None:
        bvh_order.append(j)
        for c in children_map[j]:
            _visit(c)
    _visit(body_root)

    # Use pose_data's stored pred_global_rots/pred_joint_coords (authoritative)
    # rather than re-running rig.forward, then derive locals with body_root
    # treated as the hierarchy root in BVH-space.
    rig_global_m = global_skel_state_from_pose_data(
        pose_data, frame_indices, person_k, NJ,
    )
    rig_global_m[..., 3:7] = quat_sign_fix_per_joint(rig_global_m[..., 3:7])
    bvh_parents = parents.copy()
    bvh_parents[body_root] = -1
    bone_local = bone_locals_from_globals(rig_global_m, bvh_parents)
    # Second pass catches sign discontinuities from the parent-inverse composition.
    bone_local[..., 3:7] = quat_sign_fix_per_joint(bone_local[..., 3:7])

    eulers_deg = _quat_to_zxy_euler_deg(bone_local[..., 3:7])

    if camera_translation in ("absolute", "centered"):
        cam_t = np.stack([
            unflip(np.asarray(frames[t][person_k]["pred_cam_t"], dtype=np.float32))
            for t in frame_indices
        ], axis=0).astype(np.float64)
        if camera_translation == "centered":
            cam_t = cam_t - cam_t[0:1]
        root_pos_m = bind_pos_m[body_root][None, :] + cam_t
    else:
        root_pos_m = np.tile(bind_pos_m[body_root], (n_frames, 1))

    lines: List[str] = ["HIERARCHY"]

    def _emit_joint(j: int, depth: int, is_root: bool) -> None:
        ind = "  " * depth
        keyword = "ROOT" if is_root else "JOINT"
        name = "Hips" if is_root else f"joint_{j:03d}"
        lines.append(f"{ind}{keyword} {name}")
        lines.append(ind + "{")
        o = (bind_pos_m[j] if is_root else offset_m[j]) * unit_scale
        lines.append(f"{ind}  OFFSET {o[0]:.6f} {o[1]:.6f} {o[2]:.6f}")
        if is_root:
            lines.append(ind + "  CHANNELS 6 Xposition Yposition Zposition "
                         "Zrotation Xrotation Yrotation")
        else:
            lines.append(ind + "  CHANNELS 3 Zrotation Xrotation Yrotation")
        kids = children_map[j]
        if kids:
            for c in kids:
                _emit_joint(c, depth + 1, is_root=False)
        else:
            # End Site (standard BVH spec) gives leaf bones a drawable length.
            lines.append(ind + "  End Site")
            lines.append(ind + "  {")
            tip = (offset_m[j] * unit_scale) * 0.3
            tip_norm = float(np.linalg.norm(tip))
            if tip_norm < 0.5 * unit_scale * 0.01:  # < 0.5 mm → fall back
                tip = np.array([0.0, 0.05 * unit_scale, 0.0], dtype=np.float64)
            lines.append(f"{ind}    OFFSET {tip[0]:.6f} {tip[1]:.6f} {tip[2]:.6f}")
            lines.append(ind + "  }")
        lines.append(ind + "}")

    _emit_joint(body_root, 0, is_root=True)

    lines.append("MOTION")
    lines.append(f"Frames: {n_frames}")
    lines.append(f"Frame Time: {1.0 / float(fps):.6f}")

    # Channel matrix: root pos (3) + root rot (3) + non-root rots (3 each) per
    # frame, columns in `bvh_order` order. Vectorized — savetxt's C-side
    # formatting beats Python f-strings by ~10× on long clips.
    non_root_idx = np.asarray(bvh_order[1:], dtype=np.int64)
    motion = np.concatenate([
        root_pos_m * unit_scale,                                 # (N, 3)
        eulers_deg[:, body_root].astype(np.float64),             # (N, 3)
        eulers_deg[:, non_root_idx, :].reshape(n_frames, -1),    # (N, 3*(NJ-1))
    ], axis=1)
    buf = io.StringIO()
    np.savetxt(buf, motion, fmt="%.6f")
    lines.append(buf.getvalue().rstrip("\n"))

    return ("\n".join(lines) + "\n").encode("utf-8")