refactor(entrypoint): single-pass GPU checks, preserved env across user switch, streamlined SageAttention build/cleanup

- Add a fast GPU presence gate with `nvidia-smi -L` at container start; exit early if unavailable or zero GPUs. Perform one thorough torch-based probe only in the root phase, export probe results (GPU_COUNT, COMPAT_GE_75, TORCH_CUDA_ARCH_LIST, SAGE_STRATEGY, SAGE_BUILD_STRATEGY), and call `runuser -p` so the app-user pass skips all GPU checks/logs. Remove any post-switch probing and strategy recovery paths to prevent duplicate logs. Unify wording to “SageAttention” and avoid duplicate “build” messages by logging the compilation once. After a successful install, delete the cloned sources under `.sage_attention/SageAttention` and retain `.built`. No features removed; behavior on GPU hosts is unchanged with cleaner, more accurate logs.
2026-06-16 04:49:35 +08:00 · 2025-10-03 13:27:33 -06:00 · 2025-10-03 13:27:33 -06:00 · 5e33515edc
commit 5e33515edc
parent 39b0a0cca8
1 changed files with 42 additions and 24 deletions
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -25,6 +25,21 @@ log() { echo "[$(date '+%H:%M:%S')] $1"; }
 # Make newly created files group-writable (helps in shared volumes)
 umask 0002
 # --- quick GPU presence check (nvidia-smi) ---
 # Fast presence gate: asks `nvidia-smi -L` for the device list and counts the
 # lines of the form "GPU <n>:".
 # Outputs: one status line via log().
 # Exits:   ends the script with status 0 (a clean shutdown, not an error) when
 #          nvidia-smi fails or lists no devices; returns normally otherwise.
 quick_check_gpus() {
    local out count
    if ! out="$(nvidia-smi -L 2>/dev/null)"; then
        log "GPU quick check failed (nvidia-smi not available); shutting down."
        exit 0
    fi
    # grep -c prints 0 but exits 1 when nothing matches; `|| true` keeps that
    # status from aborting the script under errexit/pipefail before the
    # zero-device message below is logged. [0-9][0-9]* is the portable BRE
    # spelling of "one or more digits".
    count="$(printf '%s\n' "$out" | grep -c '^GPU [0-9][0-9]*:' || true)"
    if [ "${count:-0}" -lt 1 ]; then
        log "GPU quick check found 0 NVIDIA devices; shutting down."
        exit 0
    fi
    log "GPU quick check passed"
 }
 # --- build parallelism (single knob) ---
 decide_build_jobs() {
    if [ -n "${SAGE_MAX_JOBS:-}" ]; then echo "$SAGE_MAX_JOBS"; return; fi
@ -98,7 +113,6 @@ print(f"TORCH_CUDA_ARCH_LIST='{arch_list}'")
 for k,v in flags.items():
    print(f"{k}={'true' if v else 'false'}")
 print(f"SAGE_STRATEGY='{strategy}'")
 # stderr: detailed device list
 print(f"[GPU] {n} CUDA device(s); CC list: {arch_list or 'none'}; strategy={strategy}; compat>=7.5:{compat}", file=sys.stderr)
 for i,(nm,cc,mb) in enumerate(zip(names, ccs, mems)):
    print(f"[GPU] cuda:{i} - {nm} (CC {cc}, {mb} MB)", file=sys.stderr)
@ -160,6 +174,8 @@ build_sage_attention_mixed() {
    if MAX_JOBS="${jobs}" python -m pip install --no-build-isolation .; then
        echo "${SAGE_BUILD_STRATEGY:-${SAGE_STRATEGY:-fallback}}|${TORCH_CUDA_ARCH_LIST:-}" > "$SAGE_ATTENTION_BUILT_FLAG"
        log "SageAttention built successfully"
        # cleanup cloned sources to save space; keep .built flag
        cd "$SAGE_ATTENTION_DIR" && rm -rf "SageAttention" || true
        cd "$BASE_DIR"; return 0
    else
        log "ERROR: SageAttention build failed"
@ -217,7 +233,6 @@ seed_flag = pathlib.Path(sys.argv[2])
 cfg_dir = cfg_path.parent
 cfg_dir.mkdir(parents=True, exist_ok=True)
 # Collect CM_* envs -> [default] keys
 def norm_bool(v:str):
    t=v.strip().lower()
    if t in ("1","true","yes","on"): return "True"
@ -238,7 +253,6 @@ if cfg_path.exists():
 if "default" not in cfg:
    cfg["default"] = {}
 # If first boot for config, fully recreate default section from env
 if first_seed:
    cfg["default"].clear()
    for k,v in sorted(env_items.items()):
@ -249,7 +263,6 @@ if first_seed:
    seed_flag.touch()
    print(f"[CFG] created: {cfg_path} with {len(env_items)} CM_ keys", file=sys.stderr)
 else:
    # Subsequent boots: apply only provided CM_ overrides; keep others
    for k,v in env_items.items():
        if cfg["default"].get(k) != v:
            cfg["default"][k] = v
@ -261,21 +274,28 @@ else:
 PY
 }
-# --- early GPU probe and exit (before heavy setup) ---
+# --- start: quick check then thorough probe (root only) ---
-eval "$(probe_and_prepare_gpu)"
+if [ -z "${GPU_QUICK_CHECK_DONE:-}" ]; then
-export SAGE_BUILD_STRATEGY="${SAGE_STRATEGY:-fallback}"
+    quick_check_gpus
 log "GPU probe: ${GPU_COUNT:-0} CUDA device(s); CC list: ${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_BUILD_STRATEGY}"
 if [ "${GPU_COUNT:-0}" -eq 0 ]; then
    log "No NVIDIA GPU detected; shutting down."
    exit 0
 fi
 if [ "${COMPAT_GE_75:-0}" -ne 1 ]; then
    log "GPU compute capability < 7.5; shutting down."
    exit 0
 fi
 # --- root to runtime user ---
 if [ "$(id -u)" = "0" ]; then
    # thorough probe & strategy (visible log once)
    eval "$(probe_and_prepare_gpu)"
    # export all needed vars so app-user pass doesn't re-probe
    export GPU_COUNT COMPAT_GE_75 TORCH_CUDA_ARCH_LIST SAGE_STRATEGY
    export SAGE_BUILD_STRATEGY="${SAGE_STRATEGY:-fallback}"
    log "GPU probe: ${GPU_COUNT:-0} CUDA device(s); CC list: ${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_BUILD_STRATEGY}"
    if [ "${GPU_COUNT:-0}" -eq 0 ]; then
        log "No NVIDIA GPU detected; shutting down."
        exit 0
    fi
    if [ "${COMPAT_GE_75:-0}" -ne 1 ]; then
        log "GPU compute capability < 7.5; shutting down."
        exit 0
    fi
    # permissions and user switch
    if [ ! -f "$PERMISSIONS_SET_FLAG" ]; then
        log "Setting up user permissions..."
        if getent group "${PGID}" >/dev/null; then
@ -285,7 +305,6 @@ if [ "$(id -u)" = "0" ]; then
        mkdir -p "/home/${APP_USER}"
        for d in "$BASE_DIR" "/home/$APP_USER"; do [ -e "$d" ] && chown -R "${APP_USER}:${APP_GROUP}" "$d" || true; done
        # Make system site-packages writable by the runtime user (no venvs; system-wide installs)
        readarray -t PY_PATHS < <(python - <<'PY'
 import sys, sysconfig, os, site, datetime
 def log(m): print(f"[bootstrap:python {datetime.datetime.now().strftime('%H:%M:%S')}] {m}", file=sys.stderr, flush=True)
@ -326,15 +345,14 @@ PY
    else
        log "User permissions already configured, skipping..."
    fi
-    exec runuser -u "${APP_USER}" -- "$0" "$@"
+
    # flag and preserve env across user switch; skip quick check as app user
    export GPU_QUICK_CHECK_DONE=1
    exec runuser -p -u "${APP_USER}" -- "$0" "$@"
 fi
-# From here on, running as $APP_USER
+# --- From here on, running as $APP_USER ---
-
+# No quick check or probe here; variables were preserved
 # --- refresh GPU probe after user switch (no exit) ---
 eval "$(probe_and_prepare_gpu)"
 export SAGE_BUILD_STRATEGY="${SAGE_STRATEGY:-fallback}"
 log "GPU probe (post-switch): ${GPU_COUNT:-0} CUDA device(s); CC list: ${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_BUILD_STRATEGY}"
 # --- SageAttention setup using probed data ---
 setup_sage_attention