fix(entrypoint): probe GPU once at startup before permissions; exit fast if not compatible

- Add early GPU probe that first tries nvidia-smi, then torch, with compute capability >= 7.5 gating; write a pass flag to avoid reprobe; exit 42 otherwise to prevent unnecessary work. - Move GPU detection before any user/permission operations to stop repeated permission logs on restarts. - Replace bracketed Markdown URLs with plain URLs in git commands.
2026-06-16 21:09:48 +08:00 · 2025-09-30 16:11:07 -06:00 · 2025-09-30 16:11:07 -06:00 · db5ae38c11
commit db5ae38c11
parent 249ff20e20
1 changed files with 69 additions and 11 deletions
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -12,6 +12,7 @@ SAGE_ATTENTION_DIR="$BASE_DIR/.sage_attention"
 SAGE_ATTENTION_BUILT_FLAG="$SAGE_ATTENTION_DIR/.built"
 PERMISSIONS_SET_FLAG="$BASE_DIR/.permissions_set"
 FIRST_RUN_FLAG="$BASE_DIR/.first_run_done"
+GPU_CHECK_FLAG="$BASE_DIR/.gpu_check_passed"

 # --- logging ---
 log() { echo "[$(date '+%H:%M:%S')] $1"; }
@ -19,12 +20,74 @@ log() { echo "[$(date '+%H:%M:%S')] $1"; }
 # Make newly created files group-writable (helps in shared volumes)
 umask 0002

+# --- early GPU probe (single pass; exit if not OK) ---
+early_gpu_probe() {
+  # Skip re-probe if cached OK
+  if [ -f "$GPU_CHECK_FLAG" ]; then
+    log "GPU check previously passed; skipping reprobe"
+    return 0
+  fi
+
+  # 1) Prefer nvidia-smi if present (fails when driver/device not exposed)
+  if command -v nvidia-smi >/dev/null 2>&1; then
+    if nvidia-smi -L >/dev/null 2>&1; then
+      # Optional: check compute capability via nvidia-smi if available
+      cc_ok=1
+      if nvidia-smi --query-gpu=compute_cap --format=csv,noheader >/dev/null 2>&1; then
+        # compute_cap is like "8.9", "9.0"
+        while IFS= read -r cap; do
+          maj="${cap%%.*}"; min="${cap#*.}"
+          [ -z "$maj" ] && continue
+          val=$((10*${maj:-0}+${min:-0}))
+          if [ "$val" -lt 75 ]; then cc_ok=0; fi
+        done < <(nvidia-smi --query-gpu=compute_cap --format=csv,noheader || true)
+      fi
+      if [ "$cc_ok" -eq 1 ]; then
+        touch "$GPU_CHECK_FLAG"
+        log "GPU check passed via nvidia-smi"
+        return 0
+      else
+        log "Incompatible GPU compute capability detected (< 7.5); exiting"
+        return 2
+      fi
+    fi
+  fi
+
+  # 2) Fallback: torch probe (does not need a GPU to import)
+  if python - <<'PY'
+import sys
+try:
+    import torch
+    sys.exit(0 if torch.cuda.is_available() else 3)
+except Exception:
+    sys.exit(4)
+PY
+  then
+    touch "$GPU_CHECK_FLAG"
+    log "GPU check passed via torch probe"
+    return 0
+  else
+    rc=$?
+    if [ $rc -eq 3 ]; then
+      log "torch reports CUDA unavailable; exiting"
+    else
+      log "torch probe failed to import or run; exiting"
+    fi
+    return 5
+  fi
+}
+
+# Run early GPU probe before any user/permission work
+if ! early_gpu_probe; then
+  log "No compatible NVIDIA GPU detected or driver not visible to container; exiting."
+  exit 42
+fi
+
 # --- build parallelism (single knob) ---
 decide_build_jobs() {
    if [ -n "${SAGE_MAX_JOBS:-}" ]; then echo "$SAGE_MAX_JOBS"; return; fi
-    local mem_kb
+    local mem_kb cpu cap jobs
    mem_kb=$(awk '/MemTotal:/ {print $2}' /proc/meminfo 2>/dev/null || echo 0)
-    local cpu cap jobs
    cpu=$(nproc) cap=24
    if   [ "$mem_kb" -le $((8*1024*1024)) ];  then jobs=2
    elif [ "$mem_kb" -le $((12*1024*1024)) ]; then jobs=3
@ -35,7 +98,7 @@ decide_build_jobs() {
    echo "$jobs"
 }

-# --- unified GPU probe (torch-based) ---
+# --- unified GPU probe (torch-based, for downstream flags) ---
 probe_and_prepare_gpu() {
 python - <<'PY' 2>/dev/null
 import os, sys
@ -251,19 +314,14 @@ export PYTHONPATH="$HOME/.local/lib/python${pyver}/site-packages:${PYTHONPATH:-}
 export PIP_USER=1
 export PIP_PREFER_BINARY=1

-# --- single GPU probe + early exit ---
+# --- single GPU probe for downstream flags ---
 eval "$(probe_and_prepare_gpu)"
-if [ "${GPU_COUNT:-0}" -eq 0 ] || [ "${COMPAT_GE_75:-0}" -ne 1 ]; then
-    log "No compatible NVIDIA GPU (compute capability >= 7.5) detected; shutting down."
-    exit 0
-fi

-# --- Ensure package manager and Manager deps are available ---
-# Ensure python -m pip works (bootstrap if needed)
+# Ensure pip available
 python -m pip --version >/dev/null 2>&1 || python -m ensurepip --upgrade >/dev/null 2>&1 || true
 python -m pip --version >/dev/null 2>&1 || log "WARNING: pip still not available after ensurepip"

-# Ensure ComfyUI-Manager minimal Python deps
+# Ensure ComfyUI-Manager minimal dep
 python - <<'PY'
 import sys
 try: