mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-26 02:12:31 +08:00
refactor(entrypoint): single-pass GPU checks, preserved env across user switch, streamlined SageAttention build/cleanup
- Add a fast GPU presence gate with `nvidia-smi -L` at container start; exit early if `nvidia-smi` is unavailable or reports zero GPUs. Perform one thorough torch-based probe only in the root phase, export probe results (GPU_COUNT, COMPAT_GE_75, TORCH_CUDA_ARCH_LIST, SAGE_STRATEGY, SAGE_BUILD_STRATEGY), and call `runuser -p` so the app-user pass skips all GPU checks/logs. Remove any post-switch probing and strategy recovery paths to prevent duplicate logs. Unify wording to “SageAttention” and avoid duplicate “build” messages by logging the compilation once. After a successful install, delete the cloned sources under `.sage_attention/SageAttention` and retain `.built`. No features removed; behavior on GPU hosts is unchanged, with cleaner, more accurate logs.
This commit is contained in:
parent
39b0a0cca8
commit
5e33515edc
@ -25,6 +25,21 @@ log() { echo "[$(date '+%H:%M:%S')] $1"; }
|
|||||||
# Make newly created files group-writable (helps in shared volumes)
|
# Make newly created files group-writable (helps in shared volumes)
|
||||||
umask 0002
|
umask 0002
|
||||||
|
|
||||||
|
# --- quick GPU presence check (nvidia-smi) ---
|
||||||
|
# quick_check_gpus: cheap GPU presence gate, meant to run before heavy setup.
#   Runs `nvidia-smi -L` and counts the "GPU <n>:" lines it prints.
#   Exits the whole script with status 0 (after logging the reason) when
#   nvidia-smi cannot be run successfully or lists no devices; otherwise
#   logs "GPU quick check passed" and returns.
#   Uses log(), which is defined earlier in this script.
quick_check_gpus() {
  # Keep both scratch variables function-local so they do not leak into the
  # script's global scope.
  local out count

  if ! out="$(nvidia-smi -L 2>/dev/null)"; then
    log "GPU quick check failed (nvidia-smi not available); shutting down."
    exit 0
  fi

  # grep -c prints 0 but exits 1 when nothing matches; `|| true` prevents that
  # status from aborting the script if it runs under `set -e`/`pipefail`
  # (not visible from here) before the message below is logged.
  # -E makes `+` portable; `\+` in a basic regex is a GNU extension.
  count="$(printf '%s\n' "$out" | grep -Ec '^GPU [0-9]+:' || true)"

  if [ "${count:-0}" -lt 1 ]; then
    log "GPU quick check found 0 NVIDIA devices; shutting down."
    exit 0
  fi

  log "GPU quick check passed"
}
|
||||||
|
|
||||||
# --- build parallelism (single knob) ---
|
# --- build parallelism (single knob) ---
|
||||||
decide_build_jobs() {
|
decide_build_jobs() {
|
||||||
if [ -n "${SAGE_MAX_JOBS:-}" ]; then echo "$SAGE_MAX_JOBS"; return; fi
|
if [ -n "${SAGE_MAX_JOBS:-}" ]; then echo "$SAGE_MAX_JOBS"; return; fi
|
||||||
@ -98,7 +113,6 @@ print(f"TORCH_CUDA_ARCH_LIST='{arch_list}'")
|
|||||||
for k,v in flags.items():
|
for k,v in flags.items():
|
||||||
print(f"{k}={'true' if v else 'false'}")
|
print(f"{k}={'true' if v else 'false'}")
|
||||||
print(f"SAGE_STRATEGY='{strategy}'")
|
print(f"SAGE_STRATEGY='{strategy}'")
|
||||||
# stderr: detailed device list
|
|
||||||
print(f"[GPU] {n} CUDA device(s); CC list: {arch_list or 'none'}; strategy={strategy}; compat>=7.5:{compat}", file=sys.stderr)
|
print(f"[GPU] {n} CUDA device(s); CC list: {arch_list or 'none'}; strategy={strategy}; compat>=7.5:{compat}", file=sys.stderr)
|
||||||
for i,(nm,cc,mb) in enumerate(zip(names, ccs, mems)):
|
for i,(nm,cc,mb) in enumerate(zip(names, ccs, mems)):
|
||||||
print(f"[GPU] cuda:{i} - {nm} (CC {cc}, {mb} MB)", file=sys.stderr)
|
print(f"[GPU] cuda:{i} - {nm} (CC {cc}, {mb} MB)", file=sys.stderr)
|
||||||
@ -160,6 +174,8 @@ build_sage_attention_mixed() {
|
|||||||
if MAX_JOBS="${jobs}" python -m pip install --no-build-isolation .; then
|
if MAX_JOBS="${jobs}" python -m pip install --no-build-isolation .; then
|
||||||
echo "${SAGE_BUILD_STRATEGY:-${SAGE_STRATEGY:-fallback}}|${TORCH_CUDA_ARCH_LIST:-}" > "$SAGE_ATTENTION_BUILT_FLAG"
|
echo "${SAGE_BUILD_STRATEGY:-${SAGE_STRATEGY:-fallback}}|${TORCH_CUDA_ARCH_LIST:-}" > "$SAGE_ATTENTION_BUILT_FLAG"
|
||||||
log "SageAttention built successfully"
|
log "SageAttention built successfully"
|
||||||
|
# cleanup cloned sources to save space; keep .built flag
|
||||||
|
cd "$SAGE_ATTENTION_DIR" && rm -rf "SageAttention" || true
|
||||||
cd "$BASE_DIR"; return 0
|
cd "$BASE_DIR"; return 0
|
||||||
else
|
else
|
||||||
log "ERROR: SageAttention build failed"
|
log "ERROR: SageAttention build failed"
|
||||||
@ -217,7 +233,6 @@ seed_flag = pathlib.Path(sys.argv[2])
|
|||||||
cfg_dir = cfg_path.parent
|
cfg_dir = cfg_path.parent
|
||||||
cfg_dir.mkdir(parents=True, exist_ok=True)
|
cfg_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Collect CM_* envs -> [default] keys
|
|
||||||
def norm_bool(v:str):
|
def norm_bool(v:str):
|
||||||
t=v.strip().lower()
|
t=v.strip().lower()
|
||||||
if t in ("1","true","yes","on"): return "True"
|
if t in ("1","true","yes","on"): return "True"
|
||||||
@ -238,7 +253,6 @@ if cfg_path.exists():
|
|||||||
if "default" not in cfg:
|
if "default" not in cfg:
|
||||||
cfg["default"] = {}
|
cfg["default"] = {}
|
||||||
|
|
||||||
# If first boot for config, fully recreate default section from env
|
|
||||||
if first_seed:
|
if first_seed:
|
||||||
cfg["default"].clear()
|
cfg["default"].clear()
|
||||||
for k,v in sorted(env_items.items()):
|
for k,v in sorted(env_items.items()):
|
||||||
@ -249,7 +263,6 @@ if first_seed:
|
|||||||
seed_flag.touch()
|
seed_flag.touch()
|
||||||
print(f"[CFG] created: {cfg_path} with {len(env_items)} CM_ keys", file=sys.stderr)
|
print(f"[CFG] created: {cfg_path} with {len(env_items)} CM_ keys", file=sys.stderr)
|
||||||
else:
|
else:
|
||||||
# Subsequent boots: apply only provided CM_ overrides; keep others
|
|
||||||
for k,v in env_items.items():
|
for k,v in env_items.items():
|
||||||
if cfg["default"].get(k) != v:
|
if cfg["default"].get(k) != v:
|
||||||
cfg["default"][k] = v
|
cfg["default"][k] = v
|
||||||
@ -261,21 +274,28 @@ else:
|
|||||||
PY
|
PY
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- early GPU probe and exit (before heavy setup) ---
|
# --- start: quick check then thorough probe (root only) ---
|
||||||
eval "$(probe_and_prepare_gpu)"
|
if [ -z "${GPU_QUICK_CHECK_DONE:-}" ]; then
|
||||||
export SAGE_BUILD_STRATEGY="${SAGE_STRATEGY:-fallback}"
|
quick_check_gpus
|
||||||
log "GPU probe: ${GPU_COUNT:-0} CUDA device(s); CC list: ${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_BUILD_STRATEGY}"
|
|
||||||
if [ "${GPU_COUNT:-0}" -eq 0 ]; then
|
|
||||||
log "No NVIDIA GPU detected; shutting down."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
if [ "${COMPAT_GE_75:-0}" -ne 1 ]; then
|
|
||||||
log "GPU compute capability < 7.5; shutting down."
|
|
||||||
exit 0
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- root to runtime user ---
|
|
||||||
if [ "$(id -u)" = "0" ]; then
|
if [ "$(id -u)" = "0" ]; then
|
||||||
|
# thorough probe & strategy (visible log once)
|
||||||
|
eval "$(probe_and_prepare_gpu)"
|
||||||
|
# export all needed vars so app-user pass doesn't re-probe
|
||||||
|
export GPU_COUNT COMPAT_GE_75 TORCH_CUDA_ARCH_LIST SAGE_STRATEGY
|
||||||
|
export SAGE_BUILD_STRATEGY="${SAGE_STRATEGY:-fallback}"
|
||||||
|
log "GPU probe: ${GPU_COUNT:-0} CUDA device(s); CC list: ${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_BUILD_STRATEGY}"
|
||||||
|
if [ "${GPU_COUNT:-0}" -eq 0 ]; then
|
||||||
|
log "No NVIDIA GPU detected; shutting down."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "${COMPAT_GE_75:-0}" -ne 1 ]; then
|
||||||
|
log "GPU compute capability < 7.5; shutting down."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# permissions and user switch
|
||||||
if [ ! -f "$PERMISSIONS_SET_FLAG" ]; then
|
if [ ! -f "$PERMISSIONS_SET_FLAG" ]; then
|
||||||
log "Setting up user permissions..."
|
log "Setting up user permissions..."
|
||||||
if getent group "${PGID}" >/dev/null; then
|
if getent group "${PGID}" >/dev/null; then
|
||||||
@ -285,7 +305,6 @@ if [ "$(id -u)" = "0" ]; then
|
|||||||
mkdir -p "/home/${APP_USER}"
|
mkdir -p "/home/${APP_USER}"
|
||||||
for d in "$BASE_DIR" "/home/$APP_USER"; do [ -e "$d" ] && chown -R "${APP_USER}:${APP_GROUP}" "$d" || true; done
|
for d in "$BASE_DIR" "/home/$APP_USER"; do [ -e "$d" ] && chown -R "${APP_USER}:${APP_GROUP}" "$d" || true; done
|
||||||
|
|
||||||
# Make system site-packages writable by the runtime user (no venvs; system-wide installs)
|
|
||||||
readarray -t PY_PATHS < <(python - <<'PY'
|
readarray -t PY_PATHS < <(python - <<'PY'
|
||||||
import sys, sysconfig, os, site, datetime
|
import sys, sysconfig, os, site, datetime
|
||||||
def log(m): print(f"[bootstrap:python {datetime.datetime.now().strftime('%H:%M:%S')}] {m}", file=sys.stderr, flush=True)
|
def log(m): print(f"[bootstrap:python {datetime.datetime.now().strftime('%H:%M:%S')}] {m}", file=sys.stderr, flush=True)
|
||||||
@ -326,15 +345,14 @@ PY
|
|||||||
else
|
else
|
||||||
log "User permissions already configured, skipping..."
|
log "User permissions already configured, skipping..."
|
||||||
fi
|
fi
|
||||||
exec runuser -u "${APP_USER}" -- "$0" "$@"
|
|
||||||
|
# flag and preserve env across user switch; skip quick check as app user
|
||||||
|
export GPU_QUICK_CHECK_DONE=1
|
||||||
|
exec runuser -p -u "${APP_USER}" -- "$0" "$@"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# From here on, running as $APP_USER
|
# --- From here on, running as $APP_USER ---
|
||||||
|
# No quick check or probe here; variables were preserved
|
||||||
# --- refresh GPU probe after user switch (no exit) ---
|
|
||||||
eval "$(probe_and_prepare_gpu)"
|
|
||||||
export SAGE_BUILD_STRATEGY="${SAGE_STRATEGY:-fallback}"
|
|
||||||
log "GPU probe (post-switch): ${GPU_COUNT:-0} CUDA device(s); CC list: ${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_BUILD_STRATEGY}"
|
|
||||||
|
|
||||||
# --- SageAttention setup using probed data ---
|
# --- SageAttention setup using probed data ---
|
||||||
setup_sage_attention
|
setup_sage_attention
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user