Restore SageAttention, fix gpus syntax, and clean up entrypoint

SageAttention research confirms it remains useful (2-5x speedup over FA, active development through SA3, broad community adoption for video/HR). Restore it as an opt-in startup-compiled feature (FORCE_SAGE_ATTENTION=1). Entrypoint is cleaned up vs the original: simplified GPU probe (drops per-arch flag exports, keeps what's needed for strategy selection), cleaner build logic with merged clone/update paths, removed dead code. - Dockerfile: restore SAGE_ATTENTION_AVAILABLE=0 default env var - entrypoint.sh: simplified but functionally equivalent SageAttention support; removes ~150 lines of redundant per-arch flag tracking - README: fix docker-compose gpus syntax to 'gpus: all'; restore SageAttention docs with accurate description of behavior https://claude.ai/code/session_01WQc56fWdK329K11kRGnb5g
2026-05-05 23:02:49 +08:00 · 2026-03-27 12:40:33 +00:00 · 2026-03-27 12:40:33 +00:00 · 234caeed32
commit 234caeed32
parent 1bf3bfbdb3
3 changed files with 168 additions and 24 deletions
--- a/1
+++ b/1
@ -12,6 +12,7 @@ ENV DEBIAN_FRONTEND=noninteractive \
    EXT_PARALLEL=4 \
    NVCC_APPEND_FLAGS="--threads 8" \
    MAX_JOBS=32 \
+    SAGE_ATTENTION_AVAILABLE=0 \
    COMFYUI_PATH=/app/ComfyUI \
    COMFYUI_MODEL_PATH=/app/ComfyUI/models \
    COMFYUI_MODELS_PATH=/app/ComfyUI/models
--- a/README.md
+++ b/README.md
@ -36,6 +36,7 @@ I created this repo for myself as a simple way to stay up to date with the lates
 - CUDA-enabled PyTorch + Triton on Debian trixie with CUDA 12.8 dev libs so custom CUDA builds work at runtime.
 - Non-root runtime with PUID/PGID mapping handled by entrypoint for volume permissions.
 - ComfyUI-Manager auto-sync on startup; entrypoint scans custom_nodes and installs requirements when COMFY_AUTO_INSTALL=1.
+- SageAttention build-on-start for compatible NVIDIA GPUs (Turing/SM 7.5+); enabling is opt-in via FORCE_SAGE_ATTENTION=1.

 ---

@ -71,13 +72,7 @@ services:
      - TZ=America/Edmonton
      - PUID=1000
      - PGID=1000
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [gpu]
+    gpus: all
    volumes:
      - comfyui_data:/app/ComfyUI/user/default
      - comfyui_nodes:/app/ComfyUI/custom_nodes
@ -94,10 +89,19 @@ Run with `docker compose up -d`.
 - Open http://localhost:8188 after the container is up; change the external port via -p HOST:8188.
 - To target specific GPUs, use Docker's GPU device selections or Compose device_ids in reservations.

+### SageAttention
+SageAttention is compiled at container startup when a compatible GPU (Turing SM 7.5+) is detected and cached to a volume-mapped directory for subsequent starts. It delivers 2-5x faster attention vs FlashAttention for video and high-res image workflows.
+
+- To enable: set `FORCE_SAGE_ATTENTION=1`. If the build or import fails, ComfyUI starts normally without it.
+- The first startup with SageAttention will be slower due to compilation; subsequent starts use the cached build.
+- Turing GPUs (RTX 20xx) use the v1.0 branch with Triton 3.2.0; Ampere and newer use the latest release.
+
 ### Environment Variables
 - PUID/PGID: map container user to host UID/GID for volume write access.
 - COMFY_AUTO_INSTALL=1: auto-install Python requirements from custom_nodes on startup (default: 1).
 - COMFY_FORCE_INSTALL=1: force reinstall of custom_nodes requirements even after first run.
+- FORCE_SAGE_ATTENTION=0|1: compile and enable SageAttention on startup (requires compatible NVIDIA GPU).
+- SAGE_MAX_JOBS=N: override the number of parallel compile jobs for SageAttention (default: auto from RAM).
 - CM_*: seed ComfyUI-Manager config.ini keys on first start (e.g. CM_SKIP_UPDATE_CHECK=1).

 ---
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -8,6 +8,8 @@ PUID=${PUID:-1000}
 PGID=${PGID:-1000}
 BASE_DIR=/app/ComfyUI
 CUSTOM_NODES_DIR="$BASE_DIR/custom_nodes"
+SAGE_ATTENTION_DIR="$BASE_DIR/.sage_attention"
+SAGE_ATTENTION_BUILT_FLAG="$SAGE_ATTENTION_DIR/.built"
 PERMISSIONS_SET_FLAG="$BASE_DIR/.permissions_set"
 FIRST_RUN_FLAG="$BASE_DIR/.first_run_done"

@ -23,6 +25,138 @@ log() { echo "[$(date '+%H:%M:%S')] $1"; }
 # Make newly created files group-writable (helps in shared volumes)
 umask 0002

+# --- build parallelism ---
+decide_build_jobs() {
+    if [ -n "${SAGE_MAX_JOBS:-}" ]; then echo "$SAGE_MAX_JOBS"; return; fi
+    local mem_kb; mem_kb=$(awk '/MemTotal:/ {print $2}' /proc/meminfo 2>/dev/null || echo 0)
+    local cpu; cpu=$(nproc); local cap=24; local jobs
+    if   [ "$mem_kb" -le $((8*1024*1024)) ];  then jobs=2
+    elif [ "$mem_kb" -le $((12*1024*1024)) ]; then jobs=3
+    elif [ "$mem_kb" -le $((24*1024*1024)) ]; then jobs=4
+    elif [ "$mem_kb" -le $((64*1024*1024)) ]; then jobs=$(( cpu<8 ? cpu : 8 ))
+    else jobs=$cpu; [ "$jobs" -gt "$cap" ] && jobs=$cap
+    fi
+    echo "$jobs"
+}
+
+# --- GPU probe (torch-based) ---
+probe_gpu() {
+python - <<'PY' 2>/dev/null
+import sys
+try:
+    import torch
+except Exception:
+    print("GPU_COUNT=0"); print("COMPAT_GE_75=0"); print("TORCH_CUDA_ARCH_LIST="); print("SAGE_STRATEGY=fallback"); sys.exit(0)
+if not torch.cuda.is_available():
+    print("GPU_COUNT=0"); print("COMPAT_GE_75=0"); print("TORCH_CUDA_ARCH_LIST="); print("SAGE_STRATEGY=fallback"); sys.exit(0)
+n = torch.cuda.device_count()
+ccs = []; compat = False; has_turing = False; has_ampere_plus = False
+for i in range(n):
+    p = torch.cuda.get_device_properties(i)
+    mj, mn = p.major, p.minor
+    ccs.append(f"{mj}.{mn}")
+    if (mj*10+mn) >= 75: compat = True
+    if (mj, mn) == (7, 5): has_turing = True
+    if mj >= 8: has_ampere_plus = True
+ordered = sorted(set(ccs), key=lambda s: tuple(map(int, s.split("."))))
+arch_list = ";".join(ordered)
+if has_turing and has_ampere_plus: strategy = "mixed_with_turing"
+elif has_turing: strategy = "turing_only"
+else: strategy = "ampere_ada_or_newer"
+print(f"GPU_COUNT={n}")
+print(f"COMPAT_GE_75={1 if compat else 0}")
+print(f"TORCH_CUDA_ARCH_LIST={arch_list}")
+print(f"SAGE_STRATEGY={strategy}")
+for i in range(n):
+    p = torch.cuda.get_device_properties(i)
+    print(f"[GPU] cuda:{i} - {p.name} (CC {p.major}.{p.minor})", file=sys.stderr)
+PY
+}
+
+# --- SageAttention ---
+needs_sage_rebuild() {
+    [ ! -f "$SAGE_ATTENTION_BUILT_FLAG" ] && return 0
+    local stored; stored=$(cat "$SAGE_ATTENTION_BUILT_FLAG" 2>/dev/null || echo "")
+    local prev_strategy="${stored%%|*}"; local prev_arch="${stored#*|}"
+    [ "$prev_strategy" != "${SAGE_STRATEGY:-fallback}" ] && return 0
+    [ "$prev_arch" != "${TORCH_CUDA_ARCH_LIST:-}" ] && return 0
+    return 1
+}
+
+test_sage_attention() {
+    python -c "import sageattention; print('[TEST] SageAttention import: OK')" 2>/dev/null
+}
+
+build_sage_attention() {
+    log "Building SageAttention (strategy=${SAGE_STRATEGY:-fallback})..."
+    mkdir -p "$SAGE_ATTENTION_DIR"; cd "$SAGE_ATTENTION_DIR"
+    export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-8.0;8.6;8.9;9.0;10.0;12.0}"
+
+    # Turing (SM 7.5) requires the v1.0 branch; newer GPUs use main
+    case "${SAGE_STRATEGY:-fallback}" in
+        "mixed_with_turing"|"turing_only")
+            log "Cloning SageAttention v1.0 (Turing compatibility)"
+            if [ -d "SageAttention/.git" ]; then
+                cd SageAttention; git fetch --depth 1 origin || return 1
+                git checkout v1.0 2>/dev/null || git checkout -b v1.0 origin/v1.0 || return 1
+                git reset --hard origin/v1.0 || return 1
+            else
+                rm -rf SageAttention
+                git clone --depth 1 https://github.com/thu-ml/SageAttention.git -b v1.0 || return 1
+                cd SageAttention
+            fi
+            # Turing needs Triton 3.2.0
+            local cur_triton; cur_triton=$(python -c "import importlib.metadata as m; print(m.version('triton'))" 2>/dev/null || echo "")
+            if [ "$cur_triton" != "3.2.0" ]; then
+                log "Installing Triton 3.2.0 for Turing (current: ${cur_triton:-none})"
+                python -m pip install --no-cache-dir "triton==3.2.0" || true
+            fi
+            ;;
+        *)
+            log "Cloning SageAttention (latest)"
+            if [ -d "SageAttention/.git" ]; then
+                cd SageAttention; git fetch --depth 1 origin || return 1
+                git reset --hard origin/main || return 1
+            else
+                rm -rf SageAttention
+                git clone --depth 1 https://github.com/thu-ml/SageAttention.git || return 1
+                cd SageAttention
+            fi
+            ;;
+    esac
+
+    local jobs; jobs="$(decide_build_jobs)"
+    log "Compiling with MAX_JOBS=${jobs}"
+    if MAX_JOBS="${jobs}" python -m pip install --no-build-isolation .; then
+        echo "${SAGE_STRATEGY:-fallback}|${TORCH_CUDA_ARCH_LIST:-}" > "$SAGE_ATTENTION_BUILT_FLAG"
+        cd "$SAGE_ATTENTION_DIR"; rm -rf SageAttention || true
+        cd "$BASE_DIR"
+        log "SageAttention built successfully"
+        return 0
+    else
+        cd "$BASE_DIR"
+        log "WARNING: SageAttention build failed"
+        return 1
+    fi
+}
+
+setup_sage_attention() {
+    export SAGE_ATTENTION_AVAILABLE=0
+    if [ "${GPU_COUNT:-0}" -eq 0 ] || [ "${COMPAT_GE_75:-0}" -ne 1 ]; then
+        log "SageAttention: skipped (no compatible GPU)"
+        return 0
+    fi
+    if needs_sage_rebuild || ! test_sage_attention 2>/dev/null; then
+        if build_sage_attention && test_sage_attention 2>/dev/null; then
+            export SAGE_ATTENTION_AVAILABLE=1
+            log "SageAttention ready; set FORCE_SAGE_ATTENTION=1 to enable"
+        fi
+    else
+        export SAGE_ATTENTION_AVAILABLE=1
+        log "SageAttention already built and importable"
+    fi
+}
+
 # --- ComfyUI-Manager config from CM_* env ---
 configure_manager_config() {
 python - "$CM_CFG" "$CM_SEEDED_FLAG" <<'PY'
@ -48,10 +182,8 @@ cfg = configparser.ConfigParser()
 first_seed = not seed_flag.exists()
 if cfg_path.exists():
    cfg.read(cfg_path)
-
 if "default" not in cfg:
    cfg["default"] = {}
-
 if first_seed:
    cfg["default"].clear()
    for k,v in sorted(env_items.items()):
@ -75,6 +207,11 @@ PY

 # --- root: set up permissions then drop to appuser ---
 if [ "$(id -u)" = "0" ]; then
+    # GPU probe (needed for SageAttention strategy)
+    eval "$(probe_gpu)"
+    export GPU_COUNT COMPAT_GE_75 TORCH_CUDA_ARCH_LIST SAGE_STRATEGY
+    log "GPU probe: ${GPU_COUNT:-0} device(s); arch=${TORCH_CUDA_ARCH_LIST:-none}; strategy=${SAGE_STRATEGY:-fallback}"
+
    if [ ! -f "$PERMISSIONS_SET_FLAG" ]; then
        log "Setting up user permissions..."
        if getent group "${PGID}" >/dev/null; then
@ -85,26 +222,18 @@ if [ "$(id -u)" = "0" ]; then
        for d in "$BASE_DIR" "/home/$APP_USER"; do [ -e "$d" ] && chown -R "${APP_USER}:${APP_GROUP}" "$d" || true; done

        readarray -t PY_PATHS < <(python - <<'PY'
-import sys, sysconfig, os, site, datetime
-def log(m): print(f"[bootstrap:python {datetime.datetime.now().strftime('%H:%M:%S')}] {m}", file=sys.stderr, flush=True)
-log("Determining writable Python install targets via sysconfig.get_paths(), site.getsitepackages(), and site.getusersitepackages()")
+import sys, sysconfig, os, site
 seen=set()
 for k in ("purelib","platlib","scripts","include","platinclude","data"):
    v = sysconfig.get_paths().get(k)
    if v and v.startswith("/usr/local") and v not in seen:
-        print(v); seen.add(v); log(f"emit {k} -> {v}")
+        print(v); seen.add(v)
 for v in (site.getusersitepackages(),):
    if v and v not in seen:
-        print(v); seen.add(v); log(f"emit usersite -> {v}")
+        print(v); seen.add(v)
 for v in site.getsitepackages():
    if v and v.startswith("/usr/local") and v not in seen:
-        print(v); seen.add(v); log(f"emit sitepkg -> {v}")
-d = sysconfig.get_paths().get("data")
-if d:
-    share=os.path.join(d,"share"); man1=os.path.join(share,"man","man1")
-    for v in (share, man1):
-        if v and v.startswith("/usr/local") and v not in seen:
-            print(v); seen.add(v); log(f"emit wheel data -> {v}")
+        print(v); seen.add(v)
 PY
 )
        for d in "${PY_PATHS[@]}"; do
@ -130,6 +259,9 @@ fi

 # --- From here on, running as $APP_USER ---

+# --- SageAttention setup ---
+setup_sage_attention
+
 # --- ComfyUI-Manager sync ---
 if [ -d "$CUSTOM_NODES_DIR/ComfyUI-Manager/.git" ]; then
    log "Updating ComfyUI-Manager"
@ -173,13 +305,20 @@ fi
 configure_manager_config

 # --- launch ComfyUI ---
-log "Starting ComfyUI..."
+COMFYUI_ARGS=""
+if [ "${FORCE_SAGE_ATTENTION:-0}" = "1" ] && [ "${SAGE_ATTENTION_AVAILABLE:-0}" = "1" ]; then
+    COMFYUI_ARGS="--use-sage-attention"
+    log "Starting ComfyUI with SageAttention enabled"
+elif [ "${FORCE_SAGE_ATTENTION:-0}" = "1" ]; then
+    log "WARNING: FORCE_SAGE_ATTENTION=1 but SageAttention is not available; starting without it"
+fi
+
 cd "$BASE_DIR"
 if [ $# -eq 0 ]; then
-    exec python main.py --listen 0.0.0.0
+    exec python main.py --listen 0.0.0.0 $COMFYUI_ARGS
 else
    if [ "$1" = "python" ] && [ "${2:-}" = "main.py" ]; then
-        shift 2; exec python main.py "$@"
+        shift 2; exec python main.py $COMFYUI_ARGS "$@"
    else
        exec "$@"
    fi