#!/bin/bash
#
# ComfyUI container entrypoint.
#
# Responsibilities (in order):
#   1. As root: align the runtime user/group with PUID/PGID, make the app tree
#      and Python install targets writable, then re-exec itself via runuser.
#   2. As the app user: abort unless a CUDA GPU with compute capability >= 7.5
#      is present, build/verify SageAttention (strategy chosen per GPU
#      generation), sync ComfyUI-Manager, install custom-node dependencies on
#      first run, and finally exec ComfyUI.
#
# Tunables (environment):
#   APP_USER/APP_GROUP/PUID/PGID  runtime identity (defaults appuser/1000)
#   SAGE_MAX_JOBS                 override build parallelism heuristic
#   SAGE_ARCH_LIST_OVERRIDE       override TORCH_CUDA_ARCH_LIST detection
#   SAGE_PTX_FALLBACK=1           append +PTX for the highest detected arch
#   SAGE_VERBOSE_BUILD=1          verbose torch C++ extension build
#   FORCE_SAGE_ATTENTION=1        start ComfyUI with --use-sage-attention
#   COMFY_AUTO_INSTALL (default 1) / COMFY_FORCE_INSTALL  first-run installs
set -euo pipefail

# --- config ---
APP_USER=${APP_USER:-appuser}
APP_GROUP=${APP_GROUP:-appuser}
PUID=${PUID:-1000}
PGID=${PGID:-1000}
BASE_DIR=/app/ComfyUI
CUSTOM_NODES_DIR="$BASE_DIR/custom_nodes"
SAGE_ATTENTION_DIR="$BASE_DIR/.sage_attention"
SAGE_ATTENTION_BUILT_FLAG="$SAGE_ATTENTION_DIR/.built"
PERMISSIONS_SET_FLAG="$BASE_DIR/.permissions_set"
FIRST_RUN_FLAG="$BASE_DIR/.first_run_done"

# --- logging ---
log() { echo "[$(date '+%H:%M:%S')] $1"; }

# Make newly created files group-writable (helps in shared volumes)
umask 0002

# --- build parallelism (single knob) ---
# Public knob: SAGE_MAX_JOBS. If unset, pick RAM/CPU heuristic.
# Compilation of SageAttention is RAM-hungry, so the job count is primarily
# bounded by MemTotal, then by CPU count, with a hard cap of 24.
decide_build_jobs() {
  if [ -n "${SAGE_MAX_JOBS:-}" ]; then echo "$SAGE_MAX_JOBS"; return; fi
  local mem_kb cpu jobs
  local -r cap=24
  # Best-effort probes: fall back to 0 kB / 1 CPU rather than aborting.
  mem_kb=$(awk '/MemTotal:/ {print $2}' /proc/meminfo 2>/dev/null || echo 0)
  cpu=$(nproc 2>/dev/null || echo 1)
  if [ "$mem_kb" -le $((8 * 1024 * 1024)) ]; then
    jobs=2
  elif [ "$mem_kb" -le $((12 * 1024 * 1024)) ]; then
    jobs=3
  elif [ "$mem_kb" -le $((24 * 1024 * 1024)) ]; then
    jobs=4
  elif [ "$mem_kb" -le $((64 * 1024 * 1024)) ]; then
    jobs=$(( cpu < 8 ? cpu : 8 ))
  else
    jobs=$cpu
    if [ "$jobs" -gt "$cap" ]; then jobs=$cap; fi
  fi
  echo "$jobs"
}

# --- CUDA/Torch checks ---
# Prints per-GPU info; exit 0 iff torch sees at least one CUDA device.
test_pytorch_cuda() {
  python -c "
import torch, sys
if not torch.cuda.is_available():
    print('[ERROR] PyTorch CUDA not available'); sys.exit(1)
c = torch.cuda.device_count()
print(f'[TEST] PyTorch CUDA available with {c} devices')
for i in range(c):
    p = torch.cuda.get_device_properties(i)
    print(f'[TEST] GPU {i}: {p.name} (Compute {p.major}.{p.minor})')
" 2>/dev/null
}

# Determine if there is a compatible NVIDIA GPU (>= sm_75, i.e., 16-series/Turing and newer)
# Exit codes: 0 ok, 2 no CUDA, 3 all GPUs too old, 4 torch import/other error.
gpu_is_compatible() {
  python - <<'PY' 2>/dev/null
import sys
try:
    import torch
    if not torch.cuda.is_available():
        sys.exit(2)
    ok = False
    for i in range(torch.cuda.device_count()):
        p = torch.cuda.get_device_properties(i)
        cc = float(f"{p.major}.{p.minor}")
        if cc >= 7.5:
            ok = True
    sys.exit(0 if ok else 3)
except Exception:
    sys.exit(4)
PY
}

# Derive arch list directly from Torch; optional +PTX via SAGE_PTX_FALLBACK=1
# Prints a ';'-joined list like "8.6;8.9" (empty string on any failure).
compute_arch_list_from_torch() {
  python - <<'PY' 2>/dev/null
import os, sys
try:
    import torch
    if not torch.cuda.is_available():
        print(""); sys.exit(0)
    caps = {f"{torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}"
            for i in range(torch.cuda.device_count())}
    ordered = sorted(caps, key=lambda s: tuple(int(x) for x in s.split(".")))
    if not ordered:
        print(""); sys.exit(0)
    if os.environ.get("SAGE_PTX_FALLBACK", "0") == "1":
        highest = ordered[-1]
        print(";".join(ordered + [highest + "+PTX"]))
    else:
        print(";".join(ordered))
except Exception:
    print("")
PY
}

# Fallback name-based mapping across Turing→Blackwell
# Exports DET_* generation flags plus GPU_COUNT; returns 1 when no GPU found.
detect_gpu_generations() {
  local info
  info=$(nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null || echo "")
  local has_turing=false has_amp_ga100=false has_amp_ga10x=false has_amp_ga10b=false
  local has_ada=false has_hopper=false has_bw_cons=false has_bw_dc=false
  local n=0
  [ -z "$info" ] && { log "No NVIDIA GPUs detected"; return 1; }
  log "Detecting GPU generations:"
  while IFS= read -r g; do
    n=$((n+1))
    log " GPU $n: $g"
    # Map marketing names to architecture buckets; a name may match at most one arm.
    case "$g" in
      *"RTX 20"*|*"T4"*) has_turing=true ;;
      *"A100"*|*"A30"*|*"A40"*) has_amp_ga100=true ;;
      *"RTX 30"*|*"RTX 3090"*|*"RTX 3080"*|*"RTX 3070"*|*"RTX 3060"*) has_amp_ga10x=true ;;
      *"Orin"*|*"Jetson"*) has_amp_ga10b=true ;;
      *"RTX 40"*|*"4090"*|*"L40"*|*"L4"*) has_ada=true ;;
      *"H100"*|*"H200"*|*"GH200"*) has_hopper=true ;;
      *"RTX 50"*|*"5090"*|*"5080"*|*"5070"*|*"5060"*|*"PRO "*Blackwell*|*"PRO 4000 Blackwell"*) has_bw_cons=true ;;
      *"B200"*|*"B100"*|*"GB200"*|*"B40"*|*"RTX 6000 Blackwell"*|*"RTX 5000 Blackwell"*) has_bw_dc=true ;;
    esac
  done <<< "$info"
  export DET_TURING=$has_turing DET_AMP80=$has_amp_ga100 DET_AMP86=$has_amp_ga10x DET_AMP87=$has_amp_ga10b
  export DET_ADA=$has_ada DET_HOPPER=$has_hopper DET_BW12=$has_bw_cons DET_BW10=$has_bw_dc
  export GPU_COUNT=$n
  log "Summary: Turing=$has_turing Amp(8.0)=$has_amp_ga100 Amp(8.6)=$has_amp_ga10x Amp(8.7)=$has_amp_ga10b Ada=$has_ada Hopper=$has_hopper Blackwell(12.x)=$has_bw_cons Blackwell(10.0)=$has_bw_dc"
  test_pytorch_cuda && log "PyTorch CUDA compatibility confirmed" || log "WARNING: PyTorch CUDA compatibility issues detected"
}

# Pick a SageAttention build strategy from the DET_* flags; exports SAGE_STRATEGY.
# Turing anywhere in the rig forces compatibility mode (SageAttention v1.0).
determine_sage_strategy() {
  local s=""
  if [ "${DET_TURING:-false}" = "true" ]; then
    if [ "${DET_AMP80:-false}" = "true" ] || [ "${DET_AMP86:-false}" = "true" ] \
      || [ "${DET_AMP87:-false}" = "true" ] || [ "${DET_ADA:-false}" = "true" ] \
      || [ "${DET_HOPPER:-false}" = "true" ] || [ "${DET_BW12:-false}" = "true" ] \
      || [ "${DET_BW10:-false}" = "true" ]; then
      s="mixed_with_turing"
      log "Mixed rig including Turing - using compatibility mode"
    else
      s="turing_only"
      log "Turing-only rig detected"
    fi
  elif [ "${DET_BW12:-false}" = "true" ] || [ "${DET_BW10:-false}" = "true" ]; then
    s="blackwell_capable"
    log "Blackwell detected - using latest optimizations"
  elif [ "${DET_HOPPER:-false}" = "true" ]; then
    s="hopper_capable"
    log "Hopper detected - using modern optimizations"
  elif [ "${DET_ADA:-false}" = "true" ] || [ "${DET_AMP86:-false}" = "true" ] \
    || [ "${DET_AMP87:-false}" = "true" ] || [ "${DET_AMP80:-false}" = "true" ]; then
    s="ampere_ada_optimized"
    log "Ampere/Ada detected - using standard optimizations"
  else
    s="fallback"
    log "Unknown configuration - using fallback"
  fi
  export SAGE_STRATEGY=$s
}

# Install the Triton build matching SAGE_STRATEGY (best-effort except fallback arm).
install_triton_version() {
  case "$SAGE_STRATEGY" in
    "mixed_with_turing"|"turing_only")
      log "Installing Triton 3.2.0 for Turing compatibility"
      python -m pip install --user --force-reinstall "triton==3.2.0" \
        || python -m pip install --user --force-reinstall triton || true
      ;;
    "blackwell_capable"|"hopper_capable")
      log "Installing latest Triton for Hopper/Blackwell"
      python -m pip install --user --force-reinstall triton \
        || python -m pip install --user --force-reinstall --pre triton \
        || python -m pip install --user --force-reinstall "triton>=3.2.0" || true
      ;;
    *)
      log "Installing latest stable Triton"
      python -m pip install --user --force-reinstall triton \
        || { log "WARNING: Triton installation failed"; return 1; }
      ;;
  esac
}

# Clone/update SageAttention (v1.0 for Turing strategies, main otherwise),
# build it with the resolved TORCH_CUDA_ARCH_LIST, and record strategy+arch
# in the built-flag file for needs_rebuild(). Returns non-zero on failure.
build_sage_attention_mixed() {
  log "Building Sage Attention..."
  mkdir -p "$SAGE_ATTENTION_DIR"
  cd "$SAGE_ATTENTION_DIR"
  local arch_list="${SAGE_ARCH_LIST_OVERRIDE:-$(compute_arch_list_from_torch)}"
  if [ -z "$arch_list" ]; then
    # Torch-based detection failed: synthesize the list from DET_* flags.
    local tmp=""
    [ "${DET_TURING:-false}" = "true" ] && tmp="${tmp}7.5;"
    [ "${DET_AMP80:-false}" = "true" ] && tmp="${tmp}8.0;"
    [ "${DET_AMP86:-false}" = "true" ] && tmp="${tmp}8.6;"
    [ "${DET_AMP87:-false}" = "true" ] && tmp="${tmp}8.7;"
    [ "${DET_ADA:-false}" = "true" ] && tmp="${tmp}8.9;"
    [ "${DET_HOPPER:-false}" = "true" ] && tmp="${tmp}9.0;"
    [ "${DET_BW10:-false}" = "true" ] && tmp="${tmp}10.0;"
    [ "${DET_BW12:-false}" = "true" ] && tmp="${tmp}12.0;"
    arch_list="${tmp%;}"
  fi
  export TORCH_CUDA_ARCH_LIST="$arch_list"
  log "Set TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
  case "$SAGE_STRATEGY" in
    "mixed_with_turing"|"turing_only")
      log "Cloning SageAttention v1.0 for Turing"
      if [ -d "SageAttention/.git" ]; then
        cd SageAttention
        git fetch --depth 1 origin || return 1
        git checkout v1.0 2>/dev/null || git checkout -b v1.0 origin/v1.0 || return 1
        git reset --hard origin/v1.0 || return 1
      else
        rm -rf SageAttention
        git clone --depth 1 https://github.com/thu-ml/SageAttention.git -b v1.0 || return 1
        cd SageAttention
      fi
      ;;
    *)
      log "Cloning latest SageAttention"
      if [ -d "SageAttention/.git" ]; then
        cd SageAttention
        git fetch --depth 1 origin || return 1
        git reset --hard origin/main || return 1
      else
        rm -rf SageAttention
        git clone --depth 1 https://github.com/thu-ml/SageAttention.git || return 1
        cd SageAttention
      fi
      ;;
  esac
  if [ "${SAGE_VERBOSE_BUILD:-0}" = "1" ]; then export TORCH_CPP_BUILD_VERBOSE=1; fi
  local jobs
  jobs="$(decide_build_jobs)"
  log "Using MAX_JOBS=${jobs} for SageAttention build"
  if MAX_JOBS="${jobs}" python -m pip install --user --no-build-isolation .; then
    echo "$SAGE_STRATEGY|$TORCH_CUDA_ARCH_LIST" > "$SAGE_ATTENTION_BUILT_FLAG"
    log "SageAttention built successfully"
    cd "$BASE_DIR"
    return 0
  else
    log "ERROR: SageAttention build failed"
    cd "$BASE_DIR"
    return 1
  fi
}

# Return 0 (rebuild needed) when no built-flag exists or the recorded
# "strategy|arch_list" differs from the current SAGE_STRATEGY/arch list.
needs_rebuild() {
  if [ ! -f "$SAGE_ATTENTION_BUILT_FLAG" ]; then return 0; fi
  local x
  x=$(cat "$SAGE_ATTENTION_BUILT_FLAG" 2>/dev/null || echo "")
  local prev_strategy="${x%%|*}"
  local prev_arch="${x#*|}"
  if [ "$prev_strategy" != "$SAGE_STRATEGY" ] || [ "$prev_arch" != "$TORCH_CUDA_ARCH_LIST" ]; then
    return 0
  fi
  return 1
}

# Smoke-test: can the sageattention module be imported? Exit 0 on success.
test_sage_attention() {
  python -c "
import sys
try:
    import sageattention
    print('[TEST] SageAttention import: SUCCESS')
    v = getattr(sageattention, '__version__', None)
    if v:
        print(f'[TEST] Version: {v}'); sys.exit(0)
except ImportError as e:
    print(f'[TEST] SageAttention import: FAILED - {e}'); sys.exit(1)
except Exception as e:
    print(f'[TEST] SageAttention test: ERROR - {e}'); sys.exit(1)
" 2>/dev/null
}

# Orchestrate detection -> strategy -> (re)build -> verify.
# Exports SAGE_ATTENTION_BUILT / SAGE_ATTENTION_AVAILABLE for the launcher.
setup_sage_attention() {
  export SAGE_ATTENTION_BUILT=0 SAGE_ATTENTION_AVAILABLE=0
  if ! detect_gpu_generations; then
    log "No GPUs detected, skipping SageAttention setup"
    return 0
  fi
  determine_sage_strategy
  export TORCH_CUDA_ARCH_LIST="${SAGE_ARCH_LIST_OVERRIDE:-$(compute_arch_list_from_torch)}"
  if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
    # Same name-based fallback as in build_sage_attention_mixed.
    local tmp=""
    [ "${DET_TURING:-false}" = "true" ] && tmp="${tmp}7.5;"
    [ "${DET_AMP80:-false}" = "true" ] && tmp="${tmp}8.0;"
    [ "${DET_AMP86:-false}" = "true" ] && tmp="${tmp}8.6;"
    [ "${DET_AMP87:-false}" = "true" ] && tmp="${tmp}8.7;"
    [ "${DET_ADA:-false}" = "true" ] && tmp="${tmp}8.9;"
    [ "${DET_HOPPER:-false}" = "true" ] && tmp="${tmp}9.0;"
    [ "${DET_BW10:-false}" = "true" ] && tmp="${tmp}10.0;"
    [ "${DET_BW12:-false}" = "true" ] && tmp="${tmp}12.0;"
    export TORCH_CUDA_ARCH_LIST="${tmp%;}"
  fi
  log "Resolved TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
  if needs_rebuild || ! test_sage_attention; then
    log "Building SageAttention..."
    if install_triton_version && build_sage_attention_mixed && test_sage_attention; then
      export SAGE_ATTENTION_BUILT=1 SAGE_ATTENTION_AVAILABLE=1
      log "SageAttention is built; set FORCE_SAGE_ATTENTION=1 to enable it at startup"
    else
      export SAGE_ATTENTION_BUILT=0 SAGE_ATTENTION_AVAILABLE=0
      log "WARNING: SageAttention is not available after build attempt"
    fi
  else
    export SAGE_ATTENTION_BUILT=1 SAGE_ATTENTION_AVAILABLE=1
    log "SageAttention already built and importable"
  fi
}

# --- root to runtime user ---
# One-time (flag-guarded) identity/permission setup, then drop privileges by
# re-exec'ing this script as $APP_USER.
if [ "$(id -u)" = "0" ]; then
  if [ ! -f "$PERMISSIONS_SET_FLAG" ]; then
    log "Setting up user permissions..."
    # Reuse an existing group with the requested GID rather than colliding.
    if getent group "${PGID}" >/dev/null; then
      EXISTING_GRP="$(getent group "${PGID}" | cut -d: -f1)"
      usermod -g "${EXISTING_GRP}" "${APP_USER}" || true
      APP_GROUP="${EXISTING_GRP}"
    else
      groupmod -o -g "${PGID}" "${APP_GROUP}" || true
    fi
    usermod -o -u "${PUID}" "${APP_USER}" || true
    mkdir -p "/home/${APP_USER}"
    for d in "$BASE_DIR" "/home/$APP_USER"; do
      [ -e "$d" ] && chown -R "${APP_USER}:${APP_GROUP}" "$d" || true
    done
    # Discover both system and user site dirs and make them writable by the runtime user
    readarray -t PY_PATHS < <(python - <<'PY'
import sys, sysconfig, os, site, datetime
def log(m):
    print(f"[bootstrap:python {datetime.datetime.now().strftime('%H:%M:%S')}] {m}", file=sys.stderr, flush=True)
log("Determining writable Python install targets via sysconfig.get_paths(), site.getsitepackages(), and site.getusersitepackages()")
seen = set()
for k in ("purelib", "platlib", "scripts", "include", "platinclude", "data"):
    v = sysconfig.get_paths().get(k)
    if v and v.startswith("/usr/local") and v not in seen:
        print(v); seen.add(v); log(f"emit {k} -> {v}")
for v in (site.getusersitepackages(),):
    if v and v not in seen:
        print(v); seen.add(v); log(f"emit usersite -> {v}")
for v in site.getsitepackages():
    if v and v.startswith("/usr/local") and v not in seen:
        print(v); seen.add(v); log(f"emit sitepkg -> {v}")
d = sysconfig.get_paths().get("data")
if d:
    share = os.path.join(d, "share")
    man1 = os.path.join(share, "man", "man1")
    for v in (share, man1):
        if v and v.startswith("/usr/local") and v not in seen:
            print(v); seen.add(v); log(f"emit wheel data -> {v}")
PY
)
    for d in "${PY_PATHS[@]}"; do
      [ -n "$d" ] || continue
      mkdir -p "$d" || true
      chown -R "${APP_USER}:${APP_GROUP}" "$d" || true
      chmod -R u+rwX,g+rwX "$d" || true
    done
    # Also ensure the main site-packages tree is writable if present (guards numpy uninstall/upgrade)
    if [ -d "/usr/local/lib/python3.12/site-packages" ]; then
      chown -R "${APP_USER}:${APP_GROUP}" /usr/local/lib/python3.12/site-packages || true
      chmod -R u+rwX,g+rwX /usr/local/lib/python3.12/site-packages || true
    fi
    touch "$PERMISSIONS_SET_FLAG"
    chown "${APP_USER}:${APP_GROUP}" "$PERMISSIONS_SET_FLAG"
    log "User permissions configured"
  else
    log "User permissions already configured, skipping..."
  fi
  exec runuser -u "${APP_USER}" -- "$0" "$@"
fi

# From here on, running as $APP_USER
# Favor user installs everywhere to avoid touching system packages
export PATH="$HOME/.local/bin:$PATH"
pyver="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
export PYTHONPATH="$HOME/.local/lib/python${pyver}/site-packages:${PYTHONPATH:-}"
export PIP_USER=1
export PIP_PREFER_BINARY=1

# Abort early if no compatible NVIDIA GPU (>= sm_75) is present
if ! gpu_is_compatible; then
  log "No compatible NVIDIA GPU detected (compute capability 7.5+ required). Shutting down container."
  # Exit 0 to avoid restart loops in some runtimes
  exit 0
fi

# --- SageAttention setup (runs only if compatible GPU is present) ---
setup_sage_attention

# --- ComfyUI-Manager sync ---
if [ -d "$CUSTOM_NODES_DIR/ComfyUI-Manager/.git" ]; then
  log "Updating ComfyUI-Manager"
  git -C "$CUSTOM_NODES_DIR/ComfyUI-Manager" fetch --depth 1 origin || true
  git -C "$CUSTOM_NODES_DIR/ComfyUI-Manager" reset --hard origin/HEAD || true
  git -C "$CUSTOM_NODES_DIR/ComfyUI-Manager" clean -fdx || true
elif [ ! -d "$CUSTOM_NODES_DIR/ComfyUI-Manager" ]; then
  log "Installing ComfyUI-Manager"
  git clone --depth 1 https://github.com/ltdrdata/ComfyUI-Manager.git "$CUSTOM_NODES_DIR/ComfyUI-Manager" || true
fi

# --- first-run install of custom_nodes ---
if [ ! -f "$FIRST_RUN_FLAG" ] || [ "${COMFY_FORCE_INSTALL:-0}" = "1" ]; then
  if [ "${COMFY_AUTO_INSTALL:-1}" = "1" ]; then
    log "First run or forced; installing custom node dependencies..."
    # 1) Install requirements files (Manager-like behavior)
    while IFS= read -r -d '' req; do
      log "python -m pip install --user --upgrade --upgrade-strategy only-if-needed -r $req"
      python -m pip install --no-cache-dir --user --upgrade --upgrade-strategy only-if-needed -r "$req" || true
    done < <(find "$CUSTOM_NODES_DIR" -maxdepth 3 -type f \( -iname 'requirements.txt' -o -iname 'requirements-*.txt' -o -path '*/requirements/*.txt' \) -print0)
    # 2) Install from pyproject (editable build avoided to mimic Manager's typical install)
    while IFS= read -r -d '' pjt; do
      d="$(dirname "$pjt")"
      log "python -m pip install --user . in $d"
      (cd "$d" && python -m pip install --no-cache-dir --user .) || true
    done < <(find "$CUSTOM_NODES_DIR" -maxdepth 2 -type f -iname 'pyproject.toml' -not -path '*/ComfyUI-Manager/*' -print0)
    # 3) Run node-provided install.py if present (Manager runs install scripts; mirror that)
    while IFS= read -r -d '' inst; do
      d="$(dirname "$inst")"
      log "Running node install script: $inst"
      (cd "$d" && python "$inst") || true
    done < <(find "$CUSTOM_NODES_DIR" -maxdepth 2 -type f -iname 'install.py' -not -path '*/ComfyUI-Manager/*' -print0)
    python -m pip check || true
  else
    log "COMFY_AUTO_INSTALL=0; skipping dependency install"
  fi
  touch "$FIRST_RUN_FLAG"
else
  log "Not first run; skipping custom_nodes dependency install"
fi

# --- launch ComfyUI ---
# Extra flags are collected in an array so empty stays truly empty under set -u.
COMFYUI_ARGS=()
if [ "${FORCE_SAGE_ATTENTION:-0}" = "1" ]; then
  if test_sage_attention; then
    COMFYUI_ARGS+=("--use-sage-attention")
    log "Starting ComfyUI with SageAttention (FORCE_SAGE_ATTENTION=1)"
  else
    log "WARNING: FORCE_SAGE_ATTENTION=1 but import failed; starting without"
  fi
else
  if [ "${SAGE_ATTENTION_AVAILABLE:-0}" = "1" ]; then
    log "SageAttention is built; set FORCE_SAGE_ATTENTION=1 to enable"
  else
    log "SageAttention not available; starting without it"
  fi
fi
cd "$BASE_DIR"
if [ $# -eq 0 ]; then
  exec python main.py --listen 0.0.0.0 ${COMFYUI_ARGS[@]+"${COMFYUI_ARGS[@]}"}
else
  # Allow "python main.py <args>" CMDs to still receive the sage flag.
  if [ "$1" = "python" ] && [ "${2:-}" = "main.py" ]; then
    shift 2
    exec python main.py ${COMFYUI_ARGS[@]+"${COMFYUI_ARGS[@]}"} "$@"
  else
    exec "$@"
  fi
fi