#!/usr/bin/env bash
# ====================================================================
# Krea 2 Turbo — One-Step Installer for 8GB Pascal GPUs (Linux)
# ====================================================================
# Tested on: GTX 1070 (8 GB VRAM, Pascal CC 6.1)
# Runs on:   Any Pascal GPU (GTX 1050 Ti through GTX 1080 Ti)
#            Any Turing/Ampere/Ada GPU with 8+ GB VRAM
#
# What this does:
#   1. Checks for NVIDIA driver + CUDA
#   2. Installs Miniconda (if missing)
#   3. Builds stable-diffusion.cpp from source
#   4. Downloads Krea 2 Turbo GGUF + Qwen3-4B text encoder + VAE
#   5. Runs a 1-frame test to verify everything works
#
# Usage:
#   curl -fsSL https://<pastebin-or-gist>/krea2_pascal_setup.sh | bash
#   # OR
#   chmod +x krea2_pascal_setup.sh && ./krea2_pascal_setup.sh
#
# The entire process takes 15-30 minutes depending on internet speed
# and CPU cores available for compilation.
# ====================================================================

set -euo pipefail

# ── Colors ──
# ANSI colour sequences, stored with a literal backslash; they are turned into
# real escape characters at print time (printf %b here, echo -e elsewhere).
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'

# ── Logging helpers ──
# Usage: info|ok|warn|err "message"
#   * Only the colour codes go through %b; the message itself is printed with
#     %s, so backslashes inside URLs or paths are shown literally.
#   * "${1-}" keeps a call without an argument from tripping `set -u`.
#   * warn/err write to stderr so diagnostics stay visible when stdout is
#     redirected or piped.
info()  { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"; }
ok()    { printf '%b[OK]%b   %s\n' "$GREEN" "$NC" "${1-}"; }
warn()  { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1-}" >&2; }
err()   { printf '%b[ERR]%b  %s\n' "$RED" "$NC" "${1-}" >&2; }

# ── Step 0: Detect system ──
# Title banner: one leading blank line, three boxed lines, one trailing blank.
printf '\n%b\n%b\n%b\n\n' \
  "${BLUE}╔══════════════════════════════════════════════╗${NC}" \
  "${BLUE}║  Krea 2 Turbo — Pascal 8GB One-Step Setup   ║${NC}" \
  "${BLUE}╚══════════════════════════════════════════════╝${NC}"

# Refuse to continue on macOS; every other kernel name is treated as Linux.
OS="linux"
case "$(uname)" in
  Darwin)
    err "macOS not supported — requires NVIDIA GPU with CUDA"
    exit 1
    ;;
esac

# ── Step 1: Check NVIDIA driver + CUDA ──
info "Step 1/8: Checking NVIDIA hardware..."

# No nvidia-smi on PATH: print per-distribution install hints and stop.
command -v nvidia-smi >/dev/null 2>&1 || {
  err "nvidia-smi not found. Install NVIDIA drivers first:"
  cat <<'HINTS'

  Ubuntu/Debian:
    sudo apt update && sudo apt install nvidia-driver-535 nvidia-utils-535
    sudo reboot

  Fedora/RHEL:
    sudo dnf install akmod-nvidia xorg-x11-drv-nvidia-cuda
    sudo reboot

  Arch:
    sudo pacman -S nvidia nvidia-utils
    sudo reboot

HINTS
  exit 1
}

# Query name, total memory (MiB) and driver version of the first GPU listed.
# Every assignment is guarded with `||`: under `set -euo pipefail` an
# unguarded failing pipeline inside $(...) would terminate the script without
# printing anything. The empty value is reported explicitly below instead.
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1) || GPU_NAME=""
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader 2>/dev/null | head -n 1 | awk '{print $1}') || GPU_MEM=""
DRIVER_VER=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n 1) || DRIVER_VER=""

if [[ -z "$GPU_NAME" ]]; then
  err "nvidia-smi is installed but returned no GPU information."
  err "Run 'nvidia-smi' by hand and make sure the NVIDIA driver is loaded."
  exit 1
fi

info "  GPU:  $GPU_NAME"
info "  VRAM: ${GPU_MEM} MiB"
info "  Driver: $DRIVER_VER"

# Only compare when the value is a plain integer; anything else used to
# surface as "[: integer expression expected" plus a misleading
# "Only  MiB VRAM" warning.
if [[ ! "$GPU_MEM" =~ ^[0-9]+$ ]]; then
  warn "Could not read the VRAM size from nvidia-smi (got '${GPU_MEM}') — skipping the VRAM check."
elif (( 10#$GPU_MEM < 6000 )); then
  err "Krea 2 Turbo needs ~6 GB VRAM minimum. Your GPU has ${GPU_MEM} MiB."
  err "Consider using a smaller model or upgrading your GPU."
  exit 1
elif (( 10#$GPU_MEM >= 8000 )); then
  ok "Plenty of VRAM for Krea 2 Turbo"
else
  warn "Only ${GPU_MEM} MiB VRAM — Krea 2 needs ~4855 MiB for weights + workspace."
  warn "Kill all X servers / GUI sessions and other GPU programs before running."
fi

# Check Compute Capability

#######################################
# Decide whether a compute capability is Pascal (6.x) or newer.
# Arguments: $1 - capability in "major.minor" form, e.g. "6.1" or "12.0"
# Returns:   0 if major >= 6, 1 if major < 6, 2 if $1 is not "major.minor"
#######################################
cc_is_pascal_or_newer() {
  local cap=${1-}
  [[ "$cap" =~ ^([0-9]+)\.([0-9]+)$ ]] || return 2
  (( 10#${BASH_REMATCH[1]} >= 6 ))
}

# `|| CC_RAW=""` keeps a failing query from silently aborting the script
# under `set -euo pipefail`.
CC_RAW=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n 1) || CC_RAW=""
CC_RAW=${CC_RAW//[[:space:]]/}
# CC keeps its original meaning: the capability with the dot removed ("61").
CC=${CC_RAW//./}

cc_status=0
cc_is_pascal_or_newer "$CC_RAW" || cc_status=$?
case "$cc_status" in
  0)
    # Print the value as reported; slicing single digits out of CC turned
    # "12.0" into "1.2".
    ok "Compute Capability ${CC_RAW} — Pascal or newer"
    ;;
  1)
    warn "Compute Capability ${CC_RAW} — older than Pascal. May still work but slower."
    ;;
  *)
    CC=""
    warn "Could not determine the Compute Capability from nvidia-smi — continuing without the check."
    ;;
esac

# Check CUDA version

#######################################
# Extract the number following "CUDA Version:" from nvidia-smi banner text.
# Inputs:  banner text on stdin
# Outputs: the first version found (e.g. "12.2") on stdout, or nothing
#######################################
parse_cuda_version() {
  sed -n 's/.*CUDA Version: *\([0-9][0-9.]*\).*/\1/p' | head -n 1
}

# sed -n exits 0 even without a match, and the `||` covers a failing
# nvidia-smi, so a missing field no longer aborts the script under
# `set -euo pipefail` (the previous grep returned 1 on no match).
CUDA_VER=$(nvidia-smi 2>/dev/null | parse_cuda_version) || CUDA_VER=""
info "  CUDA Version: ${CUDA_VER:-unknown}"

# ── Step 2: Install system dependencies ──
info "Step 2/8: Installing system build dependencies..."

# psmisc provides `fuser`, used below to clear lingering GPU processes. It is
# NOT installed on a minimal Ubuntu image, so install it explicitly.
#
# Installation stays best-effort (a failure only warns): the user may lack
# sudo rights yet already have every tool. apt-get is used instead of apt
# because apt warns that its command-line interface is not stable for scripts.
if command -v apt-get &>/dev/null; then
  {
    sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq &&
      sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
        build-essential cmake git curl wget pkg-config psmisc
  } || warn "apt-get could not install every package — continuing with what is already installed"
elif command -v dnf &>/dev/null; then
  # `make` is listed explicitly: unlike build-essential / base-devel, the
  # gcc packages do not pull it in.
  sudo dnf install -y gcc gcc-c++ make cmake git curl wget psmisc ||
    warn "dnf could not install every package — continuing with what is already installed"
elif command -v pacman &>/dev/null; then
  sudo pacman -S --noconfirm base-devel cmake git curl wget psmisc ||
    warn "pacman could not install every package — continuing with what is already installed"
else
  warn "No supported package manager (apt-get/dnf/pacman) found — skipping package installation"
fi

# Report success only when the tools this script invokes later are really
# on PATH, instead of claiming "ready" unconditionally.
MISSING_TOOLS=()
for dep_tool in cmake git curl wget; do
  command -v "$dep_tool" &>/dev/null || MISSING_TOOLS+=("$dep_tool")
done

if (( ${#MISSING_TOOLS[@]} == 0 )); then
  ok "Build dependencies ready"
else
  warn "Missing tools: ${MISSING_TOOLS[*]} — later steps may fail until they are installed"
fi

# ── Step 3: Install / verify Miniconda ──
info "Step 3/8: Setting up Miniconda..."

MINICONDA_DIR="$HOME/miniconda"
MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"

if [[ -d "$MINICONDA_DIR" ]]; then
  ok "Miniconda already present"
else
  info "  Downloading Miniconda..."
  # mktemp gives an unpredictable file name; a fixed /tmp/miniconda.sh could
  # be pre-created (or symlinked) by another local user.
  MINICONDA_INSTALLER=$(mktemp "${TMPDIR:-/tmp}/miniconda.XXXXXX")

  # Both commands run with their output discarded, so each failure is
  # reported explicitly; otherwise `set -e` would end the script silently.
  if ! wget -q "$MINICONDA_URL" -O "$MINICONDA_INSTALLER"; then
    rm -f -- "$MINICONDA_INSTALLER"
    err "Could not download the Miniconda installer from $MINICONDA_URL"
    exit 1
  fi
  if ! bash "$MINICONDA_INSTALLER" -b -p "$MINICONDA_DIR" >/dev/null 2>&1; then
    rm -f -- "$MINICONDA_INSTALLER"
    err "The Miniconda installer failed (target: $MINICONDA_DIR)"
    exit 1
  fi
  rm -f -- "$MINICONDA_INSTALLER"
  ok "Miniconda installed at ~/miniconda"
fi

export PATH="$HOME/miniconda/bin:$PATH"

# ── Step 4: Clone + Build stable-diffusion.cpp ──
info "Step 4/8: Building stable-diffusion.cpp (this takes 5-10 minutes)..."

SD_DIR="$HOME/stable-diffusion.cpp"
SD_REPO_URL="https://github.com/ggml-org/stable-diffusion.cpp.git"
SDCLI="$SD_DIR/build/bin/sd-cli"

# -DCMAKE_CUDA_ARCHITECTURES="61" is REQUIRED on Pascal. Without it cmake
# auto-detect skips CC 6.1 and the binary silently runs on the CPU (~20 min/frame).
# The value is kept in one place and can be overridden from the environment
# (e.g. CUDA_ARCH=86 ./krea2_pascal_setup.sh) for non-Pascal cards.
CUDA_ARCH="${CUDA_ARCH:-61}"

if [[ -f "$SDCLI" ]]; then
  info "  stable-diffusion.cpp already built, checking binary..."
  ok "Binary exists at ~/stable-diffusion.cpp/build/bin/sd-cli"
else
  if [[ -d "$SD_DIR/.git" ]]; then
    # A checkout exists (with or without build/) but has no binary. Re-use it
    # rather than re-cloning, which would fail on a non-empty directory.
    warn "Existing checkout found but sd-cli missing — rebuilding..."
    if ! git -C "$SD_DIR" submodule update --init --recursive; then
      err "Could not fetch the git submodules in $SD_DIR"
      exit 1
    fi
  else
    info "  Cloning..."
    # --recursive: a clone without its submodules cannot be configured.
    if ! git clone --depth 1 --recursive --shallow-submodules "$SD_REPO_URL" "$SD_DIR"; then
      err "git clone failed — if $SD_DIR already exists, remove it and re-run"
      exit 1
    fi
  fi

  # cmake output goes to a log file so the terminal stays quiet on success
  # but the relevant lines can be shown when a step fails.
  SD_BUILD_LOG=$(mktemp "${TMPDIR:-/tmp}/sdcpp-build.XXXXXX")

  info "  Configuring with CUDA support (sm_${CUDA_ARCH})..."
  if ! cmake -S "$SD_DIR" -B "$SD_DIR/build" \
      -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \
      -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCH" >"$SD_BUILD_LOG" 2>&1; then
    tail -n 20 "$SD_BUILD_LOG" >&2
    err "cmake configure failed — is the CUDA toolkit (nvcc) installed? Full log: $SD_BUILD_LOG"
    exit 1
  fi

  info "  Building (this will take a while)..."
  if ! cmake --build "$SD_DIR/build" --config Release -j"$(nproc)" >"$SD_BUILD_LOG" 2>&1; then
    tail -n 40 "$SD_BUILD_LOG" >&2
    err "Build failed — full log: $SD_BUILD_LOG"
    exit 1
  fi
  tail -n 5 "$SD_BUILD_LOG"
  rm -f -- "$SD_BUILD_LOG"
fi

if [[ ! -f "$SDCLI" ]]; then
  err "Build failed — sd-cli binary not found"
  exit 1
fi

ok "sd-cli built successfully"

# ── Step 5: Create model directories ──
info "Step 5/8: Creating model directories..."
# Brace expansion yields the three directories as arguments to one mkdir call.
mkdir -p "$HOME/models/"{krea2,vae,text-encoders}

# ── Step 6: Download Krea 2 Turbo GGUF + text encoder + VAE ──
# All three are pulled straight from HuggingFace with curl (resumable via -C -).

#######################################
# Download one model file unless a large-enough copy is already on disk.
# Arguments:
#   $1 - short label used in warnings
#   $2 - source URL
#   $3 - destination path
#   $4 - size threshold in bytes; a file at or below it is (re)fetched
#   $5 - message passed to ok() when the file is already present
# Returns:
#   Always 0. A failed or short download is reported with warn() so the
#   installer keeps going and the final step can list what is missing.
#######################################
fetch_model() {
  local label=$1 url=$2 dest=$3 min_bytes=$4 present_msg=$5
  local size=0

  if [[ -f "$dest" ]]; then
    size=$(stat -c%s "$dest" 2>/dev/null) || size=0
  fi
  if (( size > min_bytes )); then
    ok "$present_msg"
    return 0
  fi

  # --fail: on an HTTP error curl returns non-zero and writes nothing, so an
  # error page is never stored under the model's name (and never becomes the
  # prefix of a later `-C -` resume).
  if ! curl --fail -L -C - --progress-bar -o "$dest" "$url"; then
    warn "$label download failed — fetch $url manually to $dest"
    return 0
  fi

  size=0
  if [[ -f "$dest" ]]; then
    size=$(stat -c%s "$dest" 2>/dev/null) || size=0
  fi
  if (( size <= min_bytes )); then
    warn "$label download looks incomplete (${size} bytes) — re-run this script to resume"
  fi
  return 0
}

info "Step 6/8: Downloading Krea 2 Turbo Q2_K GGUF (~4.55 GB)..."
MODEL_FILE="$HOME/models/krea2/krea2_turbo-Q2_K.gguf"
MODEL_URL="https://huggingface.co/vantagewithai/Krea-2-Turbo-GGUF/resolve/main/krea2_turbo-Q2_K.gguf"
fetch_model "Krea 2 Turbo" "$MODEL_URL" "$MODEL_FILE" 4000000000 \
  "Krea 2 Turbo already downloaded (~4.55 GB)"

# Download Qwen3-4B text encoder
info "  Downloading Qwen3-4B text encoder Q4_K_M (~2.33 GB)..."
LLM_FILE="$HOME/models/text-encoders/Qwen3-4B-Instruct-2507-Q4_K_M.gguf"
LLM_URL="https://huggingface.co/lmstudio-community/Qwen3-4B-Instruct-2507-GGUF/resolve/main/Qwen3-4B-Instruct-2507-Q4_K_M.gguf"
fetch_model "Qwen3-4B" "$LLM_URL" "$LLM_FILE" 2000000000 \
  "Qwen3-4B already downloaded"

# Download VAE
info "  Downloading qwen_image_vae (~242 MB)..."
VAE_FILE="$HOME/models/vae/qwen_image_vae.safetensors"
VAE_URL="https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/vae/qwen_image_vae.safetensors"
fetch_model "VAE" "$VAE_URL" "$VAE_FILE" 200000000 \
  "qwen_image_vae already downloaded"

# ── Step 7: Write convenience script ──
info "Step 7/8: Writing generation helper script at ~/generate_krea2.sh..."

# Write (or overwrite) ~/generate_krea2.sh. The heredoc delimiter is quoted
# ('SCRIPT'), so nothing between here and the closing SCRIPT line is expanded
# by this installer: every $VAR and $(...) below is written out literally and
# is evaluated only when the generated script itself runs.
#
# The generated script, as written below:
#   * requires a prompt as $1 (aborts with a usage message otherwise),
#   * takes an optional seed as $2 (default: $RANDOM) and an optional output
#     file name as $3 (default: frame_<seed, zero-padded to 3 digits>.png),
#   * appends the fixed STYLE text to the prompt,
#   * writes into ~/krea2_output and exits with sd-cli's exit status.
cat > "$HOME/generate_krea2.sh" << 'SCRIPT'
#!/bin/bash
# ── Krea 2 Turbo — single/multi-frame storyboard generator ──
# Usage: ./generate_krea2.sh "prompt" [seed] [output_name]
#   ./generate_krea2.sh "A cat riding a bicycle"  # single frame, random seed
#   ./generate_krea2.sh "A dragon" 555 dragon.png

export PATH=~/miniconda/bin:$PATH
export LD_LIBRARY_PATH=~/miniconda/lib:~/miniconda/targets/x86_64-linux/lib:$LD_LIBRARY_PATH

MODEL=~/models/krea2/krea2_turbo-Q2_K.gguf
VAE=~/models/vae/qwen_image_vae.safetensors
LLM=~/models/text-encoders/Qwen3-4B-Instruct-2507-Q4_K_M.gguf
SDCLI=~/stable-diffusion.cpp/build/bin/sd-cli
OUTDIR=~/krea2_output
STYLE="Pixar style 3D render, Disney CG animation, cinematic lighting, subsurface scattering, detailed textured skin, volumetric rim light, cgi film still, highly detailed, sharp focus"

mkdir -p "$OUTDIR"
PROMPT="${1:?Usage: generate_krea2.sh \"prompt\" [seed] [filename]}"
SEED="${2:-$RANDOM}"
FNAME="${3:-frame_$(printf '%03d' $SEED).png}"

echo "╔═══════════════════════════════════════╗"
echo "║  Krea 2 Turbo — Generating Frame     ║"
echo "╚═══════════════════════════════════════╝"
echo ""
echo "  Prompt: $PROMPT"
echo "  Style:  Krea 2 style (appended)"
echo "  Seed:   $SEED"
echo "  Output: $OUTDIR/$FNAME"
echo ""

# ── WARNING (Pascal 8GB owners): ──
# Do NOT add --params-backend cpu or --offload-to-cpu!
# Those flags break the refinement phase on Pascal.
# The model weights MUST load into VRAM (~4613 MB)
# for the compute workspace to reuse the same memory region.

"$SDCLI" \
  --diffusion-model "$MODEL" \
  --vae "$VAE" \
  --llm "$LLM" \
  --backend "llm=cpu" \
  --vae-tiling --diffusion-fa \
  --cfg-scale 1.0 --steps 20 --sampling-method euler \
  -H 640 -W 640 \
  --seed "$SEED" \
  --output "$OUTDIR/$FNAME" \
  -p "${PROMPT}, $STYLE"

RC=$?
echo ""
if [ $RC -eq 0 ]; then
  echo -e "\033[32m✓ Frame saved to $OUTDIR/$FNAME\033[0m"
else
  echo -e "\033[31m✗ Generation failed (exit code $RC)\033[0m"
fi
exit $RC
SCRIPT
# Make the generated helper directly executable (./generate_krea2.sh ...).
chmod +x "$HOME/generate_krea2.sh"
ok "Helper script created at ~/generate_krea2.sh"

# ── Step 8: Verify / test ──
info "Step 8/8: Running 1-frame test..."
echo ""
echo "  This generates a 640×640 Pixar-style frame (~155 seconds)."
echo "  Press Ctrl+C to skip the test."
echo ""

cd "$HOME"
TEST_PNG="$HOME/krea2_output/test_verify.png"
TEST_SKIPPED=0

if [ -f "$MODEL_FILE" ] && [ -f "$LLM_FILE" ] && [ -f "$VAE_FILE" ]; then
  info "All model files present. Running test..."
  # First, clear any lingering GPU processes.
  # NOTE: this signals every process holding /dev/nvidia* open. Best-effort:
  # errors (including a missing `fuser`) are deliberately ignored.
  fuser /dev/nvidia* 2>/dev/null | sort -u | xargs -r kill 2>/dev/null || true
  sleep 2

  # Remove the output of an earlier run so a stale image cannot be mistaken
  # for a successful generation.
  rm -f -- "$TEST_PNG"

  # While the test runs, Ctrl+C only marks it as skipped. Without the trap
  # the interrupt would end the whole installer, contrary to the prompt above.
  trap 'TEST_SKIPPED=1' INT

  # Running the helper as an `if` condition keeps `set -e` from aborting the
  # installer on a failed generation, so the result is always reported and
  # the closing summary is always printed.
  if bash "$HOME/generate_krea2.sh" \
      "A cute robot painting a picture in a sunlit garden" \
      42 "test_verify.png" && [[ -s "$TEST_PNG" ]]; then
    ok "TEST PASSED! Your Krea 2 Turbo pipeline is ready."
    echo ""
    echo "  Output: $TEST_PNG"
    echo "  Size:   $(stat -c%s "$TEST_PNG" 2>/dev/null || echo '?') bytes"
    echo ""
  elif (( TEST_SKIPPED )); then
    warn "Test skipped (interrupted with Ctrl+C)."
  else
    warn "Test generation failed — check ~/krea2_output/ for errors"
  fi

  trap - INT
else
  warn "Model files not all present — skipping test."
  warn "  Missing: $([ ! -f "$MODEL_FILE" ] && echo 'Krea2 GGUF ') $([ ! -f "$LLM_FILE" ] && echo 'Qwen3-4B ') $([ ! -f "$VAE_FILE" ] && echo 'VAE ')"
  echo ""
  echo "Run the test manually after downloading models:"
  echo "  ~/generate_krea2.sh \"A cat riding a bicycle\""
fi

# Closing summary: coloured "SETUP COMPLETE" box, then plain-text usage notes.
echo ""
printf '%b\n' \
  "${GREEN}╔══════════════════════════════════════════════════════════════╗${NC}" \
  "${GREEN}║                    SETUP COMPLETE                          ║${NC}" \
  "${GREEN}╚══════════════════════════════════════════════════════════════╝${NC}"

# Quoted delimiter: the text is printed verbatim (no expansion, no globbing).
cat <<'SUMMARY'

  Quick usage:
    ~/generate_krea2.sh "your prompt" [seed] [filename]

  Batch script example:
    See ~/stable-diffusion.cpp/examples/ or the storyboard-pipeline skill

  Important (Pascal 8GB owners — READ THIS):
    • Model MUST load into VRAM (~4613 MB) — do NOT use --offload-to-cpu
    • Always clear GPU before runs: fuser /dev/nvidia* | xargs kill
    • Each frame takes ~155 seconds at 640×640
    • The '--steps 20' flag controls the REFINEMENT phase, not diffusion steps
    • sd-cli version: commit 3b6c9ca or newer

SUMMARY
