File size: 9,538 Bytes

d083607

#!/bin/bash
# =============================================================================
# vastai_deploy.sh - Deploy Stack 2.9 Training on Vast.ai
# =============================================================================
#
# USAGE:
#   ./vastai_deploy.sh [--mode train|inference] [--config CONFIG] [--gpu GPU_NAME]
#   ./vastai_deploy.sh [--list-gpus] [--ssh INSTANCE_ID]
#
# EXAMPLES:
#   # Find and launch a training instance with A100 80GB
#   ./vastai_deploy.sh --mode train --gpu A100-80
#
#   # Launch inference on RTX 4090
#   ./vastai_deploy.sh --mode inference --gpu RTX-4090
#
#   # SSH into running instance
#   ./vastai_deploy.sh --ssh 123456
#
#   # List available GPU instances
#   ./vastai_deploy.sh --list-gpus
#
# PREREQUISITES:
#   - vastai CLI installed: pip install vastai
#   - Vast.ai account with API key: vastai auth
#   - SSH key configured: vastai create-key
#   - HF_TOKEN set for gated models
#
# =============================================================================

set -euo pipefail

# ------------------------------ Defaults -------------------------------------
# Every setting except INSTANCE_ID can be overridden from the environment; the
# value after ":-" applies when the variable is unset or empty.
MODE="${MODE:-train}"
CONFIG_PATH="${CONFIG_PATH:-./stack_2_9_training/train_config.yaml}"
GPU_NAME="${GPU_NAME:-A100-80}"
MIN_VRAM_GB="${MIN_VRAM_GB:-40}"
MIN_DL_SPEED="${MIN_DL_SPEED:-800}"      # MB/s
MIN_CPU="${MIN_CPU:-8}"
SSH_KEY="${SSH_KEY:-}"                    # Leave empty to auto-detect
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"
# Use $HOME, not "~": a tilde inside double quotes is not expanded, and a
# literal "~/..." path makes every log() call fail in tee.
LOG_FILE="${LOG_FILE:-$HOME/vastai_stack29.log}"
INSTANCE_ID=""

# ------------------------------ Helpers --------------------------------------
# Print the header comment block of this script (usage, examples,
# prerequisites) and exit.
# Arguments: $1 - exit status to use (optional, defaults to 1)
# Outputs:   the first contiguous run of "#" lines in $0 on stdout, with
#            shebang lines skipped and the leading "# " removed
usage() {
    awk '
        /^#!/     { next }
        /^#/      { in_header = 1; sub(/^# ?/, ""); print; next }
        in_header { exit }
    ' "$0"
    exit "${1:-1}"
}

# Write a timestamped message to stdout and append the same line to $LOG_FILE.
# Arguments: the message words (joined into one line)
log() {
    tee -a "$LOG_FILE" <<< "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}
# Log an "ERROR: ..." line through log(), with its output sent to stderr, then
# terminate the script with status 1.
# Arguments: the message words
error() {
    local message="ERROR: $*"
    log "$message" 1>&2
    exit 1
}

# Abort through error() unless the given command can be resolved by the shell.
# Arguments: $1 - command name to look up
require_cmd() {
    local tool="$1"
    if ! command -v "$tool" >/dev/null 2>&1; then
        error "Required command not found: $tool"
    fi
}

# Lookup table from the friendly --gpu names to the search strings handed to
# the vastai CLI. Names missing from the table are used verbatim by callers.
declare -A GPU_SEARCH_MAP=(
    ["A100-80"]="A100 80GB"
    ["A100-40"]="A100 40GB"
    ["H100"]="H100"
    ["RTX-4090"]="RTX 4090"
    ["RTX-3090"]="RTX 3090"
)

# ------------------------------ Parse Args ----------------------------------
# Parse command-line options into the global settings.
# Globals written: MODE, CONFIG_PATH, GPU_NAME, INSTANCE_ID, LIST_GPUS
# Arguments:       the script's command-line arguments
# Exits through error() on an unknown option or a missing option value, and
# through usage() for --help/-h.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --mode|--config|--gpu|--ssh)
                # Check explicitly: under `set -u` a bare "$2" would abort
                # with a cryptic "unbound variable" message.
                [[ $# -ge 2 ]] || error "Option $1 requires a value"
                case "$1" in
                    --mode)   MODE="$2" ;;
                    --config) CONFIG_PATH="$2" ;;
                    --gpu)    GPU_NAME="$2" ;;
                    --ssh)    INSTANCE_ID="$2" ;;
                esac
                shift 2
                ;;
            --list-gpus) LIST_GPUS=true; shift ;;
            --help|-h) usage 0 ;;
            *) error "Unknown option: $1" ;;
        esac
    done
}

# Keep an environment-provided LIST_GPUS, otherwise start from "false".
LIST_GPUS="${LIST_GPUS:-false}"
parse_args "$@"

# --------------------------------- List GPUs ---------------------------------
# Print GPU offers for the selected GPU type, cheapest first when the primary
# query is accepted by the CLI.
# Globals read: GPU_NAME, GPU_SEARCH_MAP
# Exits through require_cmd()/error() when the vastai CLI is missing or when
# both queries fail.
list_gpus() {
    local gpu_query="${GPU_SEARCH_MAP[$GPU_NAME]:-$GPU_NAME}"

    # This path exits before the script's general prerequisite check, so the
    # CLI has to be verified here.
    require_cmd vastai
    log "Fetching available GPU offers..."

    # NOTE(review): confirm `search instances` and the --gpu/--order/--num
    # flags against the installed vastai CLI. Its stderr stays hidden because
    # the unfiltered `search offers` call below is the fallback.
    if ! vastai search instances "" --gpu "$gpu_query" \
            --order "dph_total" \
            --num 20 2>/dev/null; then
        # Leave the fallback's stderr visible so a real failure is explained.
        vastai search offers "" || error "Unable to list GPU offers with the vastai CLI"
    fi
}

if [[ "${LIST_GPUS:-false}" == "true" ]]; then
    list_gpus
    exit 0
fi

# --------------------------------- SSH into Instance ------------------------
# Open an SSH session to a Vast.ai instance through console.vast.ai.
# Arguments: $1 - instance ID (digits only)
# Exits through error() when the ID is not a plain number.
connect_instance() {
    local instance_id="$1"

    # The ID becomes part of the SSH destination, so accept digits only.
    [[ "$instance_id" =~ ^[0-9]+$ ]] \
        || error "Instance ID must be numeric, got: $instance_id"

    log "Connecting to instance $instance_id..."
    # NOTE(review): StrictHostKeyChecking=no turns off host key verification;
    # consider "accept-new" if the installed OpenSSH supports it.
    ssh -o StrictHostKeyChecking=no "instance${instance_id}@console.vast.ai"
}

if [[ -n "$INSTANCE_ID" ]]; then
    connect_instance "$INSTANCE_ID"
    exit 0
fi

# Only the two supported modes may continue past this point.
case "$MODE" in
    train | inference) ;;
    *) error "Mode must be 'train' or 'inference', got: $MODE" ;;
esac

# ------------------------------ Prerequisites --------------------------------
# Everything after this point shells out to the vastai CLI.
log "Checking prerequisites..."
require_cmd vastai

# ------------------------------ Find Suitable Instance -----------------------
# Confirm that offers exist for the requested GPU, then log the commands for
# launching an instance by hand. No instance is created here.
# Globals read:    GPU_NAME, GPU_SEARCH_MAP, MIN_VRAM_GB, MIN_DL_SPEED,
#                  MIN_CPU, SSH_KEY
# Globals written: SEARCH_TERM, OFFERS, BEST_OFFER, CREATE_CMD
# Exits through error() when the offer search returns nothing.
plan_instance_launch() {
    local ssh_key_hint

    SEARCH_TERM="${GPU_SEARCH_MAP[$GPU_NAME]:-$GPU_NAME}"
    log "Searching for GPU: $SEARCH_TERM (min VRAM: ${MIN_VRAM_GB}GB)..."

    # Query available offers
    # Using: vastai search offers <query>
    # stderr stays visible so a CLI failure is not mistaken for "no offers".
    OFFERS=$(vastai search offers "$SEARCH_TERM") || OFFERS=""

    if [[ -z "$OFFERS" ]]; then
        error "No offers found for GPU: $GPU_NAME. Try --list-gpus to see available options."
    fi

    # Automatic offer selection is not implemented; the variable is kept
    # (empty) as a placeholder instead of running a parse that selects nothing.
    BEST_OFFER=""

    log "Finding best available instance..."

    # Create instance with inline args
    # See: https://docs.vast.ai/cli/#creating-an-instance
    # The command is only logged below, never executed.
    CREATE_CMD="vastai create instance \
    --gpu \"$SEARCH_TERM\" \
    --min-dl-speed $MIN_DL_SPEED \
    --min-cpu-cores $MIN_CPU \
    --onstart-url https://raw.githubusercontent.com/walidsobhie-code/ai-voice-clone/main/vastai_onstart.sh \
    --image nvidia/cuda:12.1.0-runtime-ubuntu22.04 \
    --force-yes"

    # Prefer the SSH_KEY override; otherwise take the second field of the
    # first key line printed by ssh-add -L.
    ssh_key_hint="${SSH_KEY:-}"
    if [[ -z "$ssh_key_hint" ]]; then
        # awk accepts key lines only, so agent status messages are ignored;
        # "|| true" keeps a missing or empty agent from aborting the script
        # under set -e / pipefail.
        ssh_key_hint=$(ssh-add -L 2>/dev/null \
            | awk 'NF >= 2 && $1 ~ /^(ssh-|ecdsa-|sk-)/ { print $2; exit }') || true
    fi
    ssh_key_hint="${ssh_key_hint:-YOUR_SSH_KEY_ID}"

    log "Would run: $CREATE_CMD"
    log ""
    log "NOTE: Vast.ai interactive mode recommended. Run the following manually:"
    log ""
    log "  # Search for available instances:"
    log "  vastai search offers \"$SEARCH_TERM\""
    log ""
    log "  # Launch an instance:"
    log "  vastai create instance \\"
    # Quoted so that multi-word names such as "A100 80GB" stay one argument
    # when the printed command is pasted into a shell.
    log "    --gpu \"$SEARCH_TERM\" \\"
    log "    --image nvidia/cuda:12.1.0-runtime-ubuntu22.04 \\"
    log "    --min-dl-speed $MIN_DL_SPEED \\"
    log "    --ssh-key $ssh_key_hint"
    log ""
    log "  # Then SSH in and run training manually (see below)"
    log ""
    log "  # Or use this script in interactive mode with TMUX:"
    log "  tmux new-session -d -s stack29 'bash'"
    log ""
}

plan_instance_launch

# ------------------------------ Training/Inference Script ---------------------
# Write the on-instance bootstrap script to $DEPLOY_SCRIPT and make it
# executable. The heredoc delimiter is quoted, so nothing inside it is
# expanded here; the generated script is meant to be run as
#   bash <script> [MODE] [CONFIG_PATH]
log "Creating deployment script for instance..."

# NOTE(review): this is a fixed, predictable path under /tmp. It is kept
# because the instructions printed later refer to it; consider mktemp if this
# ever runs on a shared machine.
DEPLOY_SCRIPT="/tmp/stack29_deploy.sh"
cat > "$DEPLOY_SCRIPT" << 'DEPLOY_EOF'
#!/bin/bash
set -euo pipefail

MODE="${1:-train}"
CONFIG_PATH="${2:-./stack_2_9_training/train_config.yaml}"
LOGFILE="/root/stack29_$(date +%Y%m%d_%H%M%S).log"
HF_TOKEN="${HF_TOKEN:-}"
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"
# Location of the training config inside the cloned repository.
REPO_CONFIG="./stack_2_9_training/train_config.yaml"
# Remembered so a relative CONFIG_PATH can still be found after the cd below.
START_DIR="$PWD"

log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOGFILE"; }

# Reject an unknown mode before the long install phase; without this check
# any value other than "train" would start the inference server.
if [[ "$MODE" != "train" && "$MODE" != "inference" ]]; then
    echo "ERROR: Mode must be 'train' or 'inference', got: $MODE" >&2
    exit 1
fi

log "=== Stack 2.9 Deployment Started ==="
log "Mode: $MODE"
log "Config: $CONFIG_PATH"
log "Log: $LOGFILE"
log "Hostname: $(hostname)"
log "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv 2>/dev/null || echo 'nvidia-smi not found')"
log ""

# ---- Env setup ----
export HF_TOKEN="${HF_TOKEN}"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb=512"
export TRANSFORMERS_CACHE="/data/hf_cache"
export HF_HOME="/data/hf_cache"
export CUDA_VISIBLE_DEVICES="0"

mkdir -p /data/hf_cache /data/outputs /data/adapters

# ---- Install deps ----
log "Installing system packages..."
apt-get update -qq && apt-get install -y -qq \
    git curl wget build-essential libsndfile1 ffmpeg \
    2>&1 | tail -3

log "Installing Python packages..."
pip install --upgrade pip -q
pip install -q \
    torch \
    transformers \
    peft \
    accelerate \
    bitsandbytes \
    datasets \
    trl \
    scipy \
    soundfile \
    librosa \
    pyyaml \
    tqdm \
    gradio \
    fastapi \
    uvicorn \
    2>&1 | tail -5

# ---- Clone repo ----
log "Cloning repository..."
cd /data
if [[ ! -d "ai-voice-clone" ]]; then
    git clone --depth 1 -b "$REPO_BRANCH" "$REPO_URL" ai-voice-clone
fi
cd ai-voice-clone

# Copy config if custom
if [[ "$CONFIG_PATH" != "$REPO_CONFIG" ]]; then
    CONFIG_SRC="$CONFIG_PATH"
    # A relative path is tried against the repository first, then against the
    # directory the script was started from.
    if [[ "$CONFIG_SRC" != /* && ! -f "$CONFIG_SRC" && -f "$START_DIR/$CONFIG_SRC" ]]; then
        CONFIG_SRC="$START_DIR/$CONFIG_SRC"
    fi
    if [[ ! -f "$CONFIG_SRC" ]]; then
        log "ERROR: Config file not found: $CONFIG_PATH"
        exit 1
    fi
    # Skip the copy when the path already names the repository config: cp
    # refuses to copy a file onto itself, which would abort under set -e.
    if [[ ! "$CONFIG_SRC" -ef "$REPO_CONFIG" ]]; then
        cp -- "$CONFIG_SRC" "$REPO_CONFIG"
    fi
fi

log "Repository ready. Starting application..."

# ---- Start Training or Inference ----
if [[ "$MODE" == "train" ]]; then
    log "Starting LoRA training..."
    log "Command: python -m stack_2_9_training.train_lora --config ./stack_2_9_training/train_config.yaml"
    python -m stack_2_9_training.train_lora \
        --config ./stack_2_9_training/train_config.yaml \
        2>&1 | tee -a "$LOGFILE"
else
    log "Starting inference server..."
    log "Command: python -m uvicorn stack.serve:app --host 0.0.0.0 --port 7860"
    python -m uvicorn \
        stack.serve:app \
        --host 0.0.0.0 \
        --port 7860 \
        2>&1 | tee -a "$LOGFILE"
fi
DEPLOY_EOF

chmod +x "$DEPLOY_SCRIPT"
log "Deploy script written to: $DEPLOY_SCRIPT"
# Nothing in this script uploads the file, so say so instead of promising it.
log "Copy it to the instance yourself (e.g. with scp); it is not transferred automatically."

# ------------------------------ Full Create Instructions ---------------------
# Log the step-by-step manual deployment guide and the shutdown instructions.
# Globals read: GPU_NAME, GPU_SEARCH_MAP, MODE, CONFIG_PATH, DEPLOY_SCRIPT
print_deploy_instructions() {
    local gpu_query="${GPU_SEARCH_MAP[$GPU_NAME]:-$GPU_NAME}"
    local deploy_script="${DEPLOY_SCRIPT:-/tmp/stack29_deploy.sh}"
    # The generated deploy script logs on the instance to
    # /root/stack29_<timestamp>.log. The timestamp is not known here, so a
    # glob is printed. (No LOGFILE variable exists in this script; expanding
    # one aborts under `set -u`.)
    local remote_log_glob='/root/stack29_*.log'

    log ""
    log "=== Full Vast.ai Deployment Instructions ==="
    log ""
    log "1. Find a suitable instance:"
    log "   vastai search offers \"$gpu_query\""
    log ""
    log "2. Create the instance (note the offer ID from step 1):"
    log "   vastai create instance --offer-id <id> \\"
    log "     --image nvidia/cuda:12.1.0-devel-ubuntu22.04 \\"
    log "     --ssh-key <your-ssh-key> \\"
    log "     --onstart-url https://raw.githubusercontent.com/walidsobhie-code/ai-voice-clone/main/vastai_onstart.sh \\"
    log "     --onstart-cmd '$MODE /data/ai-voice-clone/stack_2_9_training/train_config.yaml'"
    log ""
    log "3. SSH into the instance after it starts:"
    log "   vastai ssh <instance-id>"
    log ""
    log "4. Or use screen/tmux for persistent sessions:"
    log "   screen -S stack29"
    log "   bash $deploy_script $MODE $CONFIG_PATH"
    log "   # Ctrl+A D to detach"
    log ""
    log "5. Monitor training:"
    log "   tail -f $remote_log_glob"
    log "   nvidia-smi -l 1"
    log ""
    log "=== Clean Shutdown ==="
    log "To stop training gracefully:"
    log "  # Find the process"
    log "  ps aux | grep train_lora"
    log "  # Send SIGTERM for graceful shutdown"
    log "  kill -SIGTERM <pid>"
    log ""
    log "To stop and destroy the instance:"
    log "  vastai destroy instance <instance-id>"
}

print_deploy_instructions