#!/usr/bin/env bash
# =============================================================================
# runpod_deploy.sh - Deploy Stack 2.9 Training on RunPod
# =============================================================================
#
# USAGE:
#   ./runpod_deploy.sh [--mode train|inference] [--config CONFIG_PATH] [--gpu GPU_TYPE]
#
# EXAMPLES:
#   # Start training on an A100 80GB
#   ./runpod_deploy.sh --mode train --gpu A100-80
#
#   # Start inference server on a smaller GPU
#   ./runpod_deploy.sh --mode inference --gpu A100-40
#
#   # Use custom config
#   ./runpod_deploy.sh --mode train --config ./my_config.yaml
#
# ENVIRONMENT:
#   MODE, GPU_TYPE and CONFIG_PATH may also be supplied as environment
#   variables; the matching command-line flag wins when both are given.
#
# PREREQUISITES:
#   - RunPod CLI installed: https://docs.runpod.io/cli/install
#   - RunPod account with API key set: runpod config
#   - HF_TOKEN set for gated models (Qwen)
#   - Bash 4 or newer (associative arrays are used for the GPU name map)
#
# =============================================================================

set -euo pipefail

# ------------------------------ Defaults -------------------------------------
# Each value can be pre-set in the environment; flags parsed below override
# MODE, CONFIG_PATH and GPU_TYPE.
MODE="${MODE:-train}"
GPU_TYPE="${GPU_TYPE:-A100-80}"
CONFIG_PATH="${CONFIG_PATH:-./stack_2_9_training/train_config.yaml}"
HF_TOKEN="${HF_TOKEN:-}"
# NOTE(review): OUTPUT_DIR and MIN_VRAM_GB are not referenced by the setup code
# below - confirm whether later parts of the script still need them.
OUTPUT_DIR="${OUTPUT_DIR:-./stack-2.9}"
CONTAINER_DISK_SIZE="${CONTAINER_DISK_SIZE:-200}"
MIN_VRAM_GB="${MIN_VRAM_GB:-80}"
# NOTE(review): the default repository is named "ai-voice-clone" although this
# script deploys "Stack 2.9" - confirm this is the intended repository.
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"

# ------------------------------ Helpers --------------------------------------

# Print the header comment block of this script as help text, then exit.
# Only the leading run of comment lines is printed (the shebang is skipped and
# output stops at the first non-comment line), so section comments further
# down in the script do not leak into the help text.
# Arguments: $1 - exit status (optional, defaults to 1)
usage() {
  awk '
    NR == 1 && /^#!/ { next }
    /^#/             { sub(/^# ?/, ""); print; next }
                     { exit }
  ' "$0"
  exit "${1:-1}"
}

# Write a timestamped message to stdout.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }

# Write a timestamped error message to stderr and abort with status 1.
error() {
  log "ERROR: $*" >&2
  exit 1
}

# Abort unless the command named by $1 can be found on PATH.
require_cmd() {
  command -v "$1" &>/dev/null || error "Required command not found: $1. Install it first."
}

# Abort unless the option in $1 is followed by a non-empty value in $2.
# Called with the remaining positional parameters ("$@"). Without this check a
# trailing "--mode" would die with an opaque "unbound variable" under set -u.
require_value() {
  [[ $# -ge 2 && -n "${2:-}" ]] || error "Option $1 requires a value"
}

# ------------------------------ Parse Args ----------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --mode)
      require_value "$@"
      MODE="$2"
      shift 2
      ;;
    --config)
      require_value "$@"
      CONFIG_PATH="$2"
      shift 2
      ;;
    --gpu)
      require_value "$@"
      GPU_TYPE="$2"
      shift 2
      ;;
    --help | -h)
      # Help that was explicitly requested is not an error.
      usage 0
      ;;
    *)
      error "Unknown option: $1"
      ;;
  esac
done

# Validate mode
if [[ "$MODE" != "train" && "$MODE" != "inference" ]]; then
  error "Mode must be 'train' or 'inference', got: $MODE"
fi

# ------------------------------ Prerequisites --------------------------------
log "Checking prerequisites..."

require_cmd runpod

# Check HF_TOKEN (a missing token is only a warning, not a failure)
if [[ -z "$HF_TOKEN" ]]; then
  log "WARNING: HF_TOKEN not set. Some models may fail to download."
  log "Set it with: export HF_TOKEN=your_token_here"
fi

# --------------------------------- GPU Selection ----------------------------
# Map friendly names to RunPod GPU IDs. Names that are not in the map are
# passed through unchanged so a raw ID can be given to --gpu directly.
# NOTE(review): the ID strings are reproduced as found - confirm them against
# the GPU identifiers the RunPod CLI actually reports.
declare -A GPU_MAP=(
  ["A100-80"]="NVIDIA-A100-80GB"
  ["A100-40"]="NVIDIA-A100-40GB"
  ["A6000"]="NVIDIA-RTX-A6000"
  ["4090"]="NVIDIA-RTX-4090"
  ["3090"]="NVIDIA-RTX-3090"
)

GPU_ID="${GPU_MAP[$GPU_TYPE]:-$GPU_TYPE}"
log "Selected GPU: $GPU_TYPE (RunPod ID: $GPU_ID)"

# ------------------------------ Detect GPU Availability ----------------------
log "Checking GPU availability on RunPod..."

# Count the lines of the GPU listing that mention the requested ID.
# grep -c prints "0" *and* exits 1 when nothing matches, so the fallback must
# not echo a second "0" (that produced "0<newline>0", which never compared
# equal to "0" and silently disabled the warning below). "|| true" only
# neutralises the exit status; -F matches the ID literally, not as a regex.
AVAILABLE_GPUS=$(runpod list gpus 2>/dev/null | grep -c -F -- "$GPU_ID" || true)

if [[ "${AVAILABLE_GPUS:-0}" == "0" ]]; then
  log "WARNING: GPU $GPU_ID may not be available. Proceeding anyway..."
fi

# ------------------------------ Build Docker Command ------------------------
log "Building docker run command..."
# Base environment variables
ENV_VARS=(
  "HF_TOKEN=${HF_TOKEN}"
  "PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb=512"
  "TRANSFORMERS_CACHE=/data/hf_cache"
  "HF_HOME=/data/hf_cache"
)

# Render every KEY=VALUE pair twice:
#   ENV_STRING     - "-e KEY=VALUE" flags shown in the printed manual command
#   REMOTE_EXPORTS - "export KEY=VALUE" lines placed at the top of the script
#                    that runs inside the pod, so the settings actually reach
#                    the non-interactive launch (previously they were built
#                    but never used there).
# Values are shell-quoted with printf %q so spaces or metacharacters survive.
# NOTE: HF_TOKEN therefore appears in the printed command and in the argument
# list of the launch command; avoid sharing that output.
ENV_STRING=""
REMOTE_EXPORTS=""
for var in "${ENV_VARS[@]}"; do
  [[ "$var" == *=* ]] || continue # skip if no '='
  KEY="${var%%=*}"
  VAL="${var#*=}"
  printf -v QUOTED_VAL '%q' "$VAL"
  ENV_STRING+=" -e ${KEY}=${QUOTED_VAL}"
  REMOTE_EXPORTS+="export ${KEY}=${QUOTED_VAL}"$'\n'
done

# Mount data volume for models and outputs
# NOTE(review): VOLUME_MOUNTS and CONTAINER_PORT only appear in the printed
# manual command; the non-interactive launch below does not pass them. Confirm
# the CLI's flag syntax before wiring them in.
VOLUME_MOUNTS="-v /data:/data"

# Command executed inside the pod once the repository is checked out.
if [[ "$MODE" == "train" ]]; then
  # The config path is quoted because CMD is pasted into the remote script.
  printf -v CMD 'python -m stack_2_9_training.train_lora --config %q' "$CONFIG_PATH"
  CONTAINER_PORT=""
else
  # Inference mode - serve stack.serve:app with uvicorn on port 7860
  CMD="python -m uvicorn stack.serve:app --host 0.0.0.0 --port 7860"
  CONTAINER_PORT="-p 7860:7860"
fi

# Shell-quote every local value that is interpolated into the remote script.
printf -v Q_MODE '%q' "$MODE"
printf -v Q_GPU_ID '%q' "$GPU_ID"
printf -v Q_REPO_BRANCH '%q' "$REPO_BRANCH"
printf -v Q_REPO_URL '%q' "$REPO_URL"

PIP_PACKAGES=(
  torch transformers peft accelerate bitsandbytes datasets
  trl pyyaml tqdm gradio fastapi uvicorn
)

# Script executed by bash inside the pod. The heredoc is expanded locally, so
# the ${...} references below are replaced by local values; only the escaped
# \$( ) is evaluated in the pod.
#  - pipefail: without it a failed "pip install" was hidden by "| tail -5".
#  - clone: a non-empty /app is treated as an existing checkout; otherwise the
#    clone runs and its errors are shown instead of being discarded and
#    reported as "Repo already present".
REMOTE_SCRIPT=$(
  cat <<EOF
set -eo pipefail
${REMOTE_EXPORTS}
echo '=== Starting Stack 2.9 Deployment ==='
printf 'Mode: %s\n' ${Q_MODE}
printf 'GPU: %s\n' ${Q_GPU_ID}
echo ''
echo '=== Installing dependencies ==='
pip install --no-cache-dir ${PIP_PACKAGES[*]} 2>&1 | tail -5
echo ''
echo '=== Cloning repository ==='
if [ -d /app ] && [ -n "\$(ls -A /app 2>/dev/null)" ]; then
  echo 'Repo already present'
else
  git clone --depth 1 -b ${Q_REPO_BRANCH} ${Q_REPO_URL} /app
fi
cd /app
echo ''
echo '=== Starting application ==='
${CMD}
EOF
)

# ------------------------------ Launch on RunPod -----------------------------
log "Launching RunPod instance..."

# With a terminal on stdin only print a command for the user to run manually;
# otherwise launch directly.
if [[ -t 0 ]]; then
  log "Interactive mode - will print the docker command for manual run:"
  echo ""
  echo "runpod run --gpu ${GPU_ID} \\"
  echo "  --container-disk-size ${CONTAINER_DISK_SIZE} \\"
  echo "  ${ENV_STRING} \\"
  echo "  ${VOLUME_MOUNTS} \\"
  # Train mode publishes no port; skip the line instead of printing a blank
  # continuation.
  if [[ -n "$CONTAINER_PORT" ]]; then
    echo "  ${CONTAINER_PORT} \\"
  fi
  # NOTE(review): this entry command is the same for both modes and runs a
  # .sh file with python - confirm what the manual command should execute.
  echo "  -- python /app/entrypoint.sh"
  echo ""
  echo "Recommended: Use runpod CLI with a template instead."
  echo "See: https://docs.runpod.io/cli/templates"
else
  # Non-interactive: use runpod run
  runpod run \
    --gpu "$GPU_ID" \
    --container-disk-size "$CONTAINER_DISK_SIZE" \
    docker \
    bash -c "$REMOTE_SCRIPT"
fi

# ------------------------------ Post-Launch --------------------------------
# The per-pod commands take the pod's ID, shown here as the <pod-id> placeholder.
log "Done. To check your pod status:"
log "  runpod ps"
log ""
log "To stream logs:"
log "  runpod logs <pod-id>"
log ""
log "To SSH into the instance:"
log "  runpod ssh <pod-id>"

# ------------------------------ Cleanup Hint ---------------------------------
log ""
log "To stop and remove the instance:"
log "  runpod stop <pod-id> && runpod rm <pod-id>"