#!/bin/bash
# =============================================================================
# runpod_deploy.sh - Deploy Stack 2.9 Training on RunPod
# =============================================================================
#
# USAGE:
# ./runpod_deploy.sh [--mode train|inference] [--config CONFIG_PATH] [--gpu GPU_TYPE]
#
# EXAMPLES:
# # Start training on an A100 80GB
# ./runpod_deploy.sh --mode train --gpu A100-80
#
# # Start inference server on a smaller GPU
# ./runpod_deploy.sh --mode inference --gpu A100-40
#
# # Use custom config
# ./runpod_deploy.sh --mode train --config ./my_config.yaml
#
# PREREQUISITES:
# - RunPod CLI installed: https://docs.runpod.io/cli/install
# - RunPod account with API key set: runpod config
# - HF_TOKEN set for gated models (Qwen)
#
# =============================================================================
set -euo pipefail
# ------------------------------ Defaults -------------------------------------
# Every setting below may be pre-set via an environment variable of the same
# name; CLI flags (parsed later) take final precedence for MODE, CONFIG_PATH
# and GPU_TYPE.
MODE="${MODE:-train}"
GPU_TYPE="${GPU_TYPE:-A100-80}"
CONFIG_PATH="${CONFIG_PATH:-./stack_2_9_training/train_config.yaml}"
# Hugging Face token for gated models; empty triggers a warning only.
HF_TOKEN="${HF_TOKEN:-}"
# NOTE(review): OUTPUT_DIR is set here but never referenced again — confirm.
OUTPUT_DIR="${OUTPUT_DIR:-./stack-2.9}"
# Container disk size passed to 'runpod run' (presumably GB — confirm with CLI docs).
CONTAINER_DISK_SIZE="${CONTAINER_DISK_SIZE:-200}"
# NOTE(review): MIN_VRAM_GB is set here but never referenced again — confirm.
MIN_VRAM_GB="${MIN_VRAM_GB:-80}"
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"
# ------------------------------ Helpers --------------------------------------
# Print this script's leading '#' comment header as usage text, then exit.
# FIX: exits 0 — asking for help is a success path, not an error.
usage() {
  grep "^#" "$0" | sed 's/^# //;s/^#//'
  exit 0
}
# Timestamped log line on stdout.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
# Timestamped error on stderr, then abort the script.
error() { log "ERROR: $*" >&2; exit 1; }
# Abort unless the named command is on PATH.
require_cmd() {
  command -v "$1" &>/dev/null || error "Required command not found: $1. Install it first."
}
# ------------------------------ Parse Args ----------------------------------
# Walk the argument list; each recognized flag consumes its value argument.
while (( $# > 0 )); do
  case "$1" in
    --mode)
      MODE=$2
      shift 2
      ;;
    --config)
      CONFIG_PATH=$2
      shift 2
      ;;
    --gpu)
      GPU_TYPE=$2
      shift 2
      ;;
    --help|-h)
      usage
      ;;
    *)
      error "Unknown option: $1"
      ;;
  esac
done
# Reject anything other than the two supported modes.
case "$MODE" in
  train|inference) ;;
  *) error "Mode must be 'train' or 'inference', got: $MODE" ;;
esac
# ------------------------------ Prerequisites --------------------------------
log "Checking prerequisites..."
require_cmd runpod
# A missing HF token is a warning, not an error: public models still work.
[[ -n "$HF_TOKEN" ]] || {
  log "WARNING: HF_TOKEN not set. Some models may fail to download."
  log "Set it with: export HF_TOKEN=your_token_here"
}
# --------------------------------- GPU Selection ----------------------------
# Map friendly names to RunPod GPU IDs.
declare -A GPU_MAP
GPU_MAP["A100-80"]="NVIDIA-A100-80GB"
GPU_MAP["A100-40"]="NVIDIA-A100-40GB"
GPU_MAP["A6000"]="NVIDIA-RTX-A6000"
GPU_MAP["4090"]="NVIDIA-RTX-4090"
GPU_MAP["3090"]="NVIDIA-RTX-3090"
# Unknown names fall through unchanged so a raw RunPod GPU ID can be passed.
GPU_ID="${GPU_MAP[$GPU_TYPE]:-$GPU_TYPE}"
log "Selected GPU: $GPU_TYPE (RunPod ID: $GPU_ID)"
# ------------------------------ Detect GPU Availability ----------------------
log "Checking GPU availability on RunPod..."
# Count listings that mention the requested GPU.
# FIX: the old '|| echo "0"' produced "0\n0" when nothing matched, because
# 'grep -c' already prints 0 before exiting non-zero — so the '== "0"' check
# below could never fire. '|| true' keeps grep's own count; -F -- treats the
# ID as a literal string.
AVAILABLE_GPUS=$(runpod list gpus 2>/dev/null | grep -cF -- "$GPU_ID" || true)
if [[ "$AVAILABLE_GPUS" == "0" ]]; then
  log "WARNING: GPU $GPU_ID may not be available. Proceeding anyway..."
fi
# ------------------------------ Build Docker Command ------------------------
# Announce the assembly step; the command pieces are built just below.
log "Building docker run command..."
# Environment passed into the container, as KEY=VALUE entries.
ENV_VARS=(
  "HF_TOKEN=${HF_TOKEN}"
  "PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb=512"
  "TRANSFORMERS_CACHE=/data/hf_cache"
  "HF_HOME=/data/hf_cache"
)
# Turn each well-formed entry into a " -e KEY=VAL" flag; entries lacking an
# '=' are silently dropped.
ENV_STRING=""
for entry in "${ENV_VARS[@]}"; do
  case "$entry" in
    *=*) ENV_STRING+=" -e ${entry}" ;;
  esac
done
# Mount the shared data volume for model caches and outputs.
VOLUME_MOUNTS="-v /data:/data"
# Pick the container command and port exposure based on the requested mode.
case "$MODE" in
  train)
    CMD="python -m stack_2_9_training.train_lora --config ${CONFIG_PATH}"
    CONTAINER_PORT=""
    ;;
  *)
    # Inference: serve the app over HTTP on 7860.
    CMD="python -m uvicorn stack.serve:app --host 0.0.0.0 --port 7860"
    CONTAINER_PORT="-p 7860:7860"
    ;;
esac
# ------------------------------ Launch on RunPod -----------------------------
log "Launching RunPod instance..."
# Check if user wants interactive or one-liner
# [[ -t 0 ]]: stdin is a terminal, i.e. a human invoked this directly.
# In that case only PRINT the launch command for review instead of running it.
if [[ -t 0 ]]; then
log "Interactive mode - will print the docker command for manual run:"
echo ""
# NOTE(review): ENV_STRING embeds HF_TOKEN in plaintext, so the secret is
# echoed to the terminal (and any session capture) — consider masking it.
echo "runpod run --gpu ${GPU_ID} \\"
echo " --container-disk-size ${CONTAINER_DISK_SIZE} \\"
echo " ${ENV_STRING} \\"
echo " ${VOLUME_MOUNTS} \\"
echo " ${CONTAINER_PORT} \\"
echo " -- python /app/entrypoint.sh"
echo ""
echo "Recommended: Use runpod CLI with a template instead."
echo "See: https://docs.runpod.io/cli/templates"
else
# Non-interactive: use runpod run
# NOTE(review): unlike the printed command above, this invocation never
# passes ENV_STRING, VOLUME_MOUNTS or CONTAINER_PORT — the pod would get no
# HF_TOKEN, no /data mount and no published port. Confirm the runpod CLI
# flags and wire these in.
# NOTE(review): the bare argument 'docker' below looks like a placeholder
# image name — verify against the runpod CLI's expected syntax.
# The double-quoted payload below is expanded LOCALLY before launch: $MODE,
# $GPU_ID, $REPO_BRANCH, $REPO_URL and $CMD are substituted on this machine,
# not on the pod.
runpod run \
--gpu "$GPU_ID" \
--container-disk-size "$CONTAINER_DISK_SIZE" \
docker \
bash -c "
set -e
echo '=== Starting Stack 2.9 Deployment ==='
echo 'Mode: $MODE'
echo 'GPU: $GPU_ID'
echo ''
echo '=== Installing dependencies ==='
pip install --no-cache-dir \
torch \
transformers \
peft \
accelerate \
bitsandbytes \
datasets \
trl \
pyyaml \
tqdm \
gradio \
fastapi \
uvicorn 2>&1 | tail -5
echo ''
echo '=== Cloning repository ==='
git clone --depth 1 -b $REPO_BRANCH $REPO_URL /app 2>/dev/null || echo 'Repo already present'
cd /app
echo ''
echo '=== Starting application ==='
$CMD
"
fi
# ------------------------------ Post-Launch ---------------------------------
# Follow-up hints, each emitted through log so they carry timestamps
# (including the blank separator lines).
post_launch_hints=(
  "Done. To check your pod status:"
  " runpod ps"
  ""
  "To stream logs:"
  " runpod logs <pod-id>"
  ""
  "To SSH into the instance:"
  " runpod ssh <pod-id>"
  ""
  "To stop and remove the instance:"
  " runpod stop <pod-id> && runpod rm <pod-id>"
)
for hint in "${post_launch_hints[@]}"; do
  log "$hint"
done