#!/bin/bash
#
# Launch a Stack 2.9 training or inference pod on RunPod.
#
# Usage:
#   [VAR=value ...] ./<this-script> [--mode train|inference] [--config PATH] [--gpu TYPE]
#
# Options:
#   --mode     'train' (default) or 'inference'
#   --config   path to the training config YAML
#   --gpu      GPU shorthand (A100-80, A100-40, A6000, 4090, 3090) or a raw RunPod GPU ID
#   --help,-h  show this help text
#
# Environment overrides (defaults shown in the assignments below):
#   MODE, GPU_TYPE, CONFIG_PATH, HF_TOKEN, OUTPUT_DIR,
#   CONTAINER_DISK_SIZE, MIN_VRAM_GB, REPO_URL, REPO_BRANCH
#
# NOTE: this header doubles as the --help text (usage() greps '# ' lines).

set -euo pipefail

# Defaults; every value can be overridden from the environment,
# and the first three also via CLI flags parsed below.
MODE="${MODE:-train}"
GPU_TYPE="${GPU_TYPE:-A100-80}"
CONFIG_PATH="${CONFIG_PATH:-./stack_2_9_training/train_config.yaml}"
HF_TOKEN="${HF_TOKEN:-}"                      # optional; gated HF models fail without it
OUTPUT_DIR="${OUTPUT_DIR:-./stack-2.9}"
CONTAINER_DISK_SIZE="${CONTAINER_DISK_SIZE:-200}"   # GB of container disk on the pod
MIN_VRAM_GB="${MIN_VRAM_GB:-80}"
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"
| usage() { |
| grep "^#" "$0" | sed 's/^# //;s/^#//' |
| exit 1 |
| } |
|
|
# Timestamped logger: prints "[YYYY-MM-DD HH:MM:SS] <message>" to stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Log an error message to stderr and abort the whole script with status 1.
error() {
  log "ERROR: $*" >&2
  exit 1
}

# Abort unless the named command is resolvable on PATH.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    error "Required command not found: $1. Install it first."
  fi
}
# ---- CLI argument parsing -------------------------------------------------
# Flags override the environment defaults set above.
# ${2:?message} aborts with a clear diagnostic when a flag is given without
# its value — previously 'shift 2' / an unset $2 under 'set -u' produced an
# opaque bash error instead.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --mode)   MODE="${2:?--mode requires a value}"; shift 2 ;;
    --config) CONFIG_PATH="${2:?--config requires a value}"; shift 2 ;;
    --gpu)    GPU_TYPE="${2:?--gpu requires a value}"; shift 2 ;;
    --help|-h) usage ;;
    *) error "Unknown option: $1" ;;
  esac
done
# ---- Preflight checks -----------------------------------------------------

# Only two modes are recognized; anything else is a fatal usage error.
case "$MODE" in
  train|inference) ;;
  *) error "Mode must be 'train' or 'inference', got: $MODE" ;;
esac

log "Checking prerequisites..."
require_cmd runpod

# A missing HF token is not fatal, but gated model downloads will fail.
if [[ -z "$HF_TOKEN" ]]; then
  log "WARNING: HF_TOKEN not set. Some models may fail to download."
  log "Set it with: export HF_TOKEN=your_token_here"
fi
# ---- GPU selection --------------------------------------------------------
# Human-friendly GPU shorthands → RunPod GPU identifiers.
declare -A GPU_MAP=(
  ["A100-80"]="NVIDIA-A100-80GB"
  ["A100-40"]="NVIDIA-A100-40GB"
  ["A6000"]="NVIDIA-RTX-A6000"
  ["4090"]="NVIDIA-RTX-4090"
  ["3090"]="NVIDIA-RTX-3090"
)

# Unknown shorthands fall through unchanged, so a raw RunPod ID works too.
GPU_ID="${GPU_MAP[$GPU_TYPE]:-$GPU_TYPE}"

log "Selected GPU: $GPU_TYPE (RunPod ID: $GPU_ID)"
# ---- GPU availability probe (best effort) ---------------------------------
log "Checking GPU availability on RunPod..."

# Count matching GPUs in the CLI listing.  NB: 'grep -c' already prints "0"
# (and exits non-zero) when nothing matches, so guard with '|| true' —
# the previous '|| echo "0"' produced a two-line "0\n0" value that could
# never compare equal to "0", silently disabling the warning below.
AVAILABLE_GPUS=$(runpod list gpus 2>/dev/null | grep -c -- "$GPU_ID" || true)
if [[ "${AVAILABLE_GPUS:-0}" == "0" ]]; then
  log "WARNING: GPU $GPU_ID may not be available. Proceeding anyway..."
fi
# ---- Docker argument assembly ---------------------------------------------
log "Building docker run command..."

# Environment variables to inject into the container (KEY=VALUE entries).
ENV_VARS=(
  "HF_TOKEN=${HF_TOKEN}"
  "PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb=512"
  "TRANSFORMERS_CACHE=/data/hf_cache"
  "HF_HOME=/data/hf_cache"
)

# Render the entries as a flat " -e KEY=VALUE ..." string.
# Entries without an '=' are silently skipped, as before.
ENV_STRING=""
for entry in "${ENV_VARS[@]}"; do
  case "$entry" in
    *=*) ENV_STRING+=" -e ${entry%%=*}=${entry#*=}" ;;
    *)   continue ;;
  esac
done

# Persistent volume shared across pods (HF cache, checkpoints).
VOLUME_MOUNTS="-v /data:/data"
# ---- Container command selection ------------------------------------------
# Training runs the LoRA trainer; inference serves the app on port 7860.
case "$MODE" in
  train)
    CMD="python -m stack_2_9_training.train_lora --config ${CONFIG_PATH}"
    CONTAINER_PORT=""   # training needs no port mapping
    ;;
  *)
    CMD="python -m uvicorn stack.serve:app --host 0.0.0.0 --port 7860"
    CONTAINER_PORT="-p 7860:7860"
    ;;
esac
# ---- Launch ---------------------------------------------------------------
log "Launching RunPod instance..."

if [[ -t 0 ]]; then
  # Interactive terminal: print the command for the operator to run by hand.
  log "Interactive mode - will print the docker command for manual run:"
  echo ""
  echo "runpod run --gpu ${GPU_ID} \\"
  echo "  --container-disk-size ${CONTAINER_DISK_SIZE} \\"
  echo "  ${ENV_STRING} \\"
  echo "  ${VOLUME_MOUNTS} \\"
  echo "  ${CONTAINER_PORT} \\"
  echo "  -- python /app/entrypoint.sh"
  echo ""
  echo "Recommended: Use runpod CLI with a template instead."
  echo "See: https://docs.runpod.io/cli/templates"
else
  # Non-interactive: launch directly.  FIX: forward the env/volume/port
  # arguments assembled above — they were displayed in the interactive
  # branch but dropped here, so the container previously started without
  # HF_TOKEN, the /data mount, or the inference port mapping.
  # NOTE(review): the bare 'docker' positional argument is kept unchanged;
  # it looks like a placeholder for an image name — confirm against the
  # runpod CLI version in use.
  # shellcheck disable=SC2086  # ENV_STRING/VOLUME_MOUNTS/CONTAINER_PORT are intentionally word-split
  runpod run \
    --gpu "$GPU_ID" \
    --container-disk-size "$CONTAINER_DISK_SIZE" \
    ${ENV_STRING} \
    ${VOLUME_MOUNTS} \
    ${CONTAINER_PORT} \
    docker \
    bash -c "
      set -e
      echo '=== Starting Stack 2.9 Deployment ==='
      echo 'Mode: $MODE'
      echo 'GPU: $GPU_ID'
      echo ''
      echo '=== Installing dependencies ==='
      pip install --no-cache-dir \
        torch \
        transformers \
        peft \
        accelerate \
        bitsandbytes \
        datasets \
        trl \
        pyyaml \
        tqdm \
        gradio \
        fastapi \
        uvicorn 2>&1 | tail -5
      echo ''
      echo '=== Cloning repository ==='
      git clone --depth 1 -b $REPO_BRANCH $REPO_URL /app 2>/dev/null || echo 'Repo already present'
      cd /app
      echo ''
      echo '=== Starting application ==='
      $CMD
    "
fi
# ---- Post-launch hints ----------------------------------------------------
# Emit follow-up commands for inspecting and cleaning up the launched pod.
POST_LAUNCH_HINTS=(
  "Done. To check your pod status:"
  " runpod ps"
  ""
  "To stream logs:"
  " runpod logs <pod-id>"
  ""
  "To SSH into the instance:"
  " runpod ssh <pod-id>"
  ""
  "To stop and remove the instance:"
  " runpod stop <pod-id> && runpod rm <pod-id>"
)
for hint in "${POST_LAUNCH_HINTS[@]}"; do
  log "$hint"
done