#!/bin/bash
# =============================================================================
# runpod_deploy.sh - Deploy Stack 2.9 Training on RunPod
# =============================================================================
#
# USAGE:
#   ./runpod_deploy.sh [--mode train|inference] [--config CONFIG_PATH] [--gpu GPU_TYPE]
#
# EXAMPLES:
#   # Start training on an A100 80GB
#   ./runpod_deploy.sh --mode train --gpu A100-80
#
#   # Start inference server on a smaller GPU
#   ./runpod_deploy.sh --mode inference --gpu A100-40
#
#   # Use custom config
#   ./runpod_deploy.sh --mode train --config ./my_config.yaml
#
# PREREQUISITES:
#   - RunPod CLI installed: https://docs.runpod.io/cli/install
#   - RunPod account with API key set: runpod config
#   - HF_TOKEN set for gated models (Qwen)
#
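# ENVIRONMENT OVERRIDES:
#   MODE, GPU_TYPE, CONFIG_PATH, HF_TOKEN, OUTPUT_DIR, CONTAINER_DISK_SIZE,
#   MIN_VRAM_GB, REPO_URL, REPO_BRANCH (see the Defaults section below)
#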
# =============================================================================

set -euo pipefail

# ------------------------------ Defaults -------------------------------------
MODE="${MODE:-train}"
GPU_TYPE="${GPU_TYPE:-A100-80}"
CONFIG_PATH="${CONFIG_PATH:-./stack_2_9_training/train_config.yaml}"
HF_TOKEN="${HF_TOKEN:-}"
OUTPUT_DIR="${OUTPUT_DIR:-./stack-2.9}"
CONTAINER_DISK_SIZE="${CONTAINER_DISK_SIZE:-200}"
MIN_VRAM_GB="${MIN_VRAM_GB:-80}"
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"
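
# Every default above can also be overridden from the environment, e.g.:
#   GPU_TYPE=4090 CONFIG_PATH=./my_config.yaml ./runpod_deploy.sh --mode train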

# ------------------------------ Helpers --------------------------------------
usage() {
    # Print the header comment block as help text: skip the shebang and stop
    # at the first non-comment line so section dividers below are not printed
    awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
    exit 1
}

log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
error() { log "ERROR: $*" >&2; exit 1; }

require_cmd() {
    command -v "$1" &>/dev/null || error "Required command not found: $1. Install it first."
}

# ------------------------------ Parse Args ----------------------------------
while [[ $# -gt 0 ]]; do
    case $1 in
        --mode) MODE="$2"; shift 2 ;;
        --config) CONFIG_PATH="$2"; shift 2 ;;
        --gpu) GPU_TYPE="$2"; shift 2 ;;
        --help|-h) usage ;;
        *) error "Unknown option: $1" ;;
    esac
done

# Validate mode
if [[ "$MODE" != "train" && "$MODE" != "inference" ]]; then
    error "Mode must be 'train' or 'inference', got: $MODE"
fi

# ------------------------------ Prerequisites --------------------------------
log "Checking prerequisites..."
require_cmd runpod

# Check HF_TOKEN
if [[ -z "$HF_TOKEN" ]]; then
    log "WARNING: HF_TOKEN not set. Some models may fail to download."
    log "Set it with: export HF_TOKEN=your_token_here"
fi

# --------------------------------- GPU Selection ----------------------------
# Map friendly names to RunPod GPU IDs
declare -A GPU_MAP
GPU_MAP["A100-80"]="NVIDIA-A100-80GB"
GPU_MAP["A100-40"]="NVIDIA-A100-40GB"
GPU_MAP["A6000"]="NVIDIA-RTX-A6000"
GPU_MAP["4090"]="NVIDIA-RTX-4090"
GPU_MAP["3090"]="NVIDIA-RTX-3090"

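# Fall back to the raw value so a full RunPod GPU ID can be passed
# directly, e.g. --gpu NVIDIA-A100-80GB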
GPU_ID="${GPU_MAP[$GPU_TYPE]:-$GPU_TYPE}"

log "Selected GPU: $GPU_TYPE (RunPod ID: $GPU_ID)"

# ------------------------------ Detect GPU Availability ----------------------
log "Checking GPU availability on RunPod..."

# Count GPU listings matching the requested type; grep -c already prints 0
# on no match, so "|| true" just absorbs its nonzero exit under pipefail
AVAILABLE_GPUS=$(runpod list gpus 2>/dev/null | grep -c "$GPU_ID" || true)
if [[ "$AVAILABLE_GPUS" == "0" ]]; then
    log "WARNING: GPU $GPU_ID may not be available. Proceeding anyway..."
fi

# ------------------------------ Build Docker Command ------------------------
log "Building docker run command..."

# Base environment variables
ENV_VARS=(
    "HF_TOKEN=${HF_TOKEN}"
    "PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb=512"
    "TRANSFORMERS_CACHE=/data/hf_cache"
    "HF_HOME=/data/hf_cache"
)
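# Additional KEY=VALUE entries can be appended to ENV_VARS above before launch.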

# Build env string
ENV_STRING=""
for var in "${ENV_VARS[@]}"; do
    if [[ "$var" == "${var%=*}" ]]; then continue; fi  # skip if no '='
    KEY="${var%%=*}"
    VAL="${var#*=}"
    ENV_STRING+=" -e ${KEY}=${VAL}"
done

# Mount data volume for models and outputs
VOLUME_MOUNTS="-v /data:/data"

# Training command
if [[ "$MODE" == "train" ]]; then
    CMD="python -m stack_2_9_training.train_lora \
        --config ${CONFIG_PATH}"
    CONTAINER_PORT=""
else
    # Inference mode - serve the FastAPI app with uvicorn
    CMD="python -m uvicorn stack.serve:app \
        --host 0.0.0.0 \
        --port 7860"
    CONTAINER_PORT="-p 7860:7860"
fi
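
# Echo the assembled in-container command so failed launches are easier to debug
log "Container command: $CMD"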

# ------------------------------ Launch on RunPod -----------------------------
log "Launching RunPod instance..."

# Check if user wants interactive or one-liner
if [[ -t 0 ]]; then
    log "Interactive mode - will print the docker command for manual run:"
    echo ""
    echo "runpod run --gpu ${GPU_ID} \\"
    echo "  --container-disk-size ${CONTAINER_DISK_SIZE} \\"
    echo "  ${ENV_STRING} \\"
    echo "  ${VOLUME_MOUNTS} \\"
    echo "  ${CONTAINER_PORT} \\"
    echo "  -- python /app/entrypoint.sh"
    echo ""
    echo "Recommended: Use runpod CLI with a template instead."
    echo "See: https://docs.runpod.io/cli/templates"
else
    # Non-interactive: use runpod run
    runpod run \
        --gpu "$GPU_ID" \
        --container-disk-size "$CONTAINER_DISK_SIZE" \
        -- \
        bash -c "
            set -e
            # Apply the environment assembled above (ENV_STRING is only used
            # for the printed interactive command, so export it here as well)
            export HF_TOKEN='$HF_TOKEN'
            export PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb=512'
            export TRANSFORMERS_CACHE=/data/hf_cache
            export HF_HOME=/data/hf_cache
            echo '=== Starting Stack 2.9 Deployment ==='
            echo 'Mode: $MODE'
            echo 'GPU: $GPU_ID'
            echo ''
            echo '=== Installing dependencies ==='
            pip install --no-cache-dir \
                torch \
                transformers \
                peft \
                accelerate \
                bitsandbytes \
                datasets \
                trl \
                pyyaml \
                tqdm \
                gradio \
                fastapi \
                uvicorn 2>&1 | tail -5
            echo ''
            echo '=== Cloning repository ==='
            [ -d /app/.git ] && echo 'Repo already present' || git clone --depth 1 -b $REPO_BRANCH $REPO_URL /app
            cd /app
            echo ''
            echo '=== Starting application ==='
            $CMD
        "
fi

# ------------------------------ Post-Launch --------------------------------
log "Done. To check your pod status:"
log "  runpod ps"
log ""
log "To stream logs:"
log "  runpod logs <pod-id>"
log ""
log "To SSH into the instance:"
log "  runpod ssh <pod-id>"

# ------------------------------ Cleanup Hint ---------------------------------
log ""
log "To stop and remove the instance:"
log "  runpod stop <pod-id> && runpod rm <pod-id>"