| #!/bin/bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# ---------------------------------------------------------------------------
# Settings. Each of these may be pre-set in the environment; the value after
# ":-" is the fallback used when the variable is unset or empty.
# ---------------------------------------------------------------------------
MODE="${MODE:-train}"                  # "train" or "inference"
CONFIG_PATH="${CONFIG_PATH:-./stack_2_9_training/train_config.yaml}"
GPU_NAME="${GPU_NAME:-A100-80}"        # key into GPU_SEARCH_MAP, or a raw search term
MIN_VRAM_GB="${MIN_VRAM_GB:-40}"
MIN_DL_SPEED="${MIN_DL_SPEED:-800}"
MIN_CPU="${MIN_CPU:-8}"
SSH_KEY="${SSH_KEY:-}"
REPO_URL="${REPO_URL:-https://github.com/walidsobhie-code/ai-voice-clone.git}"
REPO_BRANCH="${REPO_BRANCH:-main}"
# Use $HOME rather than "~": tilde is not expanded inside double quotes, so a
# quoted "~/..." default would make tee look for a directory literally named
# "~" and fail on the very first log call.
LOG_FILE="${LOG_FILE:-${HOME}/vastai_stack29.log}"
INSTANCE_ID=""                         # filled in by --ssh
|
|
| |
# Print the script's "#" comment lines as help text, then exit with status 1.
#
# Every line of "$0" that begins with "#" is printed with the leading "# "
# (or bare "#") removed. The "#!" interpreter line is dropped first so it
# does not show up in the help output as "!/bin/bash".
usage() {
  grep '^#' "$0" | sed -e '/^#!/d' -e 's/^# //' -e 's/^#//'
  exit 1
}
|
|
# log MESSAGE...
#   Emit the arguments as one line prefixed with "[YYYY-MM-DD HH:MM:SS]" on
#   stdout and append the same line to "$LOG_FILE".
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$LOG_FILE"
}

# error MESSAGE...
#   Log "ERROR: MESSAGE" with the console copy sent to stderr, then terminate
#   the script with exit status 1.
error() {
  log "ERROR: $*" >&2
  exit 1
}
|
|
# require_cmd NAME
#   Succeed quietly when NAME resolves to a command; otherwise hand a
#   "Required command not found" message to error().
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    error "Required command not found: $1"
  fi
}
|
|
| |
# Short GPU names accepted by --gpu / GPU_NAME, mapped to the search term
# used for offer lookups. Names not listed here are used verbatim.
declare -A GPU_SEARCH_MAP=(
  ["A100-80"]="A100 80GB"
  ["A100-40"]="A100 40GB"
  ["H100"]="H100"
  ["RTX-4090"]="RTX 4090"
  ["RTX-3090"]="RTX 3090"
)
|
|
| |
# parse_args ARG...
#   Apply command-line flags to the global settings:
#     --mode VALUE     -> MODE
#     --config VALUE   -> CONFIG_PATH
#     --gpu VALUE      -> GPU_NAME
#     --ssh VALUE      -> INSTANCE_ID
#     --list-gpus      -> LIST_GPUS=true
#     --help | -h      -> usage (which exits)
#   Any other argument, or a value-taking flag with no value after it, is
#   reported through error(), which exits with status 1.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --mode | --config | --gpu | --ssh)
        # Check before touching $2: under "set -u" a missing value would
        # otherwise abort with an unhelpful "unbound variable" message.
        [[ $# -ge 2 ]] || error "Option $1 requires a value"
        case "$1" in
          --mode) MODE="$2" ;;
          --config) CONFIG_PATH="$2" ;;
          --gpu) GPU_NAME="$2" ;;
          --ssh) INSTANCE_ID="$2" ;;
        esac
        shift 2
        ;;
      --list-gpus)
        LIST_GPUS=true
        shift
        ;;
      --help | -h)
        usage
        ;;
      *)
        error "Unknown option: $1"
        ;;
    esac
  done
}

parse_args "$@"
|
|
| |
# --list-gpus: print offers for the selected GPU and stop.
if [[ "${LIST_GPUS:-false}" == "true" ]]; then
  # This branch runs before the general prerequisite check, and the queries
  # below discard stderr, so verify the CLI here to avoid a silent exit.
  require_cmd vastai
  log "Fetching available GPU offers..."
  # Try the GPU-filtered, price-ordered query first; if it fails, fall back to
  # an unfiltered offer search; if that fails too, say so instead of letting
  # "set -e" end the script without any message.
  vastai search instances "" --gpu "${GPU_SEARCH_MAP[$GPU_NAME]:-$GPU_NAME}" \
    --order "dph_total" \
    --num 20 2>/dev/null \
    || vastai search offers "" 2>/dev/null \
    || error "Could not fetch GPU offers from vastai"
  exit 0
fi
|
|
| |
# --ssh <id>: open an SSH session to the given instance and stop.
if [[ -n "$INSTANCE_ID" ]]; then
  log "Connecting to instance $INSTANCE_ID..."
  # accept-new records a host key on first contact without prompting, but
  # still refuses to connect if a previously recorded key has changed
  # ("no" would skip that check entirely).
  ssh -o StrictHostKeyChecking=accept-new "instance${INSTANCE_ID}@console.vast.ai"
  exit 0
fi
|
|
| |
# Only the two known modes are accepted; anything else is fatal.
case "$MODE" in
  train | inference) ;;
  *) error "Mode must be 'train' or 'inference', got: $MODE" ;;
esac

# Everything past this point relies on the vastai CLI.
log "Checking prerequisites..."
require_cmd vastai
|
|
| |
# Translate the short GPU name into its search term; unknown names are used
# as given.
SEARCH_TERM="${GPU_SEARCH_MAP[$GPU_NAME]:-$GPU_NAME}"
log "Searching for GPU: $SEARCH_TERM (min VRAM: ${MIN_VRAM_GB}GB)..."

# Capture the offer listing. A failing query is tolerated here (stderr is
# discarded) and is caught by the emptiness check below.
OFFERS=$(vastai search offers "$SEARCH_TERM" 2>/dev/null) || true

[[ -n "$OFFERS" ]] \
  || error "No offers found for GPU: $GPU_NAME. Try --list-gpus to see available options."
|
|
| |
| |
# select_first_offer_id
#   Read an offer listing on stdin and print the first field of the first
#   line whose first whitespace-separated field is all digits. Prints nothing
#   when no such line exists.
#
#   NOTE(review): this assumes data rows of `vastai search offers` begin with
#   the numeric offer ID — confirm against real output. MIN_VRAM_GB is not
#   applied here; robust filtering should use the CLI's machine-readable
#   output with jq rather than table parsing.
select_first_offer_id() {
  # No early "exit": awk consumes all of its input, so a writer feeding it
  # through a pipe is never cut off while "pipefail" is active.
  awk '$1 ~ /^[0-9]+$/ && !seen { print $1; seen = 1 }'
}

# Heuristic pick of the first listed offer; empty when nothing matched.
BEST_OFFER=$(select_first_offer_id <<<"$OFFERS")
|
|
| |
# ssh_key_hint
#   Print the second space-separated field of the first key that
#   `ssh-add -L` lists, or the placeholder YOUR_SSH_KEY_ID when ssh-add
#   fails, prints nothing, or prints a first line with no second field.
#   The output of a failed ssh-add is ignored, because with an empty agent
#   ssh-add reports that on stdout and exits non-zero.
ssh_key_hint() {
  local agent_keys key_blob
  if agent_keys=$(ssh-add -L 2>/dev/null) && [[ -n "$agent_keys" ]]; then
    # read consumes only the first line of the listing.
    read -r _ key_blob _ <<<"$agent_keys"
  fi
  printf '%s\n' "${key_blob:-YOUR_SSH_KEY_ID}"
}

log "Finding best available instance..."

# Display-only command line: it is logged below and never executed. The
# backslash-newlines inside the quotes are line continuations, so the value
# is a single line.
CREATE_CMD="vastai create instance \
--gpu \"$SEARCH_TERM\" \
--min-dl-speed $MIN_DL_SPEED \
--min-cpu-cores $MIN_CPU \
--onstart-url https://raw.githubusercontent.com/walidsobhie-code/ai-voice-clone/main/vastai_onstart.sh \
--image nvidia/cuda:12.1.0-runtime-ubuntu22.04 \
--force-yes"

log "Would run: $CREATE_CMD"
log ""
log "NOTE: Vast.ai interactive mode recommended. Run the following manually:"
log ""
log " # Search for available instances:"
log " vastai search offers \"$SEARCH_TERM\""
log ""
log " # Launch an instance:"
log " vastai create instance \\"
# Quoted in the printed command: search terms such as "A100 80GB" contain a
# space and would split into two arguments when copy-pasted unquoted.
log " --gpu \"$SEARCH_TERM\" \\"
log " --image nvidia/cuda:12.1.0-runtime-ubuntu22.04 \\"
log " --min-dl-speed $MIN_DL_SPEED \\"
log " --ssh-key $(ssh_key_hint)"
log ""
log " # Then SSH in and run training manually (see below)"
log ""
log " # Or use this script in interactive mode with TMUX:"
log " tmux new-session -d -s stack29 'bash'"
log ""
|
|
| |
log "Creating deployment script for instance..."

# Write the on-instance deployment script. The heredoc delimiter is quoted,
# so nothing inside it is expanded here; every "$..." is evaluated only when
# the generated script itself runs.
# NOTE(review): this is a fixed, predictable path under /tmp. It is kept
# because the instructions printed by this script name the same path.
DEPLOY_SCRIPT="/tmp/stack29_deploy.sh"
cat > "$DEPLOY_SCRIPT" << 'DEPLOY_EOF'
#!/bin/bash
# Stack 2.9 deployment script (generated file).
# Usage: stack29_deploy.sh [MODE] [CONFIG_PATH]
#   MODE         "train" runs LoRA training; any other value starts the
#                inference server (default: train)
#   CONFIG_PATH  training config to use (default: the in-repo config)
set -euo pipefail

MODE="${1:-train}"
CONFIG_PATH="${2:-./stack_2_9_training/train_config.yaml}"
LOGFILE="/root/stack29_$(date +%Y%m%d_%H%M%S).log"
HF_TOKEN="${HF_TOKEN:-}"

# Timestamped line to stdout, appended to $LOGFILE as well.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOGFILE"; }

log "=== Stack 2.9 Deployment Started ==="
log "Mode: $MODE"
log "Config: $CONFIG_PATH"
log "Log: $LOGFILE"
log "Hostname: $(hostname)"
log "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv 2>/dev/null || echo 'nvidia-smi not found')"
log ""

# --- Environment -----------------------------------------------------------
export HF_TOKEN="${HF_TOKEN}"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb=512"
export TRANSFORMERS_CACHE="/data/hf_cache"
export HF_HOME="/data/hf_cache"
export CUDA_VISIBLE_DEVICES="0"

mkdir -p /data/hf_cache /data/outputs /data/adapters

# --- Dependencies ----------------------------------------------------------
log "Installing system packages..."
apt-get update -qq && apt-get install -y -qq \
  git curl wget build-essential libsndfile1 ffmpeg \
  2>&1 | tail -3

log "Installing Python packages..."
pip install --upgrade pip -q
pip install -q \
  torch \
  transformers \
  peft \
  accelerate \
  bitsandbytes \
  datasets \
  trl \
  scipy \
  soundfile \
  librosa \
  pyyaml \
  tqdm \
  gradio \
  fastapi \
  uvicorn \
  2>&1 | tail -5

# --- Repository ------------------------------------------------------------
log "Cloning repository..."
cd /data
if [[ ! -d "ai-voice-clone" ]]; then
  git clone --depth 1 -b main https://github.com/walidsobhie-code/ai-voice-clone.git ai-voice-clone
fi
cd ai-voice-clone

# --- Config ----------------------------------------------------------------
# Install a custom config over the in-repo default. The -ef test skips the
# copy when CONFIG_PATH is merely another spelling of the default file (for
# example its absolute path): cp refuses to copy a file onto itself, and
# that failure would abort the script under "set -e".
DEFAULT_CONFIG="./stack_2_9_training/train_config.yaml"
if [[ "$CONFIG_PATH" != "$DEFAULT_CONFIG" && ! "$CONFIG_PATH" -ef "$DEFAULT_CONFIG" ]]; then
  cp "$CONFIG_PATH" "$DEFAULT_CONFIG"
fi

log "Repository ready. Starting application..."

# --- Run -------------------------------------------------------------------
if [[ "$MODE" == "train" ]]; then
  log "Starting LoRA training..."
  log "Command: python -m stack_2_9_training.train_lora --config ./stack_2_9_training/train_config.yaml"
  python -m stack_2_9_training.train_lora \
    --config ./stack_2_9_training/train_config.yaml \
    2>&1 | tee -a "$LOGFILE"
else
  log "Starting inference server..."
  log "Command: python -m uvicorn stack.serve:app --host 0.0.0.0 --port 7860"
  python -m uvicorn \
    stack.serve:app \
    --host 0.0.0.0 \
    --port 7860 \
    2>&1 | tee -a "$LOGFILE"
fi
DEPLOY_EOF

chmod +x "$DEPLOY_SCRIPT"
log "Deploy script written to: $DEPLOY_SCRIPT"
log "Contents will be transferred to the instance on creation."
|
|
| |
# print_deployment_instructions
#   Log the manual step-by-step guide: find an offer, create the instance,
#   connect, run the deploy script, monitor it, and shut down.
#   Reads MODE, CONFIG_PATH, GPU_NAME and GPU_SEARCH_MAP; writes only via log.
print_deployment_instructions() {
  local search_term="${GPU_SEARCH_MAP[$GPU_NAME]:-$GPU_NAME}"

  log ""
  log "=== Full Vast.ai Deployment Instructions ==="
  log ""
  log "1. Find a suitable instance:"
  log " vastai search offers \"${search_term}\""
  log ""
  log "2. Create the instance (note the offer ID from step 1):"
  log " vastai create instance --offer-id <id> \\"
  log " --image nvidia/cuda:12.1.0-devel-ubuntu22.04 \\"
  log " --ssh-key <your-ssh-key> \\"
  log " --onstart-url https://raw.githubusercontent.com/walidsobhie-code/ai-voice-clone/main/vastai_onstart.sh \\"
  log " --onstart-cmd '$MODE /data/ai-voice-clone/stack_2_9_training/train_config.yaml'"
  log ""
  log "3. SSH into the instance after it starts:"
  log " vastai ssh <instance-id>"
  log ""
  log "4. Or use screen/tmux for persistent sessions:"
  log " screen -S stack29"
  log " bash /tmp/stack29_deploy.sh $MODE $CONFIG_PATH"
  log " # Ctrl+A D to detach"
  log ""
  log "5. Monitor training:"
  # The deploy script logs to /root/stack29_<timestamp>.log on the instance.
  # This script has no LOGFILE variable of its own, so expanding one here
  # would abort under "set -u"; print a glob for the remote log instead.
  log " tail -f /root/stack29_*.log"
  log " nvidia-smi -l 1"
  log ""
  log "=== Clean Shutdown ==="
  log "To stop training gracefully:"
  log " # Find the process"
  log " ps aux | grep train_lora"
  log " # Send SIGTERM for graceful shutdown"
  log " kill -SIGTERM <pid>"
  log ""
  log "To stop and destroy the instance:"
  log " vastai destroy instance <instance-id>"
}

print_deployment_instructions
|
|