File size: 4,838 Bytes
9049ef3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb78cbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9049ef3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# Sahel-Voice-Core — RunPod bootstrap
# -----------------------------------------------------------------------------
# Run this once inside a fresh RunPod pod (PyTorch 2.2 + CUDA 12.1 template).
# It clones the repo into /workspace, installs training-only dependencies
# (the HF Space requirements.txt is runtime-only), and prepares secrets.
#
# Usage inside the pod's Jupyter terminal:
#
#   curl -fsSL https://huggingface.co/spaces/ous-sow/sahel-agri-voice/resolve/main/scripts/runpod_setup.sh -o setup.sh
#   bash setup.sh
#
# Or, if you've already cloned the repo:
#
#   bash /workspace/sahel-voice/scripts/runpod_setup.sh
#
# After setup, export HF_TOKEN and open notebooks/kaggle_master_trainer.ipynb.
# Cell 3 auto-detects the RunPod environment; no path edits needed.
# -----------------------------------------------------------------------------
set -euo pipefail

# Configuration — each value can be overridden from the environment;
# defaults match the standard RunPod volume layout.
REPO_URL="${REPO_URL:-https://huggingface.co/spaces/ous-sow/sahel-agri-voice}"
WORKSPACE="${WORKSPACE:-/workspace}"
REPO_DIR="${REPO_DIR:-${WORKSPACE}/sahel-voice}"

# Startup banner showing the effective paths.
rule="=============================================="
echo "${rule}"
echo " Sahel-Voice-Core — RunPod setup"
echo "${rule}"
echo " Workspace : ${WORKSPACE}"
echo " Repo      : ${REPO_DIR}"
echo "${rule}"

# 1. Clone (idempotent) — a pre-existing .git means the repo is already
# there, so just fast-forward; pull failures are non-fatal (offline pods).
if [[ -d "${REPO_DIR}/.git" ]]; then
    echo ">> Repo already present — pulling latest."
    git -C "${REPO_DIR}" pull --ff-only || true
else
    echo ">> Cloning repo..."
    git clone "${REPO_URL}" "${REPO_DIR}"
fi

cd "${REPO_DIR}"

# 2. Training dependencies (not in requirements.txt which is runtime-only).
# Pins are kept in one array so the list is easy to audit and extend.
echo ">> Installing training dependencies..."
pip install -q --upgrade pip
training_pkgs=(
    "transformers==5.5.0"
    "datasets==4.8.4"
    "accelerate==1.13.0"
    "huggingface-hub==1.9.0"
    "peft>=0.13.0"
    "evaluate>=0.4.1"
    "jiwer==3.0.4"
    "librosa==0.10.2"
    "soundfile==0.12.1"
    "tensorboard>=2.14"
    "pypdf>=4.0.0"
    "python-docx>=1.1.0"
)
pip install -q "${training_pkgs[@]}"

# torchcodec — required by datasets>=4.0 for audio decoding.
# Version must match the installed torch; let pip resolve, fallback to pinned.
echo ">> Installing torchcodec (audio backend for datasets 4.x)..."
if ! pip install -q torchcodec; then
    # Detect the torch version (strip any local suffix like "+cu121") so we
    # can pick the matching torchcodec series by hand.
    torch_ver=$(python -c "import torch; print(torch.__version__.split('+')[0])" 2>/dev/null || echo "unknown")
    echo "   pip resolve failed; torch=${torch_ver}. Trying pinned versions..."
    case "${torch_ver}" in
        2.4.*)       pip install -q "torchcodec==0.1.*" ;;
        2.5.*)       pip install -q "torchcodec==0.2.*" ;;
        2.6.*)       pip install -q "torchcodec==0.3.*" ;;
        2.7.*|2.8.*) pip install -q "torchcodec==0.4.*" ;;
        *)           echo "   ⚠️  Unknown torch version — install torchcodec manually." ;;
    esac
fi

# 3. HF token prompt (one-time).
# If a .env file exists it always gets sourced (exported via set -a) so
# downstream commands see HF_TOKEN; otherwise, if HF_TOKEN is also missing
# from the environment, print instructions for providing it.
ENV_FILE="${REPO_DIR}/.env"
if [[ -f "${ENV_FILE}" ]]; then
    set -a
    # shellcheck disable=SC1090
    source "${ENV_FILE}"
    set +a
    echo ">> Loaded env vars from ${ENV_FILE}"
elif [[ -z "${HF_TOKEN:-}" ]]; then
    echo ""
    echo "=============================================="
    echo " HF_TOKEN not set."
    echo " Get a write-scoped token from"
    echo "   https://huggingface.co/settings/tokens"
    echo " Then either:"
    echo "   export HF_TOKEN=hf_xxxxxxxx"
    echo " or add it to ${ENV_FILE}:"
    echo "   echo 'HF_TOKEN=hf_xxxxxxxx' > ${ENV_FILE}"
    echo "=============================================="
fi

# 4. Persistent output dir for checkpoints (survives pod stop via Volume disk)
for out_dir in adapter_bam adapter_ful data audio_feedback; do
    mkdir -p "${WORKSPACE}/${out_dir}"
done

# 5. GPU sanity check — prints torch/CUDA details so a misconfigured pod
# (no GPU attached, wrong template) is obvious before training starts.
python - <<'PY'
import torch

sep = "=" * 46
print(sep)
print(f" PyTorch        : {torch.__version__}")
print(f" CUDA available : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f" GPU            : {props.name}")
    print(f" VRAM           : {props.total_memory/1e9:.1f} GB")
    print(f" Compute cap    : {props.major}.{props.minor}")
print(sep)
PY

# Closing instructions. Unquoted heredoc so ${REPO_DIR}/${WORKSPACE} expand;
# the backslash keeps "$TRAIN_LANG" literal for the user to substitute.
cat <<EOF

✅ Setup complete.

Next steps:
  1. Open Jupyter Lab (port 8888 on the pod)
  2. Navigate to:  ${REPO_DIR}/notebooks/kaggle_master_trainer.ipynb
  3. Set TRAIN_LANG in Cell 3 (or export TRAIN_LANG=ful before launching)
  4. Run All Cells — Cell 3 auto-detects /workspace and uses RunPod defaults

Checkpoints will be saved to: ${WORKSPACE}/adapter_\$TRAIN_LANG
This path is on the Volume disk — survives pod stop/restart.
EOF