#!/usr/bin/env bash
# Check training progress from HuggingFace submodules and local logs.
# Usage: check_progress.sh [--sync] [LOG_DIR]
#   --sync      run deploy/sync.sh before reporting
#   -h, --help  print usage and exit
#   LOG_DIR     directory searched for metrics.jsonl (default: logs)
set -euo pipefail

SYNC=false
LOG_DIR=""

# Print the one-line usage string to stdout.
usage() {
  printf 'Usage: %s [--sync] [LOG_DIR]\n' "${0##*/}"
}

#######################################
# Parse command-line arguments.
# Globals:   SYNC (written), LOG_DIR (written)
# Arguments: the script's positional parameters
# Outputs:   usage text on --help; an error plus usage on stderr for an
#            unknown option
# Returns:   exits 0 on --help, exits 2 on an unknown option
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --sync)
        SYNC=true
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      -*)
        # A mistyped flag used to become LOG_DIR, which find then rejected
        # with its error output suppressed; fail loudly instead.
        printf '%s: unknown option: %s\n' "${0##*/}" "$arg" >&2
        usage >&2
        exit 2
        ;;
      *)
        # Last positional argument wins.
        LOG_DIR="$arg"
        ;;
    esac
  done
}

parse_args "$@"
LOG_DIR="${LOG_DIR:-logs}"

# Repository root is the parent of the directory holding this script.
# CDPATH is cleared so cd cannot resolve elsewhere or echo a path into REPO.
REPO="$(CDPATH='' cd -- "$(dirname -- "$0")/.." && pwd)"
# Sync submodules from HuggingFace.
# Best-effort: a failed sync must not stop the report, so the script continues
# with whatever is already on disk, but it says so instead of failing silently.
# The sync script's own stderr stays suppressed to keep the report compact.
if [[ "$SYNC" == true ]]; then
  if ! bash "$REPO/deploy/sync.sh" 2>/dev/null; then
    printf 'warning: %s failed; reporting from data already on disk\n' \
      "$REPO/deploy/sync.sh" >&2
  fi
fi
# Show progress from all metrics.jsonl files (local logs + submodules),
# limited to the N most recently modified.
N=5

#######################################
# List every metrics.jsonl below the given roots with its modification time.
# Roots that do not exist are skipped and find failures are ignored, so a
# missing logs/ or checkpoints/ directory cannot abort the script under
# `set -e` / `pipefail`.
# Arguments: paths to search
# Outputs:   one "<mtime-epoch-seconds> <path>" line per file
#            (relies on GNU find's -printf)
#######################################
list_metrics_files() {
  local root
  for root in "$@"; do
    [[ -e "$root" ]] || continue
    find "$root" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
  done
}

#######################################
# Print a one-line summary of a single metrics.jsonl file.
# The run name is the name of the directory containing the file.
# Arguments: $1 - path to a metrics.jsonl file
# Outputs:   the summary line, "<run> (no training steps yet)", or
#            "<run> (parse error)" when the file cannot be read or decoded
#######################################
summarize_run() {
  local path=$1
  local run_name
  run_name="$(basename "$(dirname "$path")")"

  # The path and run name travel as argv, never spliced into the Python
  # source, so quotes or backslashes in a path cannot break or inject code.
  python3 - "$path" "$run_name" 2>/dev/null <<'PY' || printf '%s (parse error)\n' "$run_name"
import json
import sys

path, run_name = sys.argv[1], sys.argv[2]

# Blank lines carry no record; drop them before decoding.
with open(path) as fh:
    lines = [ln for ln in fh if ln.strip()]

records = []
for idx, ln in enumerate(lines):
    try:
        records.append(json.loads(ln))
    except ValueError:
        # A run that is still being written may end in a half-written line;
        # tolerate that, but treat a bad line anywhere else as corruption.
        if idx != len(lines) - 1:
            raise

cfg = next((r for r in records if r.get('type') == 'config'), {})
train = [r for r in records if r.get('type') == 'train']
if not train:
    print(f'{run_name} (no training steps yet)')
    sys.exit(0)

last = train[-1]
model = cfg.get('model', {})
tcfg = cfg.get('training', {})
variant = f"{model.get('d_model','?')}d/{model.get('n_layers','?')}L"
discard = tcfg.get('discard_ply_limit', False)
total = tcfg.get('total_steps', '?')
step = last.get('step', '?')
# Accept both the namespaced and the short metric keys.
loss = last.get('train/loss', last.get('loss', 0))
acc = last.get('train/accuracy', last.get('acc', 0))
gs = last.get('games_per_sec', 0)
print(f'{run_name:<28} {variant} discard_ply={str(discard):<5} step {step}/{total} loss {loss:.4f} acc {acc:.3f} {gs:.0f} g/s')
PY
}

# Newest files first. Process substitution keeps a SIGPIPE on sort (when head
# stops reading early) from failing the script under pipefail.
while IFS= read -r entry; do
  # Everything after the first space is the path; embedded spaces are kept.
  summarize_run "${entry#* }"
done < <(list_metrics_files "$LOG_DIR" "$REPO/checkpoints" | sort -rn | head -n "$N")
# Check local training process.
# pgrep -f matches an extended regular expression against each full command
# line, so the dot is escaped to match the literal name "train.py" rather than
# "train" + any character + "py". Any non-zero pgrep status (no match or an
# error) is reported as NOT RUNNING.
if pgrep -f 'train\.py' > /dev/null 2>&1; then
  echo "Local training: RUNNING"
else
  echo "Local training: NOT RUNNING"
fi