PAWN / scripts /check_progress.sh
thomas-schweich's picture
Safetensors migration, checkpoint integrity, and multi-model training. (#1)
230508d unverified
#!/usr/bin/env bash
# Check training progress from HuggingFace submodules and local logs.
# Usage: check_progress.sh [--sync] [LOG_DIR]
# Strict mode: stop on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Defaults; the command-line arguments below may override them.
SYNC=false
LOG_DIR=""

# "--sync" enables syncing; any other argument is taken as the log directory
# (when several are given, the last one wins).
for arg; do
  if [[ "$arg" == "--sync" ]]; then
    SYNC=true
  else
    LOG_DIR="$arg"
  fi
done

# Fall back to "logs" when no (or an empty) directory was supplied.
: "${LOG_DIR:=logs}"

# Absolute path of the directory one level above the one holding this script.
script_dir="$(dirname "$0")"
REPO="$(cd "$script_dir/.." && pwd)"
# Sync submodules from HuggingFace (only when --sync was given).
# Best effort: a failed sync must not prevent the progress report below, but
# warn on stderr so the user knows the numbers shown may be stale.
if [[ "$SYNC" == true ]]; then
  if ! bash "$REPO/deploy/sync.sh" 2>/dev/null; then
    echo "warning: sync failed ($REPO/deploy/sync.sh); progress below may be stale" >&2
  fi
fi
# Show progress from metrics.jsonl files (local logs + submodules).
# Only the N most recently modified files are summarized, newest first.
N=5

#######################################
# Print a one-line progress summary for a single run.
# The run name is the name of the directory containing the metrics file.
# Arguments: $1 - path to a metrics.jsonl file (one JSON record per line)
# Outputs:   summary line on stdout; "<run> (no training steps yet)" when the
#            file has no 'train' records; "<run> (parse error)" when the file
#            cannot be read or summarized
#######################################
summarize_run() {
  local path=$1
  local run_name
  run_name="$(basename "$(dirname "$path")")"
  # The path and run name are passed as arguments rather than pasted into the
  # Python source, so quotes or braces in a path cannot break the program.
  python3 - "$path" "$run_name" 2>/dev/null <<'PY' || echo "$run_name (parse error)"
import json
import sys

path, run_name = sys.argv[1], sys.argv[2]

with open(path) as fh:
    lines = [ln for ln in (raw.strip() for raw in fh) if ln]

records = []
for idx, ln in enumerate(lines):
    try:
        records.append(json.loads(ln))
    except ValueError:
        # Training may be appending to the file right now: tolerate a
        # half-written final line, but still treat corruption anywhere
        # else as an error.
        if idx != len(lines) - 1:
            raise

cfg = next((r for r in records if r.get('type') == 'config'), {})
train = [r for r in records if r.get('type') == 'train']
if not train:
    print(f'{run_name} (no training steps yet)')
    sys.exit(0)

last = train[-1]
model = cfg.get('model', {})
tcfg = cfg.get('training', {})
variant = f"{model.get('d_model', '?')}d/{model.get('n_layers', '?')}L"
discard = tcfg.get('discard_ply_limit', False)
total = tcfg.get('total_steps', '?')
step = last.get('step', '?')
loss = last.get('train/loss', last.get('loss', 0))
acc = last.get('train/accuracy', last.get('acc', 0))
gs = last.get('games_per_sec', 0)
print(f'{run_name:<28} {variant} discard_ply={str(discard):<5} step {step}/{total} loss {loss:.4f} acc {acc:.3f} {gs:.0f} g/s')
PY
}

# find prints "<mtime-epoch> <path>" per file (-printf is a GNU find feature).
# "|| true": a missing directory makes find fail, and under errexit/pipefail
# that would skip the second search and abort the whole script.
{
  find "$LOG_DIR" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
  find "$REPO/checkpoints" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
} | sort -rn | head -n "$N" | while read -r _ path; do
  summarize_run "$path"
done
# Check local training process.
# pgrep -f matches its pattern, a regular expression, against each process's
# full command line; the dot is escaped so it only matches a literal "."
# (unescaped, "train.py" would also match e.g. "trainXpy").
if pgrep -f 'train\.py' > /dev/null 2>&1; then
  echo "Local training: RUNNING"
else
  echo "Local training: NOT RUNNING"
fi