#!/usr/bin/env bash
#
# Summarize the most recently modified training runs and report whether a
# local training process is running.
#
# Usage: <this script> [--sync] [LOG_DIR]
#   --sync    Run "$REPO/deploy/sync.sh" first (best effort).
#   LOG_DIR   Directory searched for metrics.jsonl files (default: logs).
#             "$REPO/checkpoints" is always searched as well.
#
# Requires GNU find (for -printf) and python3.

set -euo pipefail

SYNC=false
LOG_DIR=""

# Maximum number of runs to summarize (newest first).
N=5

# Repository root: the parent of the directory containing this script.
# cd output is discarded so a CDPATH hit cannot pollute the captured path.
REPO="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." >/dev/null && pwd)"

#######################################
# Parse command-line arguments.
# Globals:   SYNC (written), LOG_DIR (written)
# Arguments: the script's arguments; "--sync" sets SYNC=true, any other
#            argument becomes LOG_DIR (the last one wins).
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --sync) SYNC=true ;;
      *) LOG_DIR="$arg" ;;
    esac
  done
  LOG_DIR="${LOG_DIR:-logs}"
}

#######################################
# List the N most recently modified metrics.jsonl files, newest first.
# Globals:   N (read)
# Arguments: directories to search
# Outputs:   "<mtime> <path>" lines on stdout
#######################################
recent_metrics() {
  local dir
  for dir in "$@"; do
    # A missing or unreadable directory is not an error here: without the
    # "|| true" a failing find would, under pipefail + errexit, abort the
    # whole script before the rest of the report is printed.
    find "$dir" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
  done | sort -rn | head -n "$N"
}

#######################################
# Print a one-line summary for a single metrics.jsonl file.
# Arguments: $1 - path to a metrics.jsonl file
# Outputs:   summary line on stdout; "<run> (parse error)" if python3
#            fails for any reason (its stderr is discarded).
#######################################
summarize_run() {
  local path=$1
  local run_name
  # The run is named after the directory that contains the metrics file.
  run_name="$(basename -- "$(dirname -- "$path")")"

  # The path and run name are passed as argv rather than interpolated into
  # the Python source, so quotes or backslashes in them cannot break (or
  # inject into) the program. The quoted heredoc also keeps the loop's
  # stdin away from python.
  python3 - "$path" "$run_name" 2>/dev/null <<'PY' || echo "$run_name (parse error)"
import json
import sys

path, run_name = sys.argv[1], sys.argv[2]
with open(path) as fh:
    # Skip blank lines so a stray empty line is not a parse error.
    records = [json.loads(line) for line in fh if line.strip()]
cfg = next((r for r in records if r.get('type') == 'config'), {})
train = [r for r in records if r.get('type') == 'train']
if not train:
    print(f'{run_name} (no training steps yet)')
    sys.exit(0)
last = train[-1]
model = cfg.get('model', {})
tcfg = cfg.get('training', {})
variant = f"{model.get('d_model', '?')}d/{model.get('n_layers', '?')}L"
discard = tcfg.get('discard_ply_limit', False)
total = tcfg.get('total_steps', '?')
step = last.get('step', '?')
loss = last.get('train/loss', last.get('loss', 0))
acc = last.get('train/accuracy', last.get('acc', 0))
gs = last.get('games_per_sec', 0)
print(f'{run_name:<28} {variant} discard_ply={str(discard):<5} step {step}/{total} loss {loss:.4f} acc {acc:.3f} {gs:.0f} g/s')
PY
}

#######################################
# Report whether any process whose command line contains "train.py" exists.
# Outputs:   status line on stdout
#######################################
report_local_training() {
  # The dot is escaped so it matches a literal "." rather than any character.
  if pgrep -f 'train\.py' >/dev/null 2>&1; then
    echo "Local training: RUNNING"
  else
    echo "Local training: NOT RUNNING"
  fi
}

main() {
  parse_args "$@"

  if [[ "$SYNC" == true ]]; then
    # Best effort: a failed sync must not prevent the report, but say so.
    bash "$REPO/deploy/sync.sh" 2>/dev/null \
      || printf 'warning: %s failed; continuing\n' "$REPO/deploy/sync.sh" >&2
  fi

  local path
  # Process substitution (instead of a pipeline into the loop) keeps a
  # failing stage of recent_metrics from tripping errexit/pipefail here.
  while read -r _ path; do
    [[ -n "$path" ]] || continue
    summarize_run "$path"
  done < <(recent_metrics "$LOG_DIR" "$REPO/checkpoints")

  report_local_training
}

main "$@"