File size: 1,835 Bytes
a188746
230508d
 
a188746
 
5fbb1fb
 
a188746
5fbb1fb
 
 
 
 
 
 
 
 
 
230508d
 
5fbb1fb
 
 
230508d
 
 
 
 
 
a188746
230508d
a188746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fbb1fb
 
230508d
 
5fbb1fb
230508d
5fbb1fb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env bash
# Check training progress from HuggingFace submodules and local logs.
# Usage: check_progress.sh [--sync] [LOG_DIR]
set -euo pipefail

# Defaults; overridden by the command-line scan below.
SYNC=false
LOG_DIR=""

# Scan the arguments without consuming them: the literal flag --sync turns
# syncing on, anything else is taken as the log directory (last one wins).
for arg; do
    if [[ "$arg" == "--sync" ]]; then
        SYNC=true
    else
        LOG_DIR="$arg"
    fi
done

# Fall back to ./logs when no directory (or an empty one) was given.
: "${LOG_DIR:=logs}"

# Absolute path of the directory one level above this script's directory.
script_dir="$(dirname "$0")"
REPO="$(cd "$script_dir/.." && pwd)"

# Optionally sync submodules from HuggingFace first. This is best-effort:
# the sync script's stderr is discarded and a failure does not stop the report.
if $SYNC; then
    if ! bash "$REPO/deploy/sync.sh" 2>/dev/null; then
        :
    fi
fi

# Show progress from the N most recently modified metrics.jsonl files
# (local logs + checkpoint submodules), newest first.
N=5

# Print "<mtime> <path>" for every metrics.jsonl found under the given
# directories.
# Arguments: $@ - directories to search
# Outputs:   one "<epoch-seconds> <path>" line per file on stdout
# Directories that do not exist are skipped and find errors are tolerated:
# under `set -euo pipefail` a failing find would otherwise abort the whole
# script before anything is reported.
# NOTE: -printf is a GNU find extension.
list_metrics_files() {
    local dir
    for dir in "$@"; do
        [[ -d "$dir" ]] || continue
        find "$dir" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
    done
}

# Print a one-line summary of the run whose metrics.jsonl is at $1.
# Arguments: $1 - path to a metrics.jsonl file
# Outputs:   the summary line, or "<run>  (parse error)" if Python fails
# The run name is the name of the directory containing the file. Path and run
# name are handed to Python through argv rather than spliced into the program
# text, so quotes, backslashes or braces in a path cannot break the script.
summarize_run() {
    local path=$1
    local run_name
    run_name="$(basename -- "$(dirname -- "$path")")"

    # The program is read from the here-doc; Python's stderr is discarded and
    # any failure is reported as a parse error for this run only.
    python3 - "$path" "$run_name" 2>/dev/null <<'PY' || printf '%s  (parse error)\n' "$run_name"
import json
import sys

path, run_name = sys.argv[1], sys.argv[2]

# Blank lines carry no record; skip them instead of failing the whole file.
with open(path) as fh:
    lines = [ln for ln in fh if ln.strip()]

records = []
for idx, ln in enumerate(lines):
    try:
        records.append(json.loads(ln))
    except ValueError:
        # A malformed *final* line is tolerated (presumably a record still
        # being written by a live run); corruption anywhere else is an error.
        if idx == len(lines) - 1:
            break
        raise

cfg = next((r for r in records if r.get('type') == 'config'), {})
train = [r for r in records if r.get('type') == 'train']
if not train:
    print(f'{run_name}  (no training steps yet)')
    sys.exit(0)

last = train[-1]
model = cfg.get('model', {})
tcfg = cfg.get('training', {})
variant = f"{model.get('d_model', '?')}d/{model.get('n_layers', '?')}L"
discard = tcfg.get('discard_ply_limit', False)
total = tcfg.get('total_steps', '?')
step = last.get('step', '?')
loss = last.get('train/loss', last.get('loss', 0))
acc = last.get('train/accuracy', last.get('acc', 0))
gs = last.get('games_per_sec', 0)
print(f'{run_name:<28}  {variant}  discard_ply={str(discard):<5}  step {step}/{total}  loss {loss:.4f}  acc {acc:.3f}  {gs:.0f} g/s')
PY
}

# Newest first, keep the first N, drop the timestamp column, summarize each.
# awk (unlike head) reads its whole input, so sort never sees SIGPIPE, which
# pipefail would turn into a script failure. IFS= keeps paths byte-exact.
list_metrics_files "$LOG_DIR" "$REPO/checkpoints" \
    | sort -rn \
    | awk -v n="$N" 'NR <= n { sub(/^[^ ]+ /, ""); print }' \
    | while IFS= read -r path; do
        summarize_run "$path"
    done

# Report whether any process whose full command line matches 'train.py' is
# currently running on this machine.
# NOTE(review): pgrep patterns are regular expressions, so the '.' here matches
# any character, and unrelated commands that merely mention train.py also match.
training_state="NOT RUNNING"
if pgrep -f 'train.py' > /dev/null 2>&1; then
    training_state="RUNNING"
fi
echo "Local training: $training_state"