File size: 2,832 Bytes
729546e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env bash
# Phase 2 sweep: waits for phase 1 to finish, then resumes from whichever
# phase 1 run achieved the lowest eval KL. All configs use very small LRs
# and constant/very-slow schedules. Goal: monotone, very slow KL descent.
#
# Launch in the background with:
#   nohup ./scripts/run_phase2_sweep.sh > logs/sweep_phase2_master.log 2>&1 &

# Unset variables are errors and a pipeline fails if any stage fails.
# `-e` is intentionally NOT set: the sweep loop further down records each
# run's exit status and moves on to the next config instead of aborting.
set -uo pipefail

# Work from the parent of this script's directory so that the relative paths
# used throughout (logs/, out/, configs/, .venv/) resolve. Without -e a failed
# cd would otherwise go unnoticed and the sweep would run from the wrong place.
cd "$(dirname "$0")/.." || {
    echo "!!! cannot cd to $(dirname "$0")/.." >&2
    exit 1
}

# Per-run logs and the final summary both live here.
LOG_DIR="logs"
mkdir -p "$LOG_DIR" || {
    echo "!!! cannot create log directory $LOG_DIR" >&2
    exit 1
}

# 1. Hold here until phase 1 is over.

# Poll every 30 s for as long as pgrep finds a process whose full command
# line matches the given pattern.
#   $1 - pattern handed to `pgrep -f`
block_while_running() {
    local pattern=$1
    while pgrep -f "$pattern" > /dev/null; do
        sleep 30
    done
}

echo ">>> [$(date '+%F %T')] phase2 waiter: waiting for phase 1 to finish..."
# First the phase 1 driver script itself ...
block_while_running "run_hparam_sweep.sh"
# ... then any distill.py process still working on a phase 1 (A-I) config.
block_while_running "distill.py --config configs/sweep/[A-I]_"
echo ">>> [$(date '+%F %T')] phase2 waiter: phase 1 done."

# 2. Find phase 1's best ckpt.
#    The embedded Python scans every out/sweep/[A-I]_*/best/best.json, reads
#    its "eval_kl" field and prints "<dir>\t<kl>" for the lowest value. Files
#    that cannot be read or parsed, or whose eval_kl is not numeric, are
#    skipped. When nothing usable exists Python exits non-zero; because this
#    script runs without -e we must check that explicitly, otherwise the sweep
#    would continue with an empty BEST_DIR.
PHASE1_BEST=$(.venv/bin/python - <<'PY'
import glob
import json
import os
import sys

best_kl = float("inf")
best_dir = None
for path in glob.glob("out/sweep/[A-I]_*/best/best.json"):
    try:
        with open(path) as fh:
            kl = float(json.load(fh)["eval_kl"])
    except Exception:
        continue
    if kl < best_kl:
        best_kl = kl
        best_dir = os.path.dirname(path)
if best_dir is None:
    sys.exit("no phase 1 best found")
print(f"{best_dir}\t{best_kl}")
PY
) || {
    echo "!!! [$(date '+%F %T')] could not determine phase 1 best checkpoint; aborting" >&2
    exit 1
}

# Split the single "<dir><TAB><kl>" line: everything before the first tab is
# the directory, everything after it is the KL value.
BEST_DIR=${PHASE1_BEST%%$'\t'*}
BEST_KL=${PHASE1_BEST#*$'\t'}
if [[ -z "$BEST_DIR" ]]; then
    echo "!!! [$(date '+%F %T')] phase 1 best lookup returned nothing; aborting" >&2
    exit 1
fi
echo ">>> phase 1 best: $BEST_DIR (eval_kl=$BEST_KL)"

# 3. Symlink ./out/phase1_best -> the winner so phase 2 configs can reference
#    a stable path.
#    The target is resolved and validated BEFORE the existing link is touched,
#    so a missing or empty BEST_DIR stops the sweep and leaves any previous
#    link intact instead of producing a dangling/absent out/phase1_best.
if [[ -z "${BEST_DIR:-}" ]] \
    || ! best_target=$(realpath "$BEST_DIR") \
    || [[ ! -d "$best_target" ]]; then
    echo "!!! phase 1 best dir '${BEST_DIR:-}' does not resolve to a directory; aborting" >&2
    exit 1
fi

mkdir -p out
# -f replaces an existing entry and -n keeps ln from descending into the
# directory an old link points at, so no separate rm is needed.
ln -sfn "$best_target" out/phase1_best || {
    echo "!!! failed to link out/phase1_best -> $best_target; aborting" >&2
    exit 1
}
echo ">>> linked out/phase1_best -> $(readlink out/phase1_best)"

# 4. Launch the phase 2 configs one after another (never in parallel).
CONFIGS=(
    "configs/sweep/J_phase2_lr5e9_const.toml"
    "configs/sweep/K_phase2_lr2e8_const.toml"
    "configs/sweep/L_phase2_lr1e8_warmup500.toml"
    "configs/sweep/M_phase2_lr2e8_largebatch.toml"
)

for cfg in "${CONFIGS[@]}"; do
    # Run name = config file name minus its directory and .toml suffix.
    run_name=${cfg##*/}
    run_name=${run_name%.toml}
    run_log="${LOG_DIR}/${run_name}.log"
    printf '%s\n' ">>> [$(date '+%F %T')] starting $run_name -> $run_log"

    # All output of the run goes to its own log; remember the exit status
    # but keep going regardless so one bad config does not end the sweep.
    status=0
    .venv/bin/accelerate launch --config_file configs/accelerate.yaml \
        distill.py --config "$cfg" > "$run_log" 2>&1 || status=$?

    # Last "Best eval KL" line the run wrote, if any.
    kl_line=$(grep -E "Best eval KL" "$run_log" | tail -n 1)
    printf '%s\n' "<<< [$(date '+%F %T')] finished $run_name (exit=$status) ${kl_line}"

    # On failure, echo the tail of the run log into the master log.
    if (( status != 0 )); then
        printf '%s\n' "    last 12 lines of $run_log:"
        tail -n 12 "$run_log" | sed 's/^/      /'
    fi
done

echo ">>> [$(date '+%F %T')] phase2 sweep complete"
echo ">>> overall summary (phase 1 + phase 2):"
# One row per run log (A-M), lowest "Best eval KL" first; runs whose log has
# no such line are shown as FAILED and listed last.
#
# A plain `sort -k2` compares text and includes the padding blanks that
# %-32s puts in front of the value, so it does not order rows by KL. Instead
# each row is emitted as "<rank>\t<kl key>\t<display text>", sorted by rank
# and then numerically (-g understands scientific notation) on the key, and
# the two helper columns are stripped again with cut. LC_ALL=C pins "." as the
# decimal point for the numeric comparison.
for log in "$LOG_DIR"/[A-M]_*.log; do
    [[ -e "$log" ]] || continue   # glob matched nothing: no rows to print
    name=$(basename "$log" .log)
    best=$(grep -E "Best eval KL" "$log" 2>/dev/null | tail -1 | sed 's/.*Best eval KL = //')
    if [[ -n "$best" ]]; then
        # Tabs inside the value would shift the helper columns; flatten them
        # in the sort key only.
        printf '0\t%s\t    %-32s %s\n' "${best//$'\t'/ }" "$name" "$best"
    else
        printf '1\t0\t    %-32s %s\n' "$name" "FAILED"
    fi
done | LC_ALL=C sort -t $'\t' -k1,1n -k2,2g | cut -f3-