PAWN / scripts /check_progress.sh

Safetensors migration, checkpoint integrity, and multi-model training. (#1)

230508d unverified about 20 hours ago

1.84 kB

	#!/usr/bin/env bash
	# Report training progress using local logs and the HuggingFace submodules.
	#
	# Usage: check_progress.sh [--sync] [LOG_DIR]
	#   --sync    request a submodule sync before reporting
	#   LOG_DIR   directory searched for metrics.jsonl (default: logs)
	set -euo pipefail

	# Defaults; the command line below may override them.
	LOG_DIR=""
	SYNC=false

	# Any argument other than --sync is taken as LOG_DIR; the last one wins.
	for arg in "$@"; do
	  if [[ "$arg" == "--sync" ]]; then
	    SYNC=true
	  else
	    LOG_DIR="$arg"
	  fi
	done

	# Fall back to "logs" when no (or an empty) LOG_DIR was supplied.
	: "${LOG_DIR:=logs}"

	# Repository root: the parent of the directory that holds this script.
	REPO="$(cd "$(dirname "$0")/.." && pwd)"

	# Sync submodules from HuggingFace (only when --sync was requested).

	# Run deploy/sync.sh on a best-effort basis: a failed sync must not abort
	# the progress report, but the user is warned that data may be stale.
	# Globals: REPO (read)
	# Outputs: a warning on stderr when the sync script exits non-zero
	# Returns: 0 always
	sync_submodules() {
	  # The sync script's own stderr stays silenced, as before.
	  if ! bash "$REPO/deploy/sync.sh" 2>/dev/null; then
	    echo "warning: deploy/sync.sh failed; showing existing data" >&2
	  fi
	}

	if [[ "$SYNC" == true ]]; then
	  sync_submodules
	fi

	# Show progress from the most recently modified metrics.jsonl files
	# (local logs + submodule checkpoints).
	N=5  # maximum number of runs to report

	# Print a one-line summary for a single metrics.jsonl file.
	# Arguments: $1 - path to a metrics.jsonl file
	# Outputs:   one summary line on stdout, "<run> (no training steps yet)"
	#            when no record has type "train", or "<run> (parse error)"
	#            when the Python helper fails for any reason
	show_run_progress() {
	  local path=$1
	  local run_name
	  # The run is named after the directory that contains the metrics file.
	  run_name="$(basename "$(dirname "$path")")"

	  # The path and run name are handed over via argv rather than spliced
	  # into the Python source, so quotes or backslashes in a path cannot
	  # break (or inject into) the program. "<<-" strips leading tabs, so
	  # the Python below is indented with spaces after the tab.
	  python3 - "$path" "$run_name" <<-'PY' 2>/dev/null || echo "$run_name (parse error)"
	import json, sys

	path, run_name = sys.argv[1], sys.argv[2]
	with open(path) as fh:
	    # Skip blank lines so they are not reported as parse errors.
	    records = [json.loads(line) for line in fh if line.strip()]
	cfg = next((r for r in records if r.get('type') == 'config'), {})
	train = [r for r in records if r.get('type') == 'train']
	if not train:
	    print(f'{run_name} (no training steps yet)')
	    sys.exit(0)
	last = train[-1]
	model = cfg.get('model', {})
	tcfg = cfg.get('training', {})
	variant = f"{model.get('d_model','?')}d/{model.get('n_layers','?')}L"
	discard = tcfg.get('discard_ply_limit', False)
	total = tcfg.get('total_steps', '?')
	step = last.get('step', '?')
	# Metric keys are looked up under both the 'train/...' and short names.
	loss = last.get('train/loss', last.get('loss', 0))
	acc = last.get('train/accuracy', last.get('acc', 0))
	gs = last.get('games_per_sec', 0)
	print(f'{run_name:<28} {variant} discard_ply={str(discard):<5} step {step}/{total} loss {loss:.4f} acc {acc:.3f} {gs:.0f} g/s')
	PY
	}

	{
	  # A missing directory makes find exit non-zero; with "set -e" and
	  # "pipefail" that would skip the second search and abort the script,
	  # so each search is best-effort. (-printf is a GNU find extension.)
	  find "$LOG_DIR" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
	  find "$REPO/checkpoints" -name metrics.jsonl -printf '%T@ %p\n' 2>/dev/null || true
	} | sort -rn | head -n "$N" | while read -r _ path; do
	  # Each input line is "<mtime> <path>"; the newest N files come first.
	  show_run_progress "$path"
	done

	# Report whether any process whose command line matches 'train.py' exists.
	training_state="NOT RUNNING"
	if pgrep -f 'train.py' > /dev/null 2>&1; then
	  training_state="RUNNING"
	fi
	echo "Local training: $training_state"