Lean Laguna: Laguna XS.2 + DFlash — lossless single-GPU speedup + cheaper RL rollouts

8612587 1 day ago

4.12 kB

	#!/usr/bin/env python3
	"""parse_rl_run.py — extract the REAL hosted-RL evidence from a prime train run's logs.

	Produces two JSON artifacts the submission folds in:
	* rl_train_curve.json — per-step dense train reward on the train pool (HumanEval 0-49).
	This is the PRIMARY "real RL, reward moved during training" evidence.
	* rl_after.json — the held-out (HumanEval 50-74) eval trajectory: base (step 0, untrained)
	vs each eval checkpoint. eval_base_model=true gives the BEFORE inside the same run, so
	base->final is a clean before->after on a disjoint split, same harness, no checkpoint to serve.

	Usage: parse_rl_run.py <run_id>
	Reads `prime train logs <run_id>`; honest about whatever the numbers actually are.
	"""
	import json
	import re
	import subprocess
	import sys
	from pathlib import Path

	# Run id comes from the first CLI argument; the fallback is the run this script was written for.
	RUN = sys.argv[1] if len(sys.argv) > 1 else "n8xzr01yconncax7v0i7gn0u"
	# Artifacts are written to <parent of this script's directory>/results.
	RESULTS = Path(__file__).resolve().parents[1] / "results"

	# --- train curve: "Step N | Time: Xs | Reward: R | Seq. Length: L" ---
	# The column separator is a literal "|", which must be written "\|" in the pattern: an
	# unescaped "|" is regex alternation and would split the pattern into four independent
	# alternatives, leaving most capture groups empty on any match.
	STEP_RE = re.compile(
	    r"Step (\d+) \| Time: ([\d.]+)s \| Reward: ([\d.]+) \| Seq\. Length: ([\d.]+)"
	)
	# --- held-out eval: "Running evals at step=N" then "Evaluated ... Avg@1=X" ---
	EVAL_START_RE = re.compile(r"Running evals at step=(\d+)")
	EVAL_DONE_RE = re.compile(r"Evaluated art87able/spec-rl in ([\d.]+)s \(Avg@1=([\d.]+)")


	def fetch_logs(run_id: str) -> str:
	    """Return the stdout of `prime train logs <run_id>`.

	    Exits with an error message if the CLI reports failure, so that a failed fetch can
	    never be mistaken for "a run with no steps" and overwrite the artifacts with empty data.
	    """
	    proc = subprocess.run(["prime", "train", "logs", run_id], capture_output=True, text=True)
	    if proc.returncode != 0:
	        raise SystemExit(
	            f"`prime train logs {run_id}` failed (exit {proc.returncode}): {proc.stderr.strip()}"
	        )
	    return proc.stdout


	def parse_train_steps(logs: str) -> list:
	    """Extract every per-step training line from *logs*, sorted by step number.

	    Each entry is a dict with keys "step", "train_reward" (4 dp), "time_s" (2 dp)
	    and "seq_len" (1 dp). Returns an empty list when no step line is present.
	    """
	    steps = [
	        {
	            "step": int(m.group(1)),
	            "train_reward": round(float(m.group(3)), 4),
	            "time_s": round(float(m.group(2)), 2),
	            "seq_len": round(float(m.group(4)), 1),
	        }
	        for m in STEP_RE.finditer(logs)
	    ]
	    steps.sort(key=lambda d: d["step"])
	    return steps


	def parse_heldout_evals(logs: str) -> list:
	    """Pair each "Running evals at step=N" line with the next "Evaluated ..." line.

	    An "Evaluated" line that is not preceded by an unconsumed "Running evals" line is
	    ignored, because its step is unknown. Entries are dicts with keys "step",
	    "heldout_avg_at_1" (4 dp) and "eval_time_s" (2 dp), sorted by step.
	    """
	    evals = []
	    pending_step = None
	    for line in logs.splitlines():
	        started = EVAL_START_RE.search(line)
	        if started:
	            pending_step = int(started.group(1))
	        done = EVAL_DONE_RE.search(line)
	        if done and pending_step is not None:
	            evals.append(
	                {
	                    "step": pending_step,
	                    "heldout_avg_at_1": round(float(done.group(2)), 4),
	                    "eval_time_s": round(float(done.group(1)), 2),
	                }
	            )
	            pending_step = None
	    evals.sort(key=lambda d: d["step"])
	    return evals


	def summarize_heldout(evals: list) -> tuple:
	    """Return ``(before, after, delta)`` for a step-sorted held-out trajectory.

	    * before: the step-0 score, or None if the first parsed eval is not step 0 (a later
	      checkpoint must not be reported as the untrained baseline).
	    * after: the score of the last eval, or None if there are no evals.
	    * delta: ``after - before`` rounded to 4 dp; None unless a step-0 baseline and at
	      least one further eval exist.
	    """
	    if not evals:
	        return None, None, None
	    after = evals[-1]["heldout_avg_at_1"]
	    before = evals[0]["heldout_avg_at_1"] if evals[0]["step"] == 0 else None
	    delta = round(after - before, 4) if before is not None and len(evals) >= 2 else None
	    return before, after, delta


	def build_train_curve(steps: list, run_id: str = RUN) -> dict:
	    """Assemble the rl_train_curve.json payload around the parsed *steps*."""
	    return {
	        "note": "Per-step dense unit-test reward on the TRAIN pool (HumanEval 0-49) during real hosted "
	                "GRPO post-training of Laguna XS.2 on art87able/spec-rl. temperature=1.0 (exploration), so "
	                "step-to-step values carry sampling noise; the trajectory (not any single step) is the signal. "
	                "PRIMARY evidence that we post-trained the model, not just evaluated the tool.",
	        "run_id": run_id, "env": "art87able/spec-rl@0.1.5", "model": "poolside/Laguna-XS.2",
	        "max_steps": 20, "batch_size": 64, "rollouts_per_example": 8, "learning_rate": 1e-6,
	        "train_pool": "HumanEval/0-49", "free_hosted_train": True,
	        "steps": steps,
	    }


	def build_after(evals: list, run_id: str = RUN) -> dict:
	    """Assemble the rl_after.json payload around the parsed held-out *evals*."""
	    before, final, delta = summarize_heldout(evals)
	    return {
	        "note": "Held-out (HumanEval 50-74, DISJOINT from the train pool) eval trajectory computed BY THE "
	                "TRAINER via eval_base_model=true + interval=5. step 0 = BEFORE (untrained base Laguna); later "
	                "steps = AFTER checkpoints. Same harness, same hosted inference, no checkpoint served. Eval "
	                "sampling carries MoE/temperature noise (see determinism_check.json), so read the trajectory.",
	        "run_id": run_id, "env": "art87able/spec-rl@0.1.5", "split": "HumanEval/50-74 (held-out)",
	        "model": "poolside/Laguna-XS.2",
	        "before_step0_heldout": before,
	        "after_final_heldout": final,
	        "delta": delta,
	        "trajectory": evals,
	        "local_greedy_before_ref": "results/rl_before.json (0.9378, greedy T=0 — corroborates step-0 base)",
	    }


	def main() -> None:
	    """Fetch the run's logs, parse them, write both JSON artifacts and print a summary."""
	    logs = fetch_logs(RUN)
	    steps = parse_train_steps(logs)
	    evals = parse_heldout_evals(logs)
	    train_curve = build_train_curve(steps)
	    after = build_after(evals)

	    # The results directory may not exist on a fresh checkout.
	    RESULTS.mkdir(parents=True, exist_ok=True)
	    (RESULTS / "rl_train_curve.json").write_text(json.dumps(train_curve, indent=2))
	    (RESULTS / "rl_after.json").write_text(json.dumps(after, indent=2))

	    print("train steps parsed:", len(steps))
	    print(" rewards:", [s["train_reward"] for s in steps])
	    print("held-out evals:", evals)
	    print("BEFORE step0:", after["before_step0_heldout"], "AFTER final:", after["after_final_heldout"], "delta:", after["delta"])


	if __name__ == "__main__":
	    main()