the-apprentice / scripts /curate_trace.py
AndrewRqy
Trace-sharing infra: traces/sample is committable, add curation script + README badge claim
fd94fc8
Raw
History Blame Contribute Delete
4.04 kB
"""Curate a session trace into a publishable sample.
Reads the most-recent oracles-trace-*.jsonl from the traces/ dir and
writes a copy into traces/sample/ under a friendlier filename. The
sample is then committed alongside the repo as the Sharing-is-Caring
badge deliverable.
Usage:
cd oracles_app
../.venv/bin/python scripts/curate_trace.py \\
--label fantasy-en-playthrough
"""
from __future__ import annotations
import argparse
import json
import shutil
import sys
from pathlib import Path
# Directory that contains this script file.
_HERE = Path(__file__).resolve().parent
# Application root: one level above the script's directory.
_APP_ROOT = _HERE.parent
# Directory that is searched for oracles-trace-*.jsonl session traces.
_TRACES_DIR = _APP_ROOT / "traces"
# Destination directory for the curated sample copy.
_SAMPLE_DIR = _TRACES_DIR / "sample"
def _newest_session_trace() -> Path:
    """Return the most recently modified session trace in the traces dir.

    Only files matching ``oracles-trace-*.jsonl`` directly inside the traces
    directory are considered. When none exist the process exits with an
    error message instead of returning.
    """
    newest = max(
        _TRACES_DIR.glob("oracles-trace-*.jsonl"),
        key=lambda trace: trace.stat().st_mtime,
        default=None,
    )
    if newest is None:
        sys.exit(
            f"ERROR: no session traces found in {_TRACES_DIR}. "
            "Run the app first and complete at least one trial."
        )
    return newest
def _summarize(path: Path) -> dict:
"""Quick stats so the user can sanity-check before committing."""
stats: dict = {
"n_records": 0,
"modes": {},
"models_requested": {},
"models_returned": {},
"total_completion_tokens": 0,
"total_prompt_tokens": 0,
}
with path.open() as f:
for line in f:
line = line.strip()
if not line:
continue
rec = json.loads(line)
stats["n_records"] += 1
stats["modes"][rec.get("mode", "?")] = stats["modes"].get(rec.get("mode", "?"), 0) + 1
mr = rec.get("model_requested") or rec.get("model", "?")
stats["models_requested"][mr] = stats["models_requested"].get(mr, 0) + 1
mreturn = rec.get("model_returned", "?")
stats["models_returned"][mreturn] = stats["models_returned"].get(mreturn, 0) + 1
usage = rec.get("usage") or {}
stats["total_prompt_tokens"] += int(usage.get("prompt_tokens", 0) or 0)
stats["total_completion_tokens"] += int(usage.get("completion_tokens", 0) or 0)
return stats
def main() -> int:
    """Summarize a session trace and copy it into the sample directory.

    Parses the command line, prints summary statistics for the chosen trace
    and, unless ``--summary-only`` is given, copies the trace to
    ``<sample dir>/<label>.jsonl`` and prints suggested git commands.

    Returns:
        0 on success. Failures terminate the process through ``sys.exit``
        with an ``ERROR:`` message.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument(
        "--label", default=None,
        help="Label to embed in the output filename "
        "(default: derived from the source filename).",
    )
    ap.add_argument(
        "--source", default=None,
        help="Specific source trace path. Default = newest in traces/.",
    )
    ap.add_argument(
        "--summary-only", action="store_true",
        help="Print stats and exit without copying.",
    )
    args = ap.parse_args()

    src = Path(args.source) if args.source else _newest_session_trace()
    if not src.exists():
        sys.exit(f"ERROR: source not found: {src}")
    if not src.is_file():
        sys.exit(f"ERROR: source is not a file: {src}")

    stats = _summarize(src)
    print(f"Source: {src}")
    print(f"Records: {stats['n_records']}")
    print(f"Modes: {stats['modes']}")
    print(f"Models requested: {stats['models_requested']}")
    print(f"Models returned: {stats['models_returned']}")
    print(f"Prompt tokens used: {stats['total_prompt_tokens']}")
    print(f"Output tokens used: {stats['total_completion_tokens']}")
    if args.summary_only:
        return 0

    label = args.label or src.stem.replace("oracles-trace-", "session-")
    # The label is used as a bare filename inside the sample dir. A label
    # with path components would place the copy elsewhere, and the
    # relative_to() call below could then fail after the copy was made.
    if label in {".", ".."} or Path(label).name != label:
        sys.exit(
            f"ERROR: label must be a plain name without path separators: {label!r}"
        )

    _SAMPLE_DIR.mkdir(parents=True, exist_ok=True)
    dst = _SAMPLE_DIR / f"{label}.jsonl"
    if dst.exists():
        # shutil.copy2 raises SameFileError when asked to copy a file onto
        # itself; report that case cleanly instead of with a traceback.
        if src.samefile(dst):
            sys.exit(f"ERROR: source is already the sample file: {dst}")
        print(f"\nWARN: {dst} already exists — overwriting.", file=sys.stderr)
    shutil.copy2(src, dst)

    print(f"\nCopied to: {dst}")
    print("\nNext steps:")
    print(f" git add {dst.relative_to(_APP_ROOT.parent)}")
    print(" git commit -m 'Add sample LLM trace from playthrough'")
    print(" git push")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())