File size: 4,040 Bytes
fd94fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Curate a session trace into a publishable sample.

Reads the most-recent oracles-trace-*.jsonl from the traces/ dir and
writes a copy into traces/sample/ under a friendlier filename. The
sample is then committed alongside the repo as the Sharing-is-Caring
badge deliverable.

Usage:
    cd oracles_app
    ../.venv/bin/python scripts/curate_trace.py \\
        --label fantasy-en-playthrough
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

# Directory that contains this script.
_HERE = Path(__file__).resolve().parent
# The script's parent directory is treated as the application root.
_APP_ROOT = _HERE.parent
# Where session traces are looked up.
_TRACES_DIR = _APP_ROOT.joinpath("traces")
# Where curated, publishable copies are written.
_SAMPLE_DIR = _TRACES_DIR.joinpath("sample")


def _newest_session_trace() -> Path:
    """Return the most recently modified session trace.

    Looks for ``oracles-trace-*.jsonl`` files directly inside the traces
    directory and picks the one with the largest modification time. When no
    such file exists the process exits with an error message.
    """
    newest = max(
        _TRACES_DIR.glob("oracles-trace-*.jsonl"),
        key=lambda trace: trace.stat().st_mtime,
        default=None,
    )
    if newest is not None:
        return newest
    sys.exit(
        f"ERROR: no session traces found in {_TRACES_DIR}. "
        "Run the app first and complete at least one trial."
    )


def _summarize(path: Path) -> dict:
    """Collect quick stats from a JSONL trace for a pre-commit sanity check.

    Blank lines are skipped; every other line is parsed as one JSON record.

    Args:
        path: Trace file to read. It is decoded as UTF-8 regardless of the
            platform's default text encoding.

    Returns:
        A dict with the record count (``n_records``), per-value tallies of
        each record's mode, requested model and returned model (``modes``,
        ``models_requested``, ``models_returned``), and the summed ``usage``
        token counts (``total_completion_tokens``, ``total_prompt_tokens``).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    modes: dict = {}
    models_requested: dict = {}
    models_returned: dict = {}
    n_records = 0
    prompt_tokens = 0
    completion_tokens = 0

    def _tally(counts: dict, key) -> None:
        """Increment the occurrence count stored under *key*."""
        counts[key] = counts.get(key, 0) + 1

    # Pin the encoding: relying on the locale default makes traces that
    # contain non-ASCII text fail (or decode wrongly) on non-UTF-8 systems.
    with path.open(encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            rec = json.loads(line)
            n_records += 1
            _tally(modes, rec.get("mode", "?"))
            # Fall back to the "model" key when "model_requested" is
            # missing or falsy.
            _tally(
                models_requested,
                rec.get("model_requested") or rec.get("model", "?"),
            )
            _tally(models_returned, rec.get("model_returned", "?"))
            # "usage" may be absent or null, and so may the individual
            # token counts; all of those count as zero.
            usage = rec.get("usage") or {}
            prompt_tokens += int(usage.get("prompt_tokens", 0) or 0)
            completion_tokens += int(usage.get("completion_tokens", 0) or 0)

    return {
        "n_records": n_records,
        "modes": modes,
        "models_requested": models_requested,
        "models_returned": models_returned,
        "total_completion_tokens": completion_tokens,
        "total_prompt_tokens": prompt_tokens,
    }


def main() -> int:
    """Summarize a session trace and copy it into the sample directory.

    Parses the command line, prints stats for the chosen trace (``--source``
    or, by default, the newest trace in the traces directory) and, unless
    ``--summary-only`` is given, copies it to ``<sample dir>/<label>.jsonl``
    and prints the git commands for publishing it.

    Returns:
        0 on success. Failures terminate the process through ``sys.exit``
        with an ``ERROR: ...`` message.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument(
        "--label", default=None,
        help="Label to embed in the output filename "
             "(default: derived from the source filename).",
    )
    ap.add_argument(
        "--source", default=None,
        help="Specific source trace path. Default = newest in traces/.",
    )
    ap.add_argument(
        "--summary-only", action="store_true",
        help="Print stats and exit without copying.",
    )
    args = ap.parse_args()

    src = Path(args.source) if args.source else _newest_session_trace()
    if not src.exists():
        sys.exit(f"ERROR: source not found: {src}")
    if not src.is_file():
        sys.exit(f"ERROR: source is not a file: {src}")

    try:
        stats = _summarize(src)
    except (OSError, ValueError) as err:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a malformed trace is reported like every other failure here
        # instead of surfacing as a traceback.
        sys.exit(f"ERROR: could not read trace {src}: {err}")

    print(f"Source:               {src}")
    print(f"Records:              {stats['n_records']}")
    print(f"Modes:                {stats['modes']}")
    print(f"Models requested:     {stats['models_requested']}")
    print(f"Models returned:      {stats['models_returned']}")
    print(f"Prompt tokens used:   {stats['total_prompt_tokens']}")
    print(f"Output tokens used:   {stats['total_completion_tokens']}")

    if args.summary_only:
        return 0

    label = args.label or src.stem.replace("oracles-trace-", "session-")
    # The label becomes a filename inside the sample directory; anything
    # with a directory component (e.g. "../x") would escape that directory.
    if Path(label).name != label:
        sys.exit(
            "ERROR: --label must be a plain filename without path "
            f"separators: {label!r}"
        )

    _SAMPLE_DIR.mkdir(parents=True, exist_ok=True)
    dst = _SAMPLE_DIR / f"{label}.jsonl"
    if dst.exists():
        # shutil.copy2 raises SameFileError when asked to copy a file onto
        # itself; report that case explicitly.
        if dst.samefile(src):
            sys.exit(f"ERROR: source is already the sample file: {dst}")
        print(f"\nWARN: {dst} already exists — overwriting.", file=sys.stderr)
    shutil.copy2(src, dst)
    print(f"\nCopied to:            {dst}")
    print("\nNext steps:")
    print(f"  git add {dst.relative_to(_APP_ROOT.parent)}")
    print("  git commit -m 'Add sample LLM trace from playthrough'")
    print("  git push")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())