# case0 / scripts / export_traces.py
# HusseinEid's picture
# feat: multi-crime cases, scene+exhibit pixel art, background AI generation
# 80cd1f2 verified
"""Export REAL agent traces for the Hub (Build Small "Sharing is Caring" badge).
Captures two genuine traces from the live, fully-local stack:
1. CASE GENERATION - every prompt the pipeline sends to the in-process llama.cpp model
and the raw completion that came back, for one complete authored case;
2. LIVE INTERROGATION - a short playthrough against the served case: questions (one with
evidence presented), the suspect's spoken reply, and the server-authoritative
suspicion/flags, with wall-clock latency per turn.
Writes ``traces/case0_traces.jsonl`` + ``traces/README.md``. Upload with:
python scripts/export_traces.py # produce the files
python scripts/export_traces.py --push # produce AND push to the Hub dataset
"""
from __future__ import annotations
import json
import sys
import time
from pathlib import Path
# Directory two levels above this file (the parent of the directory containing this script).
ROOT = Path(__file__).resolve().parent.parent
# Put <ROOT>/src at the front of the import path. This must run before the
# ``case_zero`` imports that follow, which is why those carry ``noqa: E402``.
sys.path.insert(0, str(ROOT / "src"))
from case_zero.config import get_settings # noqa: E402
from case_zero.generator.pipeline import generate_case # noqa: E402
from case_zero.llm.backend import GenParams, LLMBackend, make_backend # noqa: E402
# Hub dataset repo that ``main`` creates and uploads to when ``--push`` is given.
DATASET_ID = "HusseinEid/case0-traces"
# Local directory that receives ``case0_traces.jsonl`` and ``README.md``.
OUT_DIR = ROOT / "traces"
# Dataset card written verbatim to ``traces/README.md`` by ``main``: front matter
# (license, tags, pretty name) followed by a description of the JSONL record types.
_DATASET_README = """---
license: apache-2.0
tags:
- build-small-hackathon
- agent-trace
- text-generation
pretty_name: Case Zero agent traces
---
# Case Zero - agent traces
Real traces from [Case Zero](https://huggingface.co/spaces/build-small-hackathon/case0),
a procedural detective game where a single **Qwen2.5-1.5B** model (in-process llama.cpp,
CPU-only, no cloud APIs) authors a complete mystery and then role-plays every suspect
live under interrogation.
`case0_traces.jsonl` - one JSON object per line:
- `type: "generation_call"` - one pipeline LLM call while authoring a case: the exact
`prompt`, the raw `completion`, sampling params, and latency. Two calls author a full
case (world+cast, then mystery); deterministic Python assembles and solver-checks it.
- `type: "interrogation_turn"` - one live turn against the running game server: the
player's `question` (optionally `presented_clue`), the suspect's spoken `reply`, and
the server-authoritative `suspicion` / `flags` that came back, with latency.
Everything was produced by the shipped game code - no hand-editing, no cloud calls.
"""
class _TracingBackend:
    """Wrap the real backend and record every (prompt, completion) pair.

    Every ``generate`` call is forwarded to the wrapped backend unchanged; once it
    returns, a ``generation_call`` record is appended to ``calls``. A call that
    raises is propagated to the caller and leaves no record.
    """

    def __init__(self, inner: LLMBackend) -> None:
        """Store the backend to delegate to and start with an empty call log."""
        self._inner = inner
        # One dict per completed generate() call, in call order.
        self.calls: list[dict] = []

    def generate(self, prompt: str, params: GenParams) -> str:
        """Run one completion on the wrapped backend and log it.

        Args:
            prompt: Text sent to the model; stored verbatim in the record.
            params: Sampling parameters. ``temperature`` and ``max_tokens`` are
                copied into the record; ``grammar`` / ``json_schema`` are only
                summarised as the boolean ``constrained``.

        Returns:
            The completion exactly as returned by the wrapped backend.
        """
        # perf_counter() is monotonic: unlike time.time(), a system clock
        # adjustment during a long generation cannot skew or negate the latency.
        started = time.perf_counter()
        out = self._inner.generate(prompt, params)
        latency = time.perf_counter() - started
        self.calls.append({
            "type": "generation_call",
            "prompt": prompt,
            "completion": out,
            "temperature": params.temperature,
            "max_tokens": params.max_tokens,
            "constrained": bool(params.grammar or params.json_schema),
            "latency_s": round(latency, 2),
        })
        return out

    def stream(self, prompt: str, params: GenParams):
        """Yield the whole completion as a single chunk.

        Delegates to ``generate`` so that streamed requests are recorded in
        ``calls`` exactly like non-streamed ones.
        """
        yield self.generate(prompt, params)
def _generation_trace(records: list[dict]) -> None:
    """Author one case through a tracing backend and append the trace to *records*.

    Appends every ``generation_call`` captured while ``generate_case`` runs
    (in call order), followed by a single ``generation_result`` record that
    summarises the authored case.
    """
    tracer = _TracingBackend(make_backend(get_settings()))
    # Fixed seed, passed straight through to the pipeline.
    outcome = generate_case(tracer, seed=77321)

    records.extend(tracer.calls)

    summary = {
        "type": "generation_result",
        "case_id": outcome.case.case_id,
        "crime_kind": outcome.case.crime_kind.value,
        "title": outcome.case.title,
        "solvable": outcome.report.ok,
        "attempts": outcome.attempts,
        "n_suspects": len(outcome.case.suspects),
        "n_clues": len(outcome.case.clues),
    }
    records.append(summary)
def _interrogation_trace(records: list[dict]) -> None:
    """Play a short scripted interrogation against the game server and record it.

    Builds the server in-process, requests a case, appends one ``case_served``
    record, then runs a fixed four-question plan: two questions to the first
    suspect and two to the second, the last one presenting the first evidence
    item. Each turn appends one ``interrogation_turn`` record holding the
    question, the presented clue (or ``None``), the fields read from the
    server's response, and the request latency in seconds.

    Args:
        records: List the trace records are appended to, in order.

    Raises:
        RuntimeError: If the served case has fewer than two suspects or no
            evidence, so the scripted plan cannot be run.
        Exception: Whatever ``raise_for_status()`` raises when the case request
            returns an HTTP error status.
    """
    from starlette.testclient import TestClient

    from case_zero.api.server import build_server

    client = TestClient(build_server())

    case_response = client.post("/api/case", json={})
    # Surface an HTTP failure as itself instead of as a KeyError on "runId" below.
    case_response.raise_for_status()
    case = case_response.json()
    run_id = case["runId"]
    pub = case["case"]
    records.append({
        "type": "case_served",
        "case_id": pub["id"],
        "kind": pub.get("kind", "homicide"),
        "title": pub["title"],
    })

    sus = pub["suspects"]
    evidence = pub["evidence"]
    # The plan below indexes sus[0], sus[1] and evidence[0]; say so explicitly
    # rather than failing with a bare IndexError.
    if len(sus) < 2 or not evidence:
        raise RuntimeError(
            "interrogation trace needs at least 2 suspects and 1 evidence item; "
            f"served case has {len(sus)} suspect(s) and {len(evidence)} evidence item(s)"
        )
    first_clue = evidence[0]["id"]

    # (suspect id, question, evidence id to present or None)
    plan = [
        (sus[0]["id"], "Where were you when it happened?", None),
        (sus[0]["id"], "Did you have any quarrel with the victim?", None),
        (sus[1]["id"], "Walk me through your evening, minute by minute.", None),
        (sus[1]["id"], "Explain this.", first_clue),
    ]
    for sus_id, question, clue in plan:
        body: dict = {"freeText": question}
        # Compare against None, not truthiness: a falsy id such as 0 is still
        # a clue that must be presented.
        if clue is not None:
            body["presentEvidenceId"] = clue

        # Monotonic clock, so the latency is immune to system clock adjustments.
        started = time.perf_counter()
        r = client.post(f"/api/run/{run_id}/interrogate/{sus_id}", json=body).json()
        latency = time.perf_counter() - started

        records.append({
            "type": "interrogation_turn",
            "suspect": sus_id,
            "question": question,
            "presented_clue": clue,
            "reply": r.get("reply"),
            "suspicion": r.get("suspicion"),
            "suspicion_delta": r.get("suspicionDelta"),
            "flags": r.get("flags"),
            "latency_s": round(latency, 2),
        })
def main() -> int:
    """Produce the trace files and, when ``--push`` is on the command line, upload them.

    Runs the generation trace, then the interrogation trace, writes all
    collected records to ``case0_traces.jsonl`` (one JSON object per line)
    and the dataset card to ``README.md`` inside ``OUT_DIR``.

    Returns:
        ``0``; used as the process exit status by the ``__main__`` guard.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    records: list[dict] = []

    print("tracing one full case generation (two model calls)...")
    _generation_trace(records)
    print("tracing a live interrogation playthrough...")
    _interrogation_trace(records)

    path = OUT_DIR / "case0_traces.jsonl"
    with path.open("w", encoding="utf-8") as fh:
        # Lazily serialise one record per line; non-ASCII text is kept as-is.
        fh.writelines(f"{json.dumps(rec, ensure_ascii=False)}\n" for rec in records)

    readme_path = OUT_DIR / "README.md"
    readme_path.write_text(_DATASET_README, encoding="utf-8")
    print(f"wrote {len(records)} records -> {path}")

    if "--push" not in sys.argv:
        return 0

    # Imported only on the push path, so producing the files does not need it.
    from huggingface_hub import HfApi

    hub = HfApi()
    hub.create_repo(DATASET_ID, repo_type="dataset", exist_ok=True)
    hub.upload_folder(repo_id=DATASET_ID, repo_type="dataset", folder_path=str(OUT_DIR))
    print(f"pushed -> https://huggingface.co/datasets/{DATASET_ID}")
    return 0
# Script entry point: exit the process with main()'s return value as the status.
if __name__ == "__main__":
    sys.exit(main())