"""Run 500+ LIVE HTTP API reward dead-tests against a running GhostExec server. Usage: uv run python scripts/run_live_api_dead_500.py --url http://127.0.0.1:8002 --cases 500 """ from __future__ import annotations import argparse import json from pathlib import Path from typing import Any from urllib.parse import urljoin import urllib.error import urllib.request W_CONFLICT = 0.35 W_REL = 0.35 W_TASK = 0.30 OUTPUT_SCALE = 0.48 def _request( base_url: str, method: str, path: str, *, body: dict[str, Any] | None = None, timeout: float = 20.0, ) -> tuple[int, str]: data = None headers = {"Accept": "application/json"} if body is not None: data = json.dumps(body).encode() headers["Content-Type"] = "application/json" req = urllib.request.Request( urljoin(base_url.rstrip("/") + "/", path.lstrip("/")), data=data, headers=headers, method=method, ) try: with urllib.request.urlopen(req, timeout=timeout) as resp: return resp.status, resp.read().decode(errors="replace") except urllib.error.HTTPError as e: return e.code, e.read().decode(errors="replace") def _step_payload_for(i: int) -> dict[str, Any]: templates: list[dict[str, Any]] = [ {"action": {"action_type": "do_nothing"}}, {"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}}, {"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}}, {"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}}, {"action": {"action_type": "archive_email", "email_id": "e09"}}, {"action": {"action_type": "archive_email", "email_id": "bad_id"}}, { "action": { "action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00", } }, { "action": { "action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00", # overlap -> invalid semantic } }, {"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}}, {"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}}, {"action": {"action_type": "complete_task", "task_id": "t07"}}, {"action": {"action_type": "complete_task", "task_id": "t09"}}, # already done {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}}, {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}}, { "action": { "action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please.", } }, { "action": { "action_type": "send_message", "contact_name": "Nobody", "message_body": "hello", } }, ] return templates[i % len(templates)] def _assert_api_surface(base_url: str) -> None: for path in ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc"): code, _ = _request(base_url, "GET", path) assert code == 200, f"{path} -> {code}" assert _request(base_url, "GET", "/reset")[0] == 405 assert _request(base_url, "GET", "/step")[0] == 405 assert _request(base_url, "GET", "/this-path-should-not-exist-ghostexec")[0] == 404 assert _request(base_url, "POST", "/mcp", body={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})[0] == 200 def main() -> int: p = argparse.ArgumentParser(description="Run live 500+ reward dead-tests.") p.add_argument("--url", default="http://127.0.0.1:8002", help="Base server URL") p.add_argument("--cases", type=int, default=500, help="Number of /reset+/step cases") args = p.parse_args() base_url = args.url.rstrip("/") cases = max(1, args.cases) _assert_api_surface(base_url) out_dir = Path("outputs") / "logs" out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / f"api_dead_live_{cases}.jsonl" passed = 0 failed = 0 failures: list[str] = [] with out_path.open("w", encoding="utf-8") as f: for idx in range(cases): rec: dict[str, Any] = {"idx": idx, "ok": False, "error": None} try: rc, rb = _request( base_url, "POST", "/reset", body={"episode_id": f"live-dead-{idx:04d}", "seed": 42}, ) assert rc == 200, f"reset status {rc}" payload = _step_payload_for(idx) rec["action"] = payload["action"] sc, sb = _request(base_url, "POST", "/step", body=payload) assert sc == 200, f"step status {sc}" body = json.loads(sb) obs = body["observation"] meta = obs.get("metadata") or {} bd = meta.get("reward_breakdown") or {} reward = float(body["reward"]) final = float(bd["final"]) assert reward == final, "reward != breakdown.final" c = float(bd.get("conflict", 0.0)) r = float(bd.get("relationship", 0.0)) t = float(bd.get("task", 0.0)) expected_weighted = OUTPUT_SCALE * (W_CONFLICT * c + W_REL * r + W_TASK * t) assert float(bd["weighted_base"]) == expected_weighted, "weighted_base mismatch" expected_final = ( float(bd.get("weighted_base", 0.0)) + float(bd.get("invalid_step_adjustment", 0.0)) + float(bd.get("episode_completion_bonus", 0.0)) + float(bd.get("catastrophic_penalty", 0.0)) + float(bd.get("do_nothing_floor", 0.0)) ) assert final == expected_final, "final aggregation mismatch" if payload["action"]["action_type"] == "do_nothing": assert float(bd.get("do_nothing_floor", 0.0)) == -0.15, "do_nothing floor mismatch" assert reward < 0, "do_nothing should be negative" if meta.get("step_ok") is False: assert float(bd.get("invalid_step_adjustment", 0.0)) == -0.25, "invalid penalty mismatch" rec["ok"] = True rec["reward"] = reward rec["step_ok"] = meta.get("step_ok") passed += 1 except Exception as e: # noqa: BLE001 rec["ok"] = False rec["error"] = str(e) failed += 1 if len(failures) < 10: failures.append(f"idx={idx}: {e}") finally: f.write(json.dumps(rec, ensure_ascii=False) + "\n") print(f"Live API dead-test complete: passed={passed} failed={failed} total={cases}") print(f"Report: {out_path}") if failures: print("First failures:") for row in failures: print(" -", row) return 0 if failed == 0 else 1 if __name__ == "__main__": raise SystemExit(main())