File size: 7,886 Bytes
942050b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | """Eval artefact writers: JSON (machine-readable) + HTML (portfolio-ready).
JSON layout matches `EvalRun` exactly so a downstream notebook can `json.load`
it and rebuild dataframes. HTML is a single static file — no JS, no CSS
framework, just a server-rendered table per the methodology doc.
"""
from __future__ import annotations
import json
from collections.abc import Iterable, Sequence
from dataclasses import asdict
from datetime import UTC, datetime
from html import escape
from pathlib import Path
from typing import Any
from nl_sql.eval.runner import Configuration, EvalRecord, EvalRun, EvalSummary
# Default base directory for reports (a relative path); the report writers
# below use it as the default `root` and add a per-day sub-directory under it.
REPORTS_ROOT = Path("eval") / "reports"
def write_json_report(
    run: EvalRun,
    *,
    root: Path | str = REPORTS_ROOT,
    name_suffix: str = "",
) -> Path:
    """Serialise one EvalRun to `<root>/<date>/<config>[-<suffix>].json`.

    A non-empty `name_suffix` is appended to the file stem after a dash, so
    knob-bump runs (top-k=8, fk-hops=2, etc.) written on the same day do not
    overwrite the baseline report. The dated directory is created on demand.

    Returns the path of the file that was written.
    """
    target_dir = _date_dir(root)
    target_dir.mkdir(parents=True, exist_ok=True)

    tail = ""
    if name_suffix:
        tail = f"-{name_suffix}"
    destination = target_dir / f"{run.configuration.value}{tail}.json"

    # Keys are filled in the order they appear in the output document.
    document: dict[str, Any] = {}
    document["configuration"] = run.configuration.value
    document["sql_model"] = run.sql_model
    document["overall"] = asdict(run.overall)
    document["per_difficulty"] = {
        difficulty: asdict(summary) for difficulty, summary in run.per_difficulty.items()
    }
    document["records"] = [asdict(record) for record in run.records]

    serialised = json.dumps(document, indent=2, default=_json_default)
    destination.write_text(serialised, encoding="utf-8")
    return destination
def write_html_report(runs: Sequence[EvalRun], *, root: Path | str = REPORTS_ROOT) -> Path:
    """Render `<root>/<date>/index.html` with a summary table and one section per run.

    Args:
        runs: Evaluation runs to include; they are rendered in the given order.
        root: Base reports directory. The dated sub-directory is created if
            it does not exist yet.

    Returns:
        Path of the written `index.html`.
    """
    out_dir = _date_dir(root)
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / "index.html"
    # The heading reuses the dated directory's name instead of reading the
    # clock a second time, so the title cannot disagree with the folder when
    # the report is generated across a UTC midnight boundary.
    body_parts: list[str] = [
        f"<h1>NL→SQL eval — {out_dir.name}</h1>",
        "<p>Source: BIRD Mini-Dev (SQLite). "
        "Methodology: <code>docs/03_eval_methodology.md</code>.</p>",
        _render_overall_table(runs),
    ]
    body_parts.extend(_render_run_section(run) for run in runs)
    path.write_text(_wrap_html("\n".join(body_parts)), encoding="utf-8")
    return path
def _date_dir(root: Path | str) -> Path:
return Path(root) / datetime.now(tz=UTC).strftime("%Y-%m-%d")
def load_run_from_json(path: Path | str) -> EvalRun:
    """Rebuild an EvalRun from a file previously produced by `write_json_report`.

    This lets an HTML report combine a freshly finished configuration with
    runs already saved in the same date directory. The `per_difficulty` and
    `records` keys may be missing or null, in which case they load as empty.
    Record fields that hold table tuples arrive as JSON lists and are turned
    back into tuples by `_record_from_dict`.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))

    overall_summary = EvalSummary(**payload["overall"])

    summaries_by_difficulty = {}
    for difficulty, fields in (payload.get("per_difficulty") or {}).items():
        summaries_by_difficulty[difficulty] = EvalSummary(**fields)

    loaded_records = []
    for record_fields in payload.get("records") or []:
        loaded_records.append(_record_from_dict(record_fields))

    return EvalRun(
        configuration=Configuration(payload["configuration"]),
        sql_model=payload["sql_model"],
        overall=overall_summary,
        per_difficulty=summaries_by_difficulty,
        records=loaded_records,
    )
def _record_from_dict(raw: dict[str, Any]) -> EvalRecord:
    """Build an EvalRecord from a decoded JSON object.

    `gold_tables` and `retrieved_tables` are converted from list to tuple
    when present as lists; every other key is passed through unchanged.
    The input mapping itself is not modified.
    """
    fields = {**raw}
    for name in ("gold_tables", "retrieved_tables"):
        if isinstance(fields.get(name), list):
            fields[name] = tuple(fields[name])
    return EvalRecord(**fields)
def _render_overall_table(runs: Iterable[EvalRun]) -> str:
headers = [
"Configuration",
"Model",
"n",
"EA",
"Simple",
"Moderate",
"Challenging",
"Validity",
"Recall@k",
"Empty %",
"P50 latency",
"P95 latency",
]
rows: list[str] = []
for run in runs:
diff = run.per_difficulty
rows.append(
"<tr>"
+ _td(run.configuration.value)
+ _td(run.sql_model)
+ _td(str(run.overall.n))
+ _td(_pct(run.overall.ea))
+ _td(_pct(diff.get("simple", _zero()).ea))
+ _td(_pct(diff.get("moderate", _zero()).ea))
+ _td(_pct(diff.get("challenging", _zero()).ea))
+ _td(_pct(run.overall.validity_rate))
+ _td(_pct(run.overall.schema_recall_at_k))
+ _td(_pct(run.overall.empty_result_rate))
+ _td(_ms(run.overall.latency_p50_ms))
+ _td(_ms(run.overall.latency_p95_ms))
+ "</tr>"
)
return (
"<h2>Summary</h2>"
"<table><thead><tr>"
+ "".join(f"<th>{h}</th>" for h in headers)
+ "</tr></thead><tbody>"
+ "\n".join(rows)
+ "</tbody></table>"
)
def _render_run_section(run: EvalRun, *, max_records: int = 200) -> str:
    """Render one run's section: heading, one-line metric summary and record table.

    Args:
        run: The evaluation run to render.
        max_records: Maximum number of records listed in the table. The
            default of 200 matches the previously hard-coded limit; negative
            values are treated as 0.

    Returns:
        An HTML fragment. When the run has more records than `max_records`,
        a note stating how many were shown out of the total is appended.
    """
    # Clamp so a negative limit cannot turn the slice into "all but the last N".
    limit = max(max_records, 0)
    total = len(run.records)
    truncation_note = (
        f"<p><em>Showing first {limit} of {total} records.</em></p>" if total > limit else ""
    )
    return (
        f"<h2>{escape(run.configuration.value)}</h2>"
        f"<p>Model: <code>{escape(run.sql_model)}</code> · "
        f"n={run.overall.n} · "
        f"EA={_pct(run.overall.ea)} · "
        f"Validity={_pct(run.overall.validity_rate)} · "
        f"Recall@k={_pct(run.overall.schema_recall_at_k)}</p>"
        + _render_records_table(run.records[:limit])
        + truncation_note
    )
def _render_records_table(records: Sequence[EvalRecord]) -> str:
if not records:
return "<p><em>No records.</em></p>"
headers = [
"qid",
"db",
"diff",
"match",
"recall",
"err",
"lat ms",
"tokens",
"question",
]
rows: list[str] = []
for r in records:
rows.append(
"<tr>"
+ _td(str(r.question_id))
+ _td(r.db_id)
+ _td(r.difficulty)
+ _td("✓" if r.match else "✗")
+ _td("✓" if r.schema_recall else "✗")
+ _td(r.error_kind or "")
+ _td(f"{r.latency_ms:.0f}")
+ _td(str(r.input_tokens + r.output_tokens))
+ _td(r.question[:120])
+ "</tr>"
)
return (
"<table><thead><tr>"
+ "".join(f"<th>{h}</th>" for h in headers)
+ "</tr></thead><tbody>"
+ "\n".join(rows)
+ "</tbody></table>"
)
def _wrap_html(body: str) -> str:
return (
"<!doctype html><html><head><meta charset='utf-8'>"
"<title>NL→SQL eval</title>"
"<style>"
"body{font-family:system-ui,Segoe UI,sans-serif;margin:24px;color:#222;}"
"table{border-collapse:collapse;margin:12px 0;font-size:14px;}"
"th,td{border:1px solid #ddd;padding:6px 10px;text-align:left;}"
"th{background:#f6f6f6;}"
"code{background:#f0f0f0;padding:1px 4px;border-radius:2px;}"
"h1{margin-top:0;}h2{margin-top:32px;}"
"</style></head><body>"
f"{body}"
"</body></html>"
)
def _td(text: str) -> str:
return f"<td>{escape(text)}</td>"
def _pct(value: float) -> str:
return f"{value * 100:.1f}%"
def _ms(value: float) -> str:
return f"{value:.0f} ms"
def _zero() -> EvalSummary:
    """Return an EvalSummary with `n=0` and every other metric set to 0.0.

    Used as the stand-in when a difficulty bucket is missing from a run.
    """
    float_metrics = (
        "ea",
        "validity_rate",
        "schema_recall_at_k",
        "repair_success_rate",
        "first_pass_ea",
        "empty_result_rate",
        "latency_p50_ms",
        "latency_p95_ms",
        "tokens_p50",
        "tokens_p95",
    )
    return EvalSummary(n=0, **dict.fromkeys(float_metrics, 0.0))
def _json_default(value: object) -> object:
if isinstance(value, tuple):
return list(value)
raise TypeError(f"not json-serialisable: {type(value).__name__}")
|