Spaces:

liovina
/

nl-sql

Sleeping

File size: 7,886 Bytes

942050b

"""Eval artefact writers: JSON (machine-readable) + HTML (portfolio-ready).

JSON layout matches `EvalRun` exactly so a downstream notebook can `json.load`
it and rebuild dataframes. HTML is a single static file — no JS, no CSS
framework, just a server-rendered table per the methodology doc.
"""

from __future__ import annotations

import json
from collections.abc import Iterable, Sequence
from dataclasses import asdict
from datetime import UTC, datetime
from html import escape
from pathlib import Path
from typing import Any

from nl_sql.eval.runner import Configuration, EvalRecord, EvalRun, EvalSummary

REPORTS_ROOT = Path("eval") / "reports"


def write_json_report(
    run: EvalRun,
    *,
    root: Path | str = REPORTS_ROOT,
    name_suffix: str = "",
) -> Path:
    """Dump one EvalRun as `eval/reports/<date>/<config>[<-suffix>].json`.

    `name_suffix` lets knob-bump runs (top-k=8, fk-hops=2, etc.) live
    alongside the baseline report on the same day without overwriting it.
    """
    out_dir = _date_dir(root)
    out_dir.mkdir(parents=True, exist_ok=True)
    suffix = f"-{name_suffix}" if name_suffix else ""
    path = out_dir / f"{run.configuration.value}{suffix}.json"
    payload = {
        "configuration": run.configuration.value,
        "sql_model": run.sql_model,
        "overall": asdict(run.overall),
        "per_difficulty": {k: asdict(v) for k, v in run.per_difficulty.items()},
        "records": [asdict(r) for r in run.records],
    }
    path.write_text(json.dumps(payload, indent=2, default=_json_default), encoding="utf-8")
    return path


def write_html_report(runs: Sequence[EvalRun], *, root: Path | str = REPORTS_ROOT) -> Path:
    """Render `eval/reports/<date>/index.html` with one table per run."""
    out_dir = _date_dir(root)
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / "index.html"
    body_parts: list[str] = [
        f"<h1>NL→SQL eval — {datetime.now(tz=UTC):%Y-%m-%d}</h1>",
        "<p>Source: BIRD Mini-Dev (SQLite). "
        "Methodology: <code>docs/03_eval_methodology.md</code>.</p>",
    ]
    body_parts.append(_render_overall_table(runs))
    for run in runs:
        body_parts.append(_render_run_section(run))
    html = _wrap_html("\n".join(body_parts))
    path.write_text(html, encoding="utf-8")
    return path


def _date_dir(root: Path | str) -> Path:
    return Path(root) / datetime.now(tz=UTC).strftime("%Y-%m-%d")


def load_run_from_json(path: Path | str) -> EvalRun:
    """Re-hydrate an EvalRun previously written by `write_json_report`.

    Used by the live driver so an HTML report can combine today's freshly-
    finished configuration with whatever runs already sit in the same date
    directory. Roundtrip-stable on all dataclass fields (tuples come back
    as tuples; the `_json_default` writer sends them as lists).
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    overall = EvalSummary(**raw["overall"])
    per_difficulty = {k: EvalSummary(**v) for k, v in (raw.get("per_difficulty") or {}).items()}
    records = [_record_from_dict(r) for r in raw.get("records") or []]
    return EvalRun(
        configuration=Configuration(raw["configuration"]),
        sql_model=raw["sql_model"],
        overall=overall,
        per_difficulty=per_difficulty,
        records=records,
    )


def _record_from_dict(raw: dict[str, Any]) -> EvalRecord:
    """Convert one record dict into an EvalRecord; tuple fields are restored."""
    coerced = dict(raw)
    for key in ("gold_tables", "retrieved_tables"):
        value = coerced.get(key)
        if isinstance(value, list):
            coerced[key] = tuple(value)
    return EvalRecord(**coerced)


def _render_overall_table(runs: Iterable[EvalRun]) -> str:
    headers = [
        "Configuration",
        "Model",
        "n",
        "EA",
        "Simple",
        "Moderate",
        "Challenging",
        "Validity",
        "Recall@k",
        "Empty %",
        "P50 latency",
        "P95 latency",
    ]
    rows: list[str] = []
    for run in runs:
        diff = run.per_difficulty
        rows.append(
            "<tr>"
            + _td(run.configuration.value)
            + _td(run.sql_model)
            + _td(str(run.overall.n))
            + _td(_pct(run.overall.ea))
            + _td(_pct(diff.get("simple", _zero()).ea))
            + _td(_pct(diff.get("moderate", _zero()).ea))
            + _td(_pct(diff.get("challenging", _zero()).ea))
            + _td(_pct(run.overall.validity_rate))
            + _td(_pct(run.overall.schema_recall_at_k))
            + _td(_pct(run.overall.empty_result_rate))
            + _td(_ms(run.overall.latency_p50_ms))
            + _td(_ms(run.overall.latency_p95_ms))
            + "</tr>"
        )
    return (
        "<h2>Summary</h2>"
        "<table><thead><tr>"
        + "".join(f"<th>{h}</th>" for h in headers)
        + "</tr></thead><tbody>"
        + "\n".join(rows)
        + "</tbody></table>"
    )


def _render_run_section(run: EvalRun) -> str:
    return (
        f"<h2>{escape(run.configuration.value)}</h2>"
        f"<p>Model: <code>{escape(run.sql_model)}</code> · "
        f"n={run.overall.n} · "
        f"EA={_pct(run.overall.ea)} · "
        f"Validity={_pct(run.overall.validity_rate)} · "
        f"Recall@k={_pct(run.overall.schema_recall_at_k)}</p>"
        + _render_records_table(run.records[:200])
        + (
            f"<p><em>Showing first 200 of {len(run.records)} records.</em></p>"
            if len(run.records) > 200
            else ""
        )
    )


def _render_records_table(records: Sequence[EvalRecord]) -> str:
    if not records:
        return "<p><em>No records.</em></p>"
    headers = [
        "qid",
        "db",
        "diff",
        "match",
        "recall",
        "err",
        "lat ms",
        "tokens",
        "question",
    ]
    rows: list[str] = []
    for r in records:
        rows.append(
            "<tr>"
            + _td(str(r.question_id))
            + _td(r.db_id)
            + _td(r.difficulty)
            + _td("✓" if r.match else "✗")
            + _td("✓" if r.schema_recall else "✗")
            + _td(r.error_kind or "")
            + _td(f"{r.latency_ms:.0f}")
            + _td(str(r.input_tokens + r.output_tokens))
            + _td(r.question[:120])
            + "</tr>"
        )
    return (
        "<table><thead><tr>"
        + "".join(f"<th>{h}</th>" for h in headers)
        + "</tr></thead><tbody>"
        + "\n".join(rows)
        + "</tbody></table>"
    )


def _wrap_html(body: str) -> str:
    return (
        "<!doctype html><html><head><meta charset='utf-8'>"
        "<title>NL→SQL eval</title>"
        "<style>"
        "body{font-family:system-ui,Segoe UI,sans-serif;margin:24px;color:#222;}"
        "table{border-collapse:collapse;margin:12px 0;font-size:14px;}"
        "th,td{border:1px solid #ddd;padding:6px 10px;text-align:left;}"
        "th{background:#f6f6f6;}"
        "code{background:#f0f0f0;padding:1px 4px;border-radius:2px;}"
        "h1{margin-top:0;}h2{margin-top:32px;}"
        "</style></head><body>"
        f"{body}"
        "</body></html>"
    )


def _td(text: str) -> str:
    return f"<td>{escape(text)}</td>"


def _pct(value: float) -> str:
    return f"{value * 100:.1f}%"


def _ms(value: float) -> str:
    return f"{value:.0f} ms"


def _zero() -> EvalSummary:
    return EvalSummary(
        n=0,
        ea=0.0,
        validity_rate=0.0,
        schema_recall_at_k=0.0,
        repair_success_rate=0.0,
        first_pass_ea=0.0,
        empty_result_rate=0.0,
        latency_p50_ms=0.0,
        latency_p95_ms=0.0,
        tokens_p50=0.0,
        tokens_p95=0.0,
    )


def _json_default(value: object) -> object:
    if isinstance(value, tuple):
        return list(value)
    raise TypeError(f"not json-serialisable: {type(value).__name__}")