File size: 1,609 Bytes
"""Evaluation harness: ablation matrix, metrics, HTML report.
Stage 6 of the v2 pipeline (per docs/03_eval_methodology.md).
Public surface:
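- `BirdExample`, `load_bird_mini_dev`, `dev_split`, `extract_gold_tables` — dataset access.
- `ResultComparison`, `compare_results`, `execution_accuracy` — order-insensitive EA, BIRD parity.
- `schema_recall_at_k` — secondary RAG metric.
- `Configuration`, `EvalRecord`, `EvalRun`, `EvalSummary`, `run_config_a`, `run_config_c`, `run_config_d`, `run_config_e`, `run_config_f`, `run_config_g` — runner.
- `write_json_report`, `write_html_report`, `load_run_from_json` — artefact writers and loader.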
"""
from __future__ import annotations
from nl_sql.eval.dataset import (
BirdExample,
dev_split,
extract_gold_tables,
load_bird_mini_dev,
)
from nl_sql.eval.metrics.execution_accuracy import (
ResultComparison,
compare_results,
execution_accuracy,
)
from nl_sql.eval.metrics.schema_recall import schema_recall_at_k
from nl_sql.eval.report import (
load_run_from_json,
write_html_report,
write_json_report,
)
from nl_sql.eval.runner import (
Configuration,
EvalRecord,
EvalRun,
EvalSummary,
run_config_a,
run_config_c,
run_config_d,
run_config_e,
run_config_f,
run_config_g,
)
# Explicit public API of the package: one entry for each name imported from
# the `nl_sql.eval.*` submodules above. Kept as a plain literal list in ASCII
# sort order (capitalised names first, then lowercase names).
__all__ = [
    "BirdExample",
    "Configuration",
    "EvalRecord",
    "EvalRun",
    "EvalSummary",
    "ResultComparison",
    "compare_results",
    "dev_split",
    "execution_accuracy",
    "extract_gold_tables",
    "load_bird_mini_dev",
    "load_run_from_json",
    "run_config_a",
    "run_config_c",
    "run_config_d",
    "run_config_e",
    "run_config_f",
    "run_config_g",
    "schema_recall_at_k",
    "write_html_report",
    "write_json_report",
]
|