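# NOTE: the four lines below are web-page residue from the hosting site (file
# path, avatar caption, commit message, commit hash); they are kept as comments
# so the module still parses.
# nl-sql / src /nl_sql /eval /__init__.py
# liovina's picture
# Deploy NL_SQL HEAD to HF Space
# 942050b verified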
"""Evaluation harness: ablation matrix, metrics, HTML report.
Stage 6 of the v2 pipeline (per docs/03_eval_methodology.md).
Public surface:
- `BirdExample`, `load_bird_mini_dev`, `dev_split`, `extract_gold_tables` — dataset access.
- `ResultComparison`, `compare_results`, `execution_accuracy` — order-insensitive EA, BIRD parity.
- `schema_recall_at_k` — secondary RAG metric.
- `Configuration`, `EvalRecord`, `EvalRun`, `EvalSummary`, `run_config_a`, `run_config_c` through `run_config_g` — runner.
- `write_json_report`, `write_html_report`, `load_run_from_json` — artefact writers and JSON loader.
"""
from __future__ import annotations
from nl_sql.eval.dataset import (
BirdExample,
dev_split,
extract_gold_tables,
load_bird_mini_dev,
)
from nl_sql.eval.metrics.execution_accuracy import (
ResultComparison,
compare_results,
execution_accuracy,
)
from nl_sql.eval.metrics.schema_recall import schema_recall_at_k
from nl_sql.eval.report import (
load_run_from_json,
write_html_report,
write_json_report,
)
from nl_sql.eval.runner import (
Configuration,
EvalRecord,
EvalRun,
EvalSummary,
run_config_a,
run_config_c,
run_config_d,
run_config_e,
run_config_f,
run_config_g,
)
# Explicit public API of the package: the names exported by
# `from nl_sql.eval import *`. Every entry is re-exported from one of the
# submodules imported at the top of this file (dataset, metrics, report,
# runner). Entries are kept sorted, capitalised class names first, then
# lowercase function names.
# NOTE(review): the runner exports run_config_a and run_config_c..g but no
# run_config_b — confirm the gap is intentional.
__all__ = [
"BirdExample",
"Configuration",
"EvalRecord",
"EvalRun",
"EvalSummary",
"ResultComparison",
"compare_results",
"dev_split",
"execution_accuracy",
"extract_gold_tables",
"load_bird_mini_dev",
"load_run_from_json",
"run_config_a",
"run_config_c",
"run_config_d",
"run_config_e",
"run_config_f",
"run_config_g",
"schema_recall_at_k",
"write_html_report",
"write_json_report",
]