Spaces:

liovina
/

nl-sql

Sleeping

App Files Files Community

nl-sql / scripts /run_sonnet_voting.py

liovina

Deploy NL_SQL HEAD to HF Space

d48602c verified 14 days ago

raw

history blame contribute delete

7.96 kB

	"""Sonnet 4.6 voting via GraceKelly Perplexity bridge on baseline failures.

	For each failing qid, re-runs the production G pipeline through the
	PerplexityProvider (Claude Sonnet 4.6 thinking via Perplexity Pro web UI).
	Executes the alt SQL against the live engine and writes a voting-shaped
	report compatible with `merge_voting_rescues.py`.

	Latency: 20-40s per question (browser path), so this is a slow run. Budget:
	free via Perplexity Pro subscription, no Groq quota consumed.

	Usage:
	uv run python scripts/run_sonnet_voting.py \
	--baseline eval/reports/2026-05-13/hybrid+multi-vote+critique-v4.json \
	--out eval/reports/2026-05-13/sonnet-voting.json
	uv run python scripts/run_sonnet_voting.py \
	--baseline eval/reports/2026-05-22/v20-kimi-k2-thinking-merged.json \
	--out eval/reports/2026-05-22/sonnet-qid1399.json --only-qids 1399
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	import time
	from pathlib import Path
	from typing import Any

	from nl_sql.agent.graph import PipelineConfig, build_pipeline, run_pipeline
	from nl_sql.config import get_settings
	from nl_sql.db.registry import get_default_registry
	from nl_sql.eval.dataset import load_bird_mini_dev
	from nl_sql.eval.metrics.execution_accuracy import compare_results
	from nl_sql.eval.runner import _compose_question, _execute_gold
	from nl_sql.execution.runner import execute_validated
	from nl_sql.llm.cache import CachingEmbeddingProvider
	from nl_sql.llm.providers.mistral import MistralProvider
	from nl_sql.llm.providers.perplexity import PerplexityProvider
	from nl_sql.schema_index.indexer import SchemaIndex


	def _parse_qid_list(raw: str) -> list[int]:
	    """Parse a comma-separated qid string into ints, preserving argument order.

	    Empty and whitespace-only items are ignored.

	    Raises:
	        ValueError: if any non-empty item is not an integer.
	    """
	    return [int(item) for item in raw.split(",") if item.strip()]


	def _write_snapshot(out_path: Path, payload: dict[str, Any]) -> None:
	    """Write *payload* as JSON to *out_path* via a temp file plus rename.

	    Writing in place would leave a truncated report if the process died
	    mid-write; renaming a fully written sibling file avoids that (the rename
	    is atomic on POSIX filesystems).
	    """
	    tmp_path = out_path.with_name(out_path.name + ".tmp")
	    tmp_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
	    tmp_path.replace(out_path)


	def main() -> int:
	    """Retry baseline failures through the Perplexity bridge and write a voting report.

	    Reads the baseline report given by ``--baseline``, keeps the records whose
	    ``match`` is falsy, optionally narrows them with ``--only-qids`` /
	    ``--skip-qids`` / ``--max-cases``, runs the pipeline for each remaining
	    question, executes the produced SQL and the gold SQL, and compares the rows.
	    The report at ``--out`` is rewritten after every completed record.

	    Returns:
	        0 when the run finishes (including when there is nothing to retry),
	        3 when ``--only-qids`` / ``--skip-qids`` is malformed or names a qid
	        that is not a baseline failure.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    parser.add_argument("--baseline", type=Path, required=True)
	    parser.add_argument("--bird-root", type=Path, default=Path("data/bird_mini_dev/MINIDEV"))
	    parser.add_argument("--out", type=Path, required=True)
	    parser.add_argument("--max-cases", type=int, default=200)
	    parser.add_argument("--skip-qids", default="")
	    parser.add_argument(
	        "--only-qids",
	        default="",
	        help="comma-separated baseline failure qids to retry exactly, preserving argument order",
	    )
	    parser.add_argument("--model", default="claude-sonnet-4-6")
	    args = parser.parse_args()

	    baseline = json.loads(args.baseline.read_text(encoding="utf-8"))
	    fails = [r for r in baseline["records"] if not r.get("match")]

	    try:
	        only_qids = _parse_qid_list(args.only_qids)
	    except ValueError:
	        print("[error] invalid --only-qids: expected comma-separated integers", file=sys.stderr)
	        return 3
	    if only_qids:
	        fails_by_qid = {int(r["question_id"]): r for r in fails}
	        missing_qids = [qid for qid in only_qids if qid not in fails_by_qid]
	        if missing_qids:
	            print(f"[error] qids not found in baseline failures: {missing_qids}", file=sys.stderr)
	            return 3
	        fails = [fails_by_qid[qid] for qid in only_qids]

	    # Same validation as --only-qids, so a typo yields exit code 3 rather than a traceback.
	    try:
	        skip = set(_parse_qid_list(args.skip_qids))
	    except ValueError:
	        print("[error] invalid --skip-qids: expected comma-separated integers", file=sys.stderr)
	        return 3
	    if skip:
	        # Normalise to int exactly like the --only-qids lookup above, so ids
	        # stored as strings in the baseline are still skipped.
	        fails = [r for r in fails if int(r["question_id"]) not in skip]
	    fails = fails[: args.max_cases]

	    print(f"[info] {len(fails)} failures to retry with {args.model}", file=sys.stderr)
	    if not fails:
	        return 0

	    settings = get_settings()
	    examples = {e.question_id: e for e in load_bird_mini_dev(args.bird_root)}
	    registry = get_default_registry()
	    sonnet = PerplexityProvider(model=args.model, timeout_seconds=180.0)
	    emb = CachingEmbeddingProvider(
	        MistralProvider(api_key=settings.mistral_api_key), cache_dir=settings.llm_cache_dir
	    )
	    idx = SchemaIndex(persist_dir="chroma_data", embedder=emb)

	    cfg = PipelineConfig(
	        sql_provider=sonnet,
	        explain_provider=sonnet,
	        schema_index=idx,
	        registry=registry,
	        fewshot_top_k=3,
	        sort_schema_block=True,
	        cross_db_fewshot=True,
	        verify_retry_on_empty=False,
	        enable_grounded_critique=False,
	    )
	    pipeline = build_pipeline(cfg)

	    records: list[dict[str, Any]] = []
	    rescued = 0
	    regressed = 0
	    same = 0
	    out_path = args.out
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    total = len(fails)

	    for i, br in enumerate(fails, 1):
	        qid = br["question_id"]
	        ex = examples.get(qid)
	        if ex is None:
	            # Previously skipped silently; say so, so a short report is explainable.
	            print(
	                f"[{i:3d}/{total}] qid={qid} SKIP: not found in the loaded examples",
	                file=sys.stderr,
	            )
	            continue
	        spec = registry.get(ex.registry_db_id)
	        engine = spec.make_engine()
	        try:
	            t0 = time.perf_counter()
	            try:
	                alt = run_pipeline(
	                    pipeline,
	                    question=_compose_question(ex),
	                    db_id=ex.registry_db_id,
	                    dialect="sqlite",
	                )
	            except Exception as exc:
	                print(f"[{i:3d}/{total}] qid={qid} EXC: {str(exc)[:150]}", file=sys.stderr)
	                continue
	            elapsed = (time.perf_counter() - t0) * 1000.0

	            alt_sql = alt.sql or ""
	            alt_rows: list[Any] = []
	            alt_ok = True
	            try:
	                outcome = execute_validated(
	                    engine,
	                    alt_sql,
	                    dialect="sqlite",
	                    statement_timeout_ms=30_000,
	                    row_cap=10_000,
	                )
	                if outcome.result:
	                    alt_rows = list(outcome.result.rows)
	            except Exception as exc:
	                # Best-effort: keep going, but remember the failure and report it.
	                alt_ok = False
	                print(
	                    f"[{i:3d}/{total}] qid={qid} alt SQL failed: {str(exc)[:150]}",
	                    file=sys.stderr,
	                )
	            try:
	                gold_rows, _ = _execute_gold(
	                    engine, ex.sql, statement_timeout_ms=30_000, row_cap=10_000
	                )
	            except Exception as exc:
	                gold_rows = []
	                print(
	                    f"[{i:3d}/{total}] qid={qid} gold SQL failed: {str(exc)[:150]}",
	                    file=sys.stderr,
	                )

	            # An alt query that raised left alt_rows empty; never score that as
	            # a match, even if the gold result happens to be empty too.
	            alt_match = alt_ok and bool(
	                compare_results(gold_rows, alt_rows, gold_sql=ex.sql).match
	            )

	            baseline_match = bool(br.get("match"))
	            if alt_match and not baseline_match:
	                rescued += 1
	                tag = "RESCUE"
	            elif baseline_match and not alt_match:
	                # Cannot trigger while `fails` only holds non-matching records;
	                # kept so the summary stays correct if that filter changes.
	                regressed += 1
	                tag = "regression"
	            else:
	                same += 1
	                tag = "same"

	            records.append(
	                {
	                    "question_id": qid,
	                    "db_id": ex.db_id,
	                    "difficulty": ex.difficulty,
	                    "question": ex.question,
	                    "gold_sql": ex.sql,
	                    "baseline_pred": br["pred_sql"],
	                    "alt_pred": alt_sql,
	                    "alt_confidence": getattr(alt, "confidence", None),
	                    "baseline_match": baseline_match,
	                    "alt_match": alt_match,
	                    "vote_match": alt_match,
	                    "vote_source": "sonnet-bridge",
	                    "elapsed_ms": elapsed,
	                }
	            )
	            print(
	                f"[{i:3d}/{total}] qid={qid} {ex.difficulty:11s} {tag} ({elapsed / 1000:.1f}s)",
	                file=sys.stderr,
	            )

	            # Snapshot after every record — browser bridge is slow and may
	            # die mid-run. We don't want to lose progress.
	            _write_snapshot(
	                out_path,
	                {
	                    "alt_model": f"perplexity:{args.model}",
	                    "summary": {
	                        "voted_better": rescued,
	                        "voted_worse": regressed,
	                        "voted_same": same,
	                    },
	                    "records": records,
	                },
	            )
	        finally:
	            # Runs on the `continue` path as well, so every engine is released.
	            engine.dispose()

	    print("\n=== sonnet-bridge summary ===", file=sys.stderr)
	    print(f" cases: {len(records)}", file=sys.stderr)
	    print(f" rescued: {rescued}", file=sys.stderr)
	    print(f" regressed: {regressed}", file=sys.stderr)
	    print(f" same: {same}", file=sys.stderr)
	    return 0


	# Script entry point: hand main()'s return value to the interpreter as the exit status.
	if __name__ == "__main__":
	    sys.exit(main())