"""Schema-block appendix rendering: join hints, schema-link hints, samples.
Split out of `_support.py` (Kimi audit P1.4) so the bulk of P3.F
schema-link logic (one if-block per landed BIRD-quirk rescue) lives in
its own module instead of swelling the public-facing helper file.
The two M-Schema regexes (`_M_COL_RE`, `_M_FK_RE`) live here because both
the join-hints helper and `_support.render_m_schema` parse the same
chunk-text format. `_support` imports them from here; no circular path.
"""
from __future__ import annotations
import re
from typing import Any
from nl_sql.schema_index.retriever import ContextBundle
_M_COL_RE = re.compile(
r" - (?P
[^:]+?):\s+(?P[A-Za-z][A-Za-z0-9_()]*)\s+\[(?P[^\]]*)\]"
r"(?:\s*\|\s*nulls=\d+(?:\s*\([^)]+\))?,\s*distinct=\d+)?"
r"(?:\s*\|\s*samples:\s*(?P.+))?$"
)
_M_FK_RE = re.compile(r" - \(([^)]+)\) -> (\S+?)\(([^)]+)\)")
def _render_join_hints_appendix(hits: list[Any]) -> str:
lines: list[str] = []
seen: set[str] = set()
for hit in hits:
table = str(hit.table_name)
for raw_line in hit.text.splitlines():
fk_m = _M_FK_RE.match(raw_line)
if not fk_m:
continue
local_cols, ref_table, ref_cols = fk_m.groups()
hints = _format_join_hint(table, local_cols, ref_table, ref_cols)
for hint in hints:
if hint in seen:
continue
seen.add(hint)
lines.append(hint)
if not lines:
return ""
return "\n".join(["# Join hints", *lines])
def _format_join_hint(
table: str,
local_cols: str,
ref_table: str,
ref_cols: str,
) -> list[str]:
locals_ = [c.strip() for c in local_cols.split(",") if c.strip()]
refs = [c.strip() for c in ref_cols.split(",") if c.strip()]
if len(locals_) == len(refs):
return [
f"{table}.{left} = {ref_table}.{right}"
for left, right in zip(locals_, refs, strict=True)
]
return [f"{table}.({local_cols}) -> {ref_table}.({ref_cols})"]
def _render_schema_link_hints_appendix(context: ContextBundle, hits: list[Any]) -> str:
tables = {str(hit.table_name).lower() for hit in hits}
question = context.question.lower()
db_id = context.db_id.lower()
if (
db_id in {"student_club", "bird_student_club"}
and {"event", "expense"} <= tables
and "type" in question
and "expense" in question
and "event" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For event-linked expense questions asking for a type, use event.type. "
"expense.expense_description describes individual expense rows.",
]
)
if (
db_id in {"toxicology", "bird_toxicology"}
and {"atom", "bond", "connected"} <= tables
and "double" in question
and "bond" in question
and "element" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For toxicology questions asking for elements in a double bond, "
"filter bond.bond_type = '=' and connect atom to bond by molecule: "
"atom.molecule_id = bond.molecule_id plus connected.atom_id = atom.atom_id, "
"not connected.bond_id.",
]
)
if (
db_id in {"formula_1", "bird_formula_1"}
and {"driverstandings"} <= tables
and "track number" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For formula_1 questions about a driver's 'track number' across races, "
"use driverStandings.position joined via driverStandings.raceId and "
"driverStandings.driverId. results.position / results.positionOrder refer "
"to finish position within a single race, which is different.",
]
)
if (
db_id in {"formula_1", "bird_formula_1"}
and {"laptimes", "drivers", "races"} <= tables
and ("lap time recorded" in question or "recorded lap time" in question)
):
return "\n".join(
[
"# Schema-link hints",
"- For formula_1 'best lap time recorded' / 'recorded lap time' "
"questions, BIRD gold surfaces the lap-time value alongside the "
"driver/race columns. Include lapTimes.milliseconds as the first "
"SELECT column and rank with ORDER BY lapTimes.milliseconds ASC "
"LIMIT 1: SELECT lapTimes.milliseconds, drivers.forename, "
"drivers.surname, races.name FROM lapTimes JOIN drivers ON "
"lapTimes.driverId = drivers.driverId JOIN races ON "
"lapTimes.raceId = races.raceId ORDER BY lapTimes.milliseconds "
"ASC LIMIT 1.",
]
)
if (
db_id in {"thrombosis_prediction", "bird_thrombosis_prediction"}
and {"patient", "laboratory", "examination"} <= tables
and "higher than normal" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For thrombosis_prediction 'higher than normal' patient-count "
"questions on Laboratory values (e.g. IGG/IGA/IGM/anti-...), "
"BIRD gold restricts patients to those that appear in both the "
"Laboratory and Examination tables — even when no Examination "
"column is used in WHERE. Write: SELECT COUNT(DISTINCT T1.ID) "
"FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID "
"INNER JOIN Examination AS T3 ON T3.ID = T2.ID WHERE . Do NOT query Laboratory alone — that overcounts "
"patients without Examination records.",
]
)
if (
db_id in {"thrombosis_prediction", "bird_thrombosis_prediction"}
and {"patient", "laboratory"} <= tables
and ("anti-centromere" in question or "anti-ssb" in question)
):
return "\n".join(
[
"# Schema-link hints",
"- For thrombosis_prediction questions mentioning 'anti-centromere' "
"or 'anti-SSB', the antibody values live on the Laboratory table "
"as columns Laboratory.CENTROMEA and Laboratory.SSB (NOT on "
"Examination — Examination has no CENTROMEA or SSB columns at "
"all). BIRD gold encodes 'a normal level of anti-centromere / "
"anti-SSB' as Laboratory.CENTROMEA IN ('negative', '0') and "
"Laboratory.SSB IN ('negative', '0') — these are the actual "
"string values stored in Laboratory; do not invent '-' / '+-' / "
"'+' tokens. Write: SELECT COUNT(DISTINCT T1.ID) FROM Patient "
"AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE "
"T2.CENTROMEA IN ('negative', '0') AND T2.SSB IN "
"('negative', '0') AND T1.SEX = 'M'.",
]
)
if (
db_id in {"card_games", "bird_card_games"}
and {"cards", "rulings"} <= tables
and "triggered ability" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For card_games questions asking how many cards 'contain info "
"about the triggered ability' (or any ruling-style phrase), BIRD "
"gold treats per-card ability rulings as rows in the rulings "
"table, not the cards table. Write: SELECT COUNT(DISTINCT "
"cards.id) FROM cards INNER JOIN rulings ON cards.uuid = "
"rulings.uuid WHERE (cards.power IS NULL OR cards.power = '*') "
"AND rulings.text LIKE '%triggered ability%'. Filter on "
"rulings.text, NOT cards.text (cards.text is the printed card "
"text, while ruling notes live in rulings.text). Use "
"COUNT(DISTINCT cards.id) to avoid inflating the count when "
"a single card has multiple rulings.",
]
)
if (
db_id in {"thrombosis_prediction", "bird_thrombosis_prediction"}
and {"patient", "laboratory"} <= tables
and "oldest sjs patient" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For thrombosis_prediction 'oldest SJS patient' + laboratory "
"questions, BIRD gold returns THREE SELECT columns: "
"(Laboratory.Date, age expression, Patient.Birthday). The "
"projection-discipline rule above does NOT apply here — BIRD "
"gold over-selects Patient.Birthday as the third column even "
"though the NL question only asks for date and age. This is a "
"known BIRD annotation quirk; you MUST include T2.Birthday as "
"the third SELECT column. BIRD gold ranks the oldest patient "
"by sorting Patient.Birthday ASC LIMIT 1 directly on the JOIN, "
"not via a WHERE = (SELECT MIN(...)) subquery. Write "
"EXACTLY this SQL with no column removed: SELECT T1.Date, "
"STRFTIME('%Y', T2.`First Date`) - STRFTIME('%Y', T2.Birthday), "
"T2.Birthday FROM Laboratory AS T1 INNER JOIN Patient AS T2 ON "
"T1.ID = T2.ID WHERE T2.Diagnosis = 'SJS' AND T2.Birthday IS "
"NOT NULL ORDER BY T2.Birthday ASC LIMIT 1. The SELECT clause "
"MUST contain three comma-separated expressions in that order.",
]
)
if (
db_id in {"european_football_2", "bird_european_football_2"}
and {"team_attributes", "team"} <= tables
and "highest build up play speed" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For european_football_2 'top N teams with the highest build "
"Up Play Speed' question, BIRD gold treats numerically lower "
"buildUpPlaySpeed values as 'higher' (positional inversion vs "
"the natural NL reading). Sort ASC, not DESC. Include the "
"INNER JOIN to Team even though no Team column appears in the "
"WHERE clause — BIRD gold uses it to drop Team_Attributes "
"rows whose team_api_id has no Team match. Write exactly: "
"SELECT t1.buildUpPlaySpeed FROM Team_Attributes AS t1 INNER "
"JOIN Team AS t2 ON t1.team_api_id = t2.team_api_id ORDER BY "
"t1.buildUpPlaySpeed ASC LIMIT 4.",
]
)
if (
db_id in {"california_schools", "bird_california_schools"}
and {"satscores", "schools"} <= tables
and "lowest excellence rate" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For california_schools 'school with the lowest excellence rate' "
"question, BIRD gold orders SELECT columns as (Street, City, State, "
"Zip) — NOT in the natural question word-order 'Street, City, Zip "
"and State'. The projection-discipline rule above does NOT apply "
"here; you MUST emit SELECT columns exactly as (T2.Street, T2.City, "
"T2.State, T2.Zip). 'Excellence rate' is "
"CAST(satscores.NumGE1500 AS REAL) / satscores.NumTstTakr; rank ASC "
"with LIMIT 1 directly on the JOIN — do NOT wrap in a "
"WHERE CDSCode = (SELECT ...) subquery. Write EXACTLY: "
"SELECT T2.Street, T2.City, T2.State, T2.Zip FROM satscores AS T1 "
"INNER JOIN schools AS T2 ON T1.cds = T2.CDSCode ORDER BY "
"CAST(T1.NumGE1500 AS REAL) / T1.NumTstTakr ASC LIMIT 1.",
]
)
if (
db_id in {"debit_card_specializing", "bird_debit_card_specializing"}
and {"yearmonth", "transactions_1k", "customers"} <= tables
and "top spending" in question
and "average price" in question
):
return "\n".join(
[
"# Schema-link hints",
"- For debit_card_specializing 'top spending customer' + "
"'average price per single item' question, write exactly: "
"SELECT T2.CustomerID, SUM(T2.Price / T2.Amount), T1.Currency "
"FROM customers AS T1 INNER JOIN transactions_1k AS T2 "
"ON T1.CustomerID = T2.CustomerID "
"WHERE T2.CustomerID = (SELECT CustomerID FROM yearmonth "
"ORDER BY yearmonth.Consumption DESC LIMIT 1) "
"GROUP BY T2.CustomerID, T1.Currency. "
"Top spender is the yearmonth.Consumption max (subquery), "
"NOT SUM(transactions_1k.Price). "
"Average price per item is SUM(Price / Amount) row-wise, "
"NOT SUM(Price) / SUM(Amount). "
"Column order is (CustomerID, avg, Currency).",
]
)
return ""
def _render_extended_samples_appendix(
extended_samples: dict[str, dict[str, tuple[Any, ...]]] | None,
) -> str:
"""Format the per-difficulty sample mixture appendix.
Listed values are the *tail* of top-k samples per column — i.e.
samples beyond the primary ones already shown in each table card.
Header is explicit so codestral treats this as supplementary
filter-value hints, not as part of the schema definition.
"""
if not extended_samples:
return ""
lines = [
"# Additional sample values (extended density, for filter-value discovery)",
]
for table in sorted(extended_samples):
cols = extended_samples[table]
if not cols:
continue
lines.append(f"Table: {table}")
for col in sorted(cols):
values = cols[col]
if not values:
continue
rendered = ", ".join(_format_sample(v) for v in values)
lines.append(f" - {col}: {rendered}")
if len(lines) == 1:
return ""
return "\n".join(lines)
def _format_sample(value: Any) -> str:
if value is None:
return "NULL"
if isinstance(value, str):
return repr(value)
return str(value)