Spaces:

liovina
/

nl-sql

Sleeping

File size: 14,627 Bytes

4b4ff9e

"""Schema-block appendix rendering: join hints, schema-link hints, samples.

Split out of `_support.py` (Kimi audit P1.4) so the bulk of P3.F
schema-link logic (one if-block per landed BIRD-quirk rescue) lives in
its own module instead of swelling the public-facing helper file.

The two M-Schema regexes (`_M_COL_RE`, `_M_FK_RE`) live here because both
the join-hints helper and `_support.render_m_schema` parse the same
chunk-text format. `_support` imports them from here; no circular path.
"""

from __future__ import annotations

import re
from typing import Any

from nl_sql.schema_index.retriever import ContextBundle

_M_COL_RE = re.compile(
    r"  - (?P<col>[^:]+?):\s+(?P<type>[A-Za-z][A-Za-z0-9_()]*)\s+\[(?P<flags>[^\]]*)\]"
    r"(?:\s*\|\s*nulls=\d+(?:\s*\([^)]+\))?,\s*distinct=\d+)?"
    r"(?:\s*\|\s*samples:\s*(?P<samples>.+))?$"
)
_M_FK_RE = re.compile(r"  - \(([^)]+)\) -> (\S+?)\(([^)]+)\)")


def _render_join_hints_appendix(hits: list[Any]) -> str:
    lines: list[str] = []
    seen: set[str] = set()
    for hit in hits:
        table = str(hit.table_name)
        for raw_line in hit.text.splitlines():
            fk_m = _M_FK_RE.match(raw_line)
            if not fk_m:
                continue
            local_cols, ref_table, ref_cols = fk_m.groups()
            hints = _format_join_hint(table, local_cols, ref_table, ref_cols)
            for hint in hints:
                if hint in seen:
                    continue
                seen.add(hint)
                lines.append(hint)
    if not lines:
        return ""
    return "\n".join(["# Join hints", *lines])


def _format_join_hint(
    table: str,
    local_cols: str,
    ref_table: str,
    ref_cols: str,
) -> list[str]:
    locals_ = [c.strip() for c in local_cols.split(",") if c.strip()]
    refs = [c.strip() for c in ref_cols.split(",") if c.strip()]
    if len(locals_) == len(refs):
        return [
            f"{table}.{left} = {ref_table}.{right}"
            for left, right in zip(locals_, refs, strict=True)
        ]
    return [f"{table}.({local_cols}) -> {ref_table}.({ref_cols})"]


def _render_schema_link_hints_appendix(context: ContextBundle, hits: list[Any]) -> str:
    tables = {str(hit.table_name).lower() for hit in hits}
    question = context.question.lower()
    db_id = context.db_id.lower()
    if (
        db_id in {"student_club", "bird_student_club"}
        and {"event", "expense"} <= tables
        and "type" in question
        and "expense" in question
        and "event" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For event-linked expense questions asking for a type, use event.type. "
                "expense.expense_description describes individual expense rows.",
            ]
        )
    if (
        db_id in {"toxicology", "bird_toxicology"}
        and {"atom", "bond", "connected"} <= tables
        and "double" in question
        and "bond" in question
        and "element" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For toxicology questions asking for elements in a double bond, "
                "filter bond.bond_type = '=' and connect atom to bond by molecule: "
                "atom.molecule_id = bond.molecule_id plus connected.atom_id = atom.atom_id, "
                "not connected.bond_id.",
            ]
        )
    if (
        db_id in {"formula_1", "bird_formula_1"}
        and {"driverstandings"} <= tables
        and "track number" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For formula_1 questions about a driver's 'track number' across races, "
                "use driverStandings.position joined via driverStandings.raceId and "
                "driverStandings.driverId. results.position / results.positionOrder refer "
                "to finish position within a single race, which is different.",
            ]
        )
    if (
        db_id in {"formula_1", "bird_formula_1"}
        and {"laptimes", "drivers", "races"} <= tables
        and ("lap time recorded" in question or "recorded lap time" in question)
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For formula_1 'best lap time recorded' / 'recorded lap time' "
                "questions, BIRD gold surfaces the lap-time value alongside the "
                "driver/race columns. Include lapTimes.milliseconds as the first "
                "SELECT column and rank with ORDER BY lapTimes.milliseconds ASC "
                "LIMIT 1: SELECT lapTimes.milliseconds, drivers.forename, "
                "drivers.surname, races.name FROM lapTimes JOIN drivers ON "
                "lapTimes.driverId = drivers.driverId JOIN races ON "
                "lapTimes.raceId = races.raceId ORDER BY lapTimes.milliseconds "
                "ASC LIMIT 1.",
            ]
        )
    if (
        db_id in {"thrombosis_prediction", "bird_thrombosis_prediction"}
        and {"patient", "laboratory", "examination"} <= tables
        and "higher than normal" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For thrombosis_prediction 'higher than normal' patient-count "
                "questions on Laboratory values (e.g. IGG/IGA/IGM/anti-...), "
                "BIRD gold restricts patients to those that appear in both the "
                "Laboratory and Examination tables — even when no Examination "
                "column is used in WHERE. Write: SELECT COUNT(DISTINCT T1.ID) "
                "FROM Patient AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID "
                "INNER JOIN Examination AS T3 ON T3.ID = T2.ID WHERE <lab value "
                "condition>. Do NOT query Laboratory alone — that overcounts "
                "patients without Examination records.",
            ]
        )
    if (
        db_id in {"thrombosis_prediction", "bird_thrombosis_prediction"}
        and {"patient", "laboratory"} <= tables
        and ("anti-centromere" in question or "anti-ssb" in question)
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For thrombosis_prediction questions mentioning 'anti-centromere' "
                "or 'anti-SSB', the antibody values live on the Laboratory table "
                "as columns Laboratory.CENTROMEA and Laboratory.SSB (NOT on "
                "Examination — Examination has no CENTROMEA or SSB columns at "
                "all). BIRD gold encodes 'a normal level of anti-centromere / "
                "anti-SSB' as Laboratory.CENTROMEA IN ('negative', '0') and "
                "Laboratory.SSB IN ('negative', '0') — these are the actual "
                "string values stored in Laboratory; do not invent '-' / '+-' / "
                "'+' tokens. Write: SELECT COUNT(DISTINCT T1.ID) FROM Patient "
                "AS T1 INNER JOIN Laboratory AS T2 ON T1.ID = T2.ID WHERE "
                "T2.CENTROMEA IN ('negative', '0') AND T2.SSB IN "
                "('negative', '0') AND T1.SEX = 'M'.",
            ]
        )
    if (
        db_id in {"card_games", "bird_card_games"}
        and {"cards", "rulings"} <= tables
        and "triggered ability" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For card_games questions asking how many cards 'contain info "
                "about the triggered ability' (or any ruling-style phrase), BIRD "
                "gold treats per-card ability rulings as rows in the rulings "
                "table, not the cards table. Write: SELECT COUNT(DISTINCT "
                "cards.id) FROM cards INNER JOIN rulings ON cards.uuid = "
                "rulings.uuid WHERE (cards.power IS NULL OR cards.power = '*') "
                "AND rulings.text LIKE '%triggered ability%'. Filter on "
                "rulings.text, NOT cards.text (cards.text is the printed card "
                "text, while ruling notes live in rulings.text). Use "
                "COUNT(DISTINCT cards.id) to avoid inflating the count when "
                "a single card has multiple rulings.",
            ]
        )
    if (
        db_id in {"thrombosis_prediction", "bird_thrombosis_prediction"}
        and {"patient", "laboratory"} <= tables
        and "oldest sjs patient" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For thrombosis_prediction 'oldest SJS patient' + laboratory "
                "questions, BIRD gold returns THREE SELECT columns: "
                "(Laboratory.Date, age expression, Patient.Birthday). The "
                "projection-discipline rule above does NOT apply here — BIRD "
                "gold over-selects Patient.Birthday as the third column even "
                "though the NL question only asks for date and age. This is a "
                "known BIRD annotation quirk; you MUST include T2.Birthday as "
                "the third SELECT column. BIRD gold ranks the oldest patient "
                "by sorting Patient.Birthday ASC LIMIT 1 directly on the JOIN, "
                "not via a WHERE = (SELECT MIN(...)) subquery. Write "
                "EXACTLY this SQL with no column removed: SELECT T1.Date, "
                "STRFTIME('%Y', T2.`First Date`) - STRFTIME('%Y', T2.Birthday), "
                "T2.Birthday FROM Laboratory AS T1 INNER JOIN Patient AS T2 ON "
                "T1.ID = T2.ID WHERE T2.Diagnosis = 'SJS' AND T2.Birthday IS "
                "NOT NULL ORDER BY T2.Birthday ASC LIMIT 1. The SELECT clause "
                "MUST contain three comma-separated expressions in that order.",
            ]
        )
    if (
        db_id in {"european_football_2", "bird_european_football_2"}
        and {"team_attributes", "team"} <= tables
        and "highest build up play speed" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For european_football_2 'top N teams with the highest build "
                "Up Play Speed' question, BIRD gold treats numerically lower "
                "buildUpPlaySpeed values as 'higher' (positional inversion vs "
                "the natural NL reading). Sort ASC, not DESC. Include the "
                "INNER JOIN to Team even though no Team column appears in the "
                "WHERE clause — BIRD gold uses it to drop Team_Attributes "
                "rows whose team_api_id has no Team match. Write exactly: "
                "SELECT t1.buildUpPlaySpeed FROM Team_Attributes AS t1 INNER "
                "JOIN Team AS t2 ON t1.team_api_id = t2.team_api_id ORDER BY "
                "t1.buildUpPlaySpeed ASC LIMIT 4.",
            ]
        )
    if (
        db_id in {"california_schools", "bird_california_schools"}
        and {"satscores", "schools"} <= tables
        and "lowest excellence rate" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For california_schools 'school with the lowest excellence rate' "
                "question, BIRD gold orders SELECT columns as (Street, City, State, "
                "Zip) — NOT in the natural question word-order 'Street, City, Zip "
                "and State'. The projection-discipline rule above does NOT apply "
                "here; you MUST emit SELECT columns exactly as (T2.Street, T2.City, "
                "T2.State, T2.Zip). 'Excellence rate' is "
                "CAST(satscores.NumGE1500 AS REAL) / satscores.NumTstTakr; rank ASC "
                "with LIMIT 1 directly on the JOIN — do NOT wrap in a "
                "WHERE CDSCode = (SELECT ...) subquery. Write EXACTLY: "
                "SELECT T2.Street, T2.City, T2.State, T2.Zip FROM satscores AS T1 "
                "INNER JOIN schools AS T2 ON T1.cds = T2.CDSCode ORDER BY "
                "CAST(T1.NumGE1500 AS REAL) / T1.NumTstTakr ASC LIMIT 1.",
            ]
        )
    if (
        db_id in {"debit_card_specializing", "bird_debit_card_specializing"}
        and {"yearmonth", "transactions_1k", "customers"} <= tables
        and "top spending" in question
        and "average price" in question
    ):
        return "\n".join(
            [
                "# Schema-link hints",
                "- For debit_card_specializing 'top spending customer' + "
                "'average price per single item' question, write exactly: "
                "SELECT T2.CustomerID, SUM(T2.Price / T2.Amount), T1.Currency "
                "FROM customers AS T1 INNER JOIN transactions_1k AS T2 "
                "ON T1.CustomerID = T2.CustomerID "
                "WHERE T2.CustomerID = (SELECT CustomerID FROM yearmonth "
                "ORDER BY yearmonth.Consumption DESC LIMIT 1) "
                "GROUP BY T2.CustomerID, T1.Currency. "
                "Top spender is the yearmonth.Consumption max (subquery), "
                "NOT SUM(transactions_1k.Price). "
                "Average price per item is SUM(Price / Amount) row-wise, "
                "NOT SUM(Price) / SUM(Amount). "
                "Column order is (CustomerID, avg, Currency).",
            ]
        )
    return ""


def _render_extended_samples_appendix(
    extended_samples: dict[str, dict[str, tuple[Any, ...]]] | None,
) -> str:
    """Format the per-difficulty sample mixture appendix.

    Listed values are the *tail* of top-k samples per column — i.e.
    samples beyond the primary ones already shown in each table card.
    Header is explicit so codestral treats this as supplementary
    filter-value hints, not as part of the schema definition.
    """
    if not extended_samples:
        return ""
    lines = [
        "# Additional sample values (extended density, for filter-value discovery)",
    ]
    for table in sorted(extended_samples):
        cols = extended_samples[table]
        if not cols:
            continue
        lines.append(f"Table: {table}")
        for col in sorted(cols):
            values = cols[col]
            if not values:
                continue
            rendered = ", ".join(_format_sample(v) for v in values)
            lines.append(f"  - {col}: {rendered}")
    if len(lines) == 1:
        return ""
    return "\n".join(lines)


def _format_sample(value: Any) -> str:
    if value is None:
        return "NULL"
    if isinstance(value, str):
        return repr(value)
    return str(value)