File size: 6,926 Bytes
942050b 4b4ff9e 942050b 4b4ff9e 942050b 06ed757 942050b d48602c 942050b d48602c 942050b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | """Shared helpers used by multiple nodes.
Public surface (imported by `generate_sql`, `repair_once`, `plan_query`,
`eval.runner`, `tests.test_agent_support`, `scripts.wider_sc_poc`,
`tests.agent.nodes.test_schema_link_hints`):
- `parse_generate_sql_output` β robust JSON-parsing of LLM output
- `render_m_schema` β XiYan-style compact schema rendering
- `render_schema_block` β full schema-card block with hint appendices
- `render_fewshot_block` β few-shot example rendering
Internal helpers are split into two sibling modules (Kimi audit P1.4):
- `_text_utils` β JSON-fence stripping, safe-loads, NaN-safe float coerce,
best-effort SELECT extraction. Used only by `parse_generate_sql_output`.
- `_hints` β M-Schema regexes (`_M_COL_RE`, `_M_FK_RE`), join-hints
appendix, schema-link hints (one if-block per landed P3.F rescue),
extended-samples appendix. Used by `render_m_schema` and
`render_schema_block`.
Both sibling modules import nothing from this file β no circular paths.
"""
from __future__ import annotations
import re
from nl_sql.agent.nodes._hints import (
_M_COL_RE,
_M_FK_RE,
_render_extended_samples_appendix,
_render_join_hints_appendix,
_render_schema_link_hints_appendix,
)
from nl_sql.agent.nodes._text_utils import (
_coerce_float,
_safe_loads,
_strip_code_fence,
_strip_to_sql,
)
from nl_sql.agent.state import GenerateSQLOutput
from nl_sql.schema_index.retriever import ContextBundle
def parse_generate_sql_output(text: str) -> GenerateSQLOutput:
"""Parse the LLM's JSON response into a GenerateSQLOutput.
Handles common deviations: markdown fences, trailing prose, single-quoted
keys (some local models do this). Falls back to extracting the longest
SQL substring if JSON is unrecoverable β confidence drops to 0.
"""
raw = (text or "").strip()
candidate = _strip_code_fence(raw)
parsed = _safe_loads(candidate)
if parsed is None:
# Last-ditch: find the first {...} block anywhere in the text.
match = re.search(r"\{[\s\S]*\}", raw)
if match:
parsed = _safe_loads(match.group(0))
if not isinstance(parsed, dict):
return GenerateSQLOutput(
sql=_strip_to_sql(raw),
rationale="",
tables_used=(),
confidence=0.0,
raw_text=raw,
)
sql = str(parsed.get("sql") or "").strip().rstrip(";")
rationale = str(parsed.get("rationale") or "")
tables = parsed.get("tables_used") or ()
tables_used = tuple(str(t) for t in tables) if isinstance(tables, list) else ()
confidence = _coerce_float(parsed.get("confidence"), default=0.0)
return GenerateSQLOutput(
sql=sql,
rationale=rationale,
tables_used=tables_used,
confidence=confidence,
raw_text=raw,
)
def render_m_schema(context: ContextBundle | None) -> str:
"""Compact M-Schema rendering (XiYan-SQL style) parsed from chunk text.
Replaces verbose table-card dump with: ``table.column (type) [samples]``
per line plus a trailing FK block. Reduces tokens by ~60% and surfaces
FK pairs as first-class signal next to columns instead of buried inside
multi-section cards.
"""
if context is None:
return "(no schema context)"
all_hits = list(context.schema_hits) + list(context.fk_neighbours)
all_hits.sort(key=lambda h: h.table_name.lower())
if not all_hits:
return "(no tables matched)"
col_lines: list[str] = []
fk_lines: list[str] = []
for hit in all_hits:
table = hit.table_name
for raw_line in hit.text.splitlines():
m = _M_COL_RE.match(raw_line)
if m:
col = m.group("col").strip()
col_type = m.group("type")
flags = (m.group("flags") or "").strip()
samples = (m.group("samples") or "").strip()
pk = "PK" in flags.split()
parts = [f"{table}.{col} ({col_type})"]
if pk:
parts.append("[PK]")
if samples:
parts.append(f"[{samples}]")
col_lines.append(" ".join(parts))
continue
fk_m = _M_FK_RE.match(raw_line)
if fk_m:
local_cols, ref_table, ref_cols = fk_m.groups()
fk_lines.append(f"{table}.({local_cols}) -> {ref_table}.({ref_cols})")
blocks: list[str] = ["# Columns", *col_lines] if col_lines else ["(no columns parsed)"]
if fk_lines:
blocks.append("\n# Foreign keys")
blocks.extend(fk_lines)
appendix = _render_extended_samples_appendix(context.extended_samples)
if appendix:
blocks.append(appendix)
return "\n".join(blocks)
def render_schema_block(
context: ContextBundle | None,
*,
sort_alphabetically: bool = False,
) -> str:
"""Render schema chunks + FK neighbours into a single text block.
Order: top-k dense hits first, FK-extended neighbours after. Empty bundle
yields a placeholder so prompt formatting still works.
`sort_alphabetically=True` overrides retrieval order and renders all
tables (dense hits + FK neighbours together) in alphabetical-by-table-name
order. The "FK-related tables" header is omitted in this mode because
the partition no longer exists. Empirically codestral is more accurate
when the schema block matches the alphabetical baseline order produced
by SQLAlchemy's `inspect()` β see docs/SESSION_HANDOFF.md (column-
ordering experiment).
"""
if context is None:
return "(no schema context)"
blocks: list[str] = []
all_hits = list(context.schema_hits) + list(context.fk_neighbours)
if sort_alphabetically:
all_hits.sort(key=lambda h: h.table_name.lower())
blocks.extend(hit.text for hit in all_hits)
else:
blocks.extend(hit.text for hit in context.schema_hits)
if context.fk_neighbours:
blocks.append("# FK-related tables")
blocks.extend(hit.text for hit in context.fk_neighbours)
if not blocks:
return "(no tables matched)"
join_hints = _render_join_hints_appendix(all_hits)
if join_hints:
blocks.append(join_hints)
schema_link_hints = _render_schema_link_hints_appendix(context, all_hits)
if schema_link_hints:
blocks.append(schema_link_hints)
appendix = _render_extended_samples_appendix(context.extended_samples)
if appendix:
blocks.append(appendix)
return "\n\n".join(blocks)
def render_fewshot_block(context: ContextBundle | None) -> str:
if context is None or not context.fewshots:
return "(none)"
lines: list[str] = []
for ex in context.fewshots:
lines.append(f"Q: {ex.question}")
lines.append(f"SQL: {ex.sql}")
lines.append("")
return "\n".join(lines).rstrip()
|