File size: 6,926 Bytes
942050b
 
4b4ff9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942050b
 
 
 
 
 
4b4ff9e
 
 
 
 
 
 
 
 
 
 
 
 
942050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06ed757
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d48602c
942050b
 
 
 
 
 
 
 
 
 
d48602c
 
 
 
 
 
942050b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
"""Shared helpers used by multiple nodes.

Public surface (imported by `generate_sql`, `repair_once`, `plan_query`,
`eval.runner`, `tests.test_agent_support`, `scripts.wider_sc_poc`,
`tests.agent.nodes.test_schema_link_hints`):

- `parse_generate_sql_output` — robust JSON-parsing of LLM output
- `render_m_schema` — XiYan-style compact schema rendering
- `render_schema_block` — full schema-card block with hint appendices
- `render_fewshot_block` — few-shot example rendering

Internal helpers are split into two sibling modules (Kimi audit P1.4):

- `_text_utils` — JSON-fence stripping, safe-loads, NaN-safe float coerce,
  best-effort SELECT extraction. Used only by `parse_generate_sql_output`.
- `_hints` — M-Schema regexes (`_M_COL_RE`, `_M_FK_RE`), join-hints
  appendix, schema-link hints (one if-block per landed P3.F rescue),
  extended-samples appendix. Used by `render_m_schema` and
  `render_schema_block`.

Both sibling modules import nothing from this file — no circular paths.
"""

from __future__ import annotations

import re

from nl_sql.agent.nodes._hints import (
    _M_COL_RE,
    _M_FK_RE,
    _render_extended_samples_appendix,
    _render_join_hints_appendix,
    _render_schema_link_hints_appendix,
)
from nl_sql.agent.nodes._text_utils import (
    _coerce_float,
    _safe_loads,
    _strip_code_fence,
    _strip_to_sql,
)
from nl_sql.agent.state import GenerateSQLOutput
from nl_sql.schema_index.retriever import ContextBundle


def parse_generate_sql_output(text: str) -> GenerateSQLOutput:
    """Parse the LLM's JSON response into a GenerateSQLOutput.

    Handles common deviations: markdown fences, trailing prose, single-quoted
    keys (some local models do this). Falls back to a best-effort SQL
    extraction (`_strip_to_sql`) if JSON is unrecoverable — confidence drops
    to 0 in that case.

    Args:
        text: Raw model response. ``None`` or an empty string is treated as
            empty text.

    Returns:
        A ``GenerateSQLOutput`` whose ``raw_text`` is the stripped input.
        When a JSON object was recovered, ``sql`` has trailing statement
        terminators (``;``) and any whitespace around them removed;
        ``tables_used`` is a tuple of strings (empty unless the payload
        carried a list or tuple); ``confidence`` is coerced through
        `_coerce_float` with a default of 0.0.
    """
    raw = (text or "").strip()
    parsed = _safe_loads(_strip_code_fence(raw))
    if parsed is None:
        # Last-ditch: the pattern is greedy, so it spans from the first "{"
        # to the last "}" anywhere in the text (not just the first object).
        match = re.search(r"\{[\s\S]*\}", raw)
        if match:
            parsed = _safe_loads(match.group(0))

    if not isinstance(parsed, dict):
        # No JSON object could be recovered: salvage SQL from the raw text
        # and signal the degraded parse with zero confidence.
        return GenerateSQLOutput(
            sql=_strip_to_sql(raw),
            rationale="",
            tables_used=(),
            confidence=0.0,
            raw_text=raw,
        )

    sql = str(parsed.get("sql") or "").strip()
    # Peel off every trailing terminator together with the whitespace in
    # front of it, so "SELECT 1 ;" and "SELECT 1; ;" both yield "SELECT 1"
    # (a plain rstrip(";") would leave a dangling space behind).
    while sql.endswith(";"):
        sql = sql[:-1].rstrip()

    rationale = str(parsed.get("rationale") or "")

    # Accept tuples as well as lists: a lenient loader may produce either
    # (NOTE(review): depends on what `_safe_loads` can return — confirm).
    tables = parsed.get("tables_used")
    if isinstance(tables, (list, tuple)):
        tables_used = tuple(str(t) for t in tables)
    else:
        tables_used = ()

    confidence = _coerce_float(parsed.get("confidence"), default=0.0)
    return GenerateSQLOutput(
        sql=sql,
        rationale=rationale,
        tables_used=tables_used,
        confidence=confidence,
        raw_text=raw,
    )


def render_m_schema(context: ContextBundle | None) -> str:
    """Render the retrieved schema as a compact M-Schema (XiYan-SQL style).

    Every hit's chunk text is scanned line by line. Lines matching
    `_M_COL_RE` become ``table.column (type) [PK] [samples]`` entries; lines
    matching `_M_FK_RE` become ``table.(cols) -> ref_table.(ref_cols)``
    entries collected in a trailing foreign-key section. Tables are emitted
    in case-insensitive alphabetical order. Compared with the verbose
    table-card dump this reduces tokens by ~60% and surfaces FK pairs as
    first-class signal next to the columns.

    Returns a placeholder string when there is no context or no hits, so
    prompt formatting still works.
    """
    if context is None:
        return "(no schema context)"

    hits = sorted(
        [*context.schema_hits, *context.fk_neighbours],
        key=lambda hit: hit.table_name.lower(),
    )
    if not hits:
        return "(no tables matched)"

    column_entries: list[str] = []
    fk_entries: list[str] = []
    for hit in hits:
        table_name = hit.table_name
        for line in hit.text.splitlines():
            col_match = _M_COL_RE.match(line)
            if col_match is not None:
                column_name = col_match.group("col").strip()
                entry = f"{table_name}.{column_name} ({col_match.group('type')})"
                flag_tokens = (col_match.group("flags") or "").split()
                sample_text = (col_match.group("samples") or "").strip()
                if "PK" in flag_tokens:
                    entry += " [PK]"
                if sample_text:
                    entry += f" [{sample_text}]"
                column_entries.append(entry)
                # A column line is never also treated as an FK line.
                continue

            fk_match = _M_FK_RE.match(line)
            if fk_match is not None:
                src_cols, target_table, target_cols = fk_match.groups()
                fk_entries.append(
                    f"{table_name}.({src_cols}) -> {target_table}.({target_cols})"
                )

    if column_entries:
        sections: list[str] = ["# Columns"] + column_entries
    else:
        sections = ["(no columns parsed)"]
    if fk_entries:
        # Leading newline leaves a blank line before the FK header.
        sections += ["\n# Foreign keys", *fk_entries]

    samples_appendix = _render_extended_samples_appendix(context.extended_samples)
    if samples_appendix:
        sections.append(samples_appendix)
    return "\n".join(sections)


def render_schema_block(
    context: ContextBundle | None,
    *,
    sort_alphabetically: bool = False,
) -> str:
    """Join schema chunks, FK neighbours and hint appendices into one block.

    Default order is retrieval order: the dense hits first, then an
    "# FK-related tables" header followed by the FK-extended neighbours. A
    missing or empty bundle yields a placeholder so prompt formatting still
    works.

    With `sort_alphabetically=True` retrieval order is overridden: dense
    hits and FK neighbours are merged and rendered alphabetically by table
    name (case-insensitive). The "FK-related tables" header is omitted in
    this mode because the partition no longer exists. Empirically codestral
    is more accurate when the schema block matches the alphabetical baseline
    order produced by SQLAlchemy's `inspect()` — see docs/SESSION_HANDOFF.md
    (column-ordering experiment).

    After the table texts, the join-hints, schema-link-hints and
    extended-samples appendices are appended, each only when non-empty.
    Sections are separated by a blank line.
    """
    if context is None:
        return "(no schema context)"

    combined = [*context.schema_hits, *context.fk_neighbours]
    if sort_alphabetically:
        combined.sort(key=lambda hit: hit.table_name.lower())
        sections: list[str] = [hit.text for hit in combined]
    else:
        sections = [hit.text for hit in context.schema_hits]
        if context.fk_neighbours:
            sections.append("# FK-related tables")
            sections += [hit.text for hit in context.fk_neighbours]

    if not sections:
        return "(no tables matched)"

    # The appendix helpers receive the hits in the same order they were
    # rendered (sorted in alphabetical mode, retrieval order otherwise).
    appendices = (
        _render_join_hints_appendix(combined),
        _render_schema_link_hints_appendix(context, combined),
        _render_extended_samples_appendix(context.extended_samples),
    )
    for appendix in appendices:
        if appendix:
            sections.append(appendix)
    return "\n\n".join(sections)


def render_fewshot_block(context: ContextBundle | None) -> str:
    """Render the bundle's few-shot examples as ``Q:`` / ``SQL:`` pairs.

    Each example becomes two lines (``Q: <question>`` and ``SQL: <sql>``);
    consecutive examples are separated by one blank line and trailing
    whitespace is stripped from the result. Returns ``"(none)"`` when there
    is no context or it carries no few-shot examples.
    """
    examples = context.fewshots if context is not None else None
    if not examples:
        return "(none)"
    rendered = "\n\n".join(
        f"Q: {example.question}\nSQL: {example.sql}" for example in examples
    )
    return rendered.rstrip()