ishaq101's picture
feat/Catalog Retrieval System (#1)
6bff5d9
"""Render a `Source` into the canonical text block consumed by the planner."""
from __future__ import annotations
from .models import Source
def _render_stats(col) -> str:
    """Build the ``, key=value, ...`` stats suffix for one column line.

    Only populated stats are emitted (``None`` values and empty
    ``top_values`` are skipped), always in the order: min, max, mean,
    median, distinct, top.

    For columns with ``pii_flag`` set, min / max / top are withheld: they
    echo raw cell values and would undo the suppression of the samples.
    Mean, median and distinct count are still shown.

    Args:
        col: A column object exposing ``stats`` and ``pii_flag``.

    Returns:
        The suffix starting with ``", "``, or ``""`` when nothing is shown.
    """
    stats = col.stats
    if not stats:
        return ""

    show_raw_values = not col.pii_flag
    parts: list[str] = []
    if show_raw_values and stats.min is not None:
        parts.append(f"min={stats.min}")
    if show_raw_values and stats.max is not None:
        parts.append(f"max={stats.max}")
    if stats.mean is not None:
        parts.append(f"mean={stats.mean:.4g}")
    if stats.median is not None:
        parts.append(f"median={stats.median:.4g}")
    if stats.distinct_count is not None:
        parts.append(f"distinct={stats.distinct_count}")
    if show_raw_values and stats.top_values:
        parts.append(f"top={stats.top_values}")

    return (", " + ", ".join(parts)) if parts else ""


def render_source(source: Source) -> str:
    """Render a Source as the canonical text block consumed by the planner.

    Stable identifiers (source_id / table_id / column_id) are rendered
    alongside names. The planner must copy these verbatim into the IR;
    the IRValidator does a literal ID lookup, so anything else fails.

    Each column line shows the data type, the sample values (or
    ``PII (suppressed)`` for PII columns) and whichever stats are populated.
    No type-based filtering happens here: a stat is shown whenever it is
    not ``None``. For PII columns the value-bearing stats (min / max / top
    values) are withheld as well, so no raw cell value reaches the output.

    Foreign keys are resolved to ``column -> table.column`` names; an ID
    that cannot be resolved within this source is printed as-is.

    Args:
        source: The source to render, with its tables, columns and
            foreign keys.

    Returns:
        The rendered block as a single newline-joined string.
    """
    lines: list[str] = [
        f"Source: {source.name} ({source.source_type})",
        f"Source ID: {source.source_id}",
        "",
        "Tables:",
    ]

    # Lookup tables used to turn foreign-key target IDs into names.
    tables_by_id = {t.table_id: t for t in source.tables}
    col_names_by_id = {
        t.table_id: {c.column_id: c.name for c in t.columns} for t in source.tables
    }

    for table in source.tables:
        rc = table.row_count
        rc_str = f" ({rc:,} rows)" if rc is not None else ""
        lines.extend(
            [
                "",
                f" Table: {table.name}{rc_str} — id={table.table_id}",
                " Columns:",
            ]
        )

        for col in table.columns:
            samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
            lines.append(
                f" - {col.name} [{col.data_type}]: samples={samples}"
                f"{_render_stats(col)} — id={col.column_id}"
            )

        if not table.foreign_keys:
            continue

        lines.append(" Foreign keys:")
        # Built from this table's own columns (not col_names_by_id) so the
        # source column resolves correctly even if table IDs collide.
        local_col_names = {c.column_id: c.name for c in table.columns}
        for fk in table.foreign_keys:
            src_col_name = local_col_names.get(fk.column_id, fk.column_id)
            tgt_table = tables_by_id.get(fk.target_table_id)
            tgt_table_name = tgt_table.name if tgt_table else fk.target_table_id
            tgt_col_name = col_names_by_id.get(fk.target_table_id, {}).get(
                fk.target_column_id, fk.target_column_id
            )
            lines.append(f" - {src_col_name} -> {tgt_table_name}.{tgt_col_name}")

    return "\n".join(lines)