File size: 3,444 Bytes
"""Humanize raw column labels coming back from the SQL engine.

When the question is "How many schools are exclusively virtual?", the SQL is
``SELECT COUNT(DISTINCT s.CDSCode) ...`` and the engine returns a column
literally named ``COUNT(DISTINCT s.CDSCode)``. Streamlit's metric widget
shows this raw — engineering-correct, recruiter-hostile.

This module maps the raw label to a stable semantic key the UI can localize:

    >>> classify_scalar_label("COUNT(DISTINCT s.CDSCode)")
    'count'
    >>> classify_scalar_label("AVG(score)")
    'average'
    >>> classify_scalar_label("total_revenue")
    'identifier'
    >>> classify_scalar_label("")
    'result'

Keep the functions here pure and dependency-free so they can be unit-tested
without touching Streamlit.
"""
from __future__ import annotations
import re
ScalarLabelKind = str
"""Semantic key for a scalar column label; deliberately a plain ``str`` alias.

The kinds produced in this module are: count, sum, average, minimum, maximum,
ratio, identifier, result. Because the kind is an ordinary string, callers can
use it to build a translation key or index their own lookup table, and a new
kind can be introduced without touching this alias."""

_AGGREGATE_PATTERNS: list[tuple[re.Pattern[str], ScalarLabelKind]] = [
    (re.compile(regex_source, re.IGNORECASE), label_kind)
    for regex_source, label_kind in (
        (r"^\s*count\s*\(", "count"),
        (r"^\s*sum\s*\(", "sum"),
        (r"^\s*(avg|average)\s*\(", "average"),
        (r"^\s*min\s*\(", "minimum"),
        (r"^\s*max\s*\(", "maximum"),
    )
]
"""Ordered ``(compiled pattern, kind)`` pairs for labels that *start* with an
aggregate call. Each pattern is anchored at the beginning of the label, allows
leading whitespace and whitespace before the ``(``, and ignores case."""

_RATIO_PATTERN = re.compile(r"[+\-*/]")
"""Matches any of ``+``, ``-``, ``*`` or ``/`` anywhere in a label.

A label with an arithmetic operator is treated as a ratio / computed
expression (``a*100.0/b``, ``a-b``, ``a/b``). The pattern is loose on
purpose: it does not look at where the operator sits, so it is meant to be
consulted only after the aggregate patterns — that ordering is what keeps
``COUNT(*) * 1.0``-style labels classified as ``count``."""

_LOOKS_LIKE_EXPRESSION = re.compile(r"[()*]")
"""Matches a parenthesis or ``*`` anywhere in a label, which marks the label
as a SQL expression rather than a bare column name."""
def classify_scalar_label(raw: str) -> ScalarLabelKind:
    """Classify an engine-returned column label into a UI-localizable kind.

    This is plain pattern matching on the label text; no SQL is parsed. The
    checks run in a fixed order and the first hit wins:

    1. Empty, ``None`` or whitespace-only label -> ``"result"``.
    2. Label starting with an aggregate call -> that aggregate's kind
       (``"count"``, ``"sum"``, ``"average"``, ``"minimum"``, ``"maximum"``).
    3. Label containing an arithmetic operator -> ``"ratio"``.
    4. Label containing a parenthesis or ``*`` -> ``"result"``.
    5. Anything else -> ``"identifier"``.

    Args:
        raw: The column label exactly as the engine returned it.

    Returns:
        The kind string selected by the first matching rule above.
    """
    # Guard: nothing usable to inspect.
    if not (raw and raw.strip()):
        return "result"

    # Aggregates take priority over the operator check, so a label such as
    # ``COUNT(*) * 1.0`` is reported as a count rather than a ratio.
    aggregate_kind = next(
        (
            label_kind
            for regex, label_kind in _AGGREGATE_PATTERNS
            if regex.search(raw)
        ),
        None,
    )
    if aggregate_kind is not None:
        return aggregate_kind

    if _RATIO_PATTERN.search(raw):
        return "ratio"

    # Leftover expressions (e.g. a non-aggregate function call) get the
    # generic kind; a label with no expression syntax is a column name.
    return "result" if _LOOKS_LIKE_EXPRESSION.search(raw) else "identifier"
def humanize_scalar_label(raw: str, *, fallback: str | None = None) -> str:
    """Return a short English label for an engine-returned column name.

    Convenience wrapper for callers that do not localize; the result is a
    short noun phrase suitable for ``st.metric(label, ...)``.

    Args:
        raw: The column label exactly as the engine returned it.
        fallback: Optional text used instead of ``raw`` when the label is
            classified as ``"identifier"`` (i.e. the engine's column name was
            already readable, e.g. ``total_revenue``). Ignored for every
            other kind.

    Returns:
        For the ``"identifier"`` kind, ``fallback`` when it is truthy and
        ``raw`` otherwise. For any other kind, the matching entry of
        ``_ENGLISH_LABELS``.

    Raises:
        KeyError: If the classifier returns a kind that has no entry in
            ``_ENGLISH_LABELS``.
    """
    label_kind = classify_scalar_label(raw)
    if label_kind != "identifier":
        return _ENGLISH_LABELS[label_kind]
    # Truthiness check: both ``None`` and ``""`` defer to the raw label.
    return fallback if fallback else raw
# English display text for each non-identifier kind. There is deliberately no
# "identifier" entry: humanize_scalar_label() handles that kind itself by
# returning the fallback or the raw label, so it never looks it up here.
_ENGLISH_LABELS: dict[ScalarLabelKind, str] = dict(
    count="Count",
    sum="Sum",
    average="Average",
    minimum="Minimum",
    maximum="Maximum",
    ratio="Ratio",
    result="Result",
)
|