# ground-zero/src/llm/phrasebook.py
# Phrasebook: multi-alias + French source keys, plus misses-script stub
"""Phrasebook short-circuit — skip the LLM when the user hits a curated phrase.
Purpose
For the 80% of field-demo inputs that are canonical greetings, courtesies,
or basic questions, the LLM adds risk (dialect drift, hallucination,
latency) without adding value — we already have a gold translation. This
module does an English-keyed, fuzzy-normalised match against the curated
phrasebooks in configs/dialect_anchors/{bambara,pular}_phrasebook.json and
returns the target string directly when the match is strong.
Scope
- Only fires when target language is bam or ful. For en/fr output we let
the LLM (or a passthrough) handle it — nothing to short-circuit.
- Source keys can be English and/or French, single or multi-alias. Each
curated row may carry any combination of `source`, `sources` (list),
`source_fr`, `sources_fr` (list); the loader flattens them into one
match-candidate per alias so a typed paraphrase or a French equivalent
hits the same target translation.
Matching
- Exact match on normalised string → score 1.0 ("exact").
- Otherwise SequenceMatcher ratio; threshold DEFAULT_THRESHOLD = 0.88.
- Normalisation: lowercase, strip punctuation (apostrophes are kept),
  collapse whitespace.
API
lookup(user_text, target_lang) -> dict | None
dict has keys: source, target, category, score, match
"""
from __future__ import annotations
import json
import logging
import re
from difflib import SequenceMatcher
from functools import lru_cache
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
_PHRASEBOOK_DIR = (
Path(__file__).resolve().parent.parent.parent / "configs" / "dialect_anchors"
)
_PHRASEBOOK_FILE = {
"bam": "bambara_phrasebook.json",
"ful": "pular_phrasebook.json",
}
DEFAULT_THRESHOLD = 0.88
def _normalize(text: str) -> str:
"""Lowercase, strip most punctuation, collapse whitespace."""
text = (text or "").lower().strip()
# Keep internal apostrophes (e.g. "don't", "b'a"), drop other punctuation.
text = re.sub(r"[^\w\s']", " ", text, flags=re.UNICODE)
text = re.sub(r"\s+", " ", text)
return text.strip()
def _expand_aliases(entry: dict) -> list[str]:
"""Collect every source-alias on an entry across both languages.
Schema (all fields optional, additive — existing single-`source` rows
keep working unchanged):
source : "good morning" # canonical English
sources : ["morning", "morning!"] # English aliases / paraphrases
source_fr : "bonjour" # canonical French
sources_fr : ["salut", "bonjour à tous"]
"""
out: list[str] = []
for key in ("source", "source_fr"):
v = entry.get(key)
if isinstance(v, str) and v.strip():
out.append(v)
for key in ("sources", "sources_fr"):
vs = entry.get(key)
if isinstance(vs, list):
out.extend(x for x in vs if isinstance(x, str) and x.strip())
return out
@lru_cache(maxsize=4)
def _load_phrasebook(lang: str) -> list[dict]:
"""Load and flatten a phrasebook into one match-candidate per alias.
Each candidate carries the canonical source/target/category for display
and a precomputed normalised alias (`_norm`) for the matcher to compare
against. One curated row with N aliases produces N candidates that all
point at the same target translation — the matcher picks the closest
alias and returns the canonical entry.
"""
fname = _PHRASEBOOK_FILE.get(lang)
if not fname:
return []
path = _PHRASEBOOK_DIR / fname
if not path.exists():
logger.warning("Phrasebook missing: %s", path)
return []
with path.open("r", encoding="utf-8") as f:
data = json.load(f)
pairs = data.get("pairs", [])
candidates: list[dict] = []
for p in pairs:
target = p.get("target", "")
category = p.get("category")
# Canonical source for display: prefer English `source`, else first
# English `sources`, else French canonical, else first French alias.
canonical = (
p.get("source")
or (p.get("sources") or [None])[0]
or p.get("source_fr")
or (p.get("sources_fr") or [None])[0]
or ""
)
for alias in _expand_aliases(p):
candidates.append({
"source": canonical or alias,
"target": target,
"category": category,
"_alias": alias,
"_norm": _normalize(alias),
})
return candidates
def lookup(
user_text: str,
target_lang: str,
threshold: float = DEFAULT_THRESHOLD,
) -> Optional[dict]:
"""Return best curated match for `user_text` in `target_lang`, or None.
Short-circuits only for curated dialects (bam, ful). For any other target
returns None so the caller falls through to the LLM.
"""
pairs = _load_phrasebook(target_lang)
if not pairs:
return None
q = _normalize(user_text)
if not q:
return None
best: Optional[dict] = None
best_score = 0.0
for p in pairs:
src = p.get("_norm", "")
if not src:
continue
if src == q:
return {
"source": p.get("source"),
"target": p.get("target"),
"category": p.get("category"),
"score": 1.0,
"match": "exact",
}
score = SequenceMatcher(None, q, src).ratio()
if score > best_score:
best_score = score
best = p
if best and best_score >= threshold:
return {
"source": best.get("source"),
"target": best.get("target"),
"category": best.get("category"),
"score": round(best_score, 3),
"match": "fuzzy",
}
return None
def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
"""Return the k closest phrasebook entries to `user_text` regardless of threshold.
Used as RAG-style few-shot context when the strict `lookup()` misses but we
still want to anchor the LLM with locally relevant gold pairs. Returns
results sorted by descending score; never raises.
"""
pairs = _load_phrasebook(target_lang)
if not pairs:
return []
q = _normalize(user_text)
if not q:
return []
scored: list[tuple[float, dict]] = []
for p in pairs:
src = p.get("_norm", "")
if not src:
continue
score = 1.0 if src == q else SequenceMatcher(None, q, src).ratio()
scored.append((score, p))
scored.sort(key=lambda x: x[0], reverse=True)
out: list[dict] = []
seen: set[str] = set() # dedupe: aliases of the same row → one slot
for score, p in scored:
target = p.get("target") or ""
if target in seen:
continue
seen.add(target)
out.append({
"source": p.get("source"),
"target": target,
"category": p.get("category"),
"score": round(score, 3),
})
if len(out) >= k:
break
return out