"""Phrasebook short-circuit — skip the LLM when the user hits a curated phrase. Purpose For the 80% of field-demo inputs that are canonical greetings, courtesies, or basic questions, the LLM adds risk (dialect drift, hallucination, latency) without adding value — we already have a gold translation. This module does an English-keyed, fuzzy-normalised match against the curated phrasebooks in configs/dialect_anchors/{bambara,pular}_phrasebook.json and returns the target string directly when the match is strong. Scope - Only fires when target language is bam or ful. For en/fr output we let the LLM (or a passthrough) handle it — nothing to short-circuit. - Source keys can be English and/or French, single or multi-alias. Each curated row may carry any combination of `source`, `sources` (list), `source_fr`, `sources_fr` (list); the loader flattens them into one match-candidate per alias so a typed paraphrase or a French equivalent hits the same target translation. Matching - Exact match on normalised string → score 1.0 ("exact"). - Otherwise SequenceMatcher ratio; threshold DEFAULT_THRESHOLD = 0.88. - Normalisation: lowercase, strip punctuation (keeps internal apostrophes), collapse whitespace. API lookup(user_text, target_lang) -> dict | None dict has keys: source, target, category, score, match """ from __future__ import annotations import json import logging import re from difflib import SequenceMatcher from functools import lru_cache from pathlib import Path from typing import Optional logger = logging.getLogger(__name__) _PHRASEBOOK_DIR = ( Path(__file__).resolve().parent.parent.parent / "configs" / "dialect_anchors" ) _PHRASEBOOK_FILE = { "bam": "bambara_phrasebook.json", "ful": "pular_phrasebook.json", } DEFAULT_THRESHOLD = 0.88 def _normalize(text: str) -> str: """Lowercase, strip most punctuation, collapse whitespace.""" text = (text or "").lower().strip() # Keep internal apostrophes (e.g. "don't", "b'a"), drop other punctuation. text = re.sub(r"[^\w\s']", " ", text, flags=re.UNICODE) text = re.sub(r"\s+", " ", text) return text.strip() def _expand_aliases(entry: dict) -> list[str]: """Collect every source-alias on an entry across both languages. Schema (all fields optional, additive — existing single-`source` rows keep working unchanged): source : "good morning" # canonical English sources : ["morning", "morning!"] # English aliases / paraphrases source_fr : "bonjour" # canonical French sources_fr : ["salut", "bonjour à tous"] """ out: list[str] = [] for key in ("source", "source_fr"): v = entry.get(key) if isinstance(v, str) and v.strip(): out.append(v) for key in ("sources", "sources_fr"): vs = entry.get(key) if isinstance(vs, list): out.extend(x for x in vs if isinstance(x, str) and x.strip()) return out @lru_cache(maxsize=4) def _load_phrasebook(lang: str) -> list[dict]: """Load and flatten a phrasebook into one match-candidate per alias. Each candidate carries the canonical source/target/category for display and a precomputed normalised alias (`_norm`) for the matcher to compare against. One curated row with N aliases produces N candidates that all point at the same target translation — the matcher picks the closest alias and returns the canonical entry. """ fname = _PHRASEBOOK_FILE.get(lang) if not fname: return [] path = _PHRASEBOOK_DIR / fname if not path.exists(): logger.warning("Phrasebook missing: %s", path) return [] with path.open("r", encoding="utf-8") as f: data = json.load(f) pairs = data.get("pairs", []) candidates: list[dict] = [] for p in pairs: target = p.get("target", "") category = p.get("category") # Canonical source for display: prefer English `source`, else first # English `sources`, else French canonical, else first French alias. canonical = ( p.get("source") or (p.get("sources") or [None])[0] or p.get("source_fr") or (p.get("sources_fr") or [None])[0] or "" ) for alias in _expand_aliases(p): candidates.append({ "source": canonical or alias, "target": target, "category": category, "_alias": alias, "_norm": _normalize(alias), }) return candidates def lookup( user_text: str, target_lang: str, threshold: float = DEFAULT_THRESHOLD, ) -> Optional[dict]: """Return best curated match for `user_text` in `target_lang`, or None. Short-circuits only for curated dialects (bam, ful). For any other target returns None so the caller falls through to the LLM. """ pairs = _load_phrasebook(target_lang) if not pairs: return None q = _normalize(user_text) if not q: return None best: Optional[dict] = None best_score = 0.0 for p in pairs: src = p.get("_norm", "") if not src: continue if src == q: return { "source": p.get("source"), "target": p.get("target"), "category": p.get("category"), "score": 1.0, "match": "exact", } score = SequenceMatcher(None, q, src).ratio() if score > best_score: best_score = score best = p if best and best_score >= threshold: return { "source": best.get("source"), "target": best.get("target"), "category": best.get("category"), "score": round(best_score, 3), "match": "fuzzy", } return None def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]: """Return the k closest phrasebook entries to `user_text` regardless of threshold. Used as RAG-style few-shot context when the strict `lookup()` misses but we still want to anchor the LLM with locally relevant gold pairs. Returns results sorted by descending score; never raises. """ pairs = _load_phrasebook(target_lang) if not pairs: return [] q = _normalize(user_text) if not q: return [] scored: list[tuple[float, dict]] = [] for p in pairs: src = p.get("_norm", "") if not src: continue score = 1.0 if src == q else SequenceMatcher(None, q, src).ratio() scored.append((score, p)) scored.sort(key=lambda x: x[0], reverse=True) out: list[dict] = [] seen: set[str] = set() # dedupe: aliases of the same row → one slot for score, p in scored: target = p.get("target") or "" if target in seen: continue seen.add(target) out.append({ "source": p.get("source"), "target": target, "category": p.get("category"), "score": round(score, 3), }) if len(out) >= k: break return out