Spaces:
Running
Running
| """Phrasebook short-circuit — skip the LLM when the user hits a curated phrase. | |
| Purpose | |
| For the 80% of field-demo inputs that are canonical greetings, courtesies, | |
| or basic questions, the LLM adds risk (dialect drift, hallucination, | |
| latency) without adding value — we already have a gold translation. This | |
| module does an English-keyed, fuzzy-normalised match against the curated | |
| phrasebooks in configs/dialect_anchors/{bambara,pular}_phrasebook.json and | |
| returns the target string directly when the match is strong. | |
| Scope | |
| - Only fires when target language is bam or ful. For en/fr output we let | |
| the LLM (or a passthrough) handle it — nothing to short-circuit. | |
| - Source keys can be English and/or French, single or multi-alias. Each | |
| curated row may carry any combination of `source`, `sources` (list), | |
| `source_fr`, `sources_fr` (list); the loader flattens them into one | |
| match-candidate per alias so a typed paraphrase or a French equivalent | |
| hits the same target translation. | |
| Matching | |
| - Exact match on normalised string → score 1.0 ("exact"). | |
| - Otherwise SequenceMatcher ratio; threshold DEFAULT_THRESHOLD = 0.88. | |
| - Normalisation: lowercase, strip punctuation (keeps internal apostrophes), | |
| collapse whitespace. | |
| API | |
| lookup(user_text, target_lang) -> dict | None | |
| dict has keys: source, target, category, score, match | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import re | |
| from difflib import SequenceMatcher | |
| from functools import lru_cache | |
| from pathlib import Path | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
| _PHRASEBOOK_DIR = ( | |
| Path(__file__).resolve().parent.parent.parent / "configs" / "dialect_anchors" | |
| ) | |
| _PHRASEBOOK_FILE = { | |
| "bam": "bambara_phrasebook.json", | |
| "ful": "pular_phrasebook.json", | |
| } | |
| DEFAULT_THRESHOLD = 0.88 | |
| def _normalize(text: str) -> str: | |
| """Lowercase, strip most punctuation, collapse whitespace.""" | |
| text = (text or "").lower().strip() | |
| # Keep internal apostrophes (e.g. "don't", "b'a"), drop other punctuation. | |
| text = re.sub(r"[^\w\s']", " ", text, flags=re.UNICODE) | |
| text = re.sub(r"\s+", " ", text) | |
| return text.strip() | |
| def _expand_aliases(entry: dict) -> list[str]: | |
| """Collect every source-alias on an entry across both languages. | |
| Schema (all fields optional, additive — existing single-`source` rows | |
| keep working unchanged): | |
| source : "good morning" # canonical English | |
| sources : ["morning", "morning!"] # English aliases / paraphrases | |
| source_fr : "bonjour" # canonical French | |
| sources_fr : ["salut", "bonjour à tous"] | |
| """ | |
| out: list[str] = [] | |
| for key in ("source", "source_fr"): | |
| v = entry.get(key) | |
| if isinstance(v, str) and v.strip(): | |
| out.append(v) | |
| for key in ("sources", "sources_fr"): | |
| vs = entry.get(key) | |
| if isinstance(vs, list): | |
| out.extend(x for x in vs if isinstance(x, str) and x.strip()) | |
| return out | |
@lru_cache(maxsize=None)
def _load_phrasebook(lang: str) -> list[dict]:
    """Load and flatten a phrasebook into one match-candidate per alias.

    Each candidate carries the canonical source/target/category for display
    and a precomputed normalised alias (`_norm`) for the matcher to compare
    against. One curated row with N aliases produces N candidates that all
    point at the same target translation — the matcher picks the closest
    alias and returns the canonical entry.

    Results are cached per language (`lru_cache` — the key space is the two
    entries of _PHRASEBOOK_FILE, so the cache is bounded in practice) so
    repeated lookups do not re-read and re-parse the JSON on every call.
    Callers only read the returned list; edits to the phrasebook file on
    disk require a process restart (or cache_clear) to be picked up.

    A missing, unreadable, or malformed file is logged and treated as an
    empty phrasebook so callers (lookup / top_k) never raise.
    """
    fname = _PHRASEBOOK_FILE.get(lang)
    if not fname:
        # Not a curated dialect (e.g. en/fr) — nothing to short-circuit.
        return []
    path = _PHRASEBOOK_DIR / fname
    if not path.exists():
        logger.warning("Phrasebook missing: %s", path)
        return []
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Degrade to "no phrasebook" rather than break the caller's flow.
        logger.exception("Phrasebook unreadable: %s", path)
        return []
    pairs = data.get("pairs", [])
    candidates: list[dict] = []
    for p in pairs:
        target = p.get("target", "")
        category = p.get("category")
        # Canonical source for display: prefer English `source`, else first
        # English `sources`, else French canonical, else first French alias.
        canonical = (
            p.get("source")
            or (p.get("sources") or [None])[0]
            or p.get("source_fr")
            or (p.get("sources_fr") or [None])[0]
            or ""
        )
        for alias in _expand_aliases(p):
            candidates.append({
                "source": canonical or alias,
                "target": target,
                "category": category,
                "_alias": alias,
                "_norm": _normalize(alias),
            })
    return candidates
def lookup(
    user_text: str,
    target_lang: str,
    threshold: float = DEFAULT_THRESHOLD,
) -> Optional[dict]:
    """Return best curated match for `user_text` in `target_lang`, or None.

    Short-circuits only for curated dialects (bam, ful). For any other target
    returns None so the caller falls through to the LLM. An exact normalised
    match wins immediately with score 1.0; otherwise the highest
    SequenceMatcher ratio must clear `threshold` to count as a fuzzy hit.
    """
    candidates = _load_phrasebook(target_lang)
    query = _normalize(user_text)
    if not candidates or not query:
        return None

    best_entry: Optional[dict] = None
    best_ratio = 0.0
    for cand in candidates:
        norm = cand.get("_norm", "")
        if not norm:
            continue
        if norm == query:
            # Exact normalised hit — no need to scan the rest.
            return {
                "source": cand.get("source"),
                "target": cand.get("target"),
                "category": cand.get("category"),
                "score": 1.0,
                "match": "exact",
            }
        ratio = SequenceMatcher(None, query, norm).ratio()
        if ratio > best_ratio:
            best_entry, best_ratio = cand, ratio

    if best_entry is None or best_ratio < threshold:
        return None
    return {
        "source": best_entry.get("source"),
        "target": best_entry.get("target"),
        "category": best_entry.get("category"),
        "score": round(best_ratio, 3),
        "match": "fuzzy",
    }
def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
    """Return the k closest phrasebook entries to `user_text` regardless of threshold.

    Used as RAG-style few-shot context when the strict `lookup()` misses but we
    still want to anchor the LLM with locally relevant gold pairs. Returns
    results sorted by descending score; never raises. `k <= 0` yields [].
    """
    # Guard first: without it the loop below appends one entry before the
    # `len(out) >= k` check, so k=0 would wrongly return a single result.
    if k <= 0:
        return []
    try:
        pairs = _load_phrasebook(target_lang)
    except Exception:
        # "Never raises" contract: degrade to no context rather than crash
        # the caller's prompt-building path.
        logger.exception("top_k: phrasebook load failed for %s", target_lang)
        return []
    if not pairs:
        return []
    q = _normalize(user_text)
    if not q:
        return []
    scored: list[tuple[float, dict]] = []
    for p in pairs:
        src = p.get("_norm", "")
        if not src:
            continue
        score = 1.0 if src == q else SequenceMatcher(None, q, src).ratio()
        scored.append((score, p))
    # Sort on the score only — the key avoids ever comparing the dicts on ties.
    scored.sort(key=lambda x: x[0], reverse=True)
    out: list[dict] = []
    seen: set[str] = set()  # dedupe: aliases of the same row → one slot
    for score, p in scored:
        target = p.get("target") or ""
        if target in seen:
            continue
        seen.add(target)
        out.append({
            "source": p.get("source"),
            "target": target,
            "category": p.get("category"),
            "score": round(score, 3),
        })
        if len(out) >= k:
            break
    return out