# ground-zero/src/llm/phrasebook.py
# Phrasebook: multi-alias + French source keys, plus misses-script stub
"""Phrasebook short-circuit — skip the LLM when the user hits a curated phrase.
Purpose
For the 80% of field-demo inputs that are canonical greetings, courtesies,
or basic questions, the LLM adds risk (dialect drift, hallucination,
latency) without adding value — we already have a gold translation. This
module does an English-keyed, fuzzy-normalised match against the curated
phrasebooks in configs/dialect_anchors/{bambara,pular}_phrasebook.json and
returns the target string directly when the match is strong.
Scope
- Only fires when target language is bam or ful. For en/fr output we let
the LLM (or a passthrough) handle it — nothing to short-circuit.
- Source keys can be English and/or French, single or multi-alias. Each
curated row may carry any combination of `source`, `sources` (list),
`source_fr`, `sources_fr` (list); the loader flattens them into one
match-candidate per alias so a typed paraphrase or a French equivalent
hits the same target translation.
Matching
- Exact match on normalised string → score 1.0 ("exact").
- Otherwise SequenceMatcher ratio; threshold DEFAULT_THRESHOLD = 0.88.
- Normalisation: lowercase, strip punctuation (apostrophes are kept),
  collapse whitespace.
API
lookup(user_text, target_lang) -> dict | None
dict has keys: source, target, category, score, match
"""
from __future__ import annotations
import json
import logging
import re
from difflib import SequenceMatcher
from functools import lru_cache
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
_PHRASEBOOK_DIR = (
Path(__file__).resolve().parent.parent.parent / "configs" / "dialect_anchors"
)
_PHRASEBOOK_FILE = {
"bam": "bambara_phrasebook.json",
"ful": "pular_phrasebook.json",
}
DEFAULT_THRESHOLD = 0.88
def _normalize(text: str) -> str:
"""Lowercase, strip most punctuation, collapse whitespace."""
text = (text or "").lower().strip()
# Keep internal apostrophes (e.g. "don't", "b'a"), drop other punctuation.
text = re.sub(r"[^\w\s']", " ", text, flags=re.UNICODE)
text = re.sub(r"\s+", " ", text)
return text.strip()
def _expand_aliases(entry: dict) -> list[str]:
"""Collect every source-alias on an entry across both languages.
Schema (all fields optional, additive — existing single-`source` rows
keep working unchanged):
source : "good morning" # canonical English
sources : ["morning", "morning!"] # English aliases / paraphrases
source_fr : "bonjour" # canonical French
sources_fr : ["salut", "bonjour à tous"]
"""
out: list[str] = []
for key in ("source", "source_fr"):
v = entry.get(key)
if isinstance(v, str) and v.strip():
out.append(v)
for key in ("sources", "sources_fr"):
vs = entry.get(key)
if isinstance(vs, list):
out.extend(x for x in vs if isinstance(x, str) and x.strip())
return out
@lru_cache(maxsize=4)
def _load_phrasebook(lang: str) -> list[dict]:
"""Load and flatten a phrasebook into one match-candidate per alias.
Each candidate carries the canonical source/target/category for display
and a precomputed normalised alias (`_norm`) for the matcher to compare
against. One curated row with N aliases produces N candidates that all
point at the same target translation — the matcher picks the closest
alias and returns the canonical entry.
"""
fname = _PHRASEBOOK_FILE.get(lang)
if not fname:
return []
path = _PHRASEBOOK_DIR / fname
if not path.exists():
logger.warning("Phrasebook missing: %s", path)
return []
with path.open("r", encoding="utf-8") as f:
data = json.load(f)
pairs = data.get("pairs", [])
candidates: list[dict] = []
for p in pairs:
target = p.get("target", "")
category = p.get("category")
# Canonical source for display: prefer English `source`, else first
# English `sources`, else French canonical, else first French alias.
canonical = (
p.get("source")
or (p.get("sources") or [None])[0]
or p.get("source_fr")
or (p.get("sources_fr") or [None])[0]
or ""
)
for alias in _expand_aliases(p):
candidates.append({
"source": canonical or alias,
"target": target,
"category": category,
"_alias": alias,
"_norm": _normalize(alias),
})
return candidates
def lookup(
user_text: str,
target_lang: str,
threshold: float = DEFAULT_THRESHOLD,
) -> Optional[dict]:
"""Return best curated match for `user_text` in `target_lang`, or None.
Short-circuits only for curated dialects (bam, ful). For any other target
returns None so the caller falls through to the LLM.
"""
pairs = _load_phrasebook(target_lang)
if not pairs:
return None
q = _normalize(user_text)
if not q:
return None
best: Optional[dict] = None
best_score = 0.0
for p in pairs:
src = p.get("_norm", "")
if not src:
continue
if src == q:
return {
"source": p.get("source"),
"target": p.get("target"),
"category": p.get("category"),
"score": 1.0,
"match": "exact",
}
score = SequenceMatcher(None, q, src).ratio()
if score > best_score:
best_score = score
best = p
if best and best_score >= threshold:
return {
"source": best.get("source"),
"target": best.get("target"),
"category": best.get("category"),
"score": round(best_score, 3),
"match": "fuzzy",
}
return None
def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
"""Return the k closest phrasebook entries to `user_text` regardless of threshold.
Used as RAG-style few-shot context when the strict `lookup()` misses but we
still want to anchor the LLM with locally relevant gold pairs. Returns
results sorted by descending score; never raises.
"""
pairs = _load_phrasebook(target_lang)
if not pairs:
return []
q = _normalize(user_text)
if not q:
return []
scored: list[tuple[float, dict]] = []
for p in pairs:
src = p.get("_norm", "")
if not src:
continue
score = 1.0 if src == q else SequenceMatcher(None, q, src).ratio()
scored.append((score, p))
scored.sort(key=lambda x: x[0], reverse=True)
out: list[dict] = []
seen: set[str] = set() # dedupe: aliases of the same row → one slot
for score, p in scored:
target = p.get("target") or ""
if target in seen:
continue
seen.add(target)
out.append({
"source": p.get("source"),
"target": target,
"category": p.get("category"),
"score": round(score, 3),
})
if len(out) >= k:
break
return out