Spaces:

MataStrategy
/

ground-zero

Sleeping

File size: 7,321 Bytes

"""Phrasebook short-circuit — skip the LLM when the user hits a curated phrase.

Purpose
    For the 80% of field-demo inputs that are canonical greetings, courtesies,
    or basic questions, the LLM adds risk (dialect drift, hallucination,
    latency) without adding value — we already have a gold translation. This
    module does an English- or French-keyed, fuzzy-normalised match against the curated
    phrasebooks in configs/dialect_anchors/{bambara,pular}_phrasebook.json and
    returns the target string directly when the match is strong.

Scope
    - Only fires when target language is bam or ful. For en/fr output we let
      the LLM (or a passthrough) handle it — nothing to short-circuit.
    - Source keys can be English and/or French, single or multi-alias. Each
      curated row may carry any combination of `source`, `sources` (list),
      `source_fr`, `sources_fr` (list); the loader flattens them into one
      match-candidate per alias so a typed paraphrase or a French equivalent
      hits the same target translation.

Matching
    - Exact match on normalised string → score 1.0 ("exact").
    - Otherwise SequenceMatcher ratio; threshold DEFAULT_THRESHOLD = 0.88.
    - Normalisation: lowercase, strip punctuation (keeps internal apostrophes),
      collapse whitespace.

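API
    lookup(user_text, target_lang, threshold=DEFAULT_THRESHOLD) -> dict | None
        dict has keys: source, target, category, score, match
    top_k(user_text, target_lang, k=3) -> list[dict]
        each dict has keys: source, target, category, score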
"""
from __future__ import annotations

import json
import logging
import re
from difflib import SequenceMatcher
from functools import lru_cache
from pathlib import Path
from typing import Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory holding the curated phrasebook JSON files. Resolved relative to
# this file: two levels above the directory that contains it, then
# configs/dialect_anchors.
_PHRASEBOOK_DIR = (
    Path(__file__).resolve().parent.parent.parent / "configs" / "dialect_anchors"
)

# Target-language code -> phrasebook file name inside _PHRASEBOOK_DIR.
# A language without an entry here has no phrasebook to load.
_PHRASEBOOK_FILE = {
    "bam": "bambara_phrasebook.json",
    "ful": "pular_phrasebook.json",
}

# Minimum SequenceMatcher ratio a non-exact match must reach in lookup().
DEFAULT_THRESHOLD = 0.88


def _normalize(text: str) -> str:
    """Return the canonical form of ``text`` used for phrasebook matching.

    Steps, in order:
        1. ``None``/empty becomes "", then the text is lowercased.
        2. The result is composed to Unicode NFC so an accented letter typed
           as base + combining mark ("a" + U+0300) compares equal to its
           precomposed form instead of losing the accent in step 4.
        3. Typographic apostrophes and look-alikes are folded to ASCII "'"
           so "don’t" matches a curated "don't".
        4. Every character that is not a word character, whitespace or an
           apostrophe is replaced by a space.
        5. Apostrophes that are not *internal* (a word character on both
           sides) are dropped, so quoted input such as "'hello'" matches
           "hello" while "don't" and "b'a" keep theirs.
        6. Whitespace runs are collapsed and the ends trimmed.

    Both the curated aliases and the user query go through this function, so
    the two sides always stay comparable.

    Args:
        text: Raw user input or curated alias; ``None`` is tolerated.

    Returns:
        The normalised string, possibly empty.
    """
    # Function-scope import keeps this fix self-contained (no change to the
    # module's import block is required).
    import unicodedata

    text = unicodedata.normalize("NFC", (text or "").lower())
    # U+2018/U+2019 curly quotes, U+02BC modifier apostrophe, backtick, acute.
    text = re.sub("[\u2018\u2019\u02bc`\u00b4]", "'", text)
    text = re.sub(r"[^\w\s']", " ", text)
    # An apostrophe survives only when flanked by word characters.
    text = re.sub(r"(?<!\w)'|'(?!\w)", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def _expand_aliases(entry: dict) -> list[str]:
    """Gather all source aliases of one curated row, English and French.

    Recognised fields (each optional; rows may combine them freely):
        source      : str        canonical English, e.g. "good morning"
        sources     : list[str]  English aliases / paraphrases
        source_fr   : str        canonical French, e.g. "bonjour"
        sources_fr  : list[str]  French aliases

    Output order is: `source`, `source_fr`, then the items of `sources`,
    then the items of `sources_fr`. Values that are not strings, or that are
    blank after stripping, are left out; kept values are returned exactly as
    stored (not stripped). A list field holding a non-list value is ignored.
    """

    def _usable(value) -> bool:
        # A usable alias is a string with at least one non-space character.
        return isinstance(value, str) and bool(value.strip())

    singles = [entry.get(field) for field in ("source", "source_fr")]
    aliases: list[str] = [value for value in singles if _usable(value)]

    for field in ("sources", "sources_fr"):
        group = entry.get(field)
        if not isinstance(group, list):
            continue
        aliases += [item for item in group if _usable(item)]
    return aliases


@lru_cache(maxsize=4)
def _load_phrasebook(lang: str) -> list[dict]:
    """Load and flatten a phrasebook into one match-candidate per alias.

    Each candidate carries the canonical source/target/category for display
    plus the raw alias (`_alias`) and its normalised form (`_norm`) for the
    matcher. One curated row with N aliases produces N candidates that all
    point at the same target translation.

    This function never raises for a bad phrasebook file: a missing,
    unreadable, undecodable or structurally wrong file is logged and yields
    an empty list. Malformed rows (not a dict, or without a target) are
    skipped, since a hit on them would return an empty translation.

    The result is memoised per `lang` by `lru_cache`, including the empty
    result, so a file fixed on disk is only picked up after
    `_load_phrasebook.cache_clear()` or a restart. Every caller receives the
    same list object and must treat it as read-only.

    Args:
        lang: Target-language code used as a key into `_PHRASEBOOK_FILE`.

    Returns:
        A list of candidate dicts with keys `source`, `target`, `category`,
        `_alias`, `_norm`; empty when nothing usable could be loaded.
    """
    fname = _PHRASEBOOK_FILE.get(lang)
    if not fname:
        return []
    path = _PHRASEBOOK_DIR / fname

    # Open directly instead of exists()-then-open(): avoids the race between
    # the check and the read, and lets one handler cover every I/O failure.
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        logger.warning("Phrasebook missing: %s", path)
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Phrasebook unreadable: %s (%s)", path, exc)
        return []

    pairs = data.get("pairs", []) if isinstance(data, dict) else None
    if not isinstance(pairs, list):
        logger.warning("Phrasebook has no usable 'pairs' list: %s", path)
        return []

    def _head(value):
        # First element of a non-empty list, else None. Guards against a
        # mis-typed field (e.g. a plain string) being indexed by character.
        return value[0] if isinstance(value, list) and value else None

    candidates: list[dict] = []
    skipped = 0
    for row in pairs:
        if not isinstance(row, dict):
            skipped += 1
            continue
        target = row.get("target", "")
        if not target:
            skipped += 1
            continue
        category = row.get("category")
        # Canonical source for display: prefer English `source`, else first
        # English `sources`, else French canonical, else first French alias.
        canonical = (
            row.get("source")
            or _head(row.get("sources"))
            or row.get("source_fr")
            or _head(row.get("sources_fr"))
            or ""
        )
        for alias in _expand_aliases(row):
            candidates.append({
                "source":   canonical or alias,
                "target":   target,
                "category": category,
                "_alias":   alias,
                "_norm":    _normalize(alias),
            })
    if skipped:
        logger.warning("Phrasebook %s: skipped %d malformed row(s)", path, skipped)
    return candidates


def lookup(
    user_text: str,
    target_lang: str,
    threshold: float = DEFAULT_THRESHOLD,
) -> Optional[dict]:
    """Find the curated phrase that best matches `user_text`, if any.

    The query is normalised and compared against every candidate alias
    loaded for `target_lang`:
        - an alias equal to the normalised query wins immediately with
          score 1.0 and match "exact" (first such alias in load order);
        - otherwise the alias with the highest SequenceMatcher ratio wins,
          provided that ratio is at least `threshold` (match "fuzzy", score
          rounded to 3 decimals). Ties go to the earliest candidate.

    Returns a dict with keys source, target, category, score, match, or
    None when there is no phrasebook for `target_lang`, the query is empty
    after normalisation, or nothing reaches the threshold — the caller is
    then expected to fall through to the LLM.
    """
    candidates = _load_phrasebook(target_lang)
    if not candidates:
        return None
    query = _normalize(user_text)
    if not query:
        return None

    def _hit(cand: dict, score: float, kind: str) -> dict:
        # Public result shape; internal `_alias` / `_norm` fields stay private.
        return {
            "source":   cand.get("source"),
            "target":   cand.get("target"),
            "category": cand.get("category"),
            "score":    score,
            "match":    kind,
        }

    # Candidates with an empty normalised alias can never match anything.
    usable = [cand for cand in candidates if cand.get("_norm", "")]

    exact = next((cand for cand in usable if cand["_norm"] == query), None)
    if exact is not None:
        return _hit(exact, 1.0, "exact")

    scored = [
        (SequenceMatcher(None, query, cand["_norm"]).ratio(), cand)
        for cand in usable
    ]
    # max() keeps the first of equal scores, i.e. the earliest candidate.
    top_score, winner = max(scored, key=lambda pair: pair[0], default=(0.0, None))

    # A zero ratio never counts as a match, whatever the threshold.
    if winner is not None and top_score > 0.0 and top_score >= threshold:
        return _hit(winner, round(top_score, 3), "fuzzy")
    return None


def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
    """Return up to `k` closest phrasebook entries, ignoring any threshold.

    Used as RAG-style few-shot context when the strict `lookup()` misses but
    we still want to anchor the LLM with locally relevant gold pairs.

    Args:
        user_text: Raw user input; normalised before comparison.
        target_lang: Target-language code selecting the phrasebook.
        k: Maximum number of entries to return. Zero or negative yields [].

    Returns:
        Dicts with keys source, target, category, score (rounded to 3
        decimals), sorted by descending score; candidates with equal scores
        keep phrasebook order. Entries are de-duplicated by target string,
        so several aliases of one row occupy a single slot. Returns [] when
        `k` is not positive, there is no phrasebook for `target_lang`, or
        the query is empty after normalisation.
    """
    # Guard first: the collection loop below appends before it checks the
    # limit, so without this a k of 0 (or less) would still return one entry.
    if k <= 0:
        return []
    pairs = _load_phrasebook(target_lang)
    if not pairs:
        return []
    q = _normalize(user_text)
    if not q:
        return []

    scored: list[tuple[float, dict]] = []
    for p in pairs:
        src = p.get("_norm", "")
        if not src:
            continue
        score = 1.0 if src == q else SequenceMatcher(None, q, src).ratio()
        scored.append((score, p))
    # Sort on the score only; list.sort is stable, so ties keep load order.
    scored.sort(key=lambda x: x[0], reverse=True)

    out: list[dict] = []
    seen: set[str] = set()  # dedupe: aliases of the same row → one slot
    for score, p in scored:
        target = p.get("target") or ""
        if target in seen:
            continue
        seen.add(target)
        out.append({
            "source":   p.get("source"),
            "target":   target,
            "category": p.get("category"),
            "score":    round(score, 3),
        })
        if len(out) >= k:
            break
    return out