ground-zero / src /llm /minimal_client.py
jefffffff9
Forbid parenthetical glosses in LLM replies
757e833
"""MinimalClient — dialect-anchored plain-text LLM client for the Month 1–3 rebuild.
Why this exists (and not GemmaClient):
GemmaClient wraps every reply in a JSON object and runs a "teacher / child"
intent-classification flow. That's fine for the full app, but for the minimal
baseline it (a) spends model capacity on JSON compliance, (b) lets the model
drift into neighbouring languages (Wolof, Hausa, Pulaar of Senegal, Fulfulde
of Nigeria, Jula of Côte d'Ivoire), and (c) produces text that isn't clean
for TTS.
This client instead:
- pins the target dialect explicitly (Bambara / Bamako–Mali or Pular / Fuuta
Jallon–Guinea),
- injects the curated 30-phrase gold list for the target language as
few-shot anchoring in the system prompt,
- names forbidden neighbouring languages the model must not code-switch to,
- returns a plain string, ready for MMS-TTS.
GemmaClient and app.py are intentionally untouched.
"""
from __future__ import annotations

import json
import logging
import re
from functools import lru_cache
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)

# Directory holding the curated gold-phrase files:
# <repo>/configs/dialect_anchors, reached by walking three parents up from
# this module's resolved file path.
_ANCHOR_DIR = Path(__file__).resolve().parent.parent.parent.joinpath(
    "configs", "dialect_anchors"
)

# Language code -> anchor file name inside _ANCHOR_DIR. Only the two target
# dialects have an anchor file.
_ANCHOR_FILE = dict(
    bam="bambara_mali.json",
    ful="pular_guinea.json",
)

# Language code -> full dialect description used when wording the prompt.
LANG_FULL_NAME = dict(
    bam="Bambara as spoken in Bamako, Mali",
    ful="Pular of Fuuta Jallon, as spoken in Guinea",
    fr="French",
    en="English",
)

# Language code -> neighbouring languages the model is most likely to drift
# into. Left empty for fr/en because those do not need fencing.
FORBIDDEN_DRIFT = dict(
    bam=(
        "Jula / Dyula of Côte d'Ivoire, Wolof, Hausa, "
        "Swahili, Lingala, or any other African language"
    ),
    ful=(
        "Pulaar of Senegal, Fulfulde of Nigeria or Cameroon, "
        "Wolof, Hausa, Swahili, or any other African language"
    ),
    fr="",
    en="",
)
@lru_cache(maxsize=4)
def _load_anchors(lang: str) -> list[dict]:
    """Load the curated gold-phrase list for `lang`. Cached per process.

    Looks up the anchor file name for `lang` in `_ANCHOR_FILE`, reads it from
    `_ANCHOR_DIR` and returns the list stored under its top-level "pairs" key.

    The loader never raises for a bad anchor file: the anchors are optional
    few-shot material, so a missing, unreadable or malformed file degrades to
    an empty list (with a warning) instead of breaking every prompt build.

    Args:
        lang: Language code, e.g. "bam" or "ful".

    Returns:
        The list of pair dicts, or an empty list when `lang` has no anchor
        file or the file cannot be used. Non-dict entries are dropped. The
        result is cached and shared between callers, so treat it as read-only.
    """
    fname = _ANCHOR_FILE.get(lang)
    if not fname:
        return []

    path = _ANCHOR_DIR / fname
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        logger.warning("Dialect anchor file missing: %s", path)
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Dialect anchor file unreadable: %s (%s)", path, exc)
        return []

    pairs = data.get("pairs", []) if isinstance(data, dict) else None
    if not isinstance(pairs, list):
        logger.warning("Dialect anchor file has no usable 'pairs' list: %s", path)
        return []
    # Keep only dict entries so consumers can call .get() on every item.
    return [pair for pair in pairs if isinstance(pair, dict)]
def _format_pairs(pairs: Optional[list[dict]]) -> list[str]:
    """Render {source, target} dicts as "- source → target" bullet lines.

    Entries that are not dicts, or whose source or target is missing, None or
    blank after stripping, are skipped.
    """
    bullets: list[str] = []
    for item in pairs or []:
        if not isinstance(item, dict):
            continue
        # `or ""` guards against explicit null values in the JSON data.
        src = str(item.get("source") or "").strip()
        tgt = str(item.get("target") or "").strip()
        if src and tgt:
            # An explicit separator keeps source and target distinguishable;
            # concatenating them directly makes the example unreadable.
            bullets.append(f"- {src} → {tgt}")
    return bullets


def _build_system_prompt(
    target_lang: str,
    extra_examples: Optional[list[dict]] = None,
) -> str:
    """Assemble the per-call system prompt for a target output language.

    The prompt consists of, in order: the role and output-format rules, a
    dialect-fidelity warning (only when `FORBIDDEN_DRIFT` has an entry for the
    language), the curated gold phrases from `_load_anchors`, any
    `extra_examples`, and a closing reminder to always reply in the target
    language.

    Args:
        target_lang: Language code. Unknown codes fall back to "English" for
            the language name and get no drift warning.
        extra_examples: Optional list of {source, target} dicts appended after
            the curated gold list as additional dynamic few-shot anchoring
            (the original note says app_minimal injects the top-K nearest
            phrasebook entries here when the strict short-circuit misses).

    Returns:
        The system prompt as a single newline-joined string. A reference
        section is only emitted when it contains at least one valid pair.
    """
    full = LANG_FULL_NAME.get(target_lang, "English")
    forbidden = FORBIDDEN_DRIFT.get(target_lang, "")

    lines: list[str] = [
        f"You are a warm, concise conversational assistant that replies ONLY in {full}.",
        "",
        "Your task is to REPLY to the user's message as a person would in "
        "conversation — NOT to translate it. If the user greets you, greet them "
        "back and ask how they are. If they ask a question, answer it. If they "
        "make a statement, respond appropriately. Never simply repeat or "
        "translate what they said back to them.",
        "",
        "Output format: plain natural text only. No JSON, no code fences, no "
        "markdown, no translations, no romanisation, no explanations, and "
        "ABSOLUTELY no parenthetical glosses, literal translations, or "
        "English/French annotations of any kind (do NOT write things like "
        "'(Lit: ...)', '(meaning ...)', or any '(English ...)' aside). The "
        f"output must be 100% {full} characters and punctuation only. Reply in "
        "1–3 short sentences suitable to be read aloud by a text-to-speech voice.",
    ]

    if forbidden:
        lines += [
            "",
            (
                f"CRITICAL — dialect fidelity: do NOT use, mix, or substitute words "
                f"from {forbidden}. If you are not confident a word belongs to "
                f"{full}, rephrase using simpler vocabulary you are certain of, or "
                f"apologise briefly in {full} (for example that you did not "
                "understand)."
            ),
        ]

    anchor_bullets = _format_pairs(_load_anchors(target_lang))
    if anchor_bullets:
        lines += [
            "",
            f"Reference phrases in {full} — these pairs are STYLE/ORTHOGRAPHY "
            "examples ONLY (showing how English/French maps to the correct "
            "dialect). Do NOT treat them as a translation task: when the user "
            "writes one of these source phrases, do not just output its target "
            "verbatim — instead REPLY conversationally in the same dialectal "
            "style:",
        ]
        lines += anchor_bullets

    extra_bullets = _format_pairs(extra_examples)
    if extra_bullets:
        lines += [
            "",
            "Additional reference phrases relevant to the current user input "
            f"(curated gold {full} translations — STYLE references only, not a "
            "translation task; reply conversationally, do not echo the target "
            "verbatim):",
        ]
        lines += extra_bullets

    lines += [
        "",
        f"Always reply in {full}, even if the user writes to you in English, "
        "French, or another language. Never translate your own reply.",
    ]
    return "\n".join(lines)
class MinimalClient:
    """Dialect-anchored plain-text LLM client over HF Serverless Inference.

    Usage:
        client = MinimalClient(model_id="CohereLabs/aya-expanse-32b", hf_token=TOK)
        reply = client.chat("Good morning", target_lang="bam")
        # → "I ni sɔgɔma. I ka kɛnɛ wa?"
    """

    # Matches a parenthetical that opens with a gloss keyword as a whole word,
    # e.g. "(Lit: ...)", "(lit. ...)", "(meaning ...)", "(English: ...)",
    # "(en: ...)". The \b stops "en", "fr" and "lit" from matching the start
    # of an ordinary word, so a parenthetical like "(enen fow)" is kept.
    _GLOSS_RE = re.compile(
        r"\s*\((?:lit(?:eral(?:ly)?)?|meaning|translations?|english|french|fr|en)\b[^)]*\)",
        re.IGNORECASE,
    )
    # A first line ending in one of these is a sentence, not a language tag.
    _SENTENCE_END = (".", "!", "?", "…")
    # A language tag / label on the first line is shorter than this.
    _MAX_TAG_LEN = 20

    def __init__(
        self,
        model_id: str = "CohereLabs/aya-expanse-32b",
        hf_token: Optional[str] = None,
    ) -> None:
        """Store the model id and token; the HF client is created lazily."""
        self.model_id = model_id
        self.hf_token = hf_token
        self._client = None  # lazy init

    def _get_client(self):
        """Return the shared `InferenceClient`, creating it on first use.

        `huggingface_hub` is imported here rather than at module import time,
        so the module can be imported without it.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            self._client = InferenceClient(token=self.hf_token)
        return self._client

    @classmethod
    def _clean_reply(cls, raw: Optional[str]) -> str:
        """Strip formatting noise from a raw model reply.

        Steps, in order:
        1. Strip surrounding whitespace and, if the reply starts with a code
           fence, the surrounding backticks.
        2. Drop a first line that looks like a language tag or label (a single
           short token such as "text" or "Bambara:"). A first line that ends
           with sentence punctuation is kept, so a one-word sentence at the
           start of a multi-line reply is not lost.
        3. Remove parenthetical glosses such as "(Lit: ...)".

        Args:
            raw: The model's message content; None is treated as empty.

        Returns:
            The cleaned reply, possibly empty.
        """
        text = (raw or "").strip()

        # Defensive: strip any stray code fences the model may emit anyway.
        if text.startswith("```"):
            text = text.strip("`").strip()

        first, newline, rest = text.partition("\n")
        looks_like_tag = (
            bool(newline)
            and len(first) < cls._MAX_TAG_LEN
            and " " not in first
            and not first.endswith(cls._SENTENCE_END)
        )
        if looks_like_tag:
            text = rest.strip()

        # Defensive: the model sometimes appends glosses despite the prompt.
        return cls._GLOSS_RE.sub("", text).strip()

    def chat(
        self,
        user_text: str,
        target_lang: str = "bam",
        extra_examples: Optional[list[dict]] = None,
    ) -> str:
        """Return a plain-text reply in `target_lang`.

        Args:
            user_text: The user's message, sent as the "user" chat message.
            target_lang: Language code the reply must be written in.
            extra_examples: Optional list of {source, target} dicts appended
                to the system prompt as additional dynamic few-shot (the
                original note says app_minimal uses this to inject the top-K
                nearest phrasebook entries when the strict phrasebook
                short-circuit misses).

        Returns:
            The cleaned model reply. On any error, including a failure while
            building the system prompt, returns a short parenthetical error
            string so the caller can still feed something into TTS / display.
        """
        try:
            # Built inside the try so a prompt-construction failure is also
            # reported as an error string rather than raised.
            system_prompt = _build_system_prompt(target_lang, extra_examples)
            completion = self._get_client().chat_completion(
                model=self.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ],
                max_tokens=256,
                temperature=0.3,
            )
            return self._clean_reply(completion.choices[0].message.content)
        except Exception as exc:  # pragma: no cover — surfaced to UI
            logger.exception("MinimalClient error: %s", exc)
            return f"(LLM unavailable: {exc})"