"""MinimalClient — dialect-anchored plain-text LLM client for the Month 1–3 rebuild. Why this exists (and not GemmaClient): GemmaClient wraps every reply in a JSON object and runs a "teacher / child" intent-classification flow. That's fine for the full app, but for the minimal baseline it (a) spends model capacity on JSON compliance, (b) lets the model drift into neighbouring languages (Wolof, Hausa, Pulaar of Senegal, Fulfulde of Nigeria, Jula of Côte d'Ivoire), and (c) produces text that isn't clean for TTS. This client instead: - pins the target dialect explicitly (Bambara / Bamako–Mali or Pular / Fuuta Jallon–Guinea), - injects the curated 30-phrase gold list for the target language as few-shot anchoring in the system prompt, - names forbidden neighbouring languages the model must not code-switch to, - returns a plain string, ready for MMS-TTS. GemmaClient and app.py are intentionally untouched. """ from __future__ import annotations import json import logging from functools import lru_cache from pathlib import Path from typing import Optional logger = logging.getLogger(__name__) # configs/dialect_anchors/*.json lives at /configs/dialect_anchors _ANCHOR_DIR = ( Path(__file__).resolve().parent.parent.parent / "configs" / "dialect_anchors" ) _ANCHOR_FILE = { "bam": "bambara_mali.json", "ful": "pular_guinea.json", } LANG_FULL_NAME = { "bam": "Bambara as spoken in Bamako, Mali", "ful": "Pular of Fuuta Jallon, as spoken in Guinea", "fr": "French", "en": "English", } # Neighbouring languages the model is most likely to drift into. Empty for # fr/en — we don't need to fence those. FORBIDDEN_DRIFT = { "bam": ( "Jula / Dyula of Côte d'Ivoire, Wolof, Hausa, Swahili, Lingala, " "or any other African language" ), "ful": ( "Pulaar of Senegal, Fulfulde of Nigeria or Cameroon, Wolof, Hausa, " "Swahili, or any other African language" ), "fr": "", "en": "", } @lru_cache(maxsize=4) def _load_anchors(lang: str) -> list[dict]: """Load the curated gold-phrase list for `lang`. Cached per process.""" fname = _ANCHOR_FILE.get(lang) if not fname: return [] path = _ANCHOR_DIR / fname if not path.exists(): logger.warning("Dialect anchor file missing: %s", path) return [] with path.open("r", encoding="utf-8") as f: data = json.load(f) return data.get("pairs", []) def _build_system_prompt( target_lang: str, extra_examples: Optional[list[dict]] = None, ) -> str: """Assemble the per-call system prompt for a target output language. `extra_examples`, when supplied, are appended after the curated 30-pair gold list as additional dynamic few-shot anchoring — used by app_minimal to inject the top-K nearest phrasebook entries when the strict short- circuit misses. """ full = LANG_FULL_NAME.get(target_lang, "English") forbidden = FORBIDDEN_DRIFT.get(target_lang, "") anchors = _load_anchors(target_lang) lines: list[str] = [ f"You are a warm, concise conversational assistant that replies ONLY in {full}.", "", "Your task is to REPLY to the user's message as a person would in " "conversation — NOT to translate it. If the user greets you, greet them " "back and ask how they are. If they ask a question, answer it. If they " "make a statement, respond appropriately. Never simply repeat or " "translate what they said back to them.", "", "Output format: plain natural text only. No JSON, no code fences, no " "markdown, no translations, no romanisation, no explanations, and " "ABSOLUTELY no parenthetical glosses, literal translations, or " "English/French annotations of any kind (do NOT write things like " "'(Lit: ...)', '(meaning ...)', or any '(English ...)' aside). The " f"output must be 100% {full} characters and punctuation only. Reply in " "1–3 short sentences suitable to be read aloud by a text-to-speech voice.", ] if forbidden: lines += [ "", ( f"CRITICAL — dialect fidelity: do NOT use, mix, or substitute words " f"from {forbidden}. If you are not confident a word belongs to " f"{full}, rephrase using simpler vocabulary you are certain of, or " f"apologise briefly in {full} (for example that you did not " f"understand)." ), ] if anchors: lines += [ "", f"Reference phrases in {full} — these pairs are STYLE/ORTHOGRAPHY " "examples ONLY (showing how English/French maps to the correct " "dialect). Do NOT treat them as a translation task: when the user " "writes one of these source phrases, do not just output its target " "verbatim — instead REPLY conversationally in the same dialectal " "style:", ] for item in anchors: src = item.get("source", "").strip() tgt = item.get("target", "").strip() if src and tgt: lines.append(f"- {src} → {tgt}") if extra_examples: lines += [ "", "Additional reference phrases relevant to the current user input " f"(curated gold {full} translations — STYLE references only, not a " "translation task; reply conversationally, do not echo the target " "verbatim):", ] for item in extra_examples: src = (item.get("source") or "").strip() tgt = (item.get("target") or "").strip() if src and tgt: lines.append(f"- {src} → {tgt}") lines += [ "", f"Always reply in {full}, even if the user writes to you in English, " "French, or another language. Never translate your own reply.", ] return "\n".join(lines) class MinimalClient: """Dialect-anchored plain-text LLM client over HF Serverless Inference. Usage: client = MinimalClient(model_id="CohereLabs/aya-expanse-32b", hf_token=TOK) reply = client.chat("Good morning", target_lang="bam") # → "I ni sɔgɔma. I ka kɛnɛ wa?" """ def __init__( self, model_id: str = "CohereLabs/aya-expanse-32b", hf_token: Optional[str] = None, ) -> None: self.model_id = model_id self.hf_token = hf_token self._client = None # lazy init def _get_client(self): if self._client is None: from huggingface_hub import InferenceClient self._client = InferenceClient(token=self.hf_token) return self._client def chat( self, user_text: str, target_lang: str = "bam", extra_examples: Optional[list[dict]] = None, ) -> str: """Return a plain-text reply in `target_lang`. `extra_examples` (optional) — list of {source, target} dicts that get appended to the system prompt as additional dynamic few-shot. Used by app_minimal to RAG-inject the top-K nearest phrasebook entries when the strict phrasebook short-circuit misses. On any error returns a short parenthetical error string so the caller can still feed something into TTS / display. """ system_prompt = _build_system_prompt(target_lang, extra_examples) try: client = self._get_client() completion = client.chat_completion( model=self.model_id, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_text}, ], max_tokens=256, temperature=0.3, ) raw = (completion.choices[0].message.content or "").strip() # Defensive: strip any stray code fences the model may emit anyway. if raw.startswith("```"): raw = raw.strip("`").strip() # If a language tag slipped in on the first line, drop it. if "\n" in raw: first, rest = raw.split("\n", 1) if len(first) < 20 and " " not in first: raw = rest.strip() # Defensive: strip parenthetical English/French glosses the model # sometimes appends despite the prompt — e.g. "Foo bar (Lit: ...)". # We only strip parentheticals that LOOK like glosses (start with # Lit/Literal/Meaning/Translation/English/French, or contain ≥3 # consecutive ASCII letters that aren't part of the target script). import re as _re raw = _re.sub( r"\s*\((?:lit\.?|literal(?:ly)?|meaning|translation|english|french|fr|en)[^)]*\)", "", raw, flags=_re.IGNORECASE, ).strip() return raw except Exception as exc: # pragma: no cover — surfaced to UI logger.error("MinimalClient error: %s", exc) return f"(LLM unavailable: {exc})"