""" GemmaClient — wraps the HuggingFace Serverless Inference API for Gemma. The system prompt implements the 'adult-child' logic: - The LLM is a child learning Bambara/Fula from the user (adult/teacher) - vocabulary.jsonl is its primary memory / source of truth - It detects TEACHING intent and returns structured JSON so MemoryManager can persist the new word - It answers QUESTIONS using the vocabulary it has learned Model: configurable via LLM_MODEL_ID env var. Default: Qwen/Qwen2.5-72B-Instruct — reliably available on HF Serverless free tier. Tested models that work on HF Serverless (no paid provider needed): Qwen/Qwen2.5-72B-Instruct ← default, best quality Qwen/Qwen2.5-7B-Instruct ← faster, slightly lower quality mistralai/Mistral-7B-Instruct-v0.3 HuggingFaceH4/zephyr-7b-beta google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider. """ from __future__ import annotations import json import logging import re from typing import Optional logger = logging.getLogger(__name__) SYSTEM_PROMPT_TEMPLATE = """\ You are an AI language assistant learning Bambara and Fula — two West African languages. \ You behave like an eager child learner: you absorb every word the user teaches you, \ and you use what you have already learned to answer questions. YOUR CURRENT VOCABULARY (your only source of truth): {vocabulary_context} RESPONSE RULES — always reply with a single valid JSON object, nothing else: 1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \ "X se dit Y en bambara" / "X veut dire Y"), reply: {{ "intent": "teaching", "word": "", "language": "", "translation": "", "translation_language": "", "response": "" }} 2. If the user is ASKING a question you can answer using the vocabulary: {{ "intent": "question", "response": "" }} 3. For general CONVERSATION or GREETING: {{ "intent": "conversation", "response": "" }} Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\ """ class GemmaClient: """Calls Gemma via HF Serverless Inference API.""" def __init__( self, model_id: str = "Qwen/Qwen2.5-72B-Instruct", hf_token: Optional[str] = None, ) -> None: self.model_id = model_id self.hf_token = hf_token self._client = None # lazy init def _get_client(self): if self._client is None: from huggingface_hub import InferenceClient self._client = InferenceClient(token=self.hf_token) return self._client def chat(self, user_text: str, vocabulary_context: str) -> dict: """ Send a message and get a structured response back. Returns a dict with at minimum: intent, response. On any error returns: {"intent": "error", "response": } """ system_prompt = SYSTEM_PROMPT_TEMPLATE.format( vocabulary_context=vocabulary_context or "(no vocabulary yet)" ) try: client = self._get_client() completion = client.chat_completion( model=self.model_id, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_text}, ], max_tokens=512, temperature=0.4, ) raw = completion.choices[0].message.content.strip() logger.debug("Gemma raw response: %s", raw[:200]) return self._parse(raw) except Exception as exc: logger.error("GemmaClient error: %s", exc) return { "intent": "error", "response": f"(LLM unavailable: {exc})", } # ── Parsing ─────────────────────────────────────────────────────────────── def _parse(self, raw: str) -> dict: """Extract JSON from the model output — handles markdown code fences.""" # Strip markdown code fences if present text = raw.strip() fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL) if fence_match: text = fence_match.group(1) else: # Find first { ... } block brace_match = re.search(r"\{.*\}", text, re.DOTALL) if brace_match: text = brace_match.group(0) try: data = json.loads(text) if "intent" not in data: data["intent"] = "conversation" if "response" not in data: data["response"] = raw # fall back to raw text return data except json.JSONDecodeError: # Return the raw text as a conversation response return {"intent": "conversation", "response": raw}