Spaces:
Sleeping
Sleeping
| """ | |
| GemmaClient — wraps the HuggingFace Serverless Inference API for chat models (despite the name, the default model is Qwen, not Gemma — see below). | |
| The system prompt implements the 'adult-child' logic: | |
| - The LLM is a child learning Bambara/Fula from the user (adult/teacher) | |
| - vocabulary.jsonl is its primary memory / source of truth | |
| - It detects TEACHING intent and returns structured JSON so MemoryManager | |
| can persist the new word | |
| - It answers QUESTIONS using the vocabulary it has learned | |
| Model: configurable via LLM_MODEL_ID env var. | |
| Default: Qwen/Qwen2.5-72B-Instruct — reliably available on HF Serverless free tier. | |
| Tested models that work on HF Serverless (no paid provider needed): | |
| Qwen/Qwen2.5-72B-Instruct ← default, best quality | |
| Qwen/Qwen2.5-7B-Instruct ← faster, slightly lower quality | |
| mistralai/Mistral-7B-Instruct-v0.3 | |
| HuggingFaceH4/zephyr-7b-beta | |
| google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import re | |
| from typing import Optional | |
# Module-level logger named after this module.
| logger = logging.getLogger(__name__) | |
# System prompt template used by GemmaClient.chat. It is rendered with
# str.format, so {vocabulary_context} is the single placeholder and the doubled
# braces ({{ and }}) are escapes that come out as literal { and } in the JSON
# examples shown to the model. A backslash at the end of a line inside the
# literal joins that line with the next one.
| SYSTEM_PROMPT_TEMPLATE = """\ | |
| You are an AI language assistant learning Bambara and Fula — two West African languages. \ | |
| You behave like an eager child learner: you absorb every word the user teaches you, \ | |
| and you use what you have already learned to answer questions. | |
| YOUR CURRENT VOCABULARY (your only source of truth): | |
| {vocabulary_context} | |
| RESPONSE RULES — always reply with a single valid JSON object, nothing else: | |
| 1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \ | |
| "X se dit Y en bambara" / "X veut dire Y"), reply: | |
| {{ | |
| "intent": "teaching", | |
| "word": "<the word/phrase being taught>", | |
| "language": "<bam | ful | fr | en>", | |
| "translation": "<the translation given>", | |
| "translation_language": "<bam | ful | fr | en>", | |
| "response": "<warm acknowledgment in the same language the user used, \ | |
| 1-2 sentences, use the word in a sentence if possible>" | |
| }} | |
| 2. If the user is ASKING a question you can answer using the vocabulary: | |
| {{ | |
| "intent": "question", | |
| "response": "<answer using vocabulary — be honest if you don't know>" | |
| }} | |
| 3. For general CONVERSATION or GREETING: | |
| {{ | |
| "intent": "conversation", | |
| "response": "<natural, friendly reply — 1-3 sentences>" | |
| }} | |
| Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\ | |
| """ | |
class GemmaClient:
    """Chat client for the HF Serverless Inference API.

    Despite the class name, the model that is called is whatever ``model_id``
    selects; the default is ``Qwen/Qwen2.5-72B-Instruct``.
    """

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-72B-Instruct",
        hf_token: Optional[str] = None,
    ) -> None:
        """Store the configuration; the API client is created on first use.

        Args:
            model_id: Model identifier passed to every chat completion call.
            hf_token: Token handed to ``InferenceClient`` when it is created.
        """
        self.model_id = model_id
        self.hf_token = hf_token
        self._client = None  # lazy init

    def _get_client(self):
        """Return the cached ``InferenceClient``, creating it on the first call."""
        if self._client is None:
            # Deferred import: huggingface_hub is only loaded when a client
            # is actually needed.
            from huggingface_hub import InferenceClient

            self._client = InferenceClient(token=self.hf_token)
        return self._client

    def chat(self, user_text: str, vocabulary_context: str) -> dict:
        """Send one user message and return the model's structured reply.

        Args:
            user_text: The user's message.
            vocabulary_context: Text substituted into the system prompt; a
                falsy value is replaced by ``"(no vocabulary yet)"``.

        Returns:
            A dict with at minimum ``intent`` and ``response``. On any error
            the dict is ``{"intent": "error", "response": <error message>}``;
            exceptions are logged and never propagate to the caller.
        """
        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            vocabulary_context=vocabulary_context or "(no vocabulary yet)"
        )
        try:
            client = self._get_client()
            completion = client.chat_completion(
                model=self.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ],
                max_tokens=512,
                temperature=0.4,
            )
            raw = completion.choices[0].message.content.strip()
            logger.debug("Gemma raw response: %s", raw[:200])
            return self._parse(raw)
        except Exception as exc:
            # Deliberate catch-all boundary: callers always get a dict back.
            logger.error("GemmaClient error: %s", exc)
            return {
                "intent": "error",
                "response": f"(LLM unavailable: {exc})",
            }

    # ── Parsing ───────────────────────────────────────────────────────────────

    def _parse(self, raw: str) -> dict:
        """Extract the JSON object from the model output.

        Handles markdown code fences and prose surrounding the object.

        Args:
            raw: The model's reply text.

        Returns:
            The decoded object with ``intent`` defaulting to
            ``"conversation"`` and ``response`` defaulting to ``raw``. When no
            JSON object can be decoded (plain text, or valid JSON that is not
            an object such as ``null`` or a bare string), returns
            ``{"intent": "conversation", "response": raw}``.
        """
        text = raw.strip()
        # Prefer the contents of a markdown code fence if one is present.
        fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
        if fence_match:
            text = fence_match.group(1)

        data = self._first_json_object(text)
        if data is None:
            # Return the raw text as a conversation response.
            return {"intent": "conversation", "response": raw}

        data.setdefault("intent", "conversation")
        data.setdefault("response", raw)  # fall back to raw text
        return data

    @staticmethod
    def _first_json_object(text: str) -> Optional[dict]:
        """Return the first JSON object that decodes from *text*, else ``None``."""
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode stops at the end of the object, so text after the
                # closing brace (even text containing braces) is ignored.
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                pass
            else:
                if isinstance(candidate, dict):
                    return candidate
            start = text.find("{", start + 1)
        return None