ground-zero / src /llm /gemma_client.py
jefffffff9
Fix: switch LLM to Qwen2.5-72B-Instruct (Gemma not on HF free tier)
61e52d7
"""
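GemmaClient — wraps the HuggingFace Serverless Inference API for a chat LLM
(the class was originally written for Gemma and keeps that name; the default model is now Qwen).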
The system prompt implements the 'adult-child' logic:
- The LLM is a child learning Bambara/Fula from the user (adult/teacher)
- vocabulary.jsonl is its primary memory / source of truth
- It detects TEACHING intent and returns structured JSON so MemoryManager
can persist the new word
- It answers QUESTIONS using the vocabulary it has learned
Model: configurable via LLM_MODEL_ID env var.
Default: Qwen/Qwen2.5-72B-Instruct — reliably available on HF Serverless free tier.
Tested models that work on HF Serverless (no paid provider needed):
Qwen/Qwen2.5-72B-Instruct ← default, best quality
Qwen/Qwen2.5-7B-Instruct ← faster, slightly lower quality
mistralai/Mistral-7B-Instruct-v0.3
HuggingFaceH4/zephyr-7b-beta
google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# System prompt sent with every chat request. It is rendered with str.format():
# {vocabulary_context} is the only placeholder, and every doubled brace
# ({{ / }}) is an escape that becomes a literal JSON brace in the final prompt.
# A trailing backslash inside the literal joins that line with the next one.
SYSTEM_PROMPT_TEMPLATE = """\
You are an AI language assistant learning Bambara and Fula — two West African languages. \
You behave like an eager child learner: you absorb every word the user teaches you, \
and you use what you have already learned to answer questions.
YOUR CURRENT VOCABULARY (your only source of truth):
{vocabulary_context}
RESPONSE RULES — always reply with a single valid JSON object, nothing else:
1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \
"X se dit Y en bambara" / "X veut dire Y"), reply:
{{
"intent": "teaching",
"word": "<the word/phrase being taught>",
"language": "<bam | ful | fr | en>",
"translation": "<the translation given>",
"translation_language": "<bam | ful | fr | en>",
"response": "<warm acknowledgment in the same language the user used, \
1-2 sentences, use the word in a sentence if possible>"
}}
2. If the user is ASKING a question you can answer using the vocabulary:
{{
"intent": "question",
"response": "<answer using vocabulary — be honest if you don't know>"
}}
3. For general CONVERSATION or GREETING:
{{
"intent": "conversation",
"response": "<natural, friendly reply — 1-3 sentences>"
}}
Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\
"""
class GemmaClient:
    """Chat client for an instruction-tuned LLM on the HF Serverless Inference API.

    The class keeps its historical name; the model actually called is whatever
    ``model_id`` is given (default ``Qwen/Qwen2.5-72B-Instruct``).
    """

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-72B-Instruct",
        hf_token: Optional[str] = None,
    ) -> None:
        """Store the configuration; no inference client is created yet.

        Args:
            model_id: Model identifier passed to every chat completion call.
            hf_token: Token handed to ``InferenceClient``; may be ``None``.
        """
        self.model_id = model_id
        self.hf_token = hf_token
        self._client = None  # lazy init

    def _get_client(self):
        """Return the cached ``InferenceClient``, creating it on first use."""
        if self._client is None:
            # Imported here rather than at module top, so huggingface_hub is
            # only needed once a request is actually made.
            from huggingface_hub import InferenceClient

            self._client = InferenceClient(token=self.hf_token)
        return self._client

    def chat(self, user_text: str, vocabulary_context: str) -> dict:
        """Send one user message and return the model's structured reply.

        Args:
            user_text: The message to send as the ``user`` turn.
            vocabulary_context: Text substituted into the system prompt; a
                falsy value is replaced by ``"(no vocabulary yet)"``.

        Returns:
            A dict with at minimum ``intent`` and ``response``. On any error
            the dict is ``{"intent": "error", "response": <error message>}``.
        """
        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            vocabulary_context=vocabulary_context or "(no vocabulary yet)"
        )
        try:
            client = self._get_client()
            completion = client.chat_completion(
                model=self.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ],
                max_tokens=512,
                temperature=0.4,
            )
            raw = completion.choices[0].message.content.strip()
            logger.debug("Gemma raw response: %s", raw[:200])
            return self._parse(raw)
        except Exception as exc:
            # Deliberate catch-all boundary: callers always get a dict back,
            # never an exception.
            logger.error("GemmaClient error: %s", exc)
            return {
                "intent": "error",
                "response": f"(LLM unavailable: {exc})",
            }

    # ── Parsing ───────────────────────────────────────────────────────────────
    def _parse(self, raw: str) -> dict:
        """Extract the JSON object from the model output.

        Handles output wrapped in a markdown code fence, and output where the
        JSON object is surrounded by other text.

        Args:
            raw: The model's reply text.

        Returns:
            The decoded JSON object, with ``intent`` defaulting to
            ``"conversation"`` and ``response`` defaulting to ``raw`` when
            missing. If no JSON object can be decoded, returns
            ``{"intent": "conversation", "response": raw}``.
        """
        fallback = {"intent": "conversation", "response": raw}

        text = raw.strip()
        # Prefer the contents of a ```json ... ``` fence if present.
        fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
        if fence_match:
            text = fence_match.group(1)
        else:
            # Otherwise take everything from the first "{" to the last "}".
            brace_match = re.search(r"\{.*\}", text, re.DOTALL)
            if brace_match:
                text = brace_match.group(0)

        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Not JSON at all: treat the raw text as a conversational reply.
            return fallback

        # Brace-free output can still be valid JSON of another type (a number,
        # null, a quoted string, a list). Such values cannot carry "intent" or
        # "response" keys, so treat them the same as unparseable text.
        if not isinstance(data, dict):
            return fallback

        data.setdefault("intent", "conversation")
        data.setdefault("response", raw)  # fall back to raw text
        return data