Spaces:

MataStrategy
/

ground-zero

Sleeping

jefffffff9

Fix: switch LLM to Qwen2.5-72B-Instruct (Gemma not on HF free tier)

61e52d7 about 2 months ago

5.28 kB

	"""
	GemmaClient — wraps the HuggingFace Serverless Inference API for a chat LLM
	(originally Gemma; the class name is kept, but the default model is now Qwen).

	The system prompt implements the 'adult-child' logic:
	- The LLM is a child learning Bambara/Fula from the user (adult/teacher)
	- vocabulary.jsonl is its primary memory / source of truth
	- It detects TEACHING intent and returns structured JSON so MemoryManager
	can persist the new word
	- It answers QUESTIONS using the vocabulary it has learned

	Model: configurable via LLM_MODEL_ID env var.
	Default: Qwen/Qwen2.5-72B-Instruct — reliably available on HF Serverless free tier.

	Tested models that work on HF Serverless (no paid provider needed):
	Qwen/Qwen2.5-72B-Instruct ← default, best quality
	Qwen/Qwen2.5-7B-Instruct ← faster, slightly lower quality
	mistralai/Mistral-7B-Instruct-v0.3
	HuggingFaceH4/zephyr-7b-beta

	google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider.
	"""
	from __future__ import annotations

	import json
	import logging
	import re
	from typing import Optional

	logger = logging.getLogger(__name__)

	# System prompt sent with every chat request.
	# - ``{vocabulary_context}`` is the only str.format() placeholder; it is filled
	#   in by GemmaClient.chat().
	# - Literal JSON braces are doubled (``{{`` / ``}}``) so they survive formatting.
	# - The language placeholders use a plain ``|`` separator: ``\|`` is an invalid
	#   escape sequence in a normal string literal and would put stray backslashes
	#   into the prompt.
	SYSTEM_PROMPT_TEMPLATE = """\
	You are an AI language assistant learning Bambara and Fula — two West African languages. \
	You behave like an eager child learner: you absorb every word the user teaches you, \
	and you use what you have already learned to answer questions.

	YOUR CURRENT VOCABULARY (your only source of truth):
	{vocabulary_context}

	RESPONSE RULES — always reply with a single valid JSON object, nothing else:

	1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \
	"X se dit Y en bambara" / "X veut dire Y"), reply:
	{{
	"intent": "teaching",
	"word": "<the word/phrase being taught>",
	"language": "<bam | ful | fr | en>",
	"translation": "<the translation given>",
	"translation_language": "<bam | ful | fr | en>",
	"response": "<warm acknowledgment in the same language the user used, \
	1-2 sentences, use the word in a sentence if possible>"
	}}

	2. If the user is ASKING a question you can answer using the vocabulary:
	{{
	"intent": "question",
	"response": "<answer using vocabulary — be honest if you don't know>"
	}}

	3. For general CONVERSATION or GREETING:
	{{
	"intent": "conversation",
	"response": "<natural, friendly reply — 1-3 sentences>"
	}}

	Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\
	"""


	class GemmaClient:
	    """Chat client for an instruction-tuned LLM on the HF Serverless Inference API.

	    The class keeps its ``Gemma`` name, but the model that is actually called
	    is whatever ``model_id`` names; the default is ``Qwen/Qwen2.5-72B-Instruct``.
	    """

	    # JSON object wrapped in a markdown code fence (``` or ```json). The lazy
	    # ``.*?`` stops at the first "}" that is directly followed by the closing
	    # fence, so nested objects inside the fence are still captured whole.
	    _FENCE_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)
	    # Fallback when there is no fence: widest span from the first "{" to the
	    # last "}" in the text.
	    _BRACE_RE = re.compile(r"\{.*\}", re.DOTALL)

	    def __init__(
	        self,
	        model_id: str = "Qwen/Qwen2.5-72B-Instruct",
	        hf_token: Optional[str] = None,
	    ) -> None:
	        """Store the configuration; the HF client itself is created lazily.

	        Args:
	            model_id: Model identifier passed to ``chat_completion``.
	            hf_token: Token handed to ``InferenceClient``; may be ``None``.
	        """
	        self.model_id = model_id
	        self.hf_token = hf_token
	        self._client = None  # lazy init, see _get_client()

	    def _get_client(self):
	        """Return the cached ``InferenceClient``, creating it on first use."""
	        if self._client is None:
	            # Imported here so that constructing GemmaClient does not require
	            # huggingface_hub until a request is actually made.
	            from huggingface_hub import InferenceClient

	            self._client = InferenceClient(token=self.hf_token)
	        return self._client

	    def chat(self, user_text: str, vocabulary_context: str) -> dict:
	        """Send one user message and return the model's structured reply.

	        Args:
	            user_text: The user's message, sent as the ``user`` turn.
	            vocabulary_context: Text substituted into the system prompt; an
	                empty value is replaced by ``"(no vocabulary yet)"``.

	        Returns:
	            A dict with at minimum ``intent`` and ``response``. On any error
	            the dict is ``{"intent": "error", "response": <error message>}``.
	        """
	        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
	            vocabulary_context=vocabulary_context or "(no vocabulary yet)"
	        )

	        try:
	            client = self._get_client()
	            completion = client.chat_completion(
	                model=self.model_id,
	                messages=[
	                    {"role": "system", "content": system_prompt},
	                    {"role": "user", "content": user_text},
	                ],
	                max_tokens=512,
	                temperature=0.4,
	            )
	            raw = completion.choices[0].message.content.strip()
	            logger.debug("Gemma raw response: %s", raw[:200])
	            return self._parse(raw)

	        except Exception as exc:
	            # Deliberately broad: this method never raises, it reports every
	            # failure to the caller as an "error" intent instead.
	            logger.error("GemmaClient error: %s", exc)
	            return {
	                "intent": "error",
	                "response": f"(LLM unavailable: {exc})",
	            }

	    # ── Parsing ───────────────────────────────────────────────────────────────

	    def _parse(self, raw: str) -> dict:
	        """Extract the JSON object from the model output.

	        A fenced block (``` or ```json) is preferred; otherwise the span from
	        the first ``{`` to the last ``}`` is tried; otherwise the whole text.

	        Args:
	            raw: The model's reply text.

	        Returns:
	            The decoded JSON object, with ``intent`` defaulting to
	            ``"conversation"`` and ``response`` defaulting to ``raw``. If the
	            text is not valid JSON, or decodes to something other than an
	            object, ``{"intent": "conversation", "response": raw}``.
	        """
	        text = raw.strip()
	        fence_match = self._FENCE_RE.search(text)
	        if fence_match:
	            text = fence_match.group(1)
	        else:
	            brace_match = self._BRACE_RE.search(text)
	            if brace_match:
	                text = brace_match.group(0)

	        fallback = {"intent": "conversation", "response": raw}
	        try:
	            data = json.loads(text)
	        except json.JSONDecodeError:
	            # Not JSON at all: treat the raw text as a conversation reply.
	            return fallback

	        # Valid JSON that is not an object (e.g. "42" or "[1, 2]") cannot
	        # carry intent/response keys, so it is handled like plain text.
	        if not isinstance(data, dict):
	            return fallback

	        data.setdefault("intent", "conversation")
	        data.setdefault("response", raw)  # fall back to raw text
	        return data