Spaces:
Sleeping
Sleeping
File size: 20,276 Bytes
096b19d dd38e25 096b19d dd38e25 096b19d 082adaa 096b19d 082adaa 096b19d 61e52d7 096b19d 3657607 49910a9 3657607 49910a9 096b19d 3657607 096b19d 3657607 096b19d 3657607 096b19d 3657607 096b19d 3657607 096b19d dd38e25 49910a9 dd38e25 096b19d 49910a9 dd38e25 49910a9 096b19d dd38e25 096b19d dd38e25 096b19d dd38e25 096b19d dd38e25 49910a9 dd38e25 49910a9 096b19d dd38e25 096b19d 3657607 dd38e25 096b19d dd38e25 096b19d dd38e25 096b19d dd38e25 096b19d 49910a9 dd38e25 49910a9 dd38e25 082adaa 096b19d 082adaa 096b19d 082adaa 096b19d 49910a9 3657607 082adaa 096b19d 3657607 49910a9 082adaa dd38e25 49910a9 082adaa 096b19d 082adaa 49910a9 082adaa 096b19d dd38e25 096b19d dd38e25 49910a9 dd38e25 096b19d dd38e25 096b19d 49910a9 096b19d dd38e25 096b19d 49910a9 096b19d dd38e25 096b19d dd38e25 096b19d dd38e25 096b19d 49910a9 dd38e25 096b19d 49910a9 dd38e25 096b19d 49910a9 dd38e25 096b19d dd38e25 096b19d dd38e25 49910a9 096b19d 61e52d7 096b19d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 
293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 | """
Sahel-Voice-Lab — Internal Edition (Phase 2: Voice Output)
Stack (100% non-Meta):
STT : openai/whisper-large-v3-turbo
LLM : Qwen/Qwen2.5-72B-Instruct (or LLM_MODEL_ID env var)
TTS : MALIBA-AI/bambara-tts (Bambara) | ous-sow/fula-tts (Fula, after training)
Store: HF Dataset ous-sow/sahel-agri-feedback → vocabulary.jsonl
Flow:
1. User presses Push-to-Talk → records audio
2. Whisper transcribes to text
3. MemoryManager injects current vocabulary into the LLM's system prompt
4. The LLM (called through GemmaClient) returns structured JSON with an intent:
teaching → MemoryManager.add_word_pair() → push to Hub
question → answer using vocabulary
conversation → natural reply
5. The reply is synthesised with TTS and, when a voice embedding is available, voice-cloned
6. UI shows the LLM's reply + last 5 learned words
"""
from __future__ import annotations
import logging
import os
import sys
import threading
from pathlib import Path
logger = logging.getLogger(__name__)
import gradio as gr
ROOT = Path(__file__).parent
sys.path.insert(0, str(ROOT))
# ── Env ───────────────────────────────────────────────────────────────────────
# Hub credentials and model / repo identifiers; each one can be overridden
# through an environment variable of the same name.
HF_TOKEN = os.getenv("HF_TOKEN")
FEEDBACK_REPO_ID = os.getenv("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
WHISPER_MODEL_ID = os.getenv("WHISPER_MODEL_ID", "openai/whisper-large-v3-turbo")
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Qwen/Qwen2.5-72B-Instruct")

# Human-readable names for the language codes used throughout the app.
LANGUAGE_NAMES = dict(
    bam="Bambara",
    ful="Fula / Pular",
    fr="French",
    en="English",
)
# ── Singletons ────────────────────────────────────────────────────────────────
from src.memory.memory_manager import MemoryManager
from src.llm.gemma_client import GemmaClient
from src.tts.waxal_tts import WaxalTTSEngine
from src.tts.voice_cloner import VoiceCloner
from src.voice.speaker_profiles import SpeakerProfileManager
from src.engine.stt_processor import (
transcribe_with_confidence,
LOW_CONFIDENCE_THRESHOLD,
CONFUSION_PROMPT,
)
from src.engine.curiosity import CuriosityEngine
# Process-wide service objects. They are constructed once, at import time,
# from the environment-derived constants above and shared by every handler.
_memory = MemoryManager(repo_id=FEEDBACK_REPO_ID, hf_token=HF_TOKEN)  # vocabulary store
_gemma = GemmaClient(model_id=LLM_MODEL_ID, hf_token=HF_TOKEN)  # LLM chat client
_tts = WaxalTTSEngine()  # text-to-speech
_voice_cloner = VoiceCloner()  # tone-colour extraction / conversion
_speaker_profiles = SpeakerProfileManager()  # per-speaker and collective embeddings
# NOTE(review): interval=5 presumably means "ask every 5 interactions" — confirm
# against CuriosityEngine.
_curiosity = CuriosityEngine(interval=5)
# Whisper — loaded lazily in background
# `_whisper_model` doubles as the readiness flag: handlers treat None as
# "not usable yet".
_whisper_model = None
_whisper_processor = None
# Taken by _ensure_whisper() while deciding whether to start a load, and by
# _transcribe() around the actual inference call.
_whisper_lock = threading.Lock()
# Free-text state read by the UI; the code matches on the substrings
# "loading", "ready" and "error".
_whisper_status = "not loaded"
# ── Whisper loading ───────────────────────────────────────────────────────────
def _do_load_whisper() -> None:
    """Load the Whisper processor and model and publish the outcome in module globals.

    Meant to run on the background thread started by ``_ensure_whisper``.

    On success ``_whisper_processor`` and ``_whisper_model`` are set and
    ``_whisper_status`` becomes ``"ready (<model id>)"``. On any failure,
    including a missing or broken ``torch`` / ``transformers`` install, the
    status becomes ``"error: <exception>"`` and neither global is modified.
    """
    global _whisper_model, _whisper_processor, _whisper_status

    _whisper_status = "loading…"
    try:
        # The imports live inside the ``try`` so that an import failure is
        # reported through the status string instead of killing the thread
        # and leaving the status stuck on "loading…".
        import torch  # noqa: F401  (kept so a missing torch fails here, early)

        try:
            from transformers.models.whisper import (
                WhisperForConditionalGeneration,
                WhisperProcessor,
            )
        except ImportError:
            # Fall back to the concrete submodules when the package-level
            # re-exports are unavailable.
            from transformers.models.whisper.processing_whisper import WhisperProcessor
            from transformers.models.whisper.modeling_whisper import (
                WhisperForConditionalGeneration,
            )

        processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_ID, token=HF_TOKEN)
        model = WhisperForConditionalGeneration.from_pretrained(
            WHISPER_MODEL_ID, token=HF_TOKEN
        )
        model.eval()
    except Exception as exc:
        logger.exception("Whisper load failed")
        _whisper_status = f"error: {exc}"
        return

    # Publish only once everything succeeded. The processor goes first because
    # callers treat a non-None `_whisper_model` as "ready to transcribe".
    _whisper_processor = processor
    _whisper_model = model
    _whisper_status = f"ready ({WHISPER_MODEL_ID})"
# Minimum time (seconds) a failed load stays visible before it is retried.
_WHISPER_RETRY_COOLDOWN_S = 30.0
# time.monotonic() reading taken when a failed load was first observed by
# _ensure_whisper(); None while no failure is pending.
_whisper_error_seen_at = None


def _ensure_whisper() -> str:
    """Start the background Whisper load if needed and return the current status.

    A load is started when the model is not loaded and no load is in progress.
    After a failed load the ``"error: …"`` status is returned as-is and the
    retry is held back for ``_WHISPER_RETRY_COOLDOWN_S`` seconds. Without that
    pause every caller (including the periodic status refresh) would
    immediately overwrite the error with "loading…" and start another load,
    so the failure would never be visible and would be retried in a tight loop.

    Returns:
        The value of ``_whisper_status`` after any state change.
    """
    global _whisper_status, _whisper_error_seen_at
    import time

    with _whisper_lock:
        if _whisper_model is None and "loading" not in _whisper_status:
            start_load = True
            if _whisper_status.startswith("error"):
                now = time.monotonic()
                if _whisper_error_seen_at is None:
                    # First observation of this failure: report it, retry later.
                    _whisper_error_seen_at = now
                    start_load = False
                elif now - _whisper_error_seen_at < _WHISPER_RETRY_COOLDOWN_S:
                    start_load = False
            if start_load:
                _whisper_error_seen_at = None
                # Set the status before the thread starts so a concurrent
                # caller cannot start a second load.
                _whisper_status = "loading…"
                threading.Thread(target=_do_load_whisper, daemon=True).start()
    return _whisper_status
def _whisper_status_label() -> str:
    """Return the Whisper status prefixed with a coloured indicator for the UI.

    Calls ``_ensure_whisper()``, so asking for the label may also trigger
    loading. The indicator is chosen by the first keyword found in the status
    text (ready → green, loading → yellow, error → red); anything else is white.
    """
    status = _ensure_whisper()
    indicators = (("ready", "🟢"), ("loading", "🟡"), ("error", "🔴"))
    for keyword, icon in indicators:
        if keyword in status:
            return f"{icon} STT {status}"
    return f"⚪ STT {status}"
def _transcribe(audio_path: str, language_hint: str) -> tuple[str, float]:
    """Transcribe an audio file with Whisper and report decoding confidence.

    Args:
        audio_path: Path of the recording; it is loaded as 16 kHz mono.
        language_hint: Language code. ``"bam"`` and ``"ful"`` are decoded
            without a forced language prompt.

    Returns:
        ``(text, avg_logprob)`` as produced by ``transcribe_with_confidence``,
        or ``("", 0.0)`` when the Whisper model has not been loaded yet.
        Callers compare ``avg_logprob`` with ``LOW_CONFIDENCE_THRESHOLD``.
    """
    if _whisper_model is None:
        return "", 0.0

    import librosa

    samples, _ = librosa.load(audio_path, sr=16_000, mono=True)

    # Whisper has no Bambara/Fula tokens, so a decoder prompt is only requested
    # for the other languages; any failure falls back to no prompt.
    decoder_prompt = None
    if language_hint not in ("bam", "ful"):
        try:
            decoder_prompt = _whisper_processor.get_decoder_prompt_ids(
                language=language_hint, task="transcribe"
            )
        except Exception:
            decoder_prompt = None

    # Inference runs under the module-wide Whisper lock.
    with _whisper_lock:
        text, avg_logprob = transcribe_with_confidence(
            samples,
            _whisper_model,
            _whisper_processor,
            decoder_prompt,
        )
    return text, avg_logprob
# ── Core pipeline ─────────────────────────────────────────────────────────────
def _run_llm_and_tts(
    transcript: str,
    lang_code: str,
    history: list,
    source_label: str,
    active_se=None,
) -> tuple:
    """Shared core of both input paths: LLM → memory update → TTS → optional cloning.

    Args:
        transcript: User utterance (typed text or STT output).
        lang_code: Language code used for TTS and as the default language of a
            taught word.
        history: Chat history as a list of ``{"role", "content"}`` dicts, or
            ``None``. It is copied, never mutated.
        source_label: Origin of the input (``"voice"`` / ``"text"``). Currently
            unused; kept for interface compatibility.
        active_se: Tone-colour embedding to clone the reply into, or ``None``
            to keep the base TTS voice.

    Returns:
        ``(history, recent_words_md, status_msg, audio_or_None)``.
    """
    # 1. Ask the LLM, giving it the current vocabulary as context.
    vocab_ctx = _memory.get_vocabulary_context()
    llm_result = _gemma.chat(transcript, vocab_ctx)
    intent = llm_result.get("intent", "conversation")
    response = llm_result.get("response", "…")

    # 2. Persist a taught word. Remember whether anything was actually stored
    #    so the status line below does not claim a save that never happened.
    word_saved = False
    if intent == "teaching":
        word = llm_result.get("word", transcript)
        lang = llm_result.get("language", lang_code)
        trans = llm_result.get("translation", "")
        trans_l = llm_result.get("translation_language", "en")
        if word and trans:
            _memory.add_word_pair(word, lang, trans, trans_l, source="user_taught")
            word_saved = True

    # 3. TTS, then optional voice cloning. Both steps signal "unavailable" by
    #    returning None, in which case the previous stage's audio is kept.
    audio_out = None
    tts_result = _tts.synthesize(response, lang_code)
    if tts_result is not None:
        audio_np, sr = tts_result
        if active_se is not None:
            cloned = _voice_cloner.convert(audio_np, sr, active_se)
            if cloned is not None:
                audio_np, sr = cloned
        audio_out = WaxalTTSEngine.audio_to_gradio(audio_np, sr)

    # 4. Append this exchange to a copy of the chat history.
    history = list(history or [])
    history.append(
        {
            "role": "user",
            "content": f"[{LANGUAGE_NAMES.get(lang_code, lang_code)}] {transcript}",
        }
    )
    history.append({"role": "assistant", "content": response})

    # 5. Curiosity check: the engine may return a follow-up question about a
    #    vocabulary gap, which is shown as an extra assistant message.
    curiosity_q = _curiosity.maybe_ask(_memory, _gemma)
    if curiosity_q:
        history.append({"role": "assistant", "content": f"🌱 {curiosity_q}"})

    # Explicit None check: the audio payload may contain an array, whose
    # truthiness is not a reliable "is present" test.
    tts_status = "" if audio_out is not None else " (TTS not available for this language yet)"

    if intent == "teaching" and not word_saved:
        status_msg = (
            "⚠️ Teaching detected, but the word or translation was missing — "
            f"nothing saved.{tts_status}"
        )
    else:
        status_msg = {
            "teaching": f"✅ Word learned and saved!{tts_status}",
            "question": f"💬 Answered from vocabulary.{tts_status}",
            "conversation": f"💬 Replied.{tts_status}",
            "error": "⚠️ LLM error.",
        }.get(intent, f"💬 Replied.{tts_status}")

    return history, _render_recent_words(), status_msg, audio_out
def process_audio(
    audio_path,
    language_label: str,
    voice_mode: str,
    history: list,
) -> tuple:
    """Voice input path: audio → speaker ID → Whisper STT → LLM → TTS → voice clone.

    Args:
        audio_path: File path of the recording, or ``None`` if nothing was recorded.
        language_label: Dropdown label, e.g. ``"Bambara (bam)"``.
        voice_mode: ``"Individual"`` or ``"Collective"``.
        history: Current chat history.

    Returns:
        ``(history, recent_words_md, status_msg, audio_out)``. On any early
        exit or error the history is returned unchanged and the audio is ``None``.
    """
    try:
        if audio_path is None:
            return history, _render_recent_words(), "⚠️ No audio recorded.", None

        lang_code = _label_to_code(language_label)

        # Make sure a Whisper load has been requested; bail out until it is done.
        stt_state = _ensure_whisper()
        if _whisper_model is None:
            return (
                history,
                _render_recent_words(),
                f"⏳ {stt_state} — wait a moment and try again.",
                None,
            )

        # Decode the recording for speaker identification and SE extraction
        # (the STT step below reloads it from `audio_path`).
        import librosa

        sample_rate = 16_000
        waveform, _ = librosa.load(audio_path, sr=sample_rate, mono=True)

        # ── Speaker identification ────────────────────────────────────────
        speaker_id, _ = _speaker_profiles.identify_or_create(waveform)

        # Refresh the speaker's stored tone-colour embedding when one can be
        # extracted from this recording.
        if speaker_id is not None:
            tone_se = _voice_cloner.extract_se(waveform, sample_rate)
            if tone_se is not None:
                _speaker_profiles.update_ov_embedding(speaker_id, tone_se)

        # ── Target voice: the speaker's own, or the collective blend ──────
        use_own_voice = voice_mode == "Individual" and speaker_id is not None
        active_se = (
            _speaker_profiles.get_openvoice_se(speaker_id)
            if use_own_voice
            else _speaker_profiles.get_collective_embedding()
        )

        # ── Transcription with confidence scoring ─────────────────────────
        transcript, avg_logprob = _transcribe(audio_path, lang_code)
        if not transcript:
            return history, _render_recent_words(), "⚠️ Could not transcribe audio.", None

        if avg_logprob < LOW_CONFIDENCE_THRESHOLD:
            # Too uncertain to trust: send the canned confusion prompt instead.
            logger.info(
                "Low STT confidence (avg_logprob=%.3f) — switching to confusion prompt",
                avg_logprob,
            )
            transcript = CONFUSION_PROMPT

        return _run_llm_and_tts(transcript, lang_code, history, "voice", active_se)
    except Exception as exc:
        logger.exception("process_audio error")
        return history, _render_recent_words(), f"❌ Error: {exc}", None
def process_text(text: str, language_label: str, voice_mode: str, history: list) -> tuple:
    """Text input path: LLM → TTS → optional voice clone.

    Args:
        text: Message typed by the user. ``None``, empty or whitespace-only
            input is rejected with a friendly prompt instead of an error.
        language_label: Dropdown label, e.g. ``"Bambara (bam)"``.
        voice_mode: Accepted for signature parity with ``process_audio``;
            unused because typed text carries no speaker signal.
        history: Current chat history.

    Returns:
        ``(history, recent_words_md, status_msg, audio_out)``.
    """
    try:
        # `text or ""` guards against None, which would otherwise raise
        # AttributeError and surface as a generic "❌ Error".
        message = (text or "").strip()
        if not message:
            return history, _render_recent_words(), "⚠️ Please type something.", None

        lang_code = _label_to_code(language_label)
        # Text has no speaker signal — use the collective voice in both modes.
        active_se = _speaker_profiles.get_collective_embedding()
        return _run_llm_and_tts(message, lang_code, history, "text", active_se)
    except Exception as exc:
        logger.exception("process_text error")
        return history, _render_recent_words(), f"❌ Error: {exc}", None
# ── Helpers ───────────────────────────────────────────────────────────────────
# Labels offered in the language dropdown, in display order.
LANGUAGE_CHOICES = ["Bambara (bam)", "Fula (ful)", "French (fr)", "English (en)"]


def _label_to_code(label: str) -> str:
    """Translate a dropdown label into its language code.

    Unknown labels fall back to ``"bam"``.
    """
    known_labels = dict(
        (
            ("Bambara (bam)", "bam"),
            ("Fula (ful)", "ful"),
            ("French (fr)", "fr"),
            ("English (en)", "en"),
        )
    )
    try:
        return known_labels[label]
    except KeyError:
        return "bam"
def _render_recent_words() -> str:
    """Build the Markdown panel listing the most recently learned words.

    Asks the memory manager for up to five recent entries and renders them in
    reverse of the order returned (presumably newest first — confirm against
    ``MemoryManager.get_recent``). Returns a hint on how to teach a word when
    there are no entries.
    """
    entries = _memory.get_recent(5)
    if not entries:
        return "_No words learned yet. Start teaching me! Say something like: **'I ni ce means hello in Bambara'**_"

    def _format(entry) -> str:
        # Show the display name when the language code is known, else the raw code.
        code = entry.get("language", "?")
        language = LANGUAGE_NAMES.get(code, code)
        word = entry.get("word", "")
        meaning = entry.get("translation", "")
        meaning_lang = entry.get("translation_language", "")
        return f"**{word}** `[{language}]` → {meaning} `({meaning_lang})`"

    rows = [_format(entry) for entry in reversed(entries)]
    return "\n\n".join(["### 📖 Last 5 words learned\n", *rows])
# ── UI ────────────────────────────────────────────────────────────────────────
def build_ui() -> gr.Blocks:
    """Assemble the Gradio interface and wire its events.

    Layout: a left column with the status box, language and voice-mode
    selectors, the two input tabs (microphone / text), the last-action line and
    the audio player; a right column with the recent-words panel and the chat.

    Every handler writes the chat history into a ``gr.State`` first; a chained
    ``.then`` step then copies that state into the visible chatbot.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` app.
    """
    with gr.Blocks(title="Sahel-Voice-Lab", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            "# 🌍 Sahel-Voice-Lab — Internal Edition\n"
            "**Phase 1 · The Memory Loop**  \n"
            "Teach me Bambara and Fula — I will remember every word you share."
        )
        with gr.Row():
            # ── Left column: input + voice output ────────────────────────────
            with gr.Column(scale=2):
                # Two-line summary of every subsystem's state. Called once for
                # the initial value and then on every timer tick.
                def _full_status() -> str:
                    stt = _whisper_status_label()
                    tts = _tts.get_status()
                    # green = "ready", yellow = state text contains "not", red = anything else
                    bam = "🟢" if tts["bam"] == "ready" else ("🟡" if "not" in tts["bam"] else "🔴")
                    ful = "🟢" if tts["ful"] == "ready" else ("🟡" if "not" in tts["ful"] else "🔴")
                    spk = _speaker_profiles.get_status()
                    # NOTE(review): reads private attributes of VoiceCloner.
                    cln = "🟢 Cloner" if _voice_cloner._ready else (
                        "🔴 Cloner" if _voice_cloner._error else "🟡 Cloner")
                    return f"{stt} | TTS Bambara {bam} | TTS Fula {ful}\n{spk} | {cln}"
                status_box = gr.Textbox(
                    value=_full_status(),
                    label="System status",
                    interactive=False,
                    max_lines=2,
                )
                # Refresh the status box periodically (timer value: 4).
                status_timer = gr.Timer(value=4)
                status_timer.tick(fn=_full_status, outputs=status_box)
                language_dd = gr.Dropdown(
                    choices=LANGUAGE_CHOICES,
                    value="Bambara (bam)",
                    label="Language you are speaking",
                )
                voice_mode_radio = gr.Radio(
                    choices=["Individual", "Collective"],
                    value="Individual",
                    label="Voice Mode",
                    info=(
                        "Individual — respond in the voice of the last speaker detected. "
                        "Collective — blend all known voices into one shared voice."
                    ),
                )
                with gr.Tab("🎙️ Push-to-Talk"):
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Hold to record — release to send",
                    )
                    talk_btn = gr.Button("▶ Send Recording", variant="primary", size="lg")
                with gr.Tab("⌨️ Type instead"):
                    text_input = gr.Textbox(
                        lines=3,
                        placeholder=(
                            "Type a message or teach me a word.\n"
                            "Examples:\n"
                            " 'I ni ce means hello in Bambara'\n"
                            " 'Jam waali veut dire bonjour en Fula'\n"
                            " 'How do you say rain in Bambara?'"
                        ),
                        label="Message",
                    )
                    text_btn = gr.Button("▶ Send", variant="primary")
                action_status = gr.Textbox(
                    label="Last action", interactive=False, max_lines=1
                )
                # Voice response output
                audio_output = gr.Audio(
                    label="🔊 Voice response",
                    autoplay=True,
                    interactive=False,
                    visible=True,
                )
                gr.Markdown(
                    "**Teaching tips:**\n"
                    "- *'I ni ce means hello in Bambara'*\n"
                    "- *'Jam waali veut dire bonjour en Fula'*\n"
                    "- *'How do you say rain in Bambara?'*\n\n"
                    "Every new word is saved to the Hub automatically.\n\n"
                    "**TTS note:** Bambara voice is ready. "
                    "Fula voice requires running `notebooks/train_fula_tts.ipynb` on Kaggle first."
                )
            # ── Right column: memory + chat ───────────────────────────────────
            with gr.Column(scale=3):
                recent_words = gr.Markdown(value=_render_recent_words())
                gr.Markdown("---")
                chatbot = gr.Chatbot(
                    label="Conversation",
                    height=420,
                    type="messages",
                    bubble_full_width=False,
                )
                clear_btn = gr.Button("🗑️ Clear conversation", size="sm", variant="secondary")
        # ── Wiring ────────────────────────────────────────────────────────────
        # The history lives in this state; the chatbot only mirrors it.
        history_state = gr.State([])
        # Microphone path: run the pipeline, then mirror the history into the chat.
        talk_btn.click(
            fn=process_audio,
            inputs=[audio_input, language_dd, voice_mode_radio, history_state],
            outputs=[history_state, recent_words, action_status, audio_output],
        ).then(
            fn=lambda h: h,
            inputs=[history_state],
            outputs=[chatbot],
        )
        # Text path via the button: the follow-up step also clears the input box.
        text_btn.click(
            fn=process_text,
            inputs=[text_input, language_dd, voice_mode_radio, history_state],
            outputs=[history_state, recent_words, action_status, audio_output],
        ).then(
            fn=lambda h: (h, ""),
            inputs=[history_state],
            outputs=[chatbot, text_input],
        )
        # Text path via the textbox submit event: same wiring as the button.
        text_input.submit(
            fn=process_text,
            inputs=[text_input, language_dd, voice_mode_radio, history_state],
            outputs=[history_state, recent_words, action_status, audio_output],
        ).then(
            fn=lambda h: (h, ""),
            inputs=[history_state],
            outputs=[chatbot, text_input],
        )
        # Clear: reset history, status line and audio, re-render the word
        # panel, then empty the chat. Learned vocabulary is not touched here.
        clear_btn.click(
            fn=lambda: ([], _render_recent_words(), "", None),
            outputs=[history_state, recent_words, action_status, audio_output],
        ).then(fn=lambda: [], outputs=[chatbot])
    return demo
# ── Entry point ───────────────────────────────────────────────────────────────
# The calls below sit at module level, outside the `__main__` guard, so they
# run whenever this module is imported, not only when it is executed directly.
# Load vocabulary at startup on a daemon thread so the UI is not blocked.
threading.Thread(target=_memory.load, daemon=True).start()
# Begin loading Whisper immediately (starts the background loader thread).
_ensure_whisper()
# Preload TTS models.
# NOTE(review): presumably non-blocking — confirm in WaxalTTSEngine.preload.
_tts.preload()
# Preload speaker identification.
# NOTE(review): the SpeechBrain ECAPA-TDNN backend is not visible from this
# file — confirm in SpeakerProfileManager.
_speaker_profiles.preload()
# Preload the voice cloner.
# NOTE(review): original comment says OpenVoice V2, degrading gracefully if
# unavailable — confirm in VoiceCloner.preload.
_voice_cloner.preload()
if __name__ == "__main__":
    # Local run: read a .env file, refresh the settings and launch the server.
    #
    # NOTE(review): load_dotenv() runs only here, after the module-level
    # singletons were constructed and the preload calls were issued with the
    # pre-.env environment. Values from .env therefore reach only what is
    # patched below: the LLM model id inside `_gemma` is not updated, and the
    # vocabulary / Whisper loader threads may already have read the old
    # values. Consider loading .env before the constants are first read.
    from dotenv import load_dotenv
    load_dotenv()
    # Re-read the settings now that .env has been merged into the environment.
    HF_TOKEN = os.environ.get("HF_TOKEN")
    FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
    WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-large-v3-turbo")
    LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-72B-Instruct")
    # Push the refreshed values into the already-built singletons
    # (writes to their private attributes).
    _memory._hf_token = HF_TOKEN
    _memory._repo_id = FEEDBACK_REPO_ID
    _gemma._hf_token = HF_TOKEN
    # Startup banner.
    print(f"STT model : {WHISPER_MODEL_ID}")
    print(f"LLM model : {LLM_MODEL_ID}")
    print(f"Store : {FEEDBACK_REPO_ID}")
    print(f"HF_TOKEN : {'set' if HF_TOKEN else 'NOT SET — Hub push disabled'}")
    print()
    demo = build_ui()
    # Port 7860, no browser auto-open, no public share link, API page hidden,
    # server-side rendering off.
    demo.launch(
        server_port=7860,
        inbrowser=False,
        share=False,
        show_api=False,
        ssr_mode=False,
    )
|