""" speech_module/transcriber.py Whisper (default) and Wav2Vec2 backends. Hindi support: pass language="hi" and task="translate" to Whisper. Whisper then transcribes Hindi audio AND translates to English in one pass, so Stage 2 (spaCy NLP) receives clean English text with no extra steps. """ from __future__ import annotations import subprocess import tempfile import os from pathlib import Path from typing import Tuple import numpy as np from utils.config import config, SpeechConfig from utils.logger import logger class WhisperTranscriber: def __init__(self, cfg: SpeechConfig = None): self.cfg = cfg or config.speech self._model = None def _load(self): if self._model is None: import whisper logger.info(f"Loading Whisper '{self.cfg.whisper_model_size}' on CPU …") self._model = whisper.load_model(self.cfg.whisper_model_size, device="cpu") logger.info("Whisper ready.") return self._model def _convert_to_wav(self, audio_path: str) -> str: """ Convert any audio format to 16kHz mono WAV using ffmpeg. Required for: - Browser-recorded webm/opus (otherwise Whisper gets garbage) - Hindi audio files which may come in various formats Returns path to temp WAV file (caller must delete). """ tmp_wav = tempfile.mktemp(suffix=".wav") result = subprocess.run( ["ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", "-f", "wav", tmp_wav], capture_output=True, text=True ) if result.returncode != 0: logger.warning(f"ffmpeg conversion warning: {result.stderr[-300:]}") return tmp_wav def transcribe(self, audio_path: str | Path, language: str = None, task: str = "transcribe") -> Tuple[str, float]: """ Transcribe (and optionally translate) an audio file. Args: audio_path : Path to audio file. language : Source language code. None = auto-detect. Pass "hi" for Hindi. task : "transcribe" → output in source language. "translate" → output in English regardless of source language. For Hindi → English, pass language="hi", task="translate". Returns: (text, confidence) """ audio_path = str(audio_path) if not Path(audio_path).exists(): raise FileNotFoundError(f"Audio not found: {audio_path}") # Always convert to clean 16kHz mono WAV first tmp_wav = self._convert_to_wav(audio_path) try: model = self._load() # Build decode options with anti-hallucination settings decode_kwargs = { "fp16": False, "task": task, "temperature": 0.0, "condition_on_previous_text": False, "initial_prompt": "This is a cooking recipe with ingredients and quantities.", "suppress_tokens": "-1", "without_timestamps": True, } if language: decode_kwargs["language"] = language result = model.transcribe(tmp_wav, **decode_kwargs) text = result["text"].strip() segs = result.get("segments", []) conf = ( float(np.clip(np.exp(np.mean([s.get("avg_logprob", -1.0) for s in segs])), 0, 1)) if segs else 0.5 ) detected_lang = result.get("language", language or "unknown") logger.info( f"Whisper done. lang={detected_lang} task={task} " f"conf={conf:.2f} text={text[:80]}" ) return text, conf finally: # Always clean up the temp WAV try: os.remove(tmp_wav) except Exception: pass class Wav2Vec2Transcriber: """ Wav2Vec2 backend — English only, no translation support. For Hindi, use WhisperTranscriber with task='translate'. """ def __init__(self, cfg: SpeechConfig = None): self.cfg = cfg or config.speech self._processor = self._model = None def _load(self): if self._model is None: from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor self._processor = Wav2Vec2Processor.from_pretrained(self.cfg.wav2vec2_model) self._model = Wav2Vec2ForCTC.from_pretrained(self.cfg.wav2vec2_model) self._model.eval() return self._processor, self._model def transcribe(self, audio_path: str | Path, language: str = None, task: str = "transcribe") -> Tuple[str, float]: import torch import librosa audio_path = Path(audio_path) audio, _ = librosa.load(str(audio_path), sr=self.cfg.sample_rate, mono=True) proc, model = self._load() inputs = proc(audio, sampling_rate=self.cfg.sample_rate, return_tensors="pt", padding=True) with torch.no_grad(): logits = model(inputs.input_values).logits ids = torch.argmax(logits, dim=-1) text = proc.batch_decode(ids)[0].strip().lower() conf = float(torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item()) return text, conf class SpeechTranscriber: """ Unified facade over Whisper and Wav2Vec2. For Hindi speech → English text: transcriber = SpeechTranscriber() text, conf = transcriber.transcribe("audio.wav", language="hi", task="translate") For English speech → English text (default): text, conf = transcriber.transcribe("audio.wav") For auto-detect language → English translation: text, conf = transcriber.transcribe("audio.wav", task="translate") """ def __init__(self, cfg: SpeechConfig = None): self.cfg = cfg or config.speech self._backend = ( WhisperTranscriber(self.cfg) if self.cfg.backend == "whisper" else Wav2Vec2Transcriber(self.cfg) ) def transcribe(self, audio_path: str | Path, language: str = None, task: str = "transcribe") -> Tuple[str, float]: return self._backend.transcribe(audio_path, language=language, task=task) def transcribe_text_passthrough(self, text: str) -> Tuple[str, float]: return text.strip(), 1.0