| """ |
| speech_module/transcriber.py |
| Whisper (default) and Wav2Vec2 backends. |
| |
| Hindi support: pass language="hi" and task="translate" to Whisper. |
| Whisper then transcribes Hindi audio AND translates to English in one pass, |
| so Stage 2 (spaCy NLP) receives clean English text with no extra steps. |
| """ |
|
|
| from __future__ import annotations |
| import subprocess |
| import tempfile |
| import os |
| from pathlib import Path |
| from typing import Tuple |
|
|
| import numpy as np |
|
|
| from utils.config import config, SpeechConfig |
| from utils.logger import logger |
|
|
|
|
class WhisperTranscriber:
    """Whisper speech-to-text backend with optional translation to English.

    The Whisper model is loaded lazily on first use and cached on the
    instance. Every input file is first converted to a 16 kHz mono WAV with
    ffmpeg, and the temporary WAV is deleted once transcription finishes.
    """

    # Confidence reported when Whisper returns no segments to score.
    _DEFAULT_CONFIDENCE = 0.5

    def __init__(self, cfg: SpeechConfig | None = None):
        """Store the speech configuration; the model itself is not loaded yet.

        Args:
            cfg: Speech settings. Falls back to ``config.speech`` when omitted.
        """
        self.cfg = cfg or config.speech
        self._model = None

    def _load(self):
        """Return the Whisper model, loading it on CPU on the first call."""
        if self._model is None:
            # Deferred import: only pay for it when this backend is used.
            import whisper

            logger.info(f"Loading Whisper '{self.cfg.whisper_model_size}' on CPU ...")
            self._model = whisper.load_model(self.cfg.whisper_model_size, device="cpu")
            logger.info("Whisper ready.")
        return self._model

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete *path*, ignoring filesystem errors (best-effort cleanup)."""
        try:
            os.remove(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass

    @staticmethod
    def _confidence(segments) -> float:
        """Turn Whisper segment log-probabilities into a score in [0, 1].

        Computes ``exp(mean(avg_logprob))`` over all segments, clipped to
        [0, 1]. A segment without ``avg_logprob`` counts as -1.0. Returns
        0.5 when there are no segments.
        """
        if not segments:
            return WhisperTranscriber._DEFAULT_CONFIDENCE
        mean_logprob = np.mean([seg.get("avg_logprob", -1.0) for seg in segments])
        return float(np.clip(np.exp(mean_logprob), 0, 1))

    def _convert_to_wav(self, audio_path: str) -> str:
        """Convert any audio format to a 16 kHz mono WAV using ffmpeg.

        Needed for browser-recorded webm/opus and for audio files that
        arrive in assorted formats.

        Args:
            audio_path: Path of the source audio file.

        Returns:
            Path to a temporary WAV file. The caller must delete it.

        Raises:
            FileNotFoundError: If the ``ffmpeg`` executable cannot be started.
            RuntimeError: If ffmpeg fails and leaves no usable output.
        """
        # mkstemp creates the file atomically; mktemp only returned a name,
        # which is deprecated and open to a race with other processes.
        fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # ffmpeg reopens the path itself; "-y" allows overwrite
        try:
            result = subprocess.run(
                ["ffmpeg", "-y", "-i", audio_path,
                 "-ar", "16000", "-ac", "1", "-f", "wav", tmp_wav],
                capture_output=True, text=True,
            )
        except BaseException:
            # Do not leak the temp file if ffmpeg could not even be launched.
            self._remove_quietly(tmp_wav)
            raise

        if result.returncode != 0:
            logger.warning(f"ffmpeg conversion warning: {result.stderr[-300:]}")
            try:
                produced_bytes = os.path.getsize(tmp_wav)
            except OSError:
                produced_bytes = 0
            if produced_bytes == 0:
                # Nothing usable was written; fail here with a clear message
                # rather than letting Whisper choke on an empty file.
                self._remove_quietly(tmp_wav)
                raise RuntimeError(
                    f"ffmpeg could not convert {audio_path}: {result.stderr[-300:]}"
                )
        return tmp_wav

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """Transcribe (and optionally translate) an audio file.

        Args:
            audio_path: Path to the audio file.
            language: Source language code. None = auto-detect.
                Pass "hi" for Hindi.
            task: "transcribe" -> output in the source language.
                "translate" -> output in English regardless of source language.
                For Hindi -> English, pass language="hi", task="translate".

        Returns:
            (text, confidence)

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
            RuntimeError: If the audio cannot be converted to WAV.
        """
        audio_path = str(audio_path)
        if not Path(audio_path).exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        tmp_wav = self._convert_to_wav(audio_path)
        try:
            model = self._load()

            decode_kwargs = {
                "fp16": False,  # the model is loaded on CPU
                "task": task,
                "temperature": 0.0,
                "condition_on_previous_text": False,
                "initial_prompt": "This is a cooking recipe with ingredients and quantities.",
                "suppress_tokens": "-1",
                "without_timestamps": True,
            }
            # Leaving "language" out lets Whisper auto-detect it.
            if language:
                decode_kwargs["language"] = language

            result = model.transcribe(tmp_wav, **decode_kwargs)
            text = result["text"].strip()
            conf = self._confidence(result.get("segments", []))

            detected_lang = result.get("language", language or "unknown")
            logger.info(
                f"Whisper done. lang={detected_lang} task={task} "
                f"conf={conf:.2f} text={text[:80]}"
            )
            return text, conf
        finally:
            # Always drop the temporary WAV, even when transcription fails.
            self._remove_quietly(tmp_wav)
|
|
|
|
class Wav2Vec2Transcriber:
    """Wav2Vec2 backend - English only, no translation support.

    For Hindi, use WhisperTranscriber with task='translate'.
    The processor and model are loaded lazily on first use and cached.
    """

    def __init__(self, cfg: SpeechConfig | None = None):
        """Store the speech configuration; nothing is loaded yet.

        Args:
            cfg: Speech settings. Falls back to ``config.speech`` when omitted.
        """
        self.cfg = cfg or config.speech
        self._processor = None
        self._model = None

    def _load(self):
        """Return ``(processor, model)``, loading both on the first call."""
        if self._model is None:
            # Deferred import: only pay for it when this backend is used.
            from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

            self._processor = Wav2Vec2Processor.from_pretrained(self.cfg.wav2vec2_model)
            self._model = Wav2Vec2ForCTC.from_pretrained(self.cfg.wav2vec2_model)
            self._model.eval()
        return self._processor, self._model

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """Transcribe an audio file with Wav2Vec2.

        Args:
            audio_path: Path to the audio file.
            language: Accepted for signature parity with the Whisper backend;
                not used by this backend.
            task: Accepted for signature parity with the Whisper backend;
                not used by this backend (no translation is performed).

        Returns:
            (text, confidence): lower-cased text, and the mean over all
            frames of the highest softmax probability.

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        audio_path = Path(audio_path)
        # Same up-front check and message as the Whisper backend, done before
        # the heavy imports so a bad path fails fast.
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        wants_other_language = bool(language) and str(language).lower() not in ("en", "english")
        if task != "transcribe" or wants_other_language:
            # The arguments are ignored below; tell the caller instead of
            # silently returning something they did not ask for.
            logger.warning(
                f"Wav2Vec2 backend ignores language={language!r} task={task!r}; "
                "use the Whisper backend for other languages or translation."
            )

        import torch
        import librosa

        audio, _ = librosa.load(str(audio_path), sr=self.cfg.sample_rate, mono=True)
        proc, model = self._load()
        inputs = proc(audio, sampling_rate=self.cfg.sample_rate,
                      return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values).logits

        ids = torch.argmax(logits, dim=-1)
        text = proc.batch_decode(ids)[0].strip().lower()
        conf = float(torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item())
        return text, conf
|
|
|
|
class SpeechTranscriber:
    """Single entry point that delegates to the configured speech backend.

    Hindi speech -> English text:
        transcriber = SpeechTranscriber()
        text, conf = transcriber.transcribe("audio.wav", language="hi", task="translate")

    English speech -> English text (default):
        text, conf = transcriber.transcribe("audio.wav")

    Auto-detected language -> English translation:
        text, conf = transcriber.transcribe("audio.wav", task="translate")
    """

    def __init__(self, cfg: SpeechConfig = None):
        """Pick the backend named by ``cfg.backend``.

        "whisper" selects WhisperTranscriber; any other value selects
        Wav2Vec2Transcriber. ``cfg`` falls back to ``config.speech``.
        """
        self.cfg = cfg or config.speech
        if self.cfg.backend == "whisper":
            backend_cls = WhisperTranscriber
        else:
            backend_cls = Wav2Vec2Transcriber
        self._backend = backend_cls(self.cfg)

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """Forward the call unchanged to the selected backend.

        Returns:
            (text, confidence) exactly as produced by the backend.
        """
        options = {"language": language, "task": task}
        return self._backend.transcribe(audio_path, **options)

    def transcribe_text_passthrough(self, text: str) -> Tuple[str, float]:
        """Return already-typed text, stripped, with a confidence of 1.0."""
        cleaned = text.strip()
        return cleaned, 1.0
|
|