# he99codes's picture
# Deploying latest raw changes and full functionality
# a3fc1ff
"""
speech_module/transcriber1.py
Whisper (default) and Wav2Vec2 backends with Hindi support.
Hindi support: pass language="hi" and task="translate" to Whisper.
Whisper then transcribes Hindi audio AND translates to English in one pass,
so Stage 2 (spaCy NLP) receives clean English text with no extra steps.
"""
from __future__ import annotations
import subprocess
import tempfile
import os
from pathlib import Path
from typing import Tuple
import numpy as np
from utils.config import config, SpeechConfig
from utils.logger import logger
class WhisperTranscriber:
    """
    Whisper backend: transcribes audio and can translate it to English.

    The model is loaded on the first call to `transcribe` and cached on the
    instance. Every input is first converted to a 16kHz mono WAV via ffmpeg.
    """

    def __init__(self, cfg: SpeechConfig | None = None):
        # Fall back to the global speech config when none is supplied.
        self.cfg = cfg or config.speech
        self._model = None

    def _load(self):
        """Return the Whisper model, loading it on CPU on first use."""
        if self._model is None:
            # Imported here so the dependency is only needed once a
            # transcription is actually requested.
            import whisper

            logger.info(f"Loading Whisper '{self.cfg.whisper_model_size}' on CPU …")
            self._model = whisper.load_model(self.cfg.whisper_model_size, device="cpu")
            logger.info("Whisper ready.")
        return self._model

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete `path`, ignoring filesystem errors (e.g. already removed)."""
        try:
            os.remove(path)
        except OSError:
            pass

    @staticmethod
    def _confidence(segments) -> float:
        """
        Map Whisper segments to a confidence score in [0, 1].

        The score is exp(mean avg_logprob) clipped to [0, 1]; a segment with
        no "avg_logprob" counts as -1.0. Returns 0.5 when there are no
        segments.
        """
        if not segments:
            return 0.5
        mean_logprob = np.mean([seg.get("avg_logprob", -1.0) for seg in segments])
        return float(np.clip(np.exp(mean_logprob), 0, 1))

    def _convert_to_wav(self, audio_path: str) -> str:
        """
        Convert any audio format to 16kHz mono WAV using ffmpeg.
        Required for:
        - Browser-recorded webm/opus (otherwise Whisper gets garbage)
        - Hindi audio files which may come in various formats
        Returns path to temp WAV file (caller must delete).
        Raises:
            RuntimeError : ffmpeg exited non-zero and produced no output.
        """
        # mkstemp creates the file atomically; mktemp only returns a name and
        # is open to a race between name generation and file creation.
        fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # ffmpeg reopens the path itself ("-y" allows overwrite)
        try:
            result = subprocess.run(
                ["ffmpeg", "-y", "-i", audio_path,
                 "-ar", "16000", "-ac", "1", "-f", "wav", tmp_wav],
                capture_output=True, text=True,
                # ffmpeg may echo non-UTF-8 metadata; never fail on decoding.
                errors="replace",
            )
        except Exception:
            # e.g. ffmpeg binary not installed: do not leak the placeholder.
            self._remove_quietly(tmp_wav)
            raise

        if result.returncode != 0:
            stderr_tail = result.stderr[-300:]
            try:
                produced_output = os.path.getsize(tmp_wav) > 0
            except OSError:
                produced_output = False
            if not produced_output:
                # Nothing usable was written; fail here with the ffmpeg
                # message instead of handing Whisper an empty file.
                self._remove_quietly(tmp_wav)
                raise RuntimeError(
                    f"ffmpeg failed to convert {audio_path}: {stderr_tail}"
                )
            # Non-zero exit but some audio was written: keep going best-effort.
            logger.warning(f"ffmpeg conversion warning: {stderr_tail}")
        return tmp_wav

    def transcribe(self, audio_path: str | Path,
                   language: str | None = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """
        Transcribe (and optionally translate) an audio file.
        Args:
            audio_path : Path to audio file.
            language   : Source language code. None = auto-detect.
                         Pass "hi" for Hindi.
            task       : "transcribe" → output in source language.
                         "translate"  → output in English regardless of source language.
                         For Hindi → English, pass language="hi", task="translate".
        Returns:
            (text, confidence) where confidence is exp(mean segment
            avg_logprob) clipped to [0, 1], or 0.5 if no segments came back.
        Raises:
            FileNotFoundError : audio_path does not exist.
            RuntimeError      : ffmpeg could not convert the input.
        """
        audio_path = str(audio_path)
        if not Path(audio_path).exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        # Always convert to clean 16kHz mono WAV first
        tmp_wav = self._convert_to_wav(audio_path)
        try:
            model = self._load()
            # Build decode options with anti-hallucination settings
            decode_kwargs = {
                "fp16": False,
                "task": task,
                "temperature": 0.0,
                "condition_on_previous_text": False,
                "initial_prompt": "This is a cooking recipe with ingredients and quantities.",
                "suppress_tokens": "-1",
                "without_timestamps": True,
            }
            if language:
                decode_kwargs["language"] = language

            result = model.transcribe(tmp_wav, **decode_kwargs)
            text = result["text"].strip()
            conf = self._confidence(result.get("segments", []))
            detected_lang = result.get("language", language or "unknown")
            logger.info(
                f"Whisper done. lang={detected_lang} task={task} "
                f"conf={conf:.2f} text={text[:80]}"
            )
            return text, conf
        finally:
            # Always clean up the temp WAV
            self._remove_quietly(tmp_wav)
class Wav2Vec2Transcriber:
    """
    Wav2Vec2 backend — English only, no translation support.
    For Hindi, use WhisperTranscriber with task='translate'.

    The processor and model are loaded on the first call to `transcribe`
    and cached on the instance.
    """

    def __init__(self, cfg: SpeechConfig | None = None):
        # Fall back to the global speech config when none is supplied.
        self.cfg = cfg or config.speech
        self._processor = None
        self._model = None

    def _load(self):
        """Return (processor, model), loading both on first use."""
        if self._model is None:
            # Imported here so the dependency is only needed once a
            # transcription is actually requested.
            from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

            self._processor = Wav2Vec2Processor.from_pretrained(self.cfg.wav2vec2_model)
            self._model = Wav2Vec2ForCTC.from_pretrained(self.cfg.wav2vec2_model)
            self._model.eval()
        return self._processor, self._model

    def transcribe(self, audio_path: str | Path,
                   language: str | None = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """
        Transcribe an audio file with Wav2Vec2 greedy (argmax) decoding.
        Args:
            audio_path : Path to audio file.
            language   : Accepted for interface parity with the Whisper
                         backend; not used by this backend.
            task       : Accepted for interface parity; not used by this
                         backend (no translation support).
        Returns:
            (text, confidence) where text is the lower-cased decoded string
            and confidence is the mean over frames of the highest softmax
            probability.
        Raises:
            FileNotFoundError : audio_path does not exist.
        """
        audio_path = Path(audio_path)
        # Fail fast, before the heavy imports and model load.
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        # language/task are ignored below; say so rather than silently
        # returning output the caller did not ask for.
        non_english = bool(language) and language.lower() not in ("en", "english")
        if task != "transcribe" or non_english:
            logger.warning(
                f"Wav2Vec2 backend ignores language={language!r} task={task!r}; "
                "use the Whisper backend for non-English audio or translation."
            )

        import torch
        import librosa

        audio, _ = librosa.load(str(audio_path), sr=self.cfg.sample_rate, mono=True)
        proc, model = self._load()
        inputs = proc(audio, sampling_rate=self.cfg.sample_rate, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        ids = torch.argmax(logits, dim=-1)
        text = proc.batch_decode(ids)[0].strip().lower()
        conf = float(torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item())
        return text, conf
class SpeechTranscriber:
    """
    Single entry point that delegates to the configured speech backend.

    The backend is WhisperTranscriber when cfg.backend == "whisper" and
    Wav2Vec2Transcriber for any other value.

    Hindi speech → English text:
        transcriber = SpeechTranscriber()
        text, conf = transcriber.transcribe("audio.wav", language="hi", task="translate")
    English speech → English text (default):
        text, conf = transcriber.transcribe("audio.wav")
    Auto-detected language → English translation:
        text, conf = transcriber.transcribe("audio.wav", task="translate")
    """

    def __init__(self, cfg: SpeechConfig = None):
        # Fall back to the global speech config when none is supplied.
        self.cfg = cfg or config.speech
        if self.cfg.backend == "whisper":
            backend = WhisperTranscriber(self.cfg)
        else:
            backend = Wav2Vec2Transcriber(self.cfg)
        self._backend = backend

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """Forward the request unchanged to the selected backend."""
        delegate = self._backend.transcribe
        return delegate(audio_path, language=language, task=task)

    def transcribe_text_passthrough(self, text: str) -> Tuple[str, float]:
        """Return already-typed text, stripped, with a confidence of 1.0."""
        cleaned = text.strip()
        return cleaned, 1.0