Spaces:

he99codes
/

Recipe_Health_Classification

Sleeping

App Files Files Community

Recipe_Health_Classification / speech_module /transcriber.py

he99codes

Deploying latest raw changes and full functionality

a3fc1ff about 1 month ago

raw

history blame contribute delete

6.56 kB

	"""
	speech_module/transcriber.py
	Whisper (default) and Wav2Vec2 backends.

	Hindi support: pass language="hi" and task="translate" to Whisper.
	Whisper then transcribes Hindi audio AND translates to English in one pass,
	so Stage 2 (spaCy NLP) receives clean English text with no extra steps.
	"""

	from __future__ import annotations
	import subprocess
	import tempfile
	import os
	from pathlib import Path
	from typing import Tuple

	import numpy as np

	from utils.config import config, SpeechConfig
	from utils.logger import logger


	class WhisperTranscriber:
	    """
	    Whisper backend: transcribes audio, or translates it to English when
	    called with task="translate".

	    The model is loaded on first use and cached on the instance.
	    """

	    def __init__(self, cfg: SpeechConfig | None = None):
	        # Fall back to the module-level speech settings when no config is given.
	        self.cfg = cfg or config.speech
	        self._model = None  # set by _load() on first use

	    def _load(self):
	        """Return the cached Whisper model, loading it on CPU on the first call."""
	        if self._model is None:
	            # Deferred import: whisper is only imported when a model is loaded.
	            import whisper
	            logger.info(f"Loading Whisper '{self.cfg.whisper_model_size}' on CPU …")
	            self._model = whisper.load_model(self.cfg.whisper_model_size, device="cpu")
	            logger.info("Whisper ready.")
	        return self._model

	    @staticmethod
	    def _remove_quietly(path: str) -> None:
	        """Delete *path* on a best-effort basis; never raises for OS-level failures."""
	        try:
	            os.remove(path)
	        except FileNotFoundError:
	            pass  # already gone, nothing to clean up
	        except OSError as exc:
	            # Cleanup must not mask the real result or error of the caller.
	            logger.warning(f"Could not remove temp file {path}: {exc}")

	    @staticmethod
	    def _confidence(segments) -> float:
	        """
	        Map Whisper segment log-probabilities to a confidence in [0, 1].

	        Takes exp() of the mean ``avg_logprob`` over all segments (a segment
	        without that key counts as -1.0) and clips it to [0, 1].
	        Returns 0.5 when there are no segments to score.
	        """
	        if not segments:
	            return 0.5
	        mean_logprob = np.mean([seg.get("avg_logprob", -1.0) for seg in segments])
	        return float(np.clip(np.exp(mean_logprob), 0, 1))

	    def _convert_to_wav(self, audio_path: str) -> str:
	        """
	        Convert any audio format to 16kHz mono WAV using ffmpeg.
	        Required for:
	        - Browser-recorded webm/opus (otherwise Whisper gets garbage)
	        - Hindi audio files which may come in various formats

	        A non-zero ffmpeg exit status is logged as a warning, not raised.
	        Returns path to temp WAV file (caller must delete).
	        Raises OSError (e.g. FileNotFoundError) if ffmpeg cannot be launched.
	        """
	        # mkstemp creates the file itself under a unique name; the deprecated
	        # mktemp only returned a name, which another process could claim first.
	        fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
	        os.close(fd)  # ffmpeg reopens the path itself; "-y" lets it overwrite
	        try:
	            result = subprocess.run(
	                ["ffmpeg", "-y", "-i", audio_path,
	                 "-ar", "16000", "-ac", "1", "-f", "wav", tmp_wav],
	                capture_output=True,
	                text=True,
	                # ffmpeg echoes file metadata on stderr; undecodable bytes
	                # must not abort the conversion with UnicodeDecodeError.
	                errors="replace",
	            )
	        except BaseException:
	            # The caller never receives the path on this route, so remove it here.
	            self._remove_quietly(tmp_wav)
	            raise
	        if result.returncode != 0:
	            logger.warning(f"ffmpeg conversion warning: {result.stderr[-300:]}")
	        return tmp_wav

	    def transcribe(self, audio_path: str | Path,
	                   language: str | None = None,
	                   task: str = "transcribe") -> Tuple[str, float]:
	        """
	        Transcribe (and optionally translate) an audio file.

	        Args:
	            audio_path : Path to audio file.
	            language   : Source language code. None = auto-detect.
	                         Pass "hi" for Hindi.
	            task       : "transcribe" → output in source language.
	                         "translate"  → output in English regardless of source language.
	                         For Hindi → English, pass language="hi", task="translate".

	        Returns:
	            (text, confidence) — the stripped transcript and a score in [0, 1]
	            derived from the segments' average log-probabilities (0.5 when
	            Whisper returns no segments).

	        Raises:
	            FileNotFoundError: if audio_path does not exist.
	        """
	        audio_path = str(audio_path)
	        if not Path(audio_path).exists():
	            raise FileNotFoundError(f"Audio not found: {audio_path}")

	        # Always convert to clean 16kHz mono WAV first
	        tmp_wav = self._convert_to_wav(audio_path)

	        try:
	            model = self._load()

	            # Build decode options with anti-hallucination settings
	            decode_kwargs = {
	                "fp16": False,  # the model is loaded on CPU
	                "task": task,
	                "temperature": 0.0,
	                "condition_on_previous_text": False,
	                "initial_prompt": "This is a cooking recipe with ingredients and quantities.",
	                "suppress_tokens": "-1",
	                "without_timestamps": True,
	            }
	            # Leaving "language" out lets Whisper auto-detect it.
	            if language:
	                decode_kwargs["language"] = language

	            result = model.transcribe(tmp_wav, **decode_kwargs)
	            text = result["text"].strip()
	            conf = self._confidence(result.get("segments", []))

	            detected_lang = result.get("language", language or "unknown")
	            logger.info(
	                f"Whisper done. lang={detected_lang} task={task} "
	                f"conf={conf:.2f} text={text[:80]}"
	            )
	            return text, conf

	        finally:
	            # Always clean up the temp WAV
	            self._remove_quietly(tmp_wav)


	class Wav2Vec2Transcriber:
	    """
	    Wav2Vec2 backend — English only, no translation support.
	    For Hindi, use WhisperTranscriber with task='translate'.

	    The processor and model are loaded on first use and cached on the instance.
	    """

	    def __init__(self, cfg: SpeechConfig | None = None):
	        # Fall back to the module-level speech settings when no config is given.
	        self.cfg = cfg or config.speech
	        self._processor = self._model = None  # both set by _load()

	    def _load(self):
	        """Return (processor, model), loading both from cfg.wav2vec2_model on first call."""
	        if self._model is None:
	            # Deferred import: transformers is only imported when a model is loaded.
	            from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
	            self._processor = Wav2Vec2Processor.from_pretrained(self.cfg.wav2vec2_model)
	            self._model = Wav2Vec2ForCTC.from_pretrained(self.cfg.wav2vec2_model)
	            self._model.eval()
	        return self._processor, self._model

	    def transcribe(self, audio_path: str | Path,
	                   language: str | None = None,
	                   task: str = "transcribe") -> Tuple[str, float]:
	        """
	        Transcribe an audio file with greedy (argmax) CTC decoding.

	        Args:
	            audio_path : Path to audio file.
	            language   : Accepted for signature parity with WhisperTranscriber;
	                         not used by this backend.
	            task       : Accepted for signature parity with WhisperTranscriber;
	                         not used by this backend.

	        Returns:
	            (text, confidence) — the lower-cased transcript and the mean, over
	            all output positions, of the highest softmax probability.

	        Raises:
	            FileNotFoundError: if audio_path does not exist.
	        """
	        audio_path = Path(audio_path)
	        # Same up-front check and message as the Whisper backend, so callers
	        # see one error type regardless of which backend is configured.
	        if not audio_path.exists():
	            raise FileNotFoundError(f"Audio not found: {audio_path}")

	        # Make it visible when a caller asks for something this backend drops.
	        if task != "transcribe" or language not in (None, "", "en"):
	            logger.warning(
	                f"Wav2Vec2 backend ignores language={language!r} task={task!r}; "
	                "use the Whisper backend for non-English audio or translation."
	            )

	        # Deferred imports: only needed once a transcription is actually run.
	        import torch
	        import librosa

	        audio, _ = librosa.load(str(audio_path), sr=self.cfg.sample_rate, mono=True)
	        proc, model = self._load()
	        inputs = proc(audio, sampling_rate=self.cfg.sample_rate, return_tensors="pt", padding=True)
	        with torch.no_grad():
	            logits = model(inputs.input_values).logits
	        ids = torch.argmax(logits, dim=-1)
	        text = proc.batch_decode(ids)[0].strip().lower()
	        conf = float(torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item())
	        return text, conf


	class SpeechTranscriber:
	    """
	    Unified facade over Whisper and Wav2Vec2.

	    The backend is chosen once, at construction, from cfg.backend:
	    "whisper" selects WhisperTranscriber; any other value selects
	    Wav2Vec2Transcriber.

	    For Hindi speech → English text:
	        transcriber = SpeechTranscriber()
	        text, conf = transcriber.transcribe("audio.wav", language="hi", task="translate")

	    For English speech → English text (default):
	        text, conf = transcriber.transcribe("audio.wav")

	    For auto-detect language → English translation:
	        text, conf = transcriber.transcribe("audio.wav", task="translate")
	    """

	    def __init__(self, cfg: SpeechConfig | None = None):
	        # Fall back to the module-level speech settings when no config is given.
	        self.cfg = cfg or config.speech
	        if self.cfg.backend == "whisper":
	            self._backend = WhisperTranscriber(self.cfg)
	        else:
	            self._backend = Wav2Vec2Transcriber(self.cfg)

	    def transcribe(self, audio_path: str | Path,
	                   language: str | None = None,
	                   task: str = "transcribe") -> Tuple[str, float]:
	        """
	        Delegate to the configured backend's transcribe().

	        Args:
	            audio_path : Path to audio file.
	            language   : Source language code, forwarded unchanged.
	            task       : "transcribe" or "translate", forwarded unchanged.

	        Returns:
	            (text, confidence) exactly as produced by the backend.
	        """
	        return self._backend.transcribe(audio_path, language=language, task=task)

	    def transcribe_text_passthrough(self, text: str) -> Tuple[str, float]:
	        """Return already-typed text stripped of surrounding whitespace, with confidence 1.0."""
	        return text.strip(), 1.0