"""
speech_module/transcriber.py
Whisper (default) and Wav2Vec2 backends.

Hindi support: pass language="hi" and task="translate" to Whisper.
Whisper then transcribes Hindi audio AND translates to English in one pass,
so Stage 2 (spaCy NLP) receives clean English text with no extra steps.
"""

from __future__ import annotations
import subprocess
import tempfile
import os
from pathlib import Path
from typing import Tuple

import numpy as np

from utils.config import config, SpeechConfig
from utils.logger import logger


class WhisperTranscriber:
    """
    Whisper backend with lazy model loading.

    Input audio is first normalised to 16 kHz mono WAV with ffmpeg and the
    converted file is then handed to ``model.transcribe``. With
    ``task="translate"`` Whisper is asked for English output regardless of
    the source language.
    """

    def __init__(self, cfg: SpeechConfig | None = None):
        self.cfg = cfg or config.speech
        self._model = None  # populated on first use by _load()

    def _load(self):
        """Return the Whisper model, loading and caching it on first call."""
        if self._model is None:
            # Imported lazily, only when a model is actually needed.
            import whisper
            logger.info(f"Loading Whisper '{self.cfg.whisper_model_size}' on CPU …")
            self._model = whisper.load_model(self.cfg.whisper_model_size, device="cpu")
            logger.info("Whisper ready.")
        return self._model

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Best-effort removal of a temp file; a missing file is not an error."""
        try:
            os.remove(path)
        except OSError:
            # Already gone or not removable: nothing useful the caller can do.
            pass

    def _convert_to_wav(self, audio_path: str) -> str:
        """
        Convert any audio format to 16kHz mono WAV using ffmpeg.
        Required for:
          - Browser-recorded webm/opus (otherwise Whisper gets garbage)
          - Hindi audio files which may come in various formats

        Returns:
            Path to a temp WAV file (caller must delete).

        Raises:
            RuntimeError: ffmpeg could not be started, or it failed without
                producing any output. No temp file is left behind in that case.
        """
        # mkstemp creates the file atomically under a unique name; the
        # deprecated tempfile.mktemp only returns a name and is race-prone.
        fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # ffmpeg reopens the path itself ("-y" allows overwrite)

        cmd = ["ffmpeg", "-y", "-i", audio_path,
               "-ar", "16000", "-ac", "1", "-f", "wav", tmp_wav]
        try:
            # errors="replace": ffmpeg's stderr is not guaranteed to be valid
            # UTF-8, and a decode error must not mask the real outcome.
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors="replace"
            )
        except OSError as err:
            # Typically: the ffmpeg executable is missing or not executable.
            # Reported as RuntimeError so it cannot be mistaken for the
            # "Audio not found" FileNotFoundError raised by transcribe().
            self._remove_quietly(tmp_wav)
            raise RuntimeError(f"Could not run ffmpeg: {err}") from err

        if result.returncode != 0:
            stderr_tail = result.stderr[-300:]
            produced_audio = os.path.exists(tmp_wav) and os.path.getsize(tmp_wav) > 0
            if not produced_audio:
                # Nothing was written: fail here with ffmpeg's own message
                # instead of passing an empty file on to Whisper.
                self._remove_quietly(tmp_wav)
                raise RuntimeError(
                    f"ffmpeg could not convert {audio_path}: {stderr_tail}"
                )
            # Non-zero exit but some output exists: keep going best-effort.
            logger.warning(f"ffmpeg conversion warning: {stderr_tail}")
        return tmp_wav

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """
        Transcribe (and optionally translate) an audio file.

        Args:
            audio_path : Path to audio file.
            language   : Source language code. None = auto-detect.
                         Pass "hi" for Hindi.
            task       : "transcribe" → output in source language.
                         "translate"  → output in English regardless of source language.
                         For Hindi → English, pass language="hi", task="translate".

        Returns:
            (text, confidence). Confidence is exp(mean segment avg_logprob)
            clipped to [0, 1], or 0.5 when Whisper returns no segments.

        Raises:
            FileNotFoundError: audio_path does not exist.
            RuntimeError: the ffmpeg conversion step failed.
        """
        audio_path = str(audio_path)
        if not Path(audio_path).exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        # Always convert to clean 16kHz mono WAV first
        tmp_wav = self._convert_to_wav(audio_path)

        try:
            model = self._load()

            # Build decode options with anti-hallucination settings
            decode_kwargs = {
                "fp16": False,  # the model is loaded on CPU
                "task": task,
                "temperature": 0.0,
                "condition_on_previous_text": False,
                "initial_prompt": "This is a cooking recipe with ingredients and quantities.",
                "suppress_tokens": "-1",
                "without_timestamps": True,
            }
            if language:
                decode_kwargs["language"] = language

            result = model.transcribe(tmp_wav, **decode_kwargs)
            text = result["text"].strip()
            segs = result.get("segments", [])
            if segs:
                mean_logprob = np.mean([s.get("avg_logprob", -1.0) for s in segs])
                conf = float(np.clip(np.exp(mean_logprob), 0, 1))
            else:
                conf = 0.5

            detected_lang = result.get("language", language or "unknown")
            logger.info(
                f"Whisper done. lang={detected_lang} task={task} "
                f"conf={conf:.2f} text={text[:80]}"
            )
            return text, conf

        finally:
            # Always clean up the temp WAV
            self._remove_quietly(tmp_wav)


class Wav2Vec2Transcriber:
    """
    Wav2Vec2 backend — English only, no translation support.
    For Hindi, use WhisperTranscriber with task='translate'.
    """
    def __init__(self, cfg: SpeechConfig | None = None):
        self.cfg = cfg or config.speech
        # Both are populated together on first use by _load().
        self._processor = self._model = None

    def _load(self):
        """Return (processor, model), loading and caching them on first call."""
        if self._model is None:
            # Imported lazily, only when the backend is actually used.
            from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
            self._processor = Wav2Vec2Processor.from_pretrained(self.cfg.wav2vec2_model)
            self._model = Wav2Vec2ForCTC.from_pretrained(self.cfg.wav2vec2_model)
            self._model.eval()
        return self._processor, self._model

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """
        Transcribe an audio file with greedy (argmax) CTC decoding.

        Args:
            audio_path : Path to audio file.
            language   : Accepted for signature parity with WhisperTranscriber;
                         not used by this backend.
            task       : Accepted for signature parity with WhisperTranscriber;
                         not used by this backend (no translation).

        Returns:
            (text, confidence). Text is lower-cased. Confidence is the mean,
            over all output frames, of the highest softmax probability.

        Raises:
            FileNotFoundError: audio_path does not exist.
        """
        audio_path = Path(audio_path)
        # Same up-front check and message as WhisperTranscriber, done before
        # the heavy imports so a bad path fails fast and consistently.
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        wants_translation = task != "transcribe"
        non_english = bool(language) and language.lower() not in ("en", "english")
        if wants_translation or non_english:
            # The arguments are ignored below; say so instead of silently
            # returning a transcript the caller did not ask for.
            logger.warning(
                f"Wav2Vec2 backend ignores language={language!r} task={task!r}; "
                "use the Whisper backend for non-English audio or translation."
            )

        import torch
        import librosa

        audio, _ = librosa.load(str(audio_path), sr=self.cfg.sample_rate, mono=True)
        proc, model = self._load()
        inputs = proc(audio, sampling_rate=self.cfg.sample_rate, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        ids = torch.argmax(logits, dim=-1)
        text = proc.batch_decode(ids)[0].strip().lower()
        conf = float(torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item())
        return text, conf


class SpeechTranscriber:
    """
    Single entry point that forwards to the configured speech backend.

    The backend is chosen once, at construction time, from ``cfg.backend``:
    "whisper" selects WhisperTranscriber, any other value selects
    Wav2Vec2Transcriber.

    Hindi speech → English text:
        transcriber = SpeechTranscriber()
        text, conf = transcriber.transcribe("audio.wav", language="hi", task="translate")

    English speech → English text (default):
        text, conf = transcriber.transcribe("audio.wav")

    Auto-detected language → English translation:
        text, conf = transcriber.transcribe("audio.wav", task="translate")
    """
    def __init__(self, cfg: SpeechConfig = None):
        self.cfg = cfg or config.speech
        if self.cfg.backend == "whisper":
            backend = WhisperTranscriber(self.cfg)
        else:
            backend = Wav2Vec2Transcriber(self.cfg)
        self._backend = backend

    def transcribe(self, audio_path: str | Path,
                   language: str = None,
                   task: str = "transcribe") -> Tuple[str, float]:
        """Forward the call unchanged to the selected backend; returns (text, confidence)."""
        outcome = self._backend.transcribe(audio_path, language=language, task=task)
        return outcome

    def transcribe_text_passthrough(self, text: str) -> Tuple[str, float]:
        """Return already-typed text, whitespace-stripped, with confidence 1.0."""
        cleaned = text.strip()
        return cleaned, 1.0