# Spaces: Rthur2003/crowncode-backend (status: Running) - file size: 15,362 bytes

"""
Meta-classifier service for AURIS score fusion.

Replaces the hand-tuned weighted-average fusion with
a trained stacking ensemble that combines signals from
all analysis towers.

Towers:
  1. wav2vec2 fine-tuned (logit + hidden stats)
  2. Librosa features (spectral, temporal, harmonic)
  3. Vocal analysis (pitch, vibrato, formant, breath)
  4. CLAP embeddings (when available)
  5. FST external (when available)

The meta-classifier was trained on the same dataset
with cross-validated tower outputs.
"""

from __future__ import annotations

import json
import pickle
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

import numpy as np

from .feature_extractor import AudioFeatures
from .vocal_analyzer import VocalFeatures
from .wav2vec2_detector import Wav2Vec2Result
from .clap_detector import CLAPResult
from .fst_client import FSTResult
from .logging_config import get_logger

logger = get_logger(__name__)

_MODELS_DIR = Path(__file__).resolve().parents[2] / "models"


@dataclass
class MetaResult:
    """
    Final detection result from meta-classifier.

    Produced by ``MetaClassifierService``. The trained path reports
    ``model_version="auris-v2-trained"`` with
    ``decision_source="auris_meta"``; the fallback path reports
    ``model_version="auris-v1-heuristic"`` with
    ``decision_source="auris_fallback"``.
    """

    # Verdict: True when the fused AI probability is above 0.5.
    is_ai_generated: bool
    # Score attached to the verdict, rounded to 4 decimals by the
    # producers in this file.
    confidence: float
    # Identifier of the model/fusion variant that produced the result.
    model_version: str = "auris-v2-meta"
    # Which decision path ran ("auris_meta" or "auris_fallback").
    decision_source: str = "auris_meta"
    # Both producers in this file always report "production".
    analysis_mode: str = "production"

    # Per-tower scores for transparency, keyed by tower name
    # ("wav2vec2", "clap", "fst", "local_features", "vocals");
    # only towers that were available are present.
    tower_scores: dict = field(default_factory=dict)

    # Human-readable explanation sentences for the verdict.
    indicators: List[str] = field(default_factory=list)

    # Most important model features; each entry is a dict with the
    # keys "feature", "importance" and "value". Left empty by the
    # fallback path.
    top_features: List[dict] = field(default_factory=list)


class MetaClassifierService:
    """
    Trained stacking meta-classifier.

    Combines all tower outputs into a single feature vector,
    runs the trained classifier, and generates explainable
    indicators.

    Falls back to weighted averaging of the available tower
    scores when the trained model is not available (during
    development before first training).
    """

    def __init__(self) -> None:
        """Initialize meta-classifier with empty state."""
        self._model = None
        self._scaler = None
        self._feature_cols: list[str] = []
        self._initialized = False
        self._trained = False

    def _ensure_loaded(self) -> bool:
        """Load trained meta-classifier if available."""
        if self._initialized:
            return self._trained

        self._initialized = True

        model_path = _MODELS_DIR / "auris_classifier_v1.pkl"
        scaler_path = _MODELS_DIR / "feature_scaler_v1.pkl"
        columns_path = _MODELS_DIR / "feature_columns_v1.json"

        if not model_path.exists():
            logger.info(
                "Meta-classifier not found. "
                "Using fallback fusion."
            )
            return False

        try:
            with open(model_path, "rb") as f:
                self._model = pickle.load(f)
            with open(scaler_path, "rb") as f:
                self._scaler = pickle.load(f)
            with open(columns_path, "r") as f:
                self._feature_cols = json.load(f)

            self._trained = True
            logger.info(
                f"Meta-classifier loaded: "
                f"{type(self._model).__name__}, "
                f"{len(self._feature_cols)} features"
            )
            return True

        except Exception as e:
            logger.error(f"Failed to load meta-classifier: {e}")
            return False

    def predict(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures] = None,
        wav2vec2: Optional[Wav2Vec2Result] = None,
        clap: Optional[CLAPResult] = None,
        fst: Optional[FSTResult] = None,
    ) -> MetaResult:
        """
        Fuse all tower outputs into a single detection result.

        Collects the per-tower scores, then delegates to the trained
        meta-classifier when its artifacts loaded, or to the
        weighted-average fallback otherwise.

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results.
            wav2vec2: wav2vec2 tower result.
            clap: CLAP embedding result.
            fst: FST external API result.

        Returns:
            MetaResult with final prediction and explanations.
        """
        use_trained = self._ensure_loaded()

        # Optional towers only contribute a score when they report
        # themselves as available. Each entry: (key, result, attribute).
        optional_towers = (
            ("wav2vec2", wav2vec2, "p_ai"),
            ("clap", clap, "confidence"),
            ("fst", fst, "confidence"),
        )
        tower_scores = {}
        for tower_name, tower_result, score_attr in optional_towers:
            if tower_result and tower_result.available:
                tower_scores[tower_name] = getattr(tower_result, score_attr)

        # Heuristic blend of the three local feature scores; always
        # present, and doubles as a signal for the fallback path.
        local_score = (
            features.spectral_regularity * 0.35
            + features.temporal_patterns * 0.35
            + features.harmonic_structure * 0.30
        )
        tower_scores["local_features"] = round(local_score, 4)

        if vocals and vocals.has_vocals:
            tower_scores["vocals"] = vocals.vocal_ai_score

        handler = (
            self._predict_trained if use_trained
            else self._predict_fallback
        )
        return handler(
            features, vocals, wav2vec2, clap, fst,
            tower_scores,
        )

    def _predict_trained(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        wav2vec2: Optional[Wav2Vec2Result],
        clap: Optional[CLAPResult],
        fst: Optional[FSTResult],
        tower_scores: dict,
    ) -> MetaResult:
        """
        Produce a result from the trained meta-classifier.

        Builds the feature vector in the training column order,
        scales it, takes the model probability and optionally
        tempers it with the FST verdict.

        NOTE(review): ``wav2vec2`` and ``clap`` are accepted but not
        read here; only ``features`` and ``vocals`` feed the model.

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results, if any.
            wav2vec2: wav2vec2 tower result (unused in this method).
            clap: CLAP embedding result (unused in this method).
            fst: FST external API result, used for calibration only.
            tower_scores: Per-tower scores; updated in place with the
                FST score when FST is available.

        Returns:
            MetaResult tagged "auris-v2-trained".
        """
        feature_values = self._build_feature_dict(features, vocals)

        # One row, ordered like the training columns; columns that
        # the feature dict does not provide default to 0.0.
        row = [
            feature_values.get(column, 0.0)
            for column in self._feature_cols
        ]
        x = np.array(row, dtype=np.float32).reshape(1, -1)
        x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)
        x_scaled = self._scaler.transform(x)

        # Index 1 is taken as the AI class probability
        # (assumes the model's class order is [human, AI] - TODO confirm).
        class_probs = self._model.predict_proba(x_scaled)[0]
        p_ai = float(class_probs[1])

        # FST is not a model input, so apply it as a post-hoc
        # calibration instead.
        if fst and fst.available:
            tower_scores["fst"] = fst.confidence
            model_says_ai = p_ai > 0.5
            if model_says_ai != fst.is_ai:
                # Disagreement: pull the probability 15% of the way
                # toward the 0.5 decision boundary.
                p_ai = p_ai * 0.85 + 0.5 * 0.15

        is_ai = p_ai > 0.5
        if is_ai:
            confidence = round(p_ai, 4)
        else:
            confidence = round(1.0 - p_ai, 4)

        indicators = self._build_indicators(
            is_ai, confidence, features, vocals,
            tower_scores,
        )
        top_features = self._get_top_features(x_scaled[0])

        return MetaResult(
            is_ai_generated=is_ai,
            confidence=confidence,
            model_version="auris-v2-trained",
            decision_source="auris_meta",
            analysis_mode="production",
            tower_scores=tower_scores,
            indicators=indicators,
            top_features=top_features,
        )

    def _predict_fallback(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        wav2vec2: Optional[Wav2Vec2Result],
        clap: Optional[CLAPResult],
        fst: Optional[FSTResult],
        tower_scores: dict,
    ) -> MetaResult:
        """
        Fallback when trained model is not available.

        Uses weighted averaging of available tower scores.
        Better than heuristic-only but not data-driven.

        The weights of the towers that are present are renormalized
        to sum to 1. The reported confidence is the probability of
        the chosen verdict (``p_ai`` for AI, ``1 - p_ai`` for human),
        clamped to [0.51, 0.97].

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results, if any.
            wav2vec2: wav2vec2 tower result, if any.
            clap: CLAP embedding result, if any.
            fst: FST external API result, if any.
            tower_scores: Per-tower scores collected by the caller;
                the "local_features" entry is read from here.

        Returns:
            MetaResult tagged "auris-v1-heuristic".
        """
        has_wav2vec2 = bool(wav2vec2 and wav2vec2.available)

        # (score, weight) pairs for every tower that is present.
        weighted_scores = []

        # wav2vec2 gets highest weight if available
        if has_wav2vec2:
            weighted_scores.append((wav2vec2.p_ai, 0.40))

        # Local features are always present; they carry more weight
        # when wav2vec2 is missing.
        local = tower_scores.get("local_features", 0.5)
        weighted_scores.append((local, 0.25 if has_wav2vec2 else 0.45))

        if vocals and vocals.has_vocals:
            weighted_scores.append((vocals.vocal_ai_score, 0.15))

        # NOTE(review): clap.confidence and fst.confidence are averaged
        # as if they were P(AI). FSTResult also exposes ``is_ai``, so
        # confirm these are not confidences in the tower's own label.
        if clap and clap.available:
            weighted_scores.append((clap.confidence, 0.10))

        if fst and fst.available:
            weighted_scores.append((fst.confidence, 0.20))

        # Weighted average; total_w is never zero because the local
        # feature entry is unconditional.
        total_w = sum(w for _, w in weighted_scores)
        p_ai = sum(s * (w / total_w) for s, w in weighted_scores)

        is_ai = p_ai > 0.5
        # Confidence must describe the chosen verdict. Clamping p_ai
        # directly would report 0.51 for every human verdict, no
        # matter how strongly the towers point to a human recording.
        verdict_prob = p_ai if is_ai else 1.0 - p_ai
        confidence = round(max(0.51, min(0.97, verdict_prob)), 4)

        indicators = self._build_indicators(
            is_ai, confidence, features, vocals,
            tower_scores,
        )

        return MetaResult(
            is_ai_generated=is_ai,
            confidence=confidence,
            model_version="auris-v1-heuristic",
            decision_source="auris_fallback",
            analysis_mode="production",
            tower_scores=tower_scores,
            indicators=indicators,
        )

    def _build_feature_dict(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
    ) -> dict:
        """Build flat feature dict for meta-classifier."""
        d = {
            "duration_sec": features.duration_sec,
            "sample_rate": features.sample_rate,
            "rms_energy": features.rms_energy,
            "tempo_bpm": features.tempo_bpm,
            "tempo_stability": features.tempo_stability,
            "spectral_centroid_mean": features.spectral_centroid_mean,
            "spectral_centroid_std": features.spectral_centroid_std,
            "spectral_flatness_mean": features.spectral_flatness_mean,
            "mfcc_variance": features.mfcc_variance,
            "chroma_entropy": features.chroma_entropy,
            "harmonic_ratio": features.harmonic_ratio,
            "zero_crossing_rate": features.zero_crossing_rate,
            "spectral_regularity": features.spectral_regularity,
            "temporal_patterns": features.temporal_patterns,
            "harmonic_structure": features.harmonic_structure,
        }

        if vocals:
            d.update({
                "has_vocals": 1.0 if vocals.has_vocals else 0.0,
                "vocal_confidence": vocals.vocal_confidence,
                "vocal_ai_score": vocals.vocal_ai_score,
                "pitch_stability_score": vocals.pitch_stability_score,
                "vibrato_regularity_score": vocals.vibrato_regularity_score,
                "formant_consistency_score": vocals.formant_consistency_score,
                "breath_pattern_score": vocals.breath_pattern_score,
                "vocal_texture_score": vocals.vocal_texture_score,
                "pitch_mean_hz": vocals.pitch_mean_hz,
                "pitch_std_cents": vocals.pitch_std_cents,
                "vibrato_rate_hz": vocals.vibrato_rate_hz,
                "vibrato_extent_cents": vocals.vibrato_extent_cents,
                "vocal_harmonic_ratio": vocals.vocal_harmonic_ratio,
                "vocal_energy_ratio": vocals.vocal_energy_ratio,
            })
        else:
            for key in [
                "has_vocals", "vocal_confidence", "vocal_ai_score",
                "pitch_stability_score", "vibrato_regularity_score",
                "formant_consistency_score", "breath_pattern_score",
                "vocal_texture_score", "pitch_mean_hz",
                "pitch_std_cents", "vibrato_rate_hz",
                "vibrato_extent_cents", "vocal_harmonic_ratio",
                "vocal_energy_ratio",
            ]:
                d[key] = 0.0

        return d

    def _get_top_features(
        self, x: np.ndarray, top_n: int = 5
    ) -> list[dict]:
        """
        Get top contributing features for this prediction.

        Uses feature_importances_ from tree models.
        In future: SHAP values for per-sample explanation.
        """
        if not hasattr(self._model, "feature_importances_"):
            return []

        importances = self._model.feature_importances_
        indices = np.argsort(importances)[::-1][:top_n]

        result = []
        for idx in indices:
            col_name = (
                self._feature_cols[idx]
                if idx < len(self._feature_cols)
                else f"feature_{idx}"
            )
            result.append({
                "feature": col_name,
                "importance": round(float(importances[idx]), 4),
                "value": round(float(x[idx]), 4),
            })

        return result

    @staticmethod
    def _build_indicators(
        is_ai: bool,
        confidence: float,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        tower_scores: dict,
    ) -> list[str]:
        """Generate human-readable indicators."""
        indicators = []

        # Overall
        label = "AI-generated" if is_ai else "human-composed"
        if confidence > 0.85:
            indicators.append(
                f"High confidence: classified as {label}."
            )
        elif confidence > 0.70:
            indicators.append(
                f"Moderate confidence: likely {label}."
            )
        else:
            indicators.append(
                f"Low confidence: borderline {label}."
            )

        # Tower agreement
        ai_towers = sum(
            1 for v in tower_scores.values() if v > 0.5
        )
        human_towers = sum(
            1 for v in tower_scores.values() if v <= 0.5
        )
        total = ai_towers + human_towers

        if total > 1:
            if ai_towers == total:
                indicators.append(
                    f"All {total} analysis signals agree: AI-generated."
                )
            elif human_towers == total:
                indicators.append(
                    f"All {total} analysis signals agree: human-composed."
                )
            else:
                indicators.append(
                    f"Mixed signals: {ai_towers}/{total} indicate AI, "
                    f"{human_towers}/{total} indicate human."
                )

        # Spectral
        if features.spectral_regularity > 0.7:
            indicators.append(
                "High spectral regularity — typical of AI synthesis."
            )
        elif features.spectral_regularity < 0.3:
            indicators.append(
                "Natural spectral variation — consistent with human recording."
            )

        # Temporal
        if features.temporal_patterns > 0.7:
            indicators.append(
                f"Metronomic timing precision "
                f"(tempo jitter: {features.tempo_stability:.3f}s)."
            )

        # Vocals
        if vocals and vocals.has_vocals:
            if vocals.vocal_ai_score > 0.7:
                indicators.append(
                    "Vocal analysis indicates synthetic voice characteristics."
                )
            elif vocals.vocal_ai_score < 0.3:
                indicators.append(
                    "Vocal patterns consistent with natural human singing."
                )

            if vocals.pitch_std_cents < 10:
                indicators.append(
                    f"Pitch jitter ({vocals.pitch_std_cents:.1f} cents) "
                    "is unusually low — suggests synthetic vocal."
                )

        return indicators