""" Meta-classifier service for AURIS score fusion. Replaces the hand-tuned weighted-average fusion with a trained stacking ensemble that combines signals from all analysis towers. Towers: 1. wav2vec2 fine-tuned (logit + hidden stats) 2. Librosa features (spectral, temporal, harmonic) 3. Vocal analysis (pitch, vibrato, formant, breath) 4. CLAP embeddings (when available) 5. FST external (when available) The meta-classifier was trained on the same dataset with cross-validated tower outputs. """ from __future__ import annotations import json import pickle from dataclasses import dataclass, field from pathlib import Path from typing import List, Optional import numpy as np from .feature_extractor import AudioFeatures from .vocal_analyzer import VocalFeatures from .wav2vec2_detector import Wav2Vec2Result from .clap_detector import CLAPResult from .fst_client import FSTResult from .logging_config import get_logger logger = get_logger(__name__) _MODELS_DIR = Path(__file__).resolve().parents[2] / "models" @dataclass class MetaResult: """Final detection result from meta-classifier.""" is_ai_generated: bool confidence: float model_version: str = "auris-v2-meta" decision_source: str = "auris_meta" analysis_mode: str = "production" # Per-tower scores for transparency tower_scores: dict = field(default_factory=dict) # Explainable indicators (SHAP-based when available) indicators: List[str] = field(default_factory=list) # Feature importances for this prediction top_features: List[dict] = field(default_factory=list) class MetaClassifierService: """ Trained stacking meta-classifier. Combines all tower outputs into a single feature vector, runs the trained classifier, and generates explainable indicators. Falls back to simple averaging when trained model is not available (during development before first training). """ def __init__(self) -> None: """Initialize meta-classifier with empty state.""" self._model = None self._scaler = None self._feature_cols: list[str] = [] self._initialized = False self._trained = False def _ensure_loaded(self) -> bool: """Load trained meta-classifier if available.""" if self._initialized: return self._trained self._initialized = True model_path = _MODELS_DIR / "auris_classifier_v1.pkl" scaler_path = _MODELS_DIR / "feature_scaler_v1.pkl" columns_path = _MODELS_DIR / "feature_columns_v1.json" if not model_path.exists(): logger.info( "Meta-classifier not found. " "Using fallback fusion." ) return False try: with open(model_path, "rb") as f: self._model = pickle.load(f) with open(scaler_path, "rb") as f: self._scaler = pickle.load(f) with open(columns_path, "r") as f: self._feature_cols = json.load(f) self._trained = True logger.info( f"Meta-classifier loaded: " f"{type(self._model).__name__}, " f"{len(self._feature_cols)} features" ) return True except Exception as e: logger.error(f"Failed to load meta-classifier: {e}") return False def predict( self, features: AudioFeatures, vocals: Optional[VocalFeatures] = None, wav2vec2: Optional[Wav2Vec2Result] = None, clap: Optional[CLAPResult] = None, fst: Optional[FSTResult] = None, ) -> MetaResult: """ Run meta-classifier on all tower outputs. Args: features: Librosa-extracted audio features. vocals: Vocal analysis results. wav2vec2: wav2vec2 tower result. clap: CLAP embedding result. fst: FST external API result. Returns: MetaResult with final prediction and explanations. """ is_trained = self._ensure_loaded() # Collect per-tower scores for transparency tower_scores = {} if wav2vec2 and wav2vec2.available: tower_scores["wav2vec2"] = wav2vec2.p_ai if clap and clap.available: tower_scores["clap"] = clap.confidence if fst and fst.available: tower_scores["fst"] = fst.confidence # Local feature score (heuristic, used as fallback signal) local_score = ( features.spectral_regularity * 0.35 + features.temporal_patterns * 0.35 + features.harmonic_structure * 0.30 ) tower_scores["local_features"] = round(local_score, 4) if vocals and vocals.has_vocals: tower_scores["vocals"] = vocals.vocal_ai_score if is_trained: return self._predict_trained( features, vocals, wav2vec2, clap, fst, tower_scores, ) else: return self._predict_fallback( features, vocals, wav2vec2, clap, fst, tower_scores, ) def _predict_trained( self, features: AudioFeatures, vocals: Optional[VocalFeatures], wav2vec2: Optional[Wav2Vec2Result], clap: Optional[CLAPResult], fst: Optional[FSTResult], tower_scores: dict, ) -> MetaResult: """Prediction using trained meta-classifier.""" # Build feature vector matching training columns feat_dict = self._build_feature_dict( features, vocals, ) # Assemble in correct column order x = np.array([ feat_dict.get(col, 0.0) for col in self._feature_cols ], dtype=np.float32).reshape(1, -1) x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0) x_scaled = self._scaler.transform(x) # Predict proba = self._model.predict_proba(x_scaled)[0] p_ai = float(proba[1]) # FST calibration (not in trained model) if fst and fst.available: tower_scores["fst"] = fst.confidence if (p_ai > 0.5) != fst.is_ai: # Disagreement — moderate confidence p_ai = p_ai * 0.85 + 0.5 * 0.15 is_ai = p_ai > 0.5 confidence = round(p_ai if is_ai else 1.0 - p_ai, 4) # Generate indicators indicators = self._build_indicators( is_ai, confidence, features, vocals, tower_scores, ) # Feature importances for this prediction top_features = self._get_top_features(x_scaled[0]) return MetaResult( is_ai_generated=is_ai, confidence=confidence, model_version="auris-v2-trained", decision_source="auris_meta", analysis_mode="production", tower_scores=tower_scores, indicators=indicators, top_features=top_features, ) def _predict_fallback( self, features: AudioFeatures, vocals: Optional[VocalFeatures], wav2vec2: Optional[Wav2Vec2Result], clap: Optional[CLAPResult], fst: Optional[FSTResult], tower_scores: dict, ) -> MetaResult: """ Fallback when trained model is not available. Uses weighted averaging of available tower scores. Better than heuristic-only but not data-driven. """ scores = [] weights = [] # wav2vec2 gets highest weight if available if wav2vec2 and wav2vec2.available: scores.append(wav2vec2.p_ai) weights.append(0.40) # Local features local = tower_scores.get("local_features", 0.5) scores.append(local) weights.append(0.25 if wav2vec2 and wav2vec2.available else 0.45) # Vocals if vocals and vocals.has_vocals: scores.append(vocals.vocal_ai_score) weights.append(0.15) # CLAP if clap and clap.available: scores.append(clap.confidence) weights.append(0.10) # FST if fst and fst.available: scores.append(fst.confidence) weights.append(0.20) # Weighted average total_w = sum(weights) p_ai = sum( s * (w / total_w) for s, w in zip(scores, weights) ) is_ai = p_ai > 0.5 confidence = round(max(0.51, min(0.97, p_ai)), 4) indicators = self._build_indicators( is_ai, confidence, features, vocals, tower_scores, ) return MetaResult( is_ai_generated=is_ai, confidence=confidence, model_version="auris-v1-heuristic", decision_source="auris_fallback", analysis_mode="production", tower_scores=tower_scores, indicators=indicators, ) def _build_feature_dict( self, features: AudioFeatures, vocals: Optional[VocalFeatures], ) -> dict: """Build flat feature dict for meta-classifier.""" d = { "duration_sec": features.duration_sec, "sample_rate": features.sample_rate, "rms_energy": features.rms_energy, "tempo_bpm": features.tempo_bpm, "tempo_stability": features.tempo_stability, "spectral_centroid_mean": features.spectral_centroid_mean, "spectral_centroid_std": features.spectral_centroid_std, "spectral_flatness_mean": features.spectral_flatness_mean, "mfcc_variance": features.mfcc_variance, "chroma_entropy": features.chroma_entropy, "harmonic_ratio": features.harmonic_ratio, "zero_crossing_rate": features.zero_crossing_rate, "spectral_regularity": features.spectral_regularity, "temporal_patterns": features.temporal_patterns, "harmonic_structure": features.harmonic_structure, } if vocals: d.update({ "has_vocals": 1.0 if vocals.has_vocals else 0.0, "vocal_confidence": vocals.vocal_confidence, "vocal_ai_score": vocals.vocal_ai_score, "pitch_stability_score": vocals.pitch_stability_score, "vibrato_regularity_score": vocals.vibrato_regularity_score, "formant_consistency_score": vocals.formant_consistency_score, "breath_pattern_score": vocals.breath_pattern_score, "vocal_texture_score": vocals.vocal_texture_score, "pitch_mean_hz": vocals.pitch_mean_hz, "pitch_std_cents": vocals.pitch_std_cents, "vibrato_rate_hz": vocals.vibrato_rate_hz, "vibrato_extent_cents": vocals.vibrato_extent_cents, "vocal_harmonic_ratio": vocals.vocal_harmonic_ratio, "vocal_energy_ratio": vocals.vocal_energy_ratio, }) else: for key in [ "has_vocals", "vocal_confidence", "vocal_ai_score", "pitch_stability_score", "vibrato_regularity_score", "formant_consistency_score", "breath_pattern_score", "vocal_texture_score", "pitch_mean_hz", "pitch_std_cents", "vibrato_rate_hz", "vibrato_extent_cents", "vocal_harmonic_ratio", "vocal_energy_ratio", ]: d[key] = 0.0 return d def _get_top_features( self, x: np.ndarray, top_n: int = 5 ) -> list[dict]: """ Get top contributing features for this prediction. Uses feature_importances_ from tree models. In future: SHAP values for per-sample explanation. """ if not hasattr(self._model, "feature_importances_"): return [] importances = self._model.feature_importances_ indices = np.argsort(importances)[::-1][:top_n] result = [] for idx in indices: col_name = ( self._feature_cols[idx] if idx < len(self._feature_cols) else f"feature_{idx}" ) result.append({ "feature": col_name, "importance": round(float(importances[idx]), 4), "value": round(float(x[idx]), 4), }) return result @staticmethod def _build_indicators( is_ai: bool, confidence: float, features: AudioFeatures, vocals: Optional[VocalFeatures], tower_scores: dict, ) -> list[str]: """Generate human-readable indicators.""" indicators = [] # Overall label = "AI-generated" if is_ai else "human-composed" if confidence > 0.85: indicators.append( f"High confidence: classified as {label}." ) elif confidence > 0.70: indicators.append( f"Moderate confidence: likely {label}." ) else: indicators.append( f"Low confidence: borderline {label}." ) # Tower agreement ai_towers = sum( 1 for v in tower_scores.values() if v > 0.5 ) human_towers = sum( 1 for v in tower_scores.values() if v <= 0.5 ) total = ai_towers + human_towers if total > 1: if ai_towers == total: indicators.append( f"All {total} analysis signals agree: AI-generated." ) elif human_towers == total: indicators.append( f"All {total} analysis signals agree: human-composed." ) else: indicators.append( f"Mixed signals: {ai_towers}/{total} indicate AI, " f"{human_towers}/{total} indicate human." ) # Spectral if features.spectral_regularity > 0.7: indicators.append( "High spectral regularity — typical of AI synthesis." ) elif features.spectral_regularity < 0.3: indicators.append( "Natural spectral variation — consistent with human recording." ) # Temporal if features.temporal_patterns > 0.7: indicators.append( f"Metronomic timing precision " f"(tempo jitter: {features.tempo_stability:.3f}s)." ) # Vocals if vocals and vocals.has_vocals: if vocals.vocal_ai_score > 0.7: indicators.append( "Vocal analysis indicates synthetic voice characteristics." ) elif vocals.vocal_ai_score < 0.3: indicators.append( "Vocal patterns consistent with natural human singing." ) if vocals.pitch_std_cents < 10: indicators.append( f"Pitch jitter ({vocals.pitch_std_cents:.1f} cents) " "is unusually low — suggests synthetic vocal." ) return indicators