""" Real audio feature extraction for AI music detection. Extracts spectral, temporal, and harmonic features from audio using librosa. These features are used to distinguish AI-generated music from human-composed music. """ from __future__ import annotations import io import subprocess import tempfile from dataclasses import dataclass from pathlib import Path from typing import Optional, Union import numpy as np import librosa from .logging_config import get_logger logger = get_logger(__name__) # ── Constants ──────────────────────────────────────────────────────────── _TARGET_SR = 22050 # Standard sample rate for analysis _DURATION_LIMIT = 60.0 # Analyze max 1 minute (sufficient for detection) _N_MFCC = 13 _N_MELS = 128 _HOP_LENGTH = 512 _N_FFT = 2048 @dataclass class AudioFeatures: """Extracted audio features with normalized scores (0.0 – 1.0).""" spectral_regularity: float temporal_patterns: float harmonic_structure: float # Raw metrics for downstream consumers duration_sec: float sample_rate: int rms_energy: float rms_std: float tempo_bpm: float tempo_stability: float # std of inter-beat intervals tempo_cv: float # coefficient of variation of beat intervals spectral_centroid_mean: float spectral_centroid_std: float spectral_flatness_mean: float spectral_flatness_std: float spectral_bandwidth_mean: float spectral_bandwidth_std: float spectral_rolloff_mean: float spectral_rolloff_std: float spectral_contrast_mean: float spectral_contrast_std: float mfcc_variance: float # mean variance across MFCC bands mfcc_delta_var: float # mean variance of MFCC first derivative mfcc_delta2_var: float # mean variance of MFCC second derivative chroma_entropy: float # entropy of chroma distribution chroma_std: float # temporal chroma variability chroma_transition_rate: float # pitch class change rate harmonic_ratio: float # harmonic / (harmonic + percussive) tonnetz_std: float # tonal centroid variability zero_crossing_rate: float zero_crossing_std: float onset_strength_mean: float onset_strength_std: float rms_dynamic_range: float beat_count: int mel_flatness: float @dataclass class AudioMeta: """Basic metadata about the audio file.""" duration_sec: float sample_rate: int format: str channels: int def extract_features( source: Union[Path, bytes, io.BytesIO], *, sr: Optional[int] = None, ) -> AudioFeatures: """ Extract all analysis features from an audio source. Args: source: File path, raw bytes, or BytesIO of the audio. sr: Force a specific sample rate (default: _TARGET_SR). Returns: AudioFeatures with normalized scores and raw metrics. """ target_sr = sr or _TARGET_SR y, actual_sr = _load_audio(source, target_sr) duration_sec = float(len(y) / actual_sr) logger.info( f"Feature extraction: {duration_sec:.1f}s audio @ {actual_sr}Hz " f"({len(y)} samples)" ) # ── Core feature groups ────────────────────────────────────────── spectral = _extract_spectral(y, actual_sr) temporal = _extract_temporal(y, actual_sr) harmonic = _extract_harmonic(y, actual_sr) # ── Composite scores (0.0 = very human, 1.0 = very AI-like) ───── spectral_score = _score_spectral_regularity(spectral) temporal_score = _score_temporal_patterns(temporal) harmonic_score = _score_harmonic_structure(harmonic) return AudioFeatures( spectral_regularity=spectral_score, temporal_patterns=temporal_score, harmonic_structure=harmonic_score, duration_sec=duration_sec, sample_rate=actual_sr, rms_energy=spectral["rms_mean"], rms_std=spectral["rms_std"], tempo_bpm=temporal["tempo_bpm"], tempo_stability=temporal["tempo_stability"], tempo_cv=temporal["tempo_cv"], spectral_centroid_mean=spectral["centroid_mean"], spectral_centroid_std=spectral["centroid_std"], spectral_flatness_mean=spectral["flatness_mean"], spectral_flatness_std=spectral["flatness_std"], spectral_bandwidth_mean=spectral["bandwidth_mean"], spectral_bandwidth_std=spectral["bandwidth_std"], spectral_rolloff_mean=spectral["rolloff_mean"], spectral_rolloff_std=spectral["rolloff_std"], spectral_contrast_mean=spectral["contrast_mean"], spectral_contrast_std=spectral["contrast_std"], mfcc_variance=spectral["mfcc_variance"], mfcc_delta_var=spectral["mfcc_delta_var"], mfcc_delta2_var=spectral["mfcc_delta2_var"], chroma_entropy=harmonic["chroma_entropy"], chroma_std=harmonic["chroma_std"], chroma_transition_rate=harmonic["chroma_transition_rate"], harmonic_ratio=harmonic["harmonic_ratio"], tonnetz_std=harmonic["tonnetz_std"], zero_crossing_rate=temporal["zcr_mean"], zero_crossing_std=temporal["zcr_std"], onset_strength_mean=temporal["onset_mean"], onset_strength_std=temporal["onset_std"], rms_dynamic_range=temporal["rms_dynamic_range"], beat_count=temporal["beat_count"], mel_flatness=spectral["mel_flatness"], ) def extract_meta(source: Union[Path, bytes, io.BytesIO]) -> AudioMeta: """Quick metadata extraction without full feature analysis.""" y, sr = _load_audio(source, _TARGET_SR) fmt = "wav" channels = 1 if isinstance(source, Path): fmt = source.suffix.lstrip(".") or "wav" # librosa always returns mono; detect original channels via soundfile try: import soundfile as sf info = sf.info(str(source)) channels = info.channels except Exception: pass return AudioMeta( duration_sec=float(len(y) / sr), sample_rate=sr, format=fmt, channels=channels, ) # ═══════════════════════════════════════════════════════════════════════ # PRIVATE — Audio loading # ═══════════════════════════════════════════════════════════════════════ def _ffmpeg_decode(data: bytes) -> io.BytesIO: """Decode any audio format (webm, opus, ogg, etc.) to WAV via ffmpeg.""" with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: tmp_path = tmp.name try: result = subprocess.run( ["ffmpeg", "-y", "-i", "pipe:0", "-ar", "22050", "-ac", "1", "-f", "wav", tmp_path], input=data, capture_output=True, timeout=30, ) if result.returncode != 0: raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}") with open(tmp_path, "rb") as f: return io.BytesIO(f.read()) finally: Path(tmp_path).unlink(missing_ok=True) def _load_audio( source: Union[Path, bytes, io.BytesIO], target_sr: int, ) -> tuple[np.ndarray, int]: """Load audio from any source, mono, resampled, duration-limited.""" if isinstance(source, bytes): source = io.BytesIO(source) # Read bytes for potential ffmpeg fallback if isinstance(source, io.BytesIO): raw_bytes = source.read() source = io.BytesIO(raw_bytes) else: raw_bytes = None try: y, sr = librosa.load( source if isinstance(source, (str, Path, io.BytesIO)) else str(source), sr=target_sr, mono=True, duration=_DURATION_LIMIT, ) except Exception: # soundfile can't read webm/opus/ogg — try ffmpeg decode to WAV if raw_bytes is None: raise wav_buf = _ffmpeg_decode(raw_bytes) y, sr = librosa.load(wav_buf, sr=target_sr, mono=True, duration=_DURATION_LIMIT) # Guard against silent / corrupt files if len(y) < sr: raise ValueError("Audio too short for analysis (< 1 second)") if np.max(np.abs(y)) < 1e-6: raise ValueError("Audio is silent") return y, sr # ═══════════════════════════════════════════════════════════════════════ # PRIVATE — Spectral features # ═══════════════════════════════════════════════════════════════════════ def _extract_spectral(y: np.ndarray, sr: int) -> dict: """Spectral domain features.""" # Spectral centroid — "brightness" centroid = librosa.feature.spectral_centroid( y=y, sr=sr, hop_length=_HOP_LENGTH )[0] # Spectral flatness — how noise-like vs tonal flatness = librosa.feature.spectral_flatness( y=y, hop_length=_HOP_LENGTH )[0] # Spectral bandwidth bandwidth = librosa.feature.spectral_bandwidth( y=y, sr=sr, hop_length=_HOP_LENGTH )[0] # Spectral rolloff — frequency below which 85% of energy rolloff = librosa.feature.spectral_rolloff( y=y, sr=sr, hop_length=_HOP_LENGTH, roll_percent=0.85 )[0] # Spectral contrast — valley-to-peak in frequency bands contrast = librosa.feature.spectral_contrast( y=y, sr=sr, hop_length=_HOP_LENGTH, n_bands=6 ) # MFCCs — timbre fingerprint mfcc = librosa.feature.mfcc( y=y, sr=sr, n_mfcc=_N_MFCC, hop_length=_HOP_LENGTH ) # RMS energy rms = librosa.feature.rms(y=y, hop_length=_HOP_LENGTH)[0] # Mel spectrogram statistics mel = librosa.feature.melspectrogram( y=y, sr=sr, n_mels=_N_MELS, hop_length=_HOP_LENGTH ) mel_db = librosa.power_to_db(mel, ref=np.max) # MFCC delta (first derivative) and delta-delta (acceleration) mfcc_delta = librosa.feature.delta(mfcc) mfcc_delta2 = librosa.feature.delta(mfcc, order=2) return { "centroid_mean": float(np.mean(centroid)), "centroid_std": float(np.std(centroid)), "flatness_mean": float(np.mean(flatness)), "flatness_std": float(np.std(flatness)), "bandwidth_mean": float(np.mean(bandwidth)), "bandwidth_std": float(np.std(bandwidth)), "rolloff_mean": float(np.mean(rolloff)), "rolloff_std": float(np.std(rolloff)), "contrast_mean": float(np.mean(contrast)), "contrast_std": float(np.std(contrast)), "mfcc_variance": float(np.mean(np.var(mfcc, axis=1))), "mfcc_delta_var": float(np.mean(np.var(mfcc_delta, axis=1))), "mfcc_delta2_var": float(np.mean(np.var(mfcc_delta2, axis=1))), "rms_mean": float(np.mean(rms)), "rms_std": float(np.std(rms)), "mel_flatness": float(np.mean(np.std(mel_db, axis=0))), } # ═══════════════════════════════════════════════════════════════════════ # PRIVATE — Temporal features # ═══════════════════════════════════════════════════════════════════════ def _extract_temporal(y: np.ndarray, sr: int) -> dict: """Time-domain and rhythm features.""" # Tempo and beat tracking tempo, beat_frames = librosa.beat.beat_track( y=y, sr=sr, hop_length=_HOP_LENGTH ) tempo_bpm = float(np.atleast_1d(tempo)[0]) # Beat timing stability beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=_HOP_LENGTH) if len(beat_times) > 2: ibi = np.diff(beat_times) # inter-beat intervals tempo_stability = float(np.std(ibi)) tempo_cv = float(np.std(ibi) / np.mean(ibi)) if np.mean(ibi) > 0 else 0.0 else: tempo_stability = 0.0 tempo_cv = 0.0 # Onset strength — rhythmic energy onset_env = librosa.onset.onset_strength( y=y, sr=sr, hop_length=_HOP_LENGTH ) onset_std = float(np.std(onset_env)) onset_mean = float(np.mean(onset_env)) # Zero-crossing rate — rough texture indicator zcr = librosa.feature.zero_crossing_rate(y, hop_length=_HOP_LENGTH)[0] # RMS energy dynamics — how much volume varies rms = librosa.feature.rms(y=y, hop_length=_HOP_LENGTH)[0] rms_dynamic_range = float(np.max(rms) - np.min(rms)) if len(rms) > 0 else 0.0 return { "tempo_bpm": tempo_bpm, "tempo_stability": tempo_stability, "tempo_cv": tempo_cv, "onset_std": onset_std, "onset_mean": onset_mean, "zcr_mean": float(np.mean(zcr)), "zcr_std": float(np.std(zcr)), "rms_dynamic_range": rms_dynamic_range, "beat_count": len(beat_frames), } # ═══════════════════════════════════════════════════════════════════════ # PRIVATE — Harmonic features # ═══════════════════════════════════════════════════════════════════════ def _extract_harmonic(y: np.ndarray, sr: int) -> dict: """Harmonic and tonal features.""" # Harmonic-percussive separation y_harmonic, y_percussive = librosa.effects.hpss(y) harmonic_energy = float(np.sum(y_harmonic ** 2)) total_energy = float(np.sum(y ** 2)) harmonic_ratio = harmonic_energy / total_energy if total_energy > 0 else 0.5 # Chroma features — pitch class distribution chroma = librosa.feature.chroma_stft( y=y, sr=sr, hop_length=_HOP_LENGTH, n_chroma=12 ) # Chroma entropy — how spread the pitch classes are chroma_mean = np.mean(chroma, axis=1) chroma_mean = chroma_mean / (np.sum(chroma_mean) + 1e-10) chroma_entropy = float(-np.sum(chroma_mean * np.log2(chroma_mean + 1e-10))) # Chroma standard deviation — how stable pitch classes are over time chroma_std = float(np.mean(np.std(chroma, axis=1))) # Tonnetz — tonal centroid features (harmonic relationships) tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr) tonnetz_std = float(np.mean(np.std(tonnetz, axis=1))) # Chroma transition matrix — how often pitch classes change chroma_binary = (chroma > np.median(chroma)).astype(float) chroma_diff = np.diff(chroma_binary, axis=1) chroma_transition_rate = float(np.mean(np.abs(chroma_diff))) return { "harmonic_ratio": harmonic_ratio, "chroma_entropy": chroma_entropy, "chroma_std": chroma_std, "tonnetz_std": tonnetz_std, "chroma_transition_rate": chroma_transition_rate, } # ═══════════════════════════════════════════════════════════════════════ # PRIVATE — Composite scoring # ═══════════════════════════════════════════════════════════════════════ def _sigmoid(x: float, midpoint: float = 0.0, steepness: float = 1.0) -> float: """Sigmoid normalization to [0, 1].""" z = steepness * (x - midpoint) z = max(-20.0, min(20.0, z)) # clamp for numerical stability return 1.0 / (1.0 + np.exp(-z)) def _score_spectral_regularity(spectral: dict) -> float: """ Score how "regular" (AI-like) the spectral content is. AI music tends to have: - Lower spectral centroid variance (more uniform brightness) - Lower MFCC variance (more consistent timbre) - Higher spectral flatness (more even frequency distribution) - Lower mel spectrogram variance over time """ # Low centroid std → high regularity → more AI-like centroid_score = 1.0 - _sigmoid(spectral["centroid_std"], midpoint=800, steepness=0.003) # Low MFCC variance → consistent timbre → more AI-like mfcc_score = 1.0 - _sigmoid(spectral["mfcc_variance"], midpoint=50, steepness=0.03) # High flatness → noise-like distribution → more AI-like flatness_score = _sigmoid(spectral["flatness_mean"], midpoint=0.02, steepness=40) # Low mel variance → uniform spectral energy → more AI-like mel_score = 1.0 - _sigmoid(spectral["mel_flatness"], midpoint=10, steepness=0.1) score = ( centroid_score * 0.3 + mfcc_score * 0.3 + flatness_score * 0.2 + mel_score * 0.2 ) return round(max(0.0, min(0.99, score)), 3) def _score_temporal_patterns(temporal: dict) -> float: """ Score how "metronomic" (AI-like) the temporal patterns are. AI music tends to have: - Very low tempo variability (coefficient of variation) - Consistent onset strength (less dynamic) - Lower RMS dynamic range """ # Low tempo CV → metronomic → more AI-like # Human musicians: CV ~0.05-0.15, AI: CV ~0.01-0.04 tempo_score = 1.0 - _sigmoid(temporal["tempo_cv"], midpoint=0.06, steepness=30) # Low onset std → flat dynamics → more AI-like onset_score = 1.0 - _sigmoid(temporal["onset_std"], midpoint=1.5, steepness=1.0) # Low dynamic range → compressed → more AI-like dynamic_score = 1.0 - _sigmoid(temporal["rms_dynamic_range"], midpoint=0.15, steepness=8) # Low ZCR variance → uniform texture → more AI-like zcr_score = 1.0 - _sigmoid(temporal["zcr_std"], midpoint=0.03, steepness=30) score = ( tempo_score * 0.35 + onset_score * 0.25 + dynamic_score * 0.2 + zcr_score * 0.2 ) return round(max(0.0, min(0.99, score)), 3) def _score_harmonic_structure(harmonic: dict) -> float: """ Score how "predictable" (AI-like) the harmonic content is. AI music tends to have: - Lower chroma entropy (fewer distinct pitch classes used) - Lower chroma transition rate (less harmonic movement) - Lower tonnetz variability (simpler tonal relationships) - Higher harmonic ratio (cleaner separation) """ # Low chroma entropy → fewer pitch classes → more AI-like # Max entropy for 12 pitch classes = log2(12) ≈ 3.58 entropy_score = 1.0 - _sigmoid(harmonic["chroma_entropy"], midpoint=3.2, steepness=3) # Low transition rate → less harmonic movement → more AI-like transition_score = 1.0 - _sigmoid( harmonic["chroma_transition_rate"], midpoint=0.15, steepness=8 ) # Low tonnetz std → simpler relationships → more AI-like tonnetz_score = 1.0 - _sigmoid(harmonic["tonnetz_std"], midpoint=0.15, steepness=8) # High harmonic ratio → too clean → more AI-like hr_score = _sigmoid(harmonic["harmonic_ratio"], midpoint=0.6, steepness=5) score = ( entropy_score * 0.3 + transition_score * 0.25 + tonnetz_score * 0.25 + hr_score * 0.2 ) return round(max(0.0, min(0.99, score)), 3)