Spaces:
Sleeping
Sleeping
| """ | |
| Real audio feature extraction for AI music detection. | |
| Extracts spectral, temporal, and harmonic features from audio | |
| using librosa. These features are used to distinguish AI-generated | |
| music from human-composed music. | |
| """ | |
| from __future__ import annotations | |
| import io | |
| import subprocess | |
| import tempfile | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Optional, Union | |
| import numpy as np | |
| import librosa | |
| from .logging_config import get_logger | |
| logger = get_logger(__name__) | |
# ── Constants ────────────────────────────────────────────────────────────
# Default sample rate (Hz) that audio is resampled to for analysis.
_TARGET_SR: int = 22050
# Only the first minute of audio is loaded (sufficient for detection).
_DURATION_LIMIT: float = 60.0
# Number of MFCC coefficients requested from librosa.
_N_MFCC: int = 13
# Number of mel bands in the mel spectrogram.
_N_MELS: int = 128
# Hop length (samples) passed to the librosa feature extractors.
_HOP_LENGTH: int = 512
# FFT window size. NOTE(review): not referenced by the visible code, which
# never passes n_fft to librosa — confirm before relying on or removing it.
_N_FFT: int = 2048
@dataclass
class AudioFeatures:
    """Extracted audio features with normalized scores (0.0 – 1.0).

    Declared as a dataclass so instances can be built with keyword
    arguments, one per field below. Field order is part of the positional
    constructor signature and must not be rearranged.
    """

    # Composite scores (0.0 = very human, 1.0 = very AI-like)
    spectral_regularity: float
    temporal_patterns: float
    harmonic_structure: float

    # Raw metrics for downstream consumers
    duration_sec: float
    sample_rate: int
    rms_energy: float
    rms_std: float
    tempo_bpm: float
    tempo_stability: float  # std of inter-beat intervals
    tempo_cv: float  # coefficient of variation of beat intervals
    spectral_centroid_mean: float
    spectral_centroid_std: float
    spectral_flatness_mean: float
    spectral_flatness_std: float
    spectral_bandwidth_mean: float
    spectral_bandwidth_std: float
    spectral_rolloff_mean: float
    spectral_rolloff_std: float
    spectral_contrast_mean: float
    spectral_contrast_std: float
    mfcc_variance: float  # mean variance across MFCC bands
    mfcc_delta_var: float  # mean variance of MFCC first derivative
    mfcc_delta2_var: float  # mean variance of MFCC second derivative
    chroma_entropy: float  # entropy of chroma distribution
    chroma_std: float  # temporal chroma variability
    chroma_transition_rate: float  # pitch class change rate
    harmonic_ratio: float  # harmonic / (harmonic + percussive)
    tonnetz_std: float  # tonal centroid variability
    zero_crossing_rate: float
    zero_crossing_std: float
    onset_strength_mean: float
    onset_strength_std: float
    rms_dynamic_range: float
    beat_count: int
    mel_flatness: float
@dataclass
class AudioMeta:
    """Basic metadata about the audio file.

    Declared as a dataclass so instances can be built with keyword
    arguments, one per field below.
    """

    duration_sec: float  # length of the audio in seconds
    sample_rate: int  # sample rate in Hz
    format: str  # container/extension name such as "wav"
    channels: int  # number of audio channels
def extract_features(
    source: Union[Path, bytes, io.BytesIO],
    *,
    sr: Optional[int] = None,
) -> AudioFeatures:
    """
    Load an audio source and compute every analysis feature for it.

    Args:
        source: File path, raw bytes, or BytesIO holding the audio.
        sr: Sample rate to load at; a falsy value (``None`` or ``0``)
            selects ``_TARGET_SR``.

    Returns:
        AudioFeatures carrying the three composite scores together with
        the raw spectral, temporal, and harmonic metrics.
    """
    y, actual_sr = _load_audio(source, sr if sr else _TARGET_SR)
    duration_sec = float(len(y) / actual_sr)
    logger.info(
        f"Feature extraction: {duration_sec:.1f}s audio @ {actual_sr}Hz "
        f"({len(y)} samples)"
    )

    # One dict of raw metrics per feature family.
    spec = _extract_spectral(y, actual_sr)
    temp = _extract_temporal(y, actual_sr)
    harm = _extract_harmonic(y, actual_sr)

    return AudioFeatures(
        # Composite scores (0.0 = very human, 1.0 = very AI-like)
        spectral_regularity=_score_spectral_regularity(spec),
        temporal_patterns=_score_temporal_patterns(temp),
        harmonic_structure=_score_harmonic_structure(harm),
        # Signal basics
        duration_sec=duration_sec,
        sample_rate=actual_sr,
        # Spectral metrics
        rms_energy=spec["rms_mean"],
        rms_std=spec["rms_std"],
        spectral_centroid_mean=spec["centroid_mean"],
        spectral_centroid_std=spec["centroid_std"],
        spectral_flatness_mean=spec["flatness_mean"],
        spectral_flatness_std=spec["flatness_std"],
        spectral_bandwidth_mean=spec["bandwidth_mean"],
        spectral_bandwidth_std=spec["bandwidth_std"],
        spectral_rolloff_mean=spec["rolloff_mean"],
        spectral_rolloff_std=spec["rolloff_std"],
        spectral_contrast_mean=spec["contrast_mean"],
        spectral_contrast_std=spec["contrast_std"],
        mfcc_variance=spec["mfcc_variance"],
        mfcc_delta_var=spec["mfcc_delta_var"],
        mfcc_delta2_var=spec["mfcc_delta2_var"],
        mel_flatness=spec["mel_flatness"],
        # Temporal metrics
        tempo_bpm=temp["tempo_bpm"],
        tempo_stability=temp["tempo_stability"],
        tempo_cv=temp["tempo_cv"],
        zero_crossing_rate=temp["zcr_mean"],
        zero_crossing_std=temp["zcr_std"],
        onset_strength_mean=temp["onset_mean"],
        onset_strength_std=temp["onset_std"],
        rms_dynamic_range=temp["rms_dynamic_range"],
        beat_count=temp["beat_count"],
        # Harmonic metrics
        chroma_entropy=harm["chroma_entropy"],
        chroma_std=harm["chroma_std"],
        chroma_transition_rate=harm["chroma_transition_rate"],
        harmonic_ratio=harm["harmonic_ratio"],
        tonnetz_std=harm["tonnetz_std"],
    )
def extract_meta(source: Union[Path, bytes, io.BytesIO]) -> AudioMeta:
    """Quick metadata extraction without full feature analysis.

    Args:
        source: File path, raw bytes, or BytesIO of the audio.

    Returns:
        AudioMeta for the source. ``format`` is the file suffix (without
        the dot) for Path sources and ``"wav"`` otherwise; ``channels`` is
        read via soundfile for Path sources and defaults to 1 when it
        cannot be determined.

    Raises:
        Whatever ``_load_audio`` raises for unreadable, too-short, or
        silent audio.
    """
    # NOTE(review): duration and sample rate describe the signal returned by
    # _load_audio (loaded at _TARGET_SR), not necessarily the file on disk.
    y, sr = _load_audio(source, _TARGET_SR)

    fmt = "wav"
    channels = 1
    if isinstance(source, Path):
        fmt = source.suffix.lstrip(".") or "wav"
        # librosa always returns mono; detect original channels via soundfile
        try:
            import soundfile as sf

            channels = sf.info(str(source)).channels
        except Exception:
            # Best effort: keep the mono default, but leave a trace instead
            # of hiding the failure completely.
            logger.debug(
                "Could not read channel count for %s; assuming mono",
                source,
                exc_info=True,
            )

    return AudioMeta(
        duration_sec=float(len(y) / sr),
        sample_rate=sr,
        format=fmt,
        channels=channels,
    )
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Audio loading | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _ffmpeg_decode(data: bytes) -> io.BytesIO:
    """Decode any audio format (webm, opus, ogg, etc.) to WAV via ffmpeg.

    The bytes are piped to ffmpeg's stdin and written as mono WAV at
    ``_TARGET_SR`` to a temporary file, which is read back into memory and
    always removed afterwards.

    Args:
        data: Encoded audio bytes in any container ffmpeg can read.

    Returns:
        BytesIO positioned at the start of the decoded WAV data.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 30 seconds.
        OSError: the ffmpeg executable could not be started.
    """
    # Reserve a path only; ffmpeg itself writes the file (hence "-y").
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = Path(tmp.name)
    try:
        result = subprocess.run(
            [
                "ffmpeg",
                # Keep stderr limited to real errors so the truncated
                # message below is not consumed by the version banner.
                "-hide_banner", "-loglevel", "error",
                "-y",
                "-i", "pipe:0",
                "-ar", str(_TARGET_SR),
                "-ac", "1",
                "-f", "wav",
                str(tmp_path),
            ],
            input=data,
            capture_output=True,
            timeout=30,
        )
        if result.returncode != 0:
            # errors="replace": undecodable stderr must not hide the failure.
            detail = result.stderr.decode(errors="replace")[:200]
            raise RuntimeError(f"ffmpeg failed: {detail}")
        return io.BytesIO(tmp_path.read_bytes())
    finally:
        tmp_path.unlink(missing_ok=True)
def _load_audio(
    source: Union[Path, bytes, io.BytesIO],
    target_sr: int,
) -> tuple[np.ndarray, int]:
    """Load audio from any source, mono, resampled, duration-limited.

    In-memory sources (``bytes`` / ``BytesIO``) are decoded with librosa
    first and, if that fails, transcoded to WAV with ffmpeg and decoded
    again. Path sources have no such fallback.

    Args:
        source: File path, raw bytes, or BytesIO of the audio. A BytesIO is
            read from its current position.
        target_sr: Sample rate passed to ``librosa.load``.

    Returns:
        ``(samples, sample_rate)`` as returned by ``librosa.load``.

    Raises:
        ValueError: the in-memory source is empty, or the decoded signal is
            shorter than one second, contains non-finite samples, or is
            silent.
        Exception: decoding errors from librosa (Path sources) or from the
            ffmpeg fallback (in-memory sources).
    """
    if isinstance(source, bytes):
        source = io.BytesIO(source)

    # Keep the raw bytes of in-memory sources for the ffmpeg fallback.
    raw_bytes: Optional[bytes] = None
    if isinstance(source, io.BytesIO):
        raw_bytes = source.read()
        if not raw_bytes:
            # Nothing to decode; fail clearly instead of invoking ffmpeg.
            raise ValueError("Audio source is empty")
        source = io.BytesIO(raw_bytes)

    try:
        y, sr = librosa.load(
            source if isinstance(source, (str, Path, io.BytesIO)) else str(source),
            sr=target_sr,
            mono=True,
            duration=_DURATION_LIMIT,
        )
    except Exception as direct_err:
        # soundfile can't read webm/opus/ogg — try ffmpeg decode to WAV
        if raw_bytes is None:
            raise
        logger.debug("Direct decode failed (%r); retrying via ffmpeg", direct_err)
        wav_buf = _ffmpeg_decode(raw_bytes)
        y, sr = librosa.load(
            wav_buf, sr=target_sr, mono=True, duration=_DURATION_LIMIT
        )

    # Guard against silent / corrupt files
    if len(y) < sr:
        raise ValueError("Audio too short for analysis (< 1 second)")
    if not np.all(np.isfinite(y)):
        # NaN would also slip past the silence comparison below.
        raise ValueError("Audio contains non-finite samples")
    if np.max(np.abs(y)) < 1e-6:
        raise ValueError("Audio is silent")
    return y, sr
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Spectral features | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _extract_spectral(y: np.ndarray, sr: int) -> dict:
    """Compute frequency-domain summary statistics for a signal.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y`` in Hz.

    Returns:
        Dict of floats: mean/std of centroid, flatness, bandwidth, rolloff,
        contrast and RMS, the mean per-coefficient variance of the MFCCs and
        their first/second deltas, and ``mel_flatness``.
    """

    def mean_std(values) -> tuple:
        """Return (mean, std) of ``values`` as plain floats."""
        return float(np.mean(values)), float(np.std(values))

    def band_variance(coeffs) -> float:
        """Variance along axis 1 of ``coeffs``, averaged over rows."""
        return float(np.mean(np.var(coeffs, axis=1)))

    feat = librosa.feature

    # "Brightness" of the sound.
    centroid_mean, centroid_std = mean_std(
        feat.spectral_centroid(y=y, sr=sr, hop_length=_HOP_LENGTH)[0]
    )
    # Noise-like versus tonal.
    flatness_mean, flatness_std = mean_std(
        feat.spectral_flatness(y=y, hop_length=_HOP_LENGTH)[0]
    )
    bandwidth_mean, bandwidth_std = mean_std(
        feat.spectral_bandwidth(y=y, sr=sr, hop_length=_HOP_LENGTH)[0]
    )
    # Frequency below which 85% of the energy lies.
    rolloff_mean, rolloff_std = mean_std(
        feat.spectral_rolloff(
            y=y, sr=sr, hop_length=_HOP_LENGTH, roll_percent=0.85
        )[0]
    )
    # Valley-to-peak contrast; statistics are taken over all bands and frames.
    contrast_mean, contrast_std = mean_std(
        feat.spectral_contrast(y=y, sr=sr, hop_length=_HOP_LENGTH, n_bands=6)
    )

    # Timbre fingerprint plus its first and second derivatives.
    mfcc = feat.mfcc(y=y, sr=sr, n_mfcc=_N_MFCC, hop_length=_HOP_LENGTH)
    mfcc_delta = feat.delta(mfcc)
    mfcc_delta2 = feat.delta(mfcc, order=2)

    rms_mean, rms_std = mean_std(feat.rms(y=y, hop_length=_HOP_LENGTH)[0])

    # Mel spectrogram in dB relative to its maximum.
    mel_power = feat.melspectrogram(
        y=y, sr=sr, n_mels=_N_MELS, hop_length=_HOP_LENGTH
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    return {
        "centroid_mean": centroid_mean,
        "centroid_std": centroid_std,
        "flatness_mean": flatness_mean,
        "flatness_std": flatness_std,
        "bandwidth_mean": bandwidth_mean,
        "bandwidth_std": bandwidth_std,
        "rolloff_mean": rolloff_mean,
        "rolloff_std": rolloff_std,
        "contrast_mean": contrast_mean,
        "contrast_std": contrast_std,
        "mfcc_variance": band_variance(mfcc),
        "mfcc_delta_var": band_variance(mfcc_delta),
        "mfcc_delta2_var": band_variance(mfcc_delta2),
        "rms_mean": rms_mean,
        "rms_std": rms_std,
        # Std along axis 0 of the dB mel spectrogram, averaged.
        "mel_flatness": float(np.mean(np.std(mel_db, axis=0))),
    }
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Temporal features | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _extract_temporal(y: np.ndarray, sr: int) -> dict:
    """Compute rhythm and time-domain statistics for a signal.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y`` in Hz.

    Returns:
        Dict with tempo (BPM), beat-interval std and coefficient of
        variation, onset-strength mean/std, zero-crossing-rate mean/std,
        RMS dynamic range, and the number of detected beats.
    """
    # Tempo and beat positions.
    tempo, beat_frames = librosa.beat.beat_track(
        y=y, sr=sr, hop_length=_HOP_LENGTH
    )
    beat_times = librosa.frames_to_time(
        beat_frames, sr=sr, hop_length=_HOP_LENGTH
    )

    # Beat-interval statistics; both stay 0.0 unless more than two beats
    # were found.
    tempo_stability = 0.0
    tempo_cv = 0.0
    if len(beat_times) > 2:
        intervals = np.diff(beat_times)
        tempo_stability = float(np.std(intervals))
        if np.mean(intervals) > 0:
            tempo_cv = float(np.std(intervals) / np.mean(intervals))

    # Rhythmic energy.
    onset_env = librosa.onset.onset_strength(
        y=y, sr=sr, hop_length=_HOP_LENGTH
    )
    # Rough texture indicator.
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=_HOP_LENGTH)[0]

    # Spread between the loudest and quietest frame.
    rms = librosa.feature.rms(y=y, hop_length=_HOP_LENGTH)[0]
    if len(rms) > 0:
        rms_dynamic_range = float(np.max(rms) - np.min(rms))
    else:
        rms_dynamic_range = 0.0

    return {
        "tempo_bpm": float(np.atleast_1d(tempo)[0]),
        "tempo_stability": tempo_stability,
        "tempo_cv": tempo_cv,
        "onset_std": float(np.std(onset_env)),
        "onset_mean": float(np.mean(onset_env)),
        "zcr_mean": float(np.mean(zcr)),
        "zcr_std": float(np.std(zcr)),
        "rms_dynamic_range": rms_dynamic_range,
        "beat_count": len(beat_frames),
    }
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Harmonic features | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _extract_harmonic(y: np.ndarray, sr: int) -> dict:
    """Compute harmonic and tonal statistics for a signal.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y`` in Hz.

    Returns:
        Dict with the harmonic energy ratio, chroma entropy, chroma std,
        tonnetz std, and chroma transition rate, all as floats.
    """
    # Harmonic-percussive separation; only the harmonic part is used.
    y_harmonic, _ = librosa.effects.hpss(y)
    energy_harmonic = float(np.sum(y_harmonic ** 2))
    energy_total = float(np.sum(y ** 2))
    if energy_total > 0:
        harmonic_ratio = energy_harmonic / energy_total
    else:
        # No energy at all: fall back to a neutral 0.5.
        harmonic_ratio = 0.5

    # Pitch-class energy per frame.
    chroma = librosa.feature.chroma_stft(
        y=y, sr=sr, hop_length=_HOP_LENGTH, n_chroma=12
    )

    # Entropy (bits) of the time-averaged, normalized chroma distribution;
    # the 1e-10 terms avoid division by zero and log2(0).
    pitch_profile = np.mean(chroma, axis=1)
    pitch_profile = pitch_profile / (np.sum(pitch_profile) + 1e-10)
    chroma_entropy = float(
        -np.sum(pitch_profile * np.log2(pitch_profile + 1e-10))
    )

    # Fraction of above-median chroma cells that flip between adjacent
    # frames.
    active = (chroma > np.median(chroma)).astype(float)
    flips = np.diff(active, axis=1)
    chroma_transition_rate = float(np.mean(np.abs(flips)))

    # Tonal centroid features computed on the harmonic component.
    tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr)

    return {
        "harmonic_ratio": harmonic_ratio,
        "chroma_entropy": chroma_entropy,
        "chroma_std": float(np.mean(np.std(chroma, axis=1))),
        "tonnetz_std": float(np.mean(np.std(tonnetz, axis=1))),
        "chroma_transition_rate": chroma_transition_rate,
    }
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Composite scoring | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _sigmoid(x: float, midpoint: float = 0.0, steepness: float = 1.0) -> float:
    """Squash ``x`` into (0, 1) with a logistic curve.

    Args:
        x: Value to normalize.
        midpoint: Value of ``x`` that maps to 0.5.
        steepness: Slope factor applied to ``x - midpoint``.

    Returns:
        ``1 / (1 + exp(-steepness * (x - midpoint)))`` with the exponent
        limited to [-20, 20].
    """
    exponent = (x - midpoint) * steepness
    # Bound the exponent so np.exp stays numerically stable.
    exponent = min(20.0, exponent)
    exponent = max(-20.0, exponent)
    return 1.0 / (np.exp(-exponent) + 1.0)
def _score_spectral_regularity(spectral: dict) -> float:
    """
    Rate how "regular" (AI-like) the spectral content is.

    Heuristics encoded here — AI music is assumed to show:
    - lower spectral centroid spread (more uniform brightness)
    - lower MFCC variance (more consistent timbre)
    - higher spectral flatness (more even frequency distribution)
    - lower ``mel_flatness`` (more uniform mel energy)

    Args:
        spectral: Dict providing ``centroid_std``, ``mfcc_variance``,
            ``flatness_mean`` and ``mel_flatness``.

    Returns:
        Weighted score limited to [0.0, 0.99], rounded to 3 decimals.
    """
    # (weight, component) pairs; "1.0 - sigmoid" inverts metrics where a
    # LOW raw value should count as more AI-like.
    weighted_components = (
        (0.3, 1.0 - _sigmoid(spectral["centroid_std"], midpoint=800, steepness=0.003)),
        (0.3, 1.0 - _sigmoid(spectral["mfcc_variance"], midpoint=50, steepness=0.03)),
        (0.2, _sigmoid(spectral["flatness_mean"], midpoint=0.02, steepness=40)),
        (0.2, 1.0 - _sigmoid(spectral["mel_flatness"], midpoint=10, steepness=0.1)),
    )

    total = 0.0
    for weight, component in weighted_components:
        total += component * weight

    capped = min(0.99, total)
    return round(max(0.0, capped), 3)
def _score_temporal_patterns(temporal: dict) -> float:
    """
    Rate how "metronomic" (AI-like) the temporal patterns are.

    Heuristics encoded here — AI music is assumed to show:
    - very low tempo variability (coefficient of variation)
    - consistent onset strength (less dynamic)
    - lower RMS dynamic range
    - lower zero-crossing-rate variability (uniform texture)

    Args:
        temporal: Dict providing ``tempo_cv``, ``onset_std``,
            ``rms_dynamic_range`` and ``zcr_std``.

    Returns:
        Weighted score limited to [0.0, 0.99], rounded to 3 decimals.
    """

    def inverted(key: str, midpoint: float, steepness: float) -> float:
        """Low raw value → score near 1 (more AI-like)."""
        return 1.0 - _sigmoid(temporal[key], midpoint=midpoint, steepness=steepness)

    # Human musicians: CV ~0.05-0.15, AI: CV ~0.01-0.04
    tempo_part = inverted("tempo_cv", 0.06, 30)
    onset_part = inverted("onset_std", 1.5, 1.0)
    dynamics_part = inverted("rms_dynamic_range", 0.15, 8)
    texture_part = inverted("zcr_std", 0.03, 30)

    total = 0.0
    for component, weight in (
        (tempo_part, 0.35),
        (onset_part, 0.25),
        (dynamics_part, 0.2),
        (texture_part, 0.2),
    ):
        total += component * weight

    capped = min(0.99, total)
    return round(max(0.0, capped), 3)
def _score_harmonic_structure(harmonic: dict) -> float:
    """
    Rate how "predictable" (AI-like) the harmonic content is.

    Heuristics encoded here — AI music is assumed to show:
    - lower chroma entropy (fewer distinct pitch classes used)
    - lower chroma transition rate (less harmonic movement)
    - lower tonnetz variability (simpler tonal relationships)
    - higher harmonic ratio (cleaner separation)

    Args:
        harmonic: Dict providing ``chroma_entropy``,
            ``chroma_transition_rate``, ``tonnetz_std`` and
            ``harmonic_ratio``.

    Returns:
        Weighted score limited to [0.0, 0.99], rounded to 3 decimals.
    """
    # Max entropy for 12 pitch classes = log2(12) ≈ 3.58, so the midpoint
    # of 3.2 sits just below a uniform distribution.
    low_entropy = 1.0 - _sigmoid(
        harmonic["chroma_entropy"], midpoint=3.2, steepness=3
    )
    low_movement = 1.0 - _sigmoid(
        harmonic["chroma_transition_rate"], midpoint=0.15, steepness=8
    )
    low_tonal_spread = 1.0 - _sigmoid(
        harmonic["tonnetz_std"], midpoint=0.15, steepness=8
    )
    # Not inverted: a HIGH harmonic ratio counts as more AI-like.
    high_harmonicity = _sigmoid(
        harmonic["harmonic_ratio"], midpoint=0.6, steepness=5
    )

    total = 0.0
    for component, weight in (
        (low_entropy, 0.3),
        (low_movement, 0.25),
        (low_tonal_spread, 0.25),
        (high_harmonicity, 0.2),
    ):
        total += component * weight

    capped = min(0.99, total)
    return round(max(0.0, capped), 3)