crowncode-backend / app /services /feature_extractor.py
Rthur2003's picture
fix: add ffmpeg decoding fallback for unsupported audio formats in feature extraction
14fc9a2
Raw
History Blame Contribute Delete
19.8 kB
"""
Real audio feature extraction for AI music detection.
Extracts spectral, temporal, and harmonic features from audio
using librosa. These features are used to distinguish AI-generated
music from human-composed music.
"""
from __future__ import annotations
import io
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union
import numpy as np
import librosa
from .logging_config import get_logger
logger = get_logger(__name__)
# ── Constants ────────────────────────────────────────────────────────────
# Shared analysis parameters used by the loader and the feature extractors below.
_TARGET_SR = 22050 # Default sample rate (Hz) that audio is resampled to on load
_DURATION_LIMIT = 60.0 # Only the first 60 seconds are loaded (sufficient for detection)
_N_MFCC = 13 # Number of MFCC coefficients requested from librosa
_N_MELS = 128 # Number of mel bands in the mel spectrogram
_HOP_LENGTH = 512 # Hop length (samples between frames) for all frame-based features
_N_FFT = 2048 # FFT window size. NOTE(review): not referenced in the code visible here — confirm it is used elsewhere
@dataclass
class AudioFeatures:
    """Result of one feature-extraction pass over an audio clip.

    The first three fields are composite scores (0.0 – 1.0, higher meaning
    more AI-like); every other field is a raw metric kept for downstream
    consumers. Field order is part of the positional constructor interface.
    """

    # ── Composite scores ────────────────────────────────────────────────
    spectral_regularity: float
    temporal_patterns: float
    harmonic_structure: float

    # ── Clip / loudness basics ──────────────────────────────────────────
    duration_sec: float
    sample_rate: int
    rms_energy: float  # mean of frame-wise RMS
    rms_std: float  # std of frame-wise RMS

    # ── Tempo ───────────────────────────────────────────────────────────
    tempo_bpm: float
    tempo_stability: float  # standard deviation of inter-beat intervals
    tempo_cv: float  # std / mean of inter-beat intervals

    # ── Spectral shape (mean / std over frames) ─────────────────────────
    spectral_centroid_mean: float
    spectral_centroid_std: float
    spectral_flatness_mean: float
    spectral_flatness_std: float
    spectral_bandwidth_mean: float
    spectral_bandwidth_std: float
    spectral_rolloff_mean: float
    spectral_rolloff_std: float
    spectral_contrast_mean: float
    spectral_contrast_std: float

    # ── Timbre (MFCC) ───────────────────────────────────────────────────
    mfcc_variance: float  # per-coefficient variance, averaged
    mfcc_delta_var: float  # same, for the first-order MFCC delta
    mfcc_delta2_var: float  # same, for the second-order MFCC delta

    # ── Harmony / tonality ──────────────────────────────────────────────
    chroma_entropy: float  # base-2 entropy of the time-averaged chroma
    chroma_std: float  # per-pitch-class std over time, averaged
    chroma_transition_rate: float  # mean change of median-thresholded chroma
    harmonic_ratio: float  # harmonic-component energy / full-signal energy
    tonnetz_std: float  # per-dimension tonnetz std over time, averaged

    # ── Texture, onsets and dynamics ────────────────────────────────────
    zero_crossing_rate: float
    zero_crossing_std: float
    onset_strength_mean: float
    onset_strength_std: float
    rms_dynamic_range: float  # max minus min of frame-wise RMS
    beat_count: int
    mel_flatness: float  # mean of the per-frame std of the dB mel spectrogram
@dataclass
class AudioMeta:
    """Lightweight description of an audio clip (no analysis features)."""

    duration_sec: float  # clip length in seconds
    sample_rate: int  # sample rate in Hz
    format: str  # container / extension label, e.g. "wav"
    channels: int  # number of audio channels
def extract_features(
    source: Union[Path, bytes, io.BytesIO],
    *,
    sr: Optional[int] = None,
) -> AudioFeatures:
    """
    Run the full analysis pipeline on one audio source.

    The audio is loaded through ``_load_audio``, the spectral, temporal and
    harmonic feature groups are computed, and each group is condensed into a
    composite score (0.0 = very human, 1.0 = very AI-like).

    Args:
        source: File path, raw bytes, or BytesIO of the audio.
        sr: Sample rate to analyze at; a falsy value selects _TARGET_SR.

    Returns:
        AudioFeatures holding the three composite scores and the raw metrics.
    """
    y, actual_sr = _load_audio(source, sr or _TARGET_SR)
    n_samples = len(y)
    duration_sec = float(n_samples / actual_sr)
    logger.info(
        f"Feature extraction: {duration_sec:.1f}s audio @ {actual_sr}Hz "
        f"({n_samples} samples)"
    )

    # Raw metric dictionaries, one per feature family.
    spectral = _extract_spectral(y, actual_sr)
    temporal = _extract_temporal(y, actual_sr)
    harmonic = _extract_harmonic(y, actual_sr)

    return AudioFeatures(
        # Composite scores
        spectral_regularity=_score_spectral_regularity(spectral),
        temporal_patterns=_score_temporal_patterns(temporal),
        harmonic_structure=_score_harmonic_structure(harmonic),
        # Clip basics
        duration_sec=duration_sec,
        sample_rate=actual_sr,
        # Spectral family
        rms_energy=spectral["rms_mean"],
        rms_std=spectral["rms_std"],
        spectral_centroid_mean=spectral["centroid_mean"],
        spectral_centroid_std=spectral["centroid_std"],
        spectral_flatness_mean=spectral["flatness_mean"],
        spectral_flatness_std=spectral["flatness_std"],
        spectral_bandwidth_mean=spectral["bandwidth_mean"],
        spectral_bandwidth_std=spectral["bandwidth_std"],
        spectral_rolloff_mean=spectral["rolloff_mean"],
        spectral_rolloff_std=spectral["rolloff_std"],
        spectral_contrast_mean=spectral["contrast_mean"],
        spectral_contrast_std=spectral["contrast_std"],
        mfcc_variance=spectral["mfcc_variance"],
        mfcc_delta_var=spectral["mfcc_delta_var"],
        mfcc_delta2_var=spectral["mfcc_delta2_var"],
        mel_flatness=spectral["mel_flatness"],
        # Temporal family
        tempo_bpm=temporal["tempo_bpm"],
        tempo_stability=temporal["tempo_stability"],
        tempo_cv=temporal["tempo_cv"],
        zero_crossing_rate=temporal["zcr_mean"],
        zero_crossing_std=temporal["zcr_std"],
        onset_strength_mean=temporal["onset_mean"],
        onset_strength_std=temporal["onset_std"],
        rms_dynamic_range=temporal["rms_dynamic_range"],
        beat_count=temporal["beat_count"],
        # Harmonic family
        chroma_entropy=harmonic["chroma_entropy"],
        chroma_std=harmonic["chroma_std"],
        chroma_transition_rate=harmonic["chroma_transition_rate"],
        harmonic_ratio=harmonic["harmonic_ratio"],
        tonnetz_std=harmonic["tonnetz_std"],
    )
def extract_meta(source: Union[Path, bytes, io.BytesIO]) -> AudioMeta:
    """
    Extract basic metadata without running the feature analysis.

    The audio is still decoded through ``_load_audio``, so the reported
    duration is capped at _DURATION_LIMIT seconds and the reported sample
    rate is the analysis rate (_TARGET_SR), not the file's native rate.

    Args:
        source: File path, raw bytes, or BytesIO of the audio.

    Returns:
        AudioMeta. ``format`` is the path suffix for Path sources and "wav"
        otherwise. ``channels`` is the original channel count when soundfile
        can read the source, and 1 when it cannot.

    Raises:
        Whatever ``_load_audio`` raises (e.g. ValueError for short or silent
        audio).
    """
    y, sr = _load_audio(source, _TARGET_SR)
    fmt = "wav"
    channels = 1

    # librosa always returns mono, so the original channel count has to be
    # probed separately. In-memory sources are probed through a fresh buffer
    # so the caller's stream position is irrelevant and left untouched.
    probe_target = None
    if isinstance(source, Path):
        fmt = source.suffix.lstrip(".") or "wav"
        probe_target = str(source)
    elif isinstance(source, bytes):
        probe_target = io.BytesIO(source)
    elif isinstance(source, io.BytesIO):
        probe_target = io.BytesIO(source.getvalue())

    if probe_target is not None:
        try:
            import soundfile as sf

            channels = sf.info(probe_target).channels
        except Exception as probe_err:
            # Best effort: formats soundfile cannot read keep the mono default,
            # but the reason is recorded instead of being silently dropped.
            logger.debug(f"soundfile could not probe channel count: {probe_err}")

    return AudioMeta(
        duration_sec=float(len(y) / sr),
        sample_rate=sr,
        format=fmt,
        channels=channels,
    )
# ═══════════════════════════════════════════════════════════════════════
# PRIVATE — Audio loading
# ═══════════════════════════════════════════════════════════════════════
def _ffmpeg_decode(data: bytes, sample_rate: Optional[int] = None) -> io.BytesIO:
    """
    Decode any audio format (webm, opus, ogg, etc.) to mono WAV via ffmpeg.

    The encoded bytes are piped to ffmpeg on stdin; the WAV is written to a
    temporary file (removed before returning) and handed back in memory.
    Only the first _DURATION_LIMIT seconds are decoded, since nothing longer
    is ever analyzed and long inputs could otherwise exceed the timeout.

    Args:
        data: Encoded audio bytes.
        sample_rate: Output sample rate in Hz (default: _TARGET_SR).

    Returns:
        BytesIO positioned at the start of the decoded WAV data.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status.
        FileNotFoundError: the ``ffmpeg`` executable is not available.
        subprocess.TimeoutExpired: decoding took longer than 30 seconds.
    """
    rate = _TARGET_SR if sample_rate is None else sample_rate

    # Reserve a path only; ffmpeg (re)creates the file itself because of -y.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = Path(tmp.name)
    try:
        result = subprocess.run(
            [
                "ffmpeg",
                # Keep stderr limited to real errors so the message below is useful.
                "-hide_banner", "-loglevel", "error",
                "-y",
                "-i", "pipe:0",
                "-t", str(_DURATION_LIMIT),
                "-ar", str(rate),
                "-ac", "1",
                "-f", "wav",
                str(tmp_path),
            ],
            input=data,
            capture_output=True,
            timeout=30,
        )
        if result.returncode != 0:
            # errors="replace": stderr is not guaranteed to be valid UTF-8 and a
            # decode failure here would mask the actual ffmpeg error.
            detail = result.stderr.decode(errors="replace").strip()
            raise RuntimeError(f"ffmpeg failed: {detail[-200:]}")
        return io.BytesIO(tmp_path.read_bytes())
    finally:
        tmp_path.unlink(missing_ok=True)
def _load_audio(
    source: Union[Path, bytes, io.BytesIO],
    target_sr: int,
) -> tuple[np.ndarray, int]:
    """
    Load audio from any source as mono, resampled and duration-limited.

    In-memory sources are first tried with librosa directly; if that fails,
    the same bytes are decoded to WAV through ffmpeg and loaded again.
    A caller-supplied BytesIO is never consumed: its full contents are
    copied regardless of the current stream position.

    Args:
        source: File path, raw bytes, or BytesIO of the audio.
        target_sr: Sample rate to resample to.

    Returns:
        ``(samples, sample_rate)`` as returned by ``librosa.load``.

    Raises:
        ValueError: The audio is shorter than one second or silent.
        Exception: Whatever ``librosa.load`` raises for a path source, or
            whatever the ffmpeg fallback raises for an in-memory source.
    """
    # Keep the encoded bytes around for the potential ffmpeg fallback.
    raw_bytes: Optional[bytes] = None
    if isinstance(source, bytes):
        raw_bytes = source
    elif isinstance(source, io.BytesIO):
        # getvalue() ignores and preserves the caller's stream position, so
        # the same BytesIO can be passed to several functions in a row.
        raw_bytes = source.getvalue()
    if raw_bytes is not None:
        source = io.BytesIO(raw_bytes)

    try:
        y, sr = librosa.load(
            source if isinstance(source, (str, Path, io.BytesIO)) else str(source),
            sr=target_sr,
            mono=True,
            duration=_DURATION_LIMIT,
        )
    except Exception as load_err:
        # soundfile can't read webm/opus/ogg — try ffmpeg decode to WAV
        if raw_bytes is None:
            raise
        logger.warning(
            f"librosa could not decode in-memory audio ({load_err}); "
            f"retrying via ffmpeg"
        )
        wav_buf = _ffmpeg_decode(raw_bytes)
        y, sr = librosa.load(
            wav_buf, sr=target_sr, mono=True, duration=_DURATION_LIMIT
        )

    # Guard against silent / corrupt files
    if len(y) < sr:
        raise ValueError("Audio too short for analysis (< 1 second)")
    if np.max(np.abs(y)) < 1e-6:
        raise ValueError("Audio is silent")
    return y, sr
# ═══════════════════════════════════════════════════════════════════════
# PRIVATE — Spectral features
# ═══════════════════════════════════════════════════════════════════════
def _extract_spectral(y: np.ndarray, sr: int) -> dict:
    """
    Compute frequency-domain statistics for a mono signal.

    Every librosa feature is computed with the module hop length and then
    reduced to scalar summaries.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        Dict of floats: mean/std of centroid, flatness, bandwidth, rolloff
        and contrast; averaged variance of the MFCCs and their first and
        second deltas; mean/std of RMS; and ``mel_flatness``.
    """
    hop = _HOP_LENGTH
    stats: dict = {}

    # "Brightness" of the sound.
    brightness = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop)[0]
    stats["centroid_mean"] = float(brightness.mean())
    stats["centroid_std"] = float(brightness.std())

    # Noise-like versus tonal.
    tonality = librosa.feature.spectral_flatness(y=y, hop_length=hop)[0]
    stats["flatness_mean"] = float(tonality.mean())
    stats["flatness_std"] = float(tonality.std())

    spread = librosa.feature.spectral_bandwidth(y=y, sr=sr, hop_length=hop)[0]
    stats["bandwidth_mean"] = float(spread.mean())
    stats["bandwidth_std"] = float(spread.std())

    # Frequency below which 85% of the energy lies.
    rolloff_hz = librosa.feature.spectral_rolloff(
        y=y, sr=sr, hop_length=hop, roll_percent=0.85
    )[0]
    stats["rolloff_mean"] = float(rolloff_hz.mean())
    stats["rolloff_std"] = float(rolloff_hz.std())

    # Valley-to-peak contrast, summarized over all bands and frames at once.
    band_contrast = librosa.feature.spectral_contrast(
        y=y, sr=sr, hop_length=hop, n_bands=6
    )
    stats["contrast_mean"] = float(band_contrast.mean())
    stats["contrast_std"] = float(band_contrast.std())

    # Timbre: MFCCs plus their first and second derivatives; the variance is
    # taken per coefficient (axis=1) and then averaged.
    timbre = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=_N_MFCC, hop_length=hop)
    timbre_d1 = librosa.feature.delta(timbre)
    timbre_d2 = librosa.feature.delta(timbre, order=2)
    stats["mfcc_variance"] = float(timbre.var(axis=1).mean())
    stats["mfcc_delta_var"] = float(timbre_d1.var(axis=1).mean())
    stats["mfcc_delta2_var"] = float(timbre_d2.var(axis=1).mean())

    loudness = librosa.feature.rms(y=y, hop_length=hop)[0]
    stats["rms_mean"] = float(loudness.mean())
    stats["rms_std"] = float(loudness.std())

    # dB mel spectrogram; std along axis 0 for each frame, averaged over frames.
    mel_power = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=_N_MELS, hop_length=hop
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    stats["mel_flatness"] = float(mel_db.std(axis=0).mean())

    return stats
# ═══════════════════════════════════════════════════════════════════════
# PRIVATE — Temporal features
# ═══════════════════════════════════════════════════════════════════════
def _extract_temporal(y: np.ndarray, sr: int) -> dict:
    """
    Compute rhythm and time-domain statistics for a mono signal.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        Dict with tempo (BPM), inter-beat-interval std and coefficient of
        variation, onset-strength mean/std, zero-crossing-rate mean/std,
        RMS dynamic range and the number of detected beats.
    """
    hop = _HOP_LENGTH

    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop)
    # atleast_1d: the tempo may come back as a scalar or as a 1-element array.
    tempo_bpm = float(np.atleast_1d(tempo)[0])

    # Timing regularity needs more than two beats; otherwise both stay 0.0.
    # NOTE(review): 0.0 is also what a perfectly steady beat would yield, so
    # beatless audio looks metronomic to the scorer — confirm this is intended.
    tempo_stability = 0.0
    tempo_cv = 0.0
    beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=hop)
    if len(beat_times) > 2:
        intervals = np.diff(beat_times)  # inter-beat intervals
        interval_std = np.std(intervals)
        interval_mean = np.mean(intervals)
        tempo_stability = float(interval_std)
        if interval_mean > 0:
            tempo_cv = float(interval_std / interval_mean)

    # Rhythmic energy envelope.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop)

    # Rough texture indicator.
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=hop)[0]

    # How much the volume varies across frames.
    frame_rms = librosa.feature.rms(y=y, hop_length=hop)[0]
    if len(frame_rms) > 0:
        rms_dynamic_range = float(frame_rms.max() - frame_rms.min())
    else:
        rms_dynamic_range = 0.0

    return {
        "tempo_bpm": tempo_bpm,
        "tempo_stability": tempo_stability,
        "tempo_cv": tempo_cv,
        "onset_std": float(onset_env.std()),
        "onset_mean": float(onset_env.mean()),
        "zcr_mean": float(zcr.mean()),
        "zcr_std": float(zcr.std()),
        "rms_dynamic_range": rms_dynamic_range,
        "beat_count": len(beat_frames),
    }
# ═══════════════════════════════════════════════════════════════════════
# PRIVATE — Harmonic features
# ═══════════════════════════════════════════════════════════════════════
def _extract_harmonic(y: np.ndarray, sr: int) -> dict:
    """
    Compute harmonic and tonal statistics for a mono signal.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        Dict with:
        - ``harmonic_ratio``: energy of the harmonic component divided by the
          energy of the full signal (0.5 when the signal has no energy).
        - ``chroma_entropy``: base-2 entropy of the time-averaged chroma.
        - ``chroma_std``: per-pitch-class std over time, averaged.
        - ``tonnetz_std``: per-dimension tonnetz std over time, averaged.
        - ``chroma_transition_rate``: mean absolute frame-to-frame change of
          the median-thresholded chroma (0.0 when there is a single frame).
    """
    # Only the harmonic component is needed, so skip reconstructing the
    # percussive one (effects.hpss would compute and discard it).
    y_harmonic = librosa.effects.harmonic(y)
    harmonic_energy = float(np.sum(y_harmonic ** 2))
    total_energy = float(np.sum(y ** 2))
    harmonic_ratio = harmonic_energy / total_energy if total_energy > 0 else 0.5

    # Chroma features — pitch class distribution
    chroma = librosa.feature.chroma_stft(
        y=y, sr=sr, hop_length=_HOP_LENGTH, n_chroma=12
    )

    # Chroma entropy — how spread the pitch classes are. The epsilons guard
    # against division by zero and log2(0).
    chroma_mean = np.mean(chroma, axis=1)
    chroma_mean = chroma_mean / (np.sum(chroma_mean) + 1e-10)
    chroma_entropy = float(-np.sum(chroma_mean * np.log2(chroma_mean + 1e-10)))

    # Chroma standard deviation — how stable pitch classes are over time
    chroma_std = float(np.mean(np.std(chroma, axis=1)))

    # Tonnetz — tonal centroid features (harmonic relationships)
    tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr)
    tonnetz_std = float(np.mean(np.std(tonnetz, axis=1)))

    # Chroma transitions — how often pitch classes cross the global median
    chroma_binary = (chroma > np.median(chroma)).astype(float)
    chroma_diff = np.diff(chroma_binary, axis=1)
    # A single chroma frame leaves nothing to diff; avoid the NaN (and the
    # RuntimeWarning) that np.mean would produce on an empty array.
    if chroma_diff.size:
        chroma_transition_rate = float(np.mean(np.abs(chroma_diff)))
    else:
        chroma_transition_rate = 0.0

    return {
        "harmonic_ratio": harmonic_ratio,
        "chroma_entropy": chroma_entropy,
        "chroma_std": chroma_std,
        "tonnetz_std": tonnetz_std,
        "chroma_transition_rate": chroma_transition_rate,
    }
# ═══════════════════════════════════════════════════════════════════════
# PRIVATE — Composite scoring
# ═══════════════════════════════════════════════════════════════════════
def _sigmoid(x: float, midpoint: float = 0.0, steepness: float = 1.0) -> float:
    """
    Sigmoid normalization to [0, 1].

    Args:
        x: Value to normalize.
        midpoint: Input value that maps to 0.5.
        steepness: Slope factor; larger values give a sharper transition.

    Returns:
        A builtin ``float`` strictly between 0 and 1. NaN input yields the
        neutral value 0.5.
    """
    z = steepness * (x - midpoint)
    # Python's min/max would silently turn NaN into the +20 clamp (a score of
    # ~1.0); treat "no information" as neutral instead.
    if np.isnan(z):
        return 0.5
    z = max(-20.0, min(20.0, z))  # clamp for numerical stability
    # float(): np.exp returns np.float64, but the annotation promises float.
    return float(1.0 / (1.0 + np.exp(-z)))
def _score_spectral_regularity(spectral: dict) -> float:
    """
    Rate how uniform (AI-like) the spectral content is.

    Four sigmoid-normalized cues are blended with fixed weights:
    - centroid std (weight 0.3): lower means steadier brightness
    - MFCC variance (weight 0.3): lower means more consistent timbre
    - mean spectral flatness (weight 0.2): higher means a more even spectrum
    - ``mel_flatness`` (weight 0.2): lower means less spread in the dB mel
      spectrogram

    Args:
        spectral: Metric dict as produced by ``_extract_spectral``.

    Returns:
        Score clamped to [0.0, 0.99] and rounded to three decimals.
    """
    # Inverted cues: a small raw value pushes the score towards 1.0.
    steady_brightness = 1.0 - _sigmoid(
        spectral["centroid_std"], midpoint=800, steepness=0.003
    )
    steady_timbre = 1.0 - _sigmoid(
        spectral["mfcc_variance"], midpoint=50, steepness=0.03
    )
    even_mel = 1.0 - _sigmoid(spectral["mel_flatness"], midpoint=10, steepness=0.1)

    # Direct cue: a large raw value pushes the score towards 1.0.
    noise_likeness = _sigmoid(spectral["flatness_mean"], midpoint=0.02, steepness=40)

    blended = (
        0.3 * steady_brightness
        + 0.3 * steady_timbre
        + 0.2 * noise_likeness
        + 0.2 * even_mel
    )
    capped = min(0.99, blended)
    return round(max(0.0, capped), 3)
def _score_temporal_patterns(temporal: dict) -> float:
    """
    Rate how metronomic (AI-like) the timing and dynamics are.

    Four sigmoid-normalized cues, all inverted so that a small raw value
    raises the score, are blended with fixed weights:
    - tempo coefficient of variation (weight 0.35)
    - onset-strength std (weight 0.25)
    - RMS dynamic range (weight 0.2)
    - zero-crossing-rate std (weight 0.2)

    Args:
        temporal: Metric dict as produced by ``_extract_temporal``.

    Returns:
        Score clamped to [0.0, 0.99] and rounded to three decimals.
    """
    # Reference points from the original calibration:
    # human musicians CV ~0.05-0.15, AI CV ~0.01-0.04.
    steady_tempo = 1.0 - _sigmoid(temporal["tempo_cv"], midpoint=0.06, steepness=30)
    # Flat onset envelope.
    flat_onsets = 1.0 - _sigmoid(temporal["onset_std"], midpoint=1.5, steepness=1.0)
    # Compressed loudness.
    compressed = 1.0 - _sigmoid(
        temporal["rms_dynamic_range"], midpoint=0.15, steepness=8
    )
    # Uniform texture.
    uniform_texture = 1.0 - _sigmoid(temporal["zcr_std"], midpoint=0.03, steepness=30)

    blended = (
        0.35 * steady_tempo
        + 0.25 * flat_onsets
        + 0.2 * compressed
        + 0.2 * uniform_texture
    )
    capped = min(0.99, blended)
    return round(max(0.0, capped), 3)
def _score_harmonic_structure(harmonic: dict) -> float:
    """
    Rate how predictable (AI-like) the harmonic content is.

    Four sigmoid-normalized cues are blended with fixed weights:
    - chroma entropy (weight 0.3): lower means fewer pitch classes in use
    - chroma transition rate (weight 0.25): lower means less harmonic movement
    - tonnetz std (weight 0.25): lower means simpler tonal relationships
    - harmonic ratio (weight 0.2): higher means a "cleaner" signal

    Args:
        harmonic: Metric dict as produced by ``_extract_harmonic``.

    Returns:
        Score clamped to [0.0, 0.99] and rounded to three decimals.
    """
    # The entropy of 12 pitch classes tops out at log2(12) ≈ 3.58, hence the
    # midpoint close to that ceiling.
    narrow_palette = 1.0 - _sigmoid(
        harmonic["chroma_entropy"], midpoint=3.2, steepness=3
    )
    static_harmony = 1.0 - _sigmoid(
        harmonic["chroma_transition_rate"], midpoint=0.15, steepness=8
    )
    simple_tonality = 1.0 - _sigmoid(
        harmonic["tonnetz_std"], midpoint=0.15, steepness=8
    )
    # The only non-inverted cue: a high ratio raises the score.
    too_clean = _sigmoid(harmonic["harmonic_ratio"], midpoint=0.6, steepness=5)

    blended = (
        0.3 * narrow_palette
        + 0.25 * static_harmony
        + 0.25 * simple_tonality
        + 0.2 * too_clean
    )
    capped = min(0.99, blended)
    return round(max(0.0, capped), 3)