""" Evaluation framework for AURIS models. Measures accuracy, precision, recall, F1, ROC-AUC against labeled data. Used for: 1. Baseline measurement of heuristic system 2. Validation of trained models 3. A/B comparison between model versions """ from __future__ import annotations import csv import sys from pathlib import Path from typing import Optional import numpy as np try: from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, ) except ImportError: print("ERROR: scikit-learn required. pip install scikit-learn") sys.exit(1) def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]: """ Load features CSV into X (features) and y (labels). Returns: X: (n_samples, n_features) array y: (n_samples,) array of 0/1 labels """ rows = [] labels = [] with open(path, "r", encoding="utf-8") as f: reader = csv.DictReader(f) _EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"} feature_cols = [ c for c in reader.fieldnames if c not in _EXCLUDE ] for row in reader: feat_values = [] for col in feature_cols: try: feat_values.append(float(row[col])) except (ValueError, KeyError): feat_values.append(0.0) rows.append(feat_values) labels.append(int(row["label_int"])) X = np.array(rows, dtype=np.float32) y = np.array(labels, dtype=np.int32) print(f"Loaded {len(y)} samples, {X.shape[1]} features") print(f" AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}") return X, y def evaluate_predictions( y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None, title: str = "Model", ) -> dict: """ Compute and print all evaluation metrics. Args: y_true: Ground truth labels (0/1). y_pred: Predicted labels (0/1). y_prob: Predicted probabilities for positive class. title: Title for the report. Returns: Dict of metric name -> value. """ acc = accuracy_score(y_true, y_pred) prec = precision_score(y_true, y_pred, zero_division=0) rec = recall_score(y_true, y_pred, zero_division=0) f1 = f1_score(y_true, y_pred, zero_division=0) metrics = { "accuracy": round(acc, 4), "precision": round(prec, 4), "recall": round(rec, 4), "f1_score": round(f1, 4), } if y_prob is not None: try: auc = roc_auc_score(y_true, y_prob) metrics["roc_auc"] = round(auc, 4) except ValueError: metrics["roc_auc"] = None cm = confusion_matrix(y_true, y_pred) # Print report print(f"\n{'=' * 50}") print(f" {title} - Evaluation Report") print(f"{'=' * 50}") print(f" Accuracy: {acc:.4f} ({acc:.1%})") print(f" Precision: {prec:.4f}") print(f" Recall: {rec:.4f}") print(f" F1 Score: {f1:.4f}") if "roc_auc" in metrics and metrics["roc_auc"] is not None: print(f" ROC-AUC: {metrics['roc_auc']:.4f}") print(f"\n Confusion Matrix:") print(f" Predicted") print(f" Actual Human AI") print(f" Human {cm[0][0]:>6} {cm[0][1]:>6}") print(f" AI {cm[1][0]:>6} {cm[1][1]:>6}") print(f"\n{classification_report(y_true, y_pred, target_names=['Human', 'AI'])}") return metrics def evaluate_heuristic_baseline(features_csv: str | Path) -> dict: """ Evaluate the current heuristic scoring system as baseline. The heuristic system uses the 'spectral_regularity', 'temporal_patterns', 'harmonic_structure' scores (which are sigmoid-transformed heuristics) to make a weighted average prediction. """ X, y = load_features_csv(features_csv) # Read feature column names with open(features_csv, "r", encoding="utf-8") as f: reader = csv.DictReader(f) feature_cols = [ c for c in reader.fieldnames if c not in ("file_path", "label_int") ] # Find indices of heuristic score columns sr_idx = feature_cols.index("spectral_regularity") tp_idx = feature_cols.index("temporal_patterns") hs_idx = feature_cols.index("harmonic_structure") # Current heuristic: weighted average heuristic_scores = ( X[:, sr_idx] * 0.35 + X[:, tp_idx] * 0.35 + X[:, hs_idx] * 0.30 ) # Also try with vocal score if available vai_idx = feature_cols.index("vocal_ai_score") has_v_idx = feature_cols.index("has_vocals") combined_scores = np.where( X[:, has_v_idx] > 0.5, heuristic_scores * 0.65 + X[:, vai_idx] * 0.35, heuristic_scores, ) y_pred_heuristic = (heuristic_scores > 0.5).astype(int) y_pred_combined = (combined_scores > 0.5).astype(int) print("\n" + "=" * 60) print(" BASELINE EVALUATION - Current Heuristic System") print("=" * 60) print("\n--- Heuristic Only (spectral + temporal + harmonic) ---") m1 = evaluate_predictions( y, y_pred_heuristic, heuristic_scores, title="Heuristic (no vocals)", ) print("\n--- Heuristic + Vocal Score ---") m2 = evaluate_predictions( y, y_pred_combined, combined_scores, title="Heuristic + Vocals", ) return {"heuristic_only": m1, "heuristic_vocals": m2} if __name__ == "__main__": csv_path = sys.argv[1] if len(sys.argv) > 1 else "data/sonics/features.csv" evaluate_heuristic_baseline(csv_path)