| | """ |
| | Human Correlation Analysis |
| | |
| | Analyzes correlation between MSCI scores and human judgments. |
| | This addresses RQ3: "Does MSCI correlate with human judgments of multimodal coherence?" |
| | |
| | Key analyses: |
| | - Spearman rank correlation (for ordinal human ratings) |
| | - Pearson correlation (for continuous relationship) |
| | - Per-dimension correlations (text-image, text-audio, image-audio) |
| | - Agreement analysis |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import json |
| | from dataclasses import dataclass |
| | from pathlib import Path |
| | from typing import Any, Dict, List, Optional, Tuple |
| | import numpy as np |
| | from scipy import stats |
| |
|
| |
|
@dataclass
class CorrelationResult:
    """Container for the outcome of a single correlation analysis."""

    variable1: str        # name of the first variable (e.g. "MSCI")
    variable2: str        # name of the second variable (e.g. "Human")
    spearman_rho: float   # Spearman rank correlation coefficient
    spearman_p: float     # two-tailed p-value of the Spearman test
    pearson_r: float      # Pearson correlation coefficient
    pearson_p: float      # two-tailed p-value of the Pearson test
    n: int                # number of paired observations
    ci_lower: float       # lower bound of the 95% confidence interval
    ci_upper: float       # upper bound of the 95% confidence interval
    significant: bool     # whether the one-tailed positive test passed
    interpretation: str   # human-readable summary of the result

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict, folding the CI bounds into "ci_95"."""
        flat_fields = (
            "variable1",
            "variable2",
            "spearman_rho",
            "spearman_p",
            "pearson_r",
            "pearson_p",
            "n",
        )
        payload: Dict[str, Any] = {name: getattr(self, name) for name in flat_fields}
        payload["ci_95"] = [self.ci_lower, self.ci_upper]
        payload["significant"] = self.significant
        payload["interpretation"] = self.interpretation
        return payload
| |
|
| |
|
class HumanCorrelationAnalyzer:
    """
    Analyzes correlation between MSCI and human judgments.

    RQ3: "Does MSCI correlate with human judgments of multimodal coherence?"
    H0: ρ(MSCI, human) ≤ 0
    H1: ρ(MSCI, human) > 0
    """

    def __init__(self, alpha: float = 0.05):
        """
        Args:
            alpha: Significance level for the one-tailed test of H1 (ρ > 0).
        """
        self.alpha = alpha

    def compute_correlation(
        self,
        msci_scores: List[float],
        human_scores: List[float],
        var1_name: str = "MSCI",
        var2_name: str = "Human",
    ) -> CorrelationResult:
        """
        Compute correlation with confidence interval.

        Args:
            msci_scores: MSCI scores
            human_scores: Human coherence scores (normalized to 0-1)
            var1_name: Name for first variable
            var2_name: Name for second variable

        Returns:
            CorrelationResult with all statistics

        Raises:
            ValueError: If the two score lists differ in length.
        """
        if len(msci_scores) != len(human_scores):
            raise ValueError("Score lists must have same length")

        n = len(msci_scores)
        if n < 3:
            # Too few points for a meaningful correlation (and the Fisher CI
            # below needs n > 3); return a neutral non-significant result
            # rather than raising so callers can aggregate freely.
            return CorrelationResult(
                variable1=var1_name,
                variable2=var2_name,
                spearman_rho=0.0,
                spearman_p=1.0,
                pearson_r=0.0,
                pearson_p=1.0,
                n=n,
                ci_lower=-1.0,
                ci_upper=1.0,
                significant=False,
                interpretation="Insufficient data (N < 3)",
            )

        # Tuple unpacking works on every scipy version (the result objects are
        # namedtuple-like); attribute names changed across releases
        # (.correlation vs .statistic), so avoid relying on them.
        spearman_rho, spearman_p = stats.spearmanr(msci_scores, human_scores)
        pearson_r, pearson_p = stats.pearsonr(msci_scores, human_scores)

        # 95% CI via the Fisher z-transform. Clip rho away from ±1 so arctanh
        # stays finite (a perfect correlation would otherwise emit a
        # divide-by-zero warning and an infinite z).
        z = np.arctanh(np.clip(spearman_rho, -1 + 1e-12, 1 - 1e-12))
        se_z = 1 / np.sqrt(n - 3)
        z_crit = stats.norm.ppf(1 - self.alpha / 2)
        ci_lower = np.tanh(z - z_crit * se_z)
        ci_upper = np.tanh(z + z_crit * se_z)

        # One-tailed test of H1: rho > 0. Halving the two-sided p-value is
        # only valid when the observed effect is in the hypothesized
        # direction, hence the rho > 0 guard. A NaN p-value (e.g. from
        # constant inputs) fails the comparison and yields non-significant.
        p_one_tailed = spearman_p / 2
        significant = p_one_tailed < self.alpha and spearman_rho > 0

        interpretation = self._interpret_correlation(
            spearman_rho, p_one_tailed, significant
        )

        return CorrelationResult(
            variable1=var1_name,
            variable2=var2_name,
            spearman_rho=float(spearman_rho),
            spearman_p=float(spearman_p),
            pearson_r=float(pearson_r),
            pearson_p=float(pearson_p),
            n=n,
            ci_lower=float(ci_lower),
            ci_upper=float(ci_upper),
            significant=significant,
            interpretation=interpretation,
        )

    def _interpret_correlation(
        self,
        rho: float,
        p_one_tailed: float,
        significant: bool,
    ) -> str:
        """Generate a human-readable interpretation of a correlation result.

        Args:
            rho: Spearman rank correlation coefficient.
            p_one_tailed: One-tailed p-value for H1: rho > 0.
            significant: Whether the one-tailed positive test passed.
        """
        if not significant:
            # A small one-tailed p with a non-significant verdict means the
            # effect went the wrong way (rho < 0). Requiring rho < 0 here
            # (instead of re-testing p >= alpha) also routes NaN statistics
            # from degenerate inputs to the generic message below, rather
            # than falsely reporting a "significant negative correlation".
            if p_one_tailed < self.alpha and rho < 0:
                return f"Significant negative correlation (unexpected; ρ={rho:.3f})"
            return f"No significant positive correlation (ρ={rho:.3f}, p={p_one_tailed:.4f})"

        # Conventional effect-size bands for |rho|.
        abs_rho = abs(rho)
        if abs_rho >= 0.7:
            strength = "strong"
        elif abs_rho >= 0.5:
            strength = "moderate-strong"
        elif abs_rho >= 0.3:
            strength = "moderate"
        else:
            strength = "weak"

        return f"Significant {strength} positive correlation (ρ={rho:.3f}, p={p_one_tailed:.4f})"

    def analyze_from_human_eval(
        self,
        human_eval_path: Path,
        msci_scores: Optional[Dict[str, float]] = None,
    ) -> Dict[str, Any]:
        """
        Analyze correlation from human evaluation session.

        Args:
            human_eval_path: Path to human evaluation session JSON
            msci_scores: Optional dict of sample_id -> MSCI score

        Returns:
            Comprehensive correlation analysis, or an {"error": ...} dict
            when fewer than 3 paired observations are available.
        """
        # Imported lazily so this module stays importable without the
        # evaluation package on the path.
        from src.evaluation.human_eval_schema import EvaluationSession

        session = EvaluationSession.load(Path(human_eval_path))

        # Fall back to the MSCI scores stored on the samples themselves.
        if msci_scores is None:
            msci_scores = {}
            for sample in session.samples:
                if sample.msci_score is not None:
                    msci_scores[sample.sample_id] = sample.msci_score

        # Build (MSCI, human) pairs. Re-ratings are excluded (they are
        # consistency checks, not independent judgments), as are evaluations
        # for samples without an MSCI score.
        pairs: List[Dict[str, Any]] = []
        # NOTE: renamed from 'eval' — that shadowed the builtin.
        for evaluation in session.evaluations:
            if evaluation.is_rerating:
                continue
            if evaluation.sample_id not in msci_scores:
                continue

            # Ratings are divided by 5.0 to land on MSCI's 0-1 scale —
            # presumably they come in on a 1-5 Likert scale; confirm
            # against the human_eval_schema module.
            pairs.append({
                "sample_id": evaluation.sample_id,
                "msci": msci_scores[evaluation.sample_id],
                "human_weighted": evaluation.weighted_score(),
                "human_overall": evaluation.overall_coherence / 5.0,
                "human_ti": evaluation.text_image_coherence / 5.0,
                "human_ta": evaluation.text_audio_coherence / 5.0,
                "human_ia": evaluation.image_audio_coherence / 5.0,
            })

        if len(pairs) < 3:
            return {
                "error": "Insufficient paired data",
                "n_pairs": len(pairs),
            }

        # Column-wise views of the paired data.
        msci = [p["msci"] for p in pairs]
        human_weighted = [p["human_weighted"] for p in pairs]
        human_overall = [p["human_overall"] for p in pairs]
        human_ti = [p["human_ti"] for p in pairs]
        human_ta = [p["human_ta"] for p in pairs]
        human_ia = [p["human_ia"] for p in pairs]

        results = {
            "n_pairs": len(pairs),
            "overall_correlation": self.compute_correlation(
                msci, human_weighted, "MSCI", "Human Weighted Score"
            ).to_dict(),
            "overall_rating_correlation": self.compute_correlation(
                msci, human_overall, "MSCI", "Human Overall Rating"
            ).to_dict(),
            "per_dimension": {
                "text_image": self.compute_correlation(
                    msci, human_ti, "MSCI", "Human Text-Image"
                ).to_dict(),
                "text_audio": self.compute_correlation(
                    msci, human_ta, "MSCI", "Human Text-Audio"
                ).to_dict(),
                "image_audio": self.compute_correlation(
                    msci, human_ia, "MSCI", "Human Image-Audio"
                ).to_dict(),
            },
        }

        # The RQ3 verdict is driven by the weighted-score correlation.
        main_corr = results["overall_correlation"]
        results["rq3_verdict"] = self._rq3_verdict(main_corr)

        return results

    def _rq3_verdict(self, correlation: Dict[str, Any]) -> Dict[str, Any]:
        """Generate the RQ3 verdict from a serialized correlation result.

        Args:
            correlation: Dict with "spearman_rho", "spearman_p" (two-tailed),
                and "significant" keys (as produced by CorrelationResult.to_dict).

        Returns:
            Dict with the verdict label, an explanation, and key statistics.
        """
        rho = correlation["spearman_rho"]
        p = correlation["spearman_p"]  # two-tailed; halved below for the one-tailed report
        significant = correlation["significant"]

        if significant and rho > 0.3:
            verdict = "SUPPORTED"
            explanation = (
                f"MSCI shows significant positive correlation with human judgments "
                f"(ρ={rho:.3f}, p={p/2:.4f}). MSCI is a valid proxy for human-perceived coherence."
            )
        elif significant and rho > 0:
            verdict = "WEAKLY SUPPORTED"
            explanation = (
                f"MSCI shows significant but weak correlation with human judgments "
                f"(ρ={rho:.3f}). MSCI captures some aspects of human-perceived coherence."
            )
        elif rho >= 0:
            # Fix: rho == 0 previously fell through to the "negative
            # correlation" branch; zero correlation is absence of evidence,
            # not a contradiction.
            verdict = "NOT SUPPORTED"
            explanation = (
                f"No significant correlation between MSCI and human judgments "
                f"(ρ={rho:.3f}, p={p/2:.4f}). MSCI may not reliably reflect human perception."
            )
        else:
            verdict = "CONTRADICTED"
            explanation = (
                f"Unexpected negative correlation (ρ={rho:.3f}). "
                f"MSCI may be inversely related to human perception."
            )

        return {
            "verdict": verdict,
            "explanation": explanation,
            "threshold_met": significant and rho > 0.3,
            "rho": rho,
            "p_value": p / 2,
        }

    def analyze_disagreements(
        self,
        pairs: List[Dict[str, Any]],
        threshold: float = 0.2,
    ) -> Dict[str, Any]:
        """
        Analyze cases where MSCI and human judgments disagree.

        Args:
            pairs: List of dicts with 'msci' and 'human_weighted' keys
            threshold: Disagreement threshold (normalized)

        Returns:
            Analysis of disagreement patterns
        """
        disagreements = []

        for pair in pairs:
            msci = pair.get("msci", 0)
            human = pair.get("human_weighted", 0)
            diff = msci - human

            # Positive diff: MSCI rates the sample higher than humans did.
            if abs(diff) > threshold:
                disagreements.append({
                    "sample_id": pair.get("sample_id"),
                    "msci": msci,
                    "human": human,
                    "difference": diff,
                    "type": "MSCI_overestimates" if diff > 0 else "MSCI_underestimates",
                })

        n_total = len(pairs)
        n_disagree = len(disagreements)

        overestimates = [d for d in disagreements if d["type"] == "MSCI_overestimates"]
        underestimates = [d for d in disagreements if d["type"] == "MSCI_underestimates"]

        # float(...) casts keep np.float64 out of the report so it stays a
        # plain-Python, JSON-friendly structure.
        return {
            "n_total": n_total,
            "n_disagreements": n_disagree,
            "disagreement_rate": n_disagree / n_total if n_total > 0 else 0,
            "n_overestimates": len(overestimates),
            "n_underestimates": len(underestimates),
            "mean_overestimate": (
                float(np.mean([d["difference"] for d in overestimates]))
                if overestimates else 0
            ),
            "mean_underestimate": (
                float(np.mean([abs(d["difference"]) for d in underestimates]))
                if underestimates else 0
            ),
            "samples": disagreements,
        }

    def generate_report(
        self,
        analysis_results: Dict[str, Any],
        output_path: Optional[Path] = None,
    ) -> Dict[str, Any]:
        """
        Generate comprehensive human correlation report.

        Args:
            analysis_results: Results from analyze_from_human_eval
            output_path: Optional path to save the report as JSON

        Returns:
            Complete correlation report
        """
        report = {
            "analysis_type": "MSCI-Human Correlation Analysis",
            "research_question": "RQ3: Does MSCI correlate with human judgments?",
            "hypothesis": {
                "H0": "ρ(MSCI, human) ≤ 0",
                "H1": "ρ(MSCI, human) > 0",
                "threshold": "ρ > 0.3 for meaningful validity",
            },
            "results": analysis_results,
        }

        # Recommendations follow the verdict tier; a missing or unknown
        # verdict gets the most conservative set.
        verdict = analysis_results.get("rq3_verdict", {})
        if verdict.get("verdict") == "SUPPORTED":
            report["recommendations"] = [
                "MSCI can be used as a proxy for human coherence judgments",
                "Consider using MSCI for automated evaluation at scale",
            ]
        elif verdict.get("verdict") == "WEAKLY SUPPORTED":
            report["recommendations"] = [
                "MSCI provides some signal but should not be sole metric",
                "Consider combining MSCI with other metrics or human spot-checks",
                "Investigate which dimensions MSCI captures well vs poorly",
            ]
        else:
            report["recommendations"] = [
                "MSCI may not reliably reflect human perception",
                "Consider revising MSCI weights or embedding approach",
                "Human evaluation remains necessary for validation",
                "Investigate failure modes to improve MSCI",
            ]

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with output_path.open("w", encoding="utf-8") as f:
                json.dump(report, f, indent=2, ensure_ascii=False)

        return report
| |
|