# src/experiments/statistical_analysis.py
"""
Statistical Analysis Module
Provides rigorous statistical tools for hypothesis testing:
- Paired t-tests for within-subject designs
- Effect sizes (Cohen's d)
- Confidence intervals
- Multiple comparison correction
- Power analysis
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any
import numpy as np
from scipy import stats
@dataclass
class StatisticalResult:
"""
Result of a statistical test.
Includes all information needed for scientific reporting:
- Test statistic and p-value
- Effect size with interpretation
- Confidence interval
- Sample statistics
"""
test_name: str
statistic: float
p_value: float
effect_size: float
effect_size_name: str
ci_lower: float
ci_upper: float
confidence_level: float
n: int
mean_diff: float
std_diff: float
significant: bool
interpretation: str
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"test_name": self.test_name,
"statistic": self.statistic,
"p_value": self.p_value,
"effect_size": self.effect_size,
"effect_size_name": self.effect_size_name,
"ci_lower": self.ci_lower,
"ci_upper": self.ci_upper,
"confidence_level": self.confidence_level,
"n": self.n,
"mean_diff": self.mean_diff,
"std_diff": self.std_diff,
"significant": self.significant,
"interpretation": self.interpretation,
}
    def __str__(self) -> str:
        """Human-readable summary (df shown as n-1, which is exact for paired tests)."""
        sig_marker = "*" if self.significant else ""
        ci_pct = int(round(self.confidence_level * 100))
        return (
            f"{self.test_name}: t({self.n-1})={self.statistic:.3f}, "
            f"p={self.p_value:.4f}{sig_marker}, "
            f"{self.effect_size_name}={self.effect_size:.3f}, "
            f"{ci_pct}% CI [{self.ci_lower:.4f}, {self.ci_upper:.4f}]"
        )
def paired_ttest(
condition1: List[float],
condition2: List[float],
alpha: float = 0.05,
alternative: str = "two-sided",
) -> StatisticalResult:
"""
Perform paired samples t-test.
For testing H0: μ1 = μ2 vs H1: μ1 ≠ μ2 (or one-sided alternatives)
Args:
condition1: Scores from first condition
condition2: Scores from second condition (same subjects)
alpha: Significance level
alternative: "two-sided", "greater", or "less"
Returns:
StatisticalResult with all test statistics
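    Example (a minimal sketch with illustrative scores; the asterisk in
    the printed summary marks significance at the chosen alpha):
        res = paired_ttest([0.82, 0.79, 0.88, 0.91], [0.75, 0.72, 0.81, 0.86])
        print(res)              # one-line summary via __str__
        record = res.to_dict()  # JSON-serializable dict for reporting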
"""
if len(condition1) != len(condition2):
raise ValueError("Conditions must have same length for paired test")
n = len(condition1)
if n < 2:
raise ValueError("Need at least 2 observations")
c1 = np.array(condition1)
c2 = np.array(condition2)
differences = c1 - c2
# Perform t-test
result = stats.ttest_rel(c1, c2, alternative=alternative)
# Effect size (Cohen's d for paired samples)
d = compute_effect_size(condition1, condition2, paired=True)
    # Two-sided confidence interval for the mean difference (reported
    # as-is even when a one-sided alternative is requested)
mean_diff = np.mean(differences)
std_diff = np.std(differences, ddof=1)
se = std_diff / np.sqrt(n)
t_crit = stats.t.ppf(1 - alpha / 2, df=n - 1)
ci_lower = mean_diff - t_crit * se
ci_upper = mean_diff + t_crit * se
# Interpretation
significant = result.pvalue < alpha
interpretation = _interpret_effect_size(d)
return StatisticalResult(
test_name="Paired t-test",
statistic=float(result.statistic),
p_value=float(result.pvalue),
effect_size=float(d),
effect_size_name="Cohen's d",
ci_lower=float(ci_lower),
ci_upper=float(ci_upper),
confidence_level=1 - alpha,
n=n,
mean_diff=float(mean_diff),
std_diff=float(std_diff),
significant=significant,
interpretation=interpretation,
)
def independent_ttest(
group1: List[float],
group2: List[float],
alpha: float = 0.05,
equal_var: bool = False,
alternative: str = "two-sided",
) -> StatisticalResult:
"""
Perform independent samples t-test.
Args:
group1: Scores from first group
group2: Scores from second group
alpha: Significance level
equal_var: Assume equal variances (use Welch's t-test if False)
alternative: "two-sided", "greater", or "less"
Returns:
StatisticalResult with all test statistics
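    Example (illustrative scores; Welch's test is the default, so unequal
    group sizes and variances are fine):
        res = independent_ttest([0.71, 0.78, 0.74, 0.80], [0.65, 0.69, 0.63])
        print(res.p_value, res.effect_size)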
"""
    g1 = np.array(group1)
    g2 = np.array(group2)
    if len(g1) < 2 or len(g2) < 2:
        raise ValueError("Need at least 2 observations per group")
# Perform t-test
result = stats.ttest_ind(g1, g2, equal_var=equal_var, alternative=alternative)
# Effect size (Cohen's d)
d = compute_effect_size(group1, group2, paired=False)
# Confidence interval for difference in means
mean_diff = np.mean(g1) - np.mean(g2)
n1, n2 = len(g1), len(g2)
if equal_var:
# Pooled standard error
pooled_var = ((n1 - 1) * np.var(g1, ddof=1) + (n2 - 1) * np.var(g2, ddof=1)) / (n1 + n2 - 2)
se = np.sqrt(pooled_var * (1/n1 + 1/n2))
df = n1 + n2 - 2
else:
# Welch-Satterthwaite approximation
var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
se = np.sqrt(var1/n1 + var2/n2)
df = (var1/n1 + var2/n2)**2 / ((var1/n1)**2/(n1-1) + (var2/n2)**2/(n2-1))
t_crit = stats.t.ppf(1 - alpha / 2, df=df)
ci_lower = mean_diff - t_crit * se
ci_upper = mean_diff + t_crit * se
significant = result.pvalue < alpha
interpretation = _interpret_effect_size(d)
return StatisticalResult(
test_name="Independent t-test" + ("" if equal_var else " (Welch's)"),
statistic=float(result.statistic),
p_value=float(result.pvalue),
effect_size=float(d),
effect_size_name="Cohen's d",
ci_lower=float(ci_lower),
ci_upper=float(ci_upper),
confidence_level=1 - alpha,
n=n1 + n2,
mean_diff=float(mean_diff),
std_diff=float(se * np.sqrt(n1 + n2)), # Approximate pooled SD
significant=significant,
interpretation=interpretation,
)
def compute_effect_size(
group1: List[float],
group2: List[float],
paired: bool = True,
) -> float:
"""
Compute Cohen's d effect size.
For paired data: d = mean(diff) / std(diff)
For independent: d = (mean1 - mean2) / pooled_std
Args:
group1: First group/condition scores
group2: Second group/condition scores
paired: Whether data is paired
Returns:
Cohen's d effect size
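    Example (hand-checkable paired case: the differences are [1, 2, 3],
    with mean 2 and SD 1, so d comes out exactly 2.0):
        d = compute_effect_size([2, 4, 6], [1, 2, 3], paired=True)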
"""
g1 = np.array(group1)
g2 = np.array(group2)
if paired:
differences = g1 - g2
d = np.mean(differences) / np.std(differences, ddof=1)
else:
n1, n2 = len(g1), len(g2)
var1 = np.var(g1, ddof=1)
var2 = np.var(g2, ddof=1)
pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
d = (np.mean(g1) - np.mean(g2)) / pooled_std
return float(d)
def _interpret_effect_size(d: float) -> str:
"""Interpret Cohen's d according to conventional thresholds."""
abs_d = abs(d)
if abs_d < 0.2:
size = "negligible"
elif abs_d < 0.5:
size = "small"
elif abs_d < 0.8:
size = "medium"
else:
size = "large"
direction = "positive" if d > 0 else "negative" if d < 0 else "no"
return f"{size} {direction} effect"
def compute_confidence_interval(
data: List[float],
confidence: float = 0.95,
) -> Tuple[float, float, float]:
"""
Compute confidence interval for mean.
Args:
data: Sample data
confidence: Confidence level (default 0.95 for 95% CI)
Returns:
Tuple of (mean, ci_lower, ci_upper)
"""
arr = np.array(data)
n = len(arr)
mean = np.mean(arr)
se = stats.sem(arr)
alpha = 1 - confidence
t_crit = stats.t.ppf(1 - alpha / 2, df=n - 1)
ci_lower = mean - t_crit * se
ci_upper = mean + t_crit * se
return float(mean), float(ci_lower), float(ci_upper)
def bonferroni_correction(
p_values: List[float],
alpha: float = 0.05,
) -> Tuple[float, List[bool]]:
"""
Apply Bonferroni correction for multiple comparisons.
Args:
p_values: List of p-values from multiple tests
alpha: Family-wise error rate
Returns:
Tuple of (corrected_alpha, list of significant results)
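    Example (three illustrative p-values at the default alpha=0.05):
        corrected_alpha, sig = bonferroni_correction([0.01, 0.04, 0.03])
        # corrected_alpha == 0.05 / 3; sig == [True, False, False]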
"""
n_tests = len(p_values)
corrected_alpha = alpha / n_tests
significant = [p < corrected_alpha for p in p_values]
return corrected_alpha, significant
def holm_bonferroni_correction(
p_values: List[float],
alpha: float = 0.05,
) -> Tuple[List[float], List[bool]]:
"""
Apply Holm-Bonferroni (step-down) correction.
More powerful than standard Bonferroni while controlling FWER.
Args:
p_values: List of p-values from multiple tests
alpha: Family-wise error rate
Returns:
Tuple of (adjusted_p_values, list of significant results)
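    Example (worked by hand for three illustrative p-values at alpha=0.05;
    the sorted p-values are multiplied by (3, 2, 1), then a running maximum
    enforces monotonicity):
        adjusted, sig = holm_bonferroni_correction([0.01, 0.04, 0.03])
        # adjusted == [0.03, 0.06, 0.06]; sig == [True, False, False]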
"""
    n = len(p_values)
    order = np.argsort(p_values)
    sorted_p = np.array(p_values)[order]
    # Multiply the i-th smallest p-value by (n - i), then enforce
    # monotone non-decreasing adjusted p-values with a running maximum
    adjusted_sorted = sorted_p * (n - np.arange(n))
    adjusted_sorted = np.minimum(np.maximum.accumulate(adjusted_sorted), 1.0)
    # Map the adjusted values back to the original test order
    adjusted_p = np.empty(n)
    adjusted_p[order] = adjusted_sorted
    significant = [p < alpha for p in adjusted_p]
    return list(adjusted_p), significant
def power_analysis_paired_ttest(
effect_size: float,
alpha: float = 0.05,
power: float = 0.80,
) -> int:
"""
Compute required sample size for paired t-test.
Args:
effect_size: Expected Cohen's d
alpha: Significance level
power: Desired statistical power
Returns:
Required sample size (N)
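    Example (medium effect, default alpha and power):
        n = power_analysis_paired_ttest(0.5)
        # returns 32 under this normal approximation; exact t-based
        # calculators report a slightly larger n (about 34)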
"""
    if effect_size == 0:
        raise ValueError("effect_size must be non-zero")
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)
    # Normal approximation; slightly underestimates n relative to an
    # exact t-based calculation, especially for small samples
    n = ((z_alpha + z_beta) / effect_size) ** 2
    return int(np.ceil(n))
def bootstrap_ci(
data: List[float],
n_bootstrap: int = 10000,
confidence: float = 0.95,
statistic: str = "mean",
seed: int = 42,
) -> Dict[str, float]:
"""
Compute bootstrap confidence interval.
Non-parametric CI that makes no distributional assumptions.
Uses BCa (bias-corrected and accelerated) percentile method.
Args:
data: Sample data
n_bootstrap: Number of bootstrap resamples
confidence: Confidence level (default 0.95 for 95% CI)
statistic: "mean" or "median"
seed: Random seed for reproducibility
Returns:
Dictionary with point estimate, ci_lower, ci_upper, se
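    Example (illustrative sample; a median CI with no normality assumption):
        ci = bootstrap_ci([3.1, 2.8, 3.4, 2.9, 3.6, 3.0], statistic="median")
        print(ci["estimate"], ci["ci_lower"], ci["ci_upper"])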
"""
arr = np.array(data)
n = len(arr)
rng = np.random.default_rng(seed)
stat_fn = np.mean if statistic == "mean" else np.median
observed = float(stat_fn(arr))
# Generate bootstrap distribution
boot_stats = np.empty(n_bootstrap)
for i in range(n_bootstrap):
sample = rng.choice(arr, size=n, replace=True)
boot_stats[i] = stat_fn(sample)
    # BCa bias-correction factor; clip the proportion so z0 stays finite
    # when every bootstrap statistic falls on one side of the observed value
    prop_below = np.clip(np.mean(boot_stats < observed), 1e-6, 1 - 1e-6)
    z0 = stats.norm.ppf(prop_below)
# Acceleration factor (jackknife)
jackknife_stats = np.empty(n)
for i in range(n):
jack_sample = np.delete(arr, i)
jackknife_stats[i] = stat_fn(jack_sample)
jack_mean = np.mean(jackknife_stats)
num = np.sum((jack_mean - jackknife_stats) ** 3)
den = 6 * (np.sum((jack_mean - jackknife_stats) ** 2) ** 1.5)
a = num / den if den != 0 else 0.0
# Adjusted percentiles
alpha = 1 - confidence
z_lower = stats.norm.ppf(alpha / 2)
z_upper = stats.norm.ppf(1 - alpha / 2)
p_lower = stats.norm.cdf(z0 + (z0 + z_lower) / (1 - a * (z0 + z_lower)))
p_upper = stats.norm.cdf(z0 + (z0 + z_upper) / (1 - a * (z0 + z_upper)))
# Clamp to valid range
p_lower = np.clip(p_lower, 0.001, 0.999)
p_upper = np.clip(p_upper, 0.001, 0.999)
ci_lower = float(np.percentile(boot_stats, p_lower * 100))
ci_upper = float(np.percentile(boot_stats, p_upper * 100))
return {
"estimate": observed,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"se": float(np.std(boot_stats)),
"confidence": confidence,
"n_bootstrap": n_bootstrap,
"method": "BCa bootstrap",
}
def bootstrap_ci_diff(
group1: List[float],
group2: List[float],
n_bootstrap: int = 10000,
confidence: float = 0.95,
paired: bool = True,
seed: int = 42,
) -> Dict[str, float]:
"""
Bootstrap CI for the difference between two groups.
Args:
group1: First group scores
group2: Second group scores
n_bootstrap: Number of bootstrap resamples
confidence: Confidence level
paired: Whether data is paired (same subjects)
seed: Random seed
Returns:
Dictionary with mean difference, ci_lower, ci_upper
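    Example (illustrative paired scores; a CI that excludes 0 supports
    a reliable difference between conditions):
        out = bootstrap_ci_diff([0.82, 0.79, 0.88, 0.91],
                                [0.75, 0.72, 0.81, 0.86], paired=True)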
"""
g1 = np.array(group1)
g2 = np.array(group2)
rng = np.random.default_rng(seed)
if paired:
diffs = g1 - g2
n = len(diffs)
observed_diff = float(np.mean(diffs))
boot_diffs = np.empty(n_bootstrap)
for i in range(n_bootstrap):
sample = rng.choice(diffs, size=n, replace=True)
boot_diffs[i] = np.mean(sample)
else:
n1, n2 = len(g1), len(g2)
observed_diff = float(np.mean(g1) - np.mean(g2))
boot_diffs = np.empty(n_bootstrap)
for i in range(n_bootstrap):
s1 = rng.choice(g1, size=n1, replace=True)
s2 = rng.choice(g2, size=n2, replace=True)
boot_diffs[i] = np.mean(s1) - np.mean(s2)
alpha = 1 - confidence
ci_lower = float(np.percentile(boot_diffs, (alpha / 2) * 100))
ci_upper = float(np.percentile(boot_diffs, (1 - alpha / 2) * 100))
return {
"mean_diff": observed_diff,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"se": float(np.std(boot_diffs)),
"confidence": confidence,
"n_bootstrap": n_bootstrap,
}
def descriptive_stats(data: List[float]) -> Dict[str, float]:
"""
Compute descriptive statistics for a sample.
Args:
data: Sample data
Returns:
Dictionary with mean, std, median, min, max, N, bootstrap CI
"""
arr = np.array(data)
boot = bootstrap_ci(list(arr))
return {
"n": len(arr),
"mean": float(np.mean(arr)),
"std": float(np.std(arr, ddof=1)),
"median": float(np.median(arr)),
"min": float(np.min(arr)),
"max": float(np.max(arr)),
"se": float(stats.sem(arr)),
"ci_lower_95": boot["ci_lower"],
"ci_upper_95": boot["ci_upper"],
}
def shapiro_wilk_test(
data: List[float],
alpha: float = 0.05,
) -> Dict[str, Any]:
"""
Shapiro-Wilk test for normality.
Args:
data: Sample data (paired differences for paired tests)
alpha: Significance level
Returns:
Dictionary with W statistic, p-value, and whether normality holds
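    Example (typical workflow sketch: check the paired differences first,
    then fall back to wilcoxon_signed_rank if normality is rejected):
        diffs = [0.07, 0.07, 0.07, 0.05, 0.03]
        check = shapiro_wilk_test(diffs)
        use_parametric = check["normal"]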
"""
arr = np.array(data)
w, p = stats.shapiro(arr)
return {
"test_name": "Shapiro-Wilk",
"W": float(w),
"p_value": float(p),
"normal": bool(p > alpha),
"alpha": alpha,
}
def wilcoxon_signed_rank(
condition1: List[float],
condition2: List[float],
alpha: float = 0.05,
alternative: str = "two-sided",
) -> Dict[str, Any]:
"""
Wilcoxon signed-rank test (non-parametric alternative to paired t-test).
Args:
condition1: Scores from first condition
condition2: Scores from second condition (same subjects)
alpha: Significance level
alternative: "two-sided", "greater", or "less"
Returns:
Dictionary with test statistic, p-value, and rank-biserial correlation
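    Example (illustrative paired scores; no normality assumption required):
        out = wilcoxon_signed_rank([0.82, 0.79, 0.88, 0.91],
                                   [0.75, 0.72, 0.81, 0.86])
        print(out["p_value"], out["effect_size"])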
"""
c1 = np.array(condition1)
c2 = np.array(condition2)
diff = c1 - c2
result = stats.wilcoxon(diff, alternative=alternative)
n = len(diff)
    # Rank-biserial correlation as effect size. With a two-sided test scipy
    # returns the smaller signed-rank sum, so this r reflects magnitude;
    # read the direction from the sign of the mean difference.
    r_rb = 1 - (2 * float(result.statistic)) / (n * (n + 1) / 2)
return {
"test_name": "Wilcoxon signed-rank",
"statistic": float(result.statistic),
"p_value": float(result.pvalue),
"effect_size": float(r_rb),
"effect_size_name": "rank-biserial r",
"n": n,
"significant": bool(result.pvalue < alpha),
"alpha": alpha,
}
def cohens_d_ci(
d: float,
n: int,
alpha: float = 0.05,
) -> Dict[str, float]:
"""
    Approximate (1 - alpha) confidence interval for Cohen's d.
    Uses the normal-theory approximation SE(d) = sqrt(1/n + d^2 / (2*n)).
Args:
d: Cohen's d point estimate
n: Sample size (number of pairs for paired tests)
alpha: Significance level
Returns:
Dictionary with d, ci_lower, ci_upper
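    Example (d = 0.5 estimated from n = 20 pairs):
        out = cohens_d_ci(0.5, 20)
        # SE = sqrt(1/20 + 0.25/40) ~ 0.237, so the 95% CI is
        # roughly [0.04, 0.96]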
"""
se = np.sqrt(1 / n + d**2 / (2 * n))
z = stats.norm.ppf(1 - alpha / 2)
return {
"d": float(d),
"ci_lower": float(d - z * se),
"ci_upper": float(d + z * se),
"se": float(se),
}
def compare_all_pairs(
conditions: Dict[str, List[float]],
alpha: float = 0.05,
paired: bool = True,
correction: str = "holm",
) -> Dict[str, StatisticalResult]:
"""
Compare all pairs of conditions with multiple comparison correction.
Args:
conditions: Dictionary mapping condition names to scores
alpha: Family-wise error rate
paired: Whether data is paired
correction: "bonferroni" or "holm"
Returns:
Dictionary of comparison results
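    Example (three illustrative conditions, Holm-corrected by default):
        results = compare_all_pairs({
            "baseline": [0.70, 0.72, 0.68, 0.74],
            "ablation": [0.73, 0.75, 0.71, 0.76],
            "full": [0.80, 0.83, 0.79, 0.85],
        })
        for key, res in results.items():
            print(key, res)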
"""
condition_names = list(conditions.keys())
n_conditions = len(condition_names)
results = {}
p_values = []
comparison_keys = []
# Perform all pairwise comparisons
for i in range(n_conditions):
for j in range(i + 1, n_conditions):
name1, name2 = condition_names[i], condition_names[j]
key = f"{name1}_vs_{name2}"
if paired:
result = paired_ttest(
conditions[name1], conditions[name2], alpha=alpha
)
else:
result = independent_ttest(
conditions[name1], conditions[name2], alpha=alpha
)
results[key] = result
p_values.append(result.p_value)
comparison_keys.append(key)
# Apply correction
if correction == "bonferroni":
        _, significant = bonferroni_correction(p_values, alpha)
        adjusted_p = [p * len(p_values) for p in p_values]
else: # holm
adjusted_p, significant = holm_bonferroni_correction(p_values, alpha)
# Update results with corrected significance
for key, adj_p, sig in zip(comparison_keys, adjusted_p, significant):
result = results[key]
# Create new result with adjusted values
results[key] = StatisticalResult(
test_name=result.test_name + f" ({correction}-corrected)",
statistic=result.statistic,
p_value=min(adj_p, 1.0), # Original p-value replaced with adjusted
effect_size=result.effect_size,
effect_size_name=result.effect_size_name,
ci_lower=result.ci_lower,
ci_upper=result.ci_upper,
confidence_level=result.confidence_level,
n=result.n,
mean_diff=result.mean_diff,
std_diff=result.std_diff,
significant=sig,
interpretation=result.interpretation,
)
return results
def spearman_correlation(
x: List[float],
y: List[float],
alpha: float = 0.05,
) -> Dict[str, Any]:
"""
Compute Spearman rank correlation with confidence interval.
Args:
x: First variable
y: Second variable
alpha: Significance level
Returns:
Dictionary with correlation, p-value, CI, and interpretation
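    Example (illustrative variables whose ranks mostly agree; rho = 0.9):
        out = spearman_correlation([0.6, 0.7, 0.8, 0.9, 0.95],
                                   [3.0, 3.6, 3.4, 4.5, 4.8])
        print(out["rho"], out["interpretation"])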
"""
    result = stats.spearmanr(x, y)
    rho = float(result.correlation)
    p = float(result.pvalue)
    n = len(x)
    if n < 4:
        raise ValueError("Need at least 4 observations for a Fisher-z CI")
    # Fisher z-transformation for the CI (clip rho away from +/-1 so
    # arctanh stays finite when the ranks agree perfectly)
    z = np.arctanh(np.clip(rho, -0.999999, 0.999999))
    se_z = 1 / np.sqrt(n - 3)
z_crit = stats.norm.ppf(1 - alpha / 2)
ci_z_lower = z - z_crit * se_z
ci_z_upper = z + z_crit * se_z
ci_lower = np.tanh(ci_z_lower)
ci_upper = np.tanh(ci_z_upper)
# Interpretation
abs_rho = abs(rho)
if abs_rho < 0.1:
strength = "negligible"
elif abs_rho < 0.3:
strength = "weak"
elif abs_rho < 0.5:
strength = "moderate"
elif abs_rho < 0.7:
strength = "strong"
else:
strength = "very strong"
direction = "positive" if rho > 0 else "negative"
significant = p < alpha
return {
"rho": float(rho),
"p_value": float(p),
"n": n,
"ci_lower": float(ci_lower),
"ci_upper": float(ci_upper),
"confidence_level": 1 - alpha,
"significant": significant,
"interpretation": f"{'Significant' if significant else 'Non-significant'} {strength} {direction} correlation",
}
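

# Usage sketch (illustrative only): synthetic data exercising the main entry
# points of this module. Nothing here depends on the project's real
# experimental results; the values are generated purely for demonstration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    baseline = list(rng.normal(0.70, 0.05, size=20))
    treatment = [b + rng.normal(0.05, 0.03) for b in baseline]

    print(paired_ttest(treatment, baseline))
    print(wilcoxon_signed_rank(treatment, baseline))
    print(bootstrap_ci_diff(treatment, baseline, n_bootstrap=2000))
    print(descriptive_stats(treatment))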