# src/experiments/statistical_analysis.py
"""
Statistical Analysis Module
Provides rigorous statistical tools for hypothesis testing:
- Paired t-tests for within-subject designs
- Effect sizes (Cohen's d)
- Confidence intervals
- Multiple comparison correction
- Power analysis
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any
import numpy as np
from scipy import stats
@dataclass
class StatisticalResult:
"""
Result of a statistical test.
Includes all information needed for scientific reporting:
- Test statistic and p-value
- Effect size with interpretation
- Confidence interval
- Sample statistics
"""
test_name: str
statistic: float
p_value: float
effect_size: float
effect_size_name: str
ci_lower: float
ci_upper: float
confidence_level: float
n: int
mean_diff: float
std_diff: float
significant: bool
interpretation: str
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"test_name": self.test_name,
"statistic": self.statistic,
"p_value": self.p_value,
"effect_size": self.effect_size,
"effect_size_name": self.effect_size_name,
"ci_lower": self.ci_lower,
"ci_upper": self.ci_upper,
"confidence_level": self.confidence_level,
"n": self.n,
"mean_diff": self.mean_diff,
"std_diff": self.std_diff,
"significant": self.significant,
"interpretation": self.interpretation,
}
    def __str__(self) -> str:
        """Human-readable summary (df shown as n-1, which is exact for paired tests)."""
        sig_marker = "*" if self.significant else ""
        ci_pct = int(round(self.confidence_level * 100))
        return (
            f"{self.test_name}: t({self.n-1})={self.statistic:.3f}, "
            f"p={self.p_value:.4f}{sig_marker}, "
            f"{self.effect_size_name}={self.effect_size:.3f}, "
            f"{ci_pct}% CI [{self.ci_lower:.4f}, {self.ci_upper:.4f}]"
        )
def paired_ttest(
condition1: List[float],
condition2: List[float],
alpha: float = 0.05,
alternative: str = "two-sided",
) -> StatisticalResult:
"""
Perform paired samples t-test.
For testing H0: μ1 = μ2 vs H1: μ1 ≠ μ2 (or one-sided alternatives)
Args:
condition1: Scores from first condition
condition2: Scores from second condition (same subjects)
alpha: Significance level
alternative: "two-sided", "greater", or "less"
Returns:
StatisticalResult with all test statistics
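    Example (a minimal sketch with illustrative scores; the asterisk in
    the printed summary marks significance at the chosen alpha):
        res = paired_ttest([0.82, 0.79, 0.88, 0.91], [0.75, 0.72, 0.81, 0.86])
        print(res)              # one-line summary via __str__
        record = res.to_dict()  # JSON-serializable dict for reporting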
"""
if len(condition1) != len(condition2):
raise ValueError("Conditions must have same length for paired test")
n = len(condition1)
if n < 2:
raise ValueError("Need at least 2 observations")
c1 = np.array(condition1)
c2 = np.array(condition2)
differences = c1 - c2
# Perform t-test
result = stats.ttest_rel(c1, c2, alternative=alternative)
# Effect size (Cohen's d for paired samples)
d = compute_effect_size(condition1, condition2, paired=True)
    # Two-sided confidence interval for the mean difference (reported
    # as-is even when a one-sided alternative is requested)
mean_diff = np.mean(differences)
std_diff = np.std(differences, ddof=1)
se = std_diff / np.sqrt(n)
t_crit = stats.t.ppf(1 - alpha / 2, df=n - 1)
ci_lower = mean_diff - t_crit * se
ci_upper = mean_diff + t_crit * se
# Interpretation
significant = result.pvalue < alpha
interpretation = _interpret_effect_size(d)
return StatisticalResult(
test_name="Paired t-test",
statistic=float(result.statistic),
p_value=float(result.pvalue),
effect_size=float(d),
effect_size_name="Cohen's d",
ci_lower=float(ci_lower),
ci_upper=float(ci_upper),
confidence_level=1 - alpha,
n=n,
mean_diff=float(mean_diff),
std_diff=float(std_diff),
significant=significant,
interpretation=interpretation,
)
def independent_ttest(
group1: List[float],
group2: List[float],
alpha: float = 0.05,
equal_var: bool = False,
alternative: str = "two-sided",
) -> StatisticalResult:
"""
Perform independent samples t-test.
Args:
group1: Scores from first group
group2: Scores from second group
alpha: Significance level
equal_var: Assume equal variances (use Welch's t-test if False)
alternative: "two-sided", "greater", or "less"
Returns:
StatisticalResult with all test statistics
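    Example (illustrative scores; Welch's test is the default, so unequal
    group sizes and variances are fine):
        res = independent_ttest([0.71, 0.78, 0.74, 0.80], [0.65, 0.69, 0.63])
        print(res.p_value, res.effect_size)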
"""
    g1 = np.array(group1)
    g2 = np.array(group2)
    if len(g1) < 2 or len(g2) < 2:
        raise ValueError("Need at least 2 observations per group")
# Perform t-test
result = stats.ttest_ind(g1, g2, equal_var=equal_var, alternative=alternative)
# Effect size (Cohen's d)
d = compute_effect_size(group1, group2, paired=False)
# Confidence interval for difference in means
mean_diff = np.mean(g1) - np.mean(g2)
n1, n2 = len(g1), len(g2)
if equal_var:
# Pooled standard error
pooled_var = ((n1 - 1) * np.var(g1, ddof=1) + (n2 - 1) * np.var(g2, ddof=1)) / (n1 + n2 - 2)
se = np.sqrt(pooled_var * (1/n1 + 1/n2))
df = n1 + n2 - 2
else:
# Welch-Satterthwaite approximation
var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
se = np.sqrt(var1/n1 + var2/n2)
df = (var1/n1 + var2/n2)**2 / ((var1/n1)**2/(n1-1) + (var2/n2)**2/(n2-1))
t_crit = stats.t.ppf(1 - alpha / 2, df=df)
ci_lower = mean_diff - t_crit * se
ci_upper = mean_diff + t_crit * se
significant = result.pvalue < alpha
interpretation = _interpret_effect_size(d)
return StatisticalResult(
test_name="Independent t-test" + ("" if equal_var else " (Welch's)"),
statistic=float(result.statistic),
p_value=float(result.pvalue),
effect_size=float(d),
effect_size_name="Cohen's d",
ci_lower=float(ci_lower),
ci_upper=float(ci_upper),
confidence_level=1 - alpha,
n=n1 + n2,
mean_diff=float(mean_diff),
std_diff=float(se * np.sqrt(n1 + n2)), # Approximate pooled SD
significant=significant,
interpretation=interpretation,
)
def compute_effect_size(
group1: List[float],
group2: List[float],
paired: bool = True,
) -> float:
"""
Compute Cohen's d effect size.
For paired data: d = mean(diff) / std(diff)
For independent: d = (mean1 - mean2) / pooled_std
Args:
group1: First group/condition scores
group2: Second group/condition scores
paired: Whether data is paired
Returns:
Cohen's d effect size
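    Example (hand-checkable paired case: the differences are [1, 2, 3],
    with mean 2 and SD 1, so d comes out exactly 2.0):
        d = compute_effect_size([2, 4, 6], [1, 2, 3], paired=True)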
"""
g1 = np.array(group1)
g2 = np.array(group2)
if paired:
differences = g1 - g2
d = np.mean(differences) / np.std(differences, ddof=1)
else:
n1, n2 = len(g1), len(g2)
var1 = np.var(g1, ddof=1)
var2 = np.var(g2, ddof=1)
pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
d = (np.mean(g1) - np.mean(g2)) / pooled_std
return float(d)
def _interpret_effect_size(d: float) -> str:
"""Interpret Cohen's d according to conventional thresholds."""
abs_d = abs(d)
if abs_d < 0.2:
size = "negligible"
elif abs_d < 0.5:
size = "small"
elif abs_d < 0.8:
size = "medium"
else:
size = "large"
direction = "positive" if d > 0 else "negative" if d < 0 else "no"
return f"{size} {direction} effect"
def compute_confidence_interval(
data: List[float],
confidence: float = 0.95,
) -> Tuple[float, float, float]:
"""
Compute confidence interval for mean.
Args:
data: Sample data
confidence: Confidence level (default 0.95 for 95% CI)
Returns:
Tuple of (mean, ci_lower, ci_upper)
"""
arr = np.array(data)
n = len(arr)
mean = np.mean(arr)
se = stats.sem(arr)
alpha = 1 - confidence
t_crit = stats.t.ppf(1 - alpha / 2, df=n - 1)
ci_lower = mean - t_crit * se
ci_upper = mean + t_crit * se
return float(mean), float(ci_lower), float(ci_upper)
def bonferroni_correction(
p_values: List[float],
alpha: float = 0.05,
) -> Tuple[float, List[bool]]:
"""
Apply Bonferroni correction for multiple comparisons.
Args:
p_values: List of p-values from multiple tests
alpha: Family-wise error rate
Returns:
Tuple of (corrected_alpha, list of significant results)
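    Example (three illustrative p-values at the default alpha=0.05):
        corrected_alpha, sig = bonferroni_correction([0.01, 0.04, 0.03])
        # corrected_alpha == 0.05 / 3; sig == [True, False, False]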
"""
n_tests = len(p_values)
corrected_alpha = alpha / n_tests
significant = [p < corrected_alpha for p in p_values]
return corrected_alpha, significant
def holm_bonferroni_correction(
p_values: List[float],
alpha: float = 0.05,
) -> Tuple[List[float], List[bool]]:
"""
Apply Holm-Bonferroni (step-down) correction.
More powerful than standard Bonferroni while controlling FWER.
Args:
p_values: List of p-values from multiple tests
alpha: Family-wise error rate
Returns:
Tuple of (adjusted_p_values, list of significant results)
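    Example (worked by hand for three illustrative p-values at alpha=0.05;
    the sorted p-values are multiplied by (3, 2, 1), then a running maximum
    enforces monotonicity):
        adjusted, sig = holm_bonferroni_correction([0.01, 0.04, 0.03])
        # adjusted == [0.03, 0.06, 0.06]; sig == [True, False, False]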
"""
    n = len(p_values)
    order = np.argsort(p_values)
    sorted_p = np.array(p_values)[order]
    # Multiply the i-th smallest p-value by (n - i), then enforce
    # monotone non-decreasing adjusted p-values with a running maximum
    adjusted_sorted = sorted_p * (n - np.arange(n))
    adjusted_sorted = np.minimum(np.maximum.accumulate(adjusted_sorted), 1.0)
    # Map the adjusted values back to the original test order
    adjusted_p = np.empty(n)
    adjusted_p[order] = adjusted_sorted
    significant = [p < alpha for p in adjusted_p]
    return list(adjusted_p), significant
def power_analysis_paired_ttest(
effect_size: float,
alpha: float = 0.05,
power: float = 0.80,
) -> int:
"""
Compute required sample size for paired t-test.
Args:
effect_size: Expected Cohen's d
alpha: Significance level
power: Desired statistical power
Returns:
Required sample size (N)
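    Example (medium effect, default alpha and power):
        n = power_analysis_paired_ttest(0.5)
        # returns 32 under this normal approximation; exact t-based
        # calculators report a slightly larger n (about 34)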
"""
    if effect_size == 0:
        raise ValueError("effect_size must be non-zero")
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)
    # Normal approximation; slightly underestimates n relative to an
    # exact t-based calculation, especially for small samples
    n = ((z_alpha + z_beta) / effect_size) ** 2
    return int(np.ceil(n))
def bootstrap_ci(
data: List[float],
n_bootstrap: int = 10000,
confidence: float = 0.95,
statistic: str = "mean",
seed: int = 42,
) -> Dict[str, float]:
"""
Compute bootstrap confidence interval.
Non-parametric CI that makes no distributional assumptions.
Uses BCa (bias-corrected and accelerated) percentile method.
Args:
data: Sample data
n_bootstrap: Number of bootstrap resamples
confidence: Confidence level (default 0.95 for 95% CI)
statistic: "mean" or "median"
seed: Random seed for reproducibility
Returns:
Dictionary with point estimate, ci_lower, ci_upper, se
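    Example (illustrative sample; a median CI with no normality assumption):
        ci = bootstrap_ci([3.1, 2.8, 3.4, 2.9, 3.6, 3.0], statistic="median")
        print(ci["estimate"], ci["ci_lower"], ci["ci_upper"])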
"""
arr = np.array(data)
n = len(arr)
rng = np.random.default_rng(seed)
stat_fn = np.mean if statistic == "mean" else np.median
observed = float(stat_fn(arr))
# Generate bootstrap distribution
boot_stats = np.empty(n_bootstrap)
for i in range(n_bootstrap):
sample = rng.choice(arr, size=n, replace=True)
boot_stats[i] = stat_fn(sample)
    # BCa bias-correction factor; clip the proportion so z0 stays finite
    # when every bootstrap statistic falls on one side of the observed value
    prop_below = np.clip(np.mean(boot_stats < observed), 1e-6, 1 - 1e-6)
    z0 = stats.norm.ppf(prop_below)
# Acceleration factor (jackknife)
jackknife_stats = np.empty(n)
for i in range(n):
jack_sample = np.delete(arr, i)
jackknife_stats[i] = stat_fn(jack_sample)
jack_mean = np.mean(jackknife_stats)
num = np.sum((jack_mean - jackknife_stats) ** 3)
den = 6 * (np.sum((jack_mean - jackknife_stats) ** 2) ** 1.5)
a = num / den if den != 0 else 0.0
# Adjusted percentiles
alpha = 1 - confidence
z_lower = stats.norm.ppf(alpha / 2)
z_upper = stats.norm.ppf(1 - alpha / 2)
p_lower = stats.norm.cdf(z0 + (z0 + z_lower) / (1 - a * (z0 + z_lower)))
p_upper = stats.norm.cdf(z0 + (z0 + z_upper) / (1 - a * (z0 + z_upper)))
# Clamp to valid range
p_lower = np.clip(p_lower, 0.001, 0.999)
p_upper = np.clip(p_upper, 0.001, 0.999)
ci_lower = float(np.percentile(boot_stats, p_lower * 100))
ci_upper = float(np.percentile(boot_stats, p_upper * 100))
return {
"estimate": observed,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"se": float(np.std(boot_stats)),
"confidence": confidence,
"n_bootstrap": n_bootstrap,
"method": "BCa bootstrap",
}
def bootstrap_ci_diff(
group1: List[float],
group2: List[float],
n_bootstrap: int = 10000,
confidence: float = 0.95,
paired: bool = True,
seed: int = 42,
) -> Dict[str, float]:
"""
Bootstrap CI for the difference between two groups.
Args:
group1: First group scores
group2: Second group scores
n_bootstrap: Number of bootstrap resamples
confidence: Confidence level
paired: Whether data is paired (same subjects)
seed: Random seed
Returns:
Dictionary with mean difference, ci_lower, ci_upper
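    Example (illustrative paired scores; a CI that excludes 0 supports
    a reliable difference between conditions):
        out = bootstrap_ci_diff([0.82, 0.79, 0.88, 0.91],
                                [0.75, 0.72, 0.81, 0.86], paired=True)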
"""
g1 = np.array(group1)
g2 = np.array(group2)
rng = np.random.default_rng(seed)
if paired:
diffs = g1 - g2
n = len(diffs)
observed_diff = float(np.mean(diffs))
boot_diffs = np.empty(n_bootstrap)
for i in range(n_bootstrap):
sample = rng.choice(diffs, size=n, replace=True)
boot_diffs[i] = np.mean(sample)
else:
n1, n2 = len(g1), len(g2)
observed_diff = float(np.mean(g1) - np.mean(g2))
boot_diffs = np.empty(n_bootstrap)
for i in range(n_bootstrap):
s1 = rng.choice(g1, size=n1, replace=True)
s2 = rng.choice(g2, size=n2, replace=True)
boot_diffs[i] = np.mean(s1) - np.mean(s2)
alpha = 1 - confidence
ci_lower = float(np.percentile(boot_diffs, (alpha / 2) * 100))
ci_upper = float(np.percentile(boot_diffs, (1 - alpha / 2) * 100))
return {
"mean_diff": observed_diff,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"se": float(np.std(boot_diffs)),
"confidence": confidence,
"n_bootstrap": n_bootstrap,
}
def descriptive_stats(data: List[float]) -> Dict[str, float]:
"""
Compute descriptive statistics for a sample.
Args:
data: Sample data
Returns:
Dictionary with mean, std, median, min, max, N, bootstrap CI
"""
arr = np.array(data)
boot = bootstrap_ci(list(arr))
return {
"n": len(arr),
"mean": float(np.mean(arr)),
"std": float(np.std(arr, ddof=1)),
"median": float(np.median(arr)),
"min": float(np.min(arr)),
"max": float(np.max(arr)),
"se": float(stats.sem(arr)),
"ci_lower_95": boot["ci_lower"],
"ci_upper_95": boot["ci_upper"],
}
def shapiro_wilk_test(
data: List[float],
alpha: float = 0.05,
) -> Dict[str, Any]:
"""
Shapiro-Wilk test for normality.
Args:
data: Sample data (paired differences for paired tests)
alpha: Significance level
Returns:
Dictionary with W statistic, p-value, and whether normality holds
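    Example (typical workflow sketch: check the paired differences first,
    then fall back to wilcoxon_signed_rank if normality is rejected):
        diffs = [0.07, 0.07, 0.07, 0.05, 0.03]
        check = shapiro_wilk_test(diffs)
        use_parametric = check["normal"]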
"""
arr = np.array(data)
w, p = stats.shapiro(arr)
return {
"test_name": "Shapiro-Wilk",
"W": float(w),
"p_value": float(p),
"normal": bool(p > alpha),
"alpha": alpha,
}
def wilcoxon_signed_rank(
condition1: List[float],
condition2: List[float],
alpha: float = 0.05,
alternative: str = "two-sided",
) -> Dict[str, Any]:
"""
Wilcoxon signed-rank test (non-parametric alternative to paired t-test).
Args:
condition1: Scores from first condition
condition2: Scores from second condition (same subjects)
alpha: Significance level
alternative: "two-sided", "greater", or "less"
Returns:
Dictionary with test statistic, p-value, and rank-biserial correlation
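    Example (illustrative paired scores; no normality assumption required):
        out = wilcoxon_signed_rank([0.82, 0.79, 0.88, 0.91],
                                   [0.75, 0.72, 0.81, 0.86])
        print(out["p_value"], out["effect_size"])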
"""
c1 = np.array(condition1)
c2 = np.array(condition2)
diff = c1 - c2
result = stats.wilcoxon(diff, alternative=alternative)
n = len(diff)
    # Rank-biserial correlation as effect size. With a two-sided test scipy
    # returns the smaller signed-rank sum, so this r reflects magnitude;
    # read the direction from the sign of the mean difference.
    r_rb = 1 - (2 * float(result.statistic)) / (n * (n + 1) / 2)
return {
"test_name": "Wilcoxon signed-rank",
"statistic": float(result.statistic),
"p_value": float(result.pvalue),
"effect_size": float(r_rb),
"effect_size_name": "rank-biserial r",
"n": n,
"significant": bool(result.pvalue < alpha),
"alpha": alpha,
}
def cohens_d_ci(
d: float,
n: int,
alpha: float = 0.05,
) -> Dict[str, float]:
"""
    Approximate (1 - alpha) confidence interval for Cohen's d.
    Uses the normal-theory approximation SE(d) = sqrt(1/n + d^2 / (2*n)).
Args:
d: Cohen's d point estimate
n: Sample size (number of pairs for paired tests)
alpha: Significance level
Returns:
Dictionary with d, ci_lower, ci_upper
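    Example (d = 0.5 estimated from n = 20 pairs):
        out = cohens_d_ci(0.5, 20)
        # SE = sqrt(1/20 + 0.25/40) ~ 0.237, so the 95% CI is
        # roughly [0.04, 0.96]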
"""
se = np.sqrt(1 / n + d**2 / (2 * n))
z = stats.norm.ppf(1 - alpha / 2)
return {
"d": float(d),
"ci_lower": float(d - z * se),
"ci_upper": float(d + z * se),
"se": float(se),
}
def compare_all_pairs(
conditions: Dict[str, List[float]],
alpha: float = 0.05,
paired: bool = True,
correction: str = "holm",
) -> Dict[str, StatisticalResult]:
"""
Compare all pairs of conditions with multiple comparison correction.
Args:
conditions: Dictionary mapping condition names to scores
alpha: Family-wise error rate
paired: Whether data is paired
correction: "bonferroni" or "holm"
Returns:
Dictionary of comparison results
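    Example (three illustrative conditions, Holm-corrected by default):
        results = compare_all_pairs({
            "baseline": [0.70, 0.72, 0.68, 0.74],
            "ablation": [0.73, 0.75, 0.71, 0.76],
            "full": [0.80, 0.83, 0.79, 0.85],
        })
        for key, res in results.items():
            print(key, res)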
"""
condition_names = list(conditions.keys())
n_conditions = len(condition_names)
results = {}
p_values = []
comparison_keys = []
# Perform all pairwise comparisons
for i in range(n_conditions):
for j in range(i + 1, n_conditions):
name1, name2 = condition_names[i], condition_names[j]
key = f"{name1}_vs_{name2}"
if paired:
result = paired_ttest(
conditions[name1], conditions[name2], alpha=alpha
)
else:
result = independent_ttest(
conditions[name1], conditions[name2], alpha=alpha
)
results[key] = result
p_values.append(result.p_value)
comparison_keys.append(key)
# Apply correction
if correction == "bonferroni":
        _, significant = bonferroni_correction(p_values, alpha)
        adjusted_p = [p * len(p_values) for p in p_values]
else: # holm
adjusted_p, significant = holm_bonferroni_correction(p_values, alpha)
# Update results with corrected significance
for key, adj_p, sig in zip(comparison_keys, adjusted_p, significant):
result = results[key]
# Create new result with adjusted values
results[key] = StatisticalResult(
test_name=result.test_name + f" ({correction}-corrected)",
statistic=result.statistic,
p_value=min(adj_p, 1.0), # Original p-value replaced with adjusted
effect_size=result.effect_size,
effect_size_name=result.effect_size_name,
ci_lower=result.ci_lower,
ci_upper=result.ci_upper,
confidence_level=result.confidence_level,
n=result.n,
mean_diff=result.mean_diff,
std_diff=result.std_diff,
significant=sig,
interpretation=result.interpretation,
)
return results
def spearman_correlation(
x: List[float],
y: List[float],
alpha: float = 0.05,
) -> Dict[str, Any]:
"""
Compute Spearman rank correlation with confidence interval.
Args:
x: First variable
y: Second variable
alpha: Significance level
Returns:
Dictionary with correlation, p-value, CI, and interpretation
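    Example (illustrative variables whose ranks mostly agree; rho = 0.9):
        out = spearman_correlation([0.6, 0.7, 0.8, 0.9, 0.95],
                                   [3.0, 3.6, 3.4, 4.5, 4.8])
        print(out["rho"], out["interpretation"])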
"""
    result = stats.spearmanr(x, y)
    rho = float(result.correlation)
    p = float(result.pvalue)
    n = len(x)
    if n < 4:
        raise ValueError("Need at least 4 observations for a Fisher-z CI")
    # Fisher z-transformation for the CI (clip rho away from +/-1 so
    # arctanh stays finite when the ranks agree perfectly)
    z = np.arctanh(np.clip(rho, -0.999999, 0.999999))
    se_z = 1 / np.sqrt(n - 3)
z_crit = stats.norm.ppf(1 - alpha / 2)
ci_z_lower = z - z_crit * se_z
ci_z_upper = z + z_crit * se_z
ci_lower = np.tanh(ci_z_lower)
ci_upper = np.tanh(ci_z_upper)
# Interpretation
abs_rho = abs(rho)
if abs_rho < 0.1:
strength = "negligible"
elif abs_rho < 0.3:
strength = "weak"
elif abs_rho < 0.5:
strength = "moderate"
elif abs_rho < 0.7:
strength = "strong"
else:
strength = "very strong"
direction = "positive" if rho > 0 else "negative"
significant = p < alpha
return {
"rho": float(rho),
"p_value": float(p),
"n": n,
"ci_lower": float(ci_lower),
"ci_upper": float(ci_upper),
"confidence_level": 1 - alpha,
"significant": significant,
"interpretation": f"{'Significant' if significant else 'Non-significant'} {strength} {direction} correlation",
}
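

# Usage sketch (illustrative only): synthetic data exercising the main entry
# points of this module. Nothing here depends on the project's real
# experimental results; the values are generated purely for demonstration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    baseline = list(rng.normal(0.70, 0.05, size=20))
    treatment = [b + rng.normal(0.05, 0.03) for b in baseline]

    print(paired_ttest(treatment, baseline))
    print(wilcoxon_signed_rank(treatment, baseline))
    print(bootstrap_ci_diff(treatment, baseline, n_bootstrap=2000))
    print(descriptive_stats(treatment))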