"""
Statistical Analysis Module

Provides rigorous statistical tools for hypothesis testing:
- Paired t-tests for within-subject designs
- Effect sizes (Cohen's d)
- Confidence intervals
- Multiple comparison correction
- Power analysis
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import List, Tuple, Dict, Any, Optional
import numpy as np
from scipy import stats


@dataclass
class StatisticalResult:
    """
    Result of a statistical test.

    Bundles everything needed for scientific reporting:
    - Test statistic and p-value
    - Effect size with interpretation
    - Confidence interval (with its confidence level)
    - Sample statistics
    """
    test_name: str           # label of the test that produced this result
    statistic: float         # test statistic (rendered as a t value by __str__)
    p_value: float           # p-value as supplied by the producing function
    effect_size: float       # effect size value
    effect_size_name: str    # label for the effect size, e.g. "Cohen's d"
    ci_lower: float          # lower bound of the confidence interval
    ci_upper: float          # upper bound of the confidence interval
    confidence_level: float  # confidence level as a fraction, e.g. 0.95
    n: int                   # sample size as reported by the producing function
    mean_diff: float         # mean difference between conditions/groups
    std_diff: float          # standard deviation associated with the difference
    significant: bool        # whether the result was judged significant
    interpretation: str      # verbal interpretation of the effect size

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "test_name": self.test_name,
            "statistic": self.statistic,
            "p_value": self.p_value,
            "effect_size": self.effect_size,
            "effect_size_name": self.effect_size_name,
            "ci_lower": self.ci_lower,
            "ci_upper": self.ci_upper,
            "confidence_level": self.confidence_level,
            "n": self.n,
            "mean_diff": self.mean_diff,
            "std_diff": self.std_diff,
            "significant": self.significant,
            "interpretation": self.interpretation,
        }

    def __str__(self) -> str:
        """
        Human-readable one-line summary.

        The CI label is derived from ``confidence_level`` so that results
        computed with a non-default alpha are not mislabelled as "95% CI".
        The degrees of freedom are printed as ``n - 1``.
        """
        sig_marker = "*" if self.significant else ""
        # ":g" drops trailing zeros and absorbs float noise (0.95 -> "95").
        ci_label = f"{self.confidence_level * 100:g}% CI"
        return (
            f"{self.test_name}: t({self.n-1})={self.statistic:.3f}, "
            f"p={self.p_value:.4f}{sig_marker}, "
            f"{self.effect_size_name}={self.effect_size:.3f}, "
            f"{ci_label} [{self.ci_lower:.4f}, {self.ci_upper:.4f}]"
        )


def paired_ttest(
    condition1: List[float],
    condition2: List[float],
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> StatisticalResult:
    """
    Perform paired samples t-test.

    For testing H0: μ1 = μ2 vs H1: μ1 ≠ μ2 (or one-sided alternatives).
    The reported confidence interval is always the two-sided
    ``1 - alpha`` interval for the mean difference, whatever
    ``alternative`` is.

    Args:
        condition1: Scores from first condition
        condition2: Scores from second condition (same subjects)
        alpha: Significance level
        alternative: "two-sided", "greater", or "less"

    Returns:
        StatisticalResult with all test statistics

    Raises:
        ValueError: If the conditions differ in length or contain fewer
            than 2 observations.
    """
    if len(condition1) != len(condition2):
        raise ValueError("Conditions must have same length for paired test")

    n = len(condition1)
    if n < 2:
        raise ValueError("Need at least 2 observations")

    c1 = np.asarray(condition1, dtype=float)
    c2 = np.asarray(condition2, dtype=float)
    differences = c1 - c2

    # Perform t-test
    result = stats.ttest_rel(c1, c2, alternative=alternative)

    # Effect size (Cohen's d for paired samples)
    d = compute_effect_size(condition1, condition2, paired=True)

    # Confidence interval for mean difference
    mean_diff = np.mean(differences)
    std_diff = np.std(differences, ddof=1)
    se = std_diff / np.sqrt(n)
    t_crit = stats.t.ppf(1 - alpha / 2, df=n - 1)
    ci_lower = mean_diff - t_crit * se
    ci_upper = mean_diff + t_crit * se

    # Coerce to a plain bool: the numpy comparison yields numpy.bool_, which
    # is not JSON-serialisable and differs from the other tests in this module.
    significant = bool(result.pvalue < alpha)
    interpretation = _interpret_effect_size(d)

    return StatisticalResult(
        test_name="Paired t-test",
        statistic=float(result.statistic),
        p_value=float(result.pvalue),
        effect_size=float(d),
        effect_size_name="Cohen's d",
        ci_lower=float(ci_lower),
        ci_upper=float(ci_upper),
        confidence_level=1 - alpha,
        n=n,
        mean_diff=float(mean_diff),
        std_diff=float(std_diff),
        significant=significant,
        interpretation=interpretation,
    )


def independent_ttest(
    group1: List[float],
    group2: List[float],
    alpha: float = 0.05,
    equal_var: bool = False,
    alternative: str = "two-sided",
) -> StatisticalResult:
    """
    Perform independent samples t-test.

    The reported confidence interval is the two-sided ``1 - alpha`` interval
    for the difference in means. ``std_diff`` in the result is the pooled
    standard deviation of the two groups, i.e. the denominator of the
    independent-samples Cohen's d formula
    ``(mean1 - mean2) / pooled_std``.

    Args:
        group1: Scores from first group
        group2: Scores from second group
        alpha: Significance level
        equal_var: Assume equal variances (use Welch's t-test if False)
        alternative: "two-sided", "greater", or "less"

    Returns:
        StatisticalResult with all test statistics (``n`` is ``n1 + n2``)

    Raises:
        ValueError: If either group has fewer than 2 observations (sample
            variances would be undefined).
    """
    g1 = np.asarray(group1, dtype=float)
    g2 = np.asarray(group2, dtype=float)
    n1, n2 = len(g1), len(g2)
    if n1 < 2 or n2 < 2:
        raise ValueError("Need at least 2 observations per group")

    # Perform t-test
    result = stats.ttest_ind(g1, g2, equal_var=equal_var, alternative=alternative)

    # Effect size (Cohen's d)
    d = compute_effect_size(group1, group2, paired=False)

    # Confidence interval for difference in means
    mean_diff = np.mean(g1) - np.mean(g2)
    var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)

    if equal_var:
        # Pooled standard error
        se = np.sqrt(pooled_var * (1 / n1 + 1 / n2))
        df = n1 + n2 - 2
    else:
        # Welch-Satterthwaite approximation
        se = np.sqrt(var1 / n1 + var2 / n2)
        df = (var1 / n1 + var2 / n2) ** 2 / (
            (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)
        )

    t_crit = stats.t.ppf(1 - alpha / 2, df=df)
    ci_lower = mean_diff - t_crit * se
    ci_upper = mean_diff + t_crit * se

    # Plain bool rather than numpy.bool_, so results serialise cleanly.
    significant = bool(result.pvalue < alpha)
    interpretation = _interpret_effect_size(d)

    return StatisticalResult(
        test_name="Independent t-test" + ("" if equal_var else " (Welch's)"),
        statistic=float(result.statistic),
        p_value=float(result.pvalue),
        effect_size=float(d),
        effect_size_name="Cohen's d",
        ci_lower=float(ci_lower),
        ci_upper=float(ci_upper),
        confidence_level=1 - alpha,
        n=n1 + n2,
        mean_diff=float(mean_diff),
        # Actual pooled SD. The previous se * sqrt(n1 + n2) equals
        # pooled_sd * (n1 + n2) / sqrt(n1 * n2), i.e. twice the pooled SD
        # for equally sized groups.
        std_diff=float(np.sqrt(pooled_var)),
        significant=significant,
        interpretation=interpretation,
    )


def compute_effect_size(
    group1: List[float],
    group2: List[float],
    paired: bool = True,
) -> float:
    """
    Compute Cohen's d effect size.

    For paired data: d = mean(diff) / std(diff)
    For independent: d = (mean1 - mean2) / pooled_std

    Zero-spread input is handled explicitly instead of dividing by zero:
    when the standardiser is exactly 0 the result is 0.0 if the mean
    difference is also 0, and signed infinity otherwise.

    Args:
        group1: First group/condition scores
        group2: Second group/condition scores
        paired: Whether data is paired

    Returns:
        Cohen's d effect size

    Raises:
        ValueError: If ``paired`` is True and the inputs differ in shape.
    """
    g1 = np.asarray(group1, dtype=float)
    g2 = np.asarray(group2, dtype=float)

    if paired:
        # Guard against numpy silently broadcasting a length-1 condition.
        if g1.shape != g2.shape:
            raise ValueError("Conditions must have same length for paired effect size")
        differences = g1 - g2
        mean_diff = np.mean(differences)
        spread = np.std(differences, ddof=1)
    else:
        n1, n2 = len(g1), len(g2)
        var1 = np.var(g1, ddof=1)
        var2 = np.var(g2, ddof=1)
        spread = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        mean_diff = np.mean(g1) - np.mean(g2)

    if spread == 0:
        # No variability at all: the ratio is 0/0 or x/0.
        if mean_diff == 0:
            return 0.0
        return float(np.copysign(np.inf, mean_diff))

    return float(mean_diff / spread)


def _interpret_effect_size(d: float) -> str:
    """
    Interpret Cohen's d according to conventional thresholds.

    Thresholds on |d|: < 0.2 negligible, < 0.5 small, < 0.8 medium,
    otherwise large. The direction word is "positive", "negative" or "no"
    (for d == 0).

    Args:
        d: Cohen's d value

    Returns:
        A string of the form "<size> <direction> effect", or
        "undefined effect" when ``d`` is NaN.
    """
    # NaN fails every comparison below and would otherwise be reported as
    # "large no effect".
    if np.isnan(d):
        return "undefined effect"

    abs_d = abs(d)
    if abs_d < 0.2:
        size = "negligible"
    elif abs_d < 0.5:
        size = "small"
    elif abs_d < 0.8:
        size = "medium"
    else:
        size = "large"

    direction = "positive" if d > 0 else "negative" if d < 0 else "no"
    return f"{size} {direction} effect"


def compute_confidence_interval(
    data: List[float],
    confidence: float = 0.95,
) -> Tuple[float, float, float]:
    """
    Student-t confidence interval around the sample mean.

    The half-width is the two-sided t critical value (``n - 1`` degrees of
    freedom) multiplied by the standard error from ``scipy.stats.sem``.

    Args:
        data: Sample data
        confidence: Confidence level (default 0.95 for 95% CI)

    Returns:
        Tuple of (mean, ci_lower, ci_upper)
    """
    sample = np.array(data)
    center = np.mean(sample)

    # Probability mass left in each tail of the t distribution.
    tail = (1 - confidence) / 2
    critical = stats.t.ppf(1 - tail, df=len(sample) - 1)
    half_width = critical * stats.sem(sample)

    return float(center), float(center - half_width), float(center + half_width)


def bonferroni_correction(
    p_values: List[float],
    alpha: float = 0.05,
) -> Tuple[float, List[bool]]:
    """
    Apply Bonferroni correction for multiple comparisons.

    Each p-value is compared (strictly less than) against
    ``alpha / len(p_values)``.

    Args:
        p_values: List of p-values from multiple tests
        alpha: Family-wise error rate

    Returns:
        Tuple of (corrected_alpha, list of significant results). For an
        empty ``p_values`` the result is ``(alpha, [])``.
    """
    n_tests = len(p_values)
    if n_tests == 0:
        # Nothing to correct; avoid dividing by zero.
        return alpha, []

    corrected_alpha = alpha / n_tests
    # bool() keeps the flags plain Python bools even for numpy inputs.
    significant = [bool(p < corrected_alpha) for p in p_values]

    return corrected_alpha, significant


def holm_bonferroni_correction(
    p_values: List[float],
    alpha: float = 0.05,
) -> Tuple[List[float], List[bool]]:
    """
    Apply Holm-Bonferroni (step-down) correction.

    More powerful than standard Bonferroni while controlling FWER.

    Procedure: sort the p-values ascending, multiply the i-th smallest
    (0-based) by ``n - i``, take the running maximum so adjusted values
    never decrease along the sorted order, cap at 1.0, and map the values
    back to the order of the input.

    Args:
        p_values: List of p-values from multiple tests
        alpha: Family-wise error rate

    Returns:
        Tuple of (adjusted_p_values, list of significant results), both in
        the same order as ``p_values``. A result is significant when its
        adjusted p-value is strictly below ``alpha``.
    """
    n = len(p_values)
    if n == 0:
        return [], []

    raw = np.asarray(p_values, dtype=float)
    order = np.argsort(raw)

    # Step-down multipliers n, n-1, ..., 1 applied in ascending order.
    scaled = raw[order] * (n - np.arange(n))

    # Monotonicity must be a running MAXIMUM over the sorted sequence: a
    # hypothesis cannot be rejected unless every smaller p-value was too.
    monotone = np.minimum(np.maximum.accumulate(scaled), 1.0)

    # Scatter back to the caller's original ordering.
    adjusted = np.empty(n)
    adjusted[order] = monotone

    adjusted_list = [float(v) for v in adjusted]
    significant = [v < alpha for v in adjusted_list]

    return adjusted_list, significant


def power_analysis_paired_ttest(
    effect_size: float,
    alpha: float = 0.05,
    power: float = 0.80,
) -> int:
    """
    Compute required sample size for paired t-test.

    Uses the two-sided normal approximation
    ``n = ((z_{1-alpha/2} + z_{power}) / d) ** 2`` rounded up. Only the
    magnitude of ``effect_size`` matters because it is squared.

    Args:
        effect_size: Expected Cohen's d (must be non-zero)
        alpha: Significance level, strictly between 0 and 1
        power: Desired statistical power, strictly between 0 and 1

    Returns:
        Required sample size (N)

    Raises:
        ValueError: If ``effect_size`` is 0, or ``alpha`` / ``power`` lie
            outside the open interval (0, 1).
    """
    # A zero effect needs an infinite sample; fail with a clear message
    # instead of an overflow during integer conversion.
    if effect_size == 0:
        raise ValueError("effect_size must be non-zero")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be between 0 and 1")

    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    n = ((z_alpha + z_beta) / effect_size) ** 2

    return int(np.ceil(n))


def bootstrap_ci(
    data: List[float],
    n_bootstrap: int = 10000,
    confidence: float = 0.95,
    statistic: str = "mean",
    seed: int = 42,
) -> Dict[str, Any]:
    """
    Compute bootstrap confidence interval.

    Non-parametric CI that makes no distributional assumptions.
    Uses BCa (bias-corrected and accelerated) percentile method: the bias
    correction ``z0`` comes from the share of bootstrap replicates below
    the observed statistic, and the acceleration ``a`` from a leave-one-out
    jackknife.

    Args:
        data: Sample data
        n_bootstrap: Number of bootstrap resamples
        confidence: Confidence level (default 0.95 for 95% CI)
        statistic: "mean" selects the mean; any other value selects the
            median
        seed: Random seed for reproducibility

    Returns:
        Dictionary with keys "estimate", "ci_lower", "ci_upper", "se"
        (standard deviation of the bootstrap distribution), "confidence",
        "n_bootstrap" and "method".
    """
    arr = np.array(data)
    n = len(arr)
    rng = np.random.default_rng(seed)

    stat_fn = np.mean if statistic == "mean" else np.median
    observed = float(stat_fn(arr))

    # Generate bootstrap distribution. Kept as one draw per replicate so the
    # random stream for a given seed is unchanged.
    boot_stats = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        sample = rng.choice(arr, size=n, replace=True)
        boot_stats[i] = stat_fn(sample)

    # BCa correction: bias correction factor.
    # A proportion of exactly 0 or 1 (e.g. a median sitting at the sample
    # minimum) would make z0 infinite and turn the adjusted percentiles into
    # NaN. Clipping to [1/(B+1), B/(B+1)] leaves every interior proportion
    # k/B untouched.
    prop_below = np.mean(boot_stats < observed)
    if n_bootstrap > 0:
        eps = 1.0 / (n_bootstrap + 1)
        prop_below = np.clip(prop_below, eps, 1.0 - eps)
    z0 = stats.norm.ppf(prop_below)

    # Acceleration factor (jackknife)
    jackknife_stats = np.empty(n)
    for i in range(n):
        jack_sample = np.delete(arr, i)
        jackknife_stats[i] = stat_fn(jack_sample)
    jack_mean = np.mean(jackknife_stats)
    num = np.sum((jack_mean - jackknife_stats) ** 3)
    den = 6 * (np.sum((jack_mean - jackknife_stats) ** 2) ** 1.5)
    # Fall back to no acceleration when the jackknife is degenerate
    # (zero spread, or non-finite values such as with a single observation).
    a = num / den if (np.isfinite(num) and np.isfinite(den) and den != 0) else 0.0

    # Adjusted percentiles
    alpha = 1 - confidence
    z_lower = stats.norm.ppf(alpha / 2)
    z_upper = stats.norm.ppf(1 - alpha / 2)

    p_lower = stats.norm.cdf(z0 + (z0 + z_lower) / (1 - a * (z0 + z_lower)))
    p_upper = stats.norm.cdf(z0 + (z0 + z_upper) / (1 - a * (z0 + z_upper)))

    # Clamp to valid range
    p_lower = np.clip(p_lower, 0.001, 0.999)
    p_upper = np.clip(p_upper, 0.001, 0.999)

    ci_lower = float(np.percentile(boot_stats, p_lower * 100))
    ci_upper = float(np.percentile(boot_stats, p_upper * 100))

    return {
        "estimate": observed,
        "ci_lower": ci_lower,
        "ci_upper": ci_upper,
        "se": float(np.std(boot_stats)),
        "confidence": confidence,
        "n_bootstrap": n_bootstrap,
        "method": "BCa bootstrap",
    }


def bootstrap_ci_diff(
    group1: List[float],
    group2: List[float],
    n_bootstrap: int = 10000,
    confidence: float = 0.95,
    paired: bool = True,
    seed: int = 42,
) -> Dict[str, float]:
    """
    Percentile bootstrap interval for a difference in means.

    Paired data: the element-wise differences are resampled with
    replacement. Unpaired data: each group is resampled independently
    (group1 first, then group2) and the difference of the resampled means
    is recorded.

    Args:
        group1: First group scores
        group2: Second group scores
        n_bootstrap: Number of bootstrap resamples
        confidence: Confidence level
        paired: Whether data is paired (same subjects)
        seed: Random seed

    Returns:
        Dictionary with "mean_diff", "ci_lower", "ci_upper", "se"
        (standard deviation of the bootstrap differences), "confidence"
        and "n_bootstrap".
    """
    first = np.array(group1)
    second = np.array(group2)
    rng = np.random.default_rng(seed)

    if paired:
        deltas = first - second
        size = len(deltas)
        point_estimate = float(np.mean(deltas))
        replicates = [
            np.mean(rng.choice(deltas, size=size, replace=True))
            for _ in range(n_bootstrap)
        ]
    else:
        size1, size2 = len(first), len(second)
        point_estimate = float(np.mean(first) - np.mean(second))
        replicates = []
        for _ in range(n_bootstrap):
            # Draw order matters for reproducibility: group1, then group2.
            resample1 = rng.choice(first, size=size1, replace=True)
            resample2 = rng.choice(second, size=size2, replace=True)
            replicates.append(np.mean(resample1) - np.mean(resample2))

    boot_diffs = np.array(replicates, dtype=float)

    # Plain percentile interval: cut an equal share from each tail.
    tail = (1 - confidence) / 2
    lower_bound = float(np.percentile(boot_diffs, tail * 100))
    upper_bound = float(np.percentile(boot_diffs, (1 - tail) * 100))

    summary = {"mean_diff": point_estimate}
    summary["ci_lower"] = lower_bound
    summary["ci_upper"] = upper_bound
    summary["se"] = float(np.std(boot_diffs))
    summary["confidence"] = confidence
    summary["n_bootstrap"] = n_bootstrap
    return summary


def descriptive_stats(data: List[float]) -> Dict[str, float]:
    """
    Summarise a sample with common descriptive statistics.

    The confidence bounds come from ``bootstrap_ci`` called with its
    default settings; every other entry is computed directly with
    numpy/scipy.

    Args:
        data: Sample data

    Returns:
        Dictionary with keys "n", "mean", "std" (ddof=1), "median", "min",
        "max", "se", "ci_lower_95" and "ci_upper_95".
    """
    values = np.array(data)
    interval = bootstrap_ci(list(values))

    # (output key, reducer) pairs, evaluated in the order they are reported.
    reducers = (
        ("mean", np.mean),
        ("std", lambda v: np.std(v, ddof=1)),
        ("median", np.median),
        ("min", np.min),
        ("max", np.max),
        ("se", stats.sem),
    )

    summary = {"n": len(values)}
    for label, reducer in reducers:
        summary[label] = float(reducer(values))
    summary["ci_lower_95"] = interval["ci_lower"]
    summary["ci_upper_95"] = interval["ci_upper"]
    return summary


def shapiro_wilk_test(
    data: List[float],
    alpha: float = 0.05,
) -> Dict[str, Any]:
    """
    Run the Shapiro-Wilk normality test on a sample.

    Args:
        data: Sample data (paired differences for paired tests)
        alpha: Significance level

    Returns:
        Dictionary with "test_name", the "W" statistic, "p_value",
        "normal" (True when the p-value exceeds ``alpha``) and "alpha".
    """
    w_stat, p_val = stats.shapiro(np.array(data))

    report: Dict[str, Any] = {"test_name": "Shapiro-Wilk"}
    report["W"] = float(w_stat)
    report["p_value"] = float(p_val)
    # Normality is retained only when the test fails to reject it.
    report["normal"] = bool(p_val > alpha)
    report["alpha"] = alpha
    return report


def wilcoxon_signed_rank(
    condition1: List[float],
    condition2: List[float],
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> Dict[str, Any]:
    """
    Wilcoxon signed-rank test (non-parametric alternative to paired t-test).

    The effect size is the matched-pairs rank-biserial correlation
    ``(W+ - W-) / (W+ + W-)``, where W+ / W- are the rank sums of the
    positive / negative non-zero differences ``condition1 - condition2``.
    It is computed from the ranks directly rather than from the test
    statistic, so it carries the same sign for every ``alternative``
    (positive when condition1 tends to exceed condition2) and ignores
    zero differences.

    Args:
        condition1: Scores from first condition
        condition2: Scores from second condition (same subjects)
        alpha: Significance level
        alternative: "two-sided", "greater", or "less"

    Returns:
        Dictionary with test statistic, p-value, rank-biserial correlation,
        ``n`` (number of pairs, including zero differences), significance
        flag and alpha
    """
    c1 = np.asarray(condition1, dtype=float)
    c2 = np.asarray(condition2, dtype=float)
    diff = c1 - c2

    result = stats.wilcoxon(diff, alternative=alternative)
    n = len(diff)

    # Rank-biserial correlation as effect size, from signed ranks of the
    # non-zero differences (average ranks for ties).
    nonzero = diff[diff != 0]
    ranks = stats.rankdata(np.abs(nonzero))
    w_plus = float(ranks[nonzero > 0].sum())
    w_minus = float(ranks[nonzero < 0].sum())
    rank_total = w_plus + w_minus
    r_rb = (w_plus - w_minus) / rank_total if rank_total > 0 else 0.0

    return {
        "test_name": "Wilcoxon signed-rank",
        "statistic": float(result.statistic),
        "p_value": float(result.pvalue),
        "effect_size": float(r_rb),
        "effect_size_name": "rank-biserial r",
        "n": n,
        "significant": bool(result.pvalue < alpha),
        "alpha": alpha,
    }


def cohens_d_ci(
    d: float,
    n: int,
    alpha: float = 0.05,
) -> Dict[str, float]:
    """
    Approximate (1 - alpha) confidence interval for Cohen's d.

    Normal-theory interval ``d ± z * SE(d)`` with
    ``SE(d) = sqrt(1/n + d^2 / (2*n))``; the default ``alpha`` of 0.05
    gives a 95% interval.

    Args:
        d: Cohen's d point estimate
        n: Sample size (number of pairs for paired tests)
        alpha: Significance level

    Returns:
        Dictionary with d, ci_lower, ci_upper, se
    """
    std_error = np.sqrt(1 / n + d**2 / (2 * n))
    critical = stats.norm.ppf(1 - alpha / 2)
    margin = critical * std_error

    interval = {"d": float(d)}
    interval["ci_lower"] = float(d - margin)
    interval["ci_upper"] = float(d + margin)
    interval["se"] = float(std_error)
    return interval


def compare_all_pairs(
    conditions: Dict[str, List[float]],
    alpha: float = 0.05,
    paired: bool = True,
    correction: str = "holm",
) -> Dict[str, StatisticalResult]:
    """
    Compare all pairs of conditions with multiple comparison correction.

    Every unordered pair of conditions is tested (paired or independent
    t-test), then the collected p-values are corrected. In the returned
    results ``p_value`` holds the ADJUSTED p-value (capped at 1.0) and
    ``significant`` reflects the corrected decision; the confidence
    intervals are the uncorrected ones from the individual tests.

    Args:
        conditions: Dictionary mapping condition names to scores
        alpha: Family-wise error rate
        paired: Whether data is paired
        correction: "bonferroni" or "holm" (any value other than
            "bonferroni" is treated as Holm)

    Returns:
        Dictionary of comparison results keyed "<name1>_vs_<name2>", in
        the iteration order of ``conditions``. Empty when fewer than two
        conditions are supplied.
    """
    condition_names = list(conditions.keys())

    results: Dict[str, StatisticalResult] = {}
    p_values: List[float] = []
    comparison_keys: List[str] = []

    # Perform all pairwise comparisons
    for i, name1 in enumerate(condition_names):
        for name2 in condition_names[i + 1:]:
            key = f"{name1}_vs_{name2}"

            if paired:
                result = paired_ttest(
                    conditions[name1], conditions[name2], alpha=alpha
                )
            else:
                result = independent_ttest(
                    conditions[name1], conditions[name2], alpha=alpha
                )

            results[key] = result
            p_values.append(result.p_value)
            comparison_keys.append(key)

    # With fewer than two conditions there is nothing to correct; returning
    # here also avoids a division by zero in the Bonferroni path.
    if not p_values:
        return results

    # Apply correction
    if correction == "bonferroni":
        _, significant = bonferroni_correction(p_values, alpha)
        adjusted_p = [p * len(p_values) for p in p_values]
    else:  # holm
        adjusted_p, significant = holm_bonferroni_correction(p_values, alpha)

    # Update results with corrected significance
    for key, adj_p, sig in zip(comparison_keys, adjusted_p, significant):
        result = results[key]
        # Create new result with adjusted values
        results[key] = StatisticalResult(
            test_name=result.test_name + f" ({correction}-corrected)",
            statistic=result.statistic,
            # Original p-value replaced with adjusted, as a plain float
            p_value=float(min(adj_p, 1.0)),
            effect_size=result.effect_size,
            effect_size_name=result.effect_size_name,
            ci_lower=result.ci_lower,
            ci_upper=result.ci_upper,
            confidence_level=result.confidence_level,
            n=result.n,
            mean_diff=result.mean_diff,
            std_diff=result.std_diff,
            significant=bool(sig),
            interpretation=result.interpretation,
        )

    return results


def spearman_correlation(
    x: List[float],
    y: List[float],
    alpha: float = 0.05,
) -> Dict[str, Any]:
    """
    Compute Spearman rank correlation with confidence interval.

    The interval uses the Fisher z-transformation with standard error
    ``1 / sqrt(n - 3)``. For ``n < 3`` or an undefined (NaN) correlation
    the bounds are NaN; for ``n == 3`` the standard error is infinite and
    the interval degenerates to [-1, 1].

    Strength labels on |rho|: < 0.1 negligible, < 0.3 weak, < 0.5 moderate,
    < 0.7 strong, otherwise very strong.

    Args:
        x: First variable
        y: Second variable
        alpha: Significance level

    Returns:
        Dictionary with correlation, p-value, CI, and interpretation. When
        rho is NaN (e.g. a constant input) the interpretation is
        "Undefined correlation".
    """
    # Tuple unpacking works across SciPy versions, unlike the legacy
    # ``.correlation`` attribute.
    rho, p = stats.spearmanr(x, y)
    rho = float(rho)
    p = float(p)
    n = len(x)

    # Fisher z-transformation for CI. The degenerate cases (|rho| == 1,
    # n <= 3, NaN rho) are expected here, so silence numpy's warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        z = np.arctanh(rho)
        se_z = 1 / np.sqrt(n - 3)
        z_crit = stats.norm.ppf(1 - alpha / 2)
        ci_lower = np.tanh(z - z_crit * se_z)
        ci_upper = np.tanh(z + z_crit * se_z)

    # Plain bool so the result dictionary is JSON-serialisable.
    significant = bool(p < alpha)

    # Interpretation
    if np.isnan(rho):
        # NaN fails every threshold comparison and would otherwise be
        # labelled "very strong negative".
        interpretation = "Undefined correlation"
    else:
        abs_rho = abs(rho)
        if abs_rho < 0.1:
            strength = "negligible"
        elif abs_rho < 0.3:
            strength = "weak"
        elif abs_rho < 0.5:
            strength = "moderate"
        elif abs_rho < 0.7:
            strength = "strong"
        else:
            strength = "very strong"

        # "no" for an exactly zero rho mirrors _interpret_effect_size.
        direction = "positive" if rho > 0 else "negative" if rho < 0 else "no"
        interpretation = (
            f"{'Significant' if significant else 'Non-significant'} "
            f"{strength} {direction} correlation"
        )

    return {
        "rho": rho,
        "p_value": p,
        "n": n,
        "ci_lower": float(ci_lower),
        "ci_upper": float(ci_upper),
        "confidence_level": 1 - alpha,
        "significant": significant,
        "interpretation": interpretation,
    }