again / controllers /hypothesis_controller.py
Beam2513's picture
Upload 127 files
798602c verified
from __future__ import annotations
from typing import Iterable, Tuple
import numpy as np
import pandas as pd
from core.hypothesis_tests import (
one_sample_ttest,
two_sample_ttest,
variance_test,
one_way_anova,
)
ROUND = 4
def _round_table(table: pd.DataFrame, decimals: int = ROUND) -> pd.DataFrame:
"""Round only numeric columns of the result table."""
if table is None:
return table
tbl = table.copy()
num_cols = tbl.select_dtypes(include="number").columns
if len(num_cols) > 0:
tbl[num_cols] = tbl[num_cols].round(decimals)
return tbl
def _ensure_numeric_series(df: pd.DataFrame, column: str) -> np.ndarray:
if df is None:
raise ValueError("No dataset loaded.")
if column not in df.columns:
raise ValueError(f"Column '{column}' not found in the dataset.")
series = df[column].dropna()
if series.empty:
raise ValueError("No valid data in the selected column.")
return series.to_numpy()
def _materialize_group(
df: pd.DataFrame,
numeric_col: str,
cat_col: str | None,
cat_vals: Iterable[str],
) -> np.ndarray:
if cat_col is None:
raise ValueError("No categorical column selected.")
if cat_col not in df.columns:
raise ValueError(f"Categorical column '{cat_col}' not found in the dataset.")
# Cast selected values to the actual dtype of the column
if cat_vals is None:
values = []
else:
values = list(cat_vals)
if not values:
raise ValueError(f"No categories selected for column '{cat_col}'.")
cat_series = pd.Series(values).astype(df[cat_col].dtype)
mask = df[cat_col].isin(cat_series)
series = df.loc[mask, numeric_col].dropna()
if series.empty:
raise ValueError("One or more groups are empty after filtering.")
return series.to_numpy()
def run_hypothesis_testing(
    *,
    df: pd.DataFrame | None,
    numeric_col: str,
    hypo_test: str,
    mu0_text: str,
    alternative: str,
    include_graph: bool,
    bootstrap_samples: int,
    cat_col1: str | None,
    cat_vals1: list[str],
    name_group1: str,
    cat_col2: str | None,
    cat_vals2: list[str],
    name_group2: str,
    cat_col3: str | None,
    cat_vals3: list[str],
    plot_type: str,
    correction: bool,
    test_type: str,
) -> Tuple[pd.DataFrame, object | None]:
    """High-level dispatcher used by the Hypothesis Testing tab.

    Validates the inputs, builds the sample(s) required by the test named
    in ``hypo_test``, delegates to the matching routine imported from
    ``core.hypothesis_tests`` and rounds the resulting table.

    Args:
        df: Loaded dataset, or ``None`` when nothing is loaded.
        numeric_col: Column holding the measurements under test.
        hypo_test: Test to run; one of "One sample Student's t-test",
            "Two samples Student's t-test",
            "Equal variance between two groups" or "One-way ANOVA".
        mu0_text: Hypothesised mean for the one-sample t-test, as entered
            by the user (text or a plain number).
        alternative: Forwarded to the one- and two-sample t-tests.
        include_graph: Forwarded to every test except the ANOVA.
        bootstrap_samples: Forwarded to every test except the ANOVA.
        cat_col1, cat_vals1, name_group1: Filter column, selected categories
            and display name of the first group (two-group tests).
        cat_col2, cat_vals2, name_group2: Same, for the second group.
        cat_col3, cat_vals3: Grouping column and selected categories for
            the ANOVA.
        plot_type: Forwarded to the two-sample t-test.
        correction: Forwarded to the two-sample t-test.
        test_type: Forwarded to the variance test.

    Returns:
        ``(result_table, figure_or_none)`` with the table's numeric columns
        rounded by ``_round_table``.

    Raises:
        ValueError: If no dataset is loaded, the inputs for the selected
            test are missing or invalid, or ``hypo_test`` is unknown.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    # Common check for every test; the cleaned observations are kept so the
    # one-sample branch does not have to rebuild them.
    sample = _ensure_numeric_series(df, numeric_col)

    # ------------------------------------------------------------
    # One-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "One sample Student's t-test":
        # Accept text as well as plain numbers; None or blank means "not set".
        if mu0_text is None or not str(mu0_text).strip():
            raise ValueError("μ₀ must be specified for the one-sample t-test.")
        try:
            mu0 = float(mu0_text)
        except (TypeError, ValueError, OverflowError) as err:
            raise ValueError("μ₀ must be a numeric value.") from err
        # float() happily parses "nan" and "inf", neither of which is a
        # usable hypothesised mean.
        if not np.isfinite(mu0):
            raise ValueError("μ₀ must be a numeric value.")

        table, fig = one_sample_ttest(
            sample=sample,
            mu0=mu0,
            alternative=alternative,
            numeric_col=numeric_col,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        return _round_table(table), fig

    # ------------------------------------------------------------
    # Two-sample t-test
    # ------------------------------------------------------------
    if hypo_test == "Two samples Student's t-test":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        table, fig = two_sample_ttest(
            group1=group1,
            group2=group2,
            numeric_col=numeric_col,
            # Empty names fall back to generic labels.
            name_group1=name_group1 or "Group 1",
            name_group2=name_group2 or "Group 2",
            alternative=alternative,
            correction=correction,
            plot_type=plot_type,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )
        return _round_table(table), fig

    # ------------------------------------------------------------
    # Equal variance between two groups
    # ------------------------------------------------------------
    if hypo_test == "Equal variance between two groups":
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        table, fig = variance_test(
            group1=group1,
            group2=group2,
            name_group1=name_group1 or "Group 1",
            name_group2=name_group2 or "Group 2",
            test_type=test_type,
            include_graph=include_graph,
            bootstrap_samples=bootstrap_samples,
        )
        return _round_table(table), fig

    # ------------------------------------------------------------
    # One-way ANOVA
    # ------------------------------------------------------------
    if hypo_test == "One-way ANOVA":
        if cat_col3 is None:
            raise ValueError("A categorical column must be selected for ANOVA.")
        if cat_col3 not in df.columns:
            raise ValueError(
                f"Categorical column '{cat_col3}' not found in the dataset."
            )
        if not cat_vals3:
            raise ValueError("At least one category must be selected for ANOVA.")

        # Cast the selected labels to the column's dtype before matching,
        # then keep only complete (value, category) pairs.
        cat_series = pd.Series(cat_vals3).astype(df[cat_col3].dtype)
        selected_rows = df[df[cat_col3].isin(cat_series)]
        data_group = selected_rows[[numeric_col, cat_col3]].dropna()

        table, fig = one_way_anova(
            data_group=data_group,
            numeric_col=numeric_col,
            cat_col=cat_col3,
        )
        return _round_table(table), fig

    # ------------------------------------------------------------
    # Fallback
    # ------------------------------------------------------------
    raise ValueError(f"Unknown hypothesis test: {hypo_test}")