File size: 6,609 Bytes
798602c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from __future__ import annotations

from typing import Iterable, Tuple

import numpy as np
import pandas as pd

from core.hypothesis_tests import (
    one_sample_ttest,
    two_sample_ttest,
    variance_test,
    one_way_anova,
)

# Default number of decimal places used when rounding result tables.
ROUND = 4


def _round_table(table: pd.DataFrame, decimals: int = ROUND) -> pd.DataFrame:
    """Return a copy of ``table`` with its numeric columns rounded.

    Non-numeric columns are left as they are and the input frame is never
    modified. ``None`` is passed straight through.

    Args:
        table: Result table to round, or ``None``.
        decimals: Number of decimal places to keep.

    Returns:
        A rounded copy of ``table``, or ``None`` when ``table`` is ``None``.
    """
    if table is None:
        return None

    rounded = table.copy()
    numeric_columns = rounded.select_dtypes(include="number").columns
    if len(numeric_columns) == 0:
        # Nothing to round; hand back the untouched copy.
        return rounded

    rounded[numeric_columns] = rounded[numeric_columns].round(decimals)
    return rounded

def _ensure_numeric_series(df: pd.DataFrame, column: str) -> np.ndarray:
    """Return the non-missing values of ``column`` as a numeric array.

    Args:
        df: Dataset to read from.
        column: Name of the column expected to hold numeric data.

    Returns:
        The column's values with missing entries dropped. Columns that
        already have a numeric dtype are returned unchanged; object-dtype
        columns are converted with ``pd.to_numeric``.

    Raises:
        ValueError: If ``df`` is ``None``, the column does not exist, the
            column has no non-missing values, or the values are not numeric.
    """
    if df is None:
        raise ValueError("No dataset loaded.")
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataset.")

    series = df[column].dropna()
    if series.empty:
        raise ValueError("No valid data in the selected column.")

    if not pd.api.types.is_numeric_dtype(series.dtype):
        not_numeric_msg = f"Column '{column}' does not contain numeric data."
        # Only generic object columns get a second chance: they may simply
        # hold Python numbers. Text, datetime and similar dtypes are rejected
        # outright so they cannot reach the statistical routines.
        if not pd.api.types.is_object_dtype(series.dtype):
            raise ValueError(not_numeric_msg)
        try:
            series = pd.to_numeric(series)
        except (ValueError, TypeError) as err:
            raise ValueError(not_numeric_msg) from err

    return series.to_numpy()


def _materialize_group(
    df: pd.DataFrame,
    numeric_col: str,
    cat_col: str | None,
    cat_vals: Iterable[str],
) -> np.ndarray:
    """Extract the numeric values of the rows belonging to selected categories.

    Args:
        df: Dataset to filter.
        numeric_col: Column whose values are returned.
        cat_col: Categorical column used to select rows.
        cat_vals: Category values to keep; ``None`` is treated as empty.

    Returns:
        The non-missing ``numeric_col`` values of the rows whose ``cat_col``
        value is among ``cat_vals``.

    Raises:
        ValueError: If no categorical column is given, the column does not
            exist, no categories are selected, or the selection has no
            non-missing values.
    """
    if cat_col is None:
        raise ValueError("No categorical column selected.")

    if cat_col not in df.columns:
        raise ValueError(f"Categorical column '{cat_col}' not found in the dataset.")

    selected = [] if cat_vals is None else list(cat_vals)
    if len(selected) == 0:
        raise ValueError(f"No categories selected for column '{cat_col}'.")

    category_column = df[cat_col]
    # Cast the selected values to the column's own dtype so the membership
    # test compares like with like.
    typed_selection = pd.Series(selected).astype(category_column.dtype)
    in_group = category_column.isin(typed_selection)

    group_values = df.loc[in_group, numeric_col].dropna()
    if group_values.empty:
        raise ValueError("One or more groups are empty after filtering.")
    return group_values.to_numpy()


def run_hypothesis_testing(
    *,
    df: pd.DataFrame | None,
    numeric_col: str,
    hypo_test: str,
    mu0_text: str,
    alternative: str,
    include_graph: bool,
    bootstrap_samples: int,
    cat_col1: str | None,
    cat_vals1: list[str],
    name_group1: str,
    cat_col2: str | None,
    cat_vals2: list[str],
    name_group2: str,
    cat_col3: str | None,
    cat_vals3: list[str],
    plot_type: str,
    correction: bool,
    test_type: str,
) -> Tuple[pd.DataFrame, object | None]:
    """High-level dispatcher used by the Hypothesis Testing tab.

    Validates the inputs, builds the sample(s) required by the selected test,
    delegates to the matching routine from ``core.hypothesis_tests`` and
    rounds the numeric columns of the resulting table.

    Args:
        df: Loaded dataset.
        numeric_col: Column holding the values under test; checked with
            ``_ensure_numeric_series`` for every test.
        hypo_test: Test to run. One of ``"One sample Student's t-test"``,
            ``"Two samples Student's t-test"``,
            ``"Equal variance between two groups"`` or ``"One-way ANOVA"``.
        mu0_text: Hypothesised mean as text (one-sample t-test only).
        alternative: Alternative hypothesis, forwarded to the t-tests.
        include_graph: Forwarded to the t-tests and the variance test.
        bootstrap_samples: Forwarded to the t-tests and the variance test.
        cat_col1: Categorical column defining the first group.
        cat_vals1: Categories of ``cat_col1`` that make up the first group.
        name_group1: Label of the first group; empty means ``"Group 1"``.
        cat_col2: Categorical column defining the second group.
        cat_vals2: Categories of ``cat_col2`` that make up the second group.
        name_group2: Label of the second group; empty means ``"Group 2"``.
        cat_col3: Categorical column used for ANOVA.
        cat_vals3: Categories of ``cat_col3`` included in the ANOVA.
        plot_type: Forwarded to the two-sample t-test.
        correction: Forwarded to the two-sample t-test.
        test_type: Forwarded to the variance test.

    Returns:
        ``(result_table, figure_or_none)`` as produced by the selected test,
        with the table's numeric columns rounded by ``_round_table``.

    Raises:
        ValueError: If no dataset is loaded, an input is missing or invalid,
            a group is empty after filtering, or ``hypo_test`` is unknown.
    """
    if df is None:
        raise ValueError("No dataset loaded.")

    # Common numeric data check; the validated values double as the sample
    # of the one-sample test so the column is not extracted twice.
    sample = _ensure_numeric_series(df, numeric_col)

    if hypo_test == "One sample Student's t-test":
        mu0 = _parse_mu0(mu0_text)
        table, fig = one_sample_ttest(
            sample=sample,
            mu0=mu0,
            alternative=alternative,
            numeric_col=numeric_col,
            bootstrap_samples=bootstrap_samples,
            include_graph=include_graph,
        )

    elif hypo_test in (
        "Two samples Student's t-test",
        "Equal variance between two groups",
    ):
        # Both two-group tests share the same sample construction.
        group1 = _materialize_group(df, numeric_col, cat_col1, cat_vals1)
        group2 = _materialize_group(df, numeric_col, cat_col2, cat_vals2)

        # If names are empty, fall back to defaults.
        name1 = name_group1 or "Group 1"
        name2 = name_group2 or "Group 2"

        if hypo_test == "Two samples Student's t-test":
            table, fig = two_sample_ttest(
                group1=group1,
                group2=group2,
                numeric_col=numeric_col,
                name_group1=name1,
                name_group2=name2,
                alternative=alternative,
                correction=correction,
                plot_type=plot_type,
                bootstrap_samples=bootstrap_samples,
                include_graph=include_graph,
            )
        else:
            table, fig = variance_test(
                group1=group1,
                group2=group2,
                name_group1=name1,
                name_group2=name2,
                test_type=test_type,
                include_graph=include_graph,
                bootstrap_samples=bootstrap_samples,
            )

    elif hypo_test == "One-way ANOVA":
        data_group = _filter_anova_data(df, numeric_col, cat_col3, cat_vals3)
        table, fig = one_way_anova(
            data_group=data_group,
            numeric_col=numeric_col,
            cat_col=cat_col3,
        )

    else:
        raise ValueError(f"Unknown hypothesis test: {hypo_test}")

    return _round_table(table), fig


def _parse_mu0(mu0_text: str | None) -> float:
    """Convert the user-supplied μ₀ text into a finite float or raise ValueError."""
    text = "" if mu0_text is None else str(mu0_text).strip()
    if not text:
        raise ValueError("μ₀ must be specified for the one-sample t-test.")
    try:
        mu0 = float(text)
    except ValueError as err:
        raise ValueError("μ₀ must be a numeric value.") from err
    # float() happily parses "nan" and "inf", which are not usable as a
    # hypothesised mean.
    if not np.isfinite(mu0):
        raise ValueError("μ₀ must be a finite numeric value.")
    return mu0


def _filter_anova_data(
    df: pd.DataFrame,
    numeric_col: str,
    cat_col: str | None,
    cat_vals: list[str],
) -> pd.DataFrame:
    """Return the complete ``[numeric_col, cat_col]`` rows of the selected categories."""
    if cat_col is None:
        raise ValueError("A categorical column must be selected for ANOVA.")

    if cat_col not in df.columns:
        raise ValueError(
            f"Categorical column '{cat_col}' not found in the dataset."
        )

    if not cat_vals:
        raise ValueError("At least one category must be selected for ANOVA.")

    # Cast the selection to the column's dtype before the membership test.
    selected = pd.Series(cat_vals).astype(df[cat_col].dtype)
    data_group = df[df[cat_col].isin(selected)][[numeric_col, cat_col]].dropna()

    # Mirror the two-group tests: refuse to run on an empty selection.
    if data_group.empty:
        raise ValueError("No data left for ANOVA after filtering.")
    return data_group