File size: 4,991 Bytes
798602c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from __future__ import annotations

from typing import List, Optional, Sequence, Tuple

from matplotlib.figure import Figure
import numpy as np
import pandas as pd

from core.linear_regression import run_linear_regression as _run_linear_regression


def _select_working_dataframe(
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
) -> pd.DataFrame:
    """
    Choose the dataframe the analysis should run on.

    The filtered dataframe wins whenever it exists and contains rows;
    otherwise the original dataframe is used.

    Parameters
    ----------
    df
        The originally loaded dataset, or None when nothing is loaded.
    filtered_df
        A filtered view of the dataset, or None when no filter is active.

    Returns
    -------
    pd.DataFrame
        ``filtered_df`` if it is non-empty, else ``df``.

    Raises
    ------
    ValueError
        If ``df`` is None, or if ``df`` is empty and no non-empty
        ``filtered_df`` is available.
    """
    # A missing base dataset is an error even if a filtered frame was passed.
    if df is None:
        raise ValueError("No dataset loaded.")

    has_filtered_rows = filtered_df is not None and not filtered_df.empty
    if has_filtered_rows:
        return filtered_df

    if not df.empty:
        return df

    raise ValueError("The dataset is empty.")


def _parse_confidence_level(text: str) -> float:
    """
    Convert a confidence level such as '0.95' into a significance level.

    Parameters
    ----------
    text
        User-entered confidence level; it is stringified and stripped of
        surrounding whitespace before parsing.

    Returns
    -------
    alpha : float
        ``1.0 - level`` (e.g. roughly 0.05 for a 95% confidence level).

    Raises
    ------
    ValueError
        If the input is blank, is not parseable as a float, or does not lie
        strictly between 0 and 1.
    """
    raw = str(text).strip()
    if raw == "":
        raise ValueError("Confidence level is required (e.g. 0.95).")

    try:
        confidence = float(raw)
    except ValueError as exc:
        raise ValueError("Confidence level must be a numeric value between 0 and 1.") from exc

    # The chained comparison is deliberately kept in its positive form: NaN
    # fails it and therefore falls through to the error below.
    if 0 < confidence < 1:
        # The stats layer is given alpha rather than the confidence level.
        return 1.0 - confidence

    raise ValueError("Confidence level must be between 0 and 1 (e.g. 0.95).")


def _parse_range(text: str) -> Optional[np.ndarray]:
    """
    Parse a range string like '0, 10' into a numpy array suitable for predictions.

    Parameters
    ----------
    text
        User-entered range of the form 'min, max'. It is stringified and
        stripped of surrounding whitespace before parsing.

    Returns
    -------
    np.ndarray or None
        None if the string is empty or only whitespace. Otherwise a 1-D
        array of 100 evenly spaced values from the parsed minimum to the
        parsed maximum (both endpoints included).

    Raises
    ------
    ValueError
        If the input does not contain exactly two comma-separated values,
        if either value is not numeric, if either value is NaN or infinite,
        or if the minimum is not strictly less than the maximum.
    """
    s = str(text).strip()
    if not s:
        return None

    parts = s.split(",")
    if len(parts) != 2:
        raise ValueError("Range must have the form 'min, max'.")

    try:
        lo = float(parts[0].strip())
        hi = float(parts[1].strip())
    except ValueError as exc:
        raise ValueError("Range values must be numeric (e.g. '0, 10').") from exc

    # float() accepts 'nan' and 'inf'. NaN also slips past the ordering check
    # below (every comparison with NaN is False), and an infinite bound makes
    # np.linspace produce NaN/inf values, so reject both explicitly.
    if not (np.isfinite(lo) and np.isfinite(hi)):
        raise ValueError("Range values must be finite numbers (e.g. '0, 10').")

    if lo >= hi:
        raise ValueError("Range minimum must be strictly less than the maximum.")

    return np.linspace(lo, hi, 100)


def run_linear_regression(
    *,
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
    formula_check: bool,
    formula_text: str,
    formula_latex: str,
    dependent_var: Optional[str],
    independent_vars: List[str],
    alpha_input: str,
    intercept: bool,
    graph_check: bool,
    graph_type: str,
    show_ci: bool,
    show_pi: bool,
    fit_to_obs: bool,
    x_range_text: str,
    round_digits: int = 4,
) -> Tuple[str, pd.DataFrame, Optional[Figure]]:
    """
    High-level controller used by the Linear Regression tab.

    Validates and parses the raw UI inputs, delegates the computation to the
    stats layer, and rounds the returned parameter table.

    Processing order (each step may raise ValueError):

    1. Choose the working dataframe from ``df`` / ``filtered_df``.
    2. Require a non-empty ``dependent_var`` and at least one entry in
       ``independent_vars``.
    3. When a graph is requested and ``graph_type`` is "Simple Regression",
       require exactly one independent variable.
    4. Parse ``alpha_input`` (a confidence level) into alpha.
    5. Parse ``x_range_text`` into an x vector, but only for a
       "Simple Regression" graph with ``fit_to_obs`` turned off; otherwise
       the stats layer receives ``x_vector=None``.

    All remaining arguments are forwarded to the stats layer unchanged
    (``graph_check`` is passed as ``create_graph``).

    Returns
    -------
    (summary_html, params_df_rounded, figure)
        The stats layer's result, with the parameter dataframe rounded to
        ``round_digits`` decimals.

    Any exceptions should be caught in the tab layer and turned into user-
    facing error messages.
    """
    data = _select_working_dataframe(df, filtered_df)

    if dependent_var is None or dependent_var == "":
        raise ValueError("Please select a dependent variable.")
    if not independent_vars:
        raise ValueError("Please select at least one independent variable.")

    # Both the predictor-count check and the x-range parsing hinge on this.
    simple_plot_requested = graph_check and graph_type == "Simple Regression"

    if simple_plot_requested and len(independent_vars) != 1:
        message = (
            "The 'Simple Regression' graph is only available when exactly one "
            "independent variable is selected."
        )
        raise ValueError(message)

    significance = _parse_confidence_level(alpha_input)

    # A custom x range only matters when the simple-regression plot is not
    # fitted to the observed data.
    custom_x = (
        _parse_range(x_range_text)
        if simple_plot_requested and not fit_to_obs
        else None
    )

    summary_html, params_df, fig = _run_linear_regression(
        df=data,
        formula_check=formula_check,
        formula_text=formula_text,
        formula_latex=formula_latex,
        dependent_var=dependent_var,
        independent_vars=independent_vars,
        alpha=significance,
        intercept=intercept,
        create_graph=graph_check,
        graph_type=graph_type,
        show_ci=show_ci,
        show_pi=show_pi,
        fit_to_obs=fit_to_obs,
        x_vector=custom_x,
    )

    # Rounding is applied here, not in the stats layer.
    return summary_html, params_df.round(round_digits), fig