File size: 4,991 Bytes
798602c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | from __future__ import annotations
from typing import List, Optional, Sequence, Tuple
from matplotlib.figure import Figure
import numpy as np
import pandas as pd
from core.linear_regression import run_linear_regression as _run_linear_regression
def _select_working_dataframe(
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
) -> pd.DataFrame:
    """
    Choose the dataframe the regression should operate on.

    A non-empty ``filtered_df`` takes precedence; otherwise the original
    ``df`` is used.

    Returns
    -------
    pd.DataFrame
        ``filtered_df`` when it is present and has rows, else ``df``.

    Raises
    ------
    ValueError
        If ``df`` is None, or if there is no usable filtered dataframe and
        ``df`` itself is empty.
    """
    # A missing base dataset is an error even if a filtered frame was passed.
    if df is None:
        raise ValueError("No dataset loaded.")

    has_filtered_rows = filtered_df is not None and not filtered_df.empty
    if has_filtered_rows:
        return filtered_df

    if not df.empty:
        return df
    raise ValueError("The dataset is empty.")
def _parse_confidence_level(text: str) -> float:
    """
    Convert a confidence-level string such as '0.95' into a significance level.

    Returns
    -------
    alpha : float
        One minus the parsed confidence level (e.g. roughly 0.05 for '0.95').

    Raises
    ------
    ValueError
        If the text is blank, is not numeric, or does not lie strictly
        between 0 and 1.
    """
    cleaned = str(text).strip()
    if cleaned == "":
        raise ValueError("Confidence level is required (e.g. 0.95).")

    try:
        confidence = float(cleaned)
    except ValueError as exc:
        raise ValueError("Confidence level must be a numeric value between 0 and 1.") from exc

    # Keep the chained comparison: NaN fails it, so 'nan' is rejected here too.
    inside_open_unit_interval = 0 < confidence < 1
    if not inside_open_unit_interval:
        raise ValueError("Confidence level must be between 0 and 1 (e.g. 0.95).")

    # The result is used as alpha (significance level), not as the
    # confidence level itself.
    alpha = 1.0 - confidence
    return alpha
def _parse_range(text: str) -> Optional[np.ndarray]:
    """
    Parse a range string like '0, 10' into a numpy array suitable for predictions.

    Returns
    -------
    np.ndarray or None
        If the string is empty or only whitespace, returns None.
        Otherwise returns a 1-D array of 100 evenly spaced values between
        the parsed minimum and maximum (both endpoints included).

    Raises
    ------
    ValueError
        If the text is not exactly two comma-separated values, if either
        value is not numeric, if either value is NaN or infinite, or if the
        minimum is not strictly less than the maximum.
    """
    s = str(text).strip()
    if not s:
        return None

    parts = s.split(",")
    if len(parts) != 2:
        raise ValueError("Range must have the form 'min, max'.")

    try:
        lo = float(parts[0].strip())
        hi = float(parts[1].strip())
    except ValueError as exc:
        raise ValueError("Range values must be numeric (e.g. '0, 10').") from exc

    # float() happily parses 'nan', 'inf' and overflowing literals such as
    # '1e999'. NaN slips past the ordering check below (every comparison with
    # NaN is False), and an infinite bound makes linspace produce NaN/inf
    # values, so reject non-finite bounds explicitly.
    if not (np.isfinite(lo) and np.isfinite(hi)):
        raise ValueError("Range values must be finite numbers (e.g. '0, 10').")

    if lo >= hi:
        raise ValueError("Range minimum must be strictly less than the maximum.")

    return np.linspace(lo, hi, 100)
def run_linear_regression(
    *,
    df: Optional[pd.DataFrame],
    filtered_df: Optional[pd.DataFrame],
    formula_check: bool,
    formula_text: str,
    formula_latex: str,
    dependent_var: Optional[str],
    independent_vars: List[str],
    alpha_input: str,
    intercept: bool,
    graph_check: bool,
    graph_type: str,
    show_ci: bool,
    show_pi: bool,
    fit_to_obs: bool,
    x_range_text: str,
    round_digits: int = 4,
) -> Tuple[str, pd.DataFrame, Optional[Figure]]:
    """
    Validate the Linear Regression tab's raw inputs and run the regression.

    Inputs are checked in this order: dataset selection, dependent variable,
    independent variables, graph / variable-count compatibility, confidence
    level, and (only when it will be used) the X range. The validated values
    are then passed to the stats-layer ``_run_linear_regression`` and the
    parameter table it returns is rounded to ``round_digits`` decimals.

    Returns
    -------
    tuple
        ``(summary_html, params_df_rounded, figure)``; the summary and figure
        are passed through unchanged from the stats layer.

    Raises
    ------
    ValueError
        If no usable dataset is available, a variable selection is missing,
        the 'Simple Regression' graph is requested with a number of
        independent variables other than one, or the confidence level / X
        range text is invalid. Exceptions are not caught here; the tab layer
        is expected to turn them into user-facing messages.
    """
    data = _select_working_dataframe(df, filtered_df)

    if dependent_var is None or dependent_var == "":
        raise ValueError("Please select a dependent variable.")
    if not independent_vars:
        raise ValueError("Please select at least one independent variable.")

    # Both the variable-count check and the X-range parsing hinge on whether
    # the "Simple Regression" graph has been requested.
    simple_graph_requested = graph_check and graph_type == "Simple Regression"

    if simple_graph_requested and len(independent_vars) != 1:
        raise ValueError(
            "The 'Simple Regression' graph is only available when exactly one "
            "independent variable is selected."
        )

    # The UI supplies a confidence level; the stats layer is given alpha.
    significance = _parse_confidence_level(alpha_input)

    # The X range is parsed only when it is needed: a Simple Regression graph
    # that is not fitted to the observed data.
    if simple_graph_requested and not fit_to_obs:
        prediction_grid = _parse_range(x_range_text)
    else:
        prediction_grid = None

    stats_kwargs = {
        "df": data,
        "formula_check": formula_check,
        "formula_text": formula_text,
        "formula_latex": formula_latex,
        "dependent_var": dependent_var,
        "independent_vars": independent_vars,
        "alpha": significance,
        "intercept": intercept,
        "create_graph": graph_check,
        "graph_type": graph_type,
        "show_ci": show_ci,
        "show_pi": show_pi,
        "fit_to_obs": fit_to_obs,
        "x_vector": prediction_grid,
    }
    summary_html, raw_params, fig = _run_linear_regression(**stats_kwargs)

    # Rounding happens here, not in the stats layer.
    return summary_html, raw_params.round(round_digits), fig
|