File size: 11,606 Bytes
798602c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
from __future__ import annotations

from typing import Optional, Tuple

import numpy as np
import pandas as pd

from core.estimation.inference.estimators import estimate_mean, estimate_sigma
from core.estimation.inference.ci import (
    ci_mean_analytic,
    ci_mean_bootstrap,
    ci_median_analytic,
    ci_median_bootstrap,
)
from core.estimation.inference.pi import (
    pi_mean,
    pi_median,
    pi_iqr,
    pi_bootstrap,
)
from core.estimation.graphical_analysis import (
    plot_histogram_with_overlays,
    plot_ecdf,
)


# ---------------------------------------------------------------------
# Utilities (aligned with inference_controller)
# ---------------------------------------------------------------------
def select_distribution(mean_estimator: str, sigma_estimator: str) -> str:
    """Return the reference-distribution name for an estimator pairing.

    The result is ``"t"`` only when ``mean_estimator`` is ``"Sample Mean"``
    and ``sigma_estimator`` is ``"Deviation (1 ddof)"``; every other
    combination yields ``"norm"``.
    """
    t_pairing = ("Sample Mean", "Deviation (1 ddof)")
    chosen = (mean_estimator, sigma_estimator)
    return "t" if chosen == t_pairing else "norm"


def validate_deviation_estimator(*, sigma_estimator: str, n: int):
    """Reject the range-based sigma estimator for samples larger than 25.

    Args:
        sigma_estimator: Name of the selected deviation estimator.
        n: Sample size.

    Raises:
        ValueError: If ``sigma_estimator`` is ``"Range (bias corrected)"``
            and ``n`` is greater than 25.
    """
    range_selected = sigma_estimator == "Range (bias corrected)"
    if not (range_selected and n > 25):
        return

    message = (
        "Range-based confidence intervals require n ≤ 25. "
        "Use another estimator or bootstrap."
    )
    raise ValueError(message)


def _prepare_series(

    df: pd.DataFrame,

    column: str,

    weights_col: Optional[str],

) -> tuple[np.ndarray, Optional[np.ndarray]]:
    if df is None:
        raise ValueError("No data loaded. Please load a dataset first.")

    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataframe.")

    series = df[column].dropna()
    if series.empty:
        raise ValueError(f"Column '{column}' has no non-missing values.")

    weights = None
    if weights_col is not None:
        if weights_col not in df.columns:
            raise ValueError(
                f"Weights column '{weights_col}' not found in the dataframe."
            )
        weights_series = df[weights_col].reindex(series.index).dropna()
        common_idx = series.index.intersection(weights_series.index)
        series = series.loc[common_idx]
        weights_series = weights_series.loc[common_idx]
        weights = weights_series.to_numpy()

    return series.to_numpy(), weights


def run_graphical_analysis(
    *,
    df: pd.DataFrame,
    column: str,
    graph_type: str,
    # Histogram / PMF controls
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    # Estimators
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights_col: Optional[str],
    # Normal μ source
    normal_mu_source: str,
    # Bootstrap options
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    # CI/PI confidence level
    ci_pi_conf_level: float,
    # ECDF controls
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
):
    """Prepare the data for ``column`` and dispatch to the requested plot.

    The column (and optional weights) is extracted via ``_prepare_series``,
    the CI/PI confidence level is checked, and then ``graph_type`` selects
    the runner:

    * ``"Histogram"`` or ``"Empirical Probability Mass Function"`` ->
      ``_run_hist_or_pmf``
    * ``"Empirical Cumulative Distribution Function (ECDF)"`` -> ``_run_ecdf``

    Returns:
        Whatever the selected runner returns.

    Raises:
        ValueError: If ``ci_pi_conf_level`` is not strictly between 0 and 1,
            if ``graph_type`` is not one of the names above, or for any
            error raised by ``_prepare_series`` or the selected runner.
    """
    values, obs_weights = _prepare_series(df, column, weights_col)

    level_in_range = 0.0 < ci_pi_conf_level < 1.0
    if not level_in_range:
        raise ValueError("Confidence level for CI/PI must be in (0, 1).")

    # Settings that both plot families consume.
    shared = dict(
        data=values,
        var_name=column,
        mean_estimator=mean_estimator,
        sigma_estimator=sigma_estimator,
        trim_param=trim_param,
        winsor_limits=winsor_limits,
        weights=obs_weights,
        normal_mu_source=normal_mu_source,
    )

    if graph_type == "Empirical Cumulative Distribution Function (ECDF)":
        return _run_ecdf(
            ecdf_add_conf=ecdf_add_conf,
            ecdf_conf_level=ecdf_conf_level,
            ecdf_add_normal=ecdf_add_normal,
            **shared,
        )

    if graph_type not in ("Histogram", "Empirical Probability Mass Function"):
        raise ValueError(f"Unknown graph type: {graph_type}")

    return _run_hist_or_pmf(
        graph_type=graph_type,
        add_kde=add_kde,
        add_data=add_data,
        add_normal=add_normal,
        add_ci=add_ci,
        ci_choice=ci_choice,
        add_pi=add_pi,
        pi_choice=pi_choice,
        median_estimator=median_estimator,
        bootstrap_mean=bootstrap_mean,
        bootstrap_median=bootstrap_median,
        bootstrap_sigma=bootstrap_sigma,
        bootstrap_prediction=bootstrap_prediction,
        bootstrap_samples=bootstrap_samples,
        ci_pi_conf_level=ci_pi_conf_level,
        **shared,
    )


def _run_hist_or_pmf(
    *,
    data: np.ndarray,
    var_name: str,
    graph_type: str,
    add_kde: bool,
    add_data: bool,
    add_normal: bool,
    add_ci: bool,
    ci_choice: str,
    add_pi: bool,
    pi_choice: str,
    mean_estimator: str,
    median_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
    bootstrap_mean: bool,
    bootstrap_median: bool,
    bootstrap_sigma: bool,
    bootstrap_prediction: bool,
    bootstrap_samples: int,
    ci_pi_conf_level: float,
):
    """Build a histogram / empirical-PMF figure with optional overlays.

    Computes only the overlays that were requested and passes them to
    ``plot_histogram_with_overlays``:

    * Normal overlay (``add_normal``): ``hat_mu`` comes from
      ``estimate_mean`` when ``normal_mu_source`` is ``"Mean-based CI"``,
      otherwise from the sample median; ``hat_sigma`` comes from
      ``estimate_sigma``.
    * Confidence intervals (``add_ci``): ``ci_choice`` of ``"Mean"`` or
      ``"Median"`` computes only that interval; any other value computes
      both. ``bootstrap_mean`` / ``bootstrap_median`` switch each interval
      between its bootstrap and analytic variant.
    * Prediction interval (``add_pi``): selected by ``pi_choice``
      (``"Mean"``, ``"Median"``, ``"IQR"`` or ``"Bootstrap"``).

    The significance level used for all intervals is
    ``1 - ci_pi_conf_level``. ``median_estimator`` and ``bootstrap_sigma``
    are accepted for interface compatibility but are not read here.

    Returns:
        The figure returned by ``plot_histogram_with_overlays``.

    Raises:
        ValueError: If ``validate_deviation_estimator`` rejects the
            estimator for this sample size, if ``pi_choice`` is
            ``"Bootstrap"`` while ``bootstrap_prediction`` is false, or if
            ``pi_choice`` is not a recognised name.
    """
    alpha = 1.0 - ci_pi_conf_level

    validate_deviation_estimator(
        sigma_estimator=sigma_estimator,
        n=len(data),
    )

    ci_mean_interval = None
    ci_median_interval = None
    pi_interval = None
    hat_mu = None
    hat_sigma = None

    # --- Parameters for Normal overlay ---
    if add_normal:
        if normal_mu_source == "Mean-based CI":
            hat_mu = estimate_mean(
                data,
                mean_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        else:
            hat_mu = float(np.median(data))

        hat_sigma = estimate_sigma(
            data=data,
            estimator=sigma_estimator,
        )

    # The analytic mean CI and the mean-based PI share the same reference
    # distribution, so resolve it once.
    dist = select_distribution(mean_estimator, sigma_estimator)

    # --- Confidence intervals ---
    if add_ci:
        # Compute only what the user asked for: an unrequested interval
        # would cost a full bootstrap run and could raise, aborting the
        # interval that was actually requested.
        want_mean_ci = ci_choice != "Median"
        want_median_ci = ci_choice != "Mean"

        if want_mean_ci:
            if bootstrap_mean:
                ci_mean_interval = ci_mean_bootstrap(
                    data=data,
                    estimator=mean_estimator,
                    alpha=alpha,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                    B=bootstrap_samples,
                )
            else:
                ci_mean_interval = ci_mean_analytic(
                    data=data,
                    estimator=mean_estimator,
                    alpha=alpha,
                    dist=dist,
                    sigma_estimator=sigma_estimator,
                    trim_param=trim_param,
                    winsor_limits=winsor_limits,
                    weights=weights,
                )

        if want_median_ci:
            if bootstrap_median:
                ci_median_interval = ci_median_bootstrap(
                    data=data,
                    alpha=alpha,
                    B=bootstrap_samples,
                )
            else:
                ci_median_interval = ci_median_analytic(
                    data=data,
                    alpha=alpha,
                    sigma_estimator=sigma_estimator,
                )

    # --- Prediction intervals ---
    if add_pi:
        if pi_choice == "Mean":
            pi_interval = pi_mean(
                data=data,
                alpha=alpha,
                estimator=mean_estimator,
                dist=dist,
                sigma_estimator=sigma_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
        elif pi_choice == "Median":
            # pi_median only needs data, alpha and sigma_estimator.
            pi_interval = pi_median(
                data=data,
                alpha=alpha,
                sigma_estimator=sigma_estimator,
            )
        elif pi_choice == "IQR":
            pi_interval = pi_iqr(
                data=data,
                alpha=alpha,
            )
        elif pi_choice == "Bootstrap":
            if not bootstrap_prediction:
                raise ValueError(
                    "To use the Bootstrap prediction interval, enable the "
                    "'Bootstrap Prediction' option in the estimator settings."
                )
            pi_interval = pi_bootstrap(
                data=data,
                alpha=alpha,
                B=bootstrap_samples,
            )
        else:
            raise ValueError(
                f"Unknown prediction-interval choice: {pi_choice}"
            )

    return plot_histogram_with_overlays(
        data=data,
        graph_type=graph_type,
        var_name=var_name,
        add_kde=add_kde,
        add_data=add_data,
        add_normal=add_normal,
        hat_mu=hat_mu,
        hat_sigma=hat_sigma,
        ci_mean_interval=ci_mean_interval,
        ci_median_interval=ci_median_interval,
        pi_interval=pi_interval,
    )


def _run_ecdf(
    *,
    data: np.ndarray,
    var_name: str,
    ecdf_add_conf: bool,
    ecdf_conf_level: float,
    ecdf_add_normal: bool,
    mean_estimator: str,
    sigma_estimator: str,
    trim_param,
    winsor_limits,
    weights: Optional[np.ndarray],
    normal_mu_source: str,
):
    """Build an ECDF figure, optionally with a confidence band and Normal curve.

    ``plot_ecdf`` receives ``alpha = 1 - ecdf_conf_level``. When
    ``ecdf_add_normal`` is set, the Normal parameters are estimated first:
    ``hat_mu`` from ``estimate_mean`` if ``normal_mu_source`` is
    ``"Mean-based CI"`` (otherwise the sample median), and ``hat_sigma``
    from ``estimate_sigma``. Without the Normal overlay both are ``None``.

    Returns:
        The figure returned by ``plot_ecdf``.

    Raises:
        ValueError: If ``ecdf_conf_level`` is not strictly between 0 and 1,
            or if ``validate_deviation_estimator`` rejects the estimator
            for this sample size.
    """
    level_in_range = 0.0 < ecdf_conf_level < 1.0
    if not level_in_range:
        raise ValueError("ECDF confidence level must be in (0, 1).")

    significance = 1.0 - ecdf_conf_level

    validate_deviation_estimator(sigma_estimator=sigma_estimator, n=len(data))

    mu_hat = sigma_hat = None
    if ecdf_add_normal:
        centre_on_mean = normal_mu_source == "Mean-based CI"
        mu_hat = (
            estimate_mean(
                data,
                mean_estimator,
                trim_param=trim_param,
                winsor_limits=winsor_limits,
                weights=weights,
            )
            if centre_on_mean
            else float(np.median(data))
        )
        sigma_hat = estimate_sigma(data=data, estimator=sigma_estimator)

    return plot_ecdf(
        data=data,
        var_name=var_name,
        alpha=significance,
        add_conf_band=ecdf_add_conf,
        add_normal=ecdf_add_normal,
        hat_mu=mu_hat,
        hat_sigma=sigma_hat,
    )