| | import numpy as np |
| | import scipy.stats as stats |
| | import matplotlib.pyplot as plt |
| | import math |
| | from sklearn.metrics import roc_curve, roc_auc_score |
| | from typing import Tuple, Optional |
| | from sklearn.preprocessing import label_binarize |
| |
|
| | |
| | |
| |
|
| | |
| | |
def compute_midrank(
    x: np.ndarray
) -> np.ndarray:
    """Compute the 1-based midranks of a 1D array.

    Tied values all receive the average of the ranks they span, e.g.
    ``[3, 1, 2, 2]`` -> ``[4.0, 1.0, 2.5, 2.5]``.

    Args:
        x: a 1D numpy array (other array-likes are converted with
            ``np.asarray``).
    Returns:
        Float array of midranks, aligned with the element order of ``x``.

    Note:
        NaN never compares equal to anything, so each NaN forms its own
        tie group (NumPy sorts NaNs to the end, so they get the highest
        ranks). An element-by-element tie scan that starts at ``j = i`` and
        tests ``Z[j] == Z[i]`` never advances past a NaN; detecting group
        boundaries by comparing sorted neighbours avoids that hang.
    """
    x = np.asarray(x)
    N = len(x)
    if N == 0:
        return np.empty(0, dtype=float)

    J = np.argsort(x)
    Z = x[J]

    # A tie group begins at index 0 and wherever the sorted value changes.
    is_group_start = np.empty(N, dtype=bool)
    is_group_start[0] = True
    is_group_start[1:] = Z[1:] != Z[:-1]
    starts = np.flatnonzero(is_group_start)
    ends = np.append(starts[1:], N)  # exclusive end of every group

    # The 0-based midrank of the group [i, j) is (i + j - 1) / 2; the +1
    # converts it to the conventional 1-based rank.
    group_rank = 0.5 * (starts + ends - 1) + 1

    T2 = np.empty(N, dtype=float)
    # Scatter the per-group rank back to the original element positions.
    T2[J] = np.repeat(group_rank, ends - starts)
    return T2
| |
|
| |
|
def compute_midrank_weight(
    x: np.ndarray,
    sample_weight: np.ndarray
) -> np.ndarray:
    """Compute weighted midranks of a 1D array.

    The samples are sorted by value and their weights are accumulated in
    that order. Every element of a tie group (equal values) is assigned the
    mean of the cumulative weights over that group. With unit weights this
    equals the ordinary 1-based midrank.

    Args:
        x: a 1D numpy array of values (array-likes are converted).
        sample_weight: 1D array of per-sample weights, aligned with ``x``.
    Returns:
        Float array of weighted midranks, aligned with the order of ``x``.

    Note:
        NaN never compares equal to anything, so each NaN forms its own
        tie group. An element-by-element tie scan that starts at ``j = i``
        and tests ``Z[j] == Z[i]`` never advances past a NaN; detecting
        group boundaries by comparing sorted neighbours avoids that hang.
    """
    x = np.asarray(x)
    N = len(x)
    if N == 0:
        return np.empty(0, dtype=float)

    J = np.argsort(x)
    Z = x[J]
    cumulative_weight = np.cumsum(np.asarray(sample_weight)[J])

    # A tie group begins at index 0 and wherever the sorted value changes.
    is_group_start = np.empty(N, dtype=bool)
    is_group_start[0] = True
    is_group_start[1:] = Z[1:] != Z[:-1]
    starts = np.flatnonzero(is_group_start)
    ends = np.append(starts[1:], N)  # exclusive end of every group
    group_size = ends - starts

    # reduceat sums cumulative_weight over each [start, next_start) slice;
    # dividing by the group size gives the per-group mean.
    group_mean = np.add.reduceat(cumulative_weight, starts) / group_size

    T2 = np.empty(N, dtype=float)
    # Scatter the per-group value back to the original element positions.
    T2[J] = np.repeat(group_mean, group_size)
    return T2
| |
|
| |
|
def fastDeLong(
    predictions_sorted_transposed: np.ndarray,
    label_1_count: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fast DeLong estimate of the AUCs and of their covariance (unadjusted AUC).

    Args:
        predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
            sorted such that the examples with label "1" come first
        label_1_count: number of leading columns that belong to label "1"
    Returns:
        (AUC value per classifier, DeLong covariance)
    Reference:
        @article{sun2014fast,
          title={Fast Implementation of DeLong's Algorithm for
                 Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
          author={Xu Sun and Weichao Xu},
          journal={IEEE Signal Processing Letters},
          volume={21},
          number={11},
          pages={1389--1393},
          year={2014},
          publisher={IEEE}
        }
    """
    n_pos = label_1_count
    n_neg = predictions_sorted_transposed.shape[1] - n_pos
    n_classifiers = predictions_sorted_transposed.shape[0]
    pos_scores = predictions_sorted_transposed[:, :n_pos]
    neg_scores = predictions_sorted_transposed[:, n_pos:]

    # Midranks within the positives, within the negatives, and in the pooled
    # sample, computed independently for every classifier (row).
    rank_in_pos = np.empty([n_classifiers, n_pos], dtype=float)
    rank_in_neg = np.empty([n_classifiers, n_neg], dtype=float)
    rank_pooled = np.empty([n_classifiers, n_pos + n_neg], dtype=float)
    rows = zip(pos_scores, neg_scores, predictions_sorted_transposed)
    for row, (pos_row, neg_row, pooled_row) in enumerate(rows):
        rank_in_pos[row, :] = compute_midrank(pos_row)
        rank_in_neg[row, :] = compute_midrank(neg_row)
        rank_pooled[row, :] = compute_midrank(pooled_row)

    pooled_rank_of_pos = rank_pooled[:, :n_pos]
    pooled_rank_of_neg = rank_pooled[:, n_pos:]

    aucs = pooled_rank_of_pos.sum(axis=1) / n_pos / n_neg - float(n_pos + 1.0) / 2.0 / n_neg

    # Per-example components for the positive (v01) and negative (v10) samples.
    v01 = (pooled_rank_of_pos - rank_in_pos[:, :]) / n_neg
    v10 = 1.0 - (pooled_rank_of_neg - rank_in_neg[:, :]) / n_pos

    cov_pos = np.cov(v01)
    cov_neg = np.cov(v10)
    return aucs, cov_pos / n_pos + cov_neg / n_neg
| |
|
| |
|
def calc_pvalue(
    aucs: np.ndarray,
    sigma: np.ndarray
) -> float:
    """Return log10 of the two-sided p-value for the difference of two AUCs.

    Args:
        aucs: 1D array holding the two AUCs
        sigma: 2x2 DeLong covariance matrix of the two AUCs
    Returns:
        log10(pvalue)
    """
    # Contrast vector selecting "first AUC minus second AUC".
    contrast = np.array([[1, -1]])
    variance_of_difference = np.dot(contrast.dot(sigma), contrast.T)
    z = np.abs(np.diff(aucs)) / np.sqrt(variance_of_difference)

    # logsf is a natural log; dividing by ln(10) converts it to log10, and
    # adding log10(2) doubles the one-sided tail probability.
    log_one_sided_tail = stats.norm.logsf(z, loc=0, scale=1).item()
    return float(np.log10(2) + log_one_sided_tail / np.log(10))
| |
|
| |
|
| |
|
def compute_ground_truth_statistics(
    ground_truth: np.ndarray,
    sample_weight: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, int, Optional[np.ndarray]]:
    """Validate binary labels and compute the "positives first" ordering.

    Args:
        ground_truth: 1D array of labels; it must contain both 0 and 1 and
            nothing else (integer, float, unsigned or boolean dtypes are all
            accepted).
        sample_weight: optional per-sample weights aligned with
            ``ground_truth``.
    Returns:
        (order, label_1_count, ordered_sample_weight) where ``order`` is an
        index array that places every label-1 example before every label-0
        example, ``label_1_count`` is the number of label-1 examples and
        ``ordered_sample_weight`` is ``sample_weight`` permuted by ``order``
        (or None when no weights were given).
    Raises:
        AssertionError: if the labels are not exactly the set {0, 1}.
    """
    ground_truth = np.asarray(ground_truth)

    # Raised explicitly (instead of using an `assert` statement) so that the
    # validation is not stripped under `python -O`; the exception type is
    # kept for backward compatibility.
    if not np.array_equal(np.unique(ground_truth), [0, 1]):
        raise AssertionError(
            "ground_truth must contain both classes and only the labels 0 and 1"
        )

    # Sort on the boolean "is negative" mask instead of on `-ground_truth`:
    # unary minus raises TypeError for boolean arrays and wraps around for
    # unsigned integers (which would put the negatives first). A stable sort
    # keeps the original relative order inside each class.
    order = np.argsort(ground_truth == 0, kind="stable")
    label_1_count = int(ground_truth.sum())

    if sample_weight is None:
        ordered_sample_weight = None
    else:
        ordered_sample_weight = np.asarray(sample_weight)[order]

    return order, label_1_count, ordered_sample_weight
| |
|
| |
|
def delong_roc_variance(
    ground_truth: np.ndarray,
    predictions: np.ndarray
) -> Tuple[float, np.ndarray]:
    """
    Compute the ROC AUC and its DeLong variance for one set of predictions.

    Args:
        ground_truth: np.array of 0 and 1
        predictions: np.array of floats of the probability of being class 1
    Returns:
        (AUC, DeLong covariance as returned by fastDeLong for one classifier)
    """
    # No sample weights are used here, so the third return value is ignored.
    order, label_1_count, _ = compute_ground_truth_statistics(ground_truth, None)

    # fastDeLong expects one row per classifier with the positives first.
    single_row = predictions[order][np.newaxis, :]
    aucs, delongcov = fastDeLong(single_row, label_1_count)

    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov
| |
|
| |
|
def delong_roc_test(
    ground_truth: np.ndarray,
    predictions_one: np.ndarray,
    predictions_two: np.ndarray
) -> float:
    """
    Compute log10(p-value) for the hypothesis that two ROC AUCs are different.

    Args:
        ground_truth: np.array of 0 and 1
        predictions_one: predictions of the first model,
            np.array of floats of the probability of being class 1
        predictions_two: predictions of the second model,
            np.array of floats of the probability of being class 1
    Returns:
        log10 of the two-sided p-value, as produced by calc_pvalue
    """
    order, label_1_count, _ = compute_ground_truth_statistics(ground_truth)

    # One row per model, columns reordered so the positives come first.
    both_models = np.vstack((predictions_one, predictions_two))
    aucs, delongcov = fastDeLong(both_models[:, order], label_1_count)
    return calc_pvalue(aucs, delongcov)
| |
|
| |
|
def roc_auc_ci_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 0.95) -> Tuple[float, np.ndarray]:
    """Return the ROC AUC with a normal-approximation confidence interval.

    Args:
        y_true: np.array of 0 and 1
        y_pred: np.array of scores for class 1
        alpha: confidence level of the interval (0.95 -> the 2.5 % and
            97.5 % quantiles are used)
    Returns:
        (auc, ci) where ci is a length-2 array [lower, upper] limited to [0, 1]
    """
    auc, auc_cov = delong_roc_variance(y_true, y_pred)
    auc_std = np.sqrt(auc_cov)

    if auc_std < 1e-10:
        # (Almost) zero variance: the interval collapses onto the estimate,
        # whatever its value (including exactly 0.0 or 1.0).
        ci = np.array([auc, auc])
    else:
        tail = (1 - alpha) / 2
        lower_upper_q = np.abs(np.array([0, 1]) - tail)
        ci = stats.norm.ppf(lower_upper_q, loc=auc, scale=auc_std)

    # An AUC cannot leave [0, 1], so cut the interval off at those bounds.
    ci = np.clip(ci, 0, 1)

    return auc, ci
| |
|
| |
|
def bootstrap_auc_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[float, np.ndarray]:
    """Estimate the AUC and its 95 % interval by bootstrap resampling.

    Args:
        y_true: array of labels
        y_score: array of scores aligned with y_true
        n_bootstraps: number of resamples drawn (with replacement)
        seed: seed of the random generator used for resampling
    Returns:
        (mean bootstrap AUC, array with the 2.5th and 97.5th percentiles)
    """
    rng = np.random.RandomState(seed)
    resampled_aucs = []

    for _ in range(n_bootstraps):
        picks = rng.randint(0, len(y_true), len(y_true))
        labels = y_true[picks]
        # A resample containing a single class has no defined AUC; skip it.
        if len(np.unique(labels)) >= 2:
            resampled_aucs.append(roc_auc_score(labels, y_score[picks]))

    print("This gives an empirical confidence interval of the AUC using bootstrapping. It may differ slightly due to randomness.")

    auc_samples = np.array(resampled_aucs)
    return np.mean(auc_samples), np.percentile(auc_samples, [2.5, 97.5])
| | |
| |
|
def bootstrap_roc_curve_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Bootstrap a mean ROC curve with a pointwise 95 % band.

    Args:
        y_true: array of labels
        y_score: array of scores aligned with y_true
        n_bootstraps: number of resamples drawn (with replacement)
        seed: seed of the random generator used for resampling
    Returns:
        (fpr grid of 100 points on [0, 1], mean TPR, 2.5th percentile TPR,
        97.5th percentile TPR), all evaluated on the fpr grid
    """
    rng = np.random.RandomState(seed)
    fpr_grid = np.linspace(0, 1, 100)
    curves = []

    for _ in range(n_bootstraps):
        picks = rng.randint(0, len(y_true), len(y_true))
        labels = y_true[picks]
        # A resample containing a single class has no ROC curve; skip it.
        if len(np.unique(labels)) < 2:
            continue

        fpr_boot, tpr_boot, _thresholds = roc_curve(labels, y_score[picks])
        # Put every resampled curve on the common FPR grid.
        tpr_on_grid = np.interp(fpr_grid, fpr_boot, tpr_boot)
        tpr_on_grid[0] = 0.0  # force the curve through the origin
        curves.append(tpr_on_grid)

    stacked = np.array(curves)
    mean_curve = np.mean(stacked, axis=0)
    lower_band = np.percentile(stacked, 2.5, axis=0)
    upper_band = np.percentile(stacked, 97.5, axis=0)

    return fpr_grid, mean_curve, lower_band, upper_band
| |
|
| |
|
def _prepare_targets_scores(
    y_true: np.ndarray,
    y_score: np.ndarray
):
    """
    Detect the task type and return (Y_onehot, Y_score_2D, n_classes, task_name).

    * y_true with more than one dimension -> "multilabel": labels are cast to
      int, scores are passed through, n_classes is the number of columns.
    * 1D y_true whose maximum is 1 -> "binary": TWO columns (neg / pos) are
      always returned so that a downstream loop over classes [0, 1] is valid.
      1D or single-column scores are expanded to [1 - p, p]; wider score
      arrays are passed through unchanged.
    * any other 1D y_true -> "multiclass": labels are one-hot encoded with
      label_binarize over classes 0 .. max(y_true); scores are passed through.
    """
    if y_true.ndim != 1:
        return y_true.astype(int), y_score, y_true.shape[1], "multilabel"

    n_classes = int(np.max(y_true)) + 1

    if n_classes != 2:
        one_hot = label_binarize(y_true, classes=list(range(n_classes)))
        return one_hot, y_score, n_classes, "multiclass"

    # Binary: build the (negative, positive) indicator columns.
    one_hot = np.column_stack([1 - y_true, y_true])

    if y_score.ndim == 1:
        positive_prob = y_score
    elif y_score.shape[1] == 1:
        positive_prob = y_score[:, 0]
    else:
        # Already one column per class: use as given.
        return one_hot, y_score, n_classes, "binary"

    scores_2d = np.column_stack([1 - positive_prob, positive_prob])
    return one_hot, scores_2d, n_classes, "binary"
| |
|
| |
|
| |
|
def plot_roc_with_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    save_path: Optional[str] = None,
    fig_title: Optional[str] = None,
    n_bootstraps: int = 1000,
    seed: int = 42,
) -> None:
    """
    Draw ROC curves (with 95 % CI) for binary / multiclass / multilabel setups
    on one canvas, one sub-plot per class.

    Each panel shows the mean bootstrapped ROC curve with its pointwise
    95 % band; the legend carries the DeLong AUC and its confidence interval.

    Parameters
    ----------
    y_true : array-like
        * binary / multiclass : shape (N,)
        * multilabel          : shape (N, C)
    y_score : array-like
        probability scores – same shape as y_true except for binary
        where shape can be (N,) or (N, 2) (class-1 prob in column 1)
    save_path : str | None
        if given, the figure is saved to this path (300 dpi) and then
        closed; otherwise it is shown with ``plt.show()``.
    fig_title : str | None
        custom super-title. When omitted, the binary case uses a title that
        reports the positive-class AUC and CI; multiclass / multilabel
        figures default to "ROC curves".
    n_bootstraps : int
        number of bootstrap resamples used for each class' ROC band.
    seed : int
        random seed passed to the bootstrap of every class.
    """
    Y, S, C, task = _prepare_targets_scores(y_true, y_score)

    # Near-square grid that is large enough to hold one panel per class.
    n_rows = math.ceil(math.sqrt(C))
    n_cols = math.ceil(C / n_rows)
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4.5 * n_cols, 4.5 * n_rows), dpi=200,
        squeeze=False
    )

    for cls in range(C):
        y_true_cls = Y[:, cls]
        y_score_cls = S[:, cls]

        fpr, tpr_mean, tpr_low, tpr_up = bootstrap_roc_curve_ci(
            y_true_cls, y_score_cls,
            n_bootstraps=n_bootstraps, seed=seed
        )
        auc, ci = roc_auc_ci_score(y_true_cls, y_score_cls)
        ci = ci.tolist()
        r, c = divmod(cls, n_cols)
        ax = axes[r][c]

        # Mean curve, bootstrap band and the chance diagonal.
        ax.plot(fpr, tpr_mean, lw=1.5, label=f"AUC = {auc:.3f}, CI = {ci[0]:.3f} - {ci[1]:.3f}")
        ax.fill_between(fpr, tpr_low, tpr_up, alpha=.25, label="95 % CI")
        ax.plot([0, 1], [0, 1], "k--", lw=.8)

        ax.set_title(f"Class {cls}")
        ax.set_xlabel("FPR")
        ax.set_ylabel("TPR")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.grid(ls="--", alpha=.4)
        ax.legend(fontsize=8, loc="lower right")

        for side in ["top", "right"]:
            ax.spines[side].set_visible(False)

    # Remove the unused panels of the grid.
    for extra in range(C, n_rows * n_cols):
        r, c = divmod(extra, n_cols)
        fig.delaxes(axes[r][c])

    if fig_title:
        title = fig_title
    elif task == "binary":
        # `auc` / `ci` still hold the values of the last class processed,
        # which for the binary task is class 1 (the positive class).
        title = f"ROC Curve (AUC = {auc:.3f}, 95% CI = {ci[0]:.3f} - {ci[1]:.3f})"
    else:
        # With several classes a single AUC in the super-title would only
        # describe the last panel, so use a neutral title instead.
        title = "ROC curves"
    fig.suptitle(title, fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])

    if save_path:
        fig.savefig(save_path, dpi=300)
        print(f"Saved ROC panel ➜ {save_path}")
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate figures in memory.
        plt.close(fig)
    else:
        plt.show()
| |
|
| |
|
| |
|
| |
|