File size: 12,941 Bytes
714cf46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
from sklearn.metrics import roc_curve, roc_auc_score
from typing import Tuple, Optional
from sklearn.preprocessing import label_binarize

# from https://github.com/PatWalters/comparing_classifiers/blob/master/delong_ci.py
# from https://github.com/yandexdataschool/roc_comparison/blob/master/compare_auc_delong_xu.py

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(
        x: np.ndarray
    ) -> np.ndarray:
    """Return the 1-based midranks of *x*.

    Tied values all receive the average of the rank positions they occupy.

    Args:
       x - a 1D numpy array
    Returns:
       array of midranks, aligned with the original order of `x`
    """
    sort_idx = np.argsort(x)
    sorted_vals = x[sort_idx]
    n = len(x)
    midranks_sorted = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # Find the extent [start, stop) of the current tie group.
        stop = start
        while stop < n and sorted_vals[stop] == sorted_vals[start]:
            stop += 1
        # Average of the 0-based rank positions occupied by the group.
        midranks_sorted[start:stop] = 0.5 * (start + stop - 1)
        start = stop
    result = np.empty(n, dtype=float)
    # Scatter back to the original order; +1 converts to the 1-based
    # ranks used by the AUC formula in the DeLong paper.
    result[sort_idx] = midranks_sorted + 1
    return result


def compute_midrank_weight(
        x: np.ndarray,
        sample_weight: np.ndarray
    ) -> np.ndarray:
    """Return weighted midranks of *x*.

    Each tie group is assigned the mean cumulative sample weight over the
    positions it occupies (reduces to plain midranks for unit weights).

    Args:
       x - a 1D numpy array
       sample_weight - per-example weights, same length as `x`
    Returns:
       array of weighted midranks, aligned with the original order of `x`
    """
    sort_idx = np.argsort(x)
    sorted_vals = x[sort_idx]
    cum_weight = np.cumsum(sample_weight[sort_idx])
    n = len(x)
    ranks_sorted = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # Extent [start, stop) of the current tie group.
        stop = start
        while stop < n and sorted_vals[stop] == sorted_vals[start]:
            stop += 1
        ranks_sorted[start:stop] = cum_weight[start:stop].mean()
        start = stop
    result = np.empty(n, dtype=float)
    result[sort_idx] = ranks_sorted
    return result


def fastDeLong(
        predictions_sorted_transposed: np.ndarray,
        label_1_count: int
    ) -> Tuple[np.ndarray, np.ndarray]:
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
       label_1_count: number of label-"1" (positive) examples; they occupy
          the first `label_1_count` columns of `predictions_sorted_transposed`
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count  # number of positive examples
    n = predictions_sorted_transposed.shape[1] - m  # number of negative examples
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]  # number of classifiers compared

    # tx/ty: midranks within the positive/negative groups;
    # tz: midranks over all examples jointly. One row per classifier.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    # AUC from the sum of the positives' global midranks (rank-sum identity,
    # as derived in the referenced paper).
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    # Structural components V01/V10 of the paper; their empirical
    # covariances across classifiers combine into the DeLong estimate.
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)  # covariance contribution from the positive components
    sy = np.cov(v10)  # covariance contribution from the negative components
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(
        aucs: np.ndarray,
        sigma: np.ndarray
    ) -> float:
    """Computes log10 of the two-sided p-value for AUC equality.

    Args:
       aucs: 1D array of the two AUCs being compared
       sigma: 2x2 DeLong covariance matrix of the AUCs
    Returns:
       log10(pvalue)
    """
    contrast = np.array([[1, -1]])
    # Variance of the AUC difference under the contrast.
    diff_variance = contrast @ sigma @ contrast.T
    z_score = np.abs(np.diff(aucs)) / np.sqrt(diff_variance)
    # Two-sided p-value, kept in log space: log10(2 * P(Z > z)).
    log10_pvalue = np.log10(2) + stats.norm.logsf(z_score, loc=0, scale=1).item() / np.log(10)
    return float(log10_pvalue)



def compute_ground_truth_statistics(
        ground_truth: np.ndarray,
        sample_weight: Optional[np.ndarray] = None
    ) -> Tuple[np.ndarray, int, Optional[np.ndarray]]:
    """Order examples positives-first, as required by fastDeLong.

    Args:
       ground_truth: np.array of 0 and 1; both classes must be present
       sample_weight: optional per-example weights, same length as ground_truth
    Returns:
       (order, label_1_count, ordered_sample_weight) where `order` indexes
       the examples with all label-1 examples first, `label_1_count` is the
       number of positives, and `ordered_sample_weight` is `sample_weight`
       permuted by `order` (or None if no weights were given).
    Raises:
       ValueError: if ground_truth does not contain exactly the labels {0, 1}.
    """
    # Explicit validation rather than `assert`: asserts are stripped under
    # `python -O`, which would silently let malformed labels through.
    if not np.array_equal(np.unique(ground_truth), [0, 1]):
        raise ValueError(
            "ground_truth must contain both classes 0 and 1 and nothing else")
    # Sorting by -label puts all positives (label 1) before the negatives.
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    if sample_weight is None:
        ordered_sample_weight = None
    else:
        ordered_sample_weight = sample_weight[order]

    return order, label_1_count, ordered_sample_weight


def delong_roc_variance(
        ground_truth: np.ndarray,
        predictions: np.ndarray
    ) -> Tuple[float, np.ndarray]:
    """
    Computes ROC AUC variance for a single set of predictions.

    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    Returns:
       (auc, delong_covariance) for this single classifier
    """
    # Unweighted variant: sample weights are not used here.
    order, label_1_count, _ = compute_ground_truth_statistics(ground_truth, None)
    # Arrange scores as a (1, N) matrix with all positives first —
    # the layout fastDeLong expects.
    sorted_scores = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(sorted_scores, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(
        ground_truth: np.ndarray,
        predictions_one: np.ndarray,
        predictions_two: np.ndarray
    ) -> float:
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different.

    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    Returns:
       log10 of the two-sided p-value of the DeLong test.
    """
    # (Removed an unused `sample_weight = None` local: this unweighted
    # variant never forwarded it to compute_ground_truth_statistics.)
    order, label_1_count, _ = compute_ground_truth_statistics(ground_truth)
    # Stack both models' scores as rows, then reorder columns positives-first.
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)


def roc_auc_ci_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 0.95) -> Tuple[float, np.ndarray]:
    """AUC with an asymptotic-normal confidence interval via DeLong variance.

    Args:
       y_true: np.array of 0 and 1
       y_pred: np.array of floats of the probability of being class 1
       alpha: confidence level of the interval (default 95%)
    Returns:
       (auc, ci) where ci is a 2-element array [lower, upper] clipped to [0, 1]
    """
    auc, auc_cov = delong_roc_variance(y_true, y_pred)
    auc_std = np.sqrt(auc_cov)

    # Degenerate case: (near-)zero variance collapses the CI to a point.
    if auc_std < 1e-10:
        if auc == 1.0:
            point = 1.0
        elif auc == 0.0:
            point = 0.0
        else:
            point = auc
        return auc, np.array([point, point])

    # Two-tailed quantiles, e.g. [0.025, 0.975] for alpha = 0.95.
    tail = (1 - alpha) / 2
    quantiles = np.abs(np.array([0, 1]) - tail)
    ci = stats.norm.ppf(quantiles, loc=auc, scale=auc_std)

    # An AUC interval cannot extend outside [0, 1].
    ci = np.clip(ci, 0, 1)
    return auc, ci


def bootstrap_auc_ci(
        y_true: np.ndarray,
        y_score: np.ndarray,
        n_bootstraps: int = 1000,
        seed: int = 42
    ) -> Tuple[float, np.ndarray]:
    """Empirical (bootstrap) mean AUC and 95% percentile confidence interval.

    The interval is empirical, so it may differ slightly from analytic CIs;
    it is reproducible for a fixed `seed`.

    Args:
       y_true: np.array of 0 and 1
       y_score: np.array of floats of the probability of being class 1
       n_bootstraps: number of bootstrap resamples
       seed: RNG seed for reproducible resampling
    Returns:
       (mean AUC, np.array([2.5th percentile, 97.5th percentile]))
    Raises:
       ValueError: if no bootstrap resample contained both classes.
    """
    rng = np.random.RandomState(seed)
    aucs = []

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        # AUC is undefined for a single-class resample; skip it.
        if len(np.unique(y_true[indices])) < 2:
            continue
        y_true_boot = y_true[indices]
        y_score_boot = y_score[indices]
        aucs.append(roc_auc_score(y_true_boot, y_score_boot))

    # Guard: previously an empty list produced a NaN mean and a percentile
    # error (tiny or extremely imbalanced inputs). Fail loudly instead.
    if not aucs:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate an AUC confidence interval.")

    aucs = np.array(aucs)
    return np.mean(aucs), np.percentile(aucs, [2.5, 97.5])
    

def bootstrap_roc_curve_ci(
        y_true: np.ndarray,
        y_score: np.ndarray,
        n_bootstraps: int = 1000,
        seed: int = 42
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Bootstrap a pointwise 95% confidence band for the ROC curve.

    Each resample's ROC is interpolated onto a common 100-point FPR grid so
    that per-point mean and percentiles are well defined.

    Args:
       y_true: np.array of 0 and 1
       y_score: np.array of floats of the probability of being class 1
       n_bootstraps: number of bootstrap resamples
       seed: RNG seed for reproducible resampling
    Returns:
       (fpr_grid, tpr_mean, tpr_lower, tpr_upper), each of shape (100,)
    Raises:
       ValueError: if no bootstrap resample contained both classes.
    """
    rng = np.random.RandomState(seed)
    tpr_list = []
    fpr_linspace = np.linspace(0, 1, 100)

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        # The ROC curve is undefined for a single-class resample; skip it.
        if len(np.unique(y_true[indices])) < 2:
            continue
        y_true_boot = y_true[indices]
        y_score_boot = y_score[indices]

        fpr_boot, tpr_boot, _ = roc_curve(y_true_boot, y_score_boot)
        tpr_interp = np.interp(fpr_linspace, fpr_boot, tpr_boot)
        tpr_interp[0] = 0.0  # force the curve through the origin (0, 0)
        tpr_list.append(tpr_interp)

    # Guard: previously an empty list crashed np.mean/np.percentile with a
    # cryptic shape error. Fail loudly instead.
    if not tpr_list:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate a ROC confidence band.")

    tpr_arr = np.array(tpr_list)
    tpr_mean = np.mean(tpr_arr, axis=0)
    tpr_lower = np.percentile(tpr_arr, 2.5, axis=0)
    tpr_upper = np.percentile(tpr_arr, 97.5, axis=0)

    return fpr_linspace, tpr_mean, tpr_lower, tpr_upper


def _prepare_targets_scores(
    y_true: np.ndarray,
    y_score: np.ndarray
):
    """
    Detect task type & return (Y_onehot, Y_score_2D, n_classes, task_name)
    Works for binary, multiclass and multilabel.  For binary we make sure
    to return TWO columns (neg / pos) so that the downstream loop over
    classes [0, 1] is always valid.
    """
    # ---------- binary or multiclass (single-label) ----------
    if y_true.ndim == 1:
        n_classes = int(np.max(y_true)) + 1          # assumes labels start at 0
        if n_classes == 2:
            task_name = "binary"

            # --- one-hot targets (N, 2): [neg, pos] ------------
            y_true_1hot = np.column_stack([1 - y_true, y_true])

            # --- probability array  (N, 2): P(neg), P(pos) -----
            if y_score.ndim == 1:                    # shape (N,)
                y_score_2d = np.column_stack([1 - y_score, y_score])
            else:                                    # shape (N, k)
                if y_score.shape[1] == 1:            # (N, 1)
                    y_score_2d = np.column_stack([1 - y_score[:, 0], y_score[:, 0]])
                else:                                # already (N, 2)
                    y_score_2d = y_score

        else:                                        # -------- multiclass -------
            task_name = "multiclass"
            y_true_1hot = label_binarize(y_true, classes=list(range(n_classes)))
            y_score_2d  = y_score                      # expected shape (N, C)

    # ---------- multilabel (already one-hot) ------------------
    else:
        task_name  = "multilabel"
        n_classes  = y_true.shape[1]
        y_true_1hot = y_true.astype(int)
        y_score_2d  = y_score

    return y_true_1hot, y_score_2d, n_classes, task_name



def plot_roc_with_ci(
    y_true:  np.ndarray,
    y_score: np.ndarray,
    save_path: Optional[str] = None,
    fig_title: Optional[str] = None,
    n_bootstraps: int = 1000,
    seed: int = 42,
) -> None:
    """
    Draw ROC curves (with 95 % CI) for binary / multiclass / multilabel setups
    on one canvas with tidy sub-plots.

    Parameters
    ----------
    y_true : array-like
        * binary / multiclass : shape (N,)
        * multilabel         : shape (N, C)
    y_score : array-like
        probability scores – same shape as y_true except for binary
        where shape can be (N,) or (N, 2) (class-1 prob in column 1)
    save_path : str | None
        if given, the figure is saved (PNG via the extension) instead of
        being shown interactively.
    fig_title : str | None
        custom super-title.  When omitted, a summary built from the last
        plotted class's AUC/CI is used (see NOTE before the title below).
    n_bootstraps : int
        number of bootstrap resamples for the TPR confidence band.
    seed : int
        RNG seed for the bootstrap, making the band reproducible.
    """
    # Normalise to (N, C) one-hot targets and per-class probability columns.
    Y, S, C, task = _prepare_targets_scores(y_true, y_score)  # `task` unused here

    # -------- set up subplot grid -------------
    # Near-square grid with enough panels for C classes.
    n_rows = math.ceil(math.sqrt(C))
    n_cols = math.ceil(C / n_rows)
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4.5 * n_cols, 4.5 * n_rows), dpi=200,
        squeeze=False
    )

    # -------- iterate over classes -------------
    for cls in range(C):
        y_true_cls  = Y[:, cls]
        y_score_cls = S[:, cls]

        # Bootstrap TPR band evaluated on a fixed FPR grid.
        fpr, tpr_mean, tpr_low, tpr_up = bootstrap_roc_curve_ci(
            y_true_cls, y_score_cls,
            n_bootstraps=n_bootstraps, seed=seed
        )
        # DeLong-based AUC and CI used in the legend label.
        auc, ci = roc_auc_ci_score(y_true_cls, y_score_cls)
        ci = ci.tolist()
        # Map flat class index to (row, col) in the grid.
        r, c = divmod(cls, n_cols)
        ax = axes[r][c]

        # main ROC and band
        ax.plot(fpr, tpr_mean, lw=1.5, label=f"AUC = {auc:.3f}, CI = {ci[0]:.3f} - {ci[1]:.3f}")
        ax.fill_between(fpr, tpr_low, tpr_up, alpha=.25, label="95 % CI")
        ax.plot([0, 1], [0, 1], "k--", lw=.8)  # chance diagonal

        # cosmetics
        ax.set_title(f"Class {cls}")
        ax.set_xlabel("FPR")
        ax.set_ylabel("TPR")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.grid(ls="--", alpha=.4)
        ax.legend(fontsize=8, loc="lower right")

        # drop spines
        for side in ["top", "right"]:
            ax.spines[side].set_visible(False)

    # hide empty panels if any
    for extra in range(C, n_rows * n_cols):
        r, c = divmod(extra, n_cols)
        fig.delaxes(axes[r][c])

    # NOTE(review): when fig_title is None, `auc`/`ci` here are the values
    # left over from the LAST loop iteration, i.e. the last class only —
    # misleading for multiclass/multilabel figures; consider a neutral
    # default title instead.
    if fig_title:
        title = fig_title
    else:
        title = f"ROC Curve (AUC = {auc:.3f}, 95% CI = {ci[0]:.3f} - {ci[1]:.3f})"
    fig.suptitle(title, fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])  # leave room for the suptitle

    if save_path:
        fig.savefig(save_path, dpi=300)
        print(f"Saved ROC panel ➜ {save_path}")
    else:
        plt.show()