# Upload residue from huggingface_hub (commit 714cf46, user nikraf) —
# kept as a comment so the module remains valid Python.
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
from sklearn.metrics import roc_curve, roc_auc_score
from typing import Tuple, Optional
from sklearn.preprocessing import label_binarize
# from https://github.com/PatWalters/comparing_classifiers/blob/master/delong_ci.py
# from https://github.com/yandexdataschool/roc_comparison/blob/master/compare_auc_delong_xu.py
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(
    x: np.ndarray
) -> np.ndarray:
    """Compute 1-based midranks of a 1D array.

    Tied values all receive the mean of the rank positions they span,
    as required by the AUC formula in the DeLong paper.

    Args:
        x: a 1D numpy array of scores.
    Returns:
        Array of midranks aligned with the original order of ``x``.
    """
    order = np.argsort(x)
    sorted_x = x[order]
    n = len(x)
    ranks_sorted = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # advance past the run of values tied with sorted_x[start]
        stop = start
        while stop < n and sorted_x[stop] == sorted_x[start]:
            stop += 1
        # mean of the 0-based positions start..stop-1
        ranks_sorted[start:stop] = 0.5 * (start + stop - 1)
        start = stop
    # scatter back to the original positions; the +1 converts the 0-based
    # Python indices to the 1-based ranks used in the paper's formula
    midranks = np.empty(n, dtype=float)
    midranks[order] = ranks_sorted + 1
    return midranks
def compute_midrank_weight(
    x: np.ndarray,
    sample_weight: np.ndarray
) -> np.ndarray:
    """Compute sample-weighted midranks of a 1D array.

    Each run of tied values receives the mean cumulative weight over
    that run (the weighted analogue of a midrank).

    Args:
        x: a 1D numpy array of scores.
        sample_weight: per-element weights, same length as ``x``.
    Returns:
        Array of weighted midranks aligned with the original order of ``x``.
    """
    order = np.argsort(x)
    sorted_x = x[order]
    cum_weight = np.cumsum(sample_weight[order])
    n = len(x)
    ranks_sorted = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # find the extent of the current run of tied values
        stop = start
        while stop < n and sorted_x[stop] == sorted_x[start]:
            stop += 1
        ranks_sorted[start:stop] = cum_weight[start:stop].mean()
        start = stop
    # scatter back to the original element positions
    midranks = np.empty(n, dtype=float)
    midranks[order] = ranks_sorted
    return midranks
def fastDeLong(
    predictions_sorted_transposed: np.ndarray,
    label_1_count: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fast DeLong method for the AUC and its covariance across classifiers.

    Args:
        predictions_sorted_transposed: 2D array [n_classifiers, n_examples],
            columns ordered so the ``label_1_count`` positive examples come first.
        label_1_count: number of positive (label 1) examples.
    Returns:
        (AUC values per classifier, DeLong covariance matrix)
    Reference:
        @article{sun2014fast,
        title={Fast Implementation of DeLong's Algorithm for
        Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
        author={Xu Sun and Weichao Xu},
        journal={IEEE Signal Processing Letters},
        volume={21},
        number={11},
        pages={1389--1393},
        year={2014},
        publisher={IEEE}
        }
    """
    num_pos = label_1_count
    num_neg = predictions_sorted_transposed.shape[1] - num_pos
    num_classifiers = predictions_sorted_transposed.shape[0]
    pos_scores = predictions_sorted_transposed[:, :num_pos]
    neg_scores = predictions_sorted_transposed[:, num_pos:]

    # midranks within positives (tx), within negatives (ty), and overall (tz)
    pos_ranks = np.empty((num_classifiers, num_pos), dtype=float)
    neg_ranks = np.empty((num_classifiers, num_neg), dtype=float)
    all_ranks = np.empty((num_classifiers, num_pos + num_neg), dtype=float)
    for r in range(num_classifiers):
        pos_ranks[r, :] = compute_midrank(pos_scores[r, :])
        neg_ranks[r, :] = compute_midrank(neg_scores[r, :])
        all_ranks[r, :] = compute_midrank(predictions_sorted_transposed[r, :])

    # rank-sum formula for the AUC (Mann-Whitney statistic)
    aucs = (all_ranks[:, :num_pos].sum(axis=1) / num_pos / num_neg
            - float(num_pos + 1.0) / 2.0 / num_neg)
    # structural components V10 / V01 from the paper
    v01 = (all_ranks[:, :num_pos] - pos_ranks[:, :]) / num_neg
    v10 = 1.0 - (all_ranks[:, num_pos:] - neg_ranks[:, :]) / num_pos
    delongcov = np.cov(v01) / num_pos + np.cov(v10) / num_neg
    return aucs, delongcov
def calc_pvalue(
    aucs: np.ndarray,
    sigma: np.ndarray
) -> float:
    """Compute log10 of the two-sided p-value for the AUC difference.

    Args:
        aucs: 1D array with the two AUCs being compared.
        sigma: 2x2 DeLong covariance matrix of those AUCs.
    Returns:
        log10(p-value) of the hypothesis that the AUCs differ.
    """
    contrast = np.array([[1, -1]])
    # variance of the AUC difference under the contrast vector
    diff_var = np.dot(np.dot(contrast, sigma), contrast.T)
    z = np.abs(np.diff(aucs)) / np.sqrt(diff_var)
    # two-sided test: p = 2 * P(Z > z), computed in log space for stability
    return float(np.log10(2) + stats.norm.logsf(z, loc=0, scale=1).item() / np.log(10))
def compute_ground_truth_statistics(
    ground_truth: np.ndarray,
    sample_weight: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, int, Optional[np.ndarray]]:
    """Order examples so that positives come first, as fastDeLong expects.

    Args:
        ground_truth: np.array of 0/1 labels; both classes must be present.
        sample_weight: optional per-example weights, reordered alongside.
    Returns:
        (order, label_1_count, ordered_sample_weight) where ``order`` indexes
        the examples with all label-1 examples first.
    Raises:
        ValueError: if ground_truth is not composed of exactly the labels
            {0, 1}. (Was an ``assert``, which is silently stripped under -O.)
    """
    if not np.array_equal(np.unique(ground_truth), [0, 1]):
        raise ValueError("ground_truth must contain both classes, labelled 0 and 1")
    # stable trick: negating makes argsort place the 1-labels first
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    if sample_weight is None:
        ordered_sample_weight = None
    else:
        ordered_sample_weight = sample_weight[order]
    return order, label_1_count, ordered_sample_weight
def delong_roc_variance(
    ground_truth: np.ndarray,
    predictions: np.ndarray
) -> Tuple[float, np.ndarray]:
    """
    Compute the ROC AUC and its DeLong variance for one set of predictions.

    Args:
        ground_truth: np.array of 0 and 1
        predictions: np.array of floats of the probability of being class 1
    Returns:
        (AUC, DeLong covariance of the AUC estimate)
    """
    order, positives, _ = compute_ground_truth_statistics(ground_truth, None)
    # fastDeLong expects shape (n_classifiers, n_examples) with positives first
    sorted_predictions = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(sorted_predictions, positives)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov
def delong_roc_test(
    ground_truth: np.ndarray,
    predictions_one: np.ndarray,
    predictions_two: np.ndarray
) -> float:
    """
    Compute log10(p-value) for the hypothesis that two ROC AUCs differ.

    Args:
        ground_truth: np.array of 0 and 1
        predictions_one: predictions of the first model,
            np.array of floats of the probability of being class 1
        predictions_two: predictions of the second model,
            np.array of floats of the probability of being class 1
    """
    order, positives, _ = compute_ground_truth_statistics(ground_truth)
    # stack the two models as rows, then reorder columns positives-first
    stacked_predictions = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(stacked_predictions, positives)
    return calc_pvalue(aucs, delongcov)
def roc_auc_ci_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 0.95) -> Tuple[float, np.ndarray]:
    """Return the ROC AUC with an asymptotic (DeLong) confidence interval.

    Args:
        y_true: np.array of 0/1 labels.
        y_pred: np.array of class-1 probabilities.
        alpha: confidence level of the interval (default 0.95).
    Returns:
        (auc, ci) where ci is a 2-element array [lower, upper] clipped to [0, 1].
    """
    auc, auc_cov = delong_roc_variance(y_true, y_pred)
    auc_std = np.sqrt(auc_cov)
    if auc_std < 1e-10:
        # Degenerate case: essentially zero sampling variability, so the
        # interval collapses to the point estimate (covers auc == 0 and 1 too).
        return auc, np.array([auc, auc])
    lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)
    interval = stats.norm.ppf(lower_upper_q, loc=auc, scale=auc_std)
    # the normal approximation can spill outside the valid AUC range
    return auc, np.clip(interval, 0.0, 1.0)
def bootstrap_auc_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[float, np.ndarray]:
    """Empirical AUC confidence interval via bootstrap resampling.

    Args:
        y_true: np.array of 0/1 labels.
        y_score: np.array of class-1 scores.
        n_bootstraps: number of bootstrap resamples.
        seed: RNG seed for reproducibility.
    Returns:
        (mean bootstrap AUC, np.array [2.5th, 97.5th] percentile bounds)
    Raises:
        ValueError: if no resample contained both classes (previously this
            produced a NaN mean and an opaque percentile failure).
    """
    rng = np.random.RandomState(seed)
    aucs = []
    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        # a resample with a single class has no defined ROC AUC; skip it
        if len(np.unique(y_true[indices])) < 2:
            continue
        aucs.append(roc_auc_score(y_true[indices], y_score[indices]))
    if not aucs:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate an AUC confidence interval.")
    print("This gives an empirical confidence interval of the AUC using bootstrapping. It may differ slightly due to randomness.")
    aucs = np.array(aucs)
    return np.mean(aucs), np.percentile(aucs, [2.5, 97.5])
def bootstrap_roc_curve_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Bootstrap a pointwise 95% confidence band around the ROC curve.

    Args:
        y_true: np.array of 0/1 labels.
        y_score: np.array of class-1 scores.
        n_bootstraps: number of bootstrap resamples.
        seed: RNG seed for reproducibility.
    Returns:
        (fpr_grid, tpr_mean, tpr_lower, tpr_upper) — each of length 100,
        with TPR statistics interpolated onto a common FPR grid.
    Raises:
        ValueError: if no resample contained both classes (previously this
            crashed inside np.mean/np.percentile on an empty array).
    """
    rng = np.random.RandomState(seed)
    fpr_grid = np.linspace(0, 1, 100)
    tpr_samples = []
    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        # roc_curve is undefined when only one class is present; skip
        if len(np.unique(y_true[indices])) < 2:
            continue
        fpr_boot, tpr_boot, _ = roc_curve(y_true[indices], y_score[indices])
        # interpolate each bootstrap curve onto the shared FPR grid
        tpr_interp = np.interp(fpr_grid, fpr_boot, tpr_boot)
        tpr_interp[0] = 0.0  # anchor every curve at the origin
        tpr_samples.append(tpr_interp)
    if not tpr_samples:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate a ROC confidence band.")
    tpr_arr = np.array(tpr_samples)
    tpr_mean = np.mean(tpr_arr, axis=0)
    tpr_lower = np.percentile(tpr_arr, 2.5, axis=0)
    tpr_upper = np.percentile(tpr_arr, 97.5, axis=0)
    return fpr_grid, tpr_mean, tpr_lower, tpr_upper
def _prepare_targets_scores(
y_true: np.ndarray,
y_score: np.ndarray
):
"""
Detect task type & return (Y_onehot, Y_score_2D, n_classes, task_name)
Works for binary, multiclass and multilabel. For binary we make sure
to return TWO columns (neg / pos) so that the downstream loop over
classes [0, 1] is always valid.
"""
# ---------- binary or multiclass (single-label) ----------
if y_true.ndim == 1:
n_classes = int(np.max(y_true)) + 1 # assumes labels start at 0
if n_classes == 2:
task_name = "binary"
# --- one-hot targets (N, 2): [neg, pos] ------------
y_true_1hot = np.column_stack([1 - y_true, y_true])
# --- probability array (N, 2): P(neg), P(pos) -----
if y_score.ndim == 1: # shape (N,)
y_score_2d = np.column_stack([1 - y_score, y_score])
else: # shape (N, k)
if y_score.shape[1] == 1: # (N, 1)
y_score_2d = np.column_stack([1 - y_score[:, 0], y_score[:, 0]])
else: # already (N, 2)
y_score_2d = y_score
else: # -------- multiclass -------
task_name = "multiclass"
y_true_1hot = label_binarize(y_true, classes=list(range(n_classes)))
y_score_2d = y_score # expected shape (N, C)
# ---------- multilabel (already one-hot) ------------------
else:
task_name = "multilabel"
n_classes = y_true.shape[1]
y_true_1hot = y_true.astype(int)
y_score_2d = y_score
return y_true_1hot, y_score_2d, n_classes, task_name
def plot_roc_with_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    save_path: Optional[str] = None,
    fig_title: Optional[str] = None,
    n_bootstraps: int = 1000,
    seed: int = 42,
) -> None:
    """
    Draw ROC curves (with 95 % CI) for binary / multiclass / multilabel setups
    on one canvas with tidy sub-plots.
    Parameters
    ----------
    y_true : array-like
        * binary / multiclass : shape (N,)
        * multilabel : shape (N, C)
    y_score : array-like
        probability scores – same shape as y_true except for binary
        where shape can be (N,) or (N, 2) (class-1 prob in column 1)
    save_path : str | None
        if given, the figure is stored as PNG.
    fig_title : str | None
        custom super-title. Defaults to "ROC curves".
    n_bootstraps : int
        number of bootstrap resamples passed to bootstrap_roc_curve_ci.
    seed : int
        RNG seed forwarded to the bootstrap for reproducibility.
    """
    # Normalize targets/scores to (N, C) one-vs-rest form; task name unused here
    Y, S, C, task = _prepare_targets_scores(y_true, y_score)
    # -------- set up subplot grid -------------
    # near-square grid: rows = ceil(sqrt(C)), cols = enough to fit all classes
    n_rows = math.ceil(math.sqrt(C))
    n_cols = math.ceil(C / n_rows)
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4.5 * n_cols, 4.5 * n_rows), dpi=200,
        squeeze=False
    )
    # -------- iterate over classes -------------
    for cls in range(C):
        y_true_cls = Y[:, cls]
        y_score_cls = S[:, cls]
        # bootstrap band for the curve itself (pointwise 95% CI over TPR)
        fpr, tpr_mean, tpr_low, tpr_up = bootstrap_roc_curve_ci(
            y_true_cls, y_score_cls,
            n_bootstraps=n_bootstraps, seed=seed
        )
        # DeLong AUC and its analytic CI for the legend label
        auc, ci = roc_auc_ci_score(y_true_cls, y_score_cls)
        ci = ci.tolist()
        # map the flat class index onto the subplot grid
        r, c = divmod(cls, n_cols)
        ax = axes[r][c]
        # main ROC and band
        ax.plot(fpr, tpr_mean, lw=1.5, label=f"AUC = {auc:.3f}, CI = {ci[0]:.3f} - {ci[1]:.3f}")
        ax.fill_between(fpr, tpr_low, tpr_up, alpha=.25, label="95 % CI")
        ax.plot([0, 1], [0, 1], "k--", lw=.8)  # chance diagonal
        # cosmetics
        ax.set_title(f"Class {cls}")
        ax.set_xlabel("FPR")
        ax.set_ylabel("TPR")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.grid(ls="--", alpha=.4)
        ax.legend(fontsize=8, loc="lower right")
        # drop spines
        for side in ["top", "right"]:
            ax.spines[side].set_visible(False)
    # hide empty panels if any (grid may have more cells than classes)
    for extra in range(C, n_rows * n_cols):
        r, c = divmod(extra, n_cols)
        fig.delaxes(axes[r][c])
    if fig_title:
        title = fig_title
    else:
        # NOTE(review): auc/ci here are leftovers from the LAST loop
        # iteration, so for multiclass/multilabel the default suptitle
        # reports only the final class's AUC — confirm this is intended.
        title = f"ROC Curve (AUC = {auc:.3f}, 95% CI = {ci[0]:.3f} - {ci[1]:.3f})"
    fig.suptitle(title, fontsize=14)
    # leave headroom for the suptitle and footroom below the panels
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    if save_path:
        fig.savefig(save_path, dpi=300)
        print(f"Saved ROC panel ➜ {save_path}")
    else:
        plt.show()