| """ |
| Copied from MMAction2 |
| https://github.com/open-mmlab/mmaction2/blob/master/mmaction/core/evaluation/eval_detection.py |
| """ |
| import json |
| import numpy as np |
| from sklearn.metrics import precision_recall_curve |
|
|
|
|
| def load_jsonl(filename): |
| with open(filename, "r") as f: |
| return [json.loads(l.strip("\n")) for l in f.readlines()] |
|
|
|
|
| def compute_temporal_iou_batch_paired(pred_windows, gt_windows): |
| """ compute intersection-over-union along temporal axis for each pair of windows in pred_windows and gt_windows. |
| Args: |
| pred_windows: np.ndarray, (N, 2), [st (float), ed (float)] * N |
| gt_windows: np.ndarray, (N, 2), [st (float), ed (float)] * N |
| Returns: |
| iou (float): np.ndarray, (N, ) |
| |
| References: |
| for np.divide with zeros, see https://stackoverflow.com/a/37977222 |
| """ |
| intersection = np.maximum( |
| 0, np.minimum(pred_windows[:, 1], gt_windows[:, 1]) - np.maximum(pred_windows[:, 0], gt_windows[:, 0]) |
| ) |
| union = np.maximum(pred_windows[:, 1], gt_windows[:, 1]) \ |
| - np.minimum(pred_windows[:, 0], gt_windows[:, 0]) |
| return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) |
|
|
|
|
| def compute_temporal_iou_batch_cross(spans1, spans2): |
| """ |
| Args: |
| spans1: (N, 2) np.ndarray, each row defines a span [st, ed] |
| spans2: (M, 2) np.ndarray, ... |
| |
| Returns: |
| iou: (N, M) np.ndarray |
| union: (N, M) np.ndarray |
| >>> spans1 = np.array([[0, 0.2, 0.9], [0.5, 1.0, 0.2]]) |
| >>> spans2 = np.array([[0, 0.3], [0., 1.0]]) |
| >>> compute_temporal_iou_batch_cross(spans1, spans2) |
| (tensor([[0.6667, 0.2000], |
| [0.0000, 0.5000]]), |
| tensor([[0.3000, 1.0000], |
| [0.8000, 1.0000]])) |
| """ |
| areas1 = spans1[:, 1] - spans1[:, 0] |
| areas2 = spans2[:, 1] - spans2[:, 0] |
|
|
| left = np.maximum(spans1[:, None, 0], spans2[None, :, 0]) |
| right = np.minimum(spans1[:, None, 1], spans2[None, :, 1]) |
|
|
| inter = np.clip(right - left, 0, None) |
| union = areas1[:, None] + areas2[None, :] - inter |
|
|
| iou = inter / union |
| return iou, union |
|
|
|
|
| def interpolated_precision_recall(precision, recall): |
| """Interpolated AP - VOCdevkit from VOC 2011. |
| |
| Args: |
| precision (np.ndarray): The precision of different thresholds. |
| recall (np.ndarray): The recall of different thresholds. |
| |
| Returns: |
| float: Average precision score. |
| """ |
| mprecision = np.hstack([[0], precision, [0]]) |
| mrecall = np.hstack([[0], recall, [1]]) |
| for i in range(len(mprecision) - 1)[::-1]: |
| mprecision[i] = max(mprecision[i], mprecision[i + 1]) |
| idx = np.where(mrecall[1::] != mrecall[0:-1])[0] + 1 |
| ap = np.sum((mrecall[idx] - mrecall[idx - 1]) * mprecision[idx]) |
| return ap |
|
|
|
|
| def compute_average_precision_detection(ground_truth, |
| prediction, |
| tiou_thresholds=np.linspace( |
| 0.5, 0.95, 10)): |
| """Compute average precision (detection task) between ground truth and |
| predictions data frames. If multiple predictions occurs for the same |
| predicted segment, only the one with highest score is matches as true |
| positive. This code is greatly inspired by Pascal VOC devkit. |
| |
| Args: |
| ground_truth (list[dict]): List containing the ground truth instances |
| (dictionaries). Required keys are 'video-id', 't-start' and |
| 't-end'. |
| prediction (list[dict]): List containing the prediction instances |
| (dictionaries). Required keys are: 'video-id', 't-start', 't-end' |
| and 'score'. |
| tiou_thresholds (np.ndarray): A 1darray indicates the temporal |
| intersection over union threshold, which is optional. |
| Default: ``np.linspace(0.5, 0.95, 10)``. |
| |
| Returns: |
| Float: ap, Average precision score. |
| """ |
| num_thresholds = len(tiou_thresholds) |
| num_gts = len(ground_truth) |
| num_preds = len(prediction) |
| ap = np.zeros(num_thresholds) |
| if len(prediction) == 0: |
| return ap |
|
|
| num_positive = float(num_gts) |
| lock_gt = np.ones((num_thresholds, num_gts)) * -1 |
| |
| prediction.sort(key=lambda x: -x['score']) |
| |
| tp = np.zeros((num_thresholds, num_preds)) |
| fp = np.zeros((num_thresholds, num_preds)) |
|
|
| |
| ground_truth_by_videoid = {} |
| for i, item in enumerate(ground_truth): |
| item['index'] = i |
| ground_truth_by_videoid.setdefault(item['video-id'], []).append(item) |
|
|
| |
| for idx, pred in enumerate(prediction): |
| if pred['video-id'] in ground_truth_by_videoid: |
| gts = ground_truth_by_videoid[pred['video-id']] |
| else: |
| fp[:, idx] = 1 |
| continue |
|
|
| _pred = np.array([[pred['t-start'], pred['t-end']], ]) |
| _gt = np.array([[gt['t-start'], gt['t-end']] for gt in gts]) |
| tiou_arr = compute_temporal_iou_batch_cross(_pred, _gt)[0] |
|
|
| tiou_arr = tiou_arr.reshape(-1) |
| |
| tiou_sorted_idx = tiou_arr.argsort()[::-1] |
| for t_idx, tiou_threshold in enumerate(tiou_thresholds): |
| for j_idx in tiou_sorted_idx: |
| if tiou_arr[j_idx] < tiou_threshold: |
| fp[t_idx, idx] = 1 |
| break |
| if lock_gt[t_idx, gts[j_idx]['index']] >= 0: |
| continue |
| |
| tp[t_idx, idx] = 1 |
| lock_gt[t_idx, gts[j_idx]['index']] = idx |
| break |
|
|
| if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: |
| fp[t_idx, idx] = 1 |
|
|
| tp_cumsum = np.cumsum(tp, axis=1).astype(float) |
| fp_cumsum = np.cumsum(fp, axis=1).astype(float) |
| recall_cumsum = tp_cumsum / num_positive |
|
|
| precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) |
|
|
| for t_idx in range(len(tiou_thresholds)): |
| ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], |
| recall_cumsum[t_idx, :]) |
| return ap |
|
|
|
|
| def get_ap(y_true, y_predict, interpolate=True, point_11=False): |
| """ |
| Average precision in different formats: (non-) interpolated and/or 11-point approximated |
| point_11=True and interpolate=True corresponds to the 11-point interpolated AP used in |
| the PASCAL VOC challenge up to the 2008 edition and has been verfied against the vlfeat implementation |
| The exact average precision (interpolate=False, point_11=False) corresponds to the one of vl_feat |
| |
| :param y_true: list/ numpy vector of true labels in {0,1} for each element |
| :param y_predict: predicted score for each element |
| :param interpolate: Use interpolation? |
| :param point_11: Use 11-point approximation to average precision? |
| :return: average precision |
| |
| ref: https://github.com/gyglim/video2gif_dataset/blob/master/v2g_evaluation/__init__.py |
| |
| """ |
| |
| assert len(y_true) == len(y_predict), "Prediction and ground truth need to be of the same length" |
| if len(set(y_true)) == 1: |
| if y_true[0] == 0: |
| return 0 |
| |
| else: |
| return 1 |
| else: |
| assert sorted(set(y_true)) == [0, 1], "Ground truth can only contain elements {0,1}" |
|
|
| |
| precision, recall, _ = precision_recall_curve(y_true, y_predict) |
| recall = recall.astype(np.float32) |
|
|
| if interpolate: |
| for i in range(1, len(precision)): |
| precision[i] = max(precision[i - 1], precision[i]) |
|
|
| if point_11: |
| precision_11 = [precision[np.where(recall >= t)[0][-1]] for t in np.arange(0, 1.01, 0.1)] |
| return np.mean(precision_11) |
| else: |
| indices = np.where(np.diff(recall)) |
| return np.mean(precision[indices]) |
|
|