| | import numpy as np |
| | from sklearn.linear_model import Ridge |
| | from sklearn.cluster import Birch |
| | from pynndescent import NNDescent |
| | from sklearn.neighbors import NearestNeighbors |
| | |
| |
|
def find_cluster(trajectories, sub_labels, new_sample):
    """Assign *new_sample* to an existing trajectory cluster, if plausible.

    Parameters
    ----------
    trajectories : ndarray, shape (n_samples, dim)
        Flattened trajectory features of the already-clustered samples.
    sub_labels : ndarray, shape (n_samples,)
        Cluster label of each row in ``trajectories``.
    new_sample : ndarray, shape (dim,)
        Feature vector of the sample to classify.

    Returns
    -------
    tuple of int
        ``(cls_idx, nearest_idx)`` — the candidate cluster label and the
        index (into ``trajectories``) of the nearest neighbor — or
        ``(-1, -1)`` when the new sample is farther from the cluster than
        the cluster's own largest nearest-neighbor distance.
    """
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(trajectories)
    distances, indices = nbrs.kneighbors(new_sample[np.newaxis, :])
    # NOTE(review): column 1 skips the closest hit. That is correct when
    # new_sample is itself a row of `trajectories` (column 0 is the sample
    # itself); if callers only pass genuinely new samples, column 0 would be
    # the true nearest neighbor -- confirm with callers.
    nearest_neighbor_idx = indices[0, 1]
    nearest_neighbor_dist = distances[0, 1]

    cls_idx = sub_labels[nearest_neighbor_idx]
    # flatnonzero always returns a 1-d index array, so samples_in_cls stays
    # 2-d even for a single-member cluster (argwhere(...).squeeze() collapsed
    # to a 0-d scalar in that case and broke the indexing).
    cls_member_idxs = np.flatnonzero(sub_labels == cls_idx)
    samples_in_cls = trajectories[cls_member_idxs]

    # A cluster with fewer than two members has no intra-cluster
    # nearest-neighbor distance to compare against (and NNDescent with
    # n_neighbors=2 would fail); treat the sample as unassignable.
    if samples_in_cls.shape[0] < 2:
        return -1, -1

    # Scale NNDescent's effort with the cluster size (heuristics suggested
    # by the pynndescent documentation).
    n_trees = min(64, 5 + int(round(samples_in_cls.shape[0] ** 0.5 / 20.0)))
    n_iters = max(5, int(round(np.log2(samples_in_cls.shape[0]))))

    nnd = NNDescent(
        samples_in_cls,
        n_neighbors=2,
        metric="euclidean",
        n_trees=n_trees,
        n_iters=n_iters,
        max_candidates=60,
        verbose=False,
    )
    _, dists = nnd.neighbor_graph
    # Column 0 is each point's distance to itself; column 1 is the distance
    # to its nearest other point within the cluster.
    nn_dists = dists[:, 1]
    max_dist = nn_dists.max()

    # Accept the sample only if it is closer to the cluster than the
    # cluster's most isolated member is to its own nearest neighbor.
    if nearest_neighbor_dist < max_dist:
        return cls_idx, nearest_neighbor_idx
    return -1, -1
| |
|
| | |
| |
|
class TrajectoryManager:
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        """Trajectory manager with no feedback.

        Samples abnormal samples based on their 2d embedding trajectories.

        Parameters
        ----------
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            All 2d embeddings of representations by timevis.
        cls_num : int
            The number of classes to cluster.
        period : int
            Only the last *period* epochs of each trajectory are used.
        metric : {"v", "a"}
            Cluster on velocity ("v", first difference) or acceleration
            ("a", second difference) of the trajectory.
        """
        self.embeddings_2d = embeddings_2d
        train_num, time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.metric = metric
        self.cls_num = cls_num

        # Velocity = first difference, acceleration = second difference of
        # the positions over the last `period` epochs.
        recent = self.embeddings_2d[:, -period:, :]
        self.v = recent[:, 1:, :] - recent[:, :-1, :]
        self.a = self.v[:, 1:, :] - self.v[:, :-1, :]

    def clustered(self):
        """Cluster the trajectories and mark the largest clusters as clean.

        Sets ``predict_sub_labels``, ``suspect_clean``, ``sample_rate`` and
        ``selected``.  Must be called before any sampling method.

        Raises
        ------
        ValueError
            If ``self.metric`` is neither "v" nor "a".
        """
        brc = Birch(n_clusters=self.cls_num)
        if self.metric == "v":
            brc.fit(self.v.reshape(self.train_num, -1))
        elif self.metric == "a":
            brc.fit(self.a.reshape(self.train_num, -1))
        else:
            # Fail fast: previously this only printed a warning and then
            # crashed below on the unfitted estimator's `labels_`.
            raise ValueError("Not a valid metric: {!r}".format(self.metric))

        self.predict_sub_labels = brc.labels_
        # The three most populated clusters are presumed to be clean.
        self.suspect_clean = np.argsort(np.bincount(self.predict_sub_labels))[-3:]

        # Small (abnormal) clusters keep rate 1; suspected-clean clusters
        # get rate 0 and are never drawn by the weighted samplers.
        self.sample_rate = np.ones(self.cls_num)
        self.sample_rate[self.suspect_clean] = 0.
        self.selected = np.zeros(self.train_num)

    def sample_batch(self, budget):
        """Uniformly draw *budget* distinct, not-yet-selected indices."""
        not_selected = np.flatnonzero(self.selected == 0)
        # replace=False: drawing with replacement could return the same
        # index several times in one batch (sibling managers already sample
        # without replacement).
        s_idxs = np.random.choice(not_selected, size=budget, replace=False)
        return s_idxs

    def sample_normal(self, budget):
        """Draw *budget* distinct indices, weighted toward clean clusters."""
        normal_rate = 1 - self.sample_rate
        sample_rate = normal_rate[self.predict_sub_labels]

        not_selected = np.flatnonzero(self.selected == 0)

        normed_rate = sample_rate[not_selected] / np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=normed_rate, size=budget, replace=False)
        return s_idxs

    def update_belief(self, idxs):
        """Mark *idxs* as selected so future batches skip them."""
        if len(idxs) > 0:
            self.selected[idxs] = 1
| |
|
class FeedbackTrajectoryManager(TrajectoryManager):
    """Trajectory manager with feedback.

    Samples abnormal samples based on trajectories, biasing future batches
    by the user's accumulated accept/reject feedback.
    """
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def clustered(self):
        """Cluster trajectories and reset the per-sample feedback state."""
        super().clustered()
        # 1.0 where the user accepted / rejected the sample, else 0.0.
        self.user_acc = np.zeros(self.train_num)
        self.user_rej = np.zeros(self.train_num)

    def sample_batch(self, budget, return_scores=False):
        """Draw *budget* distinct unseen indices using feedback weights.

        Each cluster's weight combines an exploit term (accepted fraction
        of queried samples) and an explore term (fraction not yet queried),
        gated by ``sample_rate``.

        Parameters
        ----------
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            Also return the sampling weight of each drawn index.

        Raises
        ------
        Exception
            If a sample was both accepted and rejected.
        """
        # flatnonzero always yields a 1-d array; argwhere(...).squeeze()
        # collapsed to a 0-d scalar when exactly one sample had feedback,
        # which made len() below raise TypeError.
        acc_idxs = np.flatnonzero(self.user_acc == 1)
        rej_idxs = np.flatnonzero(self.user_rej == 1)

        acc_rate = np.zeros(self.train_num)
        rej_rate = np.zeros(self.train_num)
        if len(acc_idxs) > 0:
            acc_rate[acc_idxs] = 1.
        if len(rej_idxs) > 0:
            rej_rate[rej_idxs] = 1.

        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        # exploit: accepted fraction of queried samples in the cluster;
        # explore: fraction of the cluster not yet queried (evaluates to 1
        # for a cluster with no feedback at all).
        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            cls_idxs = np.flatnonzero(self.predict_sub_labels == cls)
            if len(cls_idxs) == 0:
                # Empty cluster: leave both rates at 0 (avoids div-by-zero).
                continue
            acc_num = np.sum(acc_rate[cls_idxs])
            rej_num = np.sum(rej_rate[cls_idxs])
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num / query_sum
            explore_rate[cls] = 1 - query_sum / len(cls_idxs)

        rate = (explore_rate + exploit_rate) * self.sample_rate
        sample_rate = rate[self.predict_sub_labels]
        not_selected = np.flatnonzero(self.selected == 0)
        norm_rate = sample_rate[not_selected] / np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)

        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs

    def update_belief(self, acc_idxs, rej_idxs):
        """Record accept/reject feedback and mark both sets as selected."""
        if len(acc_idxs) > 0:
            self.user_acc[acc_idxs] = 1
            self.selected[acc_idxs] = 1
        if len(rej_idxs) > 0:
            self.user_rej[rej_idxs] = 1
            self.selected[rej_idxs] = 1
| |
|
class TBSampling(TrajectoryManager):
    """Trajectory-based sampling with no memory, for user study."""
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        """Draw *budget* distinct unseen indices weighted by cluster rate.

        Unlike the feedback manager, previous accept/reject indices are
        only used to exclude samples, not to reweight clusters.

        Parameters
        ----------
        acc_idxs, rej_idxs : array-like of int
            Indices the user accepted / rejected so far (must be disjoint).
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            Also return the sampling weight of each drawn index.

        Raises
        ------
        Exception
            If acc_idxs and rej_idxs intersect.
        """
        selected = np.zeros(self.train_num)
        if len(acc_idxs) > 0:
            selected[acc_idxs] = 1.
        if len(rej_idxs) > 0:
            selected[rej_idxs] = 1.
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        sample_rate = self.sample_rate[self.predict_sub_labels]
        # flatnonzero keeps a 1-d array even when a single sample remains;
        # argwhere(...).squeeze() yielded a 0-d scalar in that case, which
        # np.random.choice would misread as a range bound.
        not_selected = np.flatnonzero(selected == 0)
        norm_rate = sample_rate[not_selected] / np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs
|
| |
|
class FeedbackSampling(TrajectoryManager):
    """Feedback-weighted sampling with no memory, for user study."""
    def __init__(self, embeddings_2d, cls_num, period=100, metric="a"):
        super().__init__(embeddings_2d, cls_num, period, metric)

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_scores=True):
        """Draw *budget* distinct unseen indices using explore/exploit weights.

        Parameters
        ----------
        acc_idxs, rej_idxs : array-like of int
            Indices the user accepted / rejected so far (must be disjoint).
        budget : int
            Number of indices to draw (without replacement).
        return_scores : bool
            Also return the sampling weight of each drawn index.

        Raises
        ------
        Exception
            If acc_idxs and rej_idxs intersect.
        """
        acc_rate = np.zeros(self.train_num)
        rej_rate = np.zeros(self.train_num)
        selected = np.zeros(self.train_num)
        if len(acc_idxs) > 0:
            acc_rate[acc_idxs] = 1.
            selected[acc_idxs] = 1.
        if len(rej_idxs) > 0:
            rej_rate[rej_idxs] = 1.
            selected[rej_idxs] = 1.
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

        # exploit: accepted fraction of queried samples per cluster;
        # explore: fraction of the cluster not yet queried (1 when no
        # member has been queried).
        exploit_rate = np.zeros(self.cls_num)
        explore_rate = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            # flatnonzero keeps a 1-d result even for a single-member
            # cluster; argwhere(...).squeeze() collapsed to 0-d there and
            # broke len(cls_idxs).
            cls_idxs = np.flatnonzero(self.predict_sub_labels == cls)
            if len(cls_idxs) == 0:
                # Empty cluster: leave both rates at 0 (avoids div-by-zero).
                continue
            acc_num = np.sum(acc_rate[cls_idxs])
            rej_num = np.sum(rej_rate[cls_idxs])
            query_sum = acc_num + rej_num
            if query_sum > 0:
                exploit_rate[cls] = acc_num / query_sum
            explore_rate[cls] = 1 - query_sum / len(cls_idxs)

        rate = (explore_rate + exploit_rate) * self.sample_rate
        sample_rate = rate[self.predict_sub_labels]
        not_selected = np.flatnonzero(selected == 0)
        norm_rate = sample_rate[not_selected] / np.sum(sample_rate[not_selected])
        s_idxs = np.random.choice(not_selected, p=norm_rate, size=budget, replace=False)
        if return_scores:
            scores = sample_rate[s_idxs]
            return s_idxs, scores
        return s_idxs
|
| | |
class Recommender:
    def __init__(self, uncertainty, embeddings_2d, cls_num, period):
        """Recommend samples based on uncertainty and embedding trajectories.

        Parameters
        ----------
        uncertainty : ndarray, shape (train_num,)
            Per-sample uncertainty score; presumably in [0, 1] so the
            averaged score stays in [0, 1] -- TODO confirm with callers.
        embeddings_2d : ndarray, shape (train_num, epoch_num, 2)
            All 2d embeddings of the representations.
        cls_num : int
            Number of clusters per trajectory view.
        period : int
            Only the last *period* epochs of each trajectory are used.
        """
        self.uncertainty = uncertainty
        self.embeddings_2d = embeddings_2d
        train_num, time_steps, _ = embeddings_2d.shape
        self.train_num = train_num
        self.time_steps = time_steps
        self.period = period
        self.cls_num = cls_num

        # Three flattened views of the recent trajectory: position,
        # velocity (first difference) and acceleration (second difference).
        recent = self.embeddings_2d[:, -period:, :]
        self.position = recent.reshape(self.train_num, -1)
        self.v = recent[:, 1:, :] - recent[:, :-1, :]
        self.a = (self.v[:, 1:, :] - self.v[:, :-1, :]).reshape(self.train_num, -1)
        self.v = self.v.reshape(self.train_num, -1)

    @property
    def _sample_p_scores(self):
        # Per-sample rarity score of each sample's position cluster.
        return self.p_scores[self.predict_p_sub_labels]

    @property
    def _sample_v_scores(self):
        # Per-sample rarity score of each sample's velocity cluster.
        return self.v_scores[self.predict_v_sub_labels]

    @property
    def _sample_a_scores(self):
        # Per-sample rarity score of each sample's acceleration cluster.
        return self.a_scores[self.predict_a_sub_labels]

    def _cluster_view(self, data):
        """Cluster one view; return (labels, per-cluster rarity scores).

        A cluster's rarity score is 1 minus its population fraction, so
        small (unusual) clusters score close to 1.
        """
        brc = Birch(n_clusters=self.cls_num)
        brc.fit(data.reshape(self.train_num, -1))
        labels = brc.labels_
        scores = np.zeros(self.cls_num)
        for cls in range(self.cls_num):
            scores[cls] = 1 - np.sum(labels == cls) / self.train_num
        return labels, scores

    def clustered(self):
        """Cluster all three views; must be called before any sampling."""
        self.predict_v_sub_labels, self.v_scores = self._cluster_view(self.v)
        self.predict_a_sub_labels, self.a_scores = self._cluster_view(self.a)
        self.predict_p_sub_labels, self.p_scores = self._cluster_view(self.position)

    def _mean_scores(self):
        """Average of uncertainty and the three per-sample rarity scores."""
        return (self.uncertainty
                + self._sample_v_scores
                + self._sample_a_scores
                + self._sample_p_scores) / 4

    @staticmethod
    def _check_feedback(acc_idxs, rej_idxs):
        """Raise if any sample was both accepted and rejected."""
        if len(np.intersect1d(acc_idxs, rej_idxs)) > 0:
            raise Exception("Intersection between acc idxs and rej idxs!")

    def _ridge_scores(self, acc_idxs, rej_idxs):
        """Fit Ridge regression on the feedback and score every sample.

        Features are (uncertainty, v, a, p) per sample; targets are 1 for
        accepted and 0 for rejected samples.  Returns
        ``(scores, exp_idxs, model)`` where exp_idxs are the labeled rows.
        """
        X = np.vstack((self.uncertainty,
                       self._sample_v_scores,
                       self._sample_a_scores,
                       self._sample_p_scores)).transpose([1, 0])
        exp_idxs = np.concatenate((acc_idxs, rej_idxs), axis=0)
        target_X = X[exp_idxs]
        target_Y = np.zeros(len(exp_idxs))
        target_Y[:len(acc_idxs)] = 1  # accepted -> 1, rejected -> 0
        krr = Ridge(alpha=1.0)
        krr.fit(target_X, target_Y)
        return krr.predict(X), exp_idxs, krr

    def sample_batch_init(self, acc_idxs, rej_idxs, budget):
        """Initial batch: weighted random draw by the mean abnormality score.

        Returns ``(s_idxs, scores_of_s_idxs)``; raises if acc/rej intersect.
        """
        scores = self._mean_scores()
        self._check_feedback(acc_idxs, rej_idxs)
        selected = np.zeros(self.train_num)
        if len(acc_idxs) > 0:
            selected[acc_idxs] = 1.
        if len(rej_idxs) > 0:
            selected[rej_idxs] = 1.
        # flatnonzero keeps a 1-d array even when one sample remains
        # (argwhere(...).squeeze() collapsed to 0-d in that case).
        not_selected_idxs = np.flatnonzero(selected == 0)
        norm_rate = scores[not_selected_idxs] / np.sum(scores[not_selected_idxs])
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]

    def sample_batch(self, acc_idxs, rej_idxs, budget, return_coef=False):
        """Feedback batch: the top-*budget* unlabeled samples by Ridge score.

        With ``return_coef=True`` also returns the fitted coefficients of
        the four score features.
        """
        self._check_feedback(acc_idxs, rej_idxs)
        scores, exp_idxs, krr = self._ridge_scores(acc_idxs, rej_idxs)
        not_selected = np.setdiff1d(np.arange(self.train_num), exp_idxs)
        remain_scores = scores[not_selected]
        # Highest predicted "accept" scores first.
        args = np.argsort(remain_scores)[-budget:]
        selected_idxs = not_selected[args]
        if return_coef:
            return selected_idxs, scores[selected_idxs], krr.coef_
        return selected_idxs, scores[selected_idxs]

    def sample_batch_normal_init(self, acc_idxs, rej_idxs, budget):
        """Initial "normal" batch: weighted toward LOW mean scores."""
        scores = self._mean_scores()
        self._check_feedback(acc_idxs, rej_idxs)
        selected = np.zeros(self.train_num)
        if len(acc_idxs) > 0:
            selected[acc_idxs] = 1.
        if len(rej_idxs) > 0:
            selected[rej_idxs] = 1.
        not_selected_idxs = np.flatnonzero(selected == 0)

        # Invert the scores so presumably-normal samples are favored.
        norm_rate = (1 - scores[not_selected_idxs]) / np.sum(1 - scores[not_selected_idxs])
        s_idxs = np.random.choice(not_selected_idxs, p=norm_rate, size=budget, replace=False)
        return s_idxs, scores[s_idxs]

    def sample_batch_normal(self, acc_idxs, rej_idxs, budget):
        """Feedback "normal" batch: the bottom-*budget* by Ridge score."""
        self._check_feedback(acc_idxs, rej_idxs)
        scores, exp_idxs, _ = self._ridge_scores(acc_idxs, rej_idxs)
        not_selected = np.setdiff1d(np.arange(self.train_num), exp_idxs)
        remain_scores = scores[not_selected]
        args = np.argsort(remain_scores)[:budget]
        selected_idxs = not_selected[args]
        return selected_idxs, scores[selected_idxs]

    def score_new_sample(self, sample_trajectory, return_nearest=False):
        """Score a new trajectory by matching it into the existing clusters.

        Parameters
        ----------
        sample_trajectory : ndarray, shape (period, 2)
            2d trajectory of the new sample over the same epoch window.
        return_nearest : bool
            Also return the nearest-neighbor index for each view.

        Returns
        -------
        tuple
            ``(p_score, v_score, a_score)`` and, when requested, the
            nearest-neighbor indices ``(p_idx, v_idx, a_idx)``.
        """
        new_position = sample_trajectory.reshape(-1)
        new_v = sample_trajectory[1:, :] - sample_trajectory[:-1, :]
        new_a = (new_v[1:, :] - new_v[:-1, :]).reshape(-1)
        new_v = new_v.reshape(-1)

        position_cls, p_nearest_idx = find_cluster(self.position, self.predict_p_sub_labels, new_position)
        v_cls, v_nearest_idx = find_cluster(self.v, self.predict_v_sub_labels, new_v)
        a_cls, a_nearest_idx = find_cluster(self.a, self.predict_a_sub_labels, new_a)

        # Unmatched views (cls == -1) fall back to the highest rarity a
        # singleton cluster could have: 1 - 1/train_num.
        new_p_score = self.p_scores[position_cls] if position_cls >= 0 else 1 - 1 / self.train_num
        new_v_score = self.v_scores[v_cls] if v_cls >= 0 else 1 - 1 / self.train_num
        new_a_score = self.a_scores[a_cls] if a_cls >= 0 else 1 - 1 / self.train_num
        if return_nearest:
            return (new_p_score, new_v_score, new_a_score), (p_nearest_idx, v_nearest_idx, a_nearest_idx)
        return new_p_score, new_v_score, new_a_score
| |
|
| | |
| |
|
| |
|
| |
|
| |
|