| """ |
| Modelling process |
| """ |
| import pandas as pd |
| import numpy as np |
| import pickle |
| import matplotlib.pyplot as plt |
| import mlflow |
| from matplotlib import rcParams |
| from sklearn.cluster import AgglomerativeClustering, KMeans |
| from sklearn.decomposition import PCA |
| from sklearn.metrics import (davies_bouldin_score, silhouette_score, |
| accuracy_score, confusion_matrix, |
| ConfusionMatrixDisplay) |
| from sklearn.multiclass import OneVsRestClassifier |
| from sklearn.tree import DecisionTreeClassifier |
| import os |
|
|
|
|
| |
# Global plotting style: wide figures, no top/right spines.
rcParams.update({
    'figure.figsize': (20, 5),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Run configuration.
year = 2019
model_type = 'hierarchical'
data_type = 'train'
k = 3  # number of clusters for the agglomerative model
# Timestamp tag like 'YYYY-MM-DD_HHMM' used to make run artifacts unique.
stamp = pd.Timestamp.now(tz='GMT+0').strftime('%Y-%m-%d_%H%M')
data_path = '<YOUR_DATA_PATH>/Model_E_Extracts/'

# MLflow run identifiers (the tracking URI is set again inside main()).
mlflow.set_tracking_uri("file:/.")
tracking_uri = mlflow.get_tracking_uri()
experiment_name = 'Model E: one vs rest adaption DTC ' + model_type
run_name = "_".join((str(year), model_type, stamp))
description = "Clustering model with one vs rest adaption (DTC) for COPD data in " + str(year)
|
|
|
|
def extract_year(df, year):
    """
    Extract 1 year of data.
    --------
    :param df: dataframe to extract from (must have a ``year`` column)
    :param year: year to select data from
    :return: rows of ``df`` whose ``year`` column equals ``year``
    """
    mask = df['year'] == year
    return df.loc[mask]
|
|
|
|
def read_yearly_data(typ, year):
    """
    Read in data for year required.
    --------
    :param typ: type of data to read in (e.g. 'train', 'val')
    :param year: year to select data from
    :return: data from chosen year and ids
    """
    full = pd.read_pickle(data_path + 'min_max_' + typ + '.pkl')
    yearly = extract_year(full, year)
    # pop() removes the id column from the frame while capturing its values
    ids = yearly.pop('SafeHavenID').to_list()
    return yearly.drop(columns='year'), ids
|
|
|
|
def plot_variance(df, typ):
    """
    Plot cumulative PCA explained variance and log the figure to MLflow.
    ---------
    :param df: dataframe to process with PCA
    :param typ: type of plot - for 'full' data or 'reduced'
    :return: fitted pca object
    """
    pca = PCA().fit(df)
    component_numbers = range(1, len(df.columns) + 1)
    cumulative_variance = pca.explained_variance_ratio_.cumsum()

    fig, ax = plt.subplots()
    ax.plot(component_numbers, cumulative_variance)
    title = 'PCA Variance - ' + typ
    ax.set_title(title, size=20)
    ax.set_xlabel('Number of principal components')
    ax.set_ylabel('Cumulative explained variance')
    ax.grid()
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    return pca
|
|
|
|
def extract_pca_loadings(df, pca_object):
    """
    Extract PCA loadings.
    --------
    Loadings are components scaled by the standard deviation of each
    component (sqrt of explained variance).

    :param df: dataframe the PCA was fitted on (supplies feature names)
    :param pca_object: pca object with feature loadings
    :return: loadings table (features x principal components)
    """
    features = df.columns
    pc_names = [f'PC{i}' for i in range(1, len(features) + 1)]
    scaled = pca_object.components_.T * np.sqrt(pca_object.explained_variance_)
    return pd.DataFrame(data=scaled, columns=pc_names, index=features)
|
|
|
|
def plot_loadings(loadings):
    """
    Plot loadings for PC1 returned from PCA and log the figure to MLflow.
    --------
    :param loadings: table of feature correlations to PC1
    :return: updated loadings table (absolute PC1 loadings, sorted descending)
    """
    pc1_abs = (loadings.abs()
               .sort_values(by='PC1', ascending=False)[['PC1']]
               .reset_index()
               .rename({'index': 'Attribute', 'PC1': 'AbsCorrWithPC1'}, axis=1))

    fig, ax = plt.subplots()
    pc1_abs.plot(ax=ax, kind='bar')
    title = 'PCA loading scores (PC1)'
    ax.set_title(title, size=20)
    ax.set_xticks(ticks=pc1_abs.index, labels=pc1_abs.Attribute, rotation='vertical')
    ax.set_xlabel('Attribute')
    ax.set_ylabel('AbsCorrWithPC1')
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    return pc1_abs
|
|
|
|
def extract_array(df, pca_object, typ):
    """
    Extract data to pass to clustering algos.
    --------
    Fits the PCA on training data (and persists it), or only transforms
    for any other ``typ``.

    :param df: dataframe to convert
    :param pca_object: initialised PCA object
    :param typ: type of return needed, either 'train' or 'test'
    :return: converted numpy array (PCA object also pickled if training)
    """
    if typ == 'train':
        pca_data = pca_object.fit_transform(df)
        # Persist the fitted PCA so the 'test' pass can reuse it.
        # (Original used pickle.dump(obj, open(...)) which leaked the handle.)
        pca_file = data_path + run_name + '_pca.pkl'
        with open(pca_file, 'wb') as fh:
            pickle.dump(pca_object, fh)
    else:
        pca_data = pca_object.transform(df)

    # transform() already yields an ndarray; the original's
    # pd.DataFrame(...).to_numpy() round-trip was redundant.
    return np.asarray(pca_data)
|
|
|
|
def get_kmeans_score(data, k):
    """
    Calculate K-Means Davies Bouldin and Silhouette scores.
    --------
    :param data: dataset to fit K-Means to
    :param k: number of centers/clusters
    :return: (Davies Bouldin score, Silhouette score)
    """
    # fit_predict returns the per-sample cluster labels
    cluster_labels = KMeans(n_clusters=k).fit_predict(data)
    return (davies_bouldin_score(data, cluster_labels),
            silhouette_score(data, cluster_labels))
|
|
|
|
def plot_DB(df):
    """
    Extract Davies Bouldin and Silhouette scores for K = 2..9 using
    K-Means clustering, and log one plot per metric to MLflow.

    "Davies Bouldin index represents the average 'similarity' of clusters,
    where similarity is a measure that relates cluster distance to cluster
    size" - the lowest score indicates best cluster set.
    --------
    :param df: dataframe to plot from
    """
    centers = list(range(2, 10))
    scores = [get_kmeans_score(df, center) for center in centers]
    db_scores = [db for db, _ in scores]
    sil_scores = [sil for _, sil in scores]

    # One identical-styled plot per metric.
    for values, metric in ((db_scores, 'Davies Bouldin score'),
                           (sil_scores, 'Silhouette score')):
        fig, ax = plt.subplots()
        ax.plot(centers, values, linestyle='--', marker='o', color='b')
        ax.set_xlabel('K')
        ax.set_ylabel(metric)
        title = metric + ' vs. K'
        ax.set_title(title, size=20)
        plt.tight_layout()
        mlflow.log_figure(fig, 'fig/' + title + '.png')
|
|
|
|
def plot_clust(df, labels):
    """
    3-D scatter of the first three principal components coloured by
    cluster label, logged to MLflow.
    --------
    :param df: array to plot clusters from (uses columns 0..2)
    :param labels: cluster labels
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    scatter = ax.scatter(df[:, 0], df[:, 1], df[:, 2], c=labels)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    ax.legend(*scatter.legend_elements(), title='clusters')
    title = 'Clusters'
    ax.set_title(title, size=20)
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
|
|
|
|
def save_clusters(typ, labels):
    """
    Save results from clustering: attach cluster labels to the chosen
    year of the filled dataset and pickle it.
    --------
    :param typ: type of dataset - train, val
    :param labels: labels from clustering to add to df; must be aligned
        with the rows of the selected year
    """
    df_full = pd.read_pickle(data_path + 'filled_' + typ + '.pkl')
    # .copy() — assigning a new column on a boolean-mask slice triggers
    # SettingWithCopyWarning and can silently fail to write under
    # pandas copy-on-write.
    df = df_full[df_full.year == year].copy()
    df['cluster'] = labels
    df.to_pickle(data_path + '_'.join((run_name, typ, 'clusters.pkl')))
|
|
|
|
def main():
    """
    Run the full Model E pipeline and log everything to MLflow:

    1. Load one year of train/val data.
    2. Stage-1 feature reduction: keep the attributes needed to reach
       90% cumulative PCA variance (ranked by absolute PC1 loading).
    3. Stage-2: PCA the reduced frames down to 80% explained variance.
    4. Agglomerative (ward) clustering on train+val together.
    5. Fit a one-vs-rest decision-tree classifier on the cluster labels
       and report per-class feature importances, accuracy and a
       confusion matrix on the validation split.
    """
    df_train, train_ids = read_yearly_data('train', year)
    df_val, val_ids = read_yearly_data('val', year)

    print('Setting up ML Flow run')
    mlflow.set_tracking_uri('http://127.0.0.1:5000/')
    mlflow.set_experiment(experiment_name)
    # Context manager guarantees the run is closed even if a step raises
    # (the original start_run/end_run pair leaked the run on error).
    with mlflow.start_run(run_name=run_name, description=description):
        mlflow.set_tag("model.name", model_type)
        mlflow.set_tag("model.training_data", "EXAMPLE_STUDY_DATA")
        mlflow.set_tag("model.training_year", year)
        mlflow.log_param("n_cols", len(df_train.columns) - 1)
        mlflow.log_param("k", k)

        # --- Feature reduction stage 1: attributes covering 90% variance.
        print('Feature reduction stage 1')
        pca = plot_variance(df_train, 'full')
        loadings = extract_pca_loadings(df_train, pca)
        pc1_abs_loadings = plot_loadings(loadings)
        variance_full = pca.explained_variance_ratio_.cumsum()
        # argmax finds the first index where the threshold is reached.
        n_cols = np.argmax(variance_full >= 0.9) + 1
        mlflow.log_param("pca_stage_1", n_cols)
        columns = pc1_abs_loadings.Attribute[:n_cols].values
        np.save(data_path + run_name + '_cols.npy', columns)

        df_train_reduced = df_train[columns]
        df_val_reduced = df_val[columns]

        # --- Feature reduction stage 2: PCA to 80% explained variance.
        print('Feature reduction stage 2')
        pca_n_cols = plot_variance(df_train_reduced, 'reduced')
        variance_reduced = pca_n_cols.explained_variance_ratio_.cumsum()
        n_components = np.argmax(variance_reduced >= 0.8) + 1
        mlflow.log_param("pca_stage_2", n_components)
        pca_reduced = PCA(n_components=n_components)
        data_train = extract_array(df_train_reduced, pca_reduced, 'train')
        data_val = extract_array(df_val_reduced, pca_reduced, 'test')

        print('Detecting best cluster number')
        plot_DB(data_train)

        # --- Cluster train+val together so labels share one scheme.
        print('Cluster model training')
        data = np.concatenate((data_train, data_val))
        cluster_model = AgglomerativeClustering(n_clusters=k, linkage="ward")
        cluster_model.fit(data)
        cluster_model_file = data_path + "_".join((run_name, model_type, 'cluster_model.pkl'))
        with open(cluster_model_file, 'wb') as fh:  # with-block avoids handle leak
            pickle.dump(cluster_model, fh)

        # Split labels back into the train/val portions (data was train then val).
        labels = cluster_model.labels_
        train_labels = labels[:len(train_ids)]
        val_labels = labels[len(train_ids):]
        save_clusters('train', train_labels)
        save_clusters('val', val_labels)
        plot_clust(data, labels)

        # --- One-vs-rest DTC predicting cluster membership from features.
        # (Original message said 'BLR' but the model is a decision tree.)
        print('DTC classifier training')
        clf_pre = DecisionTreeClassifier(random_state=42)
        clf = OneVsRestClassifier(clf_pre)
        clf.fit(df_train_reduced.to_numpy(), train_labels)
        clf_model_file = data_path + run_name + '_dtc_model.pkl'
        with open(clf_model_file, 'wb') as fh:
            pickle.dump(clf, fh)

        # Per-class feature importances, one subplot per class.
        n_classes = len(set(train_labels))
        n_features = df_train_reduced.shape[1]
        fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
        axs = np.atleast_1d(axs)  # subplots() returns a bare Axes when n_classes == 1
        # The original called subplots_adjust twice (0.99 then 0.5);
        # only the last call takes effect, so keep the effective value.
        fig.subplots_adjust(hspace=0.5)
        for i in range(n_classes):
            importance = clf.estimators_[i].feature_importances_
            indices = np.argsort(importance)[::-1]  # most important first
            axs[i].bar(range(n_features), importance[indices])
            axs[i].set_xticks(range(n_features))
            axs[i].set_xticklabels(np.array(df_train_reduced.columns)[indices],
                                   rotation=90, fontsize=9)
            axs[i].set_xlabel('Features')
            axs[i].set_ylabel('Importance')
            axs[i].set_title('Class {} Feature Importance'.format(i))

        tmpfile = "plot.png"
        fig.savefig(tmpfile)
        # log_artifact reads the file by path itself; the original wrapped
        # this in `with open(tmpfile, "rb") as fig:` which never used the
        # handle and shadowed the figure variable.
        mlflow.log_artifact(tmpfile, "feature_importance.png")
        os.remove(tmpfile)

        # --- Validation metrics for the classifier.
        val_pred = clf.predict(df_val_reduced.to_numpy())
        accuracy = accuracy_score(val_labels, val_pred)
        mlflow.log_metric('dtc accuracy', accuracy)

        cm = confusion_matrix(val_labels, val_pred, labels=clf.classes_)
        disp = ConfusionMatrixDisplay(
            confusion_matrix=cm, display_labels=clf.classes_)
        disp.plot()
        plt.tight_layout()
        mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix' + '.png')
|
|
|
|
# Guard the entry point so importing this module doesn't run the pipeline.
if __name__ == '__main__':
    main()
|
|