| """ |
| Modelling process |
| """ |
| import json |
| import pandas as pd |
| import numpy as np |
| import pickle |
| import matplotlib.pyplot as plt |
| import mlflow |
| from matplotlib import rcParams |
| from sklearn.cluster import AgglomerativeClustering, KMeans |
| from sklearn.tree import DecisionTreeClassifier as DTC |
| from sklearn.decomposition import PCA |
| from sklearn.metrics import (davies_bouldin_score, silhouette_score, |
| accuracy_score, confusion_matrix, |
| ConfusionMatrixDisplay) |
|
|
|
|
| |
# Global matplotlib styling for every figure in this module:
# wide figures with the top and right spines hidden.
rcParams['figure.figsize'] = 20, 5
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False
|
|
|
|
def extract_year(df, eoy_date):
    """
    Extract 1 year of data
    --------
    :param df: dataframe to extract from
    :param eoy_date: user-specified EOY date for training
    :return: data from chosen year
    """
    # Boolean mask over the end-of-year column selects the requested year.
    year_mask = df.eoy == eoy_date
    return df.loc[year_mask]
|
|
|
|
def read_yearly_data(data_path, typ, eoy_date):
    """
    Read in data for year required
    --------
    :param data_path: path to generated data
    :param typ: type of data to read in
    :param eoy_date: end of year date to select data from
    :return: data from chosen year and ids
    """
    source = pd.read_pickle(data_path + 'min_max_' + typ + '.pkl')
    year_df = extract_year(source, eoy_date)
    # pop() removes the ID column from the frame while keeping the values.
    ids = year_df.pop('SafeHavenID').to_list()
    return year_df.drop('eoy', axis=1), ids
|
|
|
|
def plot_variance(df, typ):
    """
    Plot cumulative PCA explained variance and log the figure to ML Flow.
    ---------
    :param df: dataframe to process with PCA
    :param typ: type of plot - for 'full' data or 'reduced'
    :return: fitted pca object
    """
    pca = PCA().fit(df)
    n = list(range(1, len(df.columns) + 1))
    evr = pca.explained_variance_ratio_.cumsum()
    fig, ax = plt.subplots()
    ax.plot(n, evr)
    title = 'PCA Variance - ' + typ
    ax.set_title(title, size=20)
    ax.set_xlabel('Number of principal components')
    ax.set_ylabel('Cumulative explained variance')
    ax.grid()
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    # Close the figure after logging so repeated calls (this function runs
    # once per PCA stage) do not accumulate open matplotlib figures.
    plt.close(fig)

    return pca
|
|
|
|
def extract_pca_loadings(df, pca_object):
    """
    Extract PCA loadings
    --------
    :param df: dataframe to reduce with pca
    :param pca_object: pca object with feature loadings
    :return: loadings table
    """
    feature_names = df.columns
    # Loadings = eigenvectors scaled by sqrt of the explained variance.
    raw_loadings = pca_object.components_.T * np.sqrt(
        pca_object.explained_variance_)
    pc_labels = ['PC' + str(i + 1) for i in range(len(feature_names))]
    return pd.DataFrame(raw_loadings, index=feature_names, columns=pc_labels)
|
|
|
|
def plot_loadings(loadings):
    """
    Plot loadings for PC1 returned from PCA and log the figure to ML Flow.
    --------
    :param loadings: table of feature correlations to PC1
    :return: updated loadings table (absolute PC1 loadings, descending)
    """
    # Sort by absolute correlation so the most influential features lead.
    loadings_abs = loadings.abs().sort_values(by='PC1', ascending=False)
    pc1_abs = loadings_abs[['PC1']].reset_index()
    col_map = {'index': 'Attribute', 'PC1': 'AbsCorrWithPC1'}
    pc1_abs = pc1_abs.rename(col_map, axis=1)
    fig, ax = plt.subplots()
    pc1_abs.plot(ax=ax, kind='bar')
    title = 'PCA loading scores (PC1)'
    ax.set_title(title, size=20)
    ax.set_xticks(ticks=pc1_abs.index, labels=pc1_abs.Attribute, rotation='vertical')
    ax.set_xlabel('Attribute')
    ax.set_ylabel('AbsCorrWithPC1')
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    # Close the figure after logging so it does not stay open in memory.
    plt.close(fig)

    return pc1_abs
|
|
|
|
def extract_array(df, data_path, run_name, pca_object, typ):
    """
    Extract data to pass to clustering algos
    --------
    :param df: dataframe to convert
    :param data_path: path to generated data
    :param run_name: name of run in ML Flow
    :param pca_object: initialised PCA object
    :param typ: type of return needed, either 'train' or 'test'
    :return: converted array (PCA object is also persisted when training)
    """
    # Fit only on training data; validation/test data reuse the fitted PCA.
    if typ == 'train':
        pca_func = pca_object.fit_transform
    else:
        pca_func = pca_object.transform

    # np.asarray replaces the original DataFrame round-trip; the result is
    # the same ndarray without an intermediate DataFrame allocation.
    pca_data = np.asarray(pca_func(df))

    if typ == 'train':
        # Persist the fitted PCA so later runs can transform new data.
        # 'with' closes the file handle (the original leaked it).
        pca_file = data_path + run_name + '_pca.pkl'
        with open(pca_file, 'wb') as f:
            pickle.dump(pca_object, f)

    return pca_data
|
|
|
|
def get_kmeans_score(data, k):
    """
    Calculate K-Means Davies Bouldin and Silhouette scores
    --------
    :param data: dataset to fit K-Means to
    :param k: number of centers/clusters
    :return: Scores
    """
    # fit_predict returns the cluster label assigned to each sample.
    labels = KMeans(n_clusters=k).fit_predict(data)
    return davies_bouldin_score(data, labels), silhouette_score(data, labels)
|
|
|
|
def _plot_score_curve(centers, scores, ylabel, title):
    """Plot one score-vs-K curve, log it to ML Flow, and close the figure."""
    fig, ax = plt.subplots()
    ax.plot(centers, scores, linestyle='--', marker='o', color='b')
    ax.set_xlabel('K')
    ax.set_ylabel(ylabel)
    ax.set_title(title, size=20)
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
    plt.close(fig)


def plot_DB(df):
    """
    Extract David Bouldin score and plot for a range of cluster numbers,
    applied using K-Means clustering.

    "Davies Bouldin index represents the average 'similarity' of clusters,
    where similarity is a measure that relates cluster distance to cluster
    size" - the lowest score indicates best cluster set.
    --------
    :param df: dataframe to plot from
    """
    db_scores = []
    sil_scores = []
    centers = list(range(2, 10))
    for center in centers:
        db_score, sil_score = get_kmeans_score(df, center)
        db_scores.append(db_score)
        sil_scores.append(sil_score)

    # The two curves shared near-identical plotting code; it now lives in
    # the _plot_score_curve helper.
    _plot_score_curve(centers, db_scores,
                      'Davies Bouldin score', 'Davies Bouldin score vs. K')
    _plot_score_curve(centers, sil_scores,
                      'Silhouette score', 'Silhouette score vs. K')
|
|
|
|
def plot_clust(df, labels):
    """
    Plot clusters
    --------
    :param df: dataframe to plot clusters from
    :param labels: cluster labels
    """
    title = 'Clusters'
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    # Scatter the first three principal components, coloured by cluster.
    points = ax.scatter(df[:, 0], df[:, 1], df[:, 2], c=labels)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    legend_handles, legend_texts = points.legend_elements()
    ax.legend(legend_handles, legend_texts, title='clusters')
    ax.set_title(title, size=20)
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/' + title + '.png')
|
|
|
|
def save_clusters(data_path, run_name, eoy_date, typ, labels):
    """
    Save results from clustering
    --------
    :param data_path: path to generated data
    :param run_name: name of run in ML Flow (used in the output filename)
    :param eoy_date: end of year date to select data from
    :param typ: type of dataset - train, val
    :param labels: labels from clustering to add to df
    """
    df_full = pd.read_pickle(data_path + 'filled_' + typ + '.pkl')
    # Copy the year slice so adding the column writes to an independent
    # frame instead of a view (avoids pandas SettingWithCopyWarning).
    df = df_full[df_full.eoy == eoy_date].copy()
    df['cluster'] = labels
    df.to_pickle(data_path + '_'.join((run_name, typ, 'clusters.pkl')))
|
|
|
|
def main():
    """
    Run the end-to-end clustering pipeline: read the config, set up an
    ML Flow run, reduce features with two PCA stages, train a clustering
    model (hierarchical or K-Means), save cluster labels, then fit a
    decision tree classifier on the labels and log validation metrics.
    """
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    eoy_date = config['date']
    data_path = config['model_data_path']
    model_type = config['model_type']

    with open(model_type + '_params.json') as json_params_file:
        model_params = json.load(json_params_file)

    # Timestamp to the minute (e.g. 2023-01-31_1200) makes run names unique.
    stamp = str(pd.Timestamp.now(tz='GMT+0'))[:16].replace(':', '').replace(' ', '_')
    experiment_name = 'Model E - Date Specific: ' + model_type
    run_name = "_".join((str(eoy_date), model_type, stamp))
    description = "Clustering model for COPD data in the year prior to " + str(eoy_date)

    print('Setting up ML Flow run')
    mlflow.set_tracking_uri('http://127.0.0.1:5000/')
    mlflow.set_experiment(experiment_name)
    mlflow.start_run(run_name=run_name, description=description)
    mlflow.set_tag("model.name", model_type)
    mlflow.set_tag("model.training_data", config['extract_data_path'])
    mlflow.set_tag("model.training_date", eoy_date)
    mlflow.log_param("k", model_params['n_clusters'])

    # NOTE(review): the original read the yearly data and logged "n_cols"
    # twice in a row; the duplicate block has been removed.
    df_train, train_ids = read_yearly_data(data_path, 'train', eoy_date)
    df_val, val_ids = read_yearly_data(data_path, 'val', eoy_date)
    mlflow.log_param("n_cols", len(df_train.columns))

    print('Feature reduction stage 1')
    pca = plot_variance(df_train, 'full')
    loadings = extract_pca_loadings(df_train, pca)
    pc1_abs_loadings = plot_loadings(loadings)
    variance_full = pca.explained_variance_ratio_.cumsum()
    # Smallest number of components explaining >= 90% of the variance.
    n_cols = np.argmax(variance_full >= 0.9) + 1
    mlflow.log_param("pca_stage_1", n_cols)
    columns = pc1_abs_loadings.Attribute[:n_cols].values
    np.save(data_path + run_name + '_cols.npy', columns)

    df_train_reduced = df_train[columns]
    df_val_reduced = df_val[columns]

    print('Feature reduction stage 2')
    pca_n_cols = plot_variance(df_train_reduced, 'reduced')
    variance_reduced = pca_n_cols.explained_variance_ratio_.cumsum()
    # Smallest number of components explaining >= 80% of the variance.
    n_components = np.argmax(variance_reduced >= 0.8) + 1
    mlflow.log_param("pca_stage_2", n_components)
    pca_reduced = PCA(n_components=n_components)
    data_train = extract_array(
        df_train_reduced, data_path, run_name, pca_reduced, 'train')
    data_val = extract_array(
        df_val_reduced, data_path, run_name, pca_reduced, 'test')

    print('Detecting best cluster number')
    plot_DB(data_train)

    print('Cluster model training')
    data = np.concatenate((data_train, data_val))
    if model_type == 'hierarchical':
        cluster_model = AgglomerativeClustering(**model_params)
    else:
        cluster_model = KMeans(**model_params)
    cluster_model.fit(data)
    cluster_model_file = data_path + "_".join((run_name, model_type, 'cluster_model.pkl'))
    # 'with' closes the file handle (the original leaked it).
    with open(cluster_model_file, 'wb') as f:
        pickle.dump(cluster_model, f)

    # Train/val rows were concatenated in order, so split labels by length.
    labels = cluster_model.labels_
    train_labels = labels[:len(train_ids)]
    val_labels = labels[len(train_ids):]
    save_clusters(data_path, run_name, eoy_date, 'train', train_labels)
    save_clusters(data_path, run_name, eoy_date, 'val', val_labels)

    plot_clust(data, labels)

    with open('dtc_params.json') as dtc_params_file:
        dtc_params = json.load(dtc_params_file)

    print('Decision tree classifier training')
    clf = DTC(**dtc_params).fit(df_train_reduced.to_numpy(), train_labels)
    clf_model_file = data_path + run_name + '_dtc_model.pkl'
    with open(clf_model_file, 'wb') as f:
        pickle.dump(clf, f)

    val_pred = clf.predict(df_val_reduced.to_numpy())

    accuracy = accuracy_score(val_labels, val_pred)
    mlflow.log_metric('dtc accuracy', accuracy)

    cm = confusion_matrix(val_labels, val_pred, labels=clf.classes_)
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot()
    plt.tight_layout()
    mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix' + '.png')

    mlflow.end_run()
|
|
|
|
# Guard the entry point so importing this module does not run the pipeline.
if __name__ == '__main__':
    main()
|
|