| """Perform cross validation using a variety of algorithms.""" |
| import os |
| import pandas as pd |
| import numpy as np |
|
|
| from lenusml import splits, plots |
|
|
| |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.model_selection import cross_validate, cross_val_predict |
| from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier |
| from interpret.glassbox import ExplainableBoostingClassifier |
| import lightgbm as lgb |
| import xgboost as xgb |
| import mlflow |
|
|
|
|
# Directories used by this script, relative to the working directory it is
# launched from. `output_dir` is defined here but not read in this section.
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Resolve the two input files up front so the load calls below stay short.
fold_patients_path = os.path.join(cohort_info_dir, 'fold_patients.npy')
train_data_path = os.path.join(data_dir, 'train_data_cv.pkl')

# Both loaders unpickle Python objects, so these files must come from a
# trusted source.
fold_patients = np.load(fold_patients_path, allow_pickle=True)
train_data = pd.read_pickle(train_data_path)
|
|
| |
# Build the cross-validation fold indices from the pre-assigned fold membership.
# NOTE(review): presumably this keeps every StudyId inside a single fold --
# confirm against lenusml.splits.custom_cv_fold_indices.
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients,
    train_data=train_data,
    id_column='StudyId',
)

# Every column except the identifier and the label is used as a feature.
cols_to_drop = ['StudyId', 'IsExac']
features_list = [name for name in train_data.columns if name not in cols_to_drop]

# Model inputs and label, both cast to float.
features = train_data.loc[:, features_list].astype('float')
target = train_data['IsExac'].astype('float')

# Ratio of label-0 rows to label-1 rows; passed to XGBoost as scale_pos_weight.
class_counts = target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
|
|
# Point MLflow at the SQLite tracking database and select the experiment that
# all runs started below are recorded under.
tracking_uri = "sqlite:///mlruns.sqlite"
experiment_name = 'model_drop2'
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
|
|
| |
# Scorer names handed to cross_validate. Each one is averaged over the folds
# and logged to MLflow as a metric of the same name.
scoring = [
    'f1',
    'balanced_accuracy',
    'accuracy',
    'precision',
    'recall',
    'roc_auc',
    'average_precision',
    'neg_brier_score',
]
|
|
# Keyword arguments shared by both XGBoost candidates.
xgb_common = {'random_state': 0, 'use_label_encoder': False, 'eval_metric': 'logloss'}

# Candidate classifiers as (estimator, run name) pairs. The name is used as the
# MLflow run name for that model. Every estimator is seeded with random_state=0.
models = [
    (RandomForestClassifier(random_state=0), 'random_forest'),
    (RandomForestClassifier(random_state=0, class_weight='balanced'),
     'random_forest_class_weight'),
    (BalancedBaggingClassifier(random_state=0), 'balanced_bagging'),
    (BalancedRandomForestClassifier(random_state=0), 'balanced_random_forest'),
    (xgb.XGBClassifier(**xgb_common), 'xgb'),
    (lgb.LGBMClassifier(random_state=0), 'lgbm'),
    # Same XGBoost setup, with the positive class reweighted by the class ratio.
    (xgb.XGBClassifier(**xgb_common, scale_pos_weight=scale_pos_weight), 'xgb_spw'),
    (ExplainableBoostingClassifier(random_state=0), 'ebm'),
]
|
|
# Scratch directory where plots are written before being uploaded to MLflow.
# Created once here because it does not change between models.
artifact_dir = './tmp'
os.makedirs(artifact_dir, exist_ok=True)

# One parent run groups a nested child run per candidate model. Both runs are
# closed by their context managers, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='model_selection'):
    for model in models:
        estimator, run_name = model
        with mlflow.start_run(run_name=run_name, nested=True):
            # Empty the scratch directory so that only this model's artifacts
            # are uploaded by log_artifacts below.
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            # Per-fold scores for every metric in `scoring`, using the custom folds.
            crossval = cross_validate(estimator, features, target,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring)

            # Out-of-fold predictions: second predict_proba column, i.e. the
            # probability assigned to the second class (label 1 here).
            probabilities_cv = cross_val_predict(estimator, features, target,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target})

            # Log the mean of each metric across folds.
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())

            # Log the estimator's configured hyperparameters.
            params = estimator.get_params()
            for param, value in params.items():
                mlflow.log_param(param, value)

            # Save the evaluation plots into the scratch directory.
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # NOTE(review): `postive_class_name` is spelled as in the original
            # call; presumably it matches the lenusml.plots signature -- confirm
            # before renaming.
            plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Upload everything in the scratch directory to the active child run.
            mlflow.log_artifacts(artifact_dir)
|
|