| """Prepare final train set (encoded, scaled and imputed) and save artifacts.""" |
| import copd |
| import json |
| import joblib |
| from lenusml import encoding |
| import os |
| import pandas as pd |
| import numpy as np |
| from sklearn.preprocessing import MinMaxScaler |
| from sklearn.impute import SimpleImputer |
|
|
# Input / output locations. cohort_info_dir is not referenced by the code shown here.
data_dir = '<YOUR_DATA_PATH>/train_data/'
cohort_info_dir = '../data/cohort_info/'
output_data_dir = '../data/models/model1'

# Load the pickled train set. Unpickling executes arbitrary code, so only point
# data_dir at files from a trusted source.
train_pickle_path = os.path.join(data_dir, 'train_data.pkl')
data = pd.read_pickle(train_pickle_path)
|
|
| |
| |
| |
# Recode the boolean admission flags as 1 / 0.
bool_mapping = {True: 1, False: 0}
for flag_col in ('RequiredAcuteNIV', 'RequiredICUAdmission'):
    data[flag_col] = data[flag_col].replace(bool_mapping)

# Encode sex as a single binary column (1 for 'F', 0 for 'M'; Series.map leaves any
# other value as NaN) and discard the original text column.
sex_mapping = {'F': 1, 'M': 0}
data['Sex_F'] = data['Sex'].map(sex_mapping)
data = data.drop(columns=['Sex'])
|
|
| |
| |
| |
# Read the two pipe-delimited patient reported outcome (PRO) exports.
pro_dir = '<YOUR_DATA_PATH>/copd-dataset/'
cat = pd.read_csv(os.path.join(pro_dir, 'CopdDatasetProCat.txt'), delimiter="|")

# Only the symptom diary questions used further down are loaded.
diary_cols = ['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
              'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10']
symptom_diary = pd.read_csv(os.path.join(pro_dir, 'CopdDatasetProSymptomDiary.txt'),
                            usecols=diary_cols, delimiter="|")

# Parse submission times as UTC and truncate them to midnight, so that both PROs
# share a per-day key.
for pro_frame in (cat, symptom_diary):
    submitted = pd.to_datetime(pro_frame['SubmissionTime'], utc=True)
    pro_frame['SubmissionTime'] = submitted.dt.normalize()
|
|
|
|
| |
# Keep only PRO submissions from patients that are present in the train set.
train_patient_ids = data['PatientId']
cat = cat[cat['PatientId'].isin(train_patient_ids)]
symptom_diary = symptom_diary[symptom_diary['PatientId'].isin(train_patient_ids)]

# One row per patient and day: keep the first submission of each PRO per day, then
# keep only the days on which both a CAT and a symptom diary were submitted.
pro_keys = ['PatientId', 'SubmissionTime']
daily_pros = cat.drop_duplicates(subset=pro_keys).merge(
    symptom_diary.drop_duplicates(subset=pro_keys), on=pro_keys, how='inner')
|
|
| |
# Numeric PRO answers for which a "difference from recent average" feature is built.
cat_items = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7', 'CATQ8']
numeric_pros = cat_items + ['SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']

# Rolling mean of each numeric PRO per StudyId.
# NOTE(review): presumably the mean over the previous 3 submissions/days, excluding
# the current one - confirm in copd.rolling_mean_previous_period.
mean_pros = copd.rolling_mean_previous_period(
    df=daily_pros, cols=numeric_pros, date_col='SubmissionTime', id_col='StudyId',
    window=3)

# Attach the rolling means to the daily PROs and derive the difference columns.
daily_pros = daily_pros.merge(mean_pros, how='left', on=['StudyId', 'SubmissionTime'])
daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)

# Drop every column whose name ends in '_ave' (presumably the rolling means, which
# are no longer needed once the differences exist - TODO confirm the suffix).
not_average = ~daily_pros.columns.str.endswith('_ave')
daily_pros = daily_pros.loc[:, not_average]
|
|
| |
# For every patient day, attach the most recent PRO submission made on or before
# DateOfEvent by the same patient. merge_asof requires both sides to be sorted on
# their time key. StudyId is dropped from the PRO side before merging.
events_by_date = data.sort_values(by='DateOfEvent')
pros_by_date = daily_pros.drop(columns=['StudyId']).sort_values(by='SubmissionTime')
train_data = pd.merge_asof(
    events_by_date,
    pros_by_date,
    left_on='DateOfEvent',
    right_on='SubmissionTime',
    by='PatientId',
    direction='backward',
)
|
|
| |
| |
| |
# Load the comorbidity flags and drop the bookkeeping columns.
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidities = comorbidities.drop(columns=['Id', 'Created'])

# Every remaining column except the patient key is treated as a comorbidity flag.
comorbidity_list = list(comorbidities.columns)
comorbidity_list.remove('PatientId')

# Restrict to train patients. The explicit copy makes the column assignment below
# operate on an independent frame instead of on a filtered slice (avoids chained
# assignment / SettingWithCopyWarning).
comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)].copy()
# NOTE(review): the first number is the row count of the table; it equals the number
# of patients only if the table holds one row per patient.
print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
    len(comorbidities), len(data.PatientId.unique())))

# Recode True/False as 1/0 and treat a missing flag as "not present".
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping).fillna(0)
print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())

# Add the flags to every patient day; patients without an entry get 0 for all flags.
train_data = train_data.merge(comorbidities, on='PatientId', how='left')
print('Comorbidity counts after merging with patient days:', '\n',
      train_data[comorbidity_list].sum())
train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)

# Total number of comorbidities per row, plus the per-StudyId maximum of that total.
train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()

# Keep AsthmaOverlap as an individual feature; all other individual flags are dropped
# now that they are summarised by the Comorbidities total.
comorbidity_list.remove('AsthmaOverlap')
train_data = train_data.drop(columns=comorbidity_list)
|
|
| |
| |
| |
| |
# Load the usual inhaler therapies and keep the entries of train patients only.
inhaler_type = pd.read_csv(
    '<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
    delimiter='|', usecols=['StudyId', 'InhalerType'])
is_train_patient = inhaler_type['StudyId'].isin(data['StudyId'])
inhaler_type = inhaler_type[is_train_patient]

# Derive the TripleTherapy indicator.
# NOTE(review): presumably one row per StudyId is returned, and include_mitt=True
# counts multiple-inhaler triple therapy as well - confirm in the copd helper.
inhaler_type = copd.triple_inhaler_therapy_service(
    df=inhaler_type, id_col='StudyId', inhaler_col='InhalerType', include_mitt=True)

print('Patients taking triple inhaler therapy: ', '\n',
      inhaler_type.TripleTherapy.value_counts())
train_data = pd.merge(train_data, inhaler_type, on='StudyId', how='left')
|
|
| |
| |
| |
|
|
| |
| |
# Replace the numeric answer codes of the categorical symptom diary questions with
# text labels; a missing answer becomes the explicit category 'None'.
symptom_answer_labels = {
    'SymptomDiaryQ8': {1: 'Not difficult', 2: 'A little difficult',
                       3: 'Quite difficult', 4: 'Very difficult', np.nan: 'None'},
    'SymptomDiaryQ9': {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid', 4: 'Solid',
                       np.nan: 'None'},
    'SymptomDiaryQ10': {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green',
                        np.nan: 'None'},
}
for question, answer_labels in symptom_answer_labels.items():
    train_data[question] = train_data[question].replace(answer_labels)

# Smoking status codes to labels (missing values are left untouched here).
smoking_labels = {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'}
train_data['SmokingStatus'] = train_data['SmokingStatus'].replace(smoking_labels)

# Boolean copy of the target column.
train_data['InExacWindow'] = train_data['IsExac'].replace({0: False, 1: True})
|
|
| |
| |
| |
|
|
# Age in whole days of the PRO submission that was matched to each patient day.
# Rows for which merge_asof found no earlier submission have SubmissionTime = NaT and
# therefore a NaN day count; casting those to int would raise, so the cast is done
# only after such rows have been filtered out.
train_data['DaysSinceCAT'] = (train_data['DateOfEvent'] -
                              train_data['SubmissionTime']).dt.days

# Keep only patient days whose latest PRO submission is at most 14 days old. NaN
# compares as False, so rows without any submission are removed here as well.
DaysSinceCAT_cutoff = 14
train_data = train_data[train_data['DaysSinceCAT'] <= DaysSinceCAT_cutoff].copy()
train_data['DaysSinceCAT'] = train_data['DaysSinceCAT'].astype('int')
|
|
| |
| |
| |
|
|
| |
# Bin edges and labels for the time since the last exacerbation and for age.
# NOTE(review): the labels only line up with the edges if copd.bin_numeric_column
# uses left-closed intervals (e.g. [0, 21) -> '<21 days') - confirm in the helper.
exac_bins = [-1, 0, 21, 90, 180, np.inf]
exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']
age_bins = [0, 50, 60, 70, 80, np.inf]
age_labels = ['<50', '50-59', '60-69', '70-79', '80+']

# Replace each numeric column by its binned version, in place.
binning_specs = (
    ('DaysSinceLastExac', exac_bins, exac_labels),
    ('Age', age_bins, age_labels),
)
for binned_col, bin_edges, bin_names in binning_specs:
    train_data[binned_col] = copd.bin_numeric_column(
        col=train_data[binned_col], bins=bin_edges, labels=bin_names)
|
|
| |
# Bin the comorbidity total into 'None' / '1-2' / '3+'.
# NOTE(review): assumes left-closed bins in copd.bin_numeric_column - confirm.
comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ['None', '1-2', '3+']
comorb_binning = {'bins': comorb_bins, 'labels': comorb_labels}

# Row-level feature: overwritten with its binned version.
train_data['Comorbidities'] = copd.bin_numeric_column(
    col=train_data['Comorbidities'], **comorb_binning)

# The per-StudyId summary keeps the raw count and gets the binned value alongside it.
comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
    col=comorb_counts['Comorbidities'], **comorb_binning)
|
|
| |
# Bin FEV1 percent predicted into four severity grades.
# NOTE(review): assumes left-closed bins in copd.bin_numeric_column - confirm.
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)
train_data = train_data.drop(columns=['LungFunction_FEV1PercentPredicted'])

# Rows whose binned value is the string 'nan' are assigned to 'Mild'.
# NOTE(review): this relies on the helper returning strings, with missing lung
# function rendered as 'nan' - confirm.
train_data.loc[
    train_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'

# Report the resulting class sizes. A bare value_counts() expression only displays
# its result in a notebook; in a script it has to be printed.
print('FEV1PercentPredicted counts:', '\n',
      train_data['FEV1PercentPredicted'].value_counts())
|
|
| |
| |
| |
# Binary flag: 1 when the highest eosinophil count is at least 0.3, otherwise 0.
# A missing count compares as False and therefore also yields 0.
eosinophils_high = train_data['LabsHighestEosinophilCount'] >= 0.3
train_data['HighestEosinophilCount_0_3'] = np.where(eosinophils_high, 1, 0)
train_data = train_data.drop(columns=['LabsHighestEosinophilCount'])
|
|
| |
# Columns holding category labels that are to be target encoded.
categorical_columns = [
    'SmokingStatus',
    'SymptomDiaryQ8',
    'SymptomDiaryQ9',
    'SymptomDiaryQ10',
    'DaysSinceLastExac',
    'Age',
    'Comorbidities',
    'FEV1PercentPredicted',
]

# Cast to plain strings so that every value, including a missing one, is a category.
train_data[categorical_columns] = train_data[categorical_columns].astype(str)

# Learn the encodings from the train set against the IsExac target.
# NOTE(review): the exact statistic and return type are defined in lenusml.encoding;
# the result is later written with json.dump, so it must be JSON serialisable.
target_encodings = encoding.get_target_encodings(
    train_data=train_data, cols_to_encode=categorical_columns, target='IsExac')

# Apply the learned encodings to the same train set.
data_encoded = encoding.apply_target_encodings(
    data=train_data, encodings=target_encodings, cols_to_encode=categorical_columns)
|
|
| |
| |
| |
# Remove identifiers, dates and the boolean copy of the target; none of them is a
# model feature.
non_feature_cols = ['PatientId', 'InExacWindow', 'DateOfEvent', 'SubmissionTime',
                    'FirstSubmissionDate', 'LatestPredictionDate']
data_encoded = data_encoded.drop(columns=non_feature_cols)

# Fit the min-max scaler on the feature columns only (StudyId and the target are
# excluded) and transform them in the same step.
features_to_scale = data_encoded.drop(columns=['StudyId', 'IsExac'])
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(features_to_scale)

# fit_transform returns a bare array: restore the column names, then put StudyId
# back as the first column and the target as the last one.
train_data_scaled = pd.DataFrame(scaled_values, columns=features_to_scale.columns)
train_data_scaled.insert(0, 'StudyId', data_encoded['StudyId'].values)
train_data_scaled['IsExac'] = data_encoded['IsExac'].values
print('Train data scaled')
| print('Train data scaled') |
|
|
| |
| |
| |
# Fill the remaining missing feature values with the per-column median.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# Fit and apply on the scaled feature columns only (StudyId and the target excluded).
# NOTE(review): SimpleImputer can drop a column that is entirely NaN, in which case
# the column labels below would no longer match - confirm no such column exists.
features_to_impute = train_data_scaled.drop(columns=['StudyId', 'IsExac'])
imputed_values = imputer.fit_transform(features_to_impute)

# Rebuild a labelled frame with StudyId first and the target last.
train_data_imputed = pd.DataFrame(imputed_values, columns=features_to_impute.columns)
train_data_imputed.insert(0, 'StudyId', train_data_scaled['StudyId'].values)
train_data_imputed['IsExac'] = train_data_scaled['IsExac'].values
print('Train data imputed')
|
|
| |
| |
| |
# Create the artifact directory if needed and remove whatever an earlier run left
# in it, so that it only ever holds the artifacts of the current run.
artifact_dir = os.path.join(output_data_dir, 'artifacts')
os.makedirs(artifact_dir, exist_ok=True)
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

# Save the target encodings. The context manager guarantees that the file is flushed
# and closed, which a bare open() passed straight to json.dump does not.
with open(os.path.join(artifact_dir, 'target_encodings.json'), 'w') as encodings_file:
    json.dump(target_encodings, encodings_file)

# Save the fitted scaler.
joblib.dump(scaler, os.path.join(artifact_dir, 'scaler.pkl'))
print('Minmax scaler saved')

# Save the fitted imputer.
joblib.dump(imputer, os.path.join(artifact_dir, 'imputer.pkl'))
print('Median imputer saved')
|
|
| |
| |
| |
|
|
| |
# Write the encoded, scaled and imputed train set into the model output directory.
final_train_path = os.path.join(output_data_dir, 'train_data.pkl')
train_data_imputed.to_pickle(final_train_path)
print('Final train data saved')
|
|