| """ |
| To Do: |
| - Refactor script to be more readable/smaller main function |
| """ |
| import json |
| import pandas as pd |
| import numpy as np |
| from datetime import timedelta |
|
|
|
|
def read_pkl_data(dataset, data_path, path_type):
    """
    Read in pickled dataset
    --------
    :param dataset: type of dataset to read in
    :param data_path: path to generated data
    :param path_type: type of path to read from ('data' for processed
        data, anything else for first-dates files)
    :return: dataframe
    """
    print('Reading in ' + dataset)

    # Processed data and first-date files differ only by suffix
    suffix = '_proc.pkl' if path_type == 'data' else '_first_dates.pkl'
    return pd.read_pickle(data_path + dataset + suffix)
|
|
|
|
def fill_eth_grp_data(df):
    """
    Fill nulls in eth_grp column introduced in joining.

    Forward- then backward-fill ethnicity within each patient so a known
    value propagates to that patient's other rows; patients with no
    recorded ethnicity at all become 'Unknown'.

    :param df: dataframe to update; must have 'SafeHavenID' and 'eth_grp'
    :return: Filled dataframe (modified in place and returned)
    """
    # transform (rather than groupby .apply) guarantees the result is
    # aligned to df's index regardless of pandas version / group_keys
    # behaviour, so the column assignment cannot misalign
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].transform(
        lambda s: s.ffill().bfill())
    df['eth_grp'] = df['eth_grp'].fillna('Unknown')

    return df
|
|
|
|
def fill_to_date_columns(df):
    """
    Fill nulls in to_date columns introduced in joining.

    Forward-fill each cumulative "to date" counter within a patient,
    then fill the remaining (group-leading) nulls with 0, i.e. no events
    observed yet for that patient.

    :param df: dataframe to update; must have 'SafeHavenID' and the
        *_to_date columns listed below
    :return: Filled dataframe (modified in place and returned)
    """
    to_date_cols = ['adm_to_date', 'copd_to_date', 'resp_to_date',
                    'presc_to_date', 'rescue_to_date', 'labs_to_date',
                    'anxiety_depression_to_date',
                    'anxiety_depression_presc_to_date']
    # GroupBy.ffill() fills within each patient and returns a frame
    # aligned to df's index (groupby .apply can prepend group keys on
    # newer pandas and misalign the assignment). After the per-group
    # ffill, the only NaNs left are leading ones within each group, so
    # a plain fillna(0) is equivalent to the per-group fill.
    df[to_date_cols] = df.groupby('SafeHavenID')[to_date_cols].ffill()
    df[to_date_cols] = df[to_date_cols].fillna(0)

    return df
|
|
|
|
def fill_yearly_columns(df):
    """
    Fill nulls in yearly columns introduced in joining
    :param df: dataframe to update
    :return: Filled dataframe
    """
    zero_cols = ['adm_per_year', 'total_hosp_days', 'mean_los',
                 'copd_per_year', 'resp_per_year', 'comorb_per_year',
                 'salbutamol_per_year',
                 'saba_inhaler_per_year', 'laba_inhaler_per_year',
                 'lama_inhaler_per_year', 'sama_inhaler_per_year',
                 'ics_inhaler_per_year', 'laba_ics_inhaler_per_year',
                 'lama_laba_ics_inhaler_per_year', 'saba_sama_inhaler_per_year',
                 'mcs_inhaler_per_year', 'rescue_meds_per_year',
                 'presc_per_year', 'labs_per_year',
                 'anxiety_depression_per_year', 'anxiety_depression_presc_per_year']
    # Missing rate/count columns mean no events were recorded => zero
    for col in zero_cols:
        df[col] = df[col].fillna(0)

    return df
|
|
|
|
def fill_days_since(df, typ):
    """
    Fill days_since_copd/resp/rescue
    :param df: dataframe to update
    :param typ: type of feature to fill ('copd', 'resp', 'rescue')
    :return: Filled dataframe
    """
    # Carry the most recent event date forward, then measure the gap
    # between each row's end-of-year and that last known event
    last_event = df[f'{typ}_date'].ffill()
    df[f'days_since_{typ}'] = df['eoy'] - last_event

    return df
|
|
|
|
def process_first_dates(df):
    """
    Process dataframe containing patient's first date in the health board region
    --------
    :param df: dataframe to process
    :return: processed dataframe with 'entry_dataset' (which dataset the
        patient first appeared in) and 'first_entry' (that earliest date)
    """
    indexed = df.set_index('SafeHavenID')
    # Compute both row-wise reductions before adding any new columns so
    # they only see the original per-dataset date columns
    earliest = indexed.min(axis=1)
    # Column of the row-wise minimum names the source dataset; the
    # dataset token is the second '_'-separated part of that name
    source = indexed.idxmin(axis=1).str.split('_').str[1]
    indexed['entry_dataset'] = source
    indexed['first_entry'] = earliest

    return indexed[['entry_dataset', 'first_entry']].reset_index()
|
|
|
|
| def find_closest_simd(v): |
| """ |
| Find closest SIMD vigintile for each row 'v' |
| -------- |
| :param v: row of data from apply statement |
| :param typ: type of simd column to add |
| :return: simd value |
| """ |
| simd_years = [2009, 2012, 2016] |
| bools = [v.eoy.year >= year for year in simd_years] |
| if any(bools): |
| simd_year = str(simd_years[np.where(bools)[0][-1]]) |
| v['simd_quintile'] = v['simd_' + simd_year + '_quintile'] |
| v['simd_decile'] = v['simd_' + simd_year + '_decile'] |
| v['simd_vigintile'] = v['simd_' + simd_year + '_vigintile'] |
| else: |
| v['simd_quintile'] = np.nan |
| v['simd_decile'] = np.nan |
| v['simd_vigintile'] = np.nan |
|
|
| return v |
|
|
|
|
def main():
    """
    Build the merged modelling dataset.

    Joins the processed admissions, comorbidity, prescription and lab
    data, fills nulls introduced by the joins, attaches each patient's
    first-entry date and demographics, derives years-in-region, age and
    SIMD deprivation features, and pickles the result to
    <model_data_path>/merged_full.pkl.
    """
    # Paths to the generated data come from the repo-level config file
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']

    # Read in the processed ('_proc.pkl') source datasets
    adm = read_pkl_data('adm', data_path, 'data')
    comorb = read_pkl_data('comorb', data_path, 'data')
    presc = read_pkl_data('presc', data_path, 'data')
    labs = read_pkl_data('labs', data_path, 'data')
    demo = read_pkl_data('demo', data_path, 'data')

    # Join on the shared index; prescriptions and labs are outer-joined
    # so patients present in only those datasets are kept
    df = adm.join(
        comorb, how='left').join(
        presc, how='outer').join(
        labs, how='outer')
    df = df.reset_index()

    # Fill nulls introduced by the joins above
    print('Filling data')
    df = fill_eth_grp_data(df)
    df = fill_to_date_columns(df)
    df = fill_yearly_columns(df)

    # Days-since features are filled within each patient
    # NOTE(review): on newer pandas, groupby().apply() may prepend the
    # group key to the index -- confirm df keeps its original row index
    for typ in ['copd', 'resp', 'rescue', 'adm']:
        df = df.groupby('SafeHavenID').apply(fill_days_since, typ)

    # Most recent of the two respiratory "days since" features
    ds_cols = ['days_since_copd', 'days_since_resp']
    df['days_since_copd_resp'] = df[ds_cols].min(axis=1)

    # Earliest appearance of each patient in each dataset
    print('Adding first dates')
    adm_dates = read_pkl_data('adm', data_path, 'date')
    presc_dates = read_pkl_data('presc', data_path, 'date')
    labs_dates = read_pkl_data('labs', data_path, 'date')

    # Outer merges keep patients that appear in any of the three tables
    first_dates = pd.merge(
        pd.merge(adm_dates, presc_dates, how="outer", on='SafeHavenID'),
        labs_dates, how="outer", on='SafeHavenID')

    # Persist the combined first-date table for reuse elsewhere
    first_dates.to_pickle(data_path + 'overall_first_dates.pkl')

    # Reduce to (entry_dataset, first_entry) per patient
    date_data = process_first_dates(first_dates)

    print('Merging data')
    df_merged = pd.merge(df, date_data, on='SafeHavenID', how='inner')

    # Whole years the patient has been observed in the region up to eoy
    ggc_years = (df_merged.eoy - df_merged.first_entry) / np.timedelta64(1, 'Y')
    df_merged['ggc_years'] = round(ggc_years)

    # Attach demographics (inner merge by default -- patients missing
    # from demo are dropped)
    df_merged = pd.merge(df_merged, demo, on='SafeHavenID')

    # Age in whole years at eoy (365.2425 = mean Gregorian year length)
    dt_diff = df_merged.eoy - pd.to_datetime(df_merged.obf_dob)
    df_merged['age'] = dt_diff // timedelta(days=365.2425)

    # Row-wise: pick the SIMD release in effect at each row's eoy
    df_merged = df_merged.apply(find_closest_simd, axis=1)

    # Drop intermediate date columns and the per-release SIMD columns
    # now collapsed into the generic simd_* features
    cols2drop = ['copd_date', 'resp_date', 'adm_date', 'rescue_date',
                 'simd_2009_quintile', 'simd_2009_decile',
                 'simd_2009_vigintile', 'simd_2012_quintile',
                 'simd_2012_decile', 'simd_2012_vigintile',
                 'simd_2016_quintile', 'simd_2016_decile',
                 'simd_2016_vigintile', 'days_since_copd',
                 'days_since_resp']
    df_merged = df_merged.drop(cols2drop, axis=1)

    # Final modelling dataset
    df_merged.to_pickle(data_path + 'merged_full.pkl')
|
|
|
|
| main() |
|
|