| """ |
| Script to remove all receiver IDs from relevant data sources. |
| """ |
| import json |
| import pandas as pd |
| from sklearn.model_selection import train_test_split |
|
|
|
|
def get_ids(path):
    """
    Read in IDs
    --------
    :param path: path to a CSV lookup file containing a 'SafeHavenID' column
    :return: list of SafeHavenIDs
    """
    print('Loading IDs from ' + path)

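    # Lookup files are cp1252-encoded CSVs; only the 'SafeHavenID' column is used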
    df = pd.read_csv(path, encoding="cp1252")
    ids = df['SafeHavenID'].tolist()

    return ids


def save_rec_sup(df, data_path, rec_ids, sup_ids):
    """
    Remove receiver and scale-up IDs from the dataframe, pickle the removed
    cohorts and return the remaining data
    --------
    :param df: pandas dataframe to remove ids from
    :param data_path: path to generated data
    :param rec_ids: list of SafeHavenIDs in receiver cohort to remove
    :param sup_ids: list of SafeHavenIDs in scale-up cohort to remove
    :return: dataframe with receiver and scale-up rows removed
    """
    print('Saving REC and SUP data')

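    # Split out the receiver and scale-up cohorts, then drop both from the
    # main dataframe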
    df_rec = df[df['SafeHavenID'].isin(rec_ids)]
    df_sup = df[df['SafeHavenID'].isin(sup_ids)]
    df = df[~df['SafeHavenID'].isin(rec_ids + sup_ids)]

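    # Pickle the removed cohorts alongside the other generated data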
    df_rec.to_pickle(data_path + 'merged_rec.pkl')
    df_sup.to_pickle(data_path + 'merged_sup.pkl')

    return df


def save_df_ids(df, data_path, ids, typ):
    """
    Save train, test or validation ids and the corresponding data
    --------
    :param df: dataframe
    :param data_path: path to generated data
    :param ids: list of SafeHavenIDs
    :param typ: type of dataset to create, 'train', 'test' or 'val'
    :return: None
    """
    print('Saving ' + typ + ' data')

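    # Pickle the ID list, then the subset of rows matching those IDs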
    df_ids = pd.DataFrame(ids, columns=['SafeHavenID'])
    df_ids.to_pickle(data_path + typ + '_ids.pkl')
    df_ids_data = df[df['SafeHavenID'].isin(ids)]
    df_ids_data.to_pickle(data_path + 'merged_' + typ + '.pkl')


def df_tts(df, data_path):
    """
    Split data into training, validation and testing sets and save dataframes
    --------
    :param df: pandas dataframe to split
    :param data_path: path to generated data
    :return: None
    """
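    # 60/20/20 train/val/test split: hold out 20% for testing, then take 25%
    # of the remaining 80% for validation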
    ids = df['SafeHavenID'].tolist()
    train_ids, test_ids = train_test_split(
        ids, test_size=0.2, random_state=42)
    train_ids, val_ids = train_test_split(
        train_ids, test_size=0.25, random_state=42)

    save_df_ids(df, data_path, train_ids, 'train')
    save_df_ids(df, data_path, test_ids, 'test')
    save_df_ids(df, data_path, val_ids, 'val')


def main():
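    # config.json is assumed to hold the data directories used below; an
    # illustrative (not actual) layout:
    # {
    #     "model_data_path": "<generated data dir>/",
    #     "rec_data_path": "<receiver lookup dir>/",
    #     "sup_data_path": "<scale-up lookup dir>/"
    # }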
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    data_path = config['model_data_path']
    rec_path = config['rec_data_path'] + 'Cohort3Rand.csv'
    sup_path = config['sup_data_path'] + 'Scale_Up_lookup.csv'

    rec_ids = get_ids(rec_path)
    sup_ids = get_ids(sup_path)

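    # Load the merged dataset and strip out both cohorts before splitting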
    df = pd.read_pickle(data_path + 'merged.pkl')
    df = save_rec_sup(df, data_path, rec_ids, sup_ids)

    df_tts(df, data_path)


if __name__ == '__main__':
    main()