| |
| import pandas as pd |
|
|
| |
# Root directory for all data files. Replace the placeholder with a real path;
# the trailing slash matters because paths are built by string concatenation.
file_path = '<YOUR_DATA_PATH>/'

# Directory (inside file_path) that holds the input CSV files.
input_file_path = file_path + 'data_for_model_e_columns/'
|
|
|
|
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename (passed straight to pandas.read_csv)
    :return: dataframe holding the parsed file contents
    """
    return pd.read_csv(file)
| |
|
|
def format_data(data, IDs, onboard):
    """
    Parse the timestamp columns as UTC datetimes, derive the one-year censor date,
    restrict the exacerbations data to the supplied study IDs and attach the
    onboarding dates to each row
    --------
    :param data: exacerbations dataframe ('Study_ID' and 'SubmissionTime' columns are used);
        its 'SubmissionTime' column is converted in place
    :param IDs: dataframe containing RC and SU1 study IDs ('Study_ID' column is used)
    :param onboard: dataframe containing onboarding dates ('Study_ID' and 'OB_date' columns
        are used); modified in place: 'OB_date' is converted and 'yearcensor' is added
    :return: formatted dataframe; every study ID in IDs is kept, with missing values
        where no exacerbation or onboarding row matches
    """
    # Both input frames are updated in place, not just the returned copy.
    submission_times = pd.to_datetime(data['SubmissionTime'], utc=True)
    data['SubmissionTime'] = submission_times

    onboarding_dates = pd.to_datetime(onboard['OB_date'], utc=True)
    onboard['OB_date'] = onboarding_dates
    # The year censor is 365 days after the onboarding date.
    onboard['yearcensor'] = onboarding_dates + pd.offsets.DateOffset(days=365)

    # Left joins starting from IDs keep exactly the listed study IDs.
    restricted = IDs.merge(data, on="Study_ID", how="left")
    return restricted.merge(onboard, on="Study_ID", how="left")
|
|
|
|
def filter_study_censor(data, censor_date='2021-09-01'):
    """
    Filter the dataframe to only contain data obtained before the study censor date
    --------
    :param data: dataframe with a datetime 'SubmissionTime' column
    :param censor_date: exclusive upper bound for 'SubmissionTime'; defaults to the
        study censor date '2021-09-01'. It is compared directly against the column,
        so it must be a value pandas can compare with it (e.g. a date string)
    :return: dataframe containing only the rows submitted strictly before censor_date;
        rows with a missing 'SubmissionTime' never satisfy the comparison and are dropped
    """
    before_censor = data['SubmissionTime'] < censor_date
    return data[before_censor]
|
|
|
|
def filter_first_year(data):
    """
    Keep only the rows recorded up to (and including) the one-year censor date
    --------
    :param data: dataframe with 'SubmissionTime' and 'yearcensor' columns
    :return: dataframe containing only the rows whose 'SubmissionTime' is not later
        than their 'yearcensor'
    """
    within_first_year = data['SubmissionTime'] <= data['yearcensor']
    return data.loc[within_first_year]
|
|
|
|
def get_exac_data(data, onboard, IDs):
    """
    Calculate the number of exacerbations to year censor and study censor
    and the length of time to first exacerbation for each study ID, save the
    resulting dataframe as 'PRO_LOGIC_exacerbation_data.csv' under file_path
    and return it
    --------
    :param data: formatted PRO LOGIC exacerbations dataframe; the 'Study_ID',
        'SubmissionTime' and 'yearcensor' columns are used
    :param onboard: Dataframe showing onboarding dates for the study participants; 'OB_date'
        must already be a datetime column that can be subtracted from 'SubmissionTime'
    :param IDs: Dataframe containing all RC and SU1 study IDs
    :return: dataframe showing exacerbation counts and the length of time to first exacerbation for each study ID
    """
    # Exacerbations censored at the study censor date and one year post onboarding.
    censor_data = filter_study_censor(data)
    year_censor_data = filter_first_year(data)

    # reset_index turns Study_ID back into an ordinary column so that every merge
    # below is an unambiguous column-on-column join.
    censor_sum = (
        censor_data.groupby("Study_ID").SubmissionTime
        .agg(first_exacerbation='min', exacerbation_count_to_censor='count')
        .reset_index()
    )
    censor_sum = pd.merge(censor_sum, onboard, on="Study_ID", how="outer")
    time_to_first = censor_sum["first_exacerbation"] - censor_sum["OB_date"]
    censor_sum["days_to_first_exacerbation"] = time_to_first.dt.days

    year_censor_sum = (
        year_censor_data.groupby("Study_ID").SubmissionTime
        .agg(exacerbation_count_to_year='count')
        .reset_index()
    )

    # NOTE(review): study IDs without any exacerbation end up with missing (NaN)
    # counts rather than 0 -- confirm this is what downstream consumers expect.
    PRO_LOGIC_exacerbation_data = pd.merge(censor_sum, year_censor_sum, on="Study_ID", how="outer")
    # The left join from IDs keeps exactly the listed study IDs, in their order.
    PRO_LOGIC_exacerbation_data = pd.merge(IDs, PRO_LOGIC_exacerbation_data, on="Study_ID", how="left")

    PRO_LOGIC_exacerbation_data.to_csv(file_path + 'PRO_LOGIC_exacerbation_data.csv')
    return PRO_LOGIC_exacerbation_data
|
|
|
|
def main():
    """
    Read the three input files, format the exacerbations data and write the
    per-study-ID exacerbation summary
    """
    # Locations of the input files
    exacerbations_file = input_file_path + "PRO_LOGIC_exacerbations_and_dates.csv"
    ids_file = input_file_path + "RC_SU1_IDs.csv"
    onboarding_file = input_file_path + "onboarding_dates.csv"

    # Load the raw data
    exacerbations = read_data(exacerbations_file)
    study_ids = read_data(ids_file)
    onboarding = read_data(onboarding_file)

    # format_data also converts the onboarding dates in place, which the
    # summary step below depends on.
    formatted = format_data(exacerbations, study_ids, onboarding)

    # Build and save the exacerbation summary
    get_exac_data(formatted, onboarding, study_ids)
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not read or write any files.
if __name__ == "__main__":
    main()