File size: 2,250 Bytes
b077775 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | import pandas as pd
def preprocess_real_data(train, test, target):
    """Select the model features from the real train/test sets and encode them.

    The input frames are not modified: the function works on copies of the
    selected columns. ``Outstanding_Debt`` is divided by 1000 and
    ``Credit_Mix`` is one-hot encoded with the first level dropped. Both
    splits are encoded against the ``Credit_Mix`` levels observed in
    ``train`` so the two feature matrices always have identical columns;
    a ``Credit_Mix`` value that appears only in ``test`` is encoded as all
    zeros.

    Args:
        train (pd.DataFrame): The real training dataset.
        test (pd.DataFrame): The real testing dataset.
        target (str): The name of the target variable.

    Returns:
        tuple: ``(X_real_train, y_real_train, X_real_test, y_real_test)``,
        i.e. the preprocessed training features, training target, testing
        features, and testing target.
    """
    cols = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target,
    ]

    # Copy the selected columns so the scaling below never writes back into
    # the caller's DataFrames (repeated calls would otherwise rescale again).
    train = train[cols].copy()
    test = test[cols].copy()

    train['Outstanding_Debt'] = train['Outstanding_Debt'] / 1000
    test['Outstanding_Debt'] = test['Outstanding_Debt'] / 1000

    # Derive the category levels from the training split only, in the same
    # order pd.get_dummies would infer them, and apply them to both splits.
    # Encoding each split independently can yield different dummy columns
    # (and a different dropped baseline) when a level is missing from one.
    levels = pd.Categorical(train['Credit_Mix']).categories
    credit_mix_dtype = pd.CategoricalDtype(categories=levels)
    train['Credit_Mix'] = train['Credit_Mix'].astype(credit_mix_dtype)
    test_mix = test['Credit_Mix']
    # Levels unseen in training are masked to missing before the cast, so
    # they end up as an all-zero dummy row.
    test['Credit_Mix'] = test_mix.where(test_mix.isin(levels)).astype(credit_mix_dtype)

    train = pd.get_dummies(train, columns=['Credit_Mix'], drop_first=True)
    test = pd.get_dummies(test, columns=['Credit_Mix'], drop_first=True)

    X_real_train = train.drop(columns=[target])
    y_real_train = train[target]
    X_real_test = test.drop(columns=[target])
    y_real_test = test[target]
    return X_real_train, y_real_train, X_real_test, y_real_test
def preprocess_synthetic_data(synthetic_data, target):
    """Select the model features from the synthetic dataset and encode them.

    The input frame is not modified: the function works on a copy of the
    selected columns. ``Outstanding_Debt`` is divided by 1000 and
    ``Credit_Mix`` is one-hot encoded with the first level dropped.

    Args:
        synthetic_data (pd.DataFrame): The synthetic dataset to preprocess.
        target (str): The name of the target variable.

    Returns:
        tuple: ``(X_synthetic_train, y_synthetic_train)``, i.e. the
        preprocessed synthetic features and the synthetic target.
    """
    selected = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target,
    ]

    # Copy the selected columns so the scaling below never writes back into
    # the caller's DataFrame (repeated calls would otherwise rescale again).
    frame = synthetic_data[selected].copy()
    frame['Outstanding_Debt'] = frame['Outstanding_Debt'] / 1000

    frame = pd.get_dummies(frame, columns=['Credit_Mix'], drop_first=True)

    X_synthetic_train = frame.drop(columns=[target])
    y_synthetic_train = frame[target]
    return X_synthetic_train, y_synthetic_train