CreditScoringSystemSimulation / app /utils /credit_preprocessor.py
Zirok05's picture
test (#1)
bd19734
Raw
History Blame Contribute Delete
14.2 kB
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
class CreditDataPreprocessor(BaseEstimator, TransformerMixin):
# Полный препроцессинг данных
def __init__(self,
NumberOfDependents_fill_value=0,
NumberOfDependents_up_threshold=10,
MonthlyIncome_fill_value=0,
RevolvingUtilizationOfUnsecuredLines_drop_threshold=2,
age_low_drop_threshold=18,
age_up_drop_threshold=80,
DebtRatio_up_threshold=5,
PastDueRiskScore_weights=[1.0, 1.2, 1.3],
NumberRealEstateLoansOrLines_drop_threshold=20,
drop_special_codes=False):
self.NumberOfDependents_fill_value = NumberOfDependents_fill_value
self.NumberOfDependents_up_threshold = NumberOfDependents_up_threshold
self.MonthlyIncome_fill_value = MonthlyIncome_fill_value
self.RevolvingUtilizationOfUnsecuredLines_drop_threshold = RevolvingUtilizationOfUnsecuredLines_drop_threshold
self.age_low_drop_threshold = age_low_drop_threshold
self.age_up_drop_threshold = age_up_drop_threshold
self.DebtRatio_up_threshold = DebtRatio_up_threshold
self.PastDueRiskScore_weights = PastDueRiskScore_weights
self.NumberRealEstateLoansOrLines_drop_threshold = NumberRealEstateLoansOrLines_drop_threshold
self.drop_special_codes = drop_special_codes
def fit(self, X, y=None):
return self
def transform(self, X):
X_copy = X.copy()
X_copy['NumberOfDependents'] = X_copy['NumberOfDependents'].fillna(value=self.NumberOfDependents_fill_value)
X_copy['NumberOfDependents'] = X_copy['NumberOfDependents'].clip(0, self.NumberOfDependents_up_threshold).copy()
X_copy['MonthlyIncomeIsMissing'] = 0
X_copy.loc[X_copy['MonthlyIncome'].isna(), 'MonthlyIncomeIsMissing'] = 1
X_copy['MonthlyIncome'] = X['MonthlyIncome'].fillna(value=self.MonthlyIncome_fill_value)
X_copy['RevolvingUtilizationOverOne'] = 0.0
X_copy.loc[X_copy['RevolvingUtilizationOfUnsecuredLines'] > 1, 'RevolvingUtilizationOverOne'] = 1.0
X_copy['RevolvingUtilizationOfUnsecuredLines'] = X_copy['RevolvingUtilizationOfUnsecuredLines'].clip(0,
1).copy()
X_copy['DebtPayments'] = 0.0
X_copy.loc[X_copy['MonthlyIncome'] == 0, 'DebtPayments'] = X_copy.loc[X_copy['MonthlyIncome'] == 0, 'DebtRatio']
X_copy.loc[X_copy['MonthlyIncome'] != 0, 'DebtPayments'] = X_copy.loc[
X_copy['MonthlyIncome'] != 0, 'DebtRatio'] * \
X_copy.loc[
X_copy['MonthlyIncome'] != 0, 'MonthlyIncome']
X_copy['DebtRatio'] = X_copy['DebtRatio'].clip(0, self.DebtRatio_up_threshold).copy()
X_copy['DebtPayments_over_10k'] = 0.0
X_copy.loc[X_copy['DebtPayments'] > 10000, 'DebtPayments_over_10k'] = 1.0
X_copy['DebtPayments'] = X_copy['DebtPayments'].clip(0, 10000).copy()
X_copy['MonthlyIncome_over_20k'] = 0.0
X_copy.loc[X_copy['MonthlyIncome'] >= 20000, 'MonthlyIncome_over_20k'] = 1.0
X_copy['MonthlyIncome'] = X_copy['MonthlyIncome'].clip(0, 20000)
X_copy['Code96'] = 0.0
X_copy['Code98'] = 0.0
X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 96, 'Code96'] = 1.0
X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 98, 'Code98'] = 1.0
X_copy['PastDueRiskScore'] = (
self.PastDueRiskScore_weights[0] * X_copy['NumberOfTime30-59DaysPastDueNotWorse'] +
self.PastDueRiskScore_weights[1] * X_copy['NumberOfTime60-89DaysPastDueNotWorse'] +
self.PastDueRiskScore_weights[2] * X_copy['NumberOfTimes90DaysLate'])
X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 96, 'PastDueRiskScore'] = 96
X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 98, 'PastDueRiskScore'] = 98
X_copy = X_copy.drop(columns=['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse',
'NumberOfTimes90DaysLate'])
X_copy['NumberOfOpenCreditLinesAndLoans_over_30'] = 0.0
X_copy.loc[X_copy['NumberOfOpenCreditLinesAndLoans'] > 30, 'NumberOfOpenCreditLinesAndLoans_over_30'] = 1.0
X_copy['NumberOfOpenCreditLinesAndLoans'] = X_copy['NumberOfOpenCreditLinesAndLoans'].clip(0, 30).copy()
X_copy['NumberRealEstateLoansOrLines_over_5'] = 0.0
X_copy.loc[X_copy['NumberRealEstateLoansOrLines'] > 5, 'NumberRealEstateLoansOrLines_over_5'] = 1.0
X_copy['NumberRealEstateLoansOrLines'] = X_copy['NumberRealEstateLoansOrLines'].clip(0, 5).copy()
X_copy['ConsumerCredit_Group'] = pd.cut(X_copy['NumberOfOpenCreditLinesAndLoans'],
bins=[0, 1, 2, 6, 15, 31],
labels=[
'0_loans',
'1_loans',
'2-5_loans',
'6-14_loans',
'16-30_loans'
])
consumer_dummy = pd.get_dummies(X_copy['ConsumerCredit_Group'], prefix='Consumer', drop_first=False).astype(
'float')
X_copy['RealEstateLoans_Group'] = pd.cut(X_copy['NumberRealEstateLoansOrLines'],
bins=[-1, 0, 3, 100],
labels=[
'0_loans',
'1-3_loans',
'4+_loans',
])
estate_dummy = pd.get_dummies(X_copy['RealEstateLoans_Group'], prefix='RealEstateLoans',
drop_first=False).astype('float')
X_copy = pd.concat([X_copy, consumer_dummy, estate_dummy], axis=1).copy()
X_copy = X_copy.drop(columns=['ConsumerCredit_Group',
'RealEstateLoans_Group']).copy()
X_copy = X_copy.drop(columns=['Consumer_6-14_loans',
'RealEstateLoans_0_loans']).copy()
X_copy = X_copy.drop(columns=['NumberOfOpenCreditLinesAndLoans',
'NumberRealEstateLoansOrLines',
'MonthlyIncomeIsMissing',
'MonthlyIncome_over_20k',
'Consumer_0_loans',
'NumberOfOpenCreditLinesAndLoans_over_30']).copy()
if self.drop_special_codes:
X_copy = X_copy.drop(columns=['Code96', 'Code98'])
return X_copy
def fit_transform(self, X, y=None):
return self.fit(X, y).transform(X)
def clean_train(self, X, y=None):
mask = (
(X[
'RevolvingUtilizationOfUnsecuredLines'] <= self.RevolvingUtilizationOfUnsecuredLines_drop_threshold) &
(X['age'] >= self.age_low_drop_threshold) &
(X['age'] <= self.age_up_drop_threshold) &
(X['NumberRealEstateLoansOrLines'] <= self.NumberRealEstateLoansOrLines_drop_threshold)
)
X_clean = X[mask].copy()
if y is not None:
y_clean = y[mask].copy()
return X_clean, y_clean
return X_clean
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
class CreditScaler(BaseEstimator, TransformerMixin):
"""
Масштабирует только не-булевые колонки.
Можно задать различные способы масштабирования
"""
def __init__(self, scaler_type='standard'):
"""
Параметр scaler_type - тип scaler'а.
Доступные типы:
- 'standard': StandardScaler (среднее=0, дисперсия=1)
- 'robust': RobustScaler (устойчив к выбросам)
- 'minmax': MinMaxScaler (приводит к [0, 1])
- 'maxabs': MaxAbsScaler (приводит к [-1, 1])
"""
self.boolean_columns = [
'RevolvingUtilizationOverOne',
'DebtPayments_over_10k',
'Code96',
'Code98',
'NumberRealEstateLoansOrLines_over_5',
'Consumer_1_loans',
'Consumer_2-5_loans',
'Consumer_16-30_loans',
'RealEstateLoans_1-3_loans',
'RealEstateLoans_4+_loans'
]
self.scaler_type = scaler_type
self._create_scaler()
# Эти переменные заполнятся во время fit
self.columns_to_scale_ = None
self.n_features_in_ = None
self.feature_names_in_ = None
def _create_scaler(self):
"""Создает scaler по типу"""
if self.scaler_type == 'standard':
self.scaler = StandardScaler()
elif self.scaler_type == 'robust':
self.scaler = RobustScaler()
elif self.scaler_type == 'minmax':
self.scaler = MinMaxScaler()
elif self.scaler_type == 'maxabs':
self.scaler = MaxAbsScaler()
else:
raise ValueError(
f"Unknown scaler_type: {self.scaler_type}. "
f"Available: standard, robust, minmax, maxabs"
)
def fit(self, X, y=None):
"""
Определяет колонки для масштабирования (все, кроме булевых)
и обучает scaler.
"""
self.feature_names_in_ = X.columns.tolist()
self.n_features_in_ = len(self.feature_names_in_)
self.columns_to_scale_ = [
col for col in self.feature_names_in_
if col not in self.boolean_columns
]
self.scaler.fit(X[self.columns_to_scale_])
return self
def transform(self, X, y=None):
"""
Масштабирует только не-булевы колонки.
"""
X_copy = X.copy()
X_copy[self.columns_to_scale_] = self.scaler.transform(X_copy[self.columns_to_scale_])
return X_copy
def fit_transform(self, X, y=None):
return self.fit(X, y).transform(X, y)
def get_feature_names_out(self, input_features=None):
"""Для совместимости с sklearn"""
if input_features is not None:
return input_features
return self.feature_names_in_ if self.feature_names_in_ is not None else []
def set_params(self, **params):
"""Для совместимости с GridSearchCV"""
if 'scaler_type' in params and params['scaler_type'] != self.scaler_type:
self.scaler_type = params['scaler_type']
self._create_scaler()
return super().set_params(**params)
def check_business_rules(age, monthly_income, monthly_debt, debt_ratio,
late_90, late_60_89, late_30_59, credit_lines,
real_estate, utilization, dependents):
# КРИТИЧЕСКИЕ ПРАВИЛА - сразу отказ
if age < 18:
return {
'needs_manual': False,
'message': 'Возраст менее 18 лет - кредит не выдаётся',
'decision': 1 # отказ
}
# СПЕЦИАЛЬНЫЕ БАНКОВСКИЕ КОДЫ - сразу ручной разбор
if (late_90 == 98) or (late_60_89 == 98) or (late_30_59 == 98):
return {
'needs_manual': True,
'message': 'Код 98: Списание долга как безнадежного',
'decision': None
}
if (late_90 == 96) or (late_60_89 == 96) or (late_30_59 == 96):
return {
'needs_manual': True,
'message': 'Код 96: Изъятие залога или реализация имущества',
'decision': None
}
# КРИТИЧЕСКИЕ ПРАВИЛА - сразу ручной разбор
if age > 80:
return {
'needs_manual': True,
'message': 'Возраст > 80 лет - требуется ручной разбор (индивидуальные условия)',
'decision': None
}
if monthly_income > 1000000:
return {
'needs_manual': True,
'message': 'Доход свыше 1,000,000 $ - требуется ручной разбор',
'decision': None
}
if monthly_debt > 1000000:
return {
'needs_manual': True,
'message': 'Платежи свыше 1,000,000 $ - требуется ручной разбор',
'decision': None
}
if utilization > 2:
return {
'needs_manual': True,
'message': 'Использование кредитных средств превышает 200%',
'decision': None
}
if real_estate > 20:
return {
'needs_manual': True,
'message': 'Количество кредитов под залог недвижимости слишком велико - ручной разбор',
'decision': None
}
# 4. ВСЕ ПРОВЕРКИ ПРОЙДЕНЫ - допуск к авторазбору моделью
return {
'needs_manual': False,
'decision': None,
}