# NOTE: commit-header residue from the original hosting page (author: he99codes,
# message: "deploy automl", revision 211c37c) — kept as a comment so the file parses.
"""
Preprocessing Pipeline: Handles numeric, categorical, and text features.
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Optional
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
try:
from category_encoders import TargetEncoder
HAS_TARGET_ENC = True
except ImportError:
HAS_TARGET_ENC = False
class PreprocessingPipeline:
    """Auto-build preprocessing pipeline based on feature types.

    Column handling:
      - numeric: median imputation + standard scaling
      - categorical, <= ``cat_threshold`` uniques (or category_encoders
        missing): most-frequent imputation + one-hot (unknowns ignored)
      - categorical, high cardinality: target encoding when
        category_encoders is installed, else ordinal encoding with
        unknown values mapped to -1
      - text: one TF-IDF vectorizer per column (50 features, 1-2 grams)

    Usage: call :meth:`build` on a representative DataFrame, then
    :meth:`fit_transform` on training data and :meth:`transform` on
    held-out data. Targets are encoded via the ``*_target`` methods.
    """

    def __init__(
        self,
        numeric_cols: List[str],
        categorical_cols: List[str],
        text_cols: List[str],
        task_type: str,
        cat_threshold: int = 15,
    ):
        """Store the column split and task configuration.

        Args:
            numeric_cols: names of numeric feature columns.
            categorical_cols: names of categorical feature columns.
            text_cols: names of free-text feature columns.
            task_type: "classification" (targets label-encoded) or
                anything else (targets cast to float32, i.e. regression).
            cat_threshold: max cardinality for one-hot encoding; above
                this, target/ordinal encoding is used instead.
        """
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols
        self.text_cols = text_cols
        self.task_type = task_type
        self.cat_threshold = cat_threshold
        self.pipeline: Optional[ColumnTransformer] = None  # set by build()
        self.target_encoder: Optional[LabelEncoder] = None  # classification only
        self._fitted = False

    def build(self, df: pd.DataFrame) -> "PreprocessingPipeline":
        """Construct the ColumnTransformer based on cardinality seen in *df*.

        Returns self so calls can be chained (``pp.build(df).fit_transform(...)``).
        """
        transformers = []
        if self.numeric_cols:
            numeric_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ])
            transformers.append(("numeric", numeric_pipeline, self.numeric_cols))
        # Split categoricals by cardinality: low -> one-hot, high -> target/ordinal.
        ohe_cols, te_cols = [], []
        for col in self.categorical_cols:
            n_unique = df[col].nunique()
            if n_unique <= self.cat_threshold or not HAS_TARGET_ENC:
                ohe_cols.append(col)
            else:
                te_cols.append(col)
        if ohe_cols:
            ohe_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
            ])
            transformers.append(("ohe_cat", ohe_pipeline, ohe_cols))
        if te_cols and HAS_TARGET_ENC:
            # HAS_TARGET_ENC guarantees the module-level TargetEncoder import
            # succeeded; the previous redundant local re-import was removed.
            te_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", TargetEncoder()),
            ])
            transformers.append(("te_cat", te_pipeline, te_cols))
        elif te_cols:
            # Fallback: ordinal encode high-cardinality cats; unseen values -> -1.
            ord_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ])
            transformers.append(("ord_cat", ord_pipeline, te_cols))
        for col in self.text_cols:
            # Scalar (string) column selector so TfidfVectorizer receives a
            # 1-D Series of documents, as it expects.
            transformers.append((
                f"text_{col}",
                TfidfVectorizer(max_features=50, ngram_range=(1, 2)),
                col,
            ))
        if not transformers:
            # Fallback: passthrough all columns as numeric
            transformers.append(("passthrough", "passthrough", list(df.columns)))
        self.pipeline = ColumnTransformer(
            transformers=transformers,
            remainder="drop",
            sparse_threshold=0.0,
        )
        return self

    def _prepare(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return *X* with NaN in text columns replaced by "".

        TfidfVectorizer raises on NaN/float entries; numeric and categorical
        columns already have imputers, text columns did not. Copies only
        when there is something to fill.
        """
        if not self.text_cols:
            return X
        X = X.copy()
        for col in self.text_cols:
            if col in X.columns:
                X[col] = X[col].fillna("")
        return X

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> np.ndarray:
        """Fit the pipeline on (X, y) and return the transformed float32 matrix.

        Raises:
            RuntimeError: if :meth:`build` has not been called yet.
        """
        if self.pipeline is None:
            raise RuntimeError("PreprocessingPipeline.build() must be called before fit_transform()")
        Xt = self.pipeline.fit_transform(self._prepare(X), y)
        if sp.issparse(Xt):
            Xt = Xt.toarray()
        self._fitted = True
        # nan_to_num: encoders can emit NaN for pathological inputs; keep matrix finite.
        return np.nan_to_num(Xt.astype(np.float32))

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Transform *X* with the already-fitted pipeline.

        Raises:
            RuntimeError: if the pipeline was never built/fitted.
        """
        if self.pipeline is None or not self._fitted:
            raise RuntimeError("PreprocessingPipeline must be built and fitted before transform()")
        Xt = self.pipeline.transform(self._prepare(X))
        if sp.issparse(Xt):
            Xt = Xt.toarray()
        return np.nan_to_num(Xt.astype(np.float32))

    def fit_transform_target(self, y: pd.Series) -> np.ndarray:
        """Encode the target: label-encode for classification, float32 otherwise."""
        if self.task_type == "classification":
            self.target_encoder = LabelEncoder()
            return self.target_encoder.fit_transform(y)
        return y.values.astype(np.float32)

    def transform_target(self, y: pd.Series) -> np.ndarray:
        """Encode *y* with the encoder fitted by :meth:`fit_transform_target`."""
        if self.task_type == "classification" and self.target_encoder is not None:
            return self.target_encoder.transform(y)
        return y.values.astype(np.float32)

    def inverse_transform_target(self, y: np.ndarray) -> np.ndarray:
        """Map encoded class indices back to original labels (no-op for regression)."""
        if self.task_type == "classification" and self.target_encoder is not None:
            return self.target_encoder.inverse_transform(y)
        return y

    def get_categorical_dims(self, df: pd.DataFrame) -> Dict[str, int]:
        """Return per-categorical-column cardinality + 1.

        NOTE(review): the +1 presumably reserves an extra index for unseen
        categories (e.g. embedding sizing) — confirm against the caller.
        """
        return {col: int(df[col].nunique()) + 1 for col in self.categorical_cols}