# he99codes's picture
# deploy automl
# 211c37c
"""
Feature Engineering: Auto-generates polynomial, interaction, log/sqrt features.
Optional feature selection via SelectKBest.
"""
import numpy as np
import pandas as pd
from typing import Optional, Tuple
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
class FeatureEngineer:
    """Generate engineered features and optionally keep only the k best.

    The output matrix is the horizontal concatenation, in this order, of:

    1. the original columns,
    2. ``log1p`` of each column shifted to be positive (if ``use_log``),
    3. ``sqrt`` of each column shifted to be non-negative (if ``use_sqrt``),
    4. polynomial / interaction terms of degree ``poly_degree`` built from
       the first ``_MAX_POLY_INPUTS`` columns (if ``use_polynomial`` or
       ``use_interactions``).

    The shifts use the per-column minima learned in ``fit_transform`` so that
    ``transform`` yields the same value for a given row regardless of which
    other rows share its batch.

    Args:
        task_type: ``"classification"`` selects ``f_classif`` as the scoring
            function for feature selection; any other value selects
            ``f_regression``.
        use_polynomial: Include full polynomial terms (pure powers and
            interactions). When true, interaction terms are produced
            regardless of ``use_interactions``.
        use_log: Include the shifted ``log1p`` features.
        use_sqrt: Include the shifted ``sqrt`` features.
        use_interactions: Include interaction-only terms when
            ``use_polynomial`` is false.
        select_k: If set and smaller than the number of generated features,
            keep only the ``select_k`` best features via ``SelectKBest``.
        poly_degree: Degree passed to ``PolynomialFeatures``.
    """

    # Polynomial expansion is restricted to the leading columns to keep the
    # number of generated terms from exploding on wide inputs.
    _MAX_POLY_INPUTS = 20

    def __init__(
        self,
        task_type: str,
        use_polynomial: bool = True,
        use_log: bool = True,
        use_sqrt: bool = True,
        use_interactions: bool = True,
        select_k: Optional[int] = None,
        poly_degree: int = 2,
    ):
        self.task_type = task_type
        self.use_polynomial = use_polynomial
        self.use_log = use_log
        self.use_sqrt = use_sqrt
        self.use_interactions = use_interactions
        self.select_k = select_k
        self.poly_degree = poly_degree

        # State learned by fit_transform().
        self._poly: Optional[PolynomialFeatures] = None
        self._selector: Optional[SelectKBest] = None
        self._min: Optional[np.ndarray] = None  # per-column training minima
        self._n_original: int = 0
        self._n_features_out: int = 0
        self._fitted = False

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Learn the transformation from ``X`` (and ``y``) and apply it.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Target values; only used when feature selection is active.

        Returns:
            The engineered (and possibly selected) features as ``float32``.
        """
        X = np.asarray(X)

        # Drop everything learned by a previous fit so that, e.g., a stale
        # selector cannot leak into this fit when selection no longer applies.
        self._fitted = False
        self._poly = None
        self._selector = None

        self._n_original = X.shape[1]
        X_out = self._generate(X, fit=True)

        if self.select_k and self.select_k < X_out.shape[1]:
            score_func = f_classif if self.task_type == "classification" else f_regression
            self._selector = SelectKBest(score_func=score_func, k=self.select_k)
            X_out = self._selector.fit_transform(X_out, y)

        self._n_features_out = X_out.shape[1]
        self._fitted = True
        return X_out.astype(np.float32)

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Apply the fitted transformation to new data.

        Args:
            X: 2-D array with the same number of columns seen during fitting.

        Returns:
            The engineered (and possibly selected) features as ``float32``.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit_transform`` has not
                been called. It subclasses both ``ValueError`` and
                ``AttributeError``.
            ValueError: If ``X`` is not 2-D or its column count differs from
                the training data.
        """
        if not self._fitted:
            # Imported lazily: only needed on this error path.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "FeatureEngineer is not fitted yet; call fit_transform() first."
            )

        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self._n_original:
            raise ValueError(
                f"X has shape {X.shape}; expected (n_samples, {self._n_original})"
            )

        X_out = self._generate(X, fit=False)
        if self._selector is not None:
            X_out = self._selector.transform(X_out)
        return X_out.astype(np.float32)

    def _generate(self, X: np.ndarray, fit: bool) -> np.ndarray:
        """Build the full (pre-selection) engineered feature matrix.

        Args:
            X: 2-D input array.
            fit: When true, learn the per-column minima and fit the
                polynomial expander on ``X``; when false, reuse the ones
                learned earlier.

        Returns:
            Array with the original columns followed by the enabled derived
            features; NaN and +/-inf entries are replaced by 0.
        """
        parts = [X]
        eps = 1e-6

        if fit:
            self._min = X.min(axis=0)

        if self.use_log or self.use_sqrt:
            # Shift by the *training* minima. Values below the training
            # minimum are clipped to 0 so that log/sqrt stay defined.
            X_shifted = np.maximum(X - self._min, 0)

            if self.use_log:
                # eps keeps the argument strictly positive.
                parts.append(np.log1p(X_shifted + eps))

            if self.use_sqrt:
                parts.append(np.sqrt(X_shifted))

        if self.use_polynomial or self.use_interactions:
            X_sub = X[:, : min(self._MAX_POLY_INPUTS, X.shape[1])]
            if fit:
                self._poly = PolynomialFeatures(
                    degree=self.poly_degree,
                    interaction_only=not self.use_polynomial,
                    include_bias=False,
                )
                poly_feats = self._poly.fit_transform(X_sub)
            else:
                poly_feats = self._poly.transform(X_sub)
            # The leading columns are the degree-1 terms, i.e. the inputs
            # themselves, which are already present in parts[0].
            parts.append(poly_feats[:, X_sub.shape[1]:])

        X_out = np.hstack(parts)
        # Replace any NaN/inf so downstream estimators receive finite values.
        return np.nan_to_num(X_out, nan=0.0, posinf=0.0, neginf=0.0)

    @property
    def n_features_out(self) -> int:
        """Number of columns produced by the fitted transformer (0 if unfitted)."""
        if not self._fitted:
            return 0
        return self._n_features_out