Spaces:
Running
Running
| """ | |
| Feature Engineering: Auto-generates polynomial, interaction, log/sqrt features. | |
| Optional feature selection via SelectKBest. | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Optional, Tuple | |
| from sklearn.preprocessing import PolynomialFeatures | |
| from sklearn.feature_selection import SelectKBest, f_classif, f_regression | |
class FeatureEngineer:
    """Generate engineered features and optionally select the k best.

    The output matrix is the horizontal concatenation of, in order:

    1. the original columns of ``X``;
    2. ``log1p`` of each column shifted by its training minimum
       (when ``use_log`` is true);
    3. ``sqrt`` of each column shifted by its training minimum
       (when ``use_sqrt`` is true);
    4. higher-order terms from ``PolynomialFeatures`` built on the first
       ``_MAX_POLY_INPUTS`` columns, with the degree-1 terms dropped because
       they duplicate part 1 (when ``use_polynomial`` or
       ``use_interactions`` is true).

    If ``select_k`` is set and smaller than the number of generated columns,
    a ``SelectKBest`` selector is fitted on the generated matrix and applied.

    Args:
        task_type: ``"classification"`` selects ``f_classif`` as the scoring
            function for feature selection; any other value selects
            ``f_regression``.
        use_polynomial: Include pure power terms. When false (and
            ``use_interactions`` is true) ``PolynomialFeatures`` runs with
            ``interaction_only=True``.
        use_log: Append the shifted ``log1p`` features.
        use_sqrt: Append the shifted ``sqrt`` features.
        use_interactions: Include interaction terms.
        select_k: Number of features to keep, or ``None``/``0`` to keep all.
        poly_degree: Degree passed to ``PolynomialFeatures``.
    """

    # Only the first _MAX_POLY_INPUTS columns feed PolynomialFeatures so the
    # number of generated terms stays bounded.
    _MAX_POLY_INPUTS = 20
    # Small offset added to the shifted values before log1p.
    _EPS = 1e-6

    def __init__(
        self,
        task_type: str,
        use_polynomial: bool = True,
        use_log: bool = True,
        use_sqrt: bool = True,
        use_interactions: bool = True,
        select_k: Optional[int] = None,
        poly_degree: int = 2,
    ):
        self.task_type = task_type
        self.use_polynomial = use_polynomial
        self.use_log = use_log
        self.use_sqrt = use_sqrt
        self.use_interactions = use_interactions
        self.select_k = select_k
        self.poly_degree = poly_degree

        # State learned by fit_transform().
        self._poly: Optional[PolynomialFeatures] = None
        self._selector: Optional[SelectKBest] = None
        self._col_min: Optional[np.ndarray] = None  # per-column training minimum
        self._n_original: int = 0
        self._n_out: int = 0
        self._fitted = False

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit on ``X``/``y`` and return the engineered features as float32.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Target passed to ``SelectKBest`` when selection is active.

        Returns:
            The engineered (and possibly selected) feature matrix, float32.
        """
        X = np.asarray(X)
        self._n_original = X.shape[1]
        # Drop any selector from a previous fit so that a re-fit which skips
        # selection does not leave a stale selector for transform() to apply.
        self._selector = None

        X_out = self._generate(X, fit=True)

        if self.select_k and self.select_k < X_out.shape[1]:
            score_func = (
                f_classif if self.task_type == "classification" else f_regression
            )
            self._selector = SelectKBest(score_func=score_func, k=self.select_k)
            X_out = self._selector.fit_transform(X_out, y)

        self._n_out = X_out.shape[1]
        self._fitted = True
        return X_out.astype(np.float32)

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Apply the fitted feature generation (and selection) to ``X``.

        Args:
            X: 2-D array with the same number of columns seen during fit.

        Returns:
            The engineered feature matrix, float32.

        Raises:
            RuntimeError: If called before ``fit_transform``.
            ValueError: If ``X`` is not 2-D or its column count differs from
                the one seen during fit.
        """
        if not self._fitted:
            raise RuntimeError(
                "FeatureEngineer.transform() called before fit_transform()"
            )
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self._n_original:
            raise ValueError(
                f"X has shape {X.shape}; expected (n_samples, {self._n_original})"
            )

        X_out = self._generate(X, fit=False)
        if self._selector is not None:
            X_out = self._selector.transform(X_out)
        return X_out.astype(np.float32)

    def _generate(self, X: np.ndarray, fit: bool) -> np.ndarray:
        """Build the un-selected feature matrix for ``X``.

        When ``fit`` is true the per-column minimum and the polynomial
        transformer are (re)learned from ``X``; otherwise the stored ones are
        reused so that a sample maps to the same features regardless of which
        batch it arrives in.
        """
        parts = [X]

        if self.use_log or self.use_sqrt:
            if fit:
                self._col_min = X.min(axis=0)
            # Offsets relative to the *training* minimum. Values below that
            # minimum (possible only at transform time) are clipped to 0 so
            # log1p/sqrt stay defined instead of producing NaN.
            shifted = np.maximum(X - self._col_min, 0)

            if self.use_log:
                parts.append(np.log1p(shifted + self._EPS))
            if self.use_sqrt:
                parts.append(np.sqrt(shifted))

        # Polynomial / interaction terms of degree `poly_degree`, no bias column.
        if self.use_polynomial or self.use_interactions:
            X_sub = X[:, : self._MAX_POLY_INPUTS]
            if fit:
                self._poly = PolynomialFeatures(
                    degree=self.poly_degree,
                    interaction_only=not self.use_polynomial,
                    include_bias=False,
                )
                poly_feats = self._poly.fit_transform(X_sub)
            else:
                poly_feats = self._poly.transform(X_sub)
            # With include_bias=False the leading columns are the degree-1
            # terms, i.e. the inputs themselves, which parts[0] already holds.
            parts.append(poly_feats[:, X_sub.shape[1]:])

        X_out = np.hstack(parts)
        # Replace any NaN/inf with 0 so downstream estimators get finite input.
        return np.nan_to_num(X_out, nan=0.0, posinf=0.0, neginf=0.0)

    def n_features_out(self) -> int:
        """Return the number of output columns, or 0 if not fitted yet."""
        if not self._fitted:
            return 0
        return self._n_out