File size: 3,540 Bytes
211c37c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
Feature Engineering: Auto-generates polynomial, interaction, log/sqrt features.
Optional feature selection via SelectKBest.
"""

import numpy as np
import pandas as pd
from typing import Optional, Tuple

from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, f_regression


class FeatureEngineer:
    """Generates and optionally selects engineered features.

    Appends log1p, sqrt, polynomial, and interaction features to the input
    matrix, then optionally keeps the ``select_k`` best columns via
    SelectKBest (scored with f_classif for classification, f_regression
    otherwise). Call ``fit_transform`` on training data first; ``transform``
    then reuses the fit-time statistics on new data.
    """

    def __init__(
        self,
        task_type: str,
        use_polynomial: bool = True,
        use_log: bool = True,
        use_sqrt: bool = True,
        use_interactions: bool = True,
        select_k: Optional[int] = None,
        poly_degree: int = 2,
    ):
        self.task_type = task_type
        self.use_polynomial = use_polynomial
        self.use_log = use_log
        self.use_sqrt = use_sqrt
        self.use_interactions = use_interactions
        self.select_k = select_k
        self.poly_degree = poly_degree

        self._poly: Optional[PolynomialFeatures] = None
        self._selector: Optional[SelectKBest] = None
        self._n_original: int = 0
        # Per-column minima captured at fit time; transform() must reuse the
        # fit-time shift rather than recompute it from the (possibly tiny,
        # differently-distributed) inference batch.
        self._col_min: Optional[np.ndarray] = None
        self._n_features_out: int = 0
        self._fitted = False

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit the generators/selector on (X, y) and return the new matrix."""
        self._n_original = X.shape[1]
        X_out = self._generate(X, fit=True)

        if self.select_k and self.select_k < X_out.shape[1]:
            score_func = f_classif if self.task_type == "classification" else f_regression
            self._selector = SelectKBest(score_func=score_func, k=self.select_k)
            X_out = self._selector.fit_transform(X_out, y)

        # BUG FIX: record the actual output width so n_features_out works.
        self._n_features_out = X_out.shape[1]
        self._fitted = True
        return X_out.astype(np.float32)

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Apply the fitted feature generation/selection to new data.

        Raises:
            RuntimeError: if called before fit_transform().
        """
        # BUG FIX: fail loudly instead of crashing later on self._poly=None
        # (or silently shifting by test-batch minima when polynomials are off).
        if not self._fitted:
            raise RuntimeError("FeatureEngineer.transform() called before fit_transform()")
        X_out = self._generate(X, fit=False)
        if self._selector is not None:
            X_out = self._selector.transform(X_out)
        return X_out.astype(np.float32)

    def _generate(self, X: np.ndarray, fit: bool) -> np.ndarray:
        """Stack raw, log, sqrt, and polynomial features column-wise."""
        parts = [X]
        eps = 1e-6

        if fit:
            # Remember the shift so transform() reproduces it exactly.
            self._col_min = X.min(axis=0)

        if self.use_log or self.use_sqrt:
            # Shift columns to be non-negative using the *fit-time* minima
            # (BUG FIX: previously recomputed per call, so transform() applied
            # a different shift than fit). Clamp at 0 in case unseen data
            # dips below the fit-time minimum.
            X_shifted = np.maximum(X - self._col_min, 0.0)

        # Log transform (shifted to strictly positive before the log)
        if self.use_log:
            parts.append(np.log1p(X_shifted + eps))

        # Sqrt transform
        if self.use_sqrt:
            parts.append(np.sqrt(X_shifted))

        # Polynomial / interaction features (degree `poly_degree`, no bias)
        if self.use_polynomial or self.use_interactions:
            # Limit to the first 20 features to avoid combinatorial explosion.
            X_sub = X[:, :min(20, X.shape[1])]
            if fit:
                self._poly = PolynomialFeatures(
                    degree=self.poly_degree,
                    interaction_only=not self.use_polynomial,
                    include_bias=False,
                )
                poly_feats = self._poly.fit_transform(X_sub)
            else:
                poly_feats = self._poly.transform(X_sub)

            # Drop the degree-1 columns: they duplicate parts[0].
            n_orig_sub = X_sub.shape[1]
            parts.append(poly_feats[:, n_orig_sub:])

        X_out = np.hstack(parts)
        # Guard against NaN/inf produced by the transforms above.
        return np.nan_to_num(X_out, nan=0.0, posinf=0.0, neginf=0.0)

    @property
    def n_features_out(self) -> int:
        """Number of output features produced by fit_transform (0 before fit)."""
        # BUG FIX: previously returned the placeholder -1 after fitting.
        return self._n_features_out if self._fitted else 0