File size: 3,479 Bytes
7d87fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""Feature engineering and preprocessing for pricing model."""

import logging
from typing import Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

logger = logging.getLogger(__name__)


# Feature definitions based on EDA.
#
# Each group is a plain ``list`` because the functions in this module build
# the full column list by concatenating the groups with ``+``.

# Columns handed to the OneHotEncoder step of the preprocessor.
CATEGORICAL_FEATURES: list[str] = ["model_key", "fuel", "paint_color", "car_type"]

# Flag columns: cast to int in prepare_features and passed through unchanged
# by the preprocessor.
BOOLEAN_FEATURES: list[str] = [
    "private_parking_available",
    "has_gps",
    "has_air_conditioning",
    "automatic_car",
    "has_getaround_connect",
    "has_speed_regulator",
    "winter_tires",
]

# Columns handed to the StandardScaler step of the preprocessor.
NUMERICAL_FEATURES: list[str] = ["mileage", "engine_power"]

# Name of the column the model predicts.
TARGET: str = "rental_price_per_day"


def load_data(filepath: str) -> pd.DataFrame:
    """Read the pricing dataset from a CSV file.

    The first column of the file is used as the DataFrame index. The path
    and the resulting shape are logged at INFO level.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The parsed DataFrame.

    Raises:
        FileNotFoundError: If no file exists at ``filepath``.
        pd.errors.ParserError: If pandas cannot parse the file as CSV.
    """
    logger.info("Loading data from %s", filepath)

    # index_col=0: the leading CSV column holds row labels, not a feature.
    frame = pd.read_csv(filepath, index_col=0)

    n_rows, n_cols = frame.shape
    logger.info("Loaded %d rows, %d columns", n_rows, n_cols)
    return frame


def create_preprocessor() -> ColumnTransformer:
    """Build the (unfitted) sklearn preprocessor for the feature columns.

    Three named steps are registered, in this order:
    - "num": StandardScaler over NUMERICAL_FEATURES (mileage, engine_power)
    - "bool": passthrough of BOOLEAN_FEATURES (expected to already be 0/1)
    - "cat": OneHotEncoder over CATEGORICAL_FEATURES (model_key, fuel,
      paint_color, car_type), configured with ``handle_unknown="ignore"``
      and ``sparse_output=False``

    Any column not named in one of the steps is dropped.

    Returns:
        ColumnTransformer configured for all feature types.
    """
    numeric_step = ("num", StandardScaler(), NUMERICAL_FEATURES)
    boolean_step = ("bool", "passthrough", BOOLEAN_FEATURES)
    categorical_step = (
        "cat",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        CATEGORICAL_FEATURES,
    )

    preprocessor = ColumnTransformer(
        transformers=[numeric_step, boolean_step, categorical_step],
        remainder="drop",
    )

    transformer_count = len(preprocessor.transformers)
    logger.debug("Created preprocessor with %d transformers", transformer_count)
    return preprocessor


def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Separate a raw dataframe into the feature matrix X and target y.

    X holds the numerical, boolean and categorical feature columns, in that
    order, with every boolean column cast to int (0/1) for sklearn
    compatibility. Both X and y are new objects; ``df`` is not modified.

    Args:
        df: DataFrame containing every feature column plus the target column.

    Returns:
        Tuple of (X, y) where X is the features DataFrame and y is the
        target Series.

    Raises:
        KeyError: If any feature column or the target column is absent.
    """
    feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES

    # Validate up front so all absent columns are reported in one error.
    missing_cols = set(feature_cols + [TARGET]) - set(df.columns)
    if missing_cols:
        raise KeyError(f"Missing columns: {missing_cols}")

    # Selecting then casting yields a fresh frame; only the boolean columns
    # change dtype, the others are carried over as-is.
    bool_as_int = {name: int for name in BOOLEAN_FEATURES}
    X = df[feature_cols].astype(bool_as_int)

    y = df[TARGET].copy()

    logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
    return X, y


def get_feature_names() -> list[str]:
    """Return the names of every feature column used.

    A new list is built on each call, so callers may mutate the result
    without affecting the module-level feature groups.

    Returns:
        Feature names ordered as: numerical, then boolean, then categorical.
    """
    return [*NUMERICAL_FEATURES, *BOOLEAN_FEATURES, *CATEGORICAL_FEATURES]