get-around-api / src /ml /preprocessing.py
sam-bot's picture
1st commit
7d87fe9
"""Feature engineering and preprocessing for pricing model."""
import logging
from typing import Tuple
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
logger = logging.getLogger(__name__)

# Feature definitions based on EDA.

# Columns that create_preprocessor() one-hot encodes.
CATEGORICAL_FEATURES = ["model_key", "fuel", "paint_color", "car_type"]

# Flag columns: prepare_features() casts them to int and
# create_preprocessor() passes them through unchanged.
BOOLEAN_FEATURES = [
    "private_parking_available",
    "has_gps",
    "has_air_conditioning",
    "automatic_car",
    "has_getaround_connect",
    "has_speed_regulator",
    "winter_tires",
]

# Columns that create_preprocessor() standardises with StandardScaler.
NUMERICAL_FEATURES = ["mileage", "engine_power"]

# Column that prepare_features() returns as the target y.
TARGET = "rental_price_per_day"
def load_data(filepath: str) -> pd.DataFrame:
    """Read the pricing dataset from a CSV file into a DataFrame.

    The first CSV column is used as the DataFrame index (``index_col=0``).
    The path and the resulting shape are logged at INFO level.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The parsed dataset.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        pd.errors.ParserError: If pandas cannot parse the file as CSV.
    """
    logger.info("Loading data from %s", filepath)
    frame = pd.read_csv(filepath, index_col=0)
    n_rows, n_cols = frame.shape
    logger.info("Loaded %d rows, %d columns", n_rows, n_cols)
    return frame
def create_preprocessor() -> ColumnTransformer:
    """Build the ColumnTransformer that encodes the model's input columns.

    Three transformers are registered, in this order:
        - "num": StandardScaler applied to NUMERICAL_FEATURES.
        - "bool": BOOLEAN_FEATURES passed through unchanged.
        - "cat": OneHotEncoder applied to CATEGORICAL_FEATURES, configured
          with ``handle_unknown="ignore"`` and dense output
          (``sparse_output=False``).

    Columns not named in any of those lists are dropped
    (``remainder="drop"``).

    Returns:
        A newly constructed ColumnTransformer; it is not fitted here.
    """
    numeric_step = ("num", StandardScaler(), NUMERICAL_FEATURES)
    boolean_step = ("bool", "passthrough", BOOLEAN_FEATURES)
    categorical_step = (
        "cat",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        CATEGORICAL_FEATURES,
    )

    preprocessor = ColumnTransformer(
        transformers=[numeric_step, boolean_step, categorical_step],
        remainder="drop",
    )

    transformer_count = len(preprocessor.transformers)
    logger.debug("Created preprocessor with %d transformers", transformer_count)
    return preprocessor
def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a dataframe into the feature matrix X and the target y.

    X holds the numerical, boolean and categorical feature columns, in
    that order. Boolean columns are cast to int for sklearn compatibility.
    The input dataframe is not modified.

    Args:
        df: DataFrame containing every feature column and the target column.

    Returns:
        Tuple of (X, y) where X is the features DataFrame and y is the
        target Series.

    Raises:
        KeyError: If any required column is missing. The message lists the
            missing columns in definition order.

    Note:
        The int cast is delegated to ``DataFrame.astype``; values that
        cannot be cast to int (for example missing values) make it raise.
    """
    feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES

    # Build the missing list in definition order rather than from a set:
    # set iteration order for strings varies between runs, which made the
    # error message nondeterministic.
    missing_cols = [
        col for col in feature_cols + [TARGET] if col not in df.columns
    ]
    if missing_cols:
        raise KeyError(f"Missing columns: {missing_cols}")

    # Selecting with a list and then calling astype both return new objects,
    # so the caller's dataframe is left untouched.
    X = df[feature_cols].astype({col: int for col in BOOLEAN_FEATURES})
    y = df[TARGET].copy()

    logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
    return X, y
def get_feature_names() -> list[str]:
    """List every input feature name used by the model.

    Returns:
        A new list holding the numerical names first, then the boolean
        names, then the categorical names.
    """
    return [*NUMERICAL_FEATURES, *BOOLEAN_FEATURES, *CATEGORICAL_FEATURES]