Spaces:

sam-bot
/

get-around-api

Sleeping

File size: 3,479 Bytes

7d87fe9

"""Feature engineering and preprocessing for pricing model."""

import logging
from typing import Tuple

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

logger = logging.getLogger(__name__)


# Column groups used by the pricing model, chosen from EDA. Each group is
# handled by its own step in create_preprocessor().

# Columns that get one-hot encoded.
CATEGORICAL_FEATURES: list[str] = ["model_key", "fuel", "paint_color", "car_type"]

# Flag columns; prepare_features() casts them to int before modelling.
BOOLEAN_FEATURES: list[str] = [
    "private_parking_available",
    "has_gps",
    "has_air_conditioning",
    "automatic_car",
    "has_getaround_connect",
    "has_speed_regulator",
    "winter_tires",
]

# Columns that get standard-scaled.
NUMERICAL_FEATURES: list[str] = ["mileage", "engine_power"]

# Name of the column the model predicts.
TARGET: str = "rental_price_per_day"


def load_data(filepath: str) -> pd.DataFrame:
    """Read the pricing dataset from a CSV file.

    The first column of the CSV is used as the DataFrame index rather than
    being kept as a data column.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The parsed dataset as a DataFrame.

    Raises:
        FileNotFoundError: If no file exists at ``filepath``.
        pd.errors.ParserError: If the file cannot be parsed as CSV.
    """
    logger.info("Loading data from %s", filepath)
    dataset = pd.read_csv(filepath, index_col=0)
    n_rows, n_cols = dataset.shape
    logger.info("Loaded %d rows, %d columns", n_rows, n_cols)
    return dataset


def create_preprocessor() -> ColumnTransformer:
    """Build the (unfitted) column-wise preprocessor for the pricing model.

    Three steps are configured, one per feature group:

    - ``"num"``: ``StandardScaler`` over ``NUMERICAL_FEATURES``.
    - ``"bool"``: passthrough of ``BOOLEAN_FEATURES`` (expected to be 0/1
      already, e.g. after ``prepare_features``).
    - ``"cat"``: ``OneHotEncoder`` over ``CATEGORICAL_FEATURES``, producing a
      dense array and ignoring categories unseen during fitting.

    Any column not named in one of these groups is dropped.

    Returns:
        ColumnTransformer configured for all feature types.
    """
    numeric_step = ("num", StandardScaler(), NUMERICAL_FEATURES)
    boolean_step = ("bool", "passthrough", BOOLEAN_FEATURES)
    categorical_step = (
        "cat",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        CATEGORICAL_FEATURES,
    )

    preprocessor = ColumnTransformer(
        transformers=[numeric_step, boolean_step, categorical_step],
        remainder="drop",
    )
    transformer_count = len(preprocessor.transformers)
    logger.debug("Created preprocessor with %d transformers", transformer_count)
    return preprocessor


def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a dataframe into the feature matrix X and the target y.

    X holds the numerical, boolean and categorical feature columns, in that
    order. Boolean columns are cast to int (0/1) for sklearn compatibility.
    The input dataframe is not modified.

    Note: the int cast is delegated to pandas and will raise if a boolean
    column holds values that cannot be converted (for example missing values).

    Args:
        df: DataFrame with all columns including target.

    Returns:
        Tuple of (X, y) where X is features DataFrame and y is target Series.

    Raises:
        KeyError: If required columns are missing. The message lists the
            missing columns in their declared order.
    """
    feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES

    # Collect missing columns in declared order (not via a set) so the error
    # message is identical from run to run.
    missing_cols = [
        col for col in [*feature_cols, TARGET] if col not in df.columns
    ]
    if missing_cols:
        raise KeyError(f"Missing columns: {missing_cols}")

    # Column selection plus astype yields a new DataFrame, so the caller's
    # data is left untouched; only the boolean columns change dtype.
    X = df[feature_cols].astype(dict.fromkeys(BOOLEAN_FEATURES, int))
    y = df[TARGET].copy()

    logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
    return X, y


def get_feature_names() -> list[str]:
    """List every input feature column used by the model.

    The target column is not included. A new list is built on each call, so
    callers may modify the result freely.

    Returns:
        Feature names ordered as numerical, then boolean, then categorical.
    """
    return [*NUMERICAL_FEATURES, *BOOLEAN_FEATURES, *CATEGORICAL_FEATURES]