Spaces:
Sleeping
Sleeping
| """Feature engineering and preprocessing for pricing model.""" | |
| import logging | |
| from typing import Tuple | |
| import pandas as pd | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Feature definitions based on EDA.
# The three feature lists are concatenated with `+` elsewhere in this module,
# so they must remain lists; their order determines the column order of X.

# Columns that create_preprocessor() one-hot encodes.
CATEGORICAL_FEATURES = [
    "model_key",
    "fuel",
    "paint_color",
    "car_type",
]

# Columns that prepare_features() casts to int and that
# create_preprocessor() passes through unchanged.
BOOLEAN_FEATURES = [
    "private_parking_available",
    "has_gps",
    "has_air_conditioning",
    "automatic_car",
    "has_getaround_connect",
    "has_speed_regulator",
    "winter_tires",
]

# Columns that create_preprocessor() standardises with StandardScaler.
NUMERICAL_FEATURES = [
    "mileage",
    "engine_power",
]

# Name of the column that prepare_features() returns as the target y.
TARGET = "rental_price_per_day"
def load_data(filepath: str) -> pd.DataFrame:
    """Read the pricing dataset from a CSV file.

    The first column of the CSV is used as the DataFrame index.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The parsed dataset as a DataFrame.

    Raises:
        FileNotFoundError: If file does not exist.
        pd.errors.ParserError: If CSV parsing fails.
    """
    logger.info("Loading data from %s", filepath)
    dataset = pd.read_csv(filepath, index_col=0)

    # Unpack the shape once so the log call reads naturally.
    n_rows, n_cols = dataset.shape
    logger.info("Loaded %d rows, %d columns", n_rows, n_cols)
    return dataset
def create_preprocessor() -> ColumnTransformer:
    """Build the sklearn ColumnTransformer used to preprocess features.

    Three transformers are registered, in this order:
        - "num": StandardScaler over NUMERICAL_FEATURES
        - "bool": passthrough of BOOLEAN_FEATURES
        - "cat": OneHotEncoder over CATEGORICAL_FEATURES, ignoring unknown
          categories and producing dense output

    Columns not listed in any of the three groups are dropped.

    Returns:
        ColumnTransformer configured for all feature types (not yet fitted).
    """
    numeric_step = ("num", StandardScaler(), NUMERICAL_FEATURES)
    boolean_step = ("bool", "passthrough", BOOLEAN_FEATURES)
    categorical_step = (
        "cat",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        CATEGORICAL_FEATURES,
    )

    steps = [numeric_step, boolean_step, categorical_step]
    preprocessor = ColumnTransformer(transformers=steps, remainder="drop")

    logger.debug(
        "Created preprocessor with %d transformers", len(preprocessor.transformers)
    )
    return preprocessor
def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split dataframe into features X and target y.

    Boolean feature columns are cast to int for sklearn compatibility.
    X and y are built as new objects rather than modified in place on ``df``.

    Args:
        df: DataFrame with all feature columns and the target column.

    Returns:
        Tuple of (X, y) where X is the features DataFrame, with columns in
        the order numerical, boolean, categorical, and y is the target
        Series.

    Raises:
        KeyError: If required columns are missing. The message lists the
            missing column names in sorted order.
    """
    feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES

    # Sort the missing names: interpolating a raw set would make the error
    # message order vary between runs (string hash randomisation).
    missing_cols = sorted(set(feature_cols + [TARGET]) - set(df.columns))
    if missing_cols:
        raise KeyError(f"Missing columns: {missing_cols}")

    # Selecting with a list and calling astype() both return new objects,
    # so the caller's frame is never modified.
    X = df[feature_cols].astype({col: int for col in BOOLEAN_FEATURES})
    y = df[TARGET].copy()

    logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
    return X, y
def get_feature_names() -> list[str]:
    """List every feature column name used by the model.

    Returns:
        A new list of feature names in order: numerical, boolean, categorical.
    """
    return [*NUMERICAL_FEATURES, *BOOLEAN_FEATURES, *CATEGORICAL_FEATURES]