Spaces:

sam-bot
/

get-around-api

Sleeping

App Files Files Community

get-around-api / src /ml /preprocessing.py

sam-bot

1st commit

7d87fe9 3 months ago

raw

history blame contribute delete

3.48 kB

	"""Feature engineering and preprocessing for pricing model."""

	import logging
	from typing import Tuple

	import pandas as pd
	from sklearn.compose import ColumnTransformer
	from sklearn.preprocessing import OneHotEncoder, StandardScaler

	logger = logging.getLogger(__name__)


	# Feature definitions based on EDA
	CATEGORICAL_FEATURES = [
	"model_key",
	"fuel",
	"paint_color",
	"car_type",
	]

	BOOLEAN_FEATURES = [
	"private_parking_available",
	"has_gps",
	"has_air_conditioning",
	"automatic_car",
	"has_getaround_connect",
	"has_speed_regulator",
	"winter_tires",
	]

	NUMERICAL_FEATURES = [
	"mileage",
	"engine_power",
	]

	TARGET = "rental_price_per_day"


	def load_data(filepath: str) -> pd.DataFrame:
	"""Load pricing dataset from CSV.

	Args:
	filepath: Path to the CSV file.

	Returns:
	DataFrame with loaded data.

	Raises:
	FileNotFoundError: If file does not exist.
	pd.errors.ParserError: If CSV parsing fails.
	"""
	logger.info("Loading data from %s", filepath)
	df = pd.read_csv(filepath, index_col=0)
	logger.info("Loaded %d rows, %d columns", df.shape[0], df.shape[1])
	return df


	def create_preprocessor() -> ColumnTransformer:
	"""Create sklearn preprocessor for features.

	The preprocessor applies:
	- StandardScaler to numerical features (mileage, engine_power)
	- Passthrough for boolean features (already 0/1)
	- OneHotEncoder for categorical features (model_key, fuel, paint_color, car_type)

	Returns:
	ColumnTransformer configured for all feature types.
	"""
	preprocessor = ColumnTransformer(
	transformers=[
	(
	"num",
	StandardScaler(),
	NUMERICAL_FEATURES,
	),
	(
	"bool",
	"passthrough",
	BOOLEAN_FEATURES,
	),
	(
	"cat",
	OneHotEncoder(handle_unknown="ignore", sparse_output=False),
	CATEGORICAL_FEATURES,
	),
	],
	remainder="drop",
	)
	logger.debug(
	"Created preprocessor with %d transformers", len(preprocessor.transformers)
	)
	return preprocessor


	def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
	"""Split dataframe into features X and target y.

	Converts boolean columns to int (0/1) for sklearn compatibility.

	Args:
	df: DataFrame with all columns including target.

	Returns:
	Tuple of (X, y) where X is features DataFrame and y is target Series.

	Raises:
	KeyError: If required columns are missing.
	"""
	required_cols = (
	NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES + [TARGET]
	)
	missing_cols = set(required_cols) - set(df.columns)
	if missing_cols:
	raise KeyError(f"Missing columns: {missing_cols}")

	feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES
	X = df[feature_cols].copy()

	for col in BOOLEAN_FEATURES:
	X[col] = X[col].astype(int)

	y = df[TARGET].copy()

	logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
	return X, y


	def get_feature_names() -> list[str]:
	"""Return list of all feature names used.

	Returns:
	List of feature names in order: numerical, boolean, categorical.
	"""
	return NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES