# fairvalue-api/src/models/train_xgboost.py
# Project: FairValue — production web app (React/Vite frontend + FastAPI backend,
# Render/Vercel deployment). Origin commit: b72652e.
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
def prepare_data_for_training(df: pd.DataFrame, target_col: str = 'Transfer_Fee_2024_GBP'):
    """
    Separate features from the target and apply log1p to the target.

    Design decision — market_value_in_eur:
    ────────────────────────────────────────
    Previous code dropped this as "leaky". That was wrong for our use case:
    TRUE leakage = using the SAME transfer's fee data during training
    (e.g. Adjusted_Fee_EUR, Transfer_Fee_EUR from the same record).
    market_value_in_eur is NOT leaky because:
      1. It is a published, pre-transfer datapoint every club already knows.
      2. In the app, users explicitly enter their market value estimate.
      3. Without it, the model cannot distinguish Mbappe from a League Two
         player — all predictions collapse to the population median (~£40m).
      4. The model should learn the RELATIONSHIP between market value and
         actual transfer fee (premiums/discounts), not pretend MV doesn't exist.
    ACTUALLY leaky columns are only those derived from the same transfer record:
      - Transfer_Fee_EUR, Adjusted_Fee_EUR, Inflation_Multiplier, Transfer_Year

    Args:
        df: Merged dataset with one row per historical transfer.
        target_col: Name of the target fee column (2024-GBP adjusted).

    Returns:
        Tuple (X, y) where X is the feature frame and y is log1p(fee).

    Raises:
        KeyError: if ``target_col`` is absent from ``df`` (explicit, early
            check so the failure message points at the pipeline, not pandas).
    """
    # Fail fast with a clear message rather than a bare KeyError deep in the
    # function after the drop work has already run.
    if target_col not in df.columns:
        raise KeyError(
            f"Target column '{target_col}' not found in dataframe — "
            f"check upstream pipeline renaming."
        )

    # Identifiers and same-transaction fee history, dropped by exact name.
    id_and_leaky_cols = [
        target_col, 'Player_ID', 'Name', 'name', 'name_x', 'name_y',
        'Transfer_Date', 'Adjusted_Fee_EUR', 'Inflation_Multiplier',
        'Transfer_Year', 'player_id'
    ]
    X = df.drop(columns=id_and_leaky_cols, errors='ignore')

    # Drop only same-transaction fee history — NOT market_value_in_eur.
    truly_leaky = [
        c for c in X.columns
        if ('fee' in c.lower() and 'transfer' in c.lower())    # e.g. Transfer_Fee_EUR
        or ('adjusted' in c.lower() and 'fee' in c.lower())    # e.g. Adjusted_Fee_EUR
    ]
    X = X.drop(columns=truly_leaky, errors='ignore')

    print(f" Training features ({len(X.columns)}): {sorted(X.columns.tolist())}")
    if 'market_value_in_eur' in X.columns:
        print(" ✅ market_value_in_eur INCLUDED — elite player differentiation enabled.")
    else:
        print(" ⚠️ market_value_in_eur NOT FOUND in dataset — check pipeline renaming.")

    # log1p compresses the heavy right tail of transfer fees so squared-error
    # training isn't dominated by a handful of £100m records.
    y = np.log1p(df[target_col])
    return X, y
def compute_sample_weights(y_log):
    """
    Industry-standard approach to handle transfer fee class imbalance.

    Replaces SMOTE-for-regression, which introduced synthetic noise:
      - SMOTE was designed for classification, not regression
      - Discretising log-price into bins and re-interpolating targets
        introduces distributional artifacts that hurt generalisation
      - XGBoost's native sample_weight is cleaner, faster, and more principled

    sample_weight tells XGBoost "getting this prediction right matters more".
    It does NOT create fake data — it simply re-weights gradient updates.

    Tier thresholds (2024 GBP equivalent, strict '>' at each boundary):
        Elite:    > £60m  → weight 5.0  (Bellingham, Haaland tier)
        High:     £25–60m → weight 2.5  (Premier League regulars)
        Mid:      £5–25m  → weight 1.5  (Championship / squad depth)
        Standard: < £5m   → weight 1.0  (lower leagues)

    Args:
        y_log: log1p-transformed fees (numpy array or pandas Series).

    Returns:
        np.ndarray of per-sample weights aligned with ``y_log``.
    """
    # Invert the log1p applied in prepare_data_for_training so thresholds
    # can be expressed in real currency units.
    y_real = np.expm1(y_log)
    weights = np.where(y_real > 60_000_000, 5.0,
              np.where(y_real > 25_000_000, 2.5,
              np.where(y_real > 5_000_000, 1.5,
                       1.0)))
    # Previously computed as (weights * np.ones_like(y_real)).sum() — the
    # elementwise multiply by ones was a no-op; this is the same total.
    total_weighted = weights.sum()
    print(f"\n Sample weight breakdown:")
    print(f"   Elite    (>£60m):  {(weights == 5.0).sum():>5,} records × 5.0 "
          f"= {(weights == 5.0).sum() * 5:.0f} effective samples")
    print(f"   High     (£25-60m): {(weights == 2.5).sum():>5,} records × 2.5 "
          f"= {(weights == 2.5).sum() * 2.5:.0f} effective samples")
    print(f"   Mid      (£5-25m):  {(weights == 1.5).sum():>5,} records × 1.5 "
          f"= {(weights == 1.5).sum() * 1.5:.0f} effective samples")
    print(f"   Standard (<£5m):   {(weights == 1.0).sum():>5,} records × 1.0 "
          f"= {(weights == 1.0).sum() * 1.0:.0f} effective samples")
    print(f"   Total effective:   {total_weighted:>7,.0f}")
    return weights
def train_model(X, y, model_path: str = "fairvalue_xgboost.json"):
    """
    Train the XGBoost regressor using RandomizedSearchCV + sample weights.

    Key improvements over previous version:
      - sample_weight replaces SMOTE: native, noise-free, principled
      - Added reg_alpha (L1) to complement reg_lambda (L2) regularisation
      - Elite-specific MAE reported alongside global MAE for transparency
      - Weights are computed on train split only (clean test evaluation)
      - Output path is now a parameter (default preserves old behaviour)

    Args:
        X: Feature frame from prepare_data_for_training.
        y: log1p-transformed target fees.
        model_path: Where to save the fitted booster (XGBoost JSON format).

    Returns:
        Tuple (best_model, mae) — fitted XGBRegressor and real-scale test MAE.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Compute weights on training set ONLY — test set uses uniform weighting
    # so hold-out metrics are not biased toward any fee tier.
    sample_weights_train = compute_sample_weights(y_train)

    xgb_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        tree_method='hist'  # Fastest for large datasets
    )

    param_dist = {
        'n_estimators': [200, 400, 600, 800],
        'learning_rate': [0.01, 0.03, 0.05, 0.08],
        'max_depth': [4, 5, 6, 7],
        'min_child_weight': [1, 2, 3, 5],     # Protects against overfitting on rare elites
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.05, 0.1, 0.2],
        'reg_lambda': [0.5, 1.0, 1.5, 2.0],   # L2 regularisation
        'reg_alpha': [0, 0.05, 0.1, 0.2],     # L1 regularisation (new)
    }

    print("\nBeginning Hyperparameter Search (50 iterations, 5-fold CV)...")
    search = RandomizedSearchCV(
        xgb_reg,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    # This is the key line — XGBoost re-weights gradient updates per sample.
    # sklearn slices sample_weight per CV fold alongside X_train/y_train.
    search.fit(X_train, y_train, sample_weight=sample_weights_train)
    best_model = search.best_estimator_
    print(f"\nBest hyperparameters: {search.best_params_}")

    # ── Evaluate on clean hold-out (no sample weights at test time) ───────────
    # expm1 inverts the log1p target transform so errors are in real £.
    predictions_log = best_model.predict(X_test)
    predictions_real = np.expm1(predictions_log)
    y_test_real = np.expm1(y_test)
    mae = mean_absolute_error(y_test_real, predictions_real)
    rmse = np.sqrt(mean_squared_error(y_test_real, predictions_real))

    print(f"\n{'='*55}")
    print(f"  Model Evaluation (held-out test set)")
    print(f"{'='*55}")
    print(f"  Global MAE:  £{mae:>12,.0f}")
    print(f"  Global RMSE: £{rmse:>12,.0f}")

    # Elite-specific performance (the critical metric for our use case)
    for threshold, label in [(60_000_000, ">£60m"), (25_000_000, ">£25m")]:
        mask = y_test_real > threshold
        if mask.sum() > 0:
            tier_mae = mean_absolute_error(
                y_test_real[mask], predictions_real[mask]
            )
            print(f"  Elite MAE ({label}): £{tier_mae:>12,.0f} [{mask.sum()} players]")
    print(f"{'='*55}")

    best_model.save_model(model_path)
    print(f" Model saved: {model_path}")
    return best_model, mae