# fairvalue-api/src/models/train_xgboost.py
# Project: FairValue — production web app (React/Vite frontend + FastAPI backend,
# Render/Vercel deployment). Origin commit: b72652e.
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
def prepare_data_for_training(df: pd.DataFrame, target_col: str = 'Transfer_Fee_2024_GBP'):
    """
    Separate features from the target and apply log1p to the target.

    Design decision — market_value_in_eur:
    ────────────────────────────────────────
    Previous code dropped this as "leaky". That was wrong for our use case:
    TRUE leakage = using the SAME transfer's fee data during training
    (e.g. Adjusted_Fee_EUR, Transfer_Fee_EUR from the same record).
    market_value_in_eur is NOT leaky because:
      1. It is a published, pre-transfer datapoint every club already knows.
      2. In the app, users explicitly enter their market value estimate.
      3. Without it, the model cannot distinguish Mbappe from a League Two
         player — all predictions collapse to the population median (~£40m).
      4. The model should learn the RELATIONSHIP between market value and
         actual transfer fee (premiums/discounts), not pretend MV doesn't exist.
    ACTUALLY leaky columns are only those derived from the same transfer record:
      - Transfer_Fee_EUR, Adjusted_Fee_EUR, Inflation_Multiplier, Transfer_Year

    Args:
        df: Merged dataset with one row per historical transfer.
        target_col: Name of the target fee column (2024-GBP adjusted).

    Returns:
        Tuple (X, y) where X is the feature frame and y is log1p(fee).

    Raises:
        KeyError: if ``target_col`` is absent from ``df`` (explicit, early
            check so the failure message points at the pipeline, not pandas).
    """
    # Fail fast with a clear message rather than a bare KeyError deep in the
    # function after the drop work has already run.
    if target_col not in df.columns:
        raise KeyError(
            f"Target column '{target_col}' not found in dataframe — "
            f"check upstream pipeline renaming."
        )

    # Identifiers and same-transaction fee history, dropped by exact name.
    id_and_leaky_cols = [
        target_col, 'Player_ID', 'Name', 'name', 'name_x', 'name_y',
        'Transfer_Date', 'Adjusted_Fee_EUR', 'Inflation_Multiplier',
        'Transfer_Year', 'player_id'
    ]
    X = df.drop(columns=id_and_leaky_cols, errors='ignore')

    # Drop only same-transaction fee history — NOT market_value_in_eur.
    truly_leaky = [
        c for c in X.columns
        if ('fee' in c.lower() and 'transfer' in c.lower())    # e.g. Transfer_Fee_EUR
        or ('adjusted' in c.lower() and 'fee' in c.lower())    # e.g. Adjusted_Fee_EUR
    ]
    X = X.drop(columns=truly_leaky, errors='ignore')

    print(f" Training features ({len(X.columns)}): {sorted(X.columns.tolist())}")
    if 'market_value_in_eur' in X.columns:
        print(" ✅ market_value_in_eur INCLUDED — elite player differentiation enabled.")
    else:
        print(" ⚠️ market_value_in_eur NOT FOUND in dataset — check pipeline renaming.")

    # log1p compresses the heavy right tail of transfer fees so squared-error
    # training isn't dominated by a handful of £100m records.
    y = np.log1p(df[target_col])
    return X, y
def compute_sample_weights(y_log):
    """
    Industry-standard approach to handle transfer fee class imbalance.

    Replaces SMOTE-for-regression, which introduced synthetic noise:
      - SMOTE was designed for classification, not regression
      - Discretising log-price into bins and re-interpolating targets
        introduces distributional artifacts that hurt generalisation
      - XGBoost's native sample_weight is cleaner, faster, and more principled

    sample_weight tells XGBoost "getting this prediction right matters more".
    It does NOT create fake data — it simply re-weights gradient updates.

    Tier thresholds (2024 GBP equivalent, strict '>' at each boundary):
        Elite:    > £60m  → weight 5.0  (Bellingham, Haaland tier)
        High:     £25–60m → weight 2.5  (Premier League regulars)
        Mid:      £5–25m  → weight 1.5  (Championship / squad depth)
        Standard: < £5m   → weight 1.0  (lower leagues)

    Args:
        y_log: log1p-transformed fees (numpy array or pandas Series).

    Returns:
        np.ndarray of per-sample weights aligned with ``y_log``.
    """
    # Invert the log1p applied in prepare_data_for_training so thresholds
    # can be expressed in real currency units.
    y_real = np.expm1(y_log)
    weights = np.where(y_real > 60_000_000, 5.0,
              np.where(y_real > 25_000_000, 2.5,
              np.where(y_real > 5_000_000, 1.5,
                       1.0)))
    # Previously computed as (weights * np.ones_like(y_real)).sum() — the
    # elementwise multiply by ones was a no-op; this is the same total.
    total_weighted = weights.sum()
    print(f"\n Sample weight breakdown:")
    print(f"   Elite    (>£60m):  {(weights == 5.0).sum():>5,} records × 5.0 "
          f"= {(weights == 5.0).sum() * 5:.0f} effective samples")
    print(f"   High     (£25-60m): {(weights == 2.5).sum():>5,} records × 2.5 "
          f"= {(weights == 2.5).sum() * 2.5:.0f} effective samples")
    print(f"   Mid      (£5-25m):  {(weights == 1.5).sum():>5,} records × 1.5 "
          f"= {(weights == 1.5).sum() * 1.5:.0f} effective samples")
    print(f"   Standard (<£5m):   {(weights == 1.0).sum():>5,} records × 1.0 "
          f"= {(weights == 1.0).sum() * 1.0:.0f} effective samples")
    print(f"   Total effective:   {total_weighted:>7,.0f}")
    return weights
def train_model(X, y, model_path: str = "fairvalue_xgboost.json"):
    """
    Train the XGBoost regressor using RandomizedSearchCV + sample weights.

    Key improvements over previous version:
      - sample_weight replaces SMOTE: native, noise-free, principled
      - Added reg_alpha (L1) to complement reg_lambda (L2) regularisation
      - Elite-specific MAE reported alongside global MAE for transparency
      - Weights are computed on train split only (clean test evaluation)
      - Output path is now a parameter (default preserves old behaviour)

    Args:
        X: Feature frame from prepare_data_for_training.
        y: log1p-transformed target fees.
        model_path: Where to save the fitted booster (XGBoost JSON format).

    Returns:
        Tuple (best_model, mae) — fitted XGBRegressor and real-scale test MAE.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Compute weights on training set ONLY — test set uses uniform weighting
    # so hold-out metrics are not biased toward any fee tier.
    sample_weights_train = compute_sample_weights(y_train)

    xgb_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        tree_method='hist'  # Fastest for large datasets
    )

    param_dist = {
        'n_estimators': [200, 400, 600, 800],
        'learning_rate': [0.01, 0.03, 0.05, 0.08],
        'max_depth': [4, 5, 6, 7],
        'min_child_weight': [1, 2, 3, 5],     # Protects against overfitting on rare elites
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.05, 0.1, 0.2],
        'reg_lambda': [0.5, 1.0, 1.5, 2.0],   # L2 regularisation
        'reg_alpha': [0, 0.05, 0.1, 0.2],     # L1 regularisation (new)
    }

    print("\nBeginning Hyperparameter Search (50 iterations, 5-fold CV)...")
    search = RandomizedSearchCV(
        xgb_reg,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    # This is the key line — XGBoost re-weights gradient updates per sample.
    # sklearn slices sample_weight per CV fold alongside X_train/y_train.
    search.fit(X_train, y_train, sample_weight=sample_weights_train)
    best_model = search.best_estimator_
    print(f"\nBest hyperparameters: {search.best_params_}")

    # ── Evaluate on clean hold-out (no sample weights at test time) ───────────
    # expm1 inverts the log1p target transform so errors are in real £.
    predictions_log = best_model.predict(X_test)
    predictions_real = np.expm1(predictions_log)
    y_test_real = np.expm1(y_test)
    mae = mean_absolute_error(y_test_real, predictions_real)
    rmse = np.sqrt(mean_squared_error(y_test_real, predictions_real))

    print(f"\n{'='*55}")
    print(f"  Model Evaluation (held-out test set)")
    print(f"{'='*55}")
    print(f"  Global MAE:  £{mae:>12,.0f}")
    print(f"  Global RMSE: £{rmse:>12,.0f}")

    # Elite-specific performance (the critical metric for our use case)
    for threshold, label in [(60_000_000, ">£60m"), (25_000_000, ">£25m")]:
        mask = y_test_real > threshold
        if mask.sum() > 0:
            tier_mae = mean_absolute_error(
                y_test_real[mask], predictions_real[mask]
            )
            print(f"  Elite MAE ({label}): £{tier_mae:>12,.0f} [{mask.sum()} players]")
    print(f"{'='*55}")

    best_model.save_model(model_path)
    print(f" Model saved: {model_path}")
    return best_model, mae