Spaces:
Running
Running
FairValue
feat: production web app — React/Vite frontend + FastAPI backend with Render/Vercel deployment
b72652e | import xgboost as xgb | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split, RandomizedSearchCV | |
| from sklearn.metrics import mean_absolute_error, mean_squared_error | |
| def prepare_data_for_training(df, target_col='Transfer_Fee_2024_GBP'): | |
| """ | |
| Separates features from target and applies log1p to the target. | |
| Design decision — market_value_in_eur: | |
| ──────────────────────────────────────── | |
| Previous code dropped this as "leaky". That was wrong for our use case: | |
| TRUE leakage = using the SAME transfer's fee data during training | |
| (e.g. Adjusted_Fee_EUR, Transfer_Fee_EUR from the same record). | |
| market_value_in_eur is NOT leaky because: | |
| 1. It is a published, pre-transfer datapoint every club already knows. | |
| 2. In the app, users explicitly enter their market value estimate. | |
| 3. Without it, the model cannot distinguish Mbappe from a League Two | |
| player — all predictions collapse to the population median (~£40m). | |
| 4. The model should learn the RELATIONSHIP between market value and | |
| actual transfer fee (premiums/discounts), not pretend MV doesn't exist. | |
| ACTUALLY leaky columns are only those derived from the same transfer record: | |
| - Transfer_Fee_EUR, Adjusted_Fee_EUR, Inflation_Multiplier, Transfer_Year | |
| """ | |
| X = df.drop( | |
| columns=[ | |
| target_col, 'Player_ID', 'Name', 'name', 'name_x', 'name_y', | |
| 'Transfer_Date', 'Adjusted_Fee_EUR', 'Inflation_Multiplier', | |
| 'Transfer_Year', 'player_id' | |
| ], | |
| errors='ignore' | |
| ) | |
| # Drop only same-transaction fee history — NOT market_value_in_eur | |
| truly_leaky = [ | |
| c for c in X.columns | |
| if ('fee' in c.lower() and 'transfer' in c.lower()) # e.g. Transfer_Fee_EUR | |
| or ('adjusted' in c.lower() and 'fee' in c.lower()) # e.g. Adjusted_Fee_EUR | |
| ] | |
| X = X.drop(columns=truly_leaky, errors='ignore') | |
| print(f" Training features ({len(X.columns)}): {sorted(X.columns.tolist())}") | |
| if 'market_value_in_eur' in X.columns: | |
| print(" ✅ market_value_in_eur INCLUDED — elite player differentiation enabled.") | |
| else: | |
| print(" ⚠️ market_value_in_eur NOT FOUND in dataset — check pipeline renaming.") | |
| y = np.log1p(df[target_col]) | |
| return X, y | |
| def compute_sample_weights(y_log): | |
| """ | |
| Industry-standard approach to handle transfer fee class imbalance. | |
| Replaces SMOTE-for-regression, which introduced synthetic noise: | |
| - SMOTE was designed for classification, not regression | |
| - Discretising log-price into bins and re-interpolating targets | |
| introduces distributional artifacts that hurt generalisation | |
| - XGBoost's native sample_weight is cleaner, faster, and more principled | |
| sample_weight tells XGBoost "getting this prediction right matters more". | |
| It does NOT create fake data — it simply re-weights gradient updates. | |
| Tier thresholds (2024 GBP equivalent): | |
| Elite: > £60m → weight 5.0 (Bellingham, Haaland tier) | |
| High: £25–60m → weight 2.5 (Premier League regulars) | |
| Mid: £5–25m → weight 1.5 (Championship / squad depth) | |
| Standard: < £5m → weight 1.0 (lower leagues) | |
| """ | |
| y_real = np.expm1(y_log) | |
| weights = np.where(y_real > 60_000_000, 5.0, | |
| np.where(y_real > 25_000_000, 2.5, | |
| np.where(y_real > 5_000_000, 1.5, | |
| 1.0))) | |
| total_weighted = (weights * np.ones_like(y_real)).sum() | |
| print(f"\n Sample weight breakdown:") | |
| print(f" Elite (>£60m): {(weights == 5.0).sum():>5,} records × 5.0 " | |
| f"= {(weights == 5.0).sum() * 5:.0f} effective samples") | |
| print(f" High (£25-60m): {(weights == 2.5).sum():>5,} records × 2.5 " | |
| f"= {(weights == 2.5).sum() * 2.5:.0f} effective samples") | |
| print(f" Mid (£5-25m): {(weights == 1.5).sum():>5,} records × 1.5 " | |
| f"= {(weights == 1.5).sum() * 1.5:.0f} effective samples") | |
| print(f" Standard (<£5m): {(weights == 1.0).sum():>5,} records × 1.0 " | |
| f"= {(weights == 1.0).sum() * 1.0:.0f} effective samples") | |
| print(f" Total effective: {total_weighted:>7,.0f}") | |
| return weights | |
| def train_model(X, y): | |
| """ | |
| Trains the XGBoost regressor using RandomizedSearchCV + sample weights. | |
| Key improvements over previous version: | |
| - sample_weight replaces SMOTE: native, noise-free, principled | |
| - Added reg_alpha (L1) to complement reg_lambda (L2) regularisation | |
| - Elite-specific MAE reported alongside global MAE for transparency | |
| - Weights are computed on train split only (clean test evaluation) | |
| """ | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42 | |
| ) | |
| # Compute weights on training set ONLY — test set uses uniform weighting | |
| sample_weights_train = compute_sample_weights(y_train) | |
| xgb_reg = xgb.XGBRegressor( | |
| objective='reg:squarederror', | |
| random_state=42, | |
| tree_method='hist' # Fastest for large datasets | |
| ) | |
| param_dist = { | |
| 'n_estimators': [200, 400, 600, 800], | |
| 'learning_rate': [0.01, 0.03, 0.05, 0.08], | |
| 'max_depth': [4, 5, 6, 7], | |
| 'min_child_weight': [1, 2, 3, 5], # Protects against overfitting on rare elites | |
| 'subsample': [0.7, 0.8, 0.9, 1.0], | |
| 'colsample_bytree': [0.7, 0.8, 0.9, 1.0], | |
| 'gamma': [0, 0.05, 0.1, 0.2], | |
| 'reg_lambda': [0.5, 1.0, 1.5, 2.0], # L2 regularisation | |
| 'reg_alpha': [0, 0.05, 0.1, 0.2], # L1 regularisation (new) | |
| } | |
| print("\nBeginning Hyperparameter Search (50 iterations, 5-fold CV)...") | |
| search = RandomizedSearchCV( | |
| xgb_reg, | |
| param_distributions=param_dist, | |
| n_iter=50, | |
| cv=5, | |
| scoring='neg_mean_absolute_error', | |
| n_jobs=-1, | |
| random_state=42, | |
| verbose=1 | |
| ) | |
| # This is the key line — XGBoost re-weights gradient updates per sample | |
| search.fit(X_train, y_train, sample_weight=sample_weights_train) | |
| best_model = search.best_estimator_ | |
| print(f"\nBest hyperparameters: {search.best_params_}") | |
| # ── Evaluate on clean hold-out (no sample weights at test time) ─────────── | |
| predictions_log = best_model.predict(X_test) | |
| predictions_real = np.expm1(predictions_log) | |
| y_test_real = np.expm1(y_test) | |
| mae = mean_absolute_error(y_test_real, predictions_real) | |
| rmse = np.sqrt(mean_squared_error(y_test_real, predictions_real)) | |
| print(f"\n{'='*55}") | |
| print(f" Model Evaluation (held-out test set)") | |
| print(f"{'='*55}") | |
| print(f" Global MAE: £{mae:>12,.0f}") | |
| print(f" Global RMSE: £{rmse:>12,.0f}") | |
| # Elite-specific performance (the critical metric for our use case) | |
| for threshold, label in [(60_000_000, ">£60m"), (25_000_000, ">£25m")]: | |
| mask = y_test_real > threshold | |
| if mask.sum() > 0: | |
| tier_mae = mean_absolute_error( | |
| y_test_real[mask], predictions_real[mask] | |
| ) | |
| print(f" Elite MAE ({label}): £{tier_mae:>12,.0f} [{mask.sum()} players]") | |
| print(f"{'='*55}") | |
| best_model.save_model("fairvalue_xgboost.json") | |
| print(" Model saved: fairvalue_xgboost.json") | |
| return best_model, mae | |