import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
def prepare_data_for_training(df, target_col='Transfer_Fee_2024_GBP'):
    """
    Separates features from the target and applies log1p to the target
    (a round-trip sketch follows this function).

    Design decision: market_value_in_eur
    ────────────────────────────────────
    Previous code dropped this column as "leaky". That was wrong for our
    use case. TRUE leakage means using the SAME transfer's fee data during
    training (e.g. Adjusted_Fee_EUR, Transfer_Fee_EUR from the same record).

    market_value_in_eur is NOT leaky because:
      1. It is a published, pre-transfer datapoint every club already knows.
      2. In the app, users explicitly enter their market value estimate.
      3. Without it, the model cannot distinguish Mbappe from a League Two
         player, so all predictions collapse to the population median (~£40m).
      4. The model should learn the RELATIONSHIP between market value and
         actual transfer fee (premiums/discounts), not pretend MV doesn't exist.

    The ACTUALLY leaky columns are only those derived from the same transfer
    record: Transfer_Fee_EUR, Adjusted_Fee_EUR, Inflation_Multiplier,
    Transfer_Year.
    """
    X = df.drop(
        columns=[
            target_col, 'Player_ID', 'Name', 'name', 'name_x', 'name_y',
            'Transfer_Date', 'Adjusted_Fee_EUR', 'Inflation_Multiplier',
            'Transfer_Year', 'player_id'
        ],
        errors='ignore'
    )

    # Drop only same-transaction fee history, NOT market_value_in_eur.
    truly_leaky = [
        c for c in X.columns
        if ('fee' in c.lower() and 'transfer' in c.lower())      # e.g. Transfer_Fee_EUR
        or ('adjusted' in c.lower() and 'fee' in c.lower())      # e.g. Adjusted_Fee_EUR
    ]
    X = X.drop(columns=truly_leaky, errors='ignore')

    print(f"  Training features ({len(X.columns)}): {sorted(X.columns.tolist())}")
    if 'market_value_in_eur' in X.columns:
        print("  ✅ market_value_in_eur INCLUDED: elite player differentiation enabled.")
    else:
        print("  ⚠️ market_value_in_eur NOT FOUND in dataset; check pipeline renaming.")

    y = np.log1p(df[target_col])
    return X, y
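
# Round-trip sketch (made-up values, illustration only): training on
# log1p(fee) keeps a £100m transfer from dominating the squared-error loss,
# while np.expm1 recovers real-currency fees at prediction time.
#
#   fees = np.array([500_000.0, 5_000_000.0, 100_000_000.0])
#   assert np.allclose(np.expm1(np.log1p(fees)), fees)
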
def compute_sample_weights(y_log):
    """
    Industry-standard approach to handling the imbalanced fee distribution
    (a worked example follows this function).

    Replaces SMOTE-for-regression, which introduced synthetic noise:
      - SMOTE was designed for classification, not regression.
      - Discretising log-price into bins and re-interpolating targets
        introduces distributional artifacts that hurt generalisation.
      - XGBoost's native sample_weight is cleaner, faster, and more principled.

    sample_weight tells XGBoost "getting this prediction right matters more".
    It does NOT create fake data; it simply re-weights gradient updates.

    Tier thresholds (2024 GBP equivalent):
      Elite:    > £60m  → weight 5.0  (Bellingham, Haaland tier)
      High:     £25–60m → weight 2.5  (Premier League regulars)
      Mid:      £5–25m  → weight 1.5  (Championship / squad depth)
      Standard: < £5m   → weight 1.0  (lower leagues)
    """
    y_real = np.expm1(y_log)
    weights = np.where(y_real > 60_000_000, 5.0,
              np.where(y_real > 25_000_000, 2.5,
              np.where(y_real > 5_000_000, 1.5,
                       1.0)))
    total_weighted = weights.sum()

    print("\n  Sample weight breakdown:")
    print(f"    Elite    (>£60m):   {(weights == 5.0).sum():>5,} records × 5.0 "
          f"= {(weights == 5.0).sum() * 5.0:.0f} effective samples")
    print(f"    High     (£25-60m): {(weights == 2.5).sum():>5,} records × 2.5 "
          f"= {(weights == 2.5).sum() * 2.5:.0f} effective samples")
    print(f"    Mid      (£5-25m):  {(weights == 1.5).sum():>5,} records × 1.5 "
          f"= {(weights == 1.5).sum() * 1.5:.0f} effective samples")
    print(f"    Standard (<£5m):    {(weights == 1.0).sum():>5,} records × 1.0 "
          f"= {(weights == 1.0).sum() * 1.0:.0f} effective samples")
    print(f"    Total effective:    {total_weighted:>7,.0f}")
    return weights
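
# Worked example (assumed fees, illustration only): £80m / £30m / £10m / £1m
# land in the Elite / High / Mid / Standard tiers respectively, so the call
# below returns array([5., 2.5, 1.5, 1.]) alongside the printed breakdown.
#
#   fees_gbp = np.array([80e6, 30e6, 10e6, 1e6])
#   weights = compute_sample_weights(np.log1p(fees_gbp))
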
def train_model(X, y):
    """
    Trains the XGBoost regressor using RandomizedSearchCV + sample weights
    (a usage sketch follows this function).

    Key improvements over the previous version:
      - sample_weight replaces SMOTE: native, noise-free, principled
      - reg_alpha (L1) added to complement reg_lambda (L2) regularisation
      - Tier-specific MAE reported alongside global MAE for transparency
      - Weights are computed on the train split only (clean test evaluation)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Compute weights on the training set ONLY; the test set stays unweighted.
    sample_weights_train = compute_sample_weights(y_train)

    xgb_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        tree_method='hist'   # Fastest for large datasets
    )

    param_dist = {
        'n_estimators': [200, 400, 600, 800],
        'learning_rate': [0.01, 0.03, 0.05, 0.08],
        'max_depth': [4, 5, 6, 7],
        'min_child_weight': [1, 2, 3, 5],    # Protects against overfitting on rare elites
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.05, 0.1, 0.2],
        'reg_lambda': [0.5, 1.0, 1.5, 2.0],  # L2 regularisation
        'reg_alpha': [0, 0.05, 0.1, 0.2],    # L1 regularisation (new)
    }

    print("\nBeginning hyperparameter search (50 iterations, 5-fold CV)...")
    search = RandomizedSearchCV(
        xgb_reg,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    # The key line: sample_weight is routed to XGBoost, which re-weights each
    # sample's contribution to the gradient updates during boosting.
    search.fit(X_train, y_train, sample_weight=sample_weights_train)
    best_model = search.best_estimator_
    print(f"\nBest hyperparameters: {search.best_params_}")

    # ── Evaluate on the clean hold-out (no sample weights at test time) ──────
    predictions_log = best_model.predict(X_test)
    predictions_real = np.expm1(predictions_log)
    y_test_real = np.expm1(y_test)

    mae = mean_absolute_error(y_test_real, predictions_real)
    rmse = np.sqrt(mean_squared_error(y_test_real, predictions_real))

    print(f"\n{'=' * 55}")
    print("  Model Evaluation (held-out test set)")
    print(f"{'=' * 55}")
    print(f"  Global MAE:  £{mae:>12,.0f}")
    print(f"  Global RMSE: £{rmse:>12,.0f}")

    # Tier-specific performance (the critical metric for our use case)
    for threshold, label in [(60_000_000, ">£60m"), (25_000_000, ">£25m")]:
        mask = y_test_real > threshold
        if mask.sum() > 0:
            tier_mae = mean_absolute_error(
                y_test_real[mask], predictions_real[mask]
            )
            print(f"  Tier MAE ({label}): £{tier_mae:>12,.0f}  [{mask.sum()} players]")
    print(f"{'=' * 55}")

    best_model.save_model("fairvalue_xgboost.json")
    print("  Model saved: fairvalue_xgboost.json")
    return best_model, mae
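

if __name__ == "__main__":
    # Minimal end-to-end sketch. The CSV filename and its schema are
    # assumptions; point this at the real output of the feature pipeline.
    df = pd.read_csv("transfers_features.csv")   # hypothetical path
    X, y = prepare_data_for_training(df)
    model, mae = train_model(X, y)
    print(f"\nPipeline complete. Hold-out MAE: £{mae:,.0f}")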