File size: 7,479 Bytes
b72652e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error


def prepare_data_for_training(df, target_col='Transfer_Fee_2024_GBP'):
    """
    Split a transfer dataset into a feature frame and a log-scaled target.

    Columns removed from the features
    ---------------------------------
    1. The target itself plus identifier / bookkeeping columns
       (Player_ID, Name variants, Transfer_Date, Adjusted_Fee_EUR,
       Inflation_Multiplier, Transfer_Year, player_id). Any of these that
       are absent from ``df`` are silently skipped.
    2. Every remaining column whose lower-cased name contains 'fee' together
       with 'transfer' or 'adjusted' (e.g. Transfer_Fee_EUR). These are
       treated as leakage because they describe the same transfer record
       as the target.

    Columns deliberately kept
    -------------------------
    market_value_in_eur is NOT treated as leakage. The design rationale
    recorded for this module is that market value is known before a transfer
    happens, is supplied by the user at prediction time, and is the signal
    that lets the model separate elite players from everyone else. The model
    is meant to learn how fees deviate from market value, not to ignore it.
    A console message reports whether the column is present.

    Args:
        df: pandas DataFrame containing the features and ``target_col``.
        target_col: name of the fee column to predict.

    Returns:
        tuple ``(X, y)`` where ``X`` is ``df`` without the excluded columns
        and ``y`` is ``np.log1p(df[target_col])``.

    Raises:
        KeyError: if ``target_col`` is missing from ``df`` (raised by the
            final column lookup, i.e. after the feature summary is printed).
    """
    non_feature_columns = [
        target_col, 'Player_ID', 'Name', 'name', 'name_x', 'name_y',
        'Transfer_Date', 'Adjusted_Fee_EUR', 'Inflation_Multiplier',
        'Transfer_Year', 'player_id',
    ]
    X = df.drop(columns=non_feature_columns, errors='ignore')

    def _is_same_record_fee(column):
        # A fee column counts as leaky only when it is also tagged as a
        # transfer fee or an adjusted fee; market_value_in_eur never matches.
        lowered = column.lower()
        return 'fee' in lowered and ('transfer' in lowered or 'adjusted' in lowered)

    X = X.drop(
        columns=[column for column in X.columns if _is_same_record_fee(column)],
        errors='ignore',
    )

    feature_names = sorted(X.columns.tolist())
    print(f"  Training features ({len(X.columns)}): {feature_names}")
    if 'market_value_in_eur' not in X.columns:
        print("  ⚠️  market_value_in_eur NOT FOUND in dataset — check pipeline renaming.")
    else:
        print("  ✅ market_value_in_eur INCLUDED — elite player differentiation enabled.")

    # log1p compresses the heavy right tail of fees and maps a fee of 0 to 0.
    return X, np.log1p(df[target_col])


def compute_sample_weights(y_log):
    """
    Assign a per-record training weight based on the fee tier of each target.

    The weights are meant to be passed to XGBoost as ``sample_weight`` so
    that errors on rare, expensive transfers count for more during training.
    According to the design notes for this module, this replaces an earlier
    SMOTE-for-regression step: re-weighting adds no synthetic rows, it only
    scales each record's contribution.

    Tiers are decided on the real-scale fee (``expm1`` of the input):

        fee > 60,000,000   -> 5.0   (Elite)
        fee > 25,000,000   -> 2.5   (High)
        fee >  5,000,000   -> 1.5   (Mid)
        otherwise          -> 1.0   (Standard)

    A per-tier breakdown and the total effective sample count are printed.

    Args:
        y_log: array-like of ``log1p``-transformed fees.

    Returns:
        numpy array of weights, one per element of ``y_log``.
    """
    y_real = np.expm1(y_log)

    # Apply the tiers from cheapest to most expensive so each higher tier
    # overrides the one below it; anything not above the lowest bound keeps 1.0.
    weights = 1.0
    for lower_bound, tier_weight in (
        (5_000_000, 1.5),
        (25_000_000, 2.5),
        (60_000_000, 5.0),
    ):
        weights = np.where(y_real > lower_bound, tier_weight, weights)

    n_elite = (weights == 5.0).sum()
    n_high = (weights == 2.5).sum()
    n_mid = (weights == 1.5).sum()
    n_standard = (weights == 1.0).sum()
    total_weighted = weights.sum()

    print(f"\n  Sample weight breakdown:")
    print(f"    Elite  (>£60m):    {n_elite:>5,} records × 5.0 "
          f"= {n_elite * 5:.0f} effective samples")
    print(f"    High   (£25-60m):  {n_high:>5,} records × 2.5 "
          f"= {n_high * 2.5:.0f} effective samples")
    print(f"    Mid    (£5-25m):   {n_mid:>5,} records × 1.5 "
          f"= {n_mid * 1.5:.0f} effective samples")
    print(f"    Standard (<£5m):   {n_standard:>5,} records × 1.0 "
          f"= {n_standard * 1.0:.0f} effective samples")
    print(f"    Total effective:   {total_weighted:>7,.0f}")
    return weights


def train_model(X, y, *, test_size=0.2, n_iter=50, cv=5, random_state=42,
                model_path="fairvalue_xgboost.json"):
    """
    Train an XGBoost regressor with RandomizedSearchCV and tiered sample weights.

    Steps:
      1. Hold out ``test_size`` of the rows as a test set.
      2. Compute tiered sample weights from the TRAINING targets only, so the
         hold-out evaluation is unweighted.
      3. Run a randomized hyperparameter search (scored by negative MAE on
         the targets as given, i.e. in log space) and keep the best estimator.
         The weights are forwarded to the estimator's ``fit``.
      4. Evaluate on the hold-out set in real currency units (``expm1`` of
         both targets and predictions): global MAE and RMSE, plus MAE
         restricted to fees above £60m and above £25m.
      5. Optionally save the fitted model.

    Args:
        X: feature matrix (e.g. the frame from ``prepare_data_for_training``).
        y: ``log1p``-transformed target aligned with ``X``.
        test_size: fraction (or absolute count) of rows held out for testing.
        n_iter: number of parameter settings sampled by the search.
        cv: number of cross-validation folds used by the search.
        random_state: seed shared by the split, the estimator and the search.
        model_path: where to save the best model; ``None`` skips saving.

    Returns:
        tuple ``(best_model, mae)``: the refitted best estimator and its
        global hold-out MAE in real currency units.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Compute weights on training set ONLY — test set uses uniform weighting
    sample_weights_train = compute_sample_weights(y_train)

    xgb_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=random_state,
        tree_method='hist'  # histogram-based tree construction
    )

    param_dist = {
        'n_estimators':     [200, 400, 600, 800],
        'learning_rate':    [0.01, 0.03, 0.05, 0.08],
        'max_depth':        [4, 5, 6, 7],
        'min_child_weight': [1, 2, 3, 5],    # Protects against overfitting on rare elites
        'subsample':        [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'gamma':            [0, 0.05, 0.1, 0.2],
        'reg_lambda':       [0.5, 1.0, 1.5, 2.0],  # L2 regularisation
        'reg_alpha':        [0, 0.05, 0.1, 0.2],    # L1 regularisation
    }

    print(f"\nBeginning Hyperparameter Search ({n_iter} iterations, {cv}-fold CV)...")
    search = RandomizedSearchCV(
        xgb_reg,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=random_state,
        verbose=1
    )

    # sample_weight is forwarded to XGBRegressor.fit for every CV fit and
    # for the final refit; it re-weights gradient updates per sample.
    search.fit(X_train, y_train, sample_weight=sample_weights_train)

    best_model = search.best_estimator_
    print(f"\nBest hyperparameters: {search.best_params_}")

    # ── Evaluate on clean hold-out (no sample weights at test time) ───────────
    predictions_real = np.expm1(best_model.predict(X_test))
    # Plain array so the tier masks below index targets and predictions
    # positionally, whatever index y_test carries.
    y_test_real = np.expm1(np.asarray(y_test))

    mae  = mean_absolute_error(y_test_real, predictions_real)
    rmse = np.sqrt(mean_squared_error(y_test_real, predictions_real))

    print(f"\n{'='*55}")
    print(f"  Model Evaluation (held-out test set)")
    print(f"{'='*55}")
    print(f"  Global MAE:  £{mae:>12,.0f}")
    print(f"  Global RMSE: £{rmse:>12,.0f}")

    # Tier-specific performance on expensive transfers; a tier with no
    # hold-out rows is skipped.
    for threshold, label in [(60_000_000, ">£60m"), (25_000_000, ">£25m")]:
        mask = y_test_real > threshold
        n_players = int(mask.sum())
        if n_players > 0:
            tier_mae = mean_absolute_error(
                y_test_real[mask], predictions_real[mask]
            )
            print(f"  Elite MAE ({label}): £{tier_mae:>12,.0f}  [{n_players} players]")

    print(f"{'='*55}")

    if model_path is not None:
        best_model.save_model(model_path)
        print(f"  Model saved: {model_path}")

    return best_model, mae