Spaces:

Britzzy
/

fairvalue-api

Running

File size: 7,479 Bytes

b72652e

import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error


def prepare_data_for_training(df, target_col='Transfer_Fee_2024_GBP'):
    """
    Separates features from target and applies log1p to the target.

    Design decision — market_value_in_eur:
    ────────────────────────────────────────
    Previous code dropped this as "leaky". That was wrong for our use case:

    TRUE leakage = using the SAME transfer's fee data during training
    (e.g. Adjusted_Fee_EUR, Transfer_Fee_EUR from the same record).

    market_value_in_eur is NOT leaky because:
      1. It is a published, pre-transfer datapoint every club already knows.
      2. In the app, users explicitly enter their market value estimate.
      3. Without it, the model cannot distinguish Mbappe from a League Two
         player — all predictions collapse to the population median (~£40m).
      4. The model should learn the RELATIONSHIP between market value and
         actual transfer fee (premiums/discounts), not pretend MV doesn't exist.

    ACTUALLY leaky columns are only those derived from the same transfer record:
      - Transfer_Fee_EUR, Adjusted_Fee_EUR, Inflation_Multiplier, Transfer_Year
    """
    X = df.drop(
        columns=[
            target_col, 'Player_ID', 'Name', 'name', 'name_x', 'name_y',
            'Transfer_Date', 'Adjusted_Fee_EUR', 'Inflation_Multiplier',
            'Transfer_Year', 'player_id'
        ],
        errors='ignore'
    )

    # Drop only same-transaction fee history — NOT market_value_in_eur
    truly_leaky = [
        c for c in X.columns
        if ('fee' in c.lower() and 'transfer' in c.lower())   # e.g. Transfer_Fee_EUR
        or ('adjusted' in c.lower() and 'fee' in c.lower())   # e.g. Adjusted_Fee_EUR
    ]
    X = X.drop(columns=truly_leaky, errors='ignore')

    print(f"  Training features ({len(X.columns)}): {sorted(X.columns.tolist())}")
    if 'market_value_in_eur' in X.columns:
        print("  ✅ market_value_in_eur INCLUDED — elite player differentiation enabled.")
    else:
        print("  ⚠️  market_value_in_eur NOT FOUND in dataset — check pipeline renaming.")

    y = np.log1p(df[target_col])
    return X, y


def compute_sample_weights(y_log):
    """
    Industry-standard approach to handle transfer fee class imbalance.

    Replaces SMOTE-for-regression, which introduced synthetic noise:
      - SMOTE was designed for classification, not regression
      - Discretising log-price into bins and re-interpolating targets
        introduces distributional artifacts that hurt generalisation
      - XGBoost's native sample_weight is cleaner, faster, and more principled

    sample_weight tells XGBoost "getting this prediction right matters more".
    It does NOT create fake data — it simply re-weights gradient updates.

    Tier thresholds (2024 GBP equivalent):
      Elite:     > £60m    → weight 5.0   (Bellingham, Haaland tier)
      High:      £25–60m   → weight 2.5   (Premier League regulars)
      Mid:       £5–25m    → weight 1.5   (Championship / squad depth)
      Standard:  < £5m     → weight 1.0   (lower leagues)
    """
    y_real = np.expm1(y_log)
    weights = np.where(y_real > 60_000_000, 5.0,
              np.where(y_real > 25_000_000, 2.5,
              np.where(y_real >  5_000_000, 1.5,
              1.0)))

    total_weighted = (weights * np.ones_like(y_real)).sum()
    print(f"\n  Sample weight breakdown:")
    print(f"    Elite  (>£60m):    {(weights == 5.0).sum():>5,} records × 5.0 "
          f"= {(weights == 5.0).sum() * 5:.0f} effective samples")
    print(f"    High   (£25-60m):  {(weights == 2.5).sum():>5,} records × 2.5 "
          f"= {(weights == 2.5).sum() * 2.5:.0f} effective samples")
    print(f"    Mid    (£5-25m):   {(weights == 1.5).sum():>5,} records × 1.5 "
          f"= {(weights == 1.5).sum() * 1.5:.0f} effective samples")
    print(f"    Standard (<£5m):   {(weights == 1.0).sum():>5,} records × 1.0 "
          f"= {(weights == 1.0).sum() * 1.0:.0f} effective samples")
    print(f"    Total effective:   {total_weighted:>7,.0f}")
    return weights


def train_model(X, y):
    """
    Trains the XGBoost regressor using RandomizedSearchCV + sample weights.

    Key improvements over previous version:
      - sample_weight replaces SMOTE: native, noise-free, principled
      - Added reg_alpha (L1) to complement reg_lambda (L2) regularisation
      - Elite-specific MAE reported alongside global MAE for transparency
      - Weights are computed on train split only (clean test evaluation)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Compute weights on training set ONLY — test set uses uniform weighting
    sample_weights_train = compute_sample_weights(y_train)

    xgb_reg = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        tree_method='hist'  # Fastest for large datasets
    )

    param_dist = {
        'n_estimators':     [200, 400, 600, 800],
        'learning_rate':    [0.01, 0.03, 0.05, 0.08],
        'max_depth':        [4, 5, 6, 7],
        'min_child_weight': [1, 2, 3, 5],    # Protects against overfitting on rare elites
        'subsample':        [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'gamma':            [0, 0.05, 0.1, 0.2],
        'reg_lambda':       [0.5, 1.0, 1.5, 2.0],  # L2 regularisation
        'reg_alpha':        [0, 0.05, 0.1, 0.2],    # L1 regularisation (new)
    }

    print("\nBeginning Hyperparameter Search (50 iterations, 5-fold CV)...")
    search = RandomizedSearchCV(
        xgb_reg,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    # This is the key line — XGBoost re-weights gradient updates per sample
    search.fit(X_train, y_train, sample_weight=sample_weights_train)

    best_model = search.best_estimator_
    print(f"\nBest hyperparameters: {search.best_params_}")

    # ── Evaluate on clean hold-out (no sample weights at test time) ───────────
    predictions_log  = best_model.predict(X_test)
    predictions_real = np.expm1(predictions_log)
    y_test_real      = np.expm1(y_test)

    mae  = mean_absolute_error(y_test_real, predictions_real)
    rmse = np.sqrt(mean_squared_error(y_test_real, predictions_real))

    print(f"\n{'='*55}")
    print(f"  Model Evaluation (held-out test set)")
    print(f"{'='*55}")
    print(f"  Global MAE:  £{mae:>12,.0f}")
    print(f"  Global RMSE: £{rmse:>12,.0f}")

    # Elite-specific performance (the critical metric for our use case)
    for threshold, label in [(60_000_000, ">£60m"), (25_000_000, ">£25m")]:
        mask = y_test_real > threshold
        if mask.sum() > 0:
            tier_mae = mean_absolute_error(
                y_test_real[mask], predictions_real[mask]
            )
            print(f"  Elite MAE ({label}): £{tier_mae:>12,.0f}  [{mask.sum()} players]")

    print(f"{'='*55}")

    best_model.save_model("fairvalue_xgboost.json")
    print("  Model saved: fairvalue_xgboost.json")

    return best_model, mae