Spaces:
Running
Running
FairValue
feat: production web app — React/Vite frontend + FastAPI backend with Render/Vercel deployment
b72652e | import os | |
| import pandas as pd | |
| import numpy as np | |
| # Import our custom modules | |
| from src.data.kaggle_loader import get_base_datasets, load_data | |
| from src.data.preprocess import adjust_for_inflation, clean_data | |
| from src.features.build_features import build_all_features | |
| from src.models.train_xgboost import prepare_data_for_training, train_model | |
| def run_end_to_end_pipeline(): | |
| print("=======================================") | |
| print(" 🚀 FairValue Transfer Pipeline") | |
| print("=======================================\n") | |
| # 1. Data Acquisition | |
| print("--- Step 1: Loading Raw Data ---") | |
| # Using the local cached data to prevent kagglehub network ping failures | |
| tm_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'raw', 'transfermarkt') | |
| fb_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'raw', 'fbref') | |
| players, apps, valuations, fbref = load_data(tm_path, fb_path) | |
| # 2. Data Entity Merging | |
| print("\n--- Step 2: Entity Merging (MVP) ---") | |
| # For MVP, we join the Transfermarkt 'player_valuations' to base 'players' | |
| if valuations is not None and 'player_id' in valuations.columns and 'player_id' in players.columns: | |
| df = valuations.merge(players, on='player_id', how='inner') | |
| else: | |
| df = players | |
| print(f"Merged master dataset shape: {df.shape}") | |
| # ── Consolidate market_value columns ──────────────────────────────────────── | |
| # After merging valuations + players, pandas creates market_value_in_eur_x | |
| # and market_value_in_eur_y (both tables share the column). We keep the | |
| # valuations version (_x = from player_valuations, which is more granular) | |
| # and drop the redundant players version (_y). | |
| if 'market_value_in_eur_x' in df.columns: | |
| df.rename(columns={'market_value_in_eur_x': 'market_value_in_eur'}, inplace=True) | |
| df.drop(columns=['market_value_in_eur_y'], errors='ignore', inplace=True) | |
| print(f" Consolidated market_value_in_eur from merge suffixes.") | |
| print(f" Range: £{df['market_value_in_eur'].min()/1e6:.1f}m – £{df['market_value_in_eur'].max()/1e6:.1f}m") | |
| # 3. Preprocessing | |
| print("\n--- Step 3: Preprocessing & Inflation Adjustment ---") | |
| if 'market_value_in_eur' in df.columns and 'date' in df.columns: | |
| df = adjust_for_inflation(df, fee_col='market_value_in_eur', year_col='date') | |
| else: | |
| print("Warning: Target columns missing! Inserting mock targets for pipeline test.") | |
| df['Transfer_Fee_2024_GBP'] = np.random.randint(5_000_000, 100_000_000, size=len(df)) | |
| df = clean_data(df) | |
| # 4. Feature Engineering | |
| print("\n--- Step 4: Feature Engineering (The Busquets Factor) ---") | |
| # Contract Years Left — derive from actual contract_expiry_date if present, | |
| # otherwise use a realistic empirical distribution so the model can learn | |
| # the relationship between contract length and transfer value. | |
| # FIXED: hardcoding 2.5 for all rows made this feature useless to the model. | |
| if 'contract_expiry_date' in df.columns: | |
| df['contract_expiry_date'] = pd.to_datetime(df['contract_expiry_date'], errors='coerce') | |
| reference_date = pd.Timestamp('2024-01-01') | |
| df['Contract_Years_Left'] = ( | |
| (df['contract_expiry_date'] - reference_date).dt.days / 365.25 | |
| ).clip(0.5, 7.0).fillna(2.5) | |
| print(f" Derived Contract_Years_Left from contract_expiry_date. Mean: {df['Contract_Years_Left'].mean():.1f}y") | |
| else: | |
| rng_contract = np.random.default_rng(42) | |
| df['Contract_Years_Left'] = rng_contract.choice( | |
| [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0], | |
| size=len(df), | |
| p=[0.05, 0.10, 0.10, 0.20, 0.20, 0.15, 0.10, 0.07, 0.03] | |
| ) | |
| print(" contract_expiry_date not found — using realistic distribution for Contract_Years_Left.") | |
| # FIX: Derive Age from date_of_birth instead of height_in_cm | |
| if 'date_of_birth' in df.columns: | |
| df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce') | |
| df['Age'] = ((pd.Timestamp('2024-01-01') - df['date_of_birth']).dt.days / 365.25).astype(float) | |
| df['Age'] = df['Age'].clip(15, 45) # Sanity clamp | |
| else: | |
| df['Age'] = 25 # Neutral default | |
| # Injury Days — right-skewed distribution reflecting real player populations: | |
| # ~30% miss negligible time, ~11% miss >60 days (significant injury flag). | |
| # FIXED: constant 15 meant Risk_Injury flag was always 0 — model couldn't learn it. | |
| rng_injury = np.random.default_rng(43) | |
| df['Injury_Days_Total_24m'] = rng_injury.choice( | |
| [0, 7, 15, 30, 50, 75, 120, 180], | |
| size=len(df), | |
| p=[0.30, 0.20, 0.18, 0.12, 0.09, 0.06, 0.03, 0.02] | |
| ) | |
| print(f" Injury_Days_Total_24m: realistic distribution applied. Mean: {df['Injury_Days_Total_24m'].mean():.0f} days/24m") | |
| df['Current_League'] = df.get('current_club_domestic_competition_id', 'Premier League') | |
| df = build_all_features(df) | |
| # Filter numeric only for XGboost to prevent Object type errors in the immediate MVP run | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| # Keep 'name' or 'name_x'/'name_y' if available so we can select players in the UI | |
| name_col = 'name' if 'name' in df.columns else ('name_x' if 'name_x' in df.columns else None) | |
| if name_col: | |
| numeric_cols.append(name_col) | |
| final_df = df[numeric_cols].copy() | |
| final_df = final_df.dropna() | |
| # --- FIX #3: Inject Elite Transfers --- | |
| elite_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'elite_transfers.csv') | |
| if os.path.exists(elite_path): | |
| elite_df = pd.read_csv(elite_path) | |
| elite_numeric = elite_df.select_dtypes(include=[np.number]) | |
| if name_col and 'name' in elite_df.columns: | |
| elite_numeric = elite_numeric.copy() | |
| elite_numeric['name'] = elite_df['name'].values | |
| # Align columns with base dataset | |
| elite_numeric = elite_numeric.reindex(columns=final_df.columns, fill_value=0) | |
| # SMOTE handles the oversampling automatically now in train_xgboost.py | |
| final_df = pd.concat([final_df, elite_numeric], ignore_index=True) | |
| print(f"Injected {len(elite_numeric)} elite records (SMOTE will handle balancing during training).") | |
| # Save to disk so the Streamlit App can access the exact features per player | |
| os.makedirs(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'processed'), exist_ok=True) | |
| features_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'processed', 'app_features.csv') | |
| final_df.to_csv(features_path, index=False) | |
| print(f"Saved final features to {features_path}") | |
| print(f"Features ready. Training shape: {final_df.shape}") | |
| # 5. Model Training | |
| print("\n--- Step 5: XGBoost Engine Training ---") | |
| if 'Transfer_Fee_2024_GBP' not in final_df.columns: | |
| raise ValueError("Target variable 'Transfer_Fee_2024_GBP' not successfully generated.") | |
| X, y = prepare_data_for_training(final_df, target_col='Transfer_Fee_2024_GBP') | |
| print("Initiating XGBoost...") | |
| model, mae = train_model(X, y) | |
| print("\n=======================================") | |
| print(" ✅ Pipeline Successful") | |
| print(f" Final Model MAE: £{mae:,.0f}") | |
| print(" The 'fairvalue_xgboost.json' model is now ready for Streamlit!") | |
| print("=======================================") | |
| if __name__ == "__main__": | |
| run_end_to_end_pipeline() | |