fairvalue-api / src /data /kaggle_loader.py
FairValue
feat: production web app — React/Vite frontend + FastAPI backend with Render/Vercel deployment
b72652e
import kagglehub
import pandas as pd
import os
import shutil
def download_datasets():
    """
    Fetch the Transfermarkt and FBref Kaggle datasets via kagglehub.

    The Transfermarkt download is mandatory: any error it raises propagates
    to the caller. The FBref download is best-effort: on failure a warning
    is printed and ``None`` is returned in its place.

    Returns:
        A tuple ``(tm_path, fb_path)`` holding the values returned by
        ``kagglehub.dataset_download`` for each dataset; ``fb_path`` is
        ``None`` when the FBref download failed.
    """
    transfermarkt_handle = "davidcariboo/player-scores"
    # A popular FBref Kaggle dataset, used for the advanced metrics.
    fbref_handle = "vivovinco/20222023-football-player-stats"

    print("Downloading Transfermarkt dataset...")
    tm_path = kagglehub.dataset_download(transfermarkt_handle)
    print(f"Transfermarkt dataset downloaded locally to {tm_path}")

    print("Downloading FBref dataset...")
    try:
        fb_path = kagglehub.dataset_download(fbref_handle)
        print(f"FBref dataset downloaded locally to {fb_path}")
    except Exception as e:
        # FBref data is optional, so degrade to None instead of aborting.
        print(f"Warning: Could not download FBref dataset: {e}")
        fb_path = None

    return tm_path, fb_path
def load_data(tm_path, fb_path):
    """
    Loads necessary csv files into pandas DataFrames.

    Args:
        tm_path: Directory holding the Transfermarkt CSV files.
            ``players.csv`` and ``appearances.csv`` are read unconditionally;
            ``player_valuations.csv`` is read only if it exists.
        fb_path: Directory holding the FBref CSV file(s). May be ``None``
            (or any path that is not an existing directory), in which case
            no FBref data is loaded.

    Returns:
        A tuple ``(players_df, appearances_df, valuations_df, fbref_df)``.
        ``valuations_df`` is ``None`` when ``player_valuations.csv`` is
        absent; ``fbref_df`` is ``None`` when ``fb_path`` is unusable or
        contains no ``.csv`` file.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when
            ``players.csv`` or ``appearances.csv`` is missing.
    """
    # Transfermarkt data
    players_path = os.path.join(tm_path, "players.csv")
    appearances_path = os.path.join(tm_path, "appearances.csv")
    player_valuations_path = os.path.join(tm_path, "player_valuations.csv")

    players_df = pd.read_csv(players_path)
    appearances_df = pd.read_csv(appearances_path)

    valuations_df = None
    if os.path.exists(player_valuations_path):
        valuations_df = pd.read_csv(player_valuations_path)

    # FBref data
    fbref_df = None
    # isdir (not exists): a plain file path would make os.listdir raise.
    if fb_path and os.path.isdir(fb_path):
        # os.listdir returns entries in arbitrary order; sort so that the
        # same file is picked on every run when several CSVs are present.
        csv_names = sorted(
            name for name in os.listdir(fb_path) if name.endswith('.csv')
        )
        if csv_names:
            # Many FBref datasets use latin1 encoding
            fbref_df = pd.read_csv(
                os.path.join(fb_path, csv_names[0]),
                encoding='latin1',
                low_memory=False,
            )

    return players_df, appearances_df, valuations_df, fbref_df
def get_base_datasets(stage_locally=True):
    """Download the datasets and return them as DataFrames.

    Args:
        stage_locally: When true, every ``.csv`` file from the downloaded
            dataset directories is also copied into
            ``data/raw/transfermarkt`` and ``data/raw/fbref`` under the
            directory two levels above this module's directory.

    Returns:
        The tuple produced by ``load_data`` for the downloaded paths.
    """
    tm_path, fb_path = download_datasets()

    if not stage_locally:
        return load_data(tm_path, fb_path)

    def _copy_csv_files(source_dir, target_dir):
        # Copy only the CSV files; other downloaded artefacts are ignored.
        for entry in os.listdir(source_dir):
            if entry.endswith('.csv'):
                shutil.copy(os.path.join(source_dir, entry), target_dir)

    print("Staging datasets into local data/raw directory...")

    # Resolve the project root relative to this file, not the current
    # working directory.
    module_dir = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(module_dir, '..', '..'))
    tm_raw_dir = os.path.join(project_root, "data", "raw", "transfermarkt")
    fb_raw_dir = os.path.join(project_root, "data", "raw", "fbref")
    for raw_dir in (tm_raw_dir, fb_raw_dir):
        os.makedirs(raw_dir, exist_ok=True)

    _copy_csv_files(tm_path, tm_raw_dir)
    # The FBref download is optional and may have failed.
    if fb_path and os.path.exists(fb_path):
        _copy_csv_files(fb_path, fb_raw_dir)

    print(f"Data staged successfully into {os.path.join(project_root, 'data', 'raw')}.")
    return load_data(tm_path, fb_path)
if __name__ == "__main__":
    # Script entry point: fetch everything, then report what was loaded.
    players, apps, valuations, fbref = get_base_datasets()

    summary = [
        "\n--- Data Loading Summary ---",
        f"Loaded {len(players)} Transfermarkt players.",
        f"Loaded {len(apps)} match appearances.",
    ]
    # Valuations and FBref data are optional, so report them only if present.
    if valuations is not None:
        summary.append(f"Loaded {len(valuations)} transfer valuations.")
    if fbref is not None:
        summary.append(f"Loaded {len(fbref)} FBref player records with advanced metrics like xG and xA.")

    for line in summary:
        print(line)