File size: 3,423 Bytes
b72652e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import kagglehub
import pandas as pd
import os
import shutil

def download_datasets():
    """
    Fetch the Transfermarkt and FBref datasets through kagglehub.

    The Transfermarkt download is mandatory: any error it raises propagates
    to the caller. The FBref download is best-effort: if it raises, a
    warning is printed and ``None`` is returned in place of its path.

    Returns:
        A ``(tm_path, fb_path)`` tuple holding the values returned by
        ``kagglehub.dataset_download`` for each dataset; ``fb_path`` is
        ``None`` when the FBref download failed.
    """
    transfermarkt_handle = "davidcariboo/player-scores"
    # Using a popular FBref Kaggle dataset for advanced metrics
    fbref_handle = "vivovinco/20222023-football-player-stats"

    print("Downloading Transfermarkt dataset...")
    transfermarkt_dir = kagglehub.dataset_download(transfermarkt_handle)
    print(f"Transfermarkt dataset downloaded locally to {transfermarkt_dir}")

    print("Downloading FBref dataset...")
    try:
        fbref_dir = kagglehub.dataset_download(fbref_handle)
        print(f"FBref dataset downloaded locally to {fbref_dir}")
    except Exception as err:
        # FBref is optional: report the problem and carry on without it.
        print(f"Warning: Could not download FBref dataset: {err}")
        fbref_dir = None

    return transfermarkt_dir, fbref_dir

def load_data(tm_path, fb_path):
    """
    Loads necessary csv files into pandas DataFrames.

    Args:
        tm_path: Directory holding the Transfermarkt CSV files.
            ``players.csv`` and ``appearances.csv`` are required;
            ``player_valuations.csv`` is loaded only if it exists.
        fb_path: Directory holding the FBref CSV file(s). May be ``None``
            (or anything that is not an existing directory), in which case
            no FBref data is loaded.

    Returns:
        A ``(players_df, appearances_df, valuations_df, fbref_df)`` tuple.
        ``valuations_df`` and ``fbref_df`` are ``None`` when their source
        file is not available.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when
            ``players.csv`` or ``appearances.csv`` is missing.
    """
    # Transfermarkt data
    players_path = os.path.join(tm_path, "players.csv")
    appearances_path = os.path.join(tm_path, "appearances.csv")
    player_valuations_path = os.path.join(tm_path, "player_valuations.csv")

    players_df = pd.read_csv(players_path)
    appearances_df = pd.read_csv(appearances_path)

    valuations_df = None
    if os.path.exists(player_valuations_path):
        valuations_df = pd.read_csv(player_valuations_path)

    # FBref data
    fbref_df = None
    # isdir (not exists): a path that points at a file cannot be listed, and
    # the FBref data is optional, so skip it instead of crashing.
    if fb_path and os.path.isdir(fb_path):
        # os.listdir returns entries in arbitrary order; sort so the same
        # file is chosen on every machine when several CSVs are present.
        csv_names = sorted(
            name for name in os.listdir(fb_path) if name.endswith('.csv')
        )
        if csv_names:
            # Many FBref datasets use latin1 encoding
            # NOTE(review): the file is parsed with pandas' default comma
            # separator -- confirm that matches the dataset's delimiter.
            fbref_df = pd.read_csv(
                os.path.join(fb_path, csv_names[0]),
                encoding='latin1',
                low_memory=False,
            )

    return players_df, appearances_df, valuations_df, fbref_df

def get_base_datasets(stage_locally=True):
    """Download the datasets, optionally mirror their CSVs locally, and load them.

    Args:
        stage_locally: When true, every top-level ``.csv`` file of the
            downloaded datasets is copied into ``data/raw/transfermarkt``
            and ``data/raw/fbref`` under the directory two levels above
            this file's directory.

    Returns:
        The tuple produced by ``load_data`` for the downloaded paths.
    """
    tm_path, fb_path = download_datasets()

    if not stage_locally:
        return load_data(tm_path, fb_path)

    def _copy_csvs(src_dir, dest_dir):
        # Copy only the top-level .csv files of src_dir into dest_dir.
        for entry in os.listdir(src_dir):
            if entry.endswith('.csv'):
                shutil.copy(os.path.join(src_dir, entry), dest_dir)

    print("Staging datasets into local data/raw directory...")
    # Get absolute path relative to project root
    here = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(here, '..', '..'))
    raw_root = os.path.join(project_root, 'data', 'raw')
    tm_raw_dir = os.path.join(raw_root, "transfermarkt")
    fb_raw_dir = os.path.join(raw_root, "fbref")

    for target_dir in (tm_raw_dir, fb_raw_dir):
        os.makedirs(target_dir, exist_ok=True)

    _copy_csvs(tm_path, tm_raw_dir)
    # The FBref download is optional, so its path may be None or missing.
    if fb_path and os.path.exists(fb_path):
        _copy_csvs(fb_path, fb_raw_dir)

    print(f"Data staged successfully into {raw_root}.")

    return load_data(tm_path, fb_path)

if __name__ == "__main__":
    # Script entry point: download, stage and load everything, then report
    # how many rows each dataset contributed.
    players, apps, valuations, fbref = get_base_datasets()

    summary_lines = [
        "\n--- Data Loading Summary ---",
        f"Loaded {len(players)} Transfermarkt players.",
        f"Loaded {len(apps)} match appearances.",
    ]
    # The valuations and FBref frames are optional and may be None.
    if valuations is not None:
        summary_lines.append(f"Loaded {len(valuations)} transfer valuations.")
    if fbref is not None:
        summary_lines.append(
            f"Loaded {len(fbref)} FBref player records with advanced metrics like xG and xA."
        )

    for line in summary_lines:
        print(line)