| | """Implements serializers for Opta data.""" |
| |
|
| | import copy |
| | import datetime |
| | import glob |
| | import os |
| | import re |
| | import warnings |
| | from collections.abc import Mapping |
| | from pathlib import Path |
| | from typing import Any, Optional, Union, cast |
| |
|
| | import pandas as pd |
| | from pandera.typing import DataFrame |
| |
|
| | from socceraction.data.base import EventDataLoader |
| |
|
| | from .parsers import ( |
| | F1JSONParser, |
| | F7XMLParser, |
| | F9JSONParser, |
| | F24JSONParser, |
| | F24XMLParser, |
| | MA1JSONParser, |
| | MA3JSONParser, |
| | OptaParser, |
| | WhoScoredParser, |
| | ) |
| | from .schema import ( |
| | OptaCompetitionSchema, |
| | OptaEventSchema, |
| | OptaGameSchema, |
| | OptaPlayerSchema, |
| | OptaTeamSchema, |
| | ) |
| |
|
# Built-in parser registries: each maps a feed name to the parser class that
# handles files of that feed.

# Feeds delivered as JSON.
_jsonparsers = {
    "f1": F1JSONParser,
    "f9": F9JSONParser,
    "f24": F24JSONParser,
    "ma1": MA1JSONParser,
    "ma3": MA3JSONParser,
}

# Feeds delivered as XML.
_xmlparsers = {"f7": F7XMLParser, "f24": F24XMLParser}

# The StatsPerform registry exposes only the MA1 and MA3 JSON parsers.
_statsperformparsers = {feed: _jsonparsers[feed] for feed in ("ma1", "ma3")}

# Single-feed registry for WhoScored files.
_whoscoredparsers = {"whoscored": WhoScoredParser}
| |
|
| | _eventtypesdf = pd.DataFrame( |
| | [ |
| | (1, "pass"), |
| | (2, "offside pass"), |
| | (3, "take on"), |
| | (4, "foul"), |
| | (5, "out"), |
| | (6, "corner awarded"), |
| | (7, "tackle"), |
| | (8, "interception"), |
| | (9, "turnover"), |
| | (10, "save"), |
| | (11, "claim"), |
| | (12, "clearance"), |
| | (13, "miss"), |
| | (14, "post"), |
| | (15, "attempt saved"), |
| | (16, "goal"), |
| | (17, "card"), |
| | (18, "player off"), |
| | (19, "player on"), |
| | (20, "player retired"), |
| | (21, "player returns"), |
| | (22, "player becomes goalkeeper"), |
| | (23, "goalkeeper becomes player"), |
| | (24, "condition change"), |
| | (25, "official change"), |
| | (26, "unknown26"), |
| | (27, "start delay"), |
| | (28, "end delay"), |
| | (29, "unknown29"), |
| | (30, "end"), |
| | (31, "unknown31"), |
| | (32, "start"), |
| | (33, "unknown33"), |
| | (34, "team set up"), |
| | (35, "player changed position"), |
| | (36, "player changed jersey number"), |
| | (37, "collection end"), |
| | (38, "temp_goal"), |
| | (39, "temp_attempt"), |
| | (40, "formation change"), |
| | (41, "punch"), |
| | (42, "good skill"), |
| | (43, "deleted event"), |
| | (44, "aerial"), |
| | (45, "challenge"), |
| | (46, "unknown46"), |
| | (47, "rescinded card"), |
| | (48, "unknown46"), |
| | (49, "ball recovery"), |
| | (50, "dispossessed"), |
| | (51, "error"), |
| | (52, "keeper pick-up"), |
| | (53, "cross not claimed"), |
| | (54, "smother"), |
| | (55, "offside provoked"), |
| | (56, "shield ball opp"), |
| | (57, "foul throw in"), |
| | (58, "penalty faced"), |
| | (59, "keeper sweeper"), |
| | (60, "chance missed"), |
| | (61, "ball touch"), |
| | (62, "unknown62"), |
| | (63, "temp_save"), |
| | (64, "resume"), |
| | (65, "contentious referee decision"), |
| | (66, "possession data"), |
| | (67, "50/50"), |
| | (68, "referee drop ball"), |
| | (69, "failed to block"), |
| | (70, "injury time announcement"), |
| | (71, "coach setup"), |
| | (72, "caught offside"), |
| | (73, "other ball contact"), |
| | (74, "blocked pass"), |
| | (75, "delayed start"), |
| | (76, "early end"), |
| | (77, "player off pitch"), |
| | (78, "temp card"), |
| | (79, "coverage interruption"), |
| | (80, "drop of ball"), |
| | (81, "obstacle"), |
| | (83, "attempted tackle"), |
| | (84, "deleted after review"), |
| | (10000, "offside given"), |
| | ], |
| | columns=["type_id", "type_name"], |
| | ) |
| |
|
| |
|
| | def _deepupdate(target: dict[Any, Any], src: dict[Any, Any]) -> None: |
| | """Deep update target dict with src. |
| | |
| | For each k,v in src: if k doesn't exist in target, it is deep copied from |
| | src to target. Otherwise, if v is a list, target[k] is extended with |
| | src[k]. If v is a set, target[k] is updated with v, If v is a dict, |
| | recursively deep-update it. |
| | |
| | Parameters |
| | ---------- |
| | target: dict |
| | The original dictionary which is updated. |
| | src: dict |
| | The dictionary with which `target` is updated. |
| | |
| | Examples |
| | -------- |
| | >>> t = {'name': 'ferry', 'hobbies': ['programming', 'sci-fi']} |
| | >>> deepupdate(t, {'hobbies': ['gaming']}) |
| | >>> print(t) |
| | {'name': 'ferry', 'hobbies': ['programming', 'sci-fi', 'gaming']} |
| | """ |
| | for k, v in src.items(): |
| | if isinstance(v, list): |
| | if k not in target: |
| | target[k] = copy.deepcopy(v) |
| | else: |
| | target[k].extend(v) |
| | elif isinstance(v, dict): |
| | if k not in target: |
| | target[k] = copy.deepcopy(v) |
| | else: |
| | _deepupdate(target[k], v) |
| | elif isinstance(v, set): |
| | if k not in target: |
| | target[k] = v.copy() |
| | else: |
| | target[k].update(v.copy()) |
| | else: |
| | target[k] = copy.copy(v) |
| |
|
| |
|
| | def _extract_ids_from_path(path: str, pattern: str) -> dict[str, Union[str, int]]: |
| | regex = re.compile( |
| | ".+?" |
| | + re.escape(pattern) |
| | .replace(r"\{competition_id\}", r"(?P<competition_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)") |
| | .replace(r"\{season_id\}", r"(?P<season_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)") |
| | .replace(r"\{game_id\}", r"(?P<game_id>[a-zA-Zà-üÀ-Ü0-9-_ ]+)") |
| | ) |
| | m = re.match(regex, path) |
| | if m is None: |
| | raise ValueError(f"The filepath {path} does not match the format {pattern}.") |
| | ids = m.groupdict() |
| | return {k: int(v) if v.isdigit() else v for k, v in ids.items()} |
| |
|
| |
|
class OptaLoader(EventDataLoader):
    """Load Opta data feeds from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    parser : str or dict
        Either 'xml', 'json', 'statsperform', 'whoscored' or a dict with
        a custom parser for each feed. The default xml parser supports F7 and
        F24 feeds; the default json parser supports F1, F9 and F24 feeds, the
        StatsPerform parser supports MA1 and MA3 feeds and the WhoScored
        parser supports the 'whoscored' feed. Custom parsers can be
        specified as::

            {
                'feed1_name': Feed1Parser,
                'feed2_name': Feed2Parser
            }

        where Feed1Parser and Feed2Parser are classes implementing
        :class:`~socceraction.spadl.opta.OptaParser` and 'feed1_name' and
        'feed2_name' are a unique ID for each feed that matches to the keys in
        `feeds`.
    feeds : dict
        Glob pattern describing from which files the data from a specific game
        can be retrieved. For example, if files are named::

            f7-1-2021-17362.xml
            f24-1-2021-17362.xml

        use::

            feeds = {
                'f7': "f7-{competition_id}-{season_id}-{game_id}.xml",
                'f24': "f24-{competition_id}-{season_id}-{game_id}.xml"
            }

        If omitted, the default file names of the selected built-in parser
        are used. Feeds for which no parser is available trigger a warning
        and are skipped when data is loaded.

    Raises
    ------
    ValueError
        If an invalid parser is provided, or if a custom parser mapping is
        given without `feeds`.
    """

    def __init__(
        self,
        root: str,
        parser: Union[str, Mapping[str, type[OptaParser]]] = "xml",
        feeds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root
        available_parsers: Mapping[str, type[OptaParser]]
        default_feeds: dict[str, str]
        if parser == "json":
            available_parsers = _jsonparsers
            default_feeds = {
                "f1": "f1-{competition_id}-{season_id}.json",
                "f9": "f9-{competition_id}-{season_id}-{game_id}.json",
                "f24": "f24-{competition_id}-{season_id}-{game_id}.json",
            }
        elif parser == "xml":
            available_parsers = _xmlparsers
            default_feeds = {
                "f7": "f7-{competition_id}-{season_id}-{game_id}.xml",
                "f24": "f24-{competition_id}-{season_id}-{game_id}.xml",
            }
        elif parser == "statsperform":
            available_parsers = _statsperformparsers
            default_feeds = {
                "ma1": "ma1-{competition_id}-{season_id}.json",
                "ma3": "ma3-{competition_id}-{season_id}-{game_id}.json",
            }
        elif parser == "whoscored":
            available_parsers = _whoscoredparsers
            default_feeds = {
                "whoscored": "{competition_id}-{season_id}-{game_id}.json",
            }
        elif isinstance(parser, Mapping):
            # Any mapping is accepted (not only dict), as the annotation says.
            if feeds is None:
                raise ValueError("You must specify a feed for each parser.")
            available_parsers = parser
            default_feeds = feeds
        else:
            raise ValueError("Invalid parser provided.")
        if feeds is None:
            feeds = default_feeds
        self.parsers = self._get_parsers_for_feeds(available_parsers, feeds)
        # Path() normalises the separators of the pattern for the current OS.
        self.feeds = {k: str(Path(v)) for k, v in feeds.items()}

    def _get_parsers_for_feeds(
        self, available_parsers: Mapping[str, type[OptaParser]], feeds: dict[str, str]
    ) -> Mapping[str, type[OptaParser]]:
        """Select the appropriate parser for each feed.

        Parameters
        ----------
        available_parsers : dict(str, OptaParser)
            Dictionary with all available parsers.
        feeds : dict(str, str)
            All feeds that should be parsed.

        Returns
        -------
        dict(str, OptaParser)
            A mapping between all feeds that should be parsed and the
            corresponding parser class.

        Warns
        -----
        Raises a warning if there is no parser available for any of the
        provided feeds.
        """
        parsers = {}
        for feed in feeds:
            if feed in available_parsers:
                parsers[feed] = available_parsers[feed]
            else:
                warnings.warn(f"No parser available for {feed} feeds. This feed is ignored.")
        return parsers

    def _matching_files(
        self, competition_id: Any = "*", season_id: Any = "*", game_id: Any = "*"
    ) -> list[tuple[type[OptaParser], str, dict[str, Union[str, int]]]]:
        """Find the feed files that match the given ids.

        Parameters
        ----------
        competition_id, season_id, game_id : str or int
            Value substituted for the corresponding placeholder in each feed
            pattern. The default ``"*"`` matches any value.

        Returns
        -------
        list of (parser class, file path, ids)
            One entry per matching file, in feed order. ``ids`` holds the ids
            extracted from the file path. The parser is not instantiated, so
            callers can decide to skip a file before it is parsed.
        """
        matches = []
        for feed, feed_pattern in self.feeds.items():
            parser_cls = self.parsers.get(feed)
            if parser_cls is None:
                # No parser is registered for this feed (a warning was issued
                # on construction), so its files are ignored instead of
                # failing with a KeyError.
                continue
            glob_pattern = feed_pattern.format(
                competition_id=competition_id, season_id=season_id, game_id=game_id
            )
            for ffp in glob.glob(os.path.join(self.root, glob_pattern)):
                ids = _extract_ids_from_path(ffp, feed_pattern)
                matches.append((parser_cls, ffp, ids))
        return matches

    def competitions(self) -> DataFrame[OptaCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.opta.OptaCompetitionSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        loaded_seasons = set()
        for parser_cls, ffp, ids in self._matching_files():
            # Parse only one file per (competition, season) pair. This
            # shortcut is only possible when both ids are part of the file
            # name; otherwise every file is parsed.
            competition_id = ids.get("competition_id")
            season_id = ids.get("season_id")
            if competition_id is not None and season_id is not None:
                if (competition_id, season_id) in loaded_seasons:
                    continue
                loaded_seasons.add((competition_id, season_id))
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_competitions())
        return cast(DataFrame[OptaCompetitionSchema], pd.DataFrame(list(data.values())))

    def games(self, competition_id: int, season_id: int) -> DataFrame[OptaGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.opta.OptaGameSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(
            competition_id=competition_id, season_id=season_id
        ):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_games())
        return cast(DataFrame[OptaGameSchema], pd.DataFrame(list(data.values())))

    def teams(self, game_id: int) -> DataFrame[OptaTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.opta.OptaTeamSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(game_id=game_id):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_teams())
        return cast(DataFrame[OptaTeamSchema], pd.DataFrame(list(data.values())))

    def players(self, game_id: int) -> DataFrame[OptaPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.opta.OptaPlayerSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(game_id=game_id):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_players())
        df_players = pd.DataFrame(list(data.values()))
        # Tag every row with the requested game.
        df_players["game_id"] = game_id
        return cast(DataFrame[OptaPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[OptaEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.opta.OptaEventSchema` for the schema.
        """
        data: dict[int, dict[str, Any]] = {}
        for parser_cls, ffp, ids in self._matching_files(game_id=game_id):
            parser = parser_cls(ffp, **ids)
            _deepupdate(data, parser.extract_events())

        sort_keys = ["game_id", "period_id", "minute", "second", "timestamp"]
        events = (
            pd.DataFrame(list(data.values()))
            # Attach the readable type name of each event.
            .merge(_eventtypesdf, on="type_id", how="left")
            .sort_values(sort_keys, kind="mergesort")
            .reset_index(drop=True)
        )

        # Clamp negative seconds to 0, then sort again because the clamping
        # can change the relative order of events. The sort is stable, so
        # ties keep the order established by the first sort.
        events.loc[events.second < 0, "second"] = 0
        events = events.sort_values(sort_keys, kind="mergesort")

        # Drop deleted events (type 43) and events whose timestamp lies
        # outside the 1900-2100 range.
        events = events[events.type_id != 43]
        events = events[
            ~(
                (events.timestamp < datetime.datetime(1900, 1, 1))
                | (events.timestamp > datetime.datetime(2100, 1, 1))
            )
        ]

        return cast(DataFrame[OptaEventSchema], events)
| |
|