Spaces:
Sleeping
Sleeping
| # std lib | |
| import os | |
| from pathlib import Path | |
| # 3rd party imports | |
| import pandas as pd | |
| # local imports | |
| from huggingface_hub import snapshot_download | |
# Local cache directory for the GAIA dataset, created next to this source file.
DATA_DIR = Path(__file__).resolve().parent / "data"
def get_full_gaia_level1_data():
    """
    Download the full GAIA level 1 dataset snapshot into ``DATA_DIR``.

    The Hugging Face token is read from ``HF_FINAL_ASSIGNMENT_DRAFT``, falling
    back to ``HF_TOKEN``. The token is only forwarded when set, so the call
    still works if ``snapshot_download`` does not require one.
    """
    # pathlib equivalent of os.makedirs(DATA_DIR, exist_ok=True); DATA_DIR is a Path.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Fallback to common HF token names; token can be optional for public datasets.
    token = os.getenv("HF_FINAL_ASSIGNMENT_DRAFT") or os.getenv("HF_TOKEN")
    kwargs = {
        "repo_id": "gaia-benchmark/GAIA",
        "repo_type": "dataset",
        "local_dir": DATA_DIR,
    }
    if token:
        kwargs["token"] = token
    snapshot_download(**kwargs)
def get_file_from_gaia_level1_data(task_id: str):
    """
    Given a GAIA level 1 task ID, return the path (as a string) of the first
    file whose name contains the task ID, or ``None`` if no such file exists.

    Files for the HF validation split live under ``DATA_DIR/2023/validation/``.
    """
    validation_dir = DATA_DIR / "2023" / "validation"
    # rglob yields nothing for a missing directory, matching os.walk's
    # behaviour on a nonexistent root, so no existence check is needed.
    for path in validation_dir.rglob("*"):
        if path.is_file() and task_id in path.name:
            return str(path)
    return None
def get_question(task_id: str) -> str:
    """
    Given a GAIA level 1 task ID, return the corresponding question text.

    Returns an empty string (after printing a diagnostic) when the metadata
    file is missing or the task ID is not present in it.
    """
    metadata_file = DATA_DIR / "2023" / "validation" / "metadata.level1.parquet"
    if not metadata_file.exists():
        print(f"Metadata file not found: {metadata_file}")
        return ""
    metadata_df = pd.read_parquet(metadata_file)
    matches = metadata_df.loc[metadata_df["task_id"] == task_id, "Question"]
    if matches.empty:
        # Previously `.values[0]` raised IndexError on an unknown task ID;
        # degrade gracefully like the missing-file branch instead.
        print(f"Task ID not found in metadata: {task_id}")
        return ""
    return matches.iloc[0]
def ensure_validation_data(base_dir: Path):
    """
    Ensure GAIA 2023 level1 validation files are present.

    Compares the files on disk against the names listed in the metadata
    parquet and triggers a full re-download when anything is missing.

    Returns (ok: bool, error_message: str | None).
    """
    validation_dir = base_dir / "data" / "2023" / "validation"
    metadata_file = validation_dir / "metadata.level1.parquet"

    # Work out whether a download is required; None means everything is present.
    reason = None
    if not validation_dir.exists() or not any(validation_dir.iterdir()):
        reason = f"Validation data not found in {validation_dir}"
    elif not metadata_file.exists():
        reason = (
            f"Metadata file not found: {metadata_file}. Cannot verify expected files"
        )
    else:
        try:
            # Only the file_name column is needed to verify completeness.
            names = pd.read_parquet(metadata_file, columns=["file_name"])["file_name"]
            expected = {str(name) for name in names.dropna().unique()}
            on_disk = {entry.name for entry in validation_dir.iterdir() if entry.is_file()}
            missing = expected - on_disk
            if missing:
                reason = f"Missing {len(missing)} expected validation files"
        except Exception as e:
            reason = (
                f"Error reading metadata ({metadata_file}): {e}. "
                "Cannot verify expected files"
            )

    # Everything accounted for — nothing to download.
    if reason is None:
        print("All expected validation files are present. Skipping data download.")
        return True, None

    print(f"{reason}. Downloading full GAIA level 1 data...")
    try:
        get_full_gaia_level1_data()
    except Exception as e:
        error_message = f"Error downloading GAIA level 1 data: {e}"
        print(error_message)
        return False, error_message
    print("Data download completed.")
    return True, None
if __name__ == "__main__":
    # Ad-hoc manual check: resolve the local file for a sample GAIA task ID.
    # Uncomment the calls below to exercise the other helpers.
    # get_full_gaia_level1_data()
    print(get_file_from_gaia_level1_data("cca530fc-4052-43b2-b130-b30968d8aa44"))
    # print(get_question("cca530fc-4052-43b2-b130-b30968d8aa44"))