| """Download datasets that back target databases. |
| |
| Run examples (idempotent — already-present files are skipped): |
| uv run python scripts/download_data.py chinook |
| uv run python scripts/download_data.py bird-mini-dev |
| uv run python scripts/download_data.py all |
| |
| Outputs land under data/ which is gitignored. Each downloader records a |
| SHA-256 next to the file so eval reports can pin dataset checksums. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import hashlib |
| import shutil |
| import sys |
| import zipfile |
| from collections.abc import Callable |
| from pathlib import Path |
| from typing import Final |
|
|
| import requests |
|
|
# Root directory for every downloaded artifact (a relative path, so it is
# resolved against the current working directory).
DATA_ROOT: Final = Path("data")

# Chinook sample database: the remote SQLite file and the name it is stored
# under locally.
CHINOOK_URL: Final = (
    "https://github.com/lerocha/chinook-database"
    "/raw/master/ChinookDatabase/DataSources"
    "/Chinook_Sqlite.sqlite"
)
CHINOOK_FILENAME: Final = "Chinook.sqlite"

# BIRD mini-dev: the Google Drive file id to fetch, the local archive name, and
# the top-level folder inside the archive that is stripped during extraction.
BIRD_MINI_DEV_GDRIVE_ID: Final = "13VLWIwpw5E3d5DUkMvzw7hvHE67a4XkG"
BIRD_MINI_DEV_ARCHIVE: Final = "minidev.zip"
BIRD_MINI_DEV_INNER_PREFIX: Final = "minidev/"
|
|
|
|
def _download_file(url: str, dest: Path, *, chunk_size: int = 1 << 15) -> Path:
    """Stream ``url`` to ``dest`` unless ``dest`` already exists.

    The response body is written to a sibling ``<name>.part`` file and only
    renamed onto ``dest`` after the whole body has been received. An existing
    ``dest`` is treated as "already downloaded", so writing into it directly
    would let an interrupted transfer leave a truncated file that every later
    run silently skips.

    Args:
        url: Location to fetch with ``requests.get``.
        dest: Final path of the downloaded file; missing parent directories
            are created.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        ``dest``, whether it was freshly downloaded or already present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On other request failures (connection
            errors, timeouts).
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        print(f"[skip] {dest} already present ({dest.stat().st_size:,} bytes)")
        return dest

    partial = dest.with_name(dest.name + ".part")
    print(f"[download] {url} → {dest}")
    try:
        with requests.get(url, stream=True, timeout=120) as response:
            response.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        fh.write(chunk)
        # Publish the file only once it is complete.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on any failure.
        partial.unlink(missing_ok=True)
    print(f"[done] {dest} ({dest.stat().st_size:,} bytes)")
    return dest
|
|
|
|
def _sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB blocks, so it never has to fit in memory at once.
    """
    block_size = 1 << 20
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while block := stream.read(block_size):
            hasher.update(block)
    return hasher.hexdigest()
|
|
|
|
def _write_checksum(path: Path) -> None:
    """Record the SHA-256 of ``path`` in a sibling ``<file name>.sha256`` file.

    The sidecar holds a single line, ``<hex digest> <file name>``, and is
    overwritten on every call.
    """
    digest = _sha256(path)
    sidecar = path.with_name(f"{path.name}.sha256")
    sidecar.write_text(f"{digest} {path.name}\n", encoding="utf-8")
    print(f"[checksum] {sidecar}")
|
|
|
|
def download_chinook() -> None:
    """Fetch the Chinook SQLite file into ``DATA_ROOT / "chinook"`` and checksum it."""
    sqlite_path = DATA_ROOT / "chinook" / CHINOOK_FILENAME
    # _download_file returns the destination path, so it feeds the checksum step directly.
    _write_checksum(_download_file(CHINOOK_URL, sqlite_path))
|
|
|
|
def _bird_relative_name(name: str) -> str:
    """Map a zip member name to its path below the target dir, or ``""`` to skip it."""
    # macOS archive metadata (resource forks, Finder state) is never extracted.
    if name.startswith("__MACOSX/") or "/._" in name or name.endswith("/.DS_Store"):
        return ""
    if not name.startswith(BIRD_MINI_DEV_INNER_PREFIX):
        return ""
    # The entry for the prefix folder itself strips down to "", i.e. it is skipped too.
    return name[len(BIRD_MINI_DEV_INNER_PREFIX) :]


def download_bird_mini_dev() -> None:
    """Download and unpack the BIRD mini-dev archive under ``DATA_ROOT / "bird_mini_dev"``.

    Nothing is done when ``MINIDEV/dev_databases`` already exists in the target
    directory. Otherwise the archive is fetched from Google Drive with gdown
    (unless a copy is already on disk), its checksum is written next to it,
    and every member under ``BIRD_MINI_DEV_INNER_PREFIX`` is extracted with
    that prefix removed. macOS metadata entries are dropped.

    Raises:
        FileNotFoundError: If gdown finishes without producing the archive.
        ValueError: If an archive member would be written outside the target
            directory (for example via ``..`` path components).
    """
    target_dir = DATA_ROOT / "bird_mini_dev"
    target_dir.mkdir(parents=True, exist_ok=True)
    minidev_dir = target_dir / "MINIDEV"
    if minidev_dir.is_dir() and (minidev_dir / "dev_databases").is_dir():
        print(f"[skip] {minidev_dir} already populated")
        return

    archive = target_dir / BIRD_MINI_DEV_ARCHIVE
    if not archive.exists():
        # Imported here rather than at module top, so gdown is only required
        # when this download actually runs.
        import gdown

        url = f"https://drive.google.com/uc?id={BIRD_MINI_DEV_GDRIVE_ID}"
        print(f"[gdown] {url} → {archive}")
        gdown.download(url, str(archive), quiet=False)
        # The return value is not relied upon: check the file directly so a
        # failed download is reported here instead of as an opaque error from
        # the checksum step below.
        if not archive.exists():
            raise FileNotFoundError(f"download of {url} did not produce {archive}")
    else:
        print(f"[skip] {archive} already downloaded ({archive.stat().st_size:,} bytes)")
    _write_checksum(archive)

    print(
        f"[unzip] {archive} → {target_dir} "
        f"(stripping '{BIRD_MINI_DEV_INNER_PREFIX}', skipping __MACOSX)"
    )
    extract_root = target_dir.resolve()
    with zipfile.ZipFile(archive) as zf:
        for member in zf.infolist():
            relative = _bird_relative_name(member.filename)
            if not relative:
                continue
            dest = target_dir / relative
            # Member names come from the archive; refuse any that resolve
            # outside the extraction directory.
            resolved = dest.resolve()
            if resolved != extract_root and extract_root not in resolved.parents:
                raise ValueError(
                    f"refusing to extract {member.filename!r} from {archive}: "
                    f"path escapes {target_dir}"
                )
            if member.is_dir():
                dest.mkdir(parents=True, exist_ok=True)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            with zf.open(member) as src, dest.open("wb") as fh:
                shutil.copyfileobj(src, fh)
    print(f"[done] {minidev_dir}")
|
|
|
|
# Maps each dataset name accepted on the command line to its downloader.
# The "all" target runs the entries in this insertion order.
DOWNLOADERS: Final[dict[str, Callable[[], None]]] = {
    "chinook": download_chinook,
    "bird-mini-dev": download_bird_mini_dev,
}
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the requested downloaders.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        ``0`` once every requested downloader has returned. Downloader
        failures propagate as exceptions, and invalid arguments end the
        process through argparse's own error exit.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The default formatter re-wraps the description; keep the module
        # docstring's line breaks so the "Run examples" stay readable in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "datasets",
        nargs="+",
        choices=[*DOWNLOADERS, "all"],
        help="Which dataset(s) to download.",
    )
    args = parser.parse_args(argv)
    if "all" in args.datasets:
        targets = list(DOWNLOADERS)
    else:
        # Drop repeated names but keep first-seen order, so no downloader runs twice.
        targets = list(dict.fromkeys(args.datasets))
    for name in targets:
        DOWNLOADERS[name]()
    return 0
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|