nl-sql / scripts /download_data.py
liovina's picture
Deploy NL_SQL HEAD to HF Space
942050b verified
"""Download datasets that back target databases.
Run examples (idempotent — already-present files are skipped):
uv run python scripts/download_data.py chinook
uv run python scripts/download_data.py bird-mini-dev
uv run python scripts/download_data.py all
Outputs land under data/ which is gitignored. Each downloader records a
SHA-256 next to the file so eval reports can pin dataset checksums.
"""
from __future__ import annotations
import argparse
import hashlib
import shutil
import sys
import zipfile
from collections.abc import Callable
from pathlib import Path
from typing import Final
import requests
DATA_ROOT: Final = Path("data")
CHINOOK_URL: Final = (
"https://github.com/lerocha/chinook-database/raw/master/"
"ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"
)
CHINOOK_FILENAME: Final = "Chinook.sqlite"
# BIRD Mini-Dev: the canonical bundle (questions + 11 SQLite DBs) lives on Google
# Drive. The official Aliyun mirror is firewalled in some regions; the HuggingFace
# `birdsql/bird_mini_dev` repo only carries the questions JSON, not the SQLite
# databases, so we cannot use snapshot_download here. gdown handles GD's confirm
# token redirect for >100 MB files.
BIRD_MINI_DEV_GDRIVE_ID: Final = "13VLWIwpw5E3d5DUkMvzw7hvHE67a4XkG"
BIRD_MINI_DEV_ARCHIVE: Final = "minidev.zip"
BIRD_MINI_DEV_INNER_PREFIX: Final = "minidev/" # zip wraps everything one level deep
def _download_file(url: str, dest: Path, *, chunk_size: int = 1 << 15) -> Path:
dest.parent.mkdir(parents=True, exist_ok=True)
if dest.exists():
print(f"[skip] {dest} already present ({dest.stat().st_size:,} bytes)")
return dest
print(f"[download] {url}{dest}")
with requests.get(url, stream=True, timeout=120) as response:
response.raise_for_status()
with dest.open("wb") as fh:
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
fh.write(chunk)
print(f"[done] {dest} ({dest.stat().st_size:,} bytes)")
return dest
def _sha256(path: Path) -> str:
digest = hashlib.sha256()
with path.open("rb") as fh:
for chunk in iter(lambda: fh.read(1 << 20), b""):
digest.update(chunk)
return digest.hexdigest()
def _write_checksum(path: Path) -> None:
sha = _sha256(path)
sums = path.with_suffix(path.suffix + ".sha256")
sums.write_text(f"{sha} {path.name}\n", encoding="utf-8")
print(f"[checksum] {sums}")
def download_chinook() -> None:
target_dir = DATA_ROOT / "chinook"
dest = target_dir / CHINOOK_FILENAME
_download_file(CHINOOK_URL, dest)
_write_checksum(dest)
def download_bird_mini_dev() -> None:
target_dir = DATA_ROOT / "bird_mini_dev"
target_dir.mkdir(parents=True, exist_ok=True)
minidev_dir = target_dir / "MINIDEV"
if minidev_dir.is_dir() and (minidev_dir / "dev_databases").is_dir():
print(f"[skip] {minidev_dir} already populated")
return
archive = target_dir / BIRD_MINI_DEV_ARCHIVE
if not archive.exists():
import gdown
url = f"https://drive.google.com/uc?id={BIRD_MINI_DEV_GDRIVE_ID}"
print(f"[gdown] {url}{archive}")
gdown.download(url, str(archive), quiet=False)
else:
print(f"[skip] {archive} already downloaded ({archive.stat().st_size:,} bytes)")
_write_checksum(archive)
print(
f"[unzip] {archive}{target_dir} (stripping '{BIRD_MINI_DEV_INNER_PREFIX}', skipping __MACOSX)"
)
with zipfile.ZipFile(archive) as zf:
for member in zf.infolist():
name = member.filename
if name.startswith("__MACOSX/") or "/._" in name or name.endswith("/.DS_Store"):
continue
if not name.startswith(BIRD_MINI_DEV_INNER_PREFIX):
continue
stripped = name[len(BIRD_MINI_DEV_INNER_PREFIX) :]
if not stripped:
continue
dest = target_dir / stripped
if member.is_dir():
dest.mkdir(parents=True, exist_ok=True)
continue
dest.parent.mkdir(parents=True, exist_ok=True)
with zf.open(member) as src, dest.open("wb") as fh:
shutil.copyfileobj(src, fh)
print(f"[done] {minidev_dir}")
DOWNLOADERS: Final[dict[str, Callable[[], None]]] = {
"chinook": download_chinook,
"bird-mini-dev": download_bird_mini_dev,
}
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"datasets",
nargs="+",
choices=[*DOWNLOADERS, "all"],
help="Which dataset(s) to download.",
)
args = parser.parse_args()
targets = list(DOWNLOADERS) if "all" in args.datasets else args.datasets
for name in targets:
DOWNLOADERS[name]()
return 0
if __name__ == "__main__":
sys.exit(main())