Spaces:

liovina
/

nl-sql

Running

App Files Files Community

nl-sql / scripts /download_data.py

liovina

Deploy NL_SQL HEAD to HF Space

942050b verified 22 days ago

raw

history blame contribute delete

4.9 kB

	"""Download datasets that back target databases.

	Run examples (idempotent — already-present files are skipped):
	uv run python scripts/download_data.py chinook
	uv run python scripts/download_data.py bird-mini-dev
	uv run python scripts/download_data.py all

	Outputs land under data/ which is gitignored. Each downloader records a
	SHA-256 next to the file so eval reports can pin dataset checksums.
	"""

	from __future__ import annotations

	import argparse
	import hashlib
	import shutil
	import sys
	import zipfile
	from collections.abc import Callable
	from pathlib import Path
	from typing import Final

	import requests

	# Root directory that every downloader writes beneath. It is a relative path,
	# so it resolves against the current working directory of the process.
	DATA_ROOT: Final = Path("data")

	# Upstream URL of the Chinook SQLite database file (fetched by download_chinook).
	CHINOOK_URL: Final = (
	"https://github.com/lerocha/chinook-database/raw/master/"
	"ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"
	)
	# Local file name the Chinook database is stored under; note it differs from
	# the upstream file name at the end of CHINOOK_URL.
	CHINOOK_FILENAME: Final = "Chinook.sqlite"

	# BIRD Mini-Dev: the canonical bundle (questions + 11 SQLite DBs) lives on Google
	# Drive. The official Aliyun mirror is firewalled in some regions; the HuggingFace
	# `birdsql/bird_mini_dev` repo only carries the questions JSON, not the SQLite
	# databases, so we cannot use snapshot_download here. gdown handles GD's confirm
	# token redirect for >100 MB files.
	# Google Drive file id; download_bird_mini_dev interpolates it into the gdown URL.
	BIRD_MINI_DEV_GDRIVE_ID: Final = "13VLWIwpw5E3d5DUkMvzw7hvHE67a4XkG"
	# File name the downloaded archive is saved under inside the dataset directory.
	BIRD_MINI_DEV_ARCHIVE: Final = "minidev.zip"
	# Leading path component removed from every archive member during extraction;
	# members that do not start with it are ignored.
	BIRD_MINI_DEV_INNER_PREFIX: Final = "minidev/" # zip wraps everything one level deep


	def _download_file(url: str, dest: Path, *, chunk_size: int = 1 << 15) -> Path:
	dest.parent.mkdir(parents=True, exist_ok=True)
	if dest.exists():
	print(f"[skip] {dest} already present ({dest.stat().st_size:,} bytes)")
	return dest
	print(f"[download] {url} → {dest}")
	with requests.get(url, stream=True, timeout=120) as response:
	response.raise_for_status()
	with dest.open("wb") as fh:
	for chunk in response.iter_content(chunk_size=chunk_size):
	if chunk:
	fh.write(chunk)
	print(f"[done] {dest} ({dest.stat().st_size:,} bytes)")
	return dest


	def _sha256(path: Path) -> str:
	digest = hashlib.sha256()
	with path.open("rb") as fh:
	for chunk in iter(lambda: fh.read(1 << 20), b""):
	digest.update(chunk)
	return digest.hexdigest()


	def _write_checksum(path: Path) -> None:
	    """Write a ``.sha256`` sidecar file next to ``path``.

	    The sidecar keeps the original file name and appends ``.sha256`` to it
	    (``Chinook.sqlite`` becomes ``Chinook.sqlite.sha256``). It holds a single
	    UTF-8 line made of the hex digest, a space and the file's base name.
	    """
	    digest = _sha256(path)
	    sidecar = path.with_suffix(f"{path.suffix}.sha256")
	    sidecar.write_text(f"{digest} {path.name}\n", encoding="utf-8")
	    print(f"[checksum] {sidecar}")


	def download_chinook() -> None:
	    """Fetch the Chinook SQLite database and record its checksum.

	    The file is stored as ``DATA_ROOT / "chinook" / CHINOOK_FILENAME``. The
	    checksum sidecar is rewritten on every call, including when the download
	    itself is skipped because the file is already present.
	    """
	    sqlite_path = _download_file(CHINOOK_URL, DATA_ROOT / "chinook" / CHINOOK_FILENAME)
	    _write_checksum(sqlite_path)


	def download_bird_mini_dev() -> None:
	    """Download and unpack the BIRD Mini-Dev bundle under ``DATA_ROOT / "bird_mini_dev"``.

	    Steps:
	      1. Return early when ``MINIDEV/dev_databases`` already exists.
	      2. Fetch the archive from Google Drive with ``gdown`` unless it is
	         already on disk.
	      3. Write the archive's SHA-256 sidecar file.
	      4. Extract every member below ``BIRD_MINI_DEV_INNER_PREFIX`` with that
	         prefix stripped, skipping macOS metadata entries.

	    Raises:
	        RuntimeError: if ``gdown`` finishes without producing the archive.
	        ValueError: if an archive member would be written outside the
	            dataset directory (path traversal / "zip slip").
	    """
	    target_dir = DATA_ROOT / "bird_mini_dev"
	    target_dir.mkdir(parents=True, exist_ok=True)
	    minidev_dir = target_dir / "MINIDEV"
	    # dev_databases being a directory implies MINIDEV is one too.
	    if (minidev_dir / "dev_databases").is_dir():
	        print(f"[skip] {minidev_dir} already populated")
	        return

	    archive = target_dir / BIRD_MINI_DEV_ARCHIVE
	    if not archive.exists():
	        # Imported lazily so the other downloaders work without gdown installed.
	        import gdown

	        url = f"https://drive.google.com/uc?id={BIRD_MINI_DEV_GDRIVE_ID}"
	        print(f"[gdown] {url} → {archive}")
	        gdown.download(url, str(archive), quiet=False)
	        # Fail here with a clear message instead of a FileNotFoundError from
	        # the checksum step when the download produced nothing.
	        if not archive.is_file():
	            raise RuntimeError(f"gdown did not produce {archive}; download failed")
	    else:
	        print(f"[skip] {archive} already downloaded ({archive.stat().st_size:,} bytes)")
	    _write_checksum(archive)

	    print(
	        f"[unzip] {archive} → {target_dir} (stripping '{BIRD_MINI_DEV_INNER_PREFIX}', skipping __MACOSX)"
	    )
	    extract_root = target_dir.resolve()
	    with zipfile.ZipFile(archive) as zf:
	        for member in zf.infolist():
	            name = member.filename
	            # Finder resource forks and .DS_Store files are not part of the dataset.
	            if name.startswith("__MACOSX/") or "/._" in name or name.endswith("/.DS_Store"):
	                continue
	            if not name.startswith(BIRD_MINI_DEV_INNER_PREFIX):
	                continue
	            stripped = name[len(BIRD_MINI_DEV_INNER_PREFIX) :]
	            if not stripped:  # the wrapper directory entry itself
	                continue
	            dest = target_dir / stripped
	            # Members are written by hand, so zipfile's own name sanitising does
	            # not apply: reject ".." or absolute names that escape target_dir.
	            resolved = dest.resolve()
	            if resolved != extract_root and extract_root not in resolved.parents:
	                raise ValueError(f"unsafe member path in {archive}: {name!r}")
	            if member.is_dir():
	                dest.mkdir(parents=True, exist_ok=True)
	                continue
	            dest.parent.mkdir(parents=True, exist_ok=True)
	            with zf.open(member) as src, dest.open("wb") as fh:
	                shutil.copyfileobj(src, fh)
	    print(f"[done] {minidev_dir}")


	# Registry mapping each dataset name accepted on the command line to its
	# zero-argument downloader. main() derives its argparse choices from these
	# keys and runs every entry, in insertion order, when "all" is requested.
	DOWNLOADERS: Final[dict[str, Callable[[], None]]] = {
	"chinook": download_chinook,
	"bird-mini-dev": download_bird_mini_dev,
	}


	def main() -> int:
	    """Parse the command line and run each requested downloader in turn.

	    One or more dataset names are required. If ``all`` appears anywhere among
	    them, every registered downloader runs once and the other names are
	    ignored; otherwise the named downloaders run in the order given.

	    Returns:
	        ``0`` after all selected downloaders have returned.
	    """
	    cli = argparse.ArgumentParser(description=__doc__)
	    cli.add_argument(
	        "datasets",
	        nargs="+",
	        choices=list(DOWNLOADERS) + ["all"],
	        help="Which dataset(s) to download.",
	    )
	    requested = cli.parse_args().datasets
	    if "all" in requested:
	        requested = list(DOWNLOADERS)
	    for dataset in requested:
	        DOWNLOADERS[dataset]()
	    return 0


	# Script entry point: hand main()'s return value to the interpreter as the
	# process exit status.
	if __name__ == "__main__":
	    exit_status = main()
	    sys.exit(exit_status)