| | from pathlib import Path |
| | import os |
| |
|
def _load_snapshot_download():
    """Return ``huggingface_hub.snapshot_download``, installing the package if needed.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` step fails.
        ImportError: If the package still cannot be imported after installing.
    """
    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        import importlib
        import subprocess
        import sys

        print("huggingface_hub not installed. Installing...")
        # Use the running interpreter's pip so the package lands in the same
        # environment that will import it, and fail loudly if the install fails.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "huggingface_hub"]
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        from huggingface_hub import snapshot_download
    return snapshot_download


def download_data():
    """Download the project dataset from the HuggingFace Hub into ``data/``.

    The repository id is read from the ``HF_DATA_REPO`` environment variable
    and falls back to ``"hungnha/do_an_tot_nghiep"`` when it is unset or empty.
    If ``data/`` already exists and contains at least one entry, nothing is
    downloaded (and ``huggingface_hub`` is neither imported nor installed).

    Returns:
        None

    Raises:
        Exception: Whatever ``snapshot_download`` raises is re-raised after
            troubleshooting tips have been printed.
    """
    data_path = Path("data")

    # Check for existing data first so that a no-op run has no side effects
    # (in particular, no package installation). ``is_dir`` also avoids a
    # NotADirectoryError from ``iterdir`` when ``data`` is a regular file.
    if data_path.is_dir() and any(data_path.iterdir()):
        print("Data folder already exists. Skipping download.")
        print("To re-download, delete the 'data/' folder first.")
        return

    snapshot_download = _load_snapshot_download()

    # ``or`` (rather than a getenv default) also covers HF_DATA_REPO="".
    HF_REPO_ID = os.getenv("HF_DATA_REPO") or "hungnha/do_an_tot_nghiep"

    print(f"Downloading data from HuggingFace: {HF_REPO_ID}")
    print("This may take a few minutes...")

    try:
        snapshot_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            local_dir=str(data_path),
            # NOTE(review): newer huggingface_hub releases appear to deprecate
            # this argument; kept for older versions -- confirm before removing.
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        print(f"Error downloading data: {e}")
        print("\nTips:")
        print(" 1. Make sure the HF_DATA_REPO environment variable is set correctly")
        print(" 2. Or update HF_REPO_ID in this script")
        print(" 3. If repo is private, run: huggingface-cli login")
        raise
    else:
        print("Download complete!")
        print(f"Data saved to: {data_path.absolute()}")
| |
|
if __name__ == "__main__":
    # Run the download only when this file is executed as a script,
    # not when it is imported as a module.
    download_data()
| |
|