DouDou commited on
Commit
28e980a
·
verified ·
1 Parent(s): 7793dac

Upload upload_to_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. upload_to_hf.py +462 -0
upload_to_hf.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Upload three datasets to Hugging Face Hub.
4
+
5
+ Datasets:
6
+ DATA1: Domain-Specific Code Dataset (115GB, 178 CSV files)
7
+ DATA2: Code-Documentation Alignment Dataset (2.9GB, 1 JSONL file)
8
+ DATA3: Programming Problems Generation Dataset (496MB, 1 JSONL file)
9
+
10
+ Usage:
11
+ # First, login to Hugging Face:
12
+ huggingface-cli login
13
+
14
+ # Upload all three datasets:
15
+ python upload_to_hf.py --hf_user YOUR_USERNAME
16
+
17
+ # Upload a specific dataset:
18
+ python upload_to_hf.py --hf_user YOUR_USERNAME --dataset data1
19
+ python upload_to_hf.py --hf_user YOUR_USERNAME --dataset data2
20
+ python upload_to_hf.py --hf_user YOUR_USERNAME --dataset data3
21
+
22
+ # Use a Hugging Face organization instead of user:
23
+ python upload_to_hf.py --hf_user YOUR_ORG --dataset all
24
+
25
+ # Custom repo names:
26
+ python upload_to_hf.py --hf_user YOUR_USERNAME \\
27
+ --repo_name_data1 my-code-dataset \\
28
+ --repo_name_data2 my-alignment-dataset \\
29
+ --repo_name_data3 my-problems-dataset
30
+
31
+ # Dry run (only create repos and README, no file upload):
32
+ python upload_to_hf.py --hf_user YOUR_USERNAME --dry_run
33
+ """
34
+
35
+ import os
36
+ import argparse
37
+ import logging
38
+ import time
39
+ from pathlib import Path
40
+ from huggingface_hub import HfApi, create_repo
41
+ from huggingface_hub.utils import HfHubHTTPError
42
+
43
# Module-wide logging: timestamped, INFO-level messages for upload progress.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
49
+
50
# ============================================================
# Dataset paths
# ============================================================
# Local source locations for the three datasets; edit these to point at
# your own copies before running the script.
DATA1_DIR = Path("/home/weifengsun/tangou1/domain_code/src/workdir/dataset_csv")
DATA2_FILE = Path("/home/weifengsun/tangou1/step2/step22/output/alignment.jsonl")
DATA3_FILE = Path("/home/weifengsun/tangou1/domain_code/src/datasets/instruct_data/programming_problems.jsonl")

# README files
# Each README body is combined with the matching *_CARD YAML header below
# by build_readme() before being uploaded as README.md.
DATA1_README = Path("/home/weifengsun/tangou1/DATA1_README.md")
DATA2_README = Path("/home/weifengsun/tangou1/DATA2_README.md")
DATA3_README = Path("/home/weifengsun/tangou1/DATA3_README.md")
61
+
62
+
63
# ============================================================
# Dataset Card templates (prepended to README content)
# ============================================================
# YAML front-matter recognized by the Hugging Face Hub dataset-card parser.
# Each template ends with the closing "---" and a blank line so the README
# body can be appended directly after it.

# Card for DATA1: Domain-Specific Code Dataset (CSV files).
DATA1_CARD = """---
license: apache-2.0
task_categories:
- text-generation
language:
- code
tags:
- code
- scientific-computing
- domain-specific
- chemistry
- biology
- physics
size_categories:
- 1M<n<10M
---

"""

# Card for DATA2: Code-Documentation Alignment Dataset (JSONL).
DATA2_CARD = """---
license: apache-2.0
task_categories:
- text-generation
- text2text-generation
language:
- code
tags:
- code
- documentation
- docstring-generation
- code-documentation-alignment
- scientific-computing
size_categories:
- 100K<n<1M
---

"""

# Card for DATA3: Programming Problems Generation Dataset (JSONL).
DATA3_CARD = """---
license: apache-2.0
task_categories:
- text-generation
- question-answering
language:
- code
- en
tags:
- code
- programming-problems
- scientific-computing
- problem-generation
size_categories:
- 10K<n<100K
---

"""
123
+
124
+
125
def build_readme(card_header: str, readme_path: Path) -> str:
    """Return the YAML dataset-card header followed by the README body.

    A missing README file is not an error: the card header alone is
    returned in that case.
    """
    try:
        body = readme_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        body = ""
    return card_header + body
131
+
132
+
133
+ # ============================================================
134
+ # Upload functions
135
+ # ============================================================
136
+
137
def upload_file_with_retry(
    api: HfApi,
    file_path: Path,
    path_in_repo: str,
    repo_id: str,
    max_retries: int = 5,
    base_delay: float = 2.0,
    max_delay: float = 300.0,
    check_existing: bool = True,
):
    """
    Upload a file with retry logic and rate limiting.

    Retries on HTTP 429 (honouring a numeric Retry-After header when
    present) and on 5xx server errors using exponential backoff; any
    other HTTP error is re-raised immediately.

    Args:
        api: HfApi instance
        file_path: Local file path
        path_in_repo: Path in repository
        repo_id: Repository ID
        max_retries: Maximum number of retries
        base_delay: Base delay in seconds for exponential backoff
        max_delay: Maximum delay in seconds
        check_existing: Check if file already exists before uploading

    Returns:
        True if the file was uploaded or already present; False if all
        retries were exhausted on rate-limit/server errors.
    """
    # Check if file already exists
    if check_existing:
        try:
            repo_info = api.repo_info(repo_id, repo_type="dataset", files_metadata=True)
            existing_files = {f.rfilename for f in repo_info.siblings if hasattr(f, 'rfilename')}
            if path_in_repo in existing_files:
                logger.info(f" File {path_in_repo} already exists, skipping.")
                return True
        except Exception as e:
            # Best-effort check only: if it fails we simply upload anyway.
            logger.debug(f"Could not check existing files: {e}")

    for attempt in range(1, max_retries + 1):
        try:
            api.upload_file(
                path_or_fileobj=str(file_path),
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
            )
            return True
        except HfHubHTTPError as e:
            status_code = getattr(e, 'status_code', None) or (
                e.response.status_code if hasattr(e, 'response') and e.response else None
            )
            if status_code == 429:  # Too Many Requests
                wait_time = None
                # Honour the Retry-After header when it is a plain number of
                # seconds. RFC 9110 also allows an HTTP-date here, which
                # float() cannot parse - fall back to exponential backoff
                # instead of crashing in that case.
                if hasattr(e, 'response') and e.response:
                    retry_after = e.response.headers.get("Retry-After")
                    if retry_after:
                        try:
                            wait_time = min(float(retry_after), max_delay)
                        except ValueError:
                            wait_time = None
                if wait_time is not None:
                    logger.warning(
                        f" Rate limited (429). Waiting {wait_time:.1f}s (Retry-After header)..."
                    )
                else:
                    # Exponential backoff: base * 2^(attempt-1), capped at max_delay
                    wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay)
                    logger.warning(
                        f" Rate limited (429). Waiting {wait_time:.1f}s (attempt {attempt}/{max_retries})..."
                    )
                # Don't sleep after the final attempt - we are about to give up.
                if attempt < max_retries:
                    time.sleep(wait_time)
                continue
            elif status_code and status_code >= 500:  # Server errors
                wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay)
                logger.warning(
                    f" Server error ({status_code}). Waiting {wait_time:.1f}s (attempt {attempt}/{max_retries})..."
                )
                if attempt < max_retries:
                    time.sleep(wait_time)
                continue
            else:
                # Other HTTP errors (4xx except 429) - don't retry
                logger.error(f" HTTP error {status_code}: {e}")
                raise
        except Exception as e:
            if attempt == max_retries:
                logger.error(f" Failed after {max_retries} attempts: {e}")
                raise
            wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay)
            logger.warning(
                f" Error: {e}. Waiting {wait_time:.1f}s (attempt {attempt}/{max_retries})..."
            )
            time.sleep(wait_time)

    # All retries exhausted on 429/5xx responses.
    return False
222
+
223
+
224
def upload_data1(
    api: HfApi,
    repo_id: str,
    dry_run: bool = False,
    delay_between_files: float = 3.0,
    private: bool = False,
):
    """Upload DATA1: Domain-Specific Code Dataset (178 CSV files, ~115GB).

    Creates (or reuses) the dataset repo, uploads README.md built from the
    DATA1 card + README file, then uploads every CSV in DATA1_DIR one at a
    time through the retry/rate-limit helper.

    Args:
        api: HfApi instance.
        repo_id: Target dataset repository ID (e.g. "user/name").
        dry_run: Only create the repo and upload the README.
        delay_between_files: Seconds to pause between file uploads.
        private: Create the repository as private. Default False preserves
            the previous always-public behaviour.
    """
    logger.info(f"{'[DRY RUN] ' if dry_run else ''}Uploading DATA1 to {repo_id}")

    # Create repo (exist_ok=True: a no-op if it already exists).
    create_repo(repo_id, repo_type="dataset", exist_ok=True, private=private)
    logger.info(f"Repository {repo_id} created/verified.")

    # Upload README; non-fatal so a README hiccup doesn't block the data.
    readme = build_readme(DATA1_CARD, DATA1_README)
    try:
        api.upload_file(
            path_or_fileobj=readme.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("README.md uploaded.")
    except Exception as e:
        logger.warning(f"README upload failed (may already exist): {e}")

    if dry_run:
        logger.info("[DRY RUN] Skipping file uploads.")
        return

    # Upload CSV files one by one (some files are very large).
    csv_files = sorted(DATA1_DIR.glob("*.csv"))
    total = len(csv_files)
    logger.info(f"Found {total} CSV files to upload.")
    logger.info(f"Using {delay_between_files}s delay between files to avoid rate limiting.")

    successful = 0
    failed = 0

    for idx, csv_file in enumerate(csv_files, 1):
        size_mb = csv_file.stat().st_size / (1024 * 1024)
        logger.info(f"[{idx}/{total}] Uploading {csv_file.name} ({size_mb:.1f} MB)...")

        try:
            success = upload_file_with_retry(
                api=api,
                file_path=csv_file,
                path_in_repo=f"data/{csv_file.name}",
                repo_id=repo_id,
                max_retries=5,
                base_delay=5.0,  # Start with 5s delay for 429 errors
                max_delay=300.0,  # Max 5 minutes wait
                check_existing=True,
            )

            if success:
                successful += 1
                logger.info(f"[{idx}/{total}] ✓ {csv_file.name} uploaded. ({successful} successful, {failed} failed)")
            else:
                failed += 1
                logger.error(f"[{idx}/{total}] ✗ {csv_file.name} failed after retries.")
        except Exception as e:
            # Keep going: one bad file should not abort the remaining ~177.
            failed += 1
            logger.error(f"[{idx}/{total}] ✗ Failed to upload {csv_file.name}: {e}")

        # Add delay between files to avoid rate limiting (except for last file).
        if idx < total:
            logger.debug(f"Waiting {delay_between_files}s before next file...")
            time.sleep(delay_between_files)

    logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
290
+
291
+
292
def upload_data2(api: HfApi, repo_id: str, dry_run: bool = False, private: bool = False):
    """Upload DATA2: Code-Documentation Alignment Dataset (~2.9GB).

    Args:
        api: HfApi instance.
        repo_id: Target dataset repository ID.
        dry_run: Only create the repo and upload the README.
        private: Create the repository as private. Default False preserves
            the previous always-public behaviour.
    """
    logger.info(f"{'[DRY RUN] ' if dry_run else ''}Uploading DATA2 to {repo_id}")

    # Create repo (exist_ok=True: a no-op if it already exists).
    create_repo(repo_id, repo_type="dataset", exist_ok=True, private=private)
    logger.info(f"Repository {repo_id} created/verified.")

    # Upload README; non-fatal on failure, consistent with upload_data1.
    readme = build_readme(DATA2_CARD, DATA2_README)
    try:
        api.upload_file(
            path_or_fileobj=readme.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("README.md uploaded.")
    except Exception as e:
        logger.warning(f"README upload failed (may already exist): {e}")

    if dry_run:
        logger.info("[DRY RUN] Skipping file uploads.")
        return

    size_mb = DATA2_FILE.stat().st_size / (1024 * 1024)
    logger.info(f"Uploading {DATA2_FILE.name} ({size_mb:.1f} MB)...")
    # Route through the shared retry helper so a transient 429/5xx does not
    # abort the upload of a multi-GB file.
    ok = upload_file_with_retry(
        api=api,
        file_path=DATA2_FILE,
        path_in_repo=f"data/{DATA2_FILE.name}",
        repo_id=repo_id,
    )
    if ok:
        logger.info(f"✓ {DATA2_FILE.name} uploaded.")
    else:
        logger.error(f"✗ {DATA2_FILE.name} failed after retries.")
323
+
324
+
325
def upload_data3(api: HfApi, repo_id: str, dry_run: bool = False, private: bool = False):
    """Upload DATA3: Programming Problems Generation Dataset (~496MB).

    Args:
        api: HfApi instance.
        repo_id: Target dataset repository ID.
        dry_run: Only create the repo and upload the README.
        private: Create the repository as private. Default False preserves
            the previous always-public behaviour.
    """
    logger.info(f"{'[DRY RUN] ' if dry_run else ''}Uploading DATA3 to {repo_id}")

    # Create repo (exist_ok=True: a no-op if it already exists).
    create_repo(repo_id, repo_type="dataset", exist_ok=True, private=private)
    logger.info(f"Repository {repo_id} created/verified.")

    # Upload README; non-fatal on failure, consistent with upload_data1.
    readme = build_readme(DATA3_CARD, DATA3_README)
    try:
        api.upload_file(
            path_or_fileobj=readme.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("README.md uploaded.")
    except Exception as e:
        logger.warning(f"README upload failed (may already exist): {e}")

    if dry_run:
        logger.info("[DRY RUN] Skipping file uploads.")
        return

    size_mb = DATA3_FILE.stat().st_size / (1024 * 1024)
    logger.info(f"Uploading {DATA3_FILE.name} ({size_mb:.1f} MB)...")
    # Route through the shared retry helper so a transient 429/5xx does not
    # abort the upload.
    ok = upload_file_with_retry(
        api=api,
        file_path=DATA3_FILE,
        path_in_repo=f"data/{DATA3_FILE.name}",
        repo_id=repo_id,
    )
    if ok:
        logger.info(f"✓ {DATA3_FILE.name} uploaded.")
    else:
        logger.error(f"✗ {DATA3_FILE.name} failed after retries.")
356
+
357
+
358
+ # ============================================================
359
+ # Main
360
+ # ============================================================
361
+
362
def main():
    """Parse CLI arguments and upload the selected dataset(s) to the Hub."""
    parser = argparse.ArgumentParser(
        description="Upload datasets to Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--hf_user", type=str, required=True,
        help="Hugging Face username or organization name",
    )
    parser.add_argument(
        "--dataset", type=str, default="all", choices=["all", "data1", "data2", "data3"],
        help="Which dataset to upload (default: all)",
    )
    parser.add_argument(
        "--repo_name_data1", type=str, default="SciCode-Domain-Code",
        help="Repository name for DATA1 (default: SciCode-Domain-Code)",
    )
    parser.add_argument(
        "--repo_name_data2", type=str, default="SciCode-Doc-Alignment",
        help="Repository name for DATA2 (default: SciCode-Doc-Alignment)",
    )
    parser.add_argument(
        "--repo_name_data3", type=str, default="SciCode-Programming-Problems",
        help="Repository name for DATA3 (default: SciCode-Programming-Problems)",
    )
    parser.add_argument(
        "--dry_run", action="store_true",
        help="Only create repos and upload READMEs, skip data files",
    )
    parser.add_argument(
        "--private", action="store_true",
        help="Create private repositories (default: public)",
    )
    parser.add_argument(
        "--delay", type=float, default=3.0,
        help="Delay in seconds between file uploads (default: 3.0, increase if getting 429 errors)",
    )
    args = parser.parse_args()

    # Verify data paths exist before doing anything remote.
    checks = {
        "data1": DATA1_DIR,
        "data2": DATA2_FILE,
        "data3": DATA3_FILE,
    }
    for name, path in checks.items():
        if args.dataset in ("all", name) and not path.exists():
            logger.error(f"Data path not found: {path}")
            return

    api = HfApi()

    # Check authentication early so we fail before any long upload starts.
    try:
        user_info = api.whoami()
        logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")
    except Exception:
        logger.error(
            "Not logged in to Hugging Face. Please run:\n"
            " huggingface-cli login\n"
            "or set the HF_TOKEN environment variable."
        )
        return

    repo_ids = {
        "data1": f"{args.hf_user}/{args.repo_name_data1}",
        "data2": f"{args.hf_user}/{args.repo_name_data2}",
        "data3": f"{args.hf_user}/{args.repo_name_data3}",
    }

    upload_fns = {
        "data1": upload_data1,
        "data2": upload_data2,
        "data3": upload_data3,
    }

    targets = ["data1", "data2", "data3"] if args.dataset == "all" else [args.dataset]

    logger.info("=" * 60)
    logger.info("Upload Plan:")
    for t in targets:
        logger.info(f" {t.upper()} -> {repo_ids[t]}")
    logger.info("=" * 60)

    failed_targets = []
    for t in targets:
        try:
            # Create the repo up front so --private is actually honoured
            # (it was previously parsed but never used). The upload
            # functions call create_repo(exist_ok=True) themselves, which
            # does not change the privacy of an already-existing repo.
            create_repo(repo_ids[t], repo_type="dataset", exist_ok=True, private=args.private)
            if t == "data1":
                upload_fns[t](api, repo_ids[t], dry_run=args.dry_run, delay_between_files=args.delay)
            else:
                upload_fns[t](api, repo_ids[t], dry_run=args.dry_run)
            logger.info(f"✓ {t.upper()} upload completed.\n")
        except Exception as e:
            failed_targets.append(t)
            logger.error(f"✗ {t.upper()} upload failed: {e}\n")

    # Summarize so failures aren't lost in the scroll of per-file logs.
    if failed_targets:
        logger.error(f"Finished with failures: {', '.join(failed_targets)}")
    logger.info("All done!")
458
+
459
+
460
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
462
+