DouDou committed
Commit b736b8f · verified · 1 Parent(s): 28e980a

Remove upload_to_hf.py

Files changed (1)
  1. upload_to_hf.py +0 -462
upload_to_hf.py DELETED
@@ -1,462 +0,0 @@
- #!/usr/bin/env python3
- """
- Upload three datasets to Hugging Face Hub.
-
- Datasets:
-     DATA1: Domain-Specific Code Dataset (115GB, 178 CSV files)
-     DATA2: Code-Documentation Alignment Dataset (2.9GB, 1 JSONL file)
-     DATA3: Programming Problems Generation Dataset (496MB, 1 JSONL file)
-
- Usage:
-     # First, log in to Hugging Face:
-     huggingface-cli login
-
-     # Upload all three datasets:
-     python upload_to_hf.py --hf_user YOUR_USERNAME
-
-     # Upload a specific dataset:
-     python upload_to_hf.py --hf_user YOUR_USERNAME --dataset data1
-     python upload_to_hf.py --hf_user YOUR_USERNAME --dataset data2
-     python upload_to_hf.py --hf_user YOUR_USERNAME --dataset data3
-
-     # Use a Hugging Face organization instead of a user:
-     python upload_to_hf.py --hf_user YOUR_ORG --dataset all
-
-     # Custom repo names:
-     python upload_to_hf.py --hf_user YOUR_USERNAME \\
-         --repo_name_data1 my-code-dataset \\
-         --repo_name_data2 my-alignment-dataset \\
-         --repo_name_data3 my-problems-dataset
-
-     # Dry run (only create repos and README, no file upload):
-     python upload_to_hf.py --hf_user YOUR_USERNAME --dry_run
- """
-
- import os
- import argparse
- import logging
- import time
- from pathlib import Path
- from huggingface_hub import HfApi, create_repo
- from huggingface_hub.utils import HfHubHTTPError
-
- logging.basicConfig(
-     level=logging.INFO,
-     format="%(asctime)s [%(levelname)s] %(message)s",
-     datefmt="%Y-%m-%d %H:%M:%S",
- )
- logger = logging.getLogger(__name__)
-
- # ============================================================
- # Dataset paths
- # ============================================================
- DATA1_DIR = Path("/home/weifengsun/tangou1/domain_code/src/workdir/dataset_csv")
- DATA2_FILE = Path("/home/weifengsun/tangou1/step2/step22/output/alignment.jsonl")
- DATA3_FILE = Path("/home/weifengsun/tangou1/domain_code/src/datasets/instruct_data/programming_problems.jsonl")
-
- # README files
- DATA1_README = Path("/home/weifengsun/tangou1/DATA1_README.md")
- DATA2_README = Path("/home/weifengsun/tangou1/DATA2_README.md")
- DATA3_README = Path("/home/weifengsun/tangou1/DATA3_README.md")
-
-
- # ============================================================
- # Dataset Card templates (prepended to README content)
- # ============================================================
-
- DATA1_CARD = """---
- license: apache-2.0
- task_categories:
- - text-generation
- language:
- - code
- tags:
- - code
- - scientific-computing
- - domain-specific
- - chemistry
- - biology
- - physics
- size_categories:
- - 1M<n<10M
- ---
-
- """
-
- DATA2_CARD = """---
- license: apache-2.0
- task_categories:
- - text-generation
- - text2text-generation
- language:
- - code
- tags:
- - code
- - documentation
- - docstring-generation
- - code-documentation-alignment
- - scientific-computing
- size_categories:
- - 100K<n<1M
- ---
-
- """
-
- DATA3_CARD = """---
- license: apache-2.0
- task_categories:
- - text-generation
- - question-answering
- language:
- - code
- - en
- tags:
- - code
- - programming-problems
- - scientific-computing
- - problem-generation
- size_categories:
- - 10K<n<100K
- ---
-
- """
-
-
- def build_readme(card_header: str, readme_path: Path) -> str:
-     """Combine YAML front-matter with existing README content."""
-     readme_content = ""
-     if readme_path.exists():
-         readme_content = readme_path.read_text(encoding="utf-8")
-     return card_header + readme_content
-
-
- # ============================================================
- # Upload functions
- # ============================================================
-
- def upload_file_with_retry(
-     api: HfApi,
-     file_path: Path,
-     path_in_repo: str,
-     repo_id: str,
-     max_retries: int = 5,
-     base_delay: float = 2.0,
-     max_delay: float = 300.0,
-     check_existing: bool = True,
- ):
-     """
-     Upload a file with retry logic and rate limiting.
-
-     Args:
-         api: HfApi instance
-         file_path: Local file path
-         path_in_repo: Path in repository
-         repo_id: Repository ID
-         max_retries: Maximum number of retries
-         base_delay: Base delay in seconds for exponential backoff
-         max_delay: Maximum delay in seconds
-         check_existing: Check if file already exists before uploading
-     """
-     # Check if file already exists
-     if check_existing:
-         try:
-             repo_info = api.repo_info(repo_id, repo_type="dataset", files_metadata=True)
-             existing_files = {f.rfilename for f in repo_info.siblings if hasattr(f, 'rfilename')}
-             if path_in_repo in existing_files:
-                 logger.info(f" File {path_in_repo} already exists, skipping.")
-                 return True
-         except Exception as e:
-             logger.debug(f"Could not check existing files: {e}")
-
-     for attempt in range(1, max_retries + 1):
-         try:
-             api.upload_file(
-                 path_or_fileobj=str(file_path),
-                 path_in_repo=path_in_repo,
-                 repo_id=repo_id,
-                 repo_type="dataset",
-             )
-             return True
-         except HfHubHTTPError as e:
-             status_code = getattr(e, 'status_code', None) or (e.response.status_code if hasattr(e, 'response') and e.response else None)
-             if status_code == 429:  # Too Many Requests
-                 # Extract retry-after header if available
-                 retry_after = None
-                 if hasattr(e, 'response') and e.response:
-                     retry_after = e.response.headers.get("Retry-After")
-                 if retry_after:
-                     wait_time = min(float(retry_after), max_delay)
-                     logger.warning(
-                         f" Rate limited (429). Waiting {wait_time:.1f}s (Retry-After header)..."
-                     )
-                 else:
-                     # Exponential backoff: base_delay * 2^(attempt - 1) seconds, capped at max_delay
-                     wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay)
-                     logger.warning(
-                         f" Rate limited (429). Waiting {wait_time:.1f}s (attempt {attempt}/{max_retries})..."
-                     )
-                 time.sleep(wait_time)
-                 continue
-             elif status_code and status_code >= 500:  # Server errors
-                 wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay)
-                 logger.warning(
-                     f" Server error ({status_code}). Waiting {wait_time:.1f}s (attempt {attempt}/{max_retries})..."
-                 )
-                 time.sleep(wait_time)
-                 continue
-             else:
-                 # Other HTTP errors (4xx except 429) - don't retry
-                 logger.error(f" HTTP error {status_code}: {e}")
-                 raise
-         except Exception as e:
-             if attempt == max_retries:
-                 logger.error(f" Failed after {max_retries} attempts: {e}")
-                 raise
-             wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay)
-             logger.warning(
-                 f" Error: {e}. Waiting {wait_time:.1f}s (attempt {attempt}/{max_retries})..."
-             )
-             time.sleep(wait_time)
-
-     return False
-
-
- def upload_data1(api: HfApi, repo_id: str, dry_run: bool = False, delay_between_files: float = 3.0, private: bool = False):
-     """Upload DATA1: Domain-Specific Code Dataset (178 CSV files, ~115GB)."""
-     logger.info(f"{'[DRY RUN] ' if dry_run else ''}Uploading DATA1 to {repo_id}")
-
-     # Create repo
-     create_repo(repo_id, repo_type="dataset", exist_ok=True, private=private)
-     logger.info(f"Repository {repo_id} created/verified.")
-
-     # Upload README
-     readme = build_readme(DATA1_CARD, DATA1_README)
-     try:
-         api.upload_file(
-             path_or_fileobj=readme.encode("utf-8"),
-             path_in_repo="README.md",
-             repo_id=repo_id,
-             repo_type="dataset",
-         )
-         logger.info("README.md uploaded.")
-     except Exception as e:
-         logger.warning(f"README upload failed (may already exist): {e}")
-
-     if dry_run:
-         logger.info("[DRY RUN] Skipping file uploads.")
-         return
-
-     # Upload CSV files one by one (some files are very large)
-     csv_files = sorted(DATA1_DIR.glob("*.csv"))
-     total = len(csv_files)
-     logger.info(f"Found {total} CSV files to upload.")
-     logger.info(f"Using {delay_between_files}s delay between files to avoid rate limiting.")
-
-     successful = 0
-     failed = 0
-
-     for idx, csv_file in enumerate(csv_files, 1):
-         size_mb = csv_file.stat().st_size / (1024 * 1024)
-         logger.info(f"[{idx}/{total}] Uploading {csv_file.name} ({size_mb:.1f} MB)...")
-
-         try:
-             success = upload_file_with_retry(
-                 api=api,
-                 file_path=csv_file,
-                 path_in_repo=f"data/{csv_file.name}",
-                 repo_id=repo_id,
-                 max_retries=5,
-                 base_delay=5.0,  # Start with 5s delay for 429 errors
-                 max_delay=300.0,  # Max 5 minutes wait
-                 check_existing=True,
-             )
-
-             if success:
-                 successful += 1
-                 logger.info(f"[{idx}/{total}] ✓ {csv_file.name} uploaded. ({successful} successful, {failed} failed)")
-             else:
-                 failed += 1
-                 logger.error(f"[{idx}/{total}] ✗ {csv_file.name} failed after retries.")
-         except Exception as e:
-             failed += 1
-             logger.error(f"[{idx}/{total}] ✗ Failed to upload {csv_file.name}: {e}")
-
-         # Add delay between files to avoid rate limiting (except for last file)
-         if idx < total:
-             logger.debug(f"Waiting {delay_between_files}s before next file...")
-             time.sleep(delay_between_files)
-
-     logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
-
-
- def upload_data2(api: HfApi, repo_id: str, dry_run: bool = False, private: bool = False):
-     """Upload DATA2: Code-Documentation Alignment Dataset (~2.9GB)."""
-     logger.info(f"{'[DRY RUN] ' if dry_run else ''}Uploading DATA2 to {repo_id}")
-
-     # Create repo
-     create_repo(repo_id, repo_type="dataset", exist_ok=True, private=private)
-     logger.info(f"Repository {repo_id} created/verified.")
-
-     # Upload README
-     readme = build_readme(DATA2_CARD, DATA2_README)
-     api.upload_file(
-         path_or_fileobj=readme.encode("utf-8"),
-         path_in_repo="README.md",
-         repo_id=repo_id,
-         repo_type="dataset",
-     )
-     logger.info("README.md uploaded.")
-
-     if dry_run:
-         logger.info("[DRY RUN] Skipping file uploads.")
-         return
-
-     size_mb = DATA2_FILE.stat().st_size / (1024 * 1024)
-     logger.info(f"Uploading {DATA2_FILE.name} ({size_mb:.1f} MB)...")
-     api.upload_file(
-         path_or_fileobj=str(DATA2_FILE),
-         path_in_repo=f"data/{DATA2_FILE.name}",
-         repo_id=repo_id,
-         repo_type="dataset",
-     )
-     logger.info(f"✓ {DATA2_FILE.name} uploaded.")
-
-
- def upload_data3(api: HfApi, repo_id: str, dry_run: bool = False, private: bool = False):
-     """Upload DATA3: Programming Problems Generation Dataset (~496MB)."""
-     logger.info(f"{'[DRY RUN] ' if dry_run else ''}Uploading DATA3 to {repo_id}")
-
-     # Create repo
-     create_repo(repo_id, repo_type="dataset", exist_ok=True, private=private)
-     logger.info(f"Repository {repo_id} created/verified.")
-
-     # Upload README
-     readme = build_readme(DATA3_CARD, DATA3_README)
-     api.upload_file(
-         path_or_fileobj=readme.encode("utf-8"),
-         path_in_repo="README.md",
-         repo_id=repo_id,
-         repo_type="dataset",
-     )
-     logger.info("README.md uploaded.")
-
-     if dry_run:
-         logger.info("[DRY RUN] Skipping file uploads.")
-         return
-
-     size_mb = DATA3_FILE.stat().st_size / (1024 * 1024)
-     logger.info(f"Uploading {DATA3_FILE.name} ({size_mb:.1f} MB)...")
-     api.upload_file(
-         path_or_fileobj=str(DATA3_FILE),
-         path_in_repo=f"data/{DATA3_FILE.name}",
-         repo_id=repo_id,
-         repo_type="dataset",
-     )
-     logger.info(f"✓ {DATA3_FILE.name} uploaded.")
-
-
- # ============================================================
- # Main
- # ============================================================
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Upload datasets to Hugging Face Hub",
-         formatter_class=argparse.RawDescriptionHelpFormatter,
-         epilog=__doc__,
-     )
-     parser.add_argument(
-         "--hf_user", type=str, required=True,
-         help="Hugging Face username or organization name",
-     )
-     parser.add_argument(
-         "--dataset", type=str, default="all", choices=["all", "data1", "data2", "data3"],
-         help="Which dataset to upload (default: all)",
-     )
-     parser.add_argument(
-         "--repo_name_data1", type=str, default="SciCode-Domain-Code",
-         help="Repository name for DATA1 (default: SciCode-Domain-Code)",
-     )
-     parser.add_argument(
-         "--repo_name_data2", type=str, default="SciCode-Doc-Alignment",
-         help="Repository name for DATA2 (default: SciCode-Doc-Alignment)",
-     )
-     parser.add_argument(
-         "--repo_name_data3", type=str, default="SciCode-Programming-Problems",
-         help="Repository name for DATA3 (default: SciCode-Programming-Problems)",
-     )
-     parser.add_argument(
-         "--dry_run", action="store_true",
-         help="Only create repos and upload READMEs, skip data files",
-     )
-     parser.add_argument(
-         "--private", action="store_true",
-         help="Create private repositories (default: public)",
-     )
-     parser.add_argument(
-         "--delay", type=float, default=3.0,
-         help="Delay in seconds between file uploads (default: 3.0; increase if you hit 429 errors)",
-     )
-     args = parser.parse_args()
-
-     # Verify data paths exist
-     checks = {
-         "data1": DATA1_DIR,
-         "data2": DATA2_FILE,
-         "data3": DATA3_FILE,
-     }
-     for name, path in checks.items():
-         if args.dataset in ("all", name) and not path.exists():
-             logger.error(f"Data path not found: {path}")
-             return
-
-     api = HfApi()
-
-     # Check authentication
-     try:
-         user_info = api.whoami()
-         logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")
-     except Exception:
-         logger.error(
-             "Not logged in to Hugging Face. Please run:\n"
-             " huggingface-cli login\n"
-             "or set the HF_TOKEN environment variable."
-         )
-         return
-
-     repo_ids = {
-         "data1": f"{args.hf_user}/{args.repo_name_data1}",
-         "data2": f"{args.hf_user}/{args.repo_name_data2}",
-         "data3": f"{args.hf_user}/{args.repo_name_data3}",
-     }
-
-     upload_fns = {
-         "data1": upload_data1,
-         "data2": upload_data2,
-         "data3": upload_data3,
-     }
-
-     targets = ["data1", "data2", "data3"] if args.dataset == "all" else [args.dataset]
-
-     logger.info("=" * 60)
-     logger.info("Upload Plan:")
-     for t in targets:
-         logger.info(f" {t.upper()} -> {repo_ids[t]}")
-     logger.info("=" * 60)
-
-     for t in targets:
-         try:
-             if t == "data1":
-                 upload_fns[t](api, repo_ids[t], dry_run=args.dry_run, private=args.private, delay_between_files=args.delay)
-             else:
-                 upload_fns[t](api, repo_ids[t], dry_run=args.dry_run, private=args.private)
-             logger.info(f"✓ {t.upper()} upload completed.\n")
-         except Exception as e:
-             logger.error(f"✗ {t.upper()} upload failed: {e}\n")
-
-     logger.info("All done!")
-
-
- if __name__ == "__main__":
-     main()
-
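
Note: this commit only removes the uploader script; any dataset repositories it already created on the Hub are unaffected. For reference, a minimal loading sketch with the "datasets" library. The repo names below assume the script's default names and a placeholder YOUR_USERNAME, so adjust both to the actual account:

    # Hypothetical sketch; repo names assume the removed script's defaults.
    from datasets import load_dataset

    # DATA1: CSV shards stored under data/ in the dataset repo
    data1 = load_dataset("YOUR_USERNAME/SciCode-Domain-Code", split="train")

    # DATA2 and DATA3: single JSONL files stored under data/
    data2 = load_dataset("YOUR_USERNAME/SciCode-Doc-Alignment", split="train")
    data3 = load_dataset("YOUR_USERNAME/SciCode-Programming-Problems", split="train")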