DouDou committed on
Commit
7793dac
·
verified ·
1 Parent(s): 4416f19

Upload upload_code_to_hf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. upload_code_to_hf.py +267 -0
upload_code_to_hf.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Upload dataset_builder code repository to Hugging Face Hub.
4
+
5
+ Usage:
6
+ # Upload to your personal account:
7
+ python upload_code_to_hf.py --hf_user YOUR_USERNAME
8
+
9
+ # Upload to an organization:
10
+ python upload_code_to_hf.py --hf_user YOUR_ORG
11
+
12
+ # Custom repository name:
13
+ python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_name my-dataset-builder
14
+
15
+ # Upload to a model repository (default):
16
+ python upload_code_to_hf.py --hf_user YOUR_USERNAME
17
+
18
+ # Upload to a space repository:
19
+ python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_type space
20
+ """
21
+
22
+ import os
23
+ import argparse
24
+ import logging
25
+ from pathlib import Path
26
+ from huggingface_hub import HfApi, create_repo
27
+ from huggingface_hub.utils import HfHubHTTPError
28
+ import time
29
+
30
# Root logging configuration, applied once at import time: records at INFO
# and above, each prefixed with a second-resolution timestamp and level name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
_LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
36
+
37
# Names and glob patterns that must never be uploaded.  Each entry is matched
# case-sensitively, with fnmatch syntax, against every component of a file's
# path relative to the upload root: "build" hides a whole build/ tree and
# "*.swp" hides editor swap files in any directory.
EXCLUDE_PATTERNS = {
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".git",
    ".gitignore",
    ".DS_Store",
    "*.log",
    "*.swp",
    "*.swo",
    "*~",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    "dist",
    "build",
    ".venv",
    "venv",
    "env",
    ".env",
    "node_modules",
    ".idea",
    ".vscode",
    ".cursor",
}

# File names that are uploaded even though an exclude pattern or the
# hidden-file rule would reject them.  The override applies to the file name
# only; it never rescues a file that lives inside an excluded directory.
ALWAYS_INCLUDE = {
    ".gitignore",
    "README.md",
    "requirements.txt",
    "setup.py",
    "pyproject.toml",
}


def _is_unwanted_component(component: str) -> bool:
    """Return True if one path component is hidden or matches an exclude pattern."""
    # Function-scope import so this block does not depend on the file's
    # top-of-file import list.
    from fnmatch import fnmatchcase

    if component.startswith("."):
        return True
    # fnmatchcase (not fnmatch) keeps matching case-sensitive on every platform.
    return any(fnmatchcase(component, pattern) for pattern in EXCLUDE_PATTERNS)


def should_exclude(file_path: Path, root: Path) -> bool:
    """Check if a file should be excluded from upload.

    Args:
        file_path: Path of the candidate file; must be located under ``root``.
        root: Directory being uploaded; patterns are applied to the path
            relative to it.

    Returns:
        True if the file must be skipped, False if it should be uploaded.

    Raises:
        ValueError: If ``file_path`` is not inside ``root`` (raised by
            ``Path.relative_to``).
    """
    rel_path = file_path.relative_to(root)

    # Directories are checked first: anything under an excluded or hidden
    # directory is dropped, even files whose name is in ALWAYS_INCLUDE
    # (e.g. node_modules/pkg/README.md or venv/.../setup.py).
    if any(_is_unwanted_component(part) for part in rel_path.parent.parts):
        return True

    # The allow-list overrides the name-based rules for the file itself.
    file_name = rel_path.name
    if file_name in ALWAYS_INCLUDE:
        return False

    # Glob patterns in EXCLUDE_PATTERNS already cover *.pyc, *.pyo, *.pyd and
    # *.log, so no separate suffix checks are needed.
    return _is_unwanted_component(file_name)
100
+
101
+
102
def get_files_to_upload(root: Path) -> list[Path]:
    """Get all files to upload, excluding patterns.

    Walks ``root`` top-down and prunes excluded directories before descending
    into them, so large ignored trees such as ``.git``, ``node_modules`` or a
    virtualenv are never traversed and none of their files can be selected.

    Args:
        root: Directory whose contents should be uploaded.

    Returns:
        Sorted list of paths (each prefixed with ``root``) that passed
        ``should_exclude``.
    """
    selected: list[Path] = []
    for dirpath, dirnames, filenames in os.walk(root):
        current_dir = Path(dirpath)
        # Assigning to dirnames[:] edits the list os.walk iterates over, which
        # is how a top-down walk is told to skip a subtree.
        dirnames[:] = [
            name for name in dirnames if not should_exclude(current_dir / name, root)
        ]
        for name in filenames:
            candidate = current_dir / name
            # is_file() filters out broken symlinks and special files.
            if candidate.is_file() and not should_exclude(candidate, root):
                selected.append(candidate)
    return sorted(selected)
109
+
110
+
111
def _http_status(error: Exception):
    """Return the HTTP status code carried by *error*, or None if it has none."""
    status = getattr(error, "status_code", None)
    if status is not None:
        return status
    response = getattr(error, "response", None)
    # Compare against None instead of using truthiness: a requests.Response
    # for a 4xx/5xx reply evaluates to False, which would hide the very
    # status (429) the retry logic needs to see.
    if response is None:
        return None
    return getattr(response, "status_code", None)


def _upload_with_retry(api, source, path_in_repo, repo_id, repo_type, max_attempts=3):
    """Upload one file, retrying transient errors; re-raise once attempts run out."""
    for attempt in range(1, max_attempts + 1):
        try:
            api.upload_file(
                path_or_fileobj=source,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            return
        except HfHubHTTPError as e:
            # Only rate limiting is retried for HTTP errors; anything else,
            # or a 429 on the final attempt, is reported to the caller.
            if _http_status(e) != 429 or attempt == max_attempts:
                raise
            # Exponential back-off: 5s, 10s, ... capped at 60s.
            wait_time = min(5.0 * (2 ** (attempt - 1)), 60.0)
            logger.warning(
                f" Rate limited (429). Waiting {wait_time:.1f}s (attempt {attempt}/{max_attempts})..."
            )
            time.sleep(wait_time)
        except Exception as e:
            if attempt == max_attempts:
                raise
            # Linear back-off for non-HTTP failures (e.g. connection errors).
            wait_time = 2.0 * attempt
            logger.warning(
                f" Error: {e}. Waiting {wait_time:.1f}s (attempt {attempt}/{max_attempts})..."
            )
            time.sleep(wait_time)


def upload_code_repo(
    api: HfApi,
    repo_id: str,
    code_dir: Path,
    repo_type: str = "model",
    delay_between_files: float = 1.0,
) -> None:
    """Upload code repository to Hugging Face Hub.

    Creates the repository if needed (public, ``exist_ok=True``) and uploads
    every file selected by ``get_files_to_upload`` one at a time. A failure on
    one file is logged and counted; it does not stop the remaining uploads.

    Args:
        api: Client used for both repository creation and file uploads.
        repo_id: Target repository in ``"namespace/name"`` form.
        code_dir: Local directory whose files are uploaded.
        repo_type: Hub repository type passed through to the API.
        delay_between_files: Seconds to sleep between consecutive files.

    Raises:
        Exception: Whatever ``api.create_repo`` or ``get_files_to_upload``
            raises; per-file upload errors are caught and logged instead.
    """
    logger.info(f"Uploading code from {code_dir} to {repo_id} (type: {repo_type})")

    # Use the client that was passed in so its token/endpoint settings apply
    # to repository creation as well as to the uploads.
    api.create_repo(repo_id, repo_type=repo_type, exist_ok=True, private=False)
    logger.info(f"Repository {repo_id} created/verified.")

    files = get_files_to_upload(code_dir)
    total = len(files)
    logger.info(f"Found {total} files to upload.")

    if not files:
        logger.warning("No files to upload!")
        return

    successful = 0
    failed = 0

    for idx, file_path in enumerate(files, 1):
        # Hub paths always use forward slashes, whatever the local OS uses.
        path_in_repo = file_path.relative_to(code_dir).as_posix()

        try:
            # stat() is inside the try so a file that disappeared after
            # listing is counted as one failure instead of aborting the run.
            size_kb = file_path.stat().st_size / 1024
            logger.info(f"[{idx}/{total}] Uploading {path_in_repo} ({size_kb:.1f} KB)...")
            # Hand over the path rather than the bytes so large files are not
            # loaded into memory by this script.
            _upload_with_retry(api, str(file_path), path_in_repo, repo_id, repo_type)
        except Exception as e:
            failed += 1
            logger.error(f"[{idx}/{total}] ✗ Failed to upload {path_in_repo}: {e}")
        else:
            successful += 1
            logger.info(f"[{idx}/{total}] ✓ {path_in_repo} uploaded.")

        # Throttle between files; no need to wait after the last one.
        if idx < total:
            time.sleep(delay_between_files)

    logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
191
+
192
+
193
def main() -> int:
    """Parse command-line arguments and upload the code directory.

    Returns:
        Process exit status: 0 when the upload routine finished without
        raising, 1 when the code directory is invalid, authentication fails,
        or the upload routine raises.
    """
    parser = argparse.ArgumentParser(
        description="Upload dataset_builder code repository to Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--hf_user", type=str, required=True,
        help="Hugging Face username or organization name",
    )
    parser.add_argument(
        "--repo_name", type=str, default="dataset-builder",
        help="Repository name (default: dataset-builder)",
    )
    parser.add_argument(
        "--repo_type", type=str, default="model", choices=["model", "space"],
        help="Repository type (default: model)",
    )
    parser.add_argument(
        "--code_dir", type=str, default=None,
        help="Code directory to upload (default: directory containing this script)",
    )
    parser.add_argument(
        "--delay", type=float, default=1.0,
        help="Delay in seconds between file uploads (default: 1.0)",
    )
    args = parser.parse_args()

    # Without --code_dir, upload the directory this script lives in.
    if args.code_dir:
        code_dir = Path(args.code_dir).resolve()
    else:
        code_dir = Path(__file__).parent.resolve()

    # is_dir() also rejects a path that exists but is a regular file.
    if not code_dir.is_dir():
        logger.error(f"Code directory not found: {code_dir}")
        return 1

    # Verify authentication before creating or touching any repository.
    api = HfApi()
    try:
        user_info = api.whoami()
    except Exception as e:
        # Include the underlying error so network problems are not mistaken
        # for a missing login.
        logger.error(
            "Not logged in to Hugging Face. Please run:\n"
            " huggingface-cli login\n"
            "or set the HF_TOKEN environment variable.\n"
            f"Underlying error: {e}"
        )
        return 1
    logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")

    repo_id = f"{args.hf_user}/{args.repo_name}"

    logger.info("=" * 60)
    logger.info("Upload Plan:")
    logger.info(f" Code directory: {code_dir}")
    logger.info(f" Repository: {repo_id} (type: {args.repo_type})")
    logger.info("=" * 60)

    try:
        upload_code_repo(
            api=api,
            repo_id=repo_id,
            code_dir=code_dir,
            repo_type=args.repo_type,
            delay_between_files=args.delay,
        )
    except Exception as e:
        logger.error(f"✗ Code repository upload failed: {e}")
        return 1

    logger.info("✓ Code repository upload completed!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and CI can detect a failed upload.
    raise SystemExit(main())
267
+