DouDou committed on
Commit
f0b48c5
·
verified ·
1 Parent(s): debcb41

Upload data1/scripts/export_files_to_csv.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/scripts/export_files_to_csv.py +402 -0
data1/scripts/export_files_to_csv.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export repository files to CSV datasets grouped by keyword.
4
+
5
+ This script processes all files in repos_filtered directory, groups them by keyword
6
+ from repos_check_history.csv, and exports to separate CSV files for each keyword.
7
+ """
8
+
9
+ import os
10
+ import csv
11
+ import re
12
+ from pathlib import Path
13
+ from collections import defaultdict
14
+ from typing import Dict, List, Tuple, Optional
15
+ import pandas as pd
16
+ from tqdm import tqdm
17
+ import logging
18
+
19
+ # Setup logging
20
+ logging.basicConfig(
21
+ level=logging.INFO,
22
+ format='%(asctime)s - %(levelname)s - %(message)s',
23
+ handlers=[
24
+ logging.FileHandler('export_files_to_csv.log'),
25
+ logging.StreamHandler()
26
+ ]
27
+ )
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # Configuration
31
+ REPOS_FILTERED_DIR = Path("/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered")
32
+ REPOS_CHECK_HISTORY_CSV = Path("/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv")
33
+ OUTPUT_DIR = Path("/home/weifengsun/tangou1/domain_code/src/workdir/dataset_csv")
34
+ MAX_FILE_SIZE = None # No size limit - process all files
35
+
36
+ # Directories to skip
37
+ SKIP_DIRS = {'.git', 'node_modules', '__pycache__', '.pytest_cache', '.mypy_cache',
38
+ 'venv', 'env', '.venv', '.env', 'dist', 'build', '.eggs', '*.egg-info'}
39
+
40
+ # Binary file extensions to skip
41
+ BINARY_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.svg',
42
+ '.pdf', '.zip', '.tar', '.gz', '.bz2', '.xz', '.7z',
43
+ '.exe', '.dll', '.so', '.dylib', '.bin', '.o', '.a',
44
+ '.pyc', '.pyo', '.pyd', '.class', '.jar', '.war',
45
+ '.mp3', '.mp4', '.avi', '.mov', '.wav', '.flac',
46
+ '.db', '.sqlite', '.sqlite3', '.h5', '.hdf5', '.pkl', '.pickle'}
47
+
48
+ # Language mapping based on file extension
49
+ LANGUAGE_MAP = {
50
+ '.py': 'Python',
51
+ '.js': 'JavaScript',
52
+ '.ts': 'TypeScript',
53
+ '.java': 'Java',
54
+ '.cpp': 'C++',
55
+ '.c': 'C',
56
+ '.cs': 'C#',
57
+ '.go': 'Go',
58
+ '.rs': 'Rust',
59
+ '.rb': 'Ruby',
60
+ '.php': 'PHP',
61
+ '.swift': 'Swift',
62
+ '.kt': 'Kotlin',
63
+ '.scala': 'Scala',
64
+ '.r': 'R',
65
+ '.m': 'MATLAB',
66
+ '.jl': 'Julia',
67
+ '.sh': 'Shell',
68
+ '.bash': 'Bash',
69
+ '.zsh': 'Zsh',
70
+ '.sql': 'SQL',
71
+ '.html': 'HTML',
72
+ '.css': 'CSS',
73
+ '.xml': 'XML',
74
+ '.json': 'JSON',
75
+ '.yaml': 'YAML',
76
+ '.yml': 'YAML',
77
+ '.md': 'Markdown',
78
+ '.tex': 'LaTeX',
79
+ '.f90': 'Fortran',
80
+ '.f': 'Fortran',
81
+ '.f77': 'Fortran',
82
+ '.f95': 'Fortran',
83
+ '.cu': 'CUDA',
84
+ '.cl': 'OpenCL',
85
+ '.hs': 'Haskell',
86
+ '.ml': 'OCaml',
87
+ '.fs': 'F#',
88
+ '.vb': 'Visual Basic',
89
+ '.pl': 'Perl',
90
+ '.pm': 'Perl',
91
+ '.lua': 'Lua',
92
+ '.vim': 'Vim script',
93
+ '.cmake': 'CMake',
94
+ '.makefile': 'Makefile',
95
+ '.dockerfile': 'Dockerfile',
96
+ }
97
+
98
+
99
def sanitize_keyword(keyword: str) -> str:
    """Turn *keyword* into a filesystem-safe identifier.

    Special characters become underscores, runs of whitespace/hyphens are
    collapsed into single underscores, repeated underscores are merged,
    and leading/trailing underscores are trimmed.
    """
    # Ordered regex passes; each (pattern, replacement) pair is applied in turn.
    passes = (
        (r'[^\w\s-]', '_'),  # special characters -> underscore
        (r'[\s-]+', '_'),    # whitespace / hyphen runs -> underscore
        (r'_+', '_'),        # collapse repeated underscores
    )
    result = keyword
    for pattern, replacement in passes:
        result = re.sub(pattern, replacement, result)
    return result.strip('_')
110
+
111
+
112
def load_keyword_mapping() -> Dict[str, str]:
    """Load the repo full_name -> keyword mapping from repos_check_history.csv.

    Returns:
        Dict mapping ``full_name`` (``owner/repo``) to its keyword. When a
        full_name appears on multiple rows, the last row wins — same as the
        previous row-by-row implementation.

    Raises:
        Exception: re-raises any error from reading the CSV after logging it.
    """
    logger.info(f"Loading keyword mapping from {REPOS_CHECK_HISTORY_CSV}")

    mapping: Dict[str, str] = {}
    try:
        # Read CSV in chunks to handle the large file. Build the dict with a
        # vectorized zip over the two columns instead of iterrows(), which is
        # orders of magnitude slower on files this size. usecols keeps memory
        # down by loading only the columns we actually read.
        chunk_size = 100000
        for chunk in pd.read_csv(REPOS_CHECK_HISTORY_CSV, chunksize=chunk_size,
                                 usecols=['full_name', 'keyword']):
            mapping.update(zip(chunk['full_name'], chunk['keyword']))

        logger.info(f"Loaded {len(mapping)} keyword mappings")
        return mapping
    except Exception as e:
        logger.error(f"Error loading keyword mapping: {e}")
        raise
131
+
132
+
133
def is_binary_file(file_path: Path) -> bool:
    """Heuristically decide whether *file_path* holds binary data.

    A file counts as binary when its extension is a known binary extension,
    when any path component is a skipped/hidden directory, when its first
    512 bytes contain a NUL byte or are not valid UTF-8, or when it cannot
    be read at all.
    """
    # Fast path: known binary extension.
    if file_path.suffix.lower() in BINARY_EXTENSIONS:
        return True

    # Anything living under a skip directory or a dot-directory is treated
    # as binary (i.e. not exportable).
    if any(part in SKIP_DIRS or part.startswith('.') for part in file_path.parts):
        return True

    # Sniff the first 512 bytes; unreadable files are treated as binary.
    try:
        with open(file_path, 'rb') as handle:
            head = handle.read(512)
    except Exception:
        return True

    # NUL bytes are a strong binary indicator.
    if b'\x00' in head:
        return True

    # Content that is not valid UTF-8 is treated as binary.
    try:
        head.decode('utf-8')
    except UnicodeDecodeError:
        return True

    return False
160
+
161
+
162
def should_skip_file(file_path: Path) -> bool:
    """Return True when *file_path* should not be exported.

    Skips files under skip/hidden directories, README-style documentation
    files (``README*`` with .md/.markdown/.txt suffix), and anything that
    looks binary.
    """
    # Any skipped or hidden path component disqualifies the file
    # (a bare '.' component is allowed).
    in_hidden_or_skipped_dir = any(
        part in SKIP_DIRS or (part.startswith('.') and part != '.')
        for part in file_path.parts
    )
    if in_hidden_or_skipped_dir:
        return True

    # README / README_SUMMARY style documentation files.
    is_readme_doc = (
        file_path.name.lower().startswith('readme')
        and file_path.suffix.lower() in {'.md', '.markdown', '.txt'}
    )
    if is_readme_doc:
        return True

    # Finally, fall back to the binary-content heuristic.
    return is_binary_file(file_path)
181
+
182
+
183
def get_language(file_path: Path) -> str:
    """Map *file_path*'s extension to a language name, or 'Unknown'."""
    return LANGUAGE_MAP.get(file_path.suffix.lower(), 'Unknown')
187
+
188
+
189
def read_file_content(file_path: Path) -> Optional[str]:
    """Return the text of *file_path*, or None when it cannot be read/decoded.

    UTF-8 is attempted first. On a decode failure a few legacy encodings
    are tried in order (logging a warning on success); if none works the
    file is skipped. Any other I/O error is logged and yields None.
    """
    try:
        # Preferred path: the file is valid UTF-8.
        return file_path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        # Legacy fallbacks; note latin-1 decodes any byte sequence, so in
        # practice this loop succeeds on the first attempt.
        for encoding in ('latin-1', 'iso-8859-1', 'cp1252'):
            try:
                text = file_path.read_text(encoding=encoding)
            except (UnicodeDecodeError, LookupError):
                continue
            logger.warning(f"Read {file_path} with {encoding} encoding")
            return text

        logger.warning(f"Could not decode {file_path}, skipping")
        return None
    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return None
213
+
214
+
215
def process_file(file_path: Path, repo_name: str, keyword: str) -> Optional[Dict]:
    """Build the CSV row dict for one file, or None when it is skipped.

    The returned dict carries the keyword, the repo in ``owner/repo`` form,
    the path relative to the repo root, extension, size, line count, full
    content, and the detected language.
    """
    if should_skip_file(file_path):
        return None

    try:
        size_bytes = file_path.stat().st_size

        # Path relative to the repository root; bail out if the file
        # somehow lives outside the repo directory.
        try:
            rel_path = file_path.relative_to(REPOS_FILTERED_DIR / repo_name)
        except ValueError:
            return None

        text = read_file_content(file_path)
        if text is None:
            return None

        # Line count = newline count, plus one when the content is non-empty
        # (original counting scheme — preserved as-is).
        num_lines = text.count('\n') + (1 if text else 0)

        return {
            'keyword': keyword,
            'repo_name': repo_name.replace('___', '/'),  # Convert to full_name format
            'file_path': str(rel_path),
            'file_extension': file_path.suffix,
            'file_size': size_bytes,
            'line_count': num_lines,
            'content': text,
            'language': get_language(file_path),
        }
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return None
252
+
253
+
254
def process_repo(repo_name: str, keyword_mapping: Dict[str, str]) -> List[Dict]:
    """Collect row dicts for every exportable file in one repository.

    Repositories with no directory on disk, or with no keyword mapping
    (looked up by the ``owner/repo`` name derived from *repo_name*),
    yield an empty list.
    """
    repo_dir = REPOS_FILTERED_DIR / repo_name
    if not (repo_dir.exists() and repo_dir.is_dir()):
        return []

    # Directory names encode '/' as '___'; reverse that to look up the keyword.
    full_name = repo_name.replace('___', '/')
    keyword = keyword_mapping.get(full_name)
    if not keyword:
        logger.debug(f"No keyword found for {full_name}, skipping")
        return []

    rows: List[Dict] = []
    try:
        for root, dirs, files in os.walk(repo_dir):
            # Prune skipped/hidden directories in place so os.walk never
            # descends into them.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.startswith('.')]

            for name in files:
                row = process_file(Path(root) / name, repo_name, keyword)
                if row:
                    rows.append(row)
    except Exception as e:
        logger.error(f"Error walking {repo_dir}: {e}")

    return rows
286
+
287
+
288
class CSVWriterManager:
    """Manage one CSV output file per keyword.

    Lazily opens ``dataset_<sanitized keyword>.csv`` in *output_dir* the
    first time a row for that keyword is written, tracks per-keyword row
    counts, and supports use as a context manager so every handle is closed
    even when an exception escapes the writing loop.
    """

    def __init__(self, output_dir: Path):
        # Directory where dataset_<keyword>.csv files are created.
        self.output_dir = output_dir
        # keyword -> (file_handle, csv.DictWriter)
        self.writers = {}
        # keyword -> number of rows written so far
        self.file_counts = defaultdict(int)
        # Column order shared by every per-keyword CSV.
        self.fieldnames = ['keyword', 'repo_name', 'file_path', 'file_extension',
                           'file_size', 'line_count', 'content', 'language']

    def __enter__(self):
        """Support ``with CSVWriterManager(...) as manager:`` usage."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close all open CSV files on context exit; never suppress errors."""
        self.close_all()
        return False

    def get_writer(self, keyword: str):
        """Get or create a CSV writer for a keyword (opens the file lazily)."""
        if keyword not in self.writers:
            sanitized_keyword = sanitize_keyword(keyword)
            output_file = self.output_dir / f"dataset_{sanitized_keyword}.csv"

            # QUOTE_ALL so embedded newlines/commas in file content stay
            # inside a single CSV field.
            file_handle = open(output_file, 'w', newline='', encoding='utf-8')
            writer = csv.DictWriter(file_handle, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL)
            writer.writeheader()

            self.writers[keyword] = (file_handle, writer)

        return self.writers[keyword][1]

    def write_row(self, keyword: str, row: Dict):
        """Write a row to the appropriate CSV file and bump its count."""
        writer = self.get_writer(keyword)
        writer.writerow(row)
        self.file_counts[keyword] += 1

    def close_all(self):
        """Close all open file handles (file.close() is idempotent)."""
        for keyword, (file_handle, _) in self.writers.items():
            file_handle.close()
            logger.info(f"Closed dataset_{sanitize_keyword(keyword)}.csv with {self.file_counts[keyword]} files")

    def get_stats(self) -> Tuple[int, int]:
        """Return (total_keywords, total_files)."""
        return len(self.writers), sum(self.file_counts.values())
327
+
328
+
329
def main():
    """Main function with streaming write to avoid memory issues.

    Pipeline: load the full_name -> keyword mapping, scan the filtered-repo
    directories, then stream every exportable file straight into the
    per-keyword CSV via CSVWriterManager so file contents are never all
    held in memory at once. The try/finally guarantees CSV handles are
    closed even if processing aborts midway.
    """
    logger.info("Starting file export to CSV (streaming mode)")

    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    logger.info(f"Output directory: {OUTPUT_DIR}")

    # Load keyword mapping
    keyword_mapping = load_keyword_mapping()

    # Get all repository directories
    logger.info("Scanning repository directories...")
    repo_dirs = [d.name for d in REPOS_FILTERED_DIR.iterdir() if d.is_dir()]
    logger.info(f"Found {len(repo_dirs)} repositories")

    # Initialize CSV writer manager (streaming mode - write as we go)
    csv_manager = CSVWriterManager(OUTPUT_DIR)

    # Process repositories and write immediately
    logger.info("Processing repositories (streaming mode - writing as we go)...")

    total_files_processed = 0
    repos_processed = 0          # only repos that yielded at least one file
    repos_with_no_keyword = 0

    try:
        with tqdm(total=len(repo_dirs), desc="Processing repos") as pbar:
            for repo_name in repo_dirs:
                # Get keyword for this repo (directory names encode '/' as '___')
                full_name = repo_name.replace('___', '/')
                keyword = keyword_mapping.get(full_name)

                if not keyword:
                    repos_with_no_keyword += 1
                    pbar.update(1)
                    continue

                # Process repository (process_repo re-derives the keyword
                # from the same mapping internally)
                results = process_repo(repo_name, keyword_mapping)

                if results:
                    # Write results immediately to CSV (streaming)
                    for result in results:
                        csv_manager.write_row(result['keyword'], result)
                        total_files_processed += 1
                    repos_processed += 1

                pbar.update(1)

                # Periodic logging
                # NOTE(review): this fires every iteration while repos_processed
                # sits on a multiple of 1000 (e.g. when intervening repos yield
                # no files), so duplicate progress lines are possible.
                if repos_processed > 0 and repos_processed % 1000 == 0:
                    logger.info(f"Progress: {repos_processed} repos, {total_files_processed} files")

    finally:
        # Close all CSV files even if processing raised
        csv_manager.close_all()

    # Print summary
    total_keywords, total_files = csv_manager.get_stats()

    logger.info("=" * 60)
    logger.info("Export completed!")
    logger.info(f"Repositories processed: {repos_processed}")
    logger.info(f"Repositories with no keyword mapping: {repos_with_no_keyword}")
    logger.info(f"Total keywords: {total_keywords}")
    logger.info(f"Total files exported: {total_files}")
    logger.info(f"Output directory: {OUTPUT_DIR}")
    logger.info("=" * 60)
399
+
400
+ if __name__ == "__main__":
401
+ main()
402
+