DouDou committed on
Commit
f1c06ef
·
verified ·
1 Parent(s): ce13bef

Upload data1/reporting/code_file_stats_fast.py with huggingface_hub

Browse files
data1/reporting/code_file_stats_fast.py ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
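+ Stage C: per-file code statistics (optimized version - much faster)
3
+
4
+ Optimization strategy:
5
+ 1. Use simplified counting instead of complex regex matching
6
+ 2. Use rough estimates for large files
7
+ 3. Checkpoint/resume support
8
+ 4. Batch processing to reduce IPC overhead
9
+ 5. Skip detailed function-parameter analysis and use fast counting instead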
10
+ """
11
+ import os
12
+ import json
13
+ import sys
14
+ from pathlib import Path
15
+ from collections import defaultdict, Counter
16
+ from tqdm import tqdm
17
+ import statistics
18
+ import math
19
+ from multiprocessing import Pool, cpu_count
20
+ import pandas as pd
21
+ import pickle
22
+ import hashlib
23
+
24
# ============== Fast statistics helpers (replace complex regexes) ==============

# Byte substrings whose occurrences are counted to approximate the number of
# function definitions per language. This is a cheap heuristic, not a parser.
FUNC_KEYWORDS = {
    'python': [b'def '],
    'jupyter': [b'def '],
    'java': [
        b'public ',
        b'private ',
        b'protected ',
        b'void ',
        b'static ',
    ],
    'c/c++': [
        b'void ',
        b'int ',
        b'float ',
        b'double ',
        b'char ',
        b'bool ',
    ],
    'go': [b'func '],
    'rust': [b'fn '],
    'r': [
        b'function(',
        b'function (',
    ],
    'matlab': [b'function '],
    'shell': [
        b'function ',
        b'() {',
    ],
    'fortran': [
        b'subroutine ',
        b'function ',
        b'SUBROUTINE ',
        b'FUNCTION ',
    ],
}

# Byte substrings that introduce a comment in each language; their occurrences
# are counted to estimate the number of comment lines.
COMMENT_MARKERS = {
    'python': (b'#', b'"""', b"'''"),
    'jupyter': (b'#', b'"""', b"'''"),
    'java': (b'//', b'/*'),
    'c/c++': (b'//', b'/*'),
    'go': (b'//', b'/*'),
    'rust': (b'//', b'/*'),
    'r': (b'#',),
    'matlab': (b'%', b'%{'),
    'shell': (b'#',),
    'fortran': (b'!',),
}

# File extension -> language label.
EXT_MAP = {
    '.py': 'python',
    '.java': 'java',
    '.c': 'c/c++',
    '.h': 'c/c++',
    '.hh': 'c/c++',
    '.hpp': 'c/c++',
    '.cpp': 'c/c++',
    '.cc': 'c/c++',
    '.cxx': 'c/c++',
    '.c++': 'c/c++',
    '.f': 'fortran',
    '.f90': 'fortran',
    '.f95': 'fortran',
    '.F': 'fortran',
    '.r': 'r',
    '.m': 'matlab',
    '.sh': 'shell',
    '.bash': 'shell',
    '.rs': 'rust',
    '.go': 'go',
    '.ipynb': 'jupyter',
}
63
+
64
+
65
def detect_language_fast(file_path: str) -> str:
    """Map a file path to a language label using only its extension.

    The extension is lower-cased before the ``EXT_MAP`` lookup; a path whose
    extension is not listed there yields ``'unknown'``.
    """
    _, extension = os.path.splitext(file_path)
    return EXT_MAP.get(extension.lower(), 'unknown')
69
+
70
+
71
def fast_analyze_file(file_path: Path, repo_name: str, max_file_size_bytes: int = 2*1024*1024) -> dict:
    """Compute rough per-file statistics using byte-level counting.

    All numbers are deliberate approximations that trade accuracy for speed:
    comment lines and functions are estimated by counting marker/keyword
    substrings, tokens by whitespace splitting, and blank lines by counting
    doubled line terminators.

    Args:
        file_path: File to analyse.
        repo_name: Name of the repository the file belongs to; copied into
            the result unchanged.
        max_file_size_bytes: Files larger than this are skipped.

    Returns:
        A dict of metrics for the file, or ``None`` when the file is too
        large, cannot be read, or any other error occurs during analysis.
    """
    try:
        file_size = file_path.stat().st_size
        if file_size > max_file_size_bytes:
            return None

        ext = file_path.suffix.lower()

        # Notebooks are JSON documents, so they get a dedicated estimator.
        if ext == '.ipynb':
            return fast_analyze_notebook(file_path, repo_name, file_size)

        # Read in binary mode: no decoding cost and no encoding errors.
        try:
            with open(file_path, 'rb') as f:
                content = f.read()
        except OSError:
            # Only I/O failures are expected here; a bare ``except`` would
            # also swallow KeyboardInterrupt/SystemExit inside workers.
            return None

        lang = detect_language_fast(str(file_path))

        # Newline count + 1 (counts a trailing newline as one extra line).
        lines = content.count(b'\n') + 1

        # Estimate comment lines by counting comment markers, assuming one
        # marker per comment line.
        comment_lines = 0
        if lang in COMMENT_MARKERS:
            for marker in COMMENT_MARKERS[lang]:
                comment_lines += content.count(marker)
            # Cap the estimate at half of the file's lines.
            comment_lines = min(comment_lines, lines // 2)

        # Estimate the number of functions by counting keyword occurrences.
        functions = 0
        if lang in FUNC_KEYWORDS:
            for keyword in FUNC_KEYWORDS[lang]:
                functions += content.count(keyword)

        # Token estimate: whitespace-separated chunks.
        tokens = len(content.split())

        # Blank-line estimate: doubled LF / CRLF terminators.
        empty_lines = content.count(b'\n\n') + content.count(b'\r\n\r\n')

        code_lines = max(0, lines - empty_lines - comment_lines)

        return {
            'repo_name': repo_name,
            'file_path': str(file_path.name),  # file name only, to save memory
            'file_size_bytes': file_size,
            'language': lang,
            'total_lines': lines,
            'comment_lines': comment_lines,
            'code_lines': code_lines,
            'tokens': tokens,
            'functions': functions,
            'parameters': functions * 2,  # rough guess: two parameters per function
        }
    except Exception:
        # Best effort: a file that cannot be analysed is skipped, not fatal.
        return None
134
+
135
+
136
def fast_analyze_notebook(file_path: Path, repo_name: str, file_size: int) -> dict:
    """Compute rough statistics for a Jupyter notebook from its raw bytes.

    The notebook JSON is never parsed; cell counts come from substring
    matches on the ``"cell_type"`` field (with and without a space after the
    colon).

    Args:
        file_path: Path to the ``.ipynb`` file.
        repo_name: Name of the repository the file belongs to; copied into
            the result unchanged.
        file_size: Size of the file in bytes, as already measured by the
            caller.

    Returns:
        A dict of metrics for the notebook, or ``None`` if the file cannot
        be read.
    """
    try:
        with open(file_path, 'rb') as f:
            content = f.read()
    except OSError:
        return None

    # Count code cells.
    code_cell_count = (
        content.count(b'"cell_type": "code"')
        + content.count(b'"cell_type":"code"')
    )
    # Markdown cells stand in for comments. The previous code reported the
    # code-cell count here, contradicting its own inline comment.
    markdown_cell_count = (
        content.count(b'"cell_type": "markdown"')
        + content.count(b'"cell_type":"markdown"')
    )

    lines = content.count(b'\n') + 1
    code_lines = code_cell_count * 10  # rough guess: ten code lines per cell
    function_count = content.count(b'def ')

    return {
        'repo_name': repo_name,
        'file_path': str(file_path.name),
        'file_size_bytes': file_size,
        'language': 'jupyter',
        'total_lines': lines,
        'comment_lines': markdown_cell_count,
        'code_lines': code_lines,
        'tokens': len(content.split()),
        'functions': function_count,
        'parameters': function_count * 2,  # rough guess: two parameters per function
    }
163
+
164
+
165
def _default_repo_stats():
    """Build a fresh, zeroed per-repository accumulator.

    Defined as a module-level function so it can serve as a ``defaultdict``
    factory.
    """
    counters = (
        'total_files',
        'total_lines',
        'total_code_lines',
        'total_comment_lines',
        'total_tokens',
        'total_functions',
        'total_parameters',
    )
    accumulator = dict.fromkeys(counters, 0)
    accumulator['languages'] = Counter()
    accumulator['file_sizes'] = []
    return accumulator
178
+
179
+
180
# Directory names that are pruned from the walk (VCS data, dependency trees,
# build output, caches, virtual environments, IDE settings).
SKIP_DIRS = {
    '.git',
    'node_modules',
    'vendor',
    'dist',
    'build',
    '__pycache__',
    '.pytest_cache',
    '.ipynb_checkpoints',
    'venv',
    'env',
    '.venv',
    'target',
    '.idea',
    '.vscode',
    '.mypy_cache',
    '.tox',
    '.eggs',
    'site-packages',
    'lib',
    'libs',
    'third_party',
    'external',
}

# File extensions that are treated as source code.
CODE_EXTENSIONS = {
    '.py',
    '.java',
    '.c',
    '.h',
    '.hh',
    '.hpp',
    '.cpp',
    '.cc',
    '.cxx',
    '.c++',
    '.f',
    '.f90',
    '.f95',
    '.F',
    '.r',
    '.m',
    '.sh',
    '.bash',
    '.rs',
    '.go',
    '.ipynb',
}
194
+
195
+
196
def scan_repo_fast(args):
    """Collect per-file statistics for one repository (multiprocessing worker).

    Args:
        args: A ``(repo_path, max_file_size_bytes, max_files_per_repo)``
            tuple, packed into a single argument so the function can be
            mapped over a process pool.

    Returns:
        A list of the per-file dicts returned by ``fast_analyze_file``, at
        most ``max_files_per_repo`` long. Errors raised during the walk are
        swallowed and whatever was collected so far is returned.
    """
    repo_path, max_file_size_bytes, max_files_per_repo = args
    repo_name = repo_path.name
    collected = []

    try:
        for current_dir, subdirs, filenames in os.walk(repo_path):
            # Prune in place so os.walk never descends into skipped directories.
            subdirs[:] = [name for name in subdirs if name not in SKIP_DIRS]

            for filename in filenames:
                if len(collected) >= max_files_per_repo:
                    break

                candidate = Path(current_dir) / filename
                # Only source files are analysed.
                if candidate.suffix.lower() not in CODE_EXTENSIONS:
                    continue

                record = fast_analyze_file(candidate, repo_name, max_file_size_bytes)
                if record:
                    collected.append(record)

            # Stop walking once the per-repo cap has been reached.
            if len(collected) >= max_files_per_repo:
                break
    except Exception:
        pass

    return collected
228
+
229
+
230
class CodeFileStatsFast:
    """Scan repositories in parallel and write file- and repo-level metrics.

    The scan is resumable: progress is periodically pickled to
    ``checkpoint.pkl`` in the output directory and removed once the final
    results have been written.
    """

    def __init__(self, repos_dir, output_dir, top_n=None, max_file_size_mb=2, max_files_per_repo=500):
        """Configure the scan and create the output directory.

        Args:
            repos_dir: Directory whose immediate sub-directories are the
                repositories to scan.
            output_dir: Directory for results and the checkpoint file;
                created if missing.
            top_n: If given, only the first ``top_n`` repositories (sorted
                by path) are scanned.
            max_file_size_mb: Files larger than this many MiB are skipped.
            max_files_per_repo: Upper bound on analysed files per repository.
        """
        self.repos_dir = Path(repos_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.top_n = top_n
        self.max_file_size_bytes = max_file_size_mb * 1024 * 1024
        self.max_files_per_repo = max_files_per_repo

        self.file_stats = []
        self.repo_stats = defaultdict(_default_repo_stats)

        # Checkpoint/resume support.
        self.checkpoint_file = self.output_dir / 'checkpoint.pkl'
        self.processed_repos = set()

    def load_checkpoint(self):
        """Restore progress from the checkpoint file, if one exists.

        Returns:
            ``True`` if a checkpoint was loaded, ``False`` if none exists or
            it could not be read (the scan then starts from scratch).
        """
        if not self.checkpoint_file.exists():
            return False

        try:
            # NOTE(security): pickle can execute arbitrary code on load. This
            # is only acceptable because the checkpoint is written by this
            # class itself; never point it at an untrusted file.
            with open(self.checkpoint_file, 'rb') as f:
                data = pickle.load(f)
            processed_repos = data.get('processed_repos', set())
            file_stats = data.get('file_stats', [])
        except Exception as exc:
            # A damaged checkpoint must not abort the run, but it should not
            # be ignored silently either.
            print(f"Warning: ignoring unreadable checkpoint {self.checkpoint_file}: {exc}")
            return False

        self.processed_repos = processed_repos
        self.file_stats = file_stats
        print(f"Loaded checkpoint: {len(self.processed_repos)} repos already processed")
        return True

    def save_checkpoint(self):
        """Persist progress so an interrupted run can resume.

        The data is written to a temporary file and then moved over the real
        checkpoint, so a crash mid-write cannot leave a truncated checkpoint.
        Failures are reported but do not interrupt the scan.
        """
        tmp_path = self.checkpoint_file.with_suffix('.pkl.tmp')
        try:
            with open(tmp_path, 'wb') as f:
                pickle.dump({
                    'processed_repos': self.processed_repos,
                    'file_stats': self.file_stats,
                }, f)
            tmp_path.replace(self.checkpoint_file)
        except (OSError, pickle.PicklingError) as exc:
            print(f"Warning: failed to save checkpoint: {exc}")

    def scan_all_repos(self, num_workers=None):
        """Scan every selected repository with a process pool.

        Repositories recorded in the checkpoint are skipped. Every scanned
        repository is marked as processed, including those that yield no
        code files, so a resumed run does not scan them again.

        Args:
            num_workers: Number of worker processes; defaults to the CPU
                count capped at 48.
        """
        if num_workers is None:
            num_workers = min(cpu_count(), 48)

        self.load_checkpoint()

        # Collect repository directories in a stable (sorted) order.
        all_repos = sorted([d for d in self.repos_dir.iterdir() if d.is_dir()])
        if self.top_n is None:
            selected_repos = all_repos
        else:
            selected_repos = all_repos[:self.top_n]

        # Drop repositories that a previous run already handled.
        repos_to_process = [r for r in selected_repos if r.name not in self.processed_repos]

        print(f"Total repos: {len(selected_repos)} ({'all' if self.top_n is None else f'top {self.top_n}'}), Already processed: {len(self.processed_repos)}, To process: {len(repos_to_process)}")
        print(f"Using {num_workers} workers...")

        if not repos_to_process:
            print("All repos already processed!")
            return

        args_list = [(repo, self.max_file_size_bytes, self.max_files_per_repo) for repo in repos_to_process]

        # A larger chunksize reduces IPC overhead.
        chunksize = max(1, len(repos_to_process) // (num_workers * 10))

        processed_count = 0
        checkpoint_interval = 500  # save a checkpoint every 500 repositories

        with Pool(processes=num_workers) as pool:
            # Ordered imap keeps results aligned with repos_to_process, which
            # lets a repository be marked as processed even when its result
            # list is empty.
            results = pool.imap(scan_repo_fast, args_list, chunksize=chunksize)
            for repo, repo_files in tqdm(
                zip(repos_to_process, results),
                total=len(repos_to_process),
                desc="Scanning repos"
            ):
                if repo_files:
                    self.file_stats.extend(repo_files)
                self.processed_repos.add(repo.name)

                processed_count += 1

                if processed_count % checkpoint_interval == 0:
                    self.save_checkpoint()
                    print(f"\nCheckpoint saved: {len(self.processed_repos)} repos processed, {len(self.file_stats)} files found")

        # Final checkpoint covering the whole run.
        self.save_checkpoint()
        print(f"Found {len(self.file_stats)} code files from {len(self.processed_repos)} repos")

    def aggregate_repo_stats(self):
        """Roll file-level records up into one metrics dict per repository.

        The accumulator is rebuilt from ``self.file_stats`` on every call, so
        calling this method more than once returns the same numbers.

        Returns:
            A list of flat dicts, one per repository that has at least one
            analysed file.
        """
        # Start from a clean slate; otherwise a repeated call would add the
        # same files on top of the previous totals.
        self.repo_stats.clear()

        for file_stat in self.file_stats:
            repo_totals = self.repo_stats[file_stat['repo_name']]
            repo_totals['total_files'] += 1
            repo_totals['total_lines'] += file_stat['total_lines']
            repo_totals['total_code_lines'] += file_stat['code_lines']
            repo_totals['total_comment_lines'] += file_stat['comment_lines']
            repo_totals['total_tokens'] += file_stat['tokens']
            repo_totals['total_functions'] += file_stat['functions']
            repo_totals['total_parameters'] += file_stat['parameters']
            repo_totals['languages'][file_stat['language']] += 1
            repo_totals['file_sizes'].append(file_stat['file_size_bytes'])

        # Flatten into plain dicts; key order defines the CSV column order.
        repo_stats_list = []
        for repo, stats in self.repo_stats.items():
            total_files = stats['total_files']
            if total_files == 0:
                continue

            if stats['languages']:
                primary_language, primary_language_files = stats['languages'].most_common(1)[0]
            else:
                primary_language, primary_language_files = 'unknown', 0

            stats_dict = {
                'repo_name': repo,
                'full_name': repo.replace('___', '/'),
                'total_files': total_files,
                'total_lines': stats['total_lines'],
                'total_code_lines': stats['total_code_lines'],
                'total_comment_lines': stats['total_comment_lines'],
                'total_tokens': stats['total_tokens'],
                'total_functions': stats['total_functions'],
                'total_parameters': stats['total_parameters'],
                'language_count': len(stats['languages']),
                'primary_language': primary_language,
                'primary_language_files': primary_language_files,
            }

            # Derived metrics.
            if stats['total_lines'] > 0:
                stats_dict['comment_ratio'] = stats['total_comment_lines'] / stats['total_lines']
            else:
                stats_dict['comment_ratio'] = 0

            if stats['total_functions'] > 0:
                stats_dict['avg_func_length'] = stats['total_code_lines'] / stats['total_functions']
                stats_dict['avg_params_per_func'] = stats['total_parameters'] / stats['total_functions']
            else:
                stats_dict['avg_func_length'] = 0
                stats_dict['avg_params_per_func'] = 0

            # Language diversity: Shannon entropy (base 2) of the per-language
            # file counts.
            if stats['languages']:
                total_lang_files = sum(stats['languages'].values())
                entropy = 0
                for count in stats['languages'].values():
                    p = count / total_lang_files
                    if p > 0:
                        entropy -= p * math.log2(p)
                stats_dict['language_entropy'] = entropy
            else:
                stats_dict['language_entropy'] = 0

            # File size statistics.
            if stats['file_sizes']:
                stats_dict['avg_file_size_kb'] = statistics.mean(stats['file_sizes']) / 1024
                stats_dict['max_file_size_mb'] = max(stats['file_sizes']) / (1024 * 1024)
            else:
                stats_dict['avg_file_size_kb'] = 0
                stats_dict['max_file_size_mb'] = 0

            # Share of files written in the primary language.
            if stats['languages']:
                stats_dict['primary_language_ratio'] = primary_language_files / total_files
            else:
                stats_dict['primary_language_ratio'] = 0

            repo_stats_list.append(stats_dict)

        return repo_stats_list

    def save_results(self):
        """Write the CSV/JSON outputs and remove the checkpoint file."""
        # File-level metrics, down-sampled to at most 10000 rows.
        file_df = pd.DataFrame(self.file_stats)
        if len(file_df) > 10000:
            file_df_sample = file_df.sample(n=10000, random_state=42)
        else:
            file_df_sample = file_df

        file_df_sample.to_csv(self.output_dir / 'file_level_metrics_sampled.csv', index=False)

        # Repo-level metrics; the file name carries the top_n setting.
        repo_stats_list = self.aggregate_repo_stats()
        repo_df = pd.DataFrame(repo_stats_list)
        top_n_suffix = f"_top{self.top_n}" if self.top_n else ""
        repo_df.to_csv(self.output_dir / f'repo_level_metrics{top_n_suffix}.csv', index=False)

        # Overall summary.
        summary = {
            'total_files': len(self.file_stats),
            'total_repos': len(self.repo_stats),
            'avg_files_per_repo': len(self.file_stats) / len(self.repo_stats) if self.repo_stats else 0,
        }

        # File counts for the 20 most common languages.
        lang_counter = Counter(f['language'] for f in self.file_stats)
        summary['files_by_language'] = dict(lang_counter.most_common(20))

        with open(self.output_dir / 'code_stats_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        # The run is complete, so the checkpoint is no longer needed.
        if self.checkpoint_file.exists():
            self.checkpoint_file.unlink()

    def run(self, num_workers=None):
        """Run the full pipeline: scan repositories, then save results.

        Repo-level aggregation happens inside ``save_results``.
        """
        print("Stage C (Fast): Analyzing code files...")
        self.scan_all_repos(num_workers=num_workers)
        print("Aggregating repo-level stats...")
        print("Saving results...")
        self.save_results()
        print(f"Code file stats complete! Results saved to {self.output_dir}")
453
+
454
+
455
if __name__ == "__main__":
    # Input repositories and output location for this run (hard-coded).
    repos_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"
    output_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/code_stats"

    # Scan the first 15000 repositories, skipping files above 2 MiB and
    # analysing at most 500 files per repository.
    stats = CodeFileStatsFast(
        repos_dir,
        output_dir,
        top_n=15000,
        max_file_size_mb=2,
        max_files_per_repo=500,
    )

    # Use 48 worker processes.
    stats.run(num_workers=48)
468
+