DouDou committed on
Commit
ce13bef
·
verified ·
1 Parent(s): 0a4edf5

Upload data1/reporting/code_file_stats.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/reporting/code_file_stats.py +321 -0
data1/reporting/code_file_stats.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage C: 代码文件级统计(复用analysis.py的逻辑)
3
+ 对前15000仓库进行代码文件分析
4
+ """
5
+ import os
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+ from collections import defaultdict, Counter
10
+ from tqdm import tqdm
11
+ import statistics
12
+ import math
13
+ from multiprocessing import Pool, cpu_count
14
+ import pandas as pd
15
+
16
+ # 导入analysis.py的函数
17
+ sys.path.insert(0, str(Path(__file__).parent.parent))
18
+ from analysis import (
19
+ detect_language, count_comments, count_tokens,
20
+ count_functions_and_parameters, analyze_code
21
+ )
22
+
23
+
24
+ def _default_repo_stats():
25
+ """Factory function for defaultdict (must be top-level for pickle)"""
26
+ return {
27
+ 'total_files': 0,
28
+ 'total_lines': 0,
29
+ 'total_code_lines': 0,
30
+ 'total_comment_lines': 0,
31
+ 'total_tokens': 0,
32
+ 'total_functions': 0,
33
+ 'total_parameters': 0,
34
+ 'languages': Counter(),
35
+ 'file_sizes': [],
36
+ }
37
+
38
+
39
class CodeFileStats:
    """Stage C: per-file code statistics for the top-N repositories.

    Reuses the language detection / comment / token / function analysis from
    ``analysis.py`` (``analyze_code``), walks each repository for code files,
    and rolls the file-level metrics up into repo-level summaries written as
    CSV/JSON under ``output_dir``.
    """

    def __init__(self, repos_dir, output_dir, top_n=15000, max_file_size_mb=2):
        """
        Args:
            repos_dir: directory containing one sub-directory per repository.
            output_dir: where result CSV/JSON files are written (created if missing).
            top_n: only the first ``top_n`` repos (sorted by name) are scanned.
            max_file_size_mb: files larger than this are skipped entirely.
        """
        self.repos_dir = Path(repos_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.top_n = top_n
        self.max_file_size_bytes = max_file_size_mb * 1024 * 1024

        # Directories skipped during the walk (VCS metadata, dependency
        # trees, build output, editor/tool caches).
        self.skip_dirs = {
            '.git', 'node_modules', 'vendor', 'dist', 'build', '__pycache__',
            '.pytest_cache', '.ipynb_checkpoints', 'venv', 'env', '.venv',
            'target', '.idea', '.vscode', '.mypy_cache', '.tox'
        }

        # Code file extensions (mirrors analysis.py). Notebooks are parsed
        # cell-by-cell instead of read as plain text.
        self.code_extensions = {
            '.py', '.java', '.c', '.h', '.hh', '.hpp', '.cpp', '.cc', '.cxx', '.c++',
            '.f', '.f90', '.f95', '.F', '.r', '.m', '.sh', '.bash', '.rs', '.go',
            '.ipynb'  # handled separately in analyze_file
        }

        self.file_stats = []  # one metrics dict per analyzed file
        self.repo_stats = defaultdict(_default_repo_stats)

    def parse_notebook(self, file_path):
        """Extract the code cells of a Jupyter notebook.

        Returns the code-cell sources joined with newlines, or ``None`` when
        the notebook is unreadable/malformed (callers skip the file).
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                nb = json.load(f)

            code_cells = []
            for cell in nb.get('cells', []):
                if cell.get('cell_type') != 'code':
                    continue
                source = cell.get('source', [])
                # nbformat stores source either as a list of lines or a string.
                code = ''.join(source) if isinstance(source, list) else str(source)
                if code.strip():
                    code_cells.append(code)

            return '\n'.join(code_cells)
        # Deliberately broad but not bare: a malformed notebook is a
        # best-effort skip, never a crash (and never swallows SystemExit).
        except Exception:
            return None

    def analyze_file(self, file_path, repo_name):
        """Analyze a single code file.

        Returns a metrics dict (analyze_code output plus repo/file metadata
        and derived ratios), or ``None`` when the file is oversized,
        unreadable, or an empty notebook.
        """
        try:
            # Skip files over the configured size cap.
            file_size = file_path.stat().st_size
            if file_size > self.max_file_size_bytes:
                return None

            is_notebook = file_path.suffix.lower() == '.ipynb'
            if is_notebook:
                code = self.parse_notebook(file_path)
                if not code:
                    return None
            else:
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        code = f.read()
                except OSError:
                    return None

            # Core metrics are delegated to analysis.analyze_code.
            result = analyze_code(code, str(file_path))
            result['repo_name'] = repo_name
            result['file_path'] = str(file_path.relative_to(self.repos_dir / repo_name))
            result['file_size_bytes'] = file_size

            # Derived per-file ratios; 0 whenever the denominator is 0.
            total_lines = result['total_lines']
            code_lines = result['code_lines']
            functions = result['functions']
            result['comment_ratio'] = result['comment_lines'] / total_lines if total_lines > 0 else 0
            result['code_density'] = code_lines / total_lines if total_lines > 0 else 0
            result['avg_tokens_per_line'] = result['tokens'] / code_lines if code_lines > 0 else 0
            result['avg_params_per_func'] = result['parameters'] / functions if functions > 0 else 0

            # Notebooks always report 'jupyter' regardless of what
            # detect_language inferred from the concatenated cells.
            if is_notebook:
                result['language'] = 'jupyter'

            return result
        except Exception:
            # Any unexpected per-file failure just skips the file; the scan
            # over thousands of repos must not abort on one bad path.
            return None

    def scan_repo(self, repo_path):
        """Scan one repository and return the list of per-file metric dicts."""
        repo_name = repo_path.name
        repo_files = []

        for root, dirs, files in os.walk(repo_path):
            # Prune unwanted directories in place so os.walk never descends.
            dirs[:] = [d for d in dirs if d not in self.skip_dirs]

            for file in files:
                file_path = Path(root) / file
                ext = file_path.suffix.lower()

                # Only known code extensions, plus extension-less files
                # (scripts/Makefile-style names left to analyze_code).
                if ext in self.code_extensions or ext == '':
                    result = self.analyze_file(file_path, repo_name)
                    if result:
                        repo_files.append(result)

        return repo_files

    def scan_all_repos(self, num_workers=None):
        """Scan the top-N repositories in parallel and collect file stats.

        Args:
            num_workers: process count; defaults to cpu_count capped at 32
                to bound memory usage.
        """
        if num_workers is None:
            num_workers = min(cpu_count(), 32)

        # Deterministic repo selection: sorted by directory name, first top_n.
        all_repos = sorted([d for d in self.repos_dir.iterdir() if d.is_dir()])
        selected_repos = all_repos[:self.top_n]

        print(f"Scanning {len(selected_repos)} repos for code files using {num_workers} workers...")

        # chunksize=1 keeps the progress bar updating in near-real time.
        chunksize = 1

        # imap_unordered returns per-repo results as soon as they finish.
        with Pool(processes=num_workers) as pool:
            results = list(tqdm(
                pool.imap_unordered(self.scan_repo, selected_repos, chunksize=chunksize),
                total=len(selected_repos),
                desc="Scanning repos"
            ))

        # Flatten the per-repo lists into one file-level list.
        for repo_files in results:
            self.file_stats.extend(repo_files)

        print(f"Found {len(self.file_stats)} code files")

    def aggregate_repo_stats(self):
        """Roll file-level stats up into one flat summary dict per repo.

        Returns a list of JSON/CSV-serializable dicts. NOTE: accumulates
        into self.repo_stats, so it is intended to be called once per scan.
        """
        for file_stat in self.file_stats:
            agg = self.repo_stats[file_stat['repo_name']]
            agg['total_files'] += 1
            agg['total_lines'] += file_stat['total_lines']
            agg['total_code_lines'] += file_stat['code_lines']
            agg['total_comment_lines'] += file_stat['comment_lines']
            agg['total_tokens'] += file_stat['tokens']
            agg['total_functions'] += file_stat['functions']
            agg['total_parameters'] += file_stat['parameters']
            agg['languages'][file_stat['language']] += 1
            agg['file_sizes'].append(file_stat['file_size_bytes'])

        # Convert accumulators into flat serializable dicts.
        repo_stats_list = []
        for repo, stats in self.repo_stats.items():
            total_files = stats['total_files']
            stats_dict = {
                'repo_name': repo,
                # Repo dirs encode "owner/name" as "owner___name".
                'full_name': repo.replace('___', '/'),
                'total_files': total_files,
                'total_lines': stats['total_lines'],
                'total_code_lines': stats['total_code_lines'],
                'total_comment_lines': stats['total_comment_lines'],
                'total_tokens': stats['total_tokens'],
                'total_functions': stats['total_functions'],
                'total_parameters': stats['total_parameters'],
                'language_count': len(stats['languages']),
                'primary_language': stats['languages'].most_common(1)[0][0] if stats['languages'] else 'unknown',
                'primary_language_files': stats['languages'].most_common(1)[0][1] if stats['languages'] else 0,
            }

            # Derived ratios; 0 whenever the denominator is 0.
            if stats['total_lines'] > 0:
                stats_dict['comment_ratio'] = stats['total_comment_lines'] / stats['total_lines']
            else:
                stats_dict['comment_ratio'] = 0

            if stats['total_functions'] > 0:
                stats_dict['avg_func_length'] = stats['total_code_lines'] / stats['total_functions']
                stats_dict['avg_params_per_func'] = stats['total_parameters'] / stats['total_functions']
            else:
                stats_dict['avg_func_length'] = 0
                stats_dict['avg_params_per_func'] = 0

            # Language diversity: Shannon entropy over per-language file counts.
            if stats['languages']:
                total_lang_files = sum(stats['languages'].values())
                entropy = 0
                for count in stats['languages'].values():
                    p = count / total_lang_files
                    if p > 0:
                        entropy -= p * math.log2(p)
                stats_dict['language_entropy'] = entropy
            else:
                stats_dict['language_entropy'] = 0

            # File size stats (keys only present when at least one file seen,
            # matching the original output schema).
            if stats['file_sizes']:
                stats_dict['avg_file_size_kb'] = statistics.mean(stats['file_sizes']) / 1024
                stats_dict['max_file_size_mb'] = max(stats['file_sizes']) / (1024 * 1024)

            # Share of files written in the dominant language.
            if stats['languages']:
                primary_lang_count = stats['languages'].most_common(1)[0][1]
                stats_dict['primary_language_ratio'] = primary_lang_count / total_files
            else:
                stats_dict['primary_language_ratio'] = 0

            repo_stats_list.append(stats_dict)

        return repo_stats_list

    def save_results(self):
        """Write file-level sample CSV, repo-level CSV, and a JSON summary."""
        # File-level metrics: to bound output size, keep only the 5000
        # largest + 5000 smallest files when there are more than 10000.
        file_df = pd.DataFrame(self.file_stats)
        if len(file_df) > 10000:
            file_df_large = file_df.nlargest(5000, 'file_size_bytes')
            file_df_small = file_df.nsmallest(5000, 'file_size_bytes')
            file_df_sample = pd.concat([file_df_large, file_df_small]).drop_duplicates()
        else:
            file_df_sample = file_df

        file_df_sample.to_csv(self.output_dir / 'file_level_metrics_sampled.csv', index=False)

        # Repo-level metrics (aggregation happens here).
        repo_stats_list = self.aggregate_repo_stats()
        repo_df = pd.DataFrame(repo_stats_list)
        repo_df.to_csv(self.output_dir / 'repo_level_metrics_top15000.csv', index=False)

        # Overall summary counts.
        summary = {
            'total_files': len(self.file_stats),
            'total_repos': len(self.repo_stats),
            'avg_files_per_repo': len(self.file_stats) / len(self.repo_stats) if self.repo_stats else 0,
        }

        # Top-20 languages by file count.
        lang_counter = Counter(f['language'] for f in self.file_stats)
        summary['files_by_language'] = dict(lang_counter.most_common(20))

        if repo_stats_list:
            summary['repo_stats'] = {
                'avg_total_lines': statistics.mean([r['total_lines'] for r in repo_stats_list]),
                'avg_code_lines': statistics.mean([r['total_code_lines'] for r in repo_stats_list]),
                'avg_comment_lines': statistics.mean([r['total_comment_lines'] for r in repo_stats_list]),
                'avg_tokens': statistics.mean([r['total_tokens'] for r in repo_stats_list]),
                'avg_functions': statistics.mean([r['total_functions'] for r in repo_stats_list]),
            }

        with open(self.output_dir / 'code_stats_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

    def run(self, num_workers=None):
        """Execute the full pipeline: scan, aggregate, and save."""
        print("Stage C: Analyzing code files...")
        self.scan_all_repos(num_workers=num_workers)
        print("Aggregating repo-level stats...")
        print("Saving results...")
        # save_results() performs the aggregation internally.
        self.save_results()
        print(f"Code file stats complete! Results saved to {self.output_dir}")
314
+
315
+
316
if __name__ == "__main__":
    # Input: filtered repo mirror; output: reporting directory for Stage C.
    REPOS_DIR = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"
    OUTPUT_DIR = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/code_stats"
    CodeFileStats(REPOS_DIR, OUTPUT_DIR, top_n=15000).run()
321
+