DouDou committed on
Commit
9f60e31
·
verified ·
1 Parent(s): c1ef1a3

Upload data1/reporting/stage_a_stats.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/reporting/stage_a_stats.py +368 -0
data1/reporting/stage_a_stats.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage A: 流式统计 repos_searched.csv (130万+行)
3
+ 统计缺失率、by_keyword/by_language/by_license/by_year、stars/forks/open_issues/size分布
4
+ """
5
+ import csv
6
+ import sys
7
+ from collections import defaultdict, Counter
8
+ from datetime import datetime
9
+ import json
10
+ from pathlib import Path
11
+ from tqdm import tqdm
12
+ import statistics
13
+ import re
14
+
15
# Raise the csv module's per-field size cap as far as the platform allows so
# that unusually long fields do not abort parsing with "field larger than
# field limit". sys.maxsize does not fit in a C long on every platform (the
# call then raises OverflowError), so back off until a value is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2
16
+
17
+
18
class StageAStats:
    """Accumulate descriptive statistics over a repository search-result CSV.

    Rows are consumed one at a time by :meth:`process_row` (driven by
    :meth:`process_csv`); the ``save_*`` methods then write the aggregated
    tables into ``output_dir``. :meth:`run` executes the whole pipeline.
    """

    # Columns whose missing rate is reported in the summary.
    MISSING_FIELDS = ('description', 'language', 'topics', 'license')
    # Integer columns that get a min/percentile/max/mean summary.
    NUMERIC_FIELDS = ('stars', 'forks', 'open_issues', 'size')
    # Lower-cased tokens that safe_bool() interprets as True.
    TRUE_TOKENS = ('true', 'yes', '1', 't')

    def __init__(self, csv_path, output_dir):
        """Remember the input path and create ``output_dir`` if necessary.

        Args:
            csv_path: Path of the CSV file read by :meth:`process_csv`.
            output_dir: Directory that receives every report file; it is
                created (including parents) when it does not exist.
        """
        self.csv_path = csv_path
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Containers for the running statistics.
        self.stats = {
            'total_records': 0,
            'unique_repos': set(),
            'unique_owners': set(),
            'unique_keywords': set(),
            'by_keyword': defaultdict(lambda: {'count': 0, 'repos': set()}),
            'by_language': defaultdict(int),
            'by_license': defaultdict(int),
            'by_year': defaultdict(int),
            'by_year_month': defaultdict(int),
            'missing': defaultdict(int),
            'topics_stats': {'counts': [], 'missing': 0},
            'desc_lengths': [],
            'stars': [],
            'forks': [],
            'open_issues': [],
            'size': [],
            'archived': {'yes': 0, 'no': 0},
            'has_wiki': {'yes': 0, 'no': 0},
            'default_branch': defaultdict(int),
            # Number of rows seen per repository (one row per keyword hit).
            'repo_keyword_count': defaultdict(int),
            # Topic -> occurrences. Counted incrementally so memory grows
            # with the number of distinct topics, not with the row count.
            'topics_list': Counter(),
        }

    @staticmethod
    def _field(row, key):
        """Return ``row[key]`` as a stripped string, ``''`` when absent/None.

        ``csv.DictReader`` fills the missing trailing columns of a short row
        with ``None``, so ``row.get(key, '')`` alone is not enough.
        """
        value = row.get(key)
        return '' if value is None else str(value).strip()

    @staticmethod
    def _percent(part, whole):
        """Return ``part / whole`` as a percentage, or 0 when ``whole`` is 0."""
        return part / whole * 100 if whole > 0 else 0

    @staticmethod
    def _distribution(values):
        """Summarise a non-empty list of numbers (min, p50/p90/p99, max, mean)."""
        ordered = sorted(values)
        n = len(ordered)
        return {
            'min': ordered[0],
            'p50': ordered[n // 2],
            'p90': ordered[int(n * 0.9)],
            'p99': ordered[int(n * 0.99)],
            'max': ordered[-1],
            'mean': statistics.mean(ordered),
        }

    def _unique_count(self, key):
        """Return the finalised ``<key>_count`` or, before that, ``len`` of the set."""
        return self.stats.get(f'{key}_count', len(self.stats[key]))

    def _write_table(self, data, columns, filename):
        """Write ``data`` (list of dicts or dict of columns) as CSV in output_dir.

        Passing ``columns`` explicitly keeps the header row even when
        ``data`` is empty.
        """
        import pandas as pd  # local import: the file does not import pandas at module level
        frame = pd.DataFrame(data, columns=columns)
        frame.to_csv(self.output_dir / filename, index=False)

    def parse_date(self, date_str):
        """Parse an ISO-8601 timestamp such as ``2017-04-26T11:03:50Z``.

        Returns:
            A ``datetime``, or ``None`` when the value is empty, not a string
            or not parseable.
        """
        if not isinstance(date_str, str):
            return None
        text = date_str.strip()
        if not text:
            return None
        try:
            # datetime.fromisoformat() only accepts a trailing 'Z' from
            # Python 3.11 on, so spell the UTC offset out explicitly.
            return datetime.fromisoformat(text.replace('Z', '+00:00'))
        except ValueError:
            return None

    def safe_int(self, val):
        """Convert ``val`` to ``int``; return ``None`` when that is impossible.

        ``None`` and blank strings yield ``None``; a genuine ``0`` is kept.
        Integer-looking strings are converted exactly; other numeric strings
        (``"3.0"``, ``"1e3"``) are truncated via ``float``.
        """
        if val is None:
            return None
        if isinstance(val, str):
            val = val.strip()
            if not val:
                return None
            try:
                return int(val)
            except ValueError:
                pass  # not a plain integer literal; try the float route
        try:
            return int(float(val))
        except (TypeError, ValueError, OverflowError):
            # ValueError: non-numeric text or NaN; OverflowError: infinity.
            return None

    def safe_bool(self, val):
        """Convert ``val`` to ``bool``; return ``None`` for missing values.

        ``None`` and blank strings yield ``None``. Real booleans are returned
        unchanged. Anything else is ``True`` only if its lower-cased text is
        one of ``TRUE_TOKENS``.
        """
        if val is None:
            return None
        if isinstance(val, bool):
            return val
        text = str(val).strip().lower()
        if not text:
            return None
        return text in self.TRUE_TOKENS

    def is_empty(self, val):
        """Return True for ``None``, blank text, or the literal text ``none``."""
        if val is None:
            return True
        text = str(val).strip()
        return text == '' or text.lower() == 'none'

    def process_row(self, row):
        """Fold one CSV row (a mapping of column name -> value) into ``self.stats``."""
        stats = self.stats
        stats['total_records'] += 1

        # Basic text fields, normalised to stripped strings.
        keyword = self._field(row, 'keyword')
        full_name = self._field(row, 'full_name')
        owner = self._field(row, 'owner')
        language = self._field(row, 'language')
        license_val = self._field(row, 'license')
        topics = self._field(row, 'topics')
        description = self._field(row, 'description')
        default_branch = self._field(row, 'default_branch')

        # Unique-value tracking.
        if keyword:
            stats['unique_keywords'].add(keyword)
        if full_name:
            stats['unique_repos'].add(full_name)
            stats['repo_keyword_count'][full_name] += 1
        if owner:
            stats['unique_owners'].add(owner)

        # Per-keyword row count and distinct repositories.
        if keyword:
            keyword_entry = stats['by_keyword'][keyword]
            keyword_entry['count'] += 1
            if full_name:
                keyword_entry['repos'].add(full_name)

        # Missing-value counters.
        for field in self.MISSING_FIELDS:
            if self.is_empty(row.get(field, '')):
                stats['missing'][field] += 1

        # Categorical distributions; blanks are grouped under '<empty>'.
        stats['by_language'][language if language else '<empty>'] += 1
        stats['by_license'][license_val if license_val else '<empty>'] += 1
        if default_branch:
            stats['default_branch'][default_branch] += 1

        # Topics arrive as one comma-separated string.
        if self.is_empty(topics):
            stats['topics_stats']['missing'] += 1
        else:
            topics_list = [t.strip() for t in topics.split(',') if t.strip()]
            stats['topics_stats']['counts'].append(len(topics_list))
            stats['topics_list'].update(topics_list)

        # Description length; uses the same emptiness rule as the missing
        # counter so a "None" placeholder is not measured as 4 characters.
        if not self.is_empty(description):
            stats['desc_lengths'].append(len(description))

        # Numeric fields.
        for field in self.NUMERIC_FIELDS:
            number = self.safe_int(row.get(field))
            if number is not None:
                stats[field].append(number)

        # Boolean fields.
        for field in ('archived', 'has_wiki'):
            flag = self.safe_bool(row.get(field))
            if flag is not None:
                stats[field]['yes' if flag else 'no'] += 1

        # Creation date, bucketed by year and by year-month.
        created_at = self.parse_date(row.get('created_at'))
        if created_at:
            year = created_at.year
            stats['by_year'][year] += 1
            stats['by_year_month'][f"{year}-{created_at.month:02d}"] += 1

    def _finalize_counts(self):
        """Replace the large bookkeeping sets by their sizes where possible."""
        stats = self.stats
        stats['unique_repos_count'] = len(stats['unique_repos'])
        stats['unique_owners_count'] = len(stats['unique_owners'])
        stats['unique_keywords_count'] = len(stats['unique_keywords'])

        # Keep only the number of distinct repos per keyword; dropping the
        # sets themselves releases memory.
        for entry in stats['by_keyword'].values():
            repos = entry.pop('repos', None)
            if repos is not None:
                entry['unique_repos'] = len(repos)

    def process_csv(self):
        """Stream ``self.csv_path`` row by row through :meth:`process_row`."""
        print(f"Processing {self.csv_path}...")

        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(self.csv_path, 'r', encoding='utf-8', errors='replace',
                  newline='') as f:
            reader = csv.DictReader(f)
            for row in tqdm(reader, desc="Processing repos_searched.csv"):
                self.process_row(row)

        self._finalize_counts()

    def save_summary(self):
        """Write ``summary_overall.json`` and ``summary_overall.csv``.

        Works on an empty data set (all rates are reported as 0) and before
        :meth:`process_csv` has finalised the unique counts.
        """
        stats = self.stats
        total = stats['total_records']
        archived = stats['archived']
        has_wiki = stats['has_wiki']

        summary = {
            'total_records': total,
            'unique_repos': self._unique_count('unique_repos'),
            'unique_owners': self._unique_count('unique_owners'),
            'unique_keywords': self._unique_count('unique_keywords'),
            'missing_rates': {
                field: self._percent(stats['missing'][field], total)
                for field in self.MISSING_FIELDS
            },
            'archived_ratio': self._percent(
                archived['yes'], archived['yes'] + archived['no']),
            'has_wiki_ratio': self._percent(
                has_wiki['yes'], has_wiki['yes'] + has_wiki['no']),
        }

        # Percentile summaries for every numeric column that has data.
        for field in self.NUMERIC_FIELDS:
            if stats[field]:
                summary[field] = self._distribution(stats[field])

        counts = stats['topics_stats']['counts']
        if counts:
            summary['topics_per_repo'] = {
                'mean': statistics.mean(counts),
                'median': statistics.median(counts),
                'max': max(counts),
            }

        lengths = stats['desc_lengths']
        if lengths:
            summary['description_length'] = {
                'mean': statistics.mean(lengths),
                'median': statistics.median(lengths),
                'max': max(lengths),
            }

        with open(self.output_dir / 'summary_overall.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        # Same content as a two-column CSV; nested dicts are JSON-encoded.
        self._write_table(
            [
                {
                    'metric': k,
                    'value': v if not isinstance(v, dict) else json.dumps(v),
                }
                for k, v in summary.items()
            ],
            ['metric', 'value'],
            'summary_overall.csv',
        )

    def save_by_keyword(self):
        """Write ``by_keyword.csv`` (rows and distinct repos per keyword)."""
        rows = []
        for kw, data in self.stats['by_keyword'].items():
            if 'unique_repos' in data:
                unique_repos = data['unique_repos']
            else:
                # Not finalised yet: the set of repos is still present.
                unique_repos = len(data.get('repos', ()))
            rows.append({
                'keyword': kw,
                'count': data['count'],
                'unique_repos': unique_repos,
            })

        import pandas as pd
        # Explicit columns keep sort_values() valid when there are no rows.
        df = pd.DataFrame(rows, columns=['keyword', 'count', 'unique_repos'])
        df = df.sort_values('count', ascending=False)
        df.to_csv(self.output_dir / 'by_keyword.csv', index=False)

    def save_by_language(self):
        """Write ``by_language.csv``, most frequent language first."""
        ranked = sorted(self.stats['by_language'].items(), key=lambda x: -x[1])
        self._write_table(
            [{'language': lang, 'count': count} for lang, count in ranked],
            ['language', 'count'],
            'by_language.csv',
        )

    def save_by_license(self):
        """Write ``by_license.csv``, most frequent license first."""
        ranked = sorted(self.stats['by_license'].items(), key=lambda x: -x[1])
        self._write_table(
            [{'license': lic, 'count': count} for lic, count in ranked],
            ['license', 'count'],
            'by_license.csv',
        )

    def save_by_year(self):
        """Write ``by_year.csv`` and ``by_year_month.csv`` in chronological order."""
        self._write_table(
            [
                {'year': year, 'count': count}
                for year, count in sorted(self.stats['by_year'].items())
            ],
            ['year', 'count'],
            'by_year.csv',
        )

        # Per year-month.
        self._write_table(
            [
                {'year_month': ym, 'count': count}
                for ym, count in sorted(self.stats['by_year_month'].items())
            ],
            ['year_month', 'count'],
            'by_year_month.csv',
        )

    def save_top_repos(self):
        """Write ``stars_distribution.csv`` (all star counts, descending).

        Only star values are retained while streaming, so a real "top
        repositories" table would need a separate pass over the CSV.
        Nothing is written when no star values were collected.
        """
        if self.stats['stars']:
            self._write_table(
                {'stars': sorted(self.stats['stars'], reverse=True)},
                ['stars'],
                'stars_distribution.csv',
            )

    def save_top_topics(self):
        """Write ``top_topics.csv`` with the 100 most frequent topics."""
        # Counter() accepts both a Counter and a plain list of topics.
        topic_counter = Counter(self.stats['topics_list'])
        self._write_table(
            [
                {'topic': topic, 'count': count}
                for topic, count in topic_counter.most_common(100)
            ],
            ['topic', 'count'],
            'top_topics.csv',
        )

    def save_repo_keyword_counts(self):
        """Write ``repo_keyword_count_distribution.csv`` (one value per repo)."""
        counts = list(self.stats['repo_keyword_count'].values())
        self._write_table(
            {'keyword_count': counts},
            ['keyword_count'],
            'repo_keyword_count_distribution.csv',
        )

    def save_default_branch(self):
        """Write ``default_branch_distribution.csv``, most common branch first."""
        ranked = sorted(self.stats['default_branch'].items(), key=lambda x: -x[1])
        self._write_table(
            [{'branch': branch, 'count': count} for branch, count in ranked],
            ['branch', 'count'],
            'default_branch_distribution.csv',
        )

    def run(self):
        """Run the full pipeline: read the CSV, then write every report."""
        print("Stage A: Processing repos_searched.csv...")
        self.process_csv()
        print("Saving results...")
        self.save_summary()
        self.save_by_keyword()
        self.save_by_language()
        self.save_by_license()
        self.save_by_year()
        self.save_top_repos()
        self.save_top_topics()
        self.save_repo_keyword_counts()
        self.save_default_branch()
        print(f"Stage A complete! Results saved to {self.output_dir}")
361
+
362
+
363
if __name__ == "__main__":
    # Usage: python stage_a_stats.py [CSV_PATH [OUTPUT_DIR]]
    # Without arguments the original hard-coded locations are used.
    cli_args = sys.argv[1:]
    csv_path = (
        cli_args[0] if len(cli_args) > 0
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv"
    )
    output_dir = (
        cli_args[1] if len(cli_args) > 1
        else "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/stage_a"
    )
    stats = StageAStats(csv_path, output_dir)
    stats.run()
368
+