DouDou committed on
Upload data1/reporting/stage_a_stats.py with huggingface_hub
Browse files- data1/reporting/stage_a_stats.py +368 -0
data1/reporting/stage_a_stats.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stage A: 流式统计 repos_searched.csv (130万+行)
|
| 3 |
+
统计缺失率、by_keyword/by_language/by_license/by_year、stars/forks/open_issues/size分布
|
| 4 |
+
"""
|
| 5 |
+
import csv
|
| 6 |
+
import sys
|
| 7 |
+
from collections import defaultdict, Counter
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
import statistics
|
| 13 |
+
import re
|
| 14 |
+
|
| 15 |
+
# Raise the csv per-field size limit as high as the platform allows.
# ``sys.maxsize`` does not fit into a C long on some platforms (e.g. 64-bit
# Windows), where ``csv.field_size_limit`` raises OverflowError, so back off
# by a factor of ten until a value is accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 10
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class StageAStats:
    """Single-pass statistics collector for ``repos_searched.csv``.

    Every CSV row is folded into ``self.stats`` by :meth:`process_row`; the
    ``save_*`` methods then write the aggregates as CSV/JSON reports into
    ``output_dir``.  :meth:`run` executes the whole pipeline.
    """

    # Columns whose missing rate is reported in the overall summary.
    _MISSING_FIELDS = ('description', 'language', 'topics', 'license')

    def __init__(self, csv_path, output_dir):
        """Remember the input path and create the output directory.

        Args:
            csv_path: Path of the CSV file to read.
            output_dir: Directory that receives all report files; it is
                created (including parents) if it does not exist.
        """
        self.csv_path = csv_path
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Accumulators filled by process_row().
        self.stats = {
            'total_records': 0,
            'unique_repos': set(),
            'unique_owners': set(),
            'unique_keywords': set(),
            'by_keyword': defaultdict(lambda: {'count': 0, 'repos': set()}),
            'by_language': defaultdict(int),
            'by_license': defaultdict(int),
            'by_year': defaultdict(int),
            'by_year_month': defaultdict(int),
            'missing': defaultdict(int),
            'topics_stats': {'counts': [], 'missing': 0},
            'desc_lengths': [],
            'stars': [],
            'forks': [],
            'open_issues': [],
            'size': [],
            'archived': {'yes': 0, 'no': 0},
            'has_wiki': {'yes': 0, 'no': 0},
            'default_branch': defaultdict(int),
            # Number of rows (keyword hits) seen per repository.
            'repo_keyword_count': defaultdict(int),
            # Every individual topic seen, with repetitions.
            'topics_list': [],
        }

    # ------------------------------------------------------------------
    # Value helpers
    # ------------------------------------------------------------------

    def parse_date(self, date_str):
        """Parse an ISO-8601 timestamp such as ``2017-04-26T11:03:50Z``.

        Returns:
            A ``datetime`` on success, or ``None`` when the value is missing,
            blank, not a string, or not a valid ISO timestamp.
        """
        if not isinstance(date_str, str):
            return None
        text = date_str.strip()
        if not text:
            return None
        # ``datetime.fromisoformat`` only accepts a trailing "Z" from
        # Python 3.11 on, so spell the UTC offset out explicitly.
        if text.endswith('Z'):
            text = text[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return None

    def safe_int(self, val):
        """Convert ``val`` to ``int``, returning ``None`` when impossible.

        Float-looking text such as ``"3.0"`` is truncated to an integer.
        Missing, blank, non-numeric, NaN and infinite values yield ``None``.
        """
        if val is None:
            return None
        try:
            return int(float(val))
        except (ValueError, TypeError, OverflowError):
            # ValueError: non-numeric text or NaN; OverflowError: infinity.
            return None

    def safe_bool(self, val):
        """Convert ``val`` to ``bool``; ``None`` when the value is missing.

        ``true``/``yes``/``1``/``t`` (case-insensitive) map to ``True``;
        any other non-blank value maps to ``False``.
        """
        if val is None:
            return None
        text = str(val).strip().lower()
        if not text:
            return None
        return text in ('true', 'yes', '1', 't')

    def is_empty(self, val):
        """Return ``True`` for ``None``, blank text, or the literal ``none``."""
        if val is None:
            return True
        text = str(val).strip()
        return text == '' or text.lower() == 'none'

    @staticmethod
    def _text(row, key):
        """Return ``row[key]`` as stripped text, or ``''`` when absent.

        ``csv.DictReader`` fills columns missing from a short row with
        ``None``, so a plain ``row.get(key, '')`` is not enough.
        """
        value = row.get(key)
        return '' if value is None else str(value).strip()

    @staticmethod
    def _percentage(part, whole):
        """Return ``part / whole`` as a percentage, or ``0`` if ``whole`` is 0."""
        return part / whole * 100 if whole > 0 else 0

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------

    def process_row(self, row):
        """Fold one CSV row (a mapping of column name to text) into the stats."""
        stats = self.stats
        stats['total_records'] += 1

        # Basic text fields.
        keyword = self._text(row, 'keyword')
        full_name = self._text(row, 'full_name')
        owner = self._text(row, 'owner')
        language = self._text(row, 'language')
        license_val = self._text(row, 'license')
        topics = self._text(row, 'topics')
        description = self._text(row, 'description')
        default_branch = self._text(row, 'default_branch')

        # Unique values.
        if keyword:
            stats['unique_keywords'].add(keyword)
        if full_name:
            stats['unique_repos'].add(full_name)
            stats['repo_keyword_count'][full_name] += 1
        if owner:
            stats['unique_owners'].add(owner)

        # Per-keyword counters.
        if keyword:
            keyword_entry = stats['by_keyword'][keyword]
            keyword_entry['count'] += 1
            if full_name:
                keyword_entry['repos'].add(full_name)

        # Missing-value counters.
        for field in self._MISSING_FIELDS:
            if self.is_empty(row.get(field, '')):
                stats['missing'][field] += 1

        stats['by_language'][language if language else '<empty>'] += 1
        stats['by_license'][license_val if license_val else '<empty>'] += 1

        if default_branch:
            stats['default_branch'][default_branch] += 1

        # Topics are stored as a comma-separated list.
        if self.is_empty(topics):
            stats['topics_stats']['missing'] += 1
        else:
            topics_list = [t.strip() for t in topics.split(',') if t.strip()]
            stats['topics_stats']['counts'].append(len(topics_list))
            stats['topics_list'].extend(topics_list)

        if description:
            stats['desc_lengths'].append(len(description))

        # Numeric fields.
        for field in ('stars', 'forks', 'open_issues', 'size'):
            number = self.safe_int(row.get(field))
            if number is not None:
                stats[field].append(number)

        # Boolean fields.
        for field in ('archived', 'has_wiki'):
            flag = self.safe_bool(row.get(field))
            if flag is not None:
                stats[field]['yes' if flag else 'no'] += 1

        # Creation time, bucketed by year and by year-month.
        created_at = self.parse_date(row.get('created_at'))
        if created_at:
            year = created_at.year
            stats['by_year'][year] += 1
            stats['by_year_month'][f"{year}-{created_at.month:02d}"] += 1

    def _finalize_counts(self):
        """Derive the ``*_count`` entries and collapse per-keyword repo sets."""
        stats = self.stats
        stats['unique_repos_count'] = len(stats['unique_repos'])
        stats['unique_owners_count'] = len(stats['unique_owners'])
        stats['unique_keywords_count'] = len(stats['unique_keywords'])

        # Replace each keyword's repo set by its size to release memory.
        for entry in stats['by_keyword'].values():
            repos = entry.pop('repos', None)
            if repos is not None:
                entry['unique_repos'] = len(repos)

    def process_csv(self):
        """Stream the CSV file row by row through :meth:`process_row`."""
        print(f"Processing {self.csv_path}...")

        # newline='' lets the csv module handle line breaks embedded in
        # quoted fields itself, as the csv documentation requires.
        with open(self.csv_path, 'r', encoding='utf-8', errors='replace',
                  newline='') as f:
            reader = csv.DictReader(f)
            for row in tqdm(reader, desc="Processing repos_searched.csv"):
                self.process_row(row)

        self._finalize_counts()

    # ------------------------------------------------------------------
    # Reports
    # ------------------------------------------------------------------

    def _write_csv(self, data, filename, columns):
        """Write ``data`` to ``output_dir / filename`` without an index column.

        ``columns`` is passed explicitly so that an empty report still gets
        its header row.
        """
        import pandas as pd
        frame = pd.DataFrame(data, columns=columns)
        frame.to_csv(self.output_dir / filename, index=False)

    def _build_summary(self):
        """Assemble the overall summary as a JSON-serialisable dict."""
        stats = self.stats
        total = stats['total_records']
        archived = stats['archived']
        has_wiki = stats['has_wiki']

        summary = {
            'total_records': total,
            # Computed from the sets so the summary is also correct when
            # rows were fed through process_row() directly.
            'unique_repos': len(stats['unique_repos']),
            'unique_owners': len(stats['unique_owners']),
            'unique_keywords': len(stats['unique_keywords']),
            'missing_rates': {
                field: self._percentage(stats['missing'][field], total)
                for field in self._MISSING_FIELDS
            },
            'archived_ratio': self._percentage(
                archived['yes'], archived['yes'] + archived['no']),
            'has_wiki_ratio': self._percentage(
                has_wiki['yes'], has_wiki['yes'] + has_wiki['no']),
        }

        # Star-count percentiles, picked by index from the sorted values.
        if stats['stars']:
            sorted_stars = sorted(stats['stars'])
            n = len(sorted_stars)
            summary['stars'] = {
                'min': sorted_stars[0],
                'p50': sorted_stars[n // 2],
                'p90': sorted_stars[int(n * 0.9)],
                'p99': sorted_stars[int(n * 0.99)],
                'max': sorted_stars[-1],
                'mean': statistics.mean(sorted_stars),
            }

        counts = stats['topics_stats']['counts']
        if counts:
            summary['topics_per_repo'] = {
                'mean': statistics.mean(counts),
                'median': statistics.median(counts),
                'max': max(counts),
            }

        lengths = stats['desc_lengths']
        if lengths:
            summary['description_length'] = {
                'mean': statistics.mean(lengths),
                'median': statistics.median(lengths),
                'max': max(lengths),
            }

        return summary

    def save_summary(self):
        """Write ``summary_overall.json`` and ``summary_overall.csv``."""
        summary = self._build_summary()

        with open(self.output_dir / 'summary_overall.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        # CSV variant: one row per metric; nested dicts are JSON-encoded.
        rows = [
            {'metric': k, 'value': v if not isinstance(v, dict) else json.dumps(v)}
            for k, v in summary.items()
        ]
        self._write_csv(rows, 'summary_overall.csv', ['metric', 'value'])

    def save_by_keyword(self):
        """Write ``by_keyword.csv``, sorted by descending row count."""
        rows = []
        for kw, data in self.stats['by_keyword'].items():
            # 'unique_repos' exists once the counts were finalized; fall
            # back to the live repo set otherwise.
            unique_repos = data.get('unique_repos', len(data.get('repos', ())))
            rows.append({
                'keyword': kw,
                'count': data['count'],
                'unique_repos': unique_repos,
            })
        rows.sort(key=lambda r: -r['count'])
        self._write_csv(rows, 'by_keyword.csv', ['keyword', 'count', 'unique_repos'])

    def save_by_language(self):
        """Write ``by_language.csv``, sorted by descending count."""
        rows = [
            {'language': lang, 'count': count}
            for lang, count in sorted(self.stats['by_language'].items(), key=lambda x: -x[1])
        ]
        self._write_csv(rows, 'by_language.csv', ['language', 'count'])

    def save_by_license(self):
        """Write ``by_license.csv``, sorted by descending count."""
        rows = [
            {'license': lic, 'count': count}
            for lic, count in sorted(self.stats['by_license'].items(), key=lambda x: -x[1])
        ]
        self._write_csv(rows, 'by_license.csv', ['license', 'count'])

    def save_by_year(self):
        """Write ``by_year.csv`` and ``by_year_month.csv`` in chronological order."""
        year_rows = [
            {'year': year, 'count': count}
            for year, count in sorted(self.stats['by_year'].items())
        ]
        self._write_csv(year_rows, 'by_year.csv', ['year', 'count'])

        month_rows = [
            {'year_month': ym, 'count': count}
            for ym, count in sorted(self.stats['by_year_month'].items())
        ]
        self._write_csv(month_rows, 'by_year_month.csv', ['year_month', 'count'])

    def save_top_repos(self):
        """Write ``stars_distribution.csv`` (star counts, descending).

        Rows are streamed, so the repositories themselves are not kept;
        a real "top repositories" list would need a second scan.  Nothing
        is written when no star value was collected.
        """
        if self.stats['stars']:
            self._write_csv(
                {'stars': sorted(self.stats['stars'], reverse=True)},
                'stars_distribution.csv',
                ['stars'],
            )

    def save_top_topics(self):
        """Write ``top_topics.csv`` with the 100 most frequent topics."""
        topic_counter = Counter(self.stats['topics_list'])
        rows = [
            {'topic': topic, 'count': count}
            for topic, count in topic_counter.most_common(100)
        ]
        self._write_csv(rows, 'top_topics.csv', ['topic', 'count'])

    def save_repo_keyword_counts(self):
        """Write the per-repository keyword-hit counts, one value per row."""
        counts = list(self.stats['repo_keyword_count'].values())
        self._write_csv(
            {'keyword_count': counts},
            'repo_keyword_count_distribution.csv',
            ['keyword_count'],
        )

    def save_default_branch(self):
        """Write ``default_branch_distribution.csv``, sorted by descending count."""
        rows = [
            {'branch': branch, 'count': count}
            for branch, count in sorted(self.stats['default_branch'].items(), key=lambda x: -x[1])
        ]
        self._write_csv(rows, 'default_branch_distribution.csv', ['branch', 'count'])

    def run(self):
        """Process the CSV file and write every report."""
        print("Stage A: Processing repos_searched.csv...")
        self.process_csv()
        print("Saving results...")
        self.save_summary()
        self.save_by_keyword()
        self.save_by_language()
        self.save_by_license()
        self.save_by_year()
        self.save_top_repos()
        self.save_top_topics()
        self.save_repo_keyword_counts()
        self.save_default_branch()
        print(f"Stage A complete! Results saved to {self.output_dir}")
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
if __name__ == "__main__":
    # Usage: stage_a_stats.py [CSV_PATH [OUTPUT_DIR]]
    # Both arguments are optional; the defaults are the original
    # hard-coded locations, so running without arguments is unchanged.
    cli_args = sys.argv[1:]
    csv_path = (
        cli_args[0] if len(cli_args) > 0
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv"
    )
    output_dir = (
        cli_args[1] if len(cli_args) > 1
        else "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/stage_a"
    )
    stats = StageAStats(csv_path, output_dir)
    stats.run()
|
| 368 |
+
|