DouDou committed on
Commit
284b084
·
verified ·
1 Parent(s): 880e02b

Upload data1/compute_stars_keywords.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/compute_stars_keywords.py +103 -0
data1/compute_stars_keywords.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import sys
3
+ from collections import defaultdict
4
+ from statistics import mean, variance
5
+
6
+
7
+ # ------------------------
8
+ # 读取 CSV
9
+ # ------------------------
10
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns from a CSV file.

    Args:
        path: Path of a UTF-8 CSV file with a header row. Undecodable bytes
            are replaced rather than raising.

    Returns:
        A ``(keywords, stars)`` tuple of two lists. ``keywords`` holds the raw
        ``keyword`` cell of every row (``None`` when a row is too short to
        contain it). ``stars`` holds the ``stars`` cell of every row that
        parses as an integer; other rows are skipped, so the two lists are
        not guaranteed to have the same length.
    """
    keywords = []
    stars = []

    # newline="" hands newline handling to the csv module, so line breaks
    # embedded in quoted fields are kept exactly as written.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            if "keyword" in row:
                keywords.append(row["keyword"])

            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (TypeError, ValueError):
                    # ValueError: non-numeric or empty text.
                    # TypeError: cell missing from a short row (None).
                    # Either way the row has no usable star count; skip it.
                    pass

    return keywords, stars
27
+
28
+
29
+ # ------------------------
30
+ # 区间构造与统计
31
+ # ------------------------
32
def make_bins(start, end, step):
    """Build histogram bin edges and their display labels.

    The edges run from ``start`` in increments of ``step`` and always finish
    exactly at ``end``; when ``end - start`` is not a multiple of ``step`` the
    last bin is simply narrower. This keeps the final ``"{end}+"`` overflow
    label consistent with the last edge.

    Args:
        start: Lower edge of the first bin (integer).
        end: Upper edge of the last bounded bin (integer, ``>= start``).
        step: Width of each bin (positive integer).

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is the ascending list of edges.
        ``labels`` has one ``"lo-hi"`` entry per consecutive pair of edges,
        followed by a final ``"{end}+"`` overflow label.

    Raises:
        ValueError: If ``step`` is not positive or ``start`` exceeds ``end``.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")
    if start > end:
        raise ValueError(f"start ({start!r}) must not exceed end ({end!r})")

    # Stop short of `end`, then append it, so the last edge never overshoots.
    bins = list(range(start, end, step))
    bins.append(end)

    labels = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]
    labels.append(f"{end}+")
    return bins, labels
37
+
38
+
39
def distribute(values, bins, labels):
    """Count how many values fall into each labelled bin.

    Args:
        values: Iterable of numbers to classify.
        bins: Sequence of bin edges; consecutive pairs form half-open
            intervals ``[bins[i], bins[i + 1])``.
        labels: Sequence of labels where ``labels[i]`` names the i-th
            interval and ``labels[-1]`` names the catch-all bucket.

    Returns:
        A dict mapping every label to its count. Any value that matches no
        interval is counted under ``labels[-1]``; this includes values below
        the first edge as well as values at or above the last edge.
    """
    counts = dict.fromkeys(labels, 0)
    edge_pairs = list(zip(bins, bins[1:]))

    for value in values:
        for idx, (low, high) in enumerate(edge_pairs):
            if low <= value < high:
                counts[labels[idx]] += 1
                break
        else:
            # No interval matched: fall through to the catch-all bucket.
            counts[labels[-1]] += 1

    return counts
53
+
54
+
55
+ # ------------------------
56
+ # 主统计函数
57
+ # ------------------------
58
def analyze_csv(path, start, end, step):
    """Print keyword frequencies and star statistics for one CSV file.

    The report is written to standard output and has two parts: a table of
    keyword counts (most frequent first, with percentages), then summary
    statistics of the star values followed by their distribution over the
    bins produced by ``make_bins(start, end, step)``.

    Args:
        path: CSV file to analyse; read via ``load_csv``.
        start: Lower edge of the first star bin.
        end: Upper edge of the last bounded star bin.
        step: Width of each star bin.

    Returns:
        None.
    """
    print(f"\n========== 分析 CSV 文件:{path} ==========")

    keywords, stars = load_csv(path)

    # --- keyword frequency table ---
    print("\n========== keyword 分布 ==========")
    tally = defaultdict(int)
    for word in keywords:
        tally[word] += 1

    n_keywords = len(keywords)
    # Stable sort: keywords with equal counts keep first-seen order.
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    for word, count in ranked:
        share = count / n_keywords * 100
        print(f"{word}: {count} ({share:.2f}%)")

    # --- star statistics ---
    print("\n========== stars 统计 ==========")
    if not stars:
        print("没有 stars 字段或无有效数据")
        return

    n_stars = len(stars)
    print(f"个数: {n_stars}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    # Sample variance needs at least two data points.
    if n_stars >= 2:
        print(f"方差: {variance(stars):.2f}")

    # --- distribution over bins ---
    bins, labels = make_bins(start, end, step)
    histogram = distribute(stars, bins, labels)

    print("区间分布")
    for label in labels:
        hits = histogram[label]
        share = hits / n_stars * 100
        print(f"{label}: {hits} ({share:.2f}%)")
96
+
97
+
98
if __name__ == "__main__":
    # Usage: python compute_stars_keywords.py [CSV_PATH]
    # The original hard-coded location is used when no path is given on the
    # command line.
    default_path = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Star histogram: bins of width 20 covering 0..200, plus a "200+" bucket.
    start = 0
    end = 200
    step = 20
    analyze_csv(path, start, end, step)