DouDou committed on
Commit
e10e693
·
verified ·
1 Parent(s): 284b084

Upload data1/compute_statistics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/compute_statistics.py +168 -0
data1/compute_statistics.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import glob
3
+ import sys
4
+ from collections import defaultdict
5
+ from statistics import mean, variance
6
+
7
# Languages whose records are included in the per-field statistics.
TARGET_LANGS = {
    "python", "java", "c/c++", "fortran", "r",
    "matlab", "shell", "rust", "go",
}
18
+
19
+
20
+ # ------------------------
21
+ # 读取全部 JSONL 数据
22
+ # ------------------------
23
def load_jsonl_data(pattern="*.jsonl"):
    """Read every JSONL file matching *pattern* and aggregate its records.

    Returns a tuple of three collections:
      - language_count: number of records seen per "language" value
        ("unknown" when a record has no "language" key)
      - field_data: every numeric (int/float) value observed, keyed by field
      - field_data_by_lang: the same numeric values, grouped by language
        and then by field
    """
    language_count = defaultdict(int)
    field_data = defaultdict(list)
    field_data_by_lang = defaultdict(lambda: defaultdict(list))

    for path in glob.glob(pattern):
        with open(path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                raw_line = raw_line.strip()
                if not raw_line:
                    continue  # skip blank lines

                record = json.loads(raw_line)
                language = record.get("language", "unknown")
                language_count[language] += 1

                # Collect every numeric field, both globally and per language.
                for key, value in record.items():
                    if isinstance(value, (int, float)):
                        field_data[key].append(value)
                        field_data_by_lang[language][key].append(value)

    return language_count, field_data, field_data_by_lang
50
+
51
+
52
+ # ------------------------
53
+ # 打印语言分布
54
+ # ------------------------
55
def print_language_distribution(language_count):
    """Print each language's record count and percentage, largest first.

    Args:
        language_count: mapping of language name -> number of records.
    """
    print("\n========== 语言分布(language counts & percentage) ==========")
    total_items = sum(language_count.values())
    # Guard against an empty mapping, which would otherwise divide by zero.
    if total_items == 0:
        print("(no data)")
        return

    for lang, count in sorted(language_count.items(), key=lambda x: -x[1]):
        pct = count / total_items * 100
        print(f"{lang}: {count} ({pct:.2f}%)")
62
+
63
+
64
+ # ------------------------
65
+ # 区间统计(通用)
66
+ # ------------------------
67
def compute_bins(start, end, step):
    """Build bin edges from start to end (inclusive, step apart) plus labels.

    The last label, "<end>+", is the overflow bucket for values not covered
    by any [edge, next_edge) interval.
    """
    edges = list(range(start, end + step, step))
    labels = [f"{lo}-{hi}" for lo, hi in zip(edges, edges[1:])]
    labels.append(f"{end}+")
    return edges, labels
72
+
73
+
74
def compute_distribution(values, bins, labels):
    """Count how many values fall into each [bins[i], bins[i+1]) interval.

    Any value not matched by an interval — including values below bins[0] —
    is counted under the final (overflow) label.
    """
    dist = dict.fromkeys(labels, 0)
    for value in values:
        for idx, (lo, hi) in enumerate(zip(bins, bins[1:])):
            if lo <= value < hi:
                dist[labels[idx]] += 1
                break
        else:
            # No interval matched: attribute the value to the last bucket.
            dist[labels[-1]] += 1
    return dist
87
+
88
+
89
def print_distribution(title, dist, total_count):
    """Print a heading, then each bin's count and share of the total.

    Args:
        title: heading line printed before the bins.
        dist: mapping of bin label -> count.
        total_count: denominator used for the percentage column.
    """
    print(f"{title}")
    # Guard against a zero denominator (no samples at all).
    if total_count == 0:
        for label, count in dist.items():
            print(f" {label}: {count} (0.00%)")
        return
    for label, count in dist.items():
        pct = count / total_count * 100
        print(f" {label}: {count} ({pct:.2f}%)")
94
+
95
+
96
+ # ------------------------
97
+ # 统计某字段的分布
98
+ # ------------------------
99
def analyze_field_distribution(jsonl_dir, field, start, end, step):
    """Load JSONL records and report statistics for one numeric field.

    Prints the language distribution, then the overall count/min/max/mean/
    variance and a binned distribution of *field* (restricted to languages
    in TARGET_LANGS), and finally repeats those statistics per language.

    Args:
        jsonl_dir: glob pattern of the JSONL files to read.
        field: name of the numeric field to analyze.
        start, end, step: binning parameters forwarded to compute_bins.
    """
    print(f"\n================= 分析字段:{field} =================")

    # Load all matching files once; everything below reuses these aggregates.
    language_count, field_data, field_data_by_lang = load_jsonl_data(jsonl_dir)

    # Overall language distribution.
    print_language_distribution(language_count)

    # Bail out if the field never appeared with a numeric value.
    if field not in field_data:
        print(f"\n字段 '{field}' 在数据中不存在!")
        return

    # Overall statistics cover only the target languages.
    values = []
    for lang in TARGET_LANGS:
        values.extend(field_data_by_lang.get(lang, {}).get(field, []))

    print(f"\n========== {field} 整体统计 ==========")
    print(f"个数: {len(values)}")
    if not values:
        # The field exists only for languages outside TARGET_LANGS;
        # min()/max()/mean() would raise on an empty sequence.
        print("(no data for target languages)")
        return
    print(f"最小值: {min(values)}")
    print(f"最大值: {max(values)}")
    print(f"均值: {mean(values):.2f}")
    # variance() requires at least two samples.
    if len(values) >= 2:
        print(f"方差: {variance(values):.2f}")
    else:
        print("方差: N/A")

    # Bin edges/labels are shared by the overall and per-language reports.
    bins, labels = compute_bins(start, end, step)

    # Overall binned distribution.
    overall_dist = compute_distribution(values, bins, labels)
    print_distribution("区间分布", overall_dist, len(values))

    # -------- per-language statistics --------
    print(f"\n========== 按语言统计 {field} ==========")

    for lang in TARGET_LANGS:
        fields = field_data_by_lang.get(lang)
        if not fields or field not in fields:
            continue

        vals = fields[field]

        print(f"\n--- {lang} ---")
        print(f"数量: {len(vals)}")
        print(f"最小值: {min(vals)}")
        print(f"最大值: {max(vals)}")
        print(f"均值: {mean(vals):.2f}")
        if len(vals) >= 2:
            print(f"方差: {variance(vals):.2f}")
        else:
            print("方差: N/A")

        # Per-language binned distribution.
        dist = compute_distribution(vals, bins, labels)
        print_distribution("区间分布:", dist, len(vals))
157
+
158
+
159
+ # ------------------------
160
# ------------------------
# Script entry point
# ------------------------
if __name__ == "__main__":
    # Glob pattern of the JSONL files to analyze.
    jsonl_dir = "/home/weifengsun/tangou1/domain_code/src/datasets/analysis2/*.jsonl"
    # Other numeric fields available: total_lines, comment_lines,
    # comment_tokenst, empty_lines, code_lines, tokens, functions, parameters
    analyze_field_distribution(jsonl_dir, field="comment_lines", start=0, end=200, step=20)