SciCode
/

dataset-builder

Model card Files Files and versions

xet

Community

DouDou committed on Feb 19

Commit

284b084

verified ·

1 Parent(s): 880e02b

Upload data1/compute_stars_keywords.py with huggingface_hub

Browse files

Files changed (1) hide show

data1/compute_stars_keywords.py +103 -0

data1/compute_stars_keywords.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import csv
+import sys
+from collections import defaultdict
+from statistics import mean, variance
# ------------------------
# Read CSV
# ------------------------
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns of a CSV file.

    The first row of the file is used as the header. Bytes that are not
    valid UTF-8 are replaced rather than raising a decode error.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(keywords, stars)`` tuple of two lists. ``keywords`` holds the
        raw ``keyword`` cell of every row (when the column exists);
        ``stars`` holds the ``stars`` cell of every row where it parses as
        an integer. The lists are filled independently, so they are not
        guaranteed to have the same length or to line up row by row.
    """
    keywords = []
    stars = []
    # newline="" lets the csv module do its own line-ending handling, which
    # is required for quoted fields that contain embedded newlines.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if "keyword" in row:
                keywords.append(row["keyword"])
            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (TypeError, ValueError):
                    # ValueError: blank or non-numeric cell.
                    # TypeError: short row, where DictReader pads the
                    # missing cell with None.
                    pass
    return keywords, stars
# ------------------------
# Bin construction and counting
# ------------------------
def make_bins(start, end, step):
    """Build histogram bin edges and their display labels.

    Edges run from ``start`` in increments of ``step`` and always finish
    exactly at ``end``; when ``end - start`` is not a multiple of ``step``
    the last bounded bin is simply narrower. This keeps the bounded bins
    from overlapping the final open-ended ``"{end}+"`` bin.

    Args:
        start: Lower edge of the first bin (integer).
        end: Upper edge of the last bounded bin (integer).
        step: Bin width; must be a positive integer.

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is the ascending list of edges.
        ``labels`` has one ``"lo-hi"`` entry per pair of adjacent edges,
        followed by a final ``"{end}+"`` entry for everything at or above
        ``end``. If ``start >= end`` there are no bounded bins and
        ``labels`` contains only the open-ended entry.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")
    # Stop the regular edges short of `end`, then close with `end` itself so
    # the last edge never overshoots the requested upper bound.
    bins = list(range(start, end, step))
    bins.append(end)
    labels = [f"{lo}-{hi}" for lo, hi in zip(bins, bins[1:])]
    labels.append(f"{end}+")
    return bins, labels
def distribute(values, bins, labels):
    """Count how many values fall into each labelled bin.

    A value ``v`` is counted under ``labels[i]`` for the first ``i`` with
    ``bins[i] <= v < bins[i + 1]``. A value that matches no bounded bin is
    counted under the last label (the open-ended overflow bin), unless it is
    smaller than ``bins[0]``: such values are reported separately under an
    extra ``"<{bins[0]}"`` key instead of being mislabelled as overflow.

    Args:
        values: Iterable of numbers to classify.
        bins: Bin edges, expected in ascending order.
        labels: One label per pair of adjacent edges plus a final label for
            the overflow bin.

    Returns:
        A dict mapping every label to its count (zero included), in the
        order of ``labels``. The ``"<{bins[0]}"`` key is present, and comes
        first, only when at least one value lies below the first edge.
    """
    dist = {label: 0 for label in labels}
    below_first_edge = 0
    for v in values:
        for lo, hi, label in zip(bins, bins[1:], labels):
            if lo <= v < hi:
                dist[label] += 1
                break
        else:
            # No bounded bin matched: the value is either under the first
            # edge or at/above the last one.
            if bins and v < bins[0]:
                below_first_edge += 1
            else:
                dist[labels[-1]] += 1

    if not below_first_edge:
        return dist

    # Put the underflow bucket first so the result reads low-to-high; merge
    # rather than overwrite in case a caller label happens to match the key.
    result = {f"<{bins[0]}": below_first_edge}
    for label, count in dist.items():
        result[label] = result.get(label, 0) + count
    return result
# ------------------------
# Main analysis routine
# ------------------------
def analyze_csv(path, start, end, step):
    """Print keyword frequencies and star statistics for one CSV file.

    The report is written to stdout in three parts: a header naming the
    file, the count and percentage of each distinct keyword (most frequent
    first), and summary statistics plus a binned distribution of the star
    counts. Each of the keyword and stars sections prints a short notice
    instead when the file yields no data for it.

    Args:
        path: Path of the CSV file, passed to ``load_csv``.
        start: Lower edge of the first star bin.
        end: Upper edge of the last bounded star bin.
        step: Width of each star bin.

    Returns:
        None.
    """
    print(f"\n========== 分析 CSV 文件：{path} ==========")
    keywords, stars = load_csv(path)

    print("\n========== keyword 分布 ==========")
    if not keywords:
        # Mirror the stars section so an absent column is reported rather
        # than leaving an empty section.
        print("没有 keyword 字段或无有效数据")
    else:
        keyword_count = defaultdict(int)
        for kw in keywords:
            keyword_count[kw] += 1
        total_keywords = len(keywords)
        # Stable sort: keywords with equal counts keep first-seen order.
        for kw, cnt in sorted(keyword_count.items(), key=lambda x: -x[1]):
            pct = cnt / total_keywords * 100
            print(f"{kw}: {cnt} ({pct:.2f}%)")

    print("\n========== stars 统计 ==========")
    if not stars:
        print("没有 stars 字段或无有效数据")
        return

    total_stars = len(stars)
    print(f"个数: {total_stars}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    # statistics.variance is the sample variance and needs two data points.
    if total_stars >= 2:
        print(f"方差: {variance(stars):.2f}")

    # Binned distribution. Iterate over the returned mapping so that every
    # bucket produced by distribute() is shown, in its own order.
    bins, labels = make_bins(start, end, step)
    dist = distribute(stars, bins, labels)
    print("区间分布")
    for lab, cnt in dist.items():
        pct = cnt / total_stars * 100
        print(f"{lab}: {cnt} ({pct:.2f}%)")
if __name__ == "__main__":
    # Usage: compute_stars_keywords.py [CSV_PATH [START END STEP]]
    #
    # With no arguments the script analyses the default CSV with bins
    # 0..200 in steps of 20. A path alone overrides only the input file;
    # a path followed by three integers also overrides the bin settings.
    cli_args = sys.argv[1:]
    if len(cli_args) not in (0, 1, 4):
        sys.exit(f"usage: {sys.argv[0]} [CSV_PATH [START END STEP]]")

    path = (
        cli_args[0]
        if cli_args
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    )
    start, end, step = map(int, cli_args[1:]) if len(cli_args) == 4 else (0, 200, 20)
    analyze_csv(path, start, end, step)