DouDou committed on
Commit
6f3497d
·
verified ·
1 Parent(s): 127f2b0

Upload data2/step22/folder_stat.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/folder_stat.py +91 -0
data2/step22/folder_stat.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered
2
+ # 一级文件夹数量: 37736
3
+ # 代码文件数量: 3325264 (总文件数, 见下方统计结果)
4
+ # /home/weifengsun/tangou1/domain_code/src/workdir/keywords_expanded.json
5
+
6
+
7
+ # 1. 统计项目文件夹数量
8
+ # from pathlib import Path
9
+
10
+ # def count_subdirs_pathlib(path):
11
+ # p = Path(path)
12
+
13
+ # if not p.exists():
14
+ # print("路径不存在")
15
+ # return 0
16
+
17
+ # # p.iterdir() 遍历目录下所有项
18
+ # # item.is_dir() 判断是否为目录
19
+ # # sum() 计算生成器中 True 的数量
20
+ # count = sum(1 for item in p.iterdir() if item.is_dir())
21
+
22
+ # return count
23
+
24
+ # # 使用示例
25
+ # folder_path = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 请替换为你的路径
26
+ # print(f"一级文件夹数量: {count_subdirs_pathlib(folder_path)}")
27
+
28
+ # 2. 统计代码文件后缀名
29
+ # import os
30
+ # from collections import Counter
31
+
32
+ # def count_extensions_walk(folder_path):
33
+ # ext_counter = Counter()
34
+
35
+ # if not os.path.exists(folder_path):
36
+ # print("路径不存在")
37
+ # return
38
+
39
+ # # os.walk 会自动递归遍历所有层级
40
+ # for root, dirs, files in os.walk(folder_path):
41
+ # for filename in files:
42
+ # # 分离文件名和扩展名
43
+ # # os.path.splitext('file.tar.gz') 会得到 ('file.tar', '.gz')
44
+ # _, ext = os.path.splitext(filename)
45
+
46
+ # # 转为小写
47
+ # ext = ext.lower()
48
+
49
+ # if ext == '':
50
+ # ext = '无后缀'
51
+
52
+ # ext_counter[ext] += 1
53
+
54
+ # return ext_counter
55
+
56
+ # # --- 使用示例 ---
57
+ # target_folder = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 替换为你的目标文件夹路径
58
+ # result = count_extensions_walk(target_folder)
59
+
60
+ # if result:
61
+ # print(f"统计结果 (总文件数: {result.total()}):")
62
+ # for ext, count in result.most_common():
63
+ # print(f"{ext}: {count}")
64
+
65
+ # 统计结果 (总文件数: 3325264):
66
+ # .py: 1078183
67
+ # .h: 352528
68
+ # .hpp: 333821
69
+ # .cpp: 259554
70
+ # .md: 218388
71
+ # .java: 180978
72
+ # .c: 160802
73
+ # .m: 149649
74
+ # .r: 124987
75
+ # .sh: 116917
76
+ # .ipynb: 82218
77
+ # .f: 70590
78
+ # .f90: 59562
79
+ # .cc: 36611
80
+ # .rs: 23565
81
+ # .cxx: 22101
82
+ # .hh: 19051
83
+ # .jl: 18052
84
+ # .go: 13904
85
+ # .for: 1847
86
+ # .bash: 1573
87
+ # .markdown: 346
88
+ # .f95: 36
89
+ # 无后缀: 1
90
+
91
+