File size: 2,357 Bytes
6f3497d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# /home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered
# 一级文件夹数量: 37736
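# 代码文件数量 (total number of files): 3325264 — taken from the extension statistics recorded at the end of this file; the total also includes non-code files such as .md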
# /home/weifengsun/tangou1/domain_code/src/workdir/keywords_expanded.json


# 1. 统计项目文件夹数量 (count the first-level project folders)
# from pathlib import Path

# def count_subdirs_pathlib(path):
#     p = Path(path)
    
#     if not p.exists():
#         print("路径不存在")
#         return 0
    
#     # p.iterdir() 遍历目录下所有项
#     # item.is_dir() 判断是否为目录
#     # sum() adds up the 1 yielded for every entry that is a directory, giving the directory count
#     count = sum(1 for item in p.iterdir() if item.is_dir())
    
#     return count

# # 使用示例
# folder_path = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 请替换为你的路径
# print(f"一级文件夹数量: {count_subdirs_pathlib(folder_path)}")

# 2. 统计代码文件后缀名
# import os
# from collections import Counter

# def count_extensions_walk(folder_path):
#     ext_counter = Counter()
    
#     if not os.path.exists(folder_path):
#         print("路径不存在")
#         return

#     # os.walk 会自动递归遍历所有层级
#     for root, dirs, files in os.walk(folder_path):
#         for filename in files:
#             # 分离文件名和扩展名
#             # os.path.splitext('file.tar.gz') 会得到 ('file.tar', '.gz')
#             _, ext = os.path.splitext(filename)
            
#             # 转为小写
#             ext = ext.lower()
            
#             if ext == '':
#                 ext = '无后缀'
            
#             ext_counter[ext] += 1
            
#     return ext_counter

# # --- 使用示例 ---
# target_folder = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 替换为你的目标文件夹路径
# result = count_extensions_walk(target_folder)

# if result:
#     print(f"统计结果 (总文件数: {result.total()}):")  # NOTE: Counter.total() requires Python 3.10+; use sum(result.values()) on older versions
#     for ext, count in result.most_common():
#         print(f"{ext}: {count}")

# 统计结果 (总文件数: 3325264):
# .py: 1078183
# .h: 352528
# .hpp: 333821
# .cpp: 259554
# .md: 218388
# .java: 180978
# .c: 160802
# .m: 149649
# .r: 124987
# .sh: 116917
# .ipynb: 82218
# .f: 70590
# .f90: 59562
# .cc: 36611
# .rs: 23565
# .cxx: 22101
# .hh: 19051
# .jl: 18052
# .go: 13904
# .for: 1847
# .bash: 1573
# .markdown: 346
# .f95: 36
# 无后缀: 1