DouDou committed on
Upload data2/step22/folder_stat.py with huggingface_hub
Browse files- data2/step22/folder_stat.py +91 -0
data2/step22/folder_stat.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered
|
| 2 |
+
# 一级文件夹数量: 37736
|
| 3 |
+
# 代码文件数量:
|
| 4 |
+
# /home/weifengsun/tangou1/domain_code/src/workdir/keywords_expanded.json
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# 1. 统计项目文件夹数量
|
| 8 |
+
# from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# def count_subdirs_pathlib(path):
|
| 11 |
+
# p = Path(path)
|
| 12 |
+
|
| 13 |
+
# if not p.exists():
|
| 14 |
+
# print("路径不存在")
|
| 15 |
+
# return 0
|
| 16 |
+
|
| 17 |
+
# # p.iterdir() 遍历目录下所有项
|
| 18 |
+
# # item.is_dir() 判断是否为目录
|
| 19 |
+
# # sum() 计算生成器中 True 的数量
|
| 20 |
+
# count = sum(1 for item in p.iterdir() if item.is_dir())
|
| 21 |
+
|
| 22 |
+
# return count
|
| 23 |
+
|
| 24 |
+
# # 使用示例
|
| 25 |
+
# folder_path = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 请替换为你的路径
|
| 26 |
+
# print(f"一级文件夹数量: {count_subdirs_pathlib(folder_path)}")
|
| 27 |
+
|
| 28 |
+
# 2. 统计代码文件后缀名
|
| 29 |
+
# import os
|
| 30 |
+
# from collections import Counter
|
| 31 |
+
|
| 32 |
+
# def count_extensions_walk(folder_path):
|
| 33 |
+
# ext_counter = Counter()
|
| 34 |
+
|
| 35 |
+
# if not os.path.exists(folder_path):
|
| 36 |
+
# print("路径不存在")
|
| 37 |
+
# return
|
| 38 |
+
|
| 39 |
+
# # os.walk 会自动递归遍历所有层级
|
| 40 |
+
# for root, dirs, files in os.walk(folder_path):
|
| 41 |
+
# for filename in files:
|
| 42 |
+
# # 分离文件名和扩展名
|
| 43 |
+
# # os.path.splitext('file.tar.gz') 会得到 ('file.tar', '.gz')
|
| 44 |
+
# _, ext = os.path.splitext(filename)
|
| 45 |
+
|
| 46 |
+
# # 转为小写
|
| 47 |
+
# ext = ext.lower()
|
| 48 |
+
|
| 49 |
+
# if ext == '':
|
| 50 |
+
# ext = '无后缀'
|
| 51 |
+
|
| 52 |
+
# ext_counter[ext] += 1
|
| 53 |
+
|
| 54 |
+
# return ext_counter
|
| 55 |
+
|
| 56 |
+
# # --- 使用示例 ---
|
| 57 |
+
# target_folder = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 替换为你的目标文件夹路径
|
| 58 |
+
# result = count_extensions_walk(target_folder)
|
| 59 |
+
|
| 60 |
+
# if result:
|
| 61 |
+
# print(f"统计结果 (总文件数: {result.total()}):")
|
| 62 |
+
# for ext, count in result.most_common():
|
| 63 |
+
# print(f"{ext}: {count}")
|
| 64 |
+
|
| 65 |
+
# 统计结果 (总文件数: 3325264):
|
| 66 |
+
# .py: 1078183
|
| 67 |
+
# .h: 352528
|
| 68 |
+
# .hpp: 333821
|
| 69 |
+
# .cpp: 259554
|
| 70 |
+
# .md: 218388
|
| 71 |
+
# .java: 180978
|
| 72 |
+
# .c: 160802
|
| 73 |
+
# .m: 149649
|
| 74 |
+
# .r: 124987
|
| 75 |
+
# .sh: 116917
|
| 76 |
+
# .ipynb: 82218
|
| 77 |
+
# .f: 70590
|
| 78 |
+
# .f90: 59562
|
| 79 |
+
# .cc: 36611
|
| 80 |
+
# .rs: 23565
|
| 81 |
+
# .cxx: 22101
|
| 82 |
+
# .hh: 19051
|
| 83 |
+
# .jl: 18052
|
| 84 |
+
# .go: 13904
|
| 85 |
+
# .for: 1847
|
| 86 |
+
# .bash: 1573
|
| 87 |
+
# .markdown: 346
|
| 88 |
+
# .f95: 36
|
| 89 |
+
# 无后缀: 1
|
| 90 |
+
|
| 91 |
+
|