# dataset-builder/data2/step22/folder_stat.py
# SunDou's picture
# Upload data2/step22/folder_stat.py with huggingface_hub
# 6f3497d verified
# /home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered
# 一级文件夹数量: 37736
# 代码文件数量: 3325264 (取自下方统计结果中的总文件数)
# /home/weifengsun/tangou1/domain_code/src/workdir/keywords_expanded.json
# 1. 统计项目文件夹数量
# from pathlib import Path
# def count_subdirs_pathlib(path):
#     p = Path(path)
#     if not p.exists():
#         print("路径不存在")
#         return 0
#     # p.iterdir() 遍历目录下所有项
#     # item.is_dir() 判断是否为目录
#     # sum() 计算生成器中 True 的数量
#     count = sum(1 for item in p.iterdir() if item.is_dir())
#     return count
# # 使用示例
# folder_path = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 请替换为你的路径
# print(f"一级文件夹数量: {count_subdirs_pathlib(folder_path)}")
# 2. 统计代码文件后缀名
# import os
# from collections import Counter
# def count_extensions_walk(folder_path):
#     ext_counter = Counter()
#     if not os.path.exists(folder_path):
#         print("路径不存在")
#         return
#     # os.walk 会自动递归遍历所有层级
#     for root, dirs, files in os.walk(folder_path):
#         for filename in files:
#             # 分离文件名和扩展名
#             # os.path.splitext('file.tar.gz') 会得到 ('file.tar', '.gz')
#             _, ext = os.path.splitext(filename)
#             # 转为小写
#             ext = ext.lower()
#             if ext == '':
#                 ext = '无后缀'
#             ext_counter[ext] += 1
#     return ext_counter
# # --- 使用示例 ---
# target_folder = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered' # 替换为你的目标文件夹路径
# result = count_extensions_walk(target_folder)
# if result:
#     print(f"统计结果 (总文件数: {result.total()}):")
#     for ext, count in result.most_common():
#         print(f"{ext}: {count}")
# 统计结果 (总文件数: 3325264):
# .py: 1078183
# .h: 352528
# .hpp: 333821
# .cpp: 259554
# .md: 218388
# .java: 180978
# .c: 160802
# .m: 149649
# .r: 124987
# .sh: 116917
# .ipynb: 82218
# .f: 70590
# .f90: 59562
# .cc: 36611
# .rs: 23565
# .cxx: 22101
# .hh: 19051
# .jl: 18052
# .go: 13904
# .for: 1847
# .bash: 1573
# .markdown: 346
# .f95: 36
# 无后缀: 1