# Dataset notes
# -------------
# Repository root: /home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered
#   一级文件夹数量 (number of top-level folders): 37736
#   代码文件数量 (number of code files):
# Keyword file: /home/weifengsun/tangou1/domain_code/src/workdir/keywords_expanded.json

# 1. Count the project folders (top-level subdirectories)
# from pathlib import Path
#
# def count_subdirs_pathlib(path):
#     p = Path(path)
#     if not p.exists():
#         print("路径不存在")
#         return 0
#     # p.iterdir() iterates over every entry directly inside the directory
#     # item.is_dir() checks whether the entry is a directory
#     # sum() adds 1 for each entry that passes the is_dir() filter
#     count = sum(1 for item in p.iterdir() if item.is_dir())
#     return count
#
# # Usage example
# folder_path = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered'  # replace with your own path
# print(f"一级文件夹数量: {count_subdirs_pathlib(folder_path)}")

# 2. Count code files by file extension
# import os
# from collections import Counter
#
# def count_extensions_walk(folder_path):
#     ext_counter = Counter()
#     if not os.path.exists(folder_path):
#         print("路径不存在")
#         return
#     # os.walk recursively traverses every directory level
#     for root, dirs, files in os.walk(folder_path):
#         for filename in files:
#             # Split the file name from its extension;
#             # os.path.splitext('file.tar.gz') yields ('file.tar', '.gz')
#             _, ext = os.path.splitext(filename)
#             # Normalize to lower case
#             ext = ext.lower()
#             if ext == '':
#                 ext = '无后缀'
#             ext_counter[ext] += 1
#     return ext_counter
#
# # --- Usage example ---
# target_folder = r'/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered'  # replace with your target folder path
# result = count_extensions_walk(target_folder)
# if result:
#     print(f"统计结果 (总文件数: {result.total()}):")
#     for ext, count in result.most_common():
#         print(f"{ext}: {count}")

# Recorded output of snippet 2 ("统计结果" = results, "总文件数" = total files,
# "无后缀" = no extension):
# 统计结果 (总文件数: 3325264):
# .py: 1078183
# .h: 352528
# .hpp: 333821
# .cpp: 259554
# .md: 218388
# .java: 180978
# .c: 160802
# .m: 149649
# .r: 124987
# .sh: 116917
# .ipynb: 82218
# .f: 70590
# .f90: 59562
# .cc: 36611
# .rs: 23565
# .cxx: 22101
# .hh: 19051
# .jl: 18052
# .go: 13904
# .for: 1847
# .bash: 1573
# .markdown: 346
# .f95: 36
# 无后缀: 1