DouDou

Upload data1/reporting/main.py with huggingface_hub

d50199f verified 21 days ago

7.76 kB

	"""
	主入口脚本：执行完整的统计报表流程
	"""
	import argparse
	from pathlib import Path
	import sys

	# 导入各模块
	from stage_a_stats import StageAStats
	from stage_b_stats import StageBStats
	from repo_meta_scan import RepoMetaScan
	from code_file_stats import CodeFileStats
	from code_file_stats_fast import CodeFileStatsFast # 优化版本
	from visualization import generate_all_visualizations
	from join_insights import JoinInsights


	def _build_parser() -> argparse.ArgumentParser:
		"""Build the command-line parser for the reporting pipeline."""
		parser = argparse.ArgumentParser(description='生成数据统计报表')

		# Input / output locations.
		parser.add_argument('--repos-searched', type=str,
			default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv',
			help='repos_searched.csv路径')
		parser.add_argument('--repos-check-history', type=str,
			default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv',
			help='repos_check_history.csv路径')
		parser.add_argument('--repos-filtered', type=str,
			default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered',
			help='repos_filtered目录路径')
		parser.add_argument('--output-dir', type=str,
			default='/home/weifengsun/tangou1/domain_code/src/workdir/reporting',
			help='输出目录')

		# Tuning knobs.
		parser.add_argument('--top-n', type=int, default=None,
			help='分析的仓库数量（字典序前N个，None表示所有）')
		# The help text renders the real default instead of a hand-written
		# value, so the two cannot drift apart again.
		parser.add_argument('--workers', type=int, default=8,
			help='代码文件统计的并行worker数（默认%(default)s）')

		# Stage switches: every stage is opt-in.
		parser.add_argument('--stage-a', action='store_true',
			help='运行Stage A（搜索阶段统计）')
		parser.add_argument('--stage-b', action='store_true',
			help='运行Stage B（过滤阶段统计）')
		parser.add_argument('--repo-meta', action='store_true',
			help='运行仓库元画像扫描')
		parser.add_argument('--code-stats', action='store_true',
			help='运行代码文件级统计')
		parser.add_argument('--code-stats-fast', action='store_true',
			help='运行代码文件级统计（快速版本，约提速10-20倍）')
		parser.add_argument('--visualization', action='store_true',
			help='生成图表（需要先有stage-a, stage-b, repo-meta, code-stats的数据）')
		parser.add_argument('--insights', action='store_true',
			help='运行关联分析（需要先有stage-a, code-stats, stage-b的数据）')
		return parser


	def _print_section(title: str) -> None:
		"""Print a section banner: blank line, rule, *title*, rule."""
		print("\n" + "=" * 80)
		print(title)
		print("=" * 80)


	def main() -> int:
		"""Run the stages of the reporting pipeline selected on the command line.

		Arguments are read from ``sys.argv``. Each stage flag triggers the
		matching stage object (or function) in a fixed order: Stage A, Stage B,
		repo meta scan, code stats, fast code stats, visualization, insights.
		The output directory is created only after at least one stage has been
		requested, so an invalid invocation has no filesystem side effects.

		Returns:
			0 when the selected stages ran to completion; 1 when no stage was
			selected or when ``--insights`` cannot find the repo-level metrics
			CSV it needs.
		"""
		args = _build_parser().parse_args()
		output_dir = Path(args.output_dir)

		print("=" * 80)
		print("数据统计报表生成系统")
		print("=" * 80)
		print(f"输出目录: {output_dir}")
		print(f"分析仓库数: {args.top_n if args.top_n else '所有'}")

		# Refuse to continue unless at least one stage was requested.
		has_stage = any((
			args.stage_a, args.stage_b, args.repo_meta,
			args.code_stats, args.code_stats_fast,
			args.visualization, args.insights,
		))
		if not has_stage:
			print("\n错误: 请至少指定一个要运行的阶段！")
			print("可用选项:")
			print(" --stage-a 运行Stage A（搜索阶段统计）")
			print(" --stage-b 运行Stage B（过滤阶段统计）")
			print(" --repo-meta 运行仓库元画像扫描")
			print(" --code-stats 运行代码文件级统计")
			print(" --code-stats-fast 运行代码文件级统计（快速版本，推荐）")
			print(" --visualization 生成图表")
			print(" --insights 运行关联分析")
			print("\n示例: python main.py --stage-a --stage-b")
			return 1

		# Only touch the filesystem once we know there is work to do.
		output_dir.mkdir(parents=True, exist_ok=True)

		print()

		# Per-stage output directories. They are defined unconditionally
		# because later stages read what earlier stages wrote.
		stage_a_dir = output_dir / 'stage_a'
		stage_b_dir = output_dir / 'stage_b'
		repo_meta_dir = output_dir / 'repo_meta'
		code_stats_dir = output_dir / 'code_stats'

		# Stage A: search-stage statistics.
		if args.stage_a:
			_print_section("Stage A: 搜索阶段统计 (repos_searched.csv)")
			StageAStats(args.repos_searched, stage_a_dir).run()

		# Stage B: filter-stage statistics.
		if args.stage_b:
			_print_section("Stage B: 过滤阶段统计 (repos_check_history.csv)")
			StageBStats(args.repos_check_history, stage_b_dir).run()

		# Repository meta-profile scan.
		if args.repo_meta:
			_print_section("仓库元画像扫描 (repos_filtered)")
			RepoMetaScan(args.repos_filtered, repo_meta_dir, top_n=args.top_n).run()

		# Stage C: per-file code statistics (original implementation).
		if args.code_stats:
			_print_section("Stage C: 代码文件级统计（原版）")
			code_stats = CodeFileStats(args.repos_filtered, code_stats_dir,
				top_n=args.top_n)
			code_stats.run(num_workers=args.workers)

		# Stage C: per-file code statistics (fast implementation).
		if args.code_stats_fast:
			_print_section("Stage C: 代码文件级统计（快速版本）")
			code_stats_fast = CodeFileStatsFast(
				args.repos_filtered,
				code_stats_dir,
				top_n=args.top_n,
				max_file_size_mb=2,
				max_files_per_repo=500,  # cap each repository at 500 files
			)
			# --workers defaults to 8, so the fallback of 48 only applies
			# when the user explicitly passes --workers 0.
			code_stats_fast.run(num_workers=args.workers or 48)

		# Charts: consume the outputs of the stages above.
		if args.visualization:
			_print_section("生成图表")
			# Missing inputs are reported as a warning; generation still runs.
			required_dirs = [stage_a_dir, stage_b_dir, repo_meta_dir, code_stats_dir]
			missing_dirs = [d for d in required_dirs if not d.exists()]
			if missing_dirs:
				print(f"警告: 以下目录不存在，图表生成可能不完整: {[str(d) for d in missing_dirs]}")

			generate_all_visualizations(
				str(stage_a_dir),
				str(stage_b_dir),
				str(repo_meta_dir),
				str(code_stats_dir),
				args.repos_searched,
				top_n=args.top_n,
			)

		# Join analysis: needs the repo-level metrics CSV from code stats.
		if args.insights:
			_print_section("关联分析与洞察")
			# The CSV file name carries a "_top<N>" suffix when --top-n is set.
			top_n_suffix = f"_top{args.top_n}" if args.top_n else ""
			repo_level_csv = code_stats_dir / f'repo_level_metrics{top_n_suffix}.csv'
			if not repo_level_csv.exists():
				print(f"错误: 代码统计文件不存在: {repo_level_csv}")
				print("请先运行 --code-stats")
				return 1

			insights_dir = output_dir / 'insights'
			JoinInsights(
				args.repos_searched,
				str(repo_level_csv),
				args.repos_check_history,
				str(insights_dir),
			).run()

		print("\n" + "=" * 80)
		print("完成！所有结果已保存到:")
		print(f" - 数据表格: {output_dir}")
		print(f" - 图表: {output_dir / 'figures'}")
		print("=" * 80)
		return 0


	# Run the pipeline only when executed as a script, and hand whatever main()
	# returns to sys.exit() so it becomes the process exit status (sys.exit
	# treats None as success).
	if __name__ == "__main__":
		sys.exit(main())