| | """ |
| | 主入口脚本:执行完整的统计报表流程 |
| | """ |
| | import argparse |
| | from pathlib import Path |
| | import sys |
| |
|
| | |
| | from stage_a_stats import StageAStats |
| | from stage_b_stats import StageBStats |
| | from repo_meta_scan import RepoMetaScan |
| | from code_file_stats import CodeFileStats |
| | from code_file_stats_fast import CodeFileStatsFast |
| | from visualization import generate_all_visualizations |
| | from join_insights import JoinInsights |
| |
|
| |
|
def _build_parser():
    """Build the argument parser for the reporting pipeline.

    Returns:
        argparse.ArgumentParser: parser with the input/output path options,
        the ``--top-n`` / ``--workers`` tuning options and one boolean flag
        per pipeline stage.
    """
    parser = argparse.ArgumentParser(description='生成数据统计报表')

    # The path defaults point at one specific machine's working directory;
    # pass the options explicitly when running anywhere else.
    parser.add_argument('--repos-searched', type=str,
                        default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv',
                        help='repos_searched.csv路径')
    parser.add_argument('--repos-check-history', type=str,
                        default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv',
                        help='repos_check_history.csv路径')
    parser.add_argument('--repos-filtered', type=str,
                        default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered',
                        help='repos_filtered目录路径')
    parser.add_argument('--output-dir', type=str,
                        default='/home/weifengsun/tangou1/domain_code/src/workdir/reporting',
                        help='输出目录')
    parser.add_argument('--top-n', type=int, default=None,
                        help='分析的仓库数量(字典序前N个,None表示所有)')
    # The help text states the real default (8); it previously claimed CPU-1.
    parser.add_argument('--workers', type=int, default=8,
                        help='代码文件统计的并行worker数(默认8)')

    # One opt-in flag per stage; nothing runs unless explicitly requested.
    parser.add_argument('--stage-a', action='store_true',
                        help='运行Stage A(搜索阶段统计)')
    parser.add_argument('--stage-b', action='store_true',
                        help='运行Stage B(过滤阶段统计)')
    parser.add_argument('--repo-meta', action='store_true',
                        help='运行仓库元画像扫描')
    parser.add_argument('--code-stats', action='store_true',
                        help='运行代码文件级统计')
    parser.add_argument('--code-stats-fast', action='store_true',
                        help='运行代码文件级统计(快速版本,约提速10-20倍)')
    parser.add_argument('--visualization', action='store_true',
                        help='生成图表(需要先有stage-a, stage-b, repo-meta, code-stats的数据)')
    parser.add_argument('--insights', action='store_true',
                        help='运行关联分析(需要先有stage-a, code-stats, stage-b的数据)')
    return parser


def _print_banner(title):
    """Print a blank line, then *title* framed by two 80-character rules."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def main(argv=None):
    """Run the selected stages of the statistics reporting pipeline.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse reads ``sys.argv[1:]``.

    Returns:
        int: 0 when every requested stage ran; 1 when no stage flag was
        given or when ``--insights`` was requested but the repo-level
        metrics CSV does not exist.
    """
    args = _build_parser().parse_args(argv)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("数据统计报表生成系统")
    print("=" * 80)
    print(f"输出目录: {output_dir}")
    print(f"分析仓库数: {args.top_n if args.top_n else '所有'}")

    has_stage = any([
        args.stage_a, args.stage_b, args.repo_meta,
        args.code_stats, args.code_stats_fast, args.visualization, args.insights
    ])

    if not has_stage:
        print("\n错误: 请至少指定一个要运行的阶段!")
        print("可用选项:")
        print(" --stage-a 运行Stage A(搜索阶段统计)")
        print(" --stage-b 运行Stage B(过滤阶段统计)")
        print(" --repo-meta 运行仓库元画像扫描")
        print(" --code-stats 运行代码文件级统计")
        print(" --code-stats-fast 运行代码文件级统计(快速版本,推荐)")
        print(" --visualization 生成图表")
        print(" --insights 运行关联分析")
        print("\n示例: python main.py --stage-a --stage-b")
        # Non-zero so shell scripts and CI can detect the usage error.
        return 1

    print()

    # Each stage gets its own sub-directory under the output directory.
    stage_a_dir = output_dir / 'stage_a'
    stage_b_dir = output_dir / 'stage_b'
    repo_meta_dir = output_dir / 'repo_meta'
    code_stats_dir = output_dir / 'code_stats'

    if args.stage_a:
        _print_banner("Stage A: 搜索阶段统计 (repos_searched.csv)")
        stage_a_stats = StageAStats(args.repos_searched, stage_a_dir)
        stage_a_stats.run()

    if args.stage_b:
        _print_banner("Stage B: 过滤阶段统计 (repos_check_history.csv)")
        stage_b_stats = StageBStats(args.repos_check_history, stage_b_dir)
        stage_b_stats.run()

    if args.repo_meta:
        _print_banner("仓库元画像扫描 (repos_filtered)")
        repo_meta_scanner = RepoMetaScan(args.repos_filtered, repo_meta_dir, top_n=args.top_n)
        repo_meta_scanner.run()

    if args.code_stats:
        _print_banner("Stage C: 代码文件级统计(原版)")
        code_stats = CodeFileStats(args.repos_filtered, code_stats_dir,
                                   top_n=args.top_n)
        code_stats.run(num_workers=args.workers)

    if args.code_stats_fast:
        _print_banner("Stage C: 代码文件级统计(快速版本)")
        code_stats_fast = CodeFileStatsFast(
            args.repos_filtered,
            code_stats_dir,
            top_n=args.top_n,
            max_file_size_mb=2,
            max_files_per_repo=500
        )
        # A falsy worker count (e.g. --workers 0) falls back to 48 here.
        code_stats_fast.run(num_workers=args.workers or 48)

    if args.visualization:
        _print_banner("生成图表")

        # Missing inputs only trigger a warning; chart generation still runs.
        required_dirs = [stage_a_dir, stage_b_dir, repo_meta_dir, code_stats_dir]
        missing_dirs = [d for d in required_dirs if not d.exists()]
        if missing_dirs:
            print(f"警告: 以下目录不存在,图表生成可能不完整: {[str(d) for d in missing_dirs]}")

        generate_all_visualizations(
            str(stage_a_dir),
            str(stage_b_dir),
            str(repo_meta_dir),
            str(code_stats_dir),
            args.repos_searched,
            top_n=args.top_n
        )

    if args.insights:
        _print_banner("关联分析与洞察")

        # The metrics file name carries a _top<N> suffix when --top-n is set.
        top_n_suffix = f"_top{args.top_n}" if args.top_n else ""
        repo_level_csv = code_stats_dir / f'repo_level_metrics{top_n_suffix}.csv'
        if not repo_level_csv.exists():
            print(f"错误: 代码统计文件不存在: {repo_level_csv}")
            print("请先运行 --code-stats")
            # Non-zero so the missing prerequisite is visible to callers.
            return 1

        insights_dir = output_dir / 'insights'
        join_insights = JoinInsights(
            args.repos_searched,
            str(repo_level_csv),
            args.repos_check_history,
            str(insights_dir)
        )
        join_insights.run()

    print("\n" + "=" * 80)
    print("完成!所有结果已保存到:")
    print(f" - 数据表格: {output_dir}")
    print(f" - 图表: {output_dir / 'figures'}")
    print("=" * 80)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status code to the shell.
    sys.exit(main())
| |
|
| |
|