# File size: 7,755 Bytes

# d50199f

"""
主入口脚本：执行完整的统计报表流程
"""
import argparse
from pathlib import Path
import sys

# 导入各模块
from stage_a_stats import StageAStats
from stage_b_stats import StageBStats
from repo_meta_scan import RepoMetaScan
from code_file_stats import CodeFileStats
from code_file_stats_fast import CodeFileStatsFast  # 优化版本
from visualization import generate_all_visualizations
from join_insights import JoinInsights


def _build_arg_parser():
    """Build the command-line parser for the report pipeline.

    Returns:
        An ``argparse.ArgumentParser`` exposing the input/output paths, the
        ``--top-n`` / ``--workers`` tuning knobs, and one boolean flag per
        pipeline stage.
    """
    parser = argparse.ArgumentParser(description='生成数据统计报表')
    parser.add_argument(
        '--repos-searched', type=str,
        default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv',
        help='repos_searched.csv路径')
    parser.add_argument(
        '--repos-check-history', type=str,
        default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv',
        help='repos_check_history.csv路径')
    parser.add_argument(
        '--repos-filtered', type=str,
        default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered',
        help='repos_filtered目录路径')
    parser.add_argument(
        '--output-dir', type=str,
        default='/home/weifengsun/tangou1/domain_code/src/workdir/reporting',
        help='输出目录')
    parser.add_argument(
        '--top-n', type=int, default=None,
        help='分析的仓库数量（字典序前N个，None表示所有）')
    # The help text previously advertised "CPU-1" as the default, but the
    # actual default is the constant 8; the text now matches the code.
    parser.add_argument(
        '--workers', type=int, default=8,
        help='代码文件统计的并行worker数（默认8）')
    parser.add_argument(
        '--stage-a', action='store_true',
        help='运行Stage A（搜索阶段统计）')
    parser.add_argument(
        '--stage-b', action='store_true',
        help='运行Stage B（过滤阶段统计）')
    parser.add_argument(
        '--repo-meta', action='store_true',
        help='运行仓库元画像扫描')
    parser.add_argument(
        '--code-stats', action='store_true',
        help='运行代码文件级统计')
    parser.add_argument(
        '--code-stats-fast', action='store_true',
        help='运行代码文件级统计（快速版本，约提速10-20倍）')
    parser.add_argument(
        '--visualization', action='store_true',
        help='生成图表（需要先有stage-a, stage-b, repo-meta, code-stats的数据）')
    parser.add_argument(
        '--insights', action='store_true',
        help='运行关联分析（需要先有stage-a, code-stats, stage-b的数据）')
    return parser


def _print_section(title):
    """Print an 80-column banner announcing the stage about to run."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def main(argv=None):
    """Run the selected stages of the statistics-report pipeline.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None`` (the default), ``sys.argv[1:]`` is parsed,
            which is what happens when the script is run directly.

    Returns:
        ``0`` when every requested stage ran, ``1`` when the run was aborted
        because no stage was selected or because ``--insights`` could not
        find the repo-level metrics CSV it needs.
    """
    args = _build_arg_parser().parse_args(argv)

    output_dir = Path(args.output_dir)

    print("=" * 80)
    print("数据统计报表生成系统")
    print("=" * 80)
    print(f"输出目录: {output_dir}")
    print(f"分析仓库数: {args.top_n if args.top_n else '所有'}")

    # Refuse to run when no stage flag was given.
    has_stage = any((
        args.stage_a, args.stage_b, args.repo_meta,
        args.code_stats, args.code_stats_fast, args.visualization, args.insights,
    ))
    if not has_stage:
        print("\n错误: 请至少指定一个要运行的阶段！")
        print("可用选项:")
        print("  --stage-a          运行Stage A（搜索阶段统计）")
        print("  --stage-b          运行Stage B（过滤阶段统计）")
        print("  --repo-meta        运行仓库元画像扫描")
        print("  --code-stats       运行代码文件级统计")
        print("  --code-stats-fast  运行代码文件级统计（快速版本，推荐）")
        print("  --visualization    生成图表")
        print("  --insights         运行关联分析")
        print("\n示例: python main.py --stage-a --stage-b")
        # Non-zero so shells / CI can detect the misuse.
        return 1

    # Only create the output directory once we know there is work to do.
    output_dir.mkdir(parents=True, exist_ok=True)

    print()

    # Per-stage output directories. They are defined unconditionally because
    # later stages (visualization, insights) read what earlier stages wrote.
    stage_a_dir = output_dir / 'stage_a'
    stage_b_dir = output_dir / 'stage_b'
    repo_meta_dir = output_dir / 'repo_meta'
    code_stats_dir = output_dir / 'code_stats'

    # Stage A: search-phase statistics.
    if args.stage_a:
        _print_section("Stage A: 搜索阶段统计 (repos_searched.csv)")
        StageAStats(args.repos_searched, stage_a_dir).run()

    # Stage B: filter-phase statistics.
    if args.stage_b:
        _print_section("Stage B: 过滤阶段统计 (repos_check_history.csv)")
        StageBStats(args.repos_check_history, stage_b_dir).run()

    # Repository meta-profile scan.
    if args.repo_meta:
        _print_section("仓库元画像扫描 (repos_filtered)")
        RepoMetaScan(args.repos_filtered, repo_meta_dir, top_n=args.top_n).run()

    # Stage C: per-file code statistics (original implementation).
    if args.code_stats:
        _print_section("Stage C: 代码文件级统计（原版）")
        code_stats = CodeFileStats(args.repos_filtered, code_stats_dir,
                                   top_n=args.top_n)
        code_stats.run(num_workers=args.workers)

    # Stage C: per-file code statistics (fast implementation).
    if args.code_stats_fast:
        _print_section("Stage C: 代码文件级统计（快速版本）")
        code_stats_fast = CodeFileStatsFast(
            args.repos_filtered,
            code_stats_dir,
            top_n=args.top_n,
            max_file_size_mb=2,
            max_files_per_repo=500,  # cap at 500 files per repository
        )
        # A falsy worker count (e.g. ``--workers 0``) falls back to 48.
        code_stats_fast.run(num_workers=args.workers or 48)

    # Chart generation; consumes the outputs of the stages above.
    if args.visualization:
        _print_section("生成图表")
        # Missing inputs only produce a warning: generation is still attempted.
        required_dirs = [stage_a_dir, stage_b_dir, repo_meta_dir, code_stats_dir]
        missing_dirs = [d for d in required_dirs if not d.exists()]
        if missing_dirs:
            print(f"警告: 以下目录不存在，图表生成可能不完整: {[str(d) for d in missing_dirs]}")

        generate_all_visualizations(
            str(stage_a_dir),
            str(stage_b_dir),
            str(repo_meta_dir),
            str(code_stats_dir),
            args.repos_searched,
            top_n=args.top_n,
        )

    # Join analysis; consumes the repo-level metrics CSV from code stats.
    if args.insights:
        _print_section("关联分析与洞察")
        # The metrics file name carries a ``_top{N}`` suffix when --top-n is set.
        top_n_suffix = f"_top{args.top_n}" if args.top_n else ""
        repo_level_csv = code_stats_dir / f'repo_level_metrics{top_n_suffix}.csv'
        if not repo_level_csv.exists():
            print(f"错误: 代码统计文件不存在: {repo_level_csv}")
            print("请先运行 --code-stats")
            # Non-zero so the failure is visible to the calling process.
            return 1

        insights_dir = output_dir / 'insights'
        JoinInsights(
            args.repos_searched,
            str(repo_level_csv),
            args.repos_check_history,
            str(insights_dir),
        ).run()

    print("\n" + "=" * 80)
    print("完成！所有结果已保存到:")
    print(f"  - 数据表格: {output_dir}")
    print(f"  - 图表: {output_dir / 'figures'}")
    print("=" * 80)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status code to the shell.
    sys.exit(main())