name: Benchmark

on:
  schedule:
    # Run weekly on Sunday at 00:00 UTC
    - cron: '0 0 * * 0'
  # Needed for the PR comment steps below, which are gated on pull_request events
  pull_request:
  workflow_dispatch:
    inputs:
      model_path:
        description: 'Path or HuggingFace model ID for evaluation'
        required: false
        default: ''
      num_samples:
        description: 'Number of samples per problem (for pass@k)'
        required: false
        default: '10'
      num_problems:
        description: 'Limit number of problems per benchmark (leave empty for full)'
        required: false
        default: ''

env:
  PYTHON_VERSION: "3.10"

jobs:
  benchmark:
    name: HumanEval & MBPP Evaluation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          pip install transformers peft accelerate
          pip install pytest matplotlib pandas plotly

      - name: Run HumanEval Benchmark
        id: humaneval
        run: |
          MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
          NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
          NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
          ARGS="--model-path $MODEL_PATH --benchmark humaneval --num-samples $NUM_SAMPLES --output results_humaneval.json"
          if [ -n "$NUM_PROBLEMS" ]; then
            ARGS="$ARGS --num-problems $NUM_PROBLEMS"
          fi
          # Non-fatal on failure so the MBPP run and artifact upload still execute
          python evaluate_model.py $ARGS || echo "HumanEval evaluation exited with status: $?"

      - name: Run MBPP Benchmark
        id: mbpp
        run: |
          MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
          NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
          NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
          ARGS="--model-path $MODEL_PATH --benchmark mbpp --num-samples $NUM_SAMPLES --output results_mbpp.json"
          if [ -n "$NUM_PROBLEMS" ]; then
            ARGS="$ARGS --num-problems $NUM_PROBLEMS"
          fi
          # Non-fatal on failure so the summary and artifact upload still execute
          python evaluate_model.py $ARGS || echo "MBPP evaluation exited with status: $?"
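      # The summary step below assumes evaluate_model.py writes result JSON
      # shaped roughly like the sketch here; only keys starting with "pass@"
      # under "summary" are reported. The exact schema is an assumption, not
      # something this workflow confirms:
      #
      #   {"summary": {"pass@1": 0.4512, "pass@10": 0.6231}, ...}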
      - name: Generate summary comment
        if: github.event_name == 'pull_request'
        run: |
          # A quoted heredoc keeps the Python readable and avoids shell
          # escaping of quotes and newlines
          python - <<'PYEOF'
          import json
          import os

          # Collect whichever result files the benchmark steps produced
          results = {}
          if os.path.exists('results_humaneval.json'):
              with open('results_humaneval.json') as f:
                  results['humaneval'] = json.load(f)
          if os.path.exists('results_mbpp.json'):
              with open('results_mbpp.json') as f:
                  results['mbpp'] = json.load(f)

          # Format the pass@k metrics as a markdown comment
          comment = '## 📊 Benchmark Results\n\n'
          for bench, data in results.items():
              if 'summary' in data:
                  comment += f'### {bench.upper()}\n'
                  summary = data['summary']
                  for key, val in summary.items():
                      if key.startswith('pass@'):
                          comment += f'- **{key}**: {val:.4f} ({val*100:.2f}%)\n'
                  comment += '\n'
          print(comment)

          # Persist for the PR comment step and the artifact upload
          with open('benchmark_comment.md', 'w') as f:
              f.write(comment)
          PYEOF

      - name: Comment on PR
        if: github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          if [ -f benchmark_comment.md ]; then
            gh pr comment ${{ github.event.pull_request.number }} -F benchmark_comment.md
          else
            echo "No benchmark results to comment"
          fi

      - name: Upload results as artifact
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            results_humaneval.json
            results_mbpp.json
            benchmark_comment.md
          retention-days: 30

  # Quick smoke test for the benchmark script
  benchmark-smoke:
    name: Benchmark Smoke Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install minimal dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          pip install transformers

      - name: Validate evaluate_model.py syntax
        run: |
          python -m py_compile evaluate_model.py
          echo "evaluate_model.py syntax OK"

      - name: List available benchmarks
        run: |
          # Statically scan evaluate_model.py for get_* benchmark loader functions
          python - <<'PYEOF'
          import ast

          with open('evaluate_model.py') as f:
              tree = ast.parse(f.read())
          funcs = [n.name for n in ast.walk(tree)
                   if isinstance(n, ast.FunctionDef) and n.name.startswith('get_')]
          print('Available benchmark loaders:', funcs)
          PYEOF
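# Example manual trigger via the GitHub CLI. The file name benchmark.yml is an
# assumption; substitute the actual path of this file under .github/workflows:
#
#   gh workflow run benchmark.yml \
#     -f model_path=Qwen/Qwen2.5-Coder-7B \
#     -f num_samples=5 \
#     -f num_problems=20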