name: Benchmark

on:
  schedule:
    # Run weekly on Sunday at 00:00 UTC
    - cron: '0 0 * * 0'
  # Needed for the PR comment steps below, which are gated on pull_request events
  pull_request:
  workflow_dispatch:
    inputs:
      model_path:
        description: 'Path or HuggingFace model ID for evaluation'
        required: false
        default: ''
      num_samples:
        description: 'Number of samples per problem (for pass@k)'
        required: false
        default: '10'
      num_problems:
        description: 'Limit number of problems per benchmark (leave empty for full)'
        required: false
        default: ''

env:
  PYTHON_VERSION: "3.10"

jobs:
  benchmark:
    name: HumanEval & MBPP Evaluation
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          pip install transformers peft accelerate
          pip install pytest matplotlib pandas plotly

      - name: Run HumanEval Benchmark
        id: humaneval
        run: |
          MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
          NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
          NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
          ARGS="--model-path $MODEL_PATH --benchmark humaneval --num-samples $NUM_SAMPLES --output results_humaneval.json"
          if [ -n "$NUM_PROBLEMS" ]; then
            ARGS="$ARGS --num-problems $NUM_PROBLEMS"
          fi
          # Non-fatal on failure so the MBPP run and artifact upload still execute
          python evaluate_model.py $ARGS || echo "HumanEval evaluation exited with status: $?"

      - name: Run MBPP Benchmark
        id: mbpp
        run: |
          MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
          NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
          NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
          ARGS="--model-path $MODEL_PATH --benchmark mbpp --num-samples $NUM_SAMPLES --output results_mbpp.json"
          if [ -n "$NUM_PROBLEMS" ]; then
            ARGS="$ARGS --num-problems $NUM_PROBLEMS"
          fi
          # Non-fatal on failure so the summary and artifact upload still execute
          python evaluate_model.py $ARGS || echo "MBPP evaluation exited with status: $?"
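      # The summary step below assumes evaluate_model.py writes result JSON
      # shaped roughly like the sketch here; only keys starting with "pass@"
      # under "summary" are reported. The exact schema is an assumption, not
      # something this workflow confirms:
      #
      #   {"summary": {"pass@1": 0.4512, "pass@10": 0.6231}, ...}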
      - name: Generate summary comment
        if: github.event_name == 'pull_request'
        run: |
          # A quoted heredoc keeps the Python readable and avoids shell
          # escaping of quotes and newlines
          python - <<'PYEOF'
          import json
          import os

          # Collect whichever result files the benchmark steps produced
          results = {}
          if os.path.exists('results_humaneval.json'):
              with open('results_humaneval.json') as f:
                  results['humaneval'] = json.load(f)
          if os.path.exists('results_mbpp.json'):
              with open('results_mbpp.json') as f:
                  results['mbpp'] = json.load(f)

          # Format the pass@k metrics as a markdown comment
          comment = '## 📊 Benchmark Results\n\n'
          for bench, data in results.items():
              if 'summary' in data:
                  comment += f'### {bench.upper()}\n'
                  summary = data['summary']
                  for key, val in summary.items():
                      if key.startswith('pass@'):
                          comment += f'- **{key}**: {val:.4f} ({val*100:.2f}%)\n'
                  comment += '\n'
          print(comment)

          # Persist for the PR comment step and the artifact upload
          with open('benchmark_comment.md', 'w') as f:
              f.write(comment)
          PYEOF

      - name: Comment on PR
        if: github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          if [ -f benchmark_comment.md ]; then
            gh pr comment ${{ github.event.pull_request.number }} -F benchmark_comment.md
          else
            echo "No benchmark results to comment"
          fi

      - name: Upload results as artifact
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            results_humaneval.json
            results_mbpp.json
            benchmark_comment.md
          retention-days: 30

  # Quick smoke test for the benchmark script
  benchmark-smoke:
    name: Benchmark Smoke Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install minimal dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          pip install transformers

      - name: Validate evaluate_model.py syntax
        run: |
          python -m py_compile evaluate_model.py
          echo "evaluate_model.py syntax OK"

      - name: List available benchmarks
        run: |
          # Statically scan evaluate_model.py for get_* benchmark loader functions
          python - <<'PYEOF'
          import ast

          with open('evaluate_model.py') as f:
              tree = ast.parse(f.read())
          funcs = [n.name for n in ast.walk(tree)
                   if isinstance(n, ast.FunctionDef) and n.name.startswith('get_')]
          print('Available benchmark loaders:', funcs)
          PYEOF
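# Example manual trigger via the GitHub CLI. The file name benchmark.yml is an
# assumption; substitute the actual path of this file under .github/workflows:
#
#   gh workflow run benchmark.yml \
#     -f model_path=Qwen/Qwen2.5-Coder-7B \
#     -f num_samples=5 \
#     -f num_problems=20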