name: Benchmark
on:
schedule:
# Run weekly on Sunday at 00:00 UTC
- cron: '0 0 * * 0'
  # Trigger on PRs too, so the PR-comment steps below can actually fire
  pull_request:
  workflow_dispatch:
inputs:
model_path:
description: 'Path or HuggingFace model ID for evaluation'
required: false
default: ''
num_samples:
description: 'Number of samples per problem (for pass@k)'
required: false
default: '10'
num_problems:
description: 'Limit number of problems per benchmark (leave empty for full)'
required: false
default: ''
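    # Example manual run via the GitHub CLI (illustrative values; assumes this
    # file is saved as .github/workflows/benchmark.yml):
    #   gh workflow run benchmark.yml \
    #     -f model_path='Qwen/Qwen2.5-Coder-7B' \
    #     -f num_samples='5' \
    #     -f num_problems='20'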
env:
PYTHON_VERSION: "3.10"
jobs:
benchmark:
name: HumanEval & MBPP Evaluation
runs-on: ubuntu-latest
    # Runs on schedule, PRs, and manual dispatch; PR-only steps are gated individually below
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install transformers peft accelerate
pip install pytest matplotlib pandas plotly
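          # Note: the CPU wheel index keeps the torch install small; hosted
          # ubuntu-latest runners have no GPU anyway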
- name: Run HumanEval Benchmark
id: humaneval
run: |
MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
ARGS="--model-path $MODEL_PATH --benchmark humaneval --num-samples $NUM_SAMPLES --output results_humaneval.json"
if [ -n "$NUM_PROBLEMS" ]; then
ARGS="$ARGS --num-problems $NUM_PROBLEMS"
fi
          python evaluate_model.py $ARGS || echo "::warning::HumanEval evaluation exited with status $?"
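          # With the defaults above, this expands to roughly:
          #   python evaluate_model.py --model-path Qwen/Qwen2.5-Coder-7B \
          #     --benchmark humaneval --num-samples 10 --output results_humaneval.json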
- name: Run MBPP Benchmark
id: mbpp
run: |
MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
ARGS="--model-path $MODEL_PATH --benchmark mbpp --num-samples $NUM_SAMPLES --output results_mbpp.json"
if [ -n "$NUM_PROBLEMS" ]; then
ARGS="$ARGS --num-problems $NUM_PROBLEMS"
fi
          python evaluate_model.py $ARGS || echo "::warning::MBPP evaluation exited with status $?"
- name: Generate summary comment
if: github.event_name == 'pull_request'
        run: |
          python - <<'EOF'
          import json
          import os

          # Collect whichever results files the benchmark steps produced
          results = {}
          if os.path.exists('results_humaneval.json'):
              with open('results_humaneval.json') as f:
                  results['humaneval'] = json.load(f)
          if os.path.exists('results_mbpp.json'):
              with open('results_mbpp.json') as f:
                  results['mbpp'] = json.load(f)

          # Format as a markdown comment
          comment = '## 📊 Benchmark Results\n\n'
          for bench, data in results.items():
              if 'summary' in data:
                  comment += f'### {bench.upper()}\n'
                  summary = data['summary']
                  for key, val in summary.items():
                      if key.startswith('pass@'):
                          comment += f'- **{key}**: {val:.4f} ({val*100:.2f}%)\n'
                  comment += '\n'
          print(comment)

          # Write to disk for the PR-comment and artifact steps
          with open('benchmark_comment.md', 'w') as f:
              f.write(comment)
          EOF
- name: Comment on PR
if: github.event_name == 'pull_request'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
if [ -f benchmark_comment.md ]; then
gh pr comment ${{ github.event.pull_request.number }} -F benchmark_comment.md
else
echo "No benchmark results to comment"
fi
      - name: Upload results as artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          if-no-files-found: ignore
path: |
results_humaneval.json
results_mbpp.json
benchmark_comment.md
retention-days: 30
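      # Results can be fetched from a finished run with the GitHub CLI, e.g.:
      #   gh run download <run-id> -n benchmark-results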
# Quick smoke test for benchmark script
benchmark-smoke:
name: Benchmark Smoke Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
      - name: Install minimal dependencies
        run: |
          python -m pip install --upgrade pip
          # The checks below never import the script, so the heavy runtime
          # dependencies (torch, transformers) are not needed here
- name: Validate evaluate_model.py syntax
run: |
python -m py_compile evaluate_model.py
echo "evaluate_model.py syntax OK"
- name: List available benchmarks
        run: |
          python - <<'EOF'
          import ast

          # Statically scan evaluate_model.py for get_* loader functions
          with open('evaluate_model.py') as f:
              tree = ast.parse(f.read())
          funcs = [n.name for n in ast.walk(tree)
                   if isinstance(n, ast.FunctionDef) and n.name.startswith('get_')]
          print('Available benchmark loaders:', funcs)
          EOF