---
# Source: walidsobhie-code @ b5998ff
# feat: add production infrastructure - CI/CD, Docker, code quality, and monitoring
#
# Weekly (and on-demand) HumanEval / MBPP benchmark evaluation for the
# fine-tuned code model, with optional PR result comments and artifacts.
name: Benchmark

on:
  schedule:
    # Run weekly on Sunday at 00:00 UTC
    - cron: '0 0 * * 0'
  workflow_dispatch:
    inputs:
      model_path:
        description: 'Path or HuggingFace model ID for evaluation'
        required: false
        default: ''
      num_samples:
        description: 'Number of samples per problem (for pass@k)'
        required: false
        default: '10'
      num_problems:
        description: 'Limit number of problems per benchmark (leave empty for full)'
        required: false
        default: ''

env:
  # Quoted so YAML does not read 3.10 as the float 3.1.
  PYTHON_VERSION: "3.10"

jobs:
  benchmark:
    name: HumanEval & MBPP Evaluation
    runs-on: ubuntu-latest
    # NOTE(review): the original job-level guard
    #   if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
    # silently skipped the weekly `schedule` run, because this workflow has no
    # pull_request trigger. The guard is removed so both configured triggers
    # actually run the job; PR-only steps below keep their own `if:` so they
    # stay dormant unless a pull_request trigger is added later.
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          pip install transformers peft accelerate
          pip install pytest matplotlib pandas plotly

      - name: Run HumanEval Benchmark
        id: humaneval
        run: |
          MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
          NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
          NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
          ARGS="--model-path $MODEL_PATH --benchmark humaneval --num-samples $NUM_SAMPLES --output results_humaneval.json"
          if [ -n "$NUM_PROBLEMS" ]; then
            ARGS="$ARGS --num-problems $NUM_PROBLEMS"
          fi
          # Best-effort: report a failing evaluation but do not fail the job,
          # so the MBPP step and artifact upload still run.
          python evaluate_model.py $ARGS || echo "HumanEval evaluation completed with status: $?"

      - name: Run MBPP Benchmark
        id: mbpp
        run: |
          MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
          NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
          NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
          ARGS="--model-path $MODEL_PATH --benchmark mbpp --num-samples $NUM_SAMPLES --output results_mbpp.json"
          if [ -n "$NUM_PROBLEMS" ]; then
            ARGS="$ARGS --num-problems $NUM_PROBLEMS"
          fi
          # Best-effort: same failure policy as the HumanEval step above.
          python evaluate_model.py $ARGS || echo "MBPP evaluation completed with status: $?"

      - name: Generate summary comment
        # Dormant until a pull_request trigger exists (see job-level note).
        if: github.event_name == 'pull_request'
        run: |
          # Quoted heredoc delimiter: no shell expansion or escaping inside the
          # script, and the Python block keeps real indentation (the original
          # `python -c "…"` form had flattened if/with bodies).
          python <<'PYEOF'
          import json
          import os

          results = {}
          if os.path.exists('results_humaneval.json'):
              with open('results_humaneval.json') as f:
                  results['humaneval'] = json.load(f)
          if os.path.exists('results_mbpp.json'):
              with open('results_mbpp.json') as f:
                  results['mbpp'] = json.load(f)

          # Format results as a markdown comment.
          comment = '## 📊 Benchmark Results\n\n'
          for bench, data in results.items():
              if 'summary' in data:
                  comment += f'### {bench.upper()}\n'
                  for key, val in data['summary'].items():
                      if key.startswith('pass@'):
                          comment += f'- **{key}**: {val:.4f} ({val*100:.2f}%)\n'
                  comment += '\n'
          print(comment)

          # Persist for the PR-comment step and the artifact upload.
          with open('benchmark_comment.md', 'w') as f:
              f.write(comment)
          PYEOF

      - name: Comment on PR
        if: github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          if [ -f benchmark_comment.md ]; then
            gh pr comment ${{ github.event.pull_request.number }} -F benchmark_comment.md
          else
            echo "No benchmark results to comment"
          fi

      - name: Upload results as artifact
        # always(): keep whatever partial results exist even if a step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            results_humaneval.json
            results_mbpp.json
            benchmark_comment.md
          retention-days: 30

  # Quick smoke test for benchmark script
  benchmark-smoke:
    name: Benchmark Smoke Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install minimal dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          pip install transformers

      - name: Validate evaluate_model.py syntax
        run: |
          python -m py_compile evaluate_model.py
          echo "evaluate_model.py syntax OK"

      - name: List available benchmarks
        run: |
          # Heredoc for the same indentation/escaping reasons as above.
          python <<'PYEOF'
          import ast

          with open('evaluate_model.py') as f:
              tree = ast.parse(f.read())
          funcs = [
              n.name
              for n in ast.walk(tree)
              if isinstance(n, ast.FunctionDef) and n.name.startswith('get_')
          ]
          print('Available benchmark loaders:', funcs)
          PYEOF