name: Benchmark
on:
schedule:
# Run weekly on Sunday at 00:00 UTC
- cron: '0 0 * * 0'
  # Trigger on PRs too, so the PR-comment steps below can actually fire
  pull_request:
  workflow_dispatch:
inputs:
model_path:
description: 'Path or HuggingFace model ID for evaluation'
required: false
default: ''
num_samples:
description: 'Number of samples per problem (for pass@k)'
required: false
default: '10'
num_problems:
description: 'Limit number of problems per benchmark (leave empty for full)'
required: false
default: ''
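    # Example manual run via the GitHub CLI (illustrative values; assumes this
    # file is saved as .github/workflows/benchmark.yml):
    #   gh workflow run benchmark.yml \
    #     -f model_path='Qwen/Qwen2.5-Coder-7B' \
    #     -f num_samples='5' \
    #     -f num_problems='20'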
env:
PYTHON_VERSION: "3.10"
jobs:
benchmark:
name: HumanEval & MBPP Evaluation
runs-on: ubuntu-latest
    # Runs on schedule, PRs, and manual dispatch; PR-only steps are gated individually below
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install transformers peft accelerate
pip install pytest matplotlib pandas plotly
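          # Note: the CPU wheel index keeps the torch install small; hosted
          # ubuntu-latest runners have no GPU anyway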
- name: Run HumanEval Benchmark
id: humaneval
run: |
MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
ARGS="--model-path $MODEL_PATH --benchmark humaneval --num-samples $NUM_SAMPLES --output results_humaneval.json"
if [ -n "$NUM_PROBLEMS" ]; then
ARGS="$ARGS --num-problems $NUM_PROBLEMS"
fi
          python evaluate_model.py $ARGS || echo "::warning::HumanEval evaluation exited with status $?"
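          # With the defaults above, this expands to roughly:
          #   python evaluate_model.py --model-path Qwen/Qwen2.5-Coder-7B \
          #     --benchmark humaneval --num-samples 10 --output results_humaneval.json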
- name: Run MBPP Benchmark
id: mbpp
run: |
MODEL_PATH="${{ inputs.model_path || 'Qwen/Qwen2.5-Coder-7B' }}"
NUM_SAMPLES="${{ inputs.num_samples || '10' }}"
NUM_PROBLEMS="${{ inputs.num_problems || '' }}"
ARGS="--model-path $MODEL_PATH --benchmark mbpp --num-samples $NUM_SAMPLES --output results_mbpp.json"
if [ -n "$NUM_PROBLEMS" ]; then
ARGS="$ARGS --num-problems $NUM_PROBLEMS"
fi
          python evaluate_model.py $ARGS || echo "::warning::MBPP evaluation exited with status $?"
- name: Generate summary comment
if: github.event_name == 'pull_request'
        run: |
          python - <<'EOF'
          import json
          import os

          # Collect whichever results files the benchmark steps produced
          results = {}
          if os.path.exists('results_humaneval.json'):
              with open('results_humaneval.json') as f:
                  results['humaneval'] = json.load(f)
          if os.path.exists('results_mbpp.json'):
              with open('results_mbpp.json') as f:
                  results['mbpp'] = json.load(f)

          # Format as a markdown comment
          comment = '## 📊 Benchmark Results\n\n'
          for bench, data in results.items():
              if 'summary' in data:
                  comment += f'### {bench.upper()}\n'
                  summary = data['summary']
                  for key, val in summary.items():
                      if key.startswith('pass@'):
                          comment += f'- **{key}**: {val:.4f} ({val*100:.2f}%)\n'
                  comment += '\n'
          print(comment)

          # Write to disk for the PR-comment and artifact steps
          with open('benchmark_comment.md', 'w') as f:
              f.write(comment)
          EOF
- name: Comment on PR
if: github.event_name == 'pull_request'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
if [ -f benchmark_comment.md ]; then
gh pr comment ${{ github.event.pull_request.number }} -F benchmark_comment.md
else
echo "No benchmark results to comment"
fi
      - name: Upload results as artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          if-no-files-found: ignore
path: |
results_humaneval.json
results_mbpp.json
benchmark_comment.md
retention-days: 30
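      # Results can be fetched from a finished run with the GitHub CLI, e.g.:
      #   gh run download <run-id> -n benchmark-results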
# Quick smoke test for benchmark script
benchmark-smoke:
name: Benchmark Smoke Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
      - name: Install minimal dependencies
        run: |
          python -m pip install --upgrade pip
          # The checks below never import the script, so the heavy runtime
          # dependencies (torch, transformers) are not needed here
- name: Validate evaluate_model.py syntax
run: |
python -m py_compile evaluate_model.py
echo "evaluate_model.py syntax OK"
- name: List available benchmarks
        run: |
          python - <<'EOF'
          import ast

          # Statically scan evaluate_model.py for get_* loader functions
          with open('evaluate_model.py') as f:
              tree = ast.parse(f.read())
          funcs = [n.name for n in ast.walk(tree)
                   if isinstance(n, ast.FunctionDef) and n.name.startswith('get_')]
          print('Available benchmark loaders:', funcs)
          EOF