#!/bin/bash # ============================================================================= # Stack 2.9 Full Benchmark Evaluation Suite # ============================================================================= # Runs all benchmarks and generates comprehensive evaluation report. # # Usage: # ./run_all_benchmarks.sh [OPTIONS] # # Options: # --model MODEL Model name to evaluate (default: stack-2.9) # --output DIR Output directory (default: ./results) # --skip-slow Skip slow benchmarks # --sample-size N Use N samples per benchmark (default: all) # --verbose Verbose output # # ============================================================================= set -e # Configuration MODEL="${MODEL:-stack-2.9}" OUTPUT_DIR="${OUTPUT_DIR:-./results}" SAMPLE_SIZE="" SKIP_SLOW="" VERBOSE="" PYTHON="${PYTHON:-python3}" # Colors RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' # No Color # Benchmark scripts SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" HUMAN_EVAL="${SCRIPT_DIR}/human_eval.py" MBPP_EVAL="${SCRIPT_DIR}/mbpp_eval.py" TOOL_EVAL="${SCRIPT_DIR}/tool_use_eval.py" SELF_IMPROVE_EVAL="${SCRIPT_DIR}/self_improve_eval.py" DASHBOARD="${SCRIPT_DIR}/results/dashboard.py" # ============================================================================= # Helper Functions # ============================================================================= log_info() { echo -e "${BLUE}[INFO]${NC} $1" } log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" } log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" } log_error() { echo -e "${RED}[ERROR]${NC} $1" } section() { echo "" echo "==============================================================================" echo "$1" echo "==============================================================================" } # ============================================================================= # Parse Arguments # ============================================================================= while [[ $# -gt 0 ]]; do case $1 in --model) MODEL="$2" shift 2 ;; --output) OUTPUT_DIR="$2" shift 2 ;; --skip-slow) SKIP_SLOW="1" shift ;; --sample-size) SAMPLE_SIZE="$2" shift 2 ;; --verbose) VERBOSE="1" shift ;; --help) echo "Stack 2.9 Full Benchmark Suite" echo "" echo "Usage: $0 [OPTIONS]" echo "" echo "Options:" echo " --model MODEL Model name (default: stack-2.9)" echo " --output DIR Output directory (default: ./results)" echo " --skip-slow Skip slow benchmarks" echo " --sample-size N Sample size for each benchmark" echo " --verbose Verbose output" echo " --help Show this help message" exit 0 ;; *) log_error "Unknown option: $1" exit 1 ;; esac done # ============================================================================= # Setup # ============================================================================= log_info "Stack 2.9 Benchmark Suite" log_info "Model: ${MODEL}" log_info "Output: ${OUTPUT_DIR}" echo "" # Create output directory mkdir -p "${OUTPUT_DIR}" mkdir -p "${OUTPUT_DIR}/detailed" # Track start time START_TIME=$(date +%s) # Results summary declare -A BENCHMARK_RESULTS # ============================================================================= # Check Dependencies # ============================================================================= section "Checking Dependencies" check_python() { if command -v python3 &> /dev/null; then PYTHON="python3" elif command -v python &> /dev/null; then PYTHON="python" else log_error "Python not found!" exit 1 fi log_success "Python: $(${PYTHON} --version)" } check_dependencies() { log_info "Checking Python dependencies..." # Check for required modules REQUIRED_MODULES=("json" "datetime" "pathlib" "argparse") MISSING="" for module in "${REQUIRED_MODULES[@]}"; do if ! ${PYTHON} -c "import ${module}" &> /dev/null; then MISSING="${MISSING} ${module}" fi done if [ -n "${MISSING}" ]; then log_warning "Missing modules:${MISSING}" log_info "These are standard library modules and should be available." fi log_success "Dependencies OK" } check_python check_dependencies # ============================================================================= # HumanEval Benchmark # ============================================================================= section "HumanEval Benchmark" log_info "Running HumanEval benchmark..." log_info "Metrics: Pass@1, Pass@10, Pass@100" HUMAN_EVAL_START=$(date +%s) if [ -f "${HUMAN_EVAL}" ]; then HUMAN_EVAL_CMD="${PYTHON} ${HUMAN_EVAL} --model ${MODEL} --output ${OUTPUT_DIR}/detailed" if [ -n "${SAMPLE_SIZE}" ]; then # Note: human_eval.py doesn't support sample-size directly # but we include it for other benchmarks : fi if [ -n "${VERBOSE}" ]; then ${HUMAN_EVAL_CMD} 2>&1 | tee "${OUTPUT_DIR}/detailed/humaneval_output.log" else ${HUMAN_EVAL_CMD} > "${OUTPUT_DIR}/detailed/humaneval_output.log" 2>&1 fi HUMAN_EVAL_END=$(date +%s) HUMAN_EVAL_TIME=$((HUMAN_EVAL_END - HUMAN_EVAL_START)) if [ -f "${OUTPUT_DIR}/detailed/humaneval_results.json" ]; then PASS_1=$(grep -o '"pass_at_1": [0-9.]*' "${OUTPUT_DIR}/detailed/humaneval_results.json" | cut -d':' -f2) PASS_10=$(grep -o '"pass_at_10": [0-9.]*' "${OUTPUT_DIR}/detailed/humaneval_results.json" | cut -d':' -f2) BENCHMARK_RESULTS["humaneval_pass1"]="${PASS_1}" BENCHMARK_RESULTS["humaneval_pass10"]="${PASS_10}" log_success "HumanEval: Pass@1=${PASS_1}, Pass@10=${PASS_10} (${HUMAN_EVAL_TIME}s)" else log_error "HumanEval results not found" fi else log_warning "HumanEval script not found: ${HUMAN_EVAL}" fi # ============================================================================= # MBPP Benchmark # ============================================================================= section "MBPP Benchmark" log_info "Running MBPP benchmark..." log_info "Metrics: Pass@1, Pass@10" MBPP_START=$(date +%s) if [ -f "${MBPP_EVAL}" ]; then MBPP_CMD="${PYTHON} ${MBPP_EVAL} --model ${MODEL} --output ${OUTPUT_DIR}/detailed" if [ -n "${VERBOSE}" ]; then ${MBPP_CMD} 2>&1 | tee "${OUTPUT_DIR}/detailed/mbpp_output.log" else ${MBPP_CMD} > "${OUTPUT_DIR}/detailed/mbpp_output.log" 2>&1 fi MBPP_END=$(date +%s) MBPP_TIME=$((MBPP_END - MBPP_START)) if [ -f "${OUTPUT_DIR}/detailed/mbpp_results.json" ]; then PASS_1=$(grep -o '"pass_at_1": [0-9.]*' "${OUTPUT_DIR}/detailed/mbpp_results.json" | cut -d':' -f2) PASS_10=$(grep -o '"pass_at_10": [0-9.]*' "${OUTPUT_DIR}/detailed/mbpp_results.json" | cut -d':' -f2) BENCHMARK_RESULTS["mbpp_pass1"]="${PASS_1}" BENCHMARK_RESULTS["mbpp_pass10"]="${PASS_10}" log_success "MBPP: Pass@1=${PASS_1}, Pass@10=${PASS_10} (${MBPP_TIME}s)" else log_error "MBPP results not found" fi else log_warning "MBPP script not found: ${MBPP_EVAL}" fi # ============================================================================= # Tool Use Evaluation # ============================================================================= section "Tool Use Evaluation" log_info "Running Tool Use evaluation..." log_info "Metrics: Tool Selection Accuracy, Parameter Accuracy, Execution Success" TOOL_START=$(date +%s) if [ -f "${TOOL_EVAL}" ]; then TOOL_CMD="${PYTHON} ${TOOL_EVAL} --model ${MODEL} --output ${OUTPUT_DIR}/detailed" if [ -n "${SAMPLE_SIZE}" ]; then TOOL_CMD="${TOOL_CMD} --sample ${SAMPLE_SIZE}" fi if [ -n "${VERBOSE}" ]; then ${TOOL_CMD} 2>&1 | tee "${OUTPUT_DIR}/detailed/tool_output.log" else ${TOOL_CMD} > "${OUTPUT_DIR}/detailed/tool_output.log" 2>&1 fi TOOL_END=$(date +%s) TOOL_TIME=$((TOOL_END - TOOL_START)) if [ -f "${OUTPUT_DIR}/detailed/tool_use_results.json" ]; then TOOL_ACC=$(grep -o '"tool_selection_accuracy": [0-9.]*' "${OUTPUT_DIR}/detailed/tool_use_results.json" | cut -d':' -f2) PARAM_ACC=$(grep -o '"parameter_accuracy": [0-9.]*' "${OUTPUT_DIR}/detailed/tool_use_results.json" | cut -d':' -f2) EXEC_RATE=$(grep -o '"execution_success_rate": [0-9.]*' "${OUTPUT_DIR}/detailed/tool_use_results.json" | cut -d':' -f2) BENCHMARK_RESULTS["tool_selection_accuracy"]="${TOOL_ACC}" BENCHMARK_RESULTS["parameter_accuracy"]="${PARAM_ACC}" BENCHMARK_RESULTS["execution_success_rate"]="${EXEC_RATE}" log_success "Tool Use: Selection=${TOOL_ACC}, Param=${PARAM_ACC}, Exec=${EXEC_RATE} (${TOOL_TIME}s)" else log_error "Tool Use results not found" fi else log_warning "Tool Use script not found: ${TOOL_EVAL}" fi # ============================================================================= # Self-Improvement Evaluation # ============================================================================= if [ -z "${SKIP_SLOW}" ]; then section "Self-Improvement Evaluation" log_info "Running Self-Improvement evaluation..." log_info "Metrics: Memory Retention, Pattern Application, Improvement Rate" SELF_IMPROVE_START=$(date +%s) if [ -f "${SELF_IMPROVE_EVAL}" ]; then SELF_CMD="${PYTHON} ${SELF_IMPROVE_EVAL} --model ${MODEL} --output ${OUTPUT_DIR}/detailed" if [ -n "${VERBOSE}" ]; then ${SELF_CMD} 2>&1 | tee "${OUTPUT_DIR}/detailed/self_improve_output.log" else ${SELF_CMD} > "${OUTPUT_DIR}/detailed/self_improve_output.log" 2>&1 fi SELF_IMPROVE_END=$(date +%s) SELF_TIME=$((SELF_IMPROVE_END - SELF_IMPROVE_START)) if [ -f "${OUTPUT_DIR}/detailed/self_improve_results.json" ]; then MEM_RET=$(grep -o '"memory_retention_rate": [0-9.]*' "${OUTPUT_DIR}/detailed/self_improve_results.json" | cut -d':' -f2) PATTERN_ACC=$(grep -o '"pattern_application_accuracy": [0-9.]*' "${OUTPUT_DIR}/detailed/self_improve_results.json" | cut -d':' -f2) IMPROVE_RATE=$(grep -o '"improvement_rate": [0-9.]*' "${OUTPUT_DIR}/detailed/self_improve_results.json" | cut -d':' -f2) BENCHMARK_RESULTS["memory_retention"]="${MEM_RET}" BENCHMARK_RESULTS["pattern_accuracy"]="${PATTERN_ACC}" BENCHMARK_RESULTS["improvement_rate"]="${IMPROVE_RATE}" log_success "Self-Improve: Memory=${MEM_RET}, Pattern=${PATTERN_ACC}, Improve=${IMPROVE_RATE} (${SELF_TIME}s)" else log_error "Self-Improvement results not found" fi else log_warning "Self-Improvement script not found: ${SELF_IMPROVE_EVAL}" fi else log_info "Skipping Self-Improvement evaluation (--skip-slow)" fi # ============================================================================= # Generate Dashboard # ============================================================================= section "Generating Dashboard" log_info "Creating visualization dashboard..." if [ -f "${DASHBOARD}" ]; then ${PYTHON} "${DASHBOARD}" --results-dir "${OUTPUT_DIR}/detailed" --output "${OUTPUT_DIR}" 2>&1 | tee "${OUTPUT_DIR}/detailed/dashboard_output.log" log_success "Dashboard generated at ${OUTPUT_DIR}/dashboard.html" else log_warning "Dashboard script not found: ${DASHBOARD}" fi # ============================================================================= # Generate Summary Report # ============================================================================= section "Summary Report" TOTAL_TIME=$(($(date +%s) - START_TIME)) echo "" echo "==============================================================================" echo "BENCHMARK RESULTS SUMMARY" echo "==============================================================================" echo "" echo "Model: ${MODEL}" echo "Evaluation Date: $(date '+%Y-%m-%d %H:%M:%S')" echo "Total Time: ${TOTAL_TIME}s" echo "" echo "------------------------------------------------------------------------------" echo "CODE GENERATION BENCHMARKS" echo "------------------------------------------------------------------------------" printf "%-20s %-15s %-15s\n" "Benchmark" "Pass@1" "Pass@10" echo "------------------------------------------------------------------------------" printf "%-20s %-15s %-15s\n" "HumanEval" "${BENCHMARK_RESULTS[humaneval_pass1]:-N/A}" "${BENCHMARK_RESULTS[humaneval_pass10]:-N/A}" printf "%-20s %-15s %-15s\n" "MBPP" "${BENCHMARK_RESULTS[mbpp_pass1]:-N/A}" "${BENCHMARK_RESULTS[mbpp_pass10]:-N/A}" echo "" echo "------------------------------------------------------------------------------" echo "TOOL USE CAPABILITIES" echo "------------------------------------------------------------------------------" printf "%-25s %-15s\n" "Metric" "Value" echo "------------------------------------------------------------------------------" printf "%-25s %-15s\n" "Tool Selection Accuracy" "${BENCHMARK_RESULTS[tool_selection_accuracy]:-N/A}" printf "%-25s %-15s\n" "Parameter Accuracy" "${BENCHMARK_RESULTS[parameter_accuracy]:-N/A}" printf "%-25s %-15s\n" "Execution Success Rate" "${BENCHMARK_RESULTS[execution_success_rate]:-N/A}" echo "" echo "------------------------------------------------------------------------------" echo "SELF-IMPROVEMENT CAPABILITIES" echo "------------------------------------------------------------------------------" printf "%-25s %-15s\n" "Metric" "Value" echo "------------------------------------------------------------------------------" printf "%-25s %-15s\n" "Memory Retention Rate" "${BENCHMARK_RESULTS[memory_retention]:-N/A}" printf "%-25s %-15s\n" "Pattern Application Accuracy" "${BENCHMARK_RESULTS[pattern_accuracy]:-N/A}" printf "%-25s %-15s\n" "Improvement Rate" "${BENCHMARK_RESULTS[improvement_rate]:-N/A}" echo "" echo "==============================================================================" # ============================================================================= # Save Summary to JSON # ============================================================================= cat > "${OUTPUT_DIR}/benchmark_summary.json" << EOF { "model": "${MODEL}", "evaluation_date": "$(date '+%Y-%m-%d %H:%M:%S')", "total_time_seconds": ${TOTAL_TIME}, "humaneval": { "pass_at_1": ${BENCHMARK_RESULTS[humaneval_pass1]:-null}, "pass_at_10": ${BENCHMARK_RESULTS[humaneval_pass10]:-null} }, "mbpp": { "pass_at_1": ${BENCHMARK_RESULTS[mbpp_pass1]:-null}, "pass_at_10": ${BENCHMARK_RESULTS[mbpp_pass10]:-null} }, "tool_use": { "tool_selection_accuracy": ${BENCHMARK_RESULTS[tool_selection_accuracy]:-null}, "parameter_accuracy": ${BENCHMARK_RESULTS[parameter_accuracy]:-null}, "execution_success_rate": ${BENCHMARK_RESULTS[execution_success_rate]:-null} }, "self_improvement": { "memory_retention_rate": ${BENCHMARK_RESULTS[memory_retention]:-null}, "pattern_application_accuracy": ${BENCHMARK_RESULTS[pattern_accuracy]:-null}, "improvement_rate": ${BENCHMARK_RESULTS[improvement_rate]:-null} } } EOF log_success "Summary saved to ${OUTPUT_DIR}/benchmark_summary.json" log_success "Detailed results in ${OUTPUT_DIR}/detailed/" echo "" log_success "All benchmarks completed successfully!"