# Source: Stack-2-9-finetuned/stack/eval/run_all_benchmarks.sh
# (GitHub page residue converted to comments: author "walidsobhie-code",
#  commit "refactor: Squeeze folders further - cleaner structure", 65888d5.
#  NOTE(review): these lines precede the shebang; drop them entirely so the
#  `#!/bin/bash` on the next line is honored when the script is executed directly.)
#!/bin/bash
# =============================================================================
# Stack 2.9 Full Benchmark Evaluation Suite
# =============================================================================
# Runs all benchmarks and generates comprehensive evaluation report.
#
# Usage:
# ./run_all_benchmarks.sh [OPTIONS]
#
# Options:
# --model MODEL Model name to evaluate (default: stack-2.9)
# --output DIR Output directory (default: ./results)
# --skip-slow Skip slow benchmarks
# --sample-size N Use N samples per benchmark (default: all)
# --verbose Verbose output
#
# =============================================================================
# Fail fast: abort on command errors (-e), unset variables (-u), and failures
# anywhere in a pipeline (-o pipefail, so `cmd | tee log` cannot mask cmd's
# exit status in the verbose branches below).
set -euo pipefail

# Configuration (environment variables override the defaults; MODEL and
# OUTPUT_DIR may also be overridden by command-line flags, so no readonly).
MODEL="${MODEL:-stack-2.9}"
OUTPUT_DIR="${OUTPUT_DIR:-./results}"
SAMPLE_SIZE=""
SKIP_SLOW=""
VERBOSE=""
# Reassigned by check_python() below, so not readonly either.
PYTHON="${PYTHON:-python3}"

# ANSI color codes for log output.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Benchmark scripts, resolved relative to this script's own directory.
# Declaration and readonly are split so a failed command substitution is not
# masked by the readonly builtin's exit status.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly HUMAN_EVAL="${SCRIPT_DIR}/human_eval.py"
readonly MBPP_EVAL="${SCRIPT_DIR}/mbpp_eval.py"
readonly TOOL_EVAL="${SCRIPT_DIR}/tool_use_eval.py"
readonly SELF_IMPROVE_EVAL="${SCRIPT_DIR}/self_improve_eval.py"
readonly DASHBOARD="${SCRIPT_DIR}/results/dashboard.py"
# =============================================================================
# Helper Functions
# =============================================================================
# Print an informational message tagged in blue.
# printf '%b' interprets the embedded ANSI escapes like `echo -e` does.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success message tagged in green.
# printf '%b' interprets the embedded ANSI escapes like `echo -e` does.
log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print a warning message tagged in yellow.
# printf '%b' interprets the embedded ANSI escapes like `echo -e` does.
log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Print an error message tagged in red, to STDERR — so failures stay visible
# even when a caller redirects stdout into a log file (as the benchmark
# sections below do).
log_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Print a section heading framed by '=' rules, preceded by a blank line.
section() {
    local rule="=============================================================================="
    printf '\n%s\n%s\n%s\n' "${rule}" "$1" "${rule}"
}
# =============================================================================
# Parse Arguments
# =============================================================================
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            # Value-taking options must not silently consume the next flag
            # (or die with a cryptic `shift 2` failure) when the value is missing.
            [ -n "${2:-}" ] || { log_error "--model requires a value"; exit 1; }
            MODEL="$2"
            shift 2
            ;;
        --output)
            [ -n "${2:-}" ] || { log_error "--output requires a value"; exit 1; }
            OUTPUT_DIR="$2"
            shift 2
            ;;
        --skip-slow)
            SKIP_SLOW="1"
            shift
            ;;
        --sample-size)
            [ -n "${2:-}" ] || { log_error "--sample-size requires a value"; exit 1; }
            SAMPLE_SIZE="$2"
            shift 2
            ;;
        --verbose)
            VERBOSE="1"
            shift
            ;;
        --help)
            echo "Stack 2.9 Full Benchmark Suite"
            echo ""
            echo "Usage: $0 [OPTIONS]"
            echo ""
            echo "Options:"
            echo " --model MODEL Model name (default: stack-2.9)"
            echo " --output DIR Output directory (default: ./results)"
            echo " --skip-slow Skip slow benchmarks"
            echo " --sample-size N Sample size for each benchmark"
            echo " --verbose Verbose output"
            echo " --help Show this help message"
            exit 0
            ;;
        *)
            log_error "Unknown option: $1"
            exit 1
            ;;
    esac
done
# =============================================================================
# Setup
# =============================================================================
log_info "Stack 2.9 Benchmark Suite"
log_info "Model: ${MODEL}"
log_info "Output: ${OUTPUT_DIR}"
echo ""

# Make sure the output tree exists before any benchmark tries to write to it.
mkdir -p "${OUTPUT_DIR}" "${OUTPUT_DIR}/detailed"

# Wall-clock start, used for the total-time figure in the summary report.
START_TIME=$(date +%s)

# Metric name -> value, filled in by each benchmark section below.
declare -A BENCHMARK_RESULTS
# =============================================================================
# Check Dependencies
# =============================================================================
# Heading for the interpreter/stdlib probes performed just below.
section "Checking Dependencies"
#######################################
# Locate a usable Python interpreter and store its name in PYTHON.
# Honors a caller-supplied PYTHON (set via environment, defaulted at the top
# of this script) when it resolves to a real command — the original version
# unconditionally clobbered it — then falls back to python3, then python.
# Globals:   PYTHON (read, written)
# Outputs:   success line with the interpreter version
# Returns:   exits 1 when no interpreter is found
#######################################
check_python() {
    if [ -n "${PYTHON:-}" ] && command -v "${PYTHON}" &> /dev/null; then
        : # keep the caller-supplied interpreter
    elif command -v python3 &> /dev/null; then
        PYTHON="python3"
    elif command -v python &> /dev/null; then
        PYTHON="python"
    else
        log_error "Python not found!"
        exit 1
    fi
    log_success "Python: $(${PYTHON} --version)"
}
#######################################
# Probe that the Python stdlib modules used by the eval scripts import cleanly.
# Only warns (does not abort) on a miss, since every probed module ships with
# the standard library and a failure usually means a broken installation.
# Globals:   PYTHON (read)
#######################################
check_dependencies() {
    log_info "Checking Python dependencies..."
    local required=("json" "datetime" "pathlib" "argparse")
    local missing=""
    local mod
    for mod in "${required[@]}"; do
        ${PYTHON} -c "import ${mod}" &> /dev/null || missing="${missing} ${mod}"
    done
    if [ -n "${missing}" ]; then
        log_warning "Missing modules:${missing}"
        log_info "These are standard library modules and should be available."
    fi
    log_success "Dependencies OK"
}
# Resolve the interpreter first so the dependency probe can use ${PYTHON}.
check_python
check_dependencies
# =============================================================================
# HumanEval Benchmark
# =============================================================================
section "HumanEval Benchmark"
log_info "Running HumanEval benchmark..."
log_info "Metrics: Pass@1, Pass@10, Pass@100"

HUMAN_EVAL_START=$(date +%s)
if [ -f "${HUMAN_EVAL}" ]; then
    # Build the command as an array so paths/model names containing spaces
    # survive intact (the old whitespace-split string broke on them).
    HUMAN_EVAL_CMD=("${PYTHON}" "${HUMAN_EVAL}" --model "${MODEL}" --output "${OUTPUT_DIR}/detailed")
    # NOTE(review): human_eval.py does not support --sample-size, so
    # SAMPLE_SIZE is intentionally ignored for this benchmark.
    if [ -n "${VERBOSE}" ]; then
        "${HUMAN_EVAL_CMD[@]}" 2>&1 | tee "${OUTPUT_DIR}/detailed/humaneval_output.log"
        # Propagate the eval script's status, not tee's (tee always exits 0).
        [ "${PIPESTATUS[0]}" -eq 0 ]
    else
        "${HUMAN_EVAL_CMD[@]}" > "${OUTPUT_DIR}/detailed/humaneval_output.log" 2>&1
    fi
    HUMAN_EVAL_END=$(date +%s)
    HUMAN_EVAL_TIME=$((HUMAN_EVAL_END - HUMAN_EVAL_START))

    if [ -f "${OUTPUT_DIR}/detailed/humaneval_results.json" ]; then
        # Extract metrics from the results JSON. `|| true` keeps a missing
        # key from aborting the whole suite under `set -e`/pipefail.
        PASS_1=$(grep -o '"pass_at_1": [0-9.]*' "${OUTPUT_DIR}/detailed/humaneval_results.json" | cut -d':' -f2 || true)
        PASS_10=$(grep -o '"pass_at_10": [0-9.]*' "${OUTPUT_DIR}/detailed/humaneval_results.json" | cut -d':' -f2 || true)
        BENCHMARK_RESULTS["humaneval_pass1"]="${PASS_1}"
        BENCHMARK_RESULTS["humaneval_pass10"]="${PASS_10}"
        log_success "HumanEval: Pass@1=${PASS_1}, Pass@10=${PASS_10} (${HUMAN_EVAL_TIME}s)"
    else
        log_error "HumanEval results not found"
    fi
else
    log_warning "HumanEval script not found: ${HUMAN_EVAL}"
fi
# =============================================================================
# MBPP Benchmark
# =============================================================================
section "MBPP Benchmark"
log_info "Running MBPP benchmark..."
log_info "Metrics: Pass@1, Pass@10"

MBPP_START=$(date +%s)
if [ -f "${MBPP_EVAL}" ]; then
    # Build the command as an array so paths/model names with spaces survive.
    MBPP_CMD=("${PYTHON}" "${MBPP_EVAL}" --model "${MODEL}" --output "${OUTPUT_DIR}/detailed")
    if [ -n "${VERBOSE}" ]; then
        "${MBPP_CMD[@]}" 2>&1 | tee "${OUTPUT_DIR}/detailed/mbpp_output.log"
        # Propagate the eval script's status, not tee's (tee always exits 0).
        [ "${PIPESTATUS[0]}" -eq 0 ]
    else
        "${MBPP_CMD[@]}" > "${OUTPUT_DIR}/detailed/mbpp_output.log" 2>&1
    fi
    MBPP_END=$(date +%s)
    MBPP_TIME=$((MBPP_END - MBPP_START))

    if [ -f "${OUTPUT_DIR}/detailed/mbpp_results.json" ]; then
        # `|| true`: a missing key must not abort the suite under set -e/pipefail.
        PASS_1=$(grep -o '"pass_at_1": [0-9.]*' "${OUTPUT_DIR}/detailed/mbpp_results.json" | cut -d':' -f2 || true)
        PASS_10=$(grep -o '"pass_at_10": [0-9.]*' "${OUTPUT_DIR}/detailed/mbpp_results.json" | cut -d':' -f2 || true)
        BENCHMARK_RESULTS["mbpp_pass1"]="${PASS_1}"
        BENCHMARK_RESULTS["mbpp_pass10"]="${PASS_10}"
        log_success "MBPP: Pass@1=${PASS_1}, Pass@10=${PASS_10} (${MBPP_TIME}s)"
    else
        log_error "MBPP results not found"
    fi
else
    log_warning "MBPP script not found: ${MBPP_EVAL}"
fi
# =============================================================================
# Tool Use Evaluation
# =============================================================================
section "Tool Use Evaluation"
log_info "Running Tool Use evaluation..."
log_info "Metrics: Tool Selection Accuracy, Parameter Accuracy, Execution Success"

TOOL_START=$(date +%s)
if [ -f "${TOOL_EVAL}" ]; then
    # Build the command as an array so paths/model names with spaces survive;
    # optional flags are appended as array elements, never string-concatenated.
    TOOL_CMD=("${PYTHON}" "${TOOL_EVAL}" --model "${MODEL}" --output "${OUTPUT_DIR}/detailed")
    if [ -n "${SAMPLE_SIZE}" ]; then
        TOOL_CMD+=(--sample "${SAMPLE_SIZE}")
    fi
    if [ -n "${VERBOSE}" ]; then
        "${TOOL_CMD[@]}" 2>&1 | tee "${OUTPUT_DIR}/detailed/tool_output.log"
        # Propagate the eval script's status, not tee's (tee always exits 0).
        [ "${PIPESTATUS[0]}" -eq 0 ]
    else
        "${TOOL_CMD[@]}" > "${OUTPUT_DIR}/detailed/tool_output.log" 2>&1
    fi
    TOOL_END=$(date +%s)
    TOOL_TIME=$((TOOL_END - TOOL_START))

    if [ -f "${OUTPUT_DIR}/detailed/tool_use_results.json" ]; then
        # `|| true`: a missing key must not abort the suite under set -e/pipefail.
        TOOL_ACC=$(grep -o '"tool_selection_accuracy": [0-9.]*' "${OUTPUT_DIR}/detailed/tool_use_results.json" | cut -d':' -f2 || true)
        PARAM_ACC=$(grep -o '"parameter_accuracy": [0-9.]*' "${OUTPUT_DIR}/detailed/tool_use_results.json" | cut -d':' -f2 || true)
        EXEC_RATE=$(grep -o '"execution_success_rate": [0-9.]*' "${OUTPUT_DIR}/detailed/tool_use_results.json" | cut -d':' -f2 || true)
        BENCHMARK_RESULTS["tool_selection_accuracy"]="${TOOL_ACC}"
        BENCHMARK_RESULTS["parameter_accuracy"]="${PARAM_ACC}"
        BENCHMARK_RESULTS["execution_success_rate"]="${EXEC_RATE}"
        log_success "Tool Use: Selection=${TOOL_ACC}, Param=${PARAM_ACC}, Exec=${EXEC_RATE} (${TOOL_TIME}s)"
    else
        log_error "Tool Use results not found"
    fi
else
    log_warning "Tool Use script not found: ${TOOL_EVAL}"
fi
# =============================================================================
# Self-Improvement Evaluation (slow; suppressed by --skip-slow)
# =============================================================================
if [ -z "${SKIP_SLOW}" ]; then
    section "Self-Improvement Evaluation"
    log_info "Running Self-Improvement evaluation..."
    log_info "Metrics: Memory Retention, Pattern Application, Improvement Rate"

    SELF_IMPROVE_START=$(date +%s)
    if [ -f "${SELF_IMPROVE_EVAL}" ]; then
        # Build the command as an array so paths/model names with spaces survive.
        SELF_CMD=("${PYTHON}" "${SELF_IMPROVE_EVAL}" --model "${MODEL}" --output "${OUTPUT_DIR}/detailed")
        if [ -n "${VERBOSE}" ]; then
            "${SELF_CMD[@]}" 2>&1 | tee "${OUTPUT_DIR}/detailed/self_improve_output.log"
            # Propagate the eval script's status, not tee's (tee always exits 0).
            [ "${PIPESTATUS[0]}" -eq 0 ]
        else
            "${SELF_CMD[@]}" > "${OUTPUT_DIR}/detailed/self_improve_output.log" 2>&1
        fi
        SELF_IMPROVE_END=$(date +%s)
        SELF_TIME=$((SELF_IMPROVE_END - SELF_IMPROVE_START))

        if [ -f "${OUTPUT_DIR}/detailed/self_improve_results.json" ]; then
            # `|| true`: a missing key must not abort the suite under set -e/pipefail.
            MEM_RET=$(grep -o '"memory_retention_rate": [0-9.]*' "${OUTPUT_DIR}/detailed/self_improve_results.json" | cut -d':' -f2 || true)
            PATTERN_ACC=$(grep -o '"pattern_application_accuracy": [0-9.]*' "${OUTPUT_DIR}/detailed/self_improve_results.json" | cut -d':' -f2 || true)
            IMPROVE_RATE=$(grep -o '"improvement_rate": [0-9.]*' "${OUTPUT_DIR}/detailed/self_improve_results.json" | cut -d':' -f2 || true)
            BENCHMARK_RESULTS["memory_retention"]="${MEM_RET}"
            BENCHMARK_RESULTS["pattern_accuracy"]="${PATTERN_ACC}"
            BENCHMARK_RESULTS["improvement_rate"]="${IMPROVE_RATE}"
            log_success "Self-Improve: Memory=${MEM_RET}, Pattern=${PATTERN_ACC}, Improve=${IMPROVE_RATE} (${SELF_TIME}s)"
        else
            log_error "Self-Improvement results not found"
        fi
    else
        log_warning "Self-Improvement script not found: ${SELF_IMPROVE_EVAL}"
    fi
else
    log_info "Skipping Self-Improvement evaluation (--skip-slow)"
fi
# =============================================================================
# Generate Dashboard
# =============================================================================
section "Generating Dashboard"
log_info "Creating visualization dashboard..."
if [ -f "${DASHBOARD}" ]; then
    # Honor --verbose like every other section (the old version always tee'd).
    if [ -n "${VERBOSE}" ]; then
        "${PYTHON}" "${DASHBOARD}" --results-dir "${OUTPUT_DIR}/detailed" --output "${OUTPUT_DIR}" 2>&1 | tee "${OUTPUT_DIR}/detailed/dashboard_output.log"
        # Fail on the generator's own status, not tee's (tee always exits 0).
        [ "${PIPESTATUS[0]}" -eq 0 ]
    else
        "${PYTHON}" "${DASHBOARD}" --results-dir "${OUTPUT_DIR}/detailed" --output "${OUTPUT_DIR}" > "${OUTPUT_DIR}/detailed/dashboard_output.log" 2>&1
    fi
    log_success "Dashboard generated at ${OUTPUT_DIR}/dashboard.html"
else
    log_warning "Dashboard script not found: ${DASHBOARD}"
fi
# =============================================================================
# Generate Summary Report
# =============================================================================
section "Summary Report"

# Total wall-clock time for the whole suite (also reused by the JSON summary).
TOTAL_TIME=$(($(date +%s) - START_TIME))

# Local helpers for the two rule styles used throughout the report.
hr()  { echo "------------------------------------------------------------------------------"; }
bar() { echo "=============================================================================="; }

echo ""
bar
echo "BENCHMARK RESULTS SUMMARY"
bar
echo ""
echo "Model: ${MODEL}"
echo "Evaluation Date: $(date '+%Y-%m-%d %H:%M:%S')"
echo "Total Time: ${TOTAL_TIME}s"
echo ""
hr
echo "CODE GENERATION BENCHMARKS"
hr
printf "%-20s %-15s %-15s\n" "Benchmark" "Pass@1" "Pass@10"
hr
# Metrics default to N/A when the corresponding benchmark did not run.
printf "%-20s %-15s %-15s\n" "HumanEval" "${BENCHMARK_RESULTS[humaneval_pass1]:-N/A}" "${BENCHMARK_RESULTS[humaneval_pass10]:-N/A}"
printf "%-20s %-15s %-15s\n" "MBPP" "${BENCHMARK_RESULTS[mbpp_pass1]:-N/A}" "${BENCHMARK_RESULTS[mbpp_pass10]:-N/A}"
echo ""
hr
echo "TOOL USE CAPABILITIES"
hr
printf "%-25s %-15s\n" "Metric" "Value"
hr
printf "%-25s %-15s\n" "Tool Selection Accuracy" "${BENCHMARK_RESULTS[tool_selection_accuracy]:-N/A}"
printf "%-25s %-15s\n" "Parameter Accuracy" "${BENCHMARK_RESULTS[parameter_accuracy]:-N/A}"
printf "%-25s %-15s\n" "Execution Success Rate" "${BENCHMARK_RESULTS[execution_success_rate]:-N/A}"
echo ""
hr
echo "SELF-IMPROVEMENT CAPABILITIES"
hr
printf "%-25s %-15s\n" "Metric" "Value"
hr
printf "%-25s %-15s\n" "Memory Retention Rate" "${BENCHMARK_RESULTS[memory_retention]:-N/A}"
printf "%-25s %-15s\n" "Pattern Application Accuracy" "${BENCHMARK_RESULTS[pattern_accuracy]:-N/A}"
printf "%-25s %-15s\n" "Improvement Rate" "${BENCHMARK_RESULTS[improvement_rate]:-N/A}"
echo ""
bar
# =============================================================================
# Save Summary to JSON
# =============================================================================
# Machine-readable mirror of the console summary. Metric values were captured
# by the grep/cut extractions in the benchmark sections above; a metric whose
# key is unset or empty serializes as JSON null via the ${VAR:-null} fallback.
# NOTE(review): extracted values may carry a leading space from `cut -d':'`,
# which is still valid JSON — confirm downstream consumers tolerate it.
cat > "${OUTPUT_DIR}/benchmark_summary.json" << EOF
{
"model": "${MODEL}",
"evaluation_date": "$(date '+%Y-%m-%d %H:%M:%S')",
"total_time_seconds": ${TOTAL_TIME},
"humaneval": {
"pass_at_1": ${BENCHMARK_RESULTS[humaneval_pass1]:-null},
"pass_at_10": ${BENCHMARK_RESULTS[humaneval_pass10]:-null}
},
"mbpp": {
"pass_at_1": ${BENCHMARK_RESULTS[mbpp_pass1]:-null},
"pass_at_10": ${BENCHMARK_RESULTS[mbpp_pass10]:-null}
},
"tool_use": {
"tool_selection_accuracy": ${BENCHMARK_RESULTS[tool_selection_accuracy]:-null},
"parameter_accuracy": ${BENCHMARK_RESULTS[parameter_accuracy]:-null},
"execution_success_rate": ${BENCHMARK_RESULTS[execution_success_rate]:-null}
},
"self_improvement": {
"memory_retention_rate": ${BENCHMARK_RESULTS[memory_retention]:-null},
"pattern_application_accuracy": ${BENCHMARK_RESULTS[pattern_accuracy]:-null},
"improvement_rate": ${BENCHMARK_RESULTS[improvement_rate]:-null}
}
}
EOF
# Point the user at the artifacts produced above.
log_success "Summary saved to ${OUTPUT_DIR}/benchmark_summary.json"
log_success "Detailed results in ${OUTPUT_DIR}/detailed/"
echo ""
log_success "All benchmarks completed successfully!"