import argparse
import os
import subprocess
import sys

# Make the local utils package importable regardless of the caller's cwd.
sys.path.insert(0, os.path.dirname(__file__))
from utils.benchmark_utils import BENCHMARK_CALCULATORS

# All benchmark categories are derived from the calculator registry.
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())


def run_benchmark_evaluation(benchmark_name, model_path):
    """Run the eval.py script for a single benchmark category and return its score."""
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")
    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding="utf-8",
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None

    # Each benchmark script is expected to print a single float score to stdout.
    raw_output = result.stdout.strip()
    try:
        return float(raw_output)
    except ValueError:
        print(
            f"Warning: Could not parse score from {benchmark_name}: '{raw_output}'",
            file=sys.stderr,
        )
        return None


def calculate_overall_score(benchmark_scores):
    """Calculate the overall performance score as a weighted average of benchmarks."""
    valid_scores = [score for score in benchmark_scores.values() if score is not None]
    if not valid_scores:
        return None

    # Weighted average with slight emphasis on reasoning tasks; benchmarks not
    # listed here default to a weight of 1.0.
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1,
    }

    weighted_sum = 0.0
    total_weight = 0.0
    for benchmark, score in benchmark_scores.items():
        if score is not None:
            weight = weights.get(benchmark, 1.0)
            weighted_sum += score * weight
            total_weight += weight

    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None


def main():
    """
    Run a comprehensive evaluation across all benchmark categories.

    Prints the overall weighted score to stdout for compatibility with the
    existing evaluation pipeline; per-benchmark scores go to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100).",
    )
    args = parser.parse_args()

    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the model path before changing directories so a relative path
    # supplied by the caller still points at the right place afterwards.
    model_path = os.path.abspath(args.model_path)

    # Run from the parent of this script's directory so the relative
    # "evaluation/benchmarks/..." paths resolve correctly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    try:
        benchmark_scores = {}
        for benchmark in BENCHMARK_CATEGORIES:
            score = run_benchmark_evaluation(benchmark, model_path)
            benchmark_scores[benchmark] = score
            if score is not None:
                print(f"{benchmark}: {score}", file=sys.stderr)

        overall_score = calculate_overall_score(benchmark_scores)
    finally:
        # Restore the original working directory even if an evaluation failed.
        os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {model_path}", file=sys.stderr)
        sys.exit(1)

    # Print only the overall score so downstream tooling can parse stdout.
    print(overall_score)


if __name__ == "__main__":
    main()
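
# ---------------------------------------------------------------------------
# Contract expected of each per-benchmark eval.py (a sketch, not defined by
# this script): it receives the model checkpoint path as its only positional
# argument and prints exactly one float score to stdout, which
# run_benchmark_evaluation() above parses. A minimal hypothetical example:
#
#     # evaluation/benchmarks/<benchmark_name>/eval.py
#     import sys
#
#     def evaluate(model_path: str) -> float:
#         ...  # load the checkpoint, run the benchmark, return a score
#
#     if __name__ == "__main__":
#         print(evaluate(sys.argv[1]))
#
# Any diagnostics a benchmark script emits should go to stderr, since stdout
# is reserved for the score.
# ---------------------------------------------------------------------------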