"""Run every benchmark category against a model checkpoint and print the weighted overall score."""

import argparse
import os
import subprocess
import sys

# Make the sibling utils package importable when this script is run directly.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from utils.benchmark_utils import BENCHMARK_CALCULATORS

BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())


def run_benchmark_evaluation(benchmark_name, model_path):
    """Run the eval.py script for a single benchmark category and return its score.

    Each benchmark's eval.py is expected to print a single float score to stdout.
    Returns None if the script is missing, exits with an error, or prints
    something that cannot be parsed as a float.
    """
    # Resolved relative to the current working directory; main() chdirs to the
    # parent of this script's directory before calling this.
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")

    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
        score = float(result.stdout.strip())
        return score
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
        return None
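

# Hypothetical standalone use (the checkpoint path here is illustrative):
#   run_benchmark_evaluation("math_reasoning", "/path/to/checkpoint")
# returns a float score, or None if evaluation/benchmarks/math_reasoning/eval.py
# is missing or fails.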


def calculate_overall_score(benchmark_scores):
    """Calculate the overall performance score as a weighted mean of the individual benchmark scores."""
    valid_scores = [score for score in benchmark_scores.values() if score is not None]
    if not valid_scores:
        return None

    # Reasoning-heavy categories are weighted slightly above 1.0; any benchmark
    # not listed here falls back to a weight of 1.0.
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1,
    }

    weighted_sum = 0
    total_weight = 0

    for benchmark, score in benchmark_scores.items():
        if score is not None:
            weight = weights.get(benchmark, 1.0)
            weighted_sum += score * weight
            total_weight += weight

    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None
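

# Worked example with hypothetical scores: {"math_reasoning": 0.8, "summarization": 0.6}
# gives (0.8 * 1.2 + 0.6 * 1.0) / (1.2 + 1.0) = 1.56 / 2.2 ≈ 0.709.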


def main():
    """
    Run comprehensive evaluation across all benchmark categories.
    Prints the overall weighted score to stdout for compatibility with the existing evaluation system.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the checkpoint path up front so a relative model_path still points
    # at the right place after the chdir below.
    model_path = os.path.abspath(args.model_path)

    # Benchmark scripts are looked up relative to the parent of this script's
    # directory (where evaluation/benchmarks/ lives), so run from there.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}

    for benchmark in BENCHMARK_CATEGORIES:
        score = run_benchmark_evaluation(benchmark, model_path)
        benchmark_scores[benchmark] = score
        if score is not None:
            print(f"{benchmark}: {score}", file=sys.stderr)

    overall_score = calculate_overall_score(benchmark_scores)

    os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    # Only the overall score goes to stdout so callers can capture it directly.
    print(overall_score)


if __name__ == "__main__":
    main()
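
# Example invocation (script and checkpoint paths here are illustrative):
#   python run_all_benchmarks.py ../checkpoints/step_100
# Per-benchmark scores are written to stderr; only the overall weighted score
# is printed to stdout.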