import argparse
import os
import sys
import subprocess

# Add utils to path
sys.path.insert(0, os.path.dirname(__file__))

from utils.benchmark_utils import BENCHMARK_CALCULATORS

# List of all benchmark categories
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())
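
# Note: BENCHMARK_CALCULATORS is assumed to be a dict defined in
# utils/benchmark_utils.py that maps each benchmark category name
# (e.g. "math_reasoning") to its score-calculation helper; only its keys are
# used here to enumerate the categories to run.
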
def run_benchmark_evaluation(benchmark_name, model_path):
    """Run evaluation for a specific benchmark category."""
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")
    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None
    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
        score = float(result.stdout.strip())
        return score
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
        return None
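
# The subprocess call above defines the contract for per-benchmark scripts: each
# evaluation/benchmarks/<name>/eval.py receives the model path as its only
# argument and must print a single float score to stdout (anything else on
# stdout makes the float() parse fail). A minimal sketch of such a script,
# assuming a hypothetical compute_score() helper, could look like:
#
#     import sys
#
#     def compute_score(model_path):
#         # benchmark-specific scoring for the checkpoint at model_path
#         return 0.0
#
#     if __name__ == "__main__":
#         print(compute_score(sys.argv[1]))  # stdout must hold only the score
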
def calculate_overall_score(benchmark_scores):
    """Calculate the overall performance score from individual benchmarks."""
    valid_scores = [score for score in benchmark_scores.values() if score is not None]
    if not valid_scores:
        return None

    # Weighted average with slight emphasis on reasoning tasks
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1,
    }

    weighted_sum = 0
    total_weight = 0
    for benchmark, score in benchmark_scores.items():
        if score is not None:
            weight = weights.get(benchmark, 1.0)
            weighted_sum += score * weight
            total_weight += weight

    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None
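
# Worked example (hypothetical scores, for illustration only): with
# {"math_reasoning": 0.8, "sentiment_analysis": 0.6} the weighted sum is
# 0.8 * 1.2 + 0.6 * 0.9 = 1.50, the total weight is 1.2 + 0.9 = 2.1, and the
# overall score is round(1.50 / 2.1, 3) == 0.714.
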
def main():
    """
    Run comprehensive evaluation across all benchmark categories.

    Prints the overall weighted score for compatibility with the existing
    evaluation system.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    # Check if the provided path is a directory
    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the model path before changing directories so a relative path
    # still points at the right checkpoint after the chdir below.
    model_path = os.path.abspath(args.model_path)

    # Change to the repository root (the parent of this script's directory) so
    # the relative "evaluation/benchmarks/..." paths resolve correctly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}

    # Run evaluation for each benchmark category
    for benchmark in BENCHMARK_CATEGORIES:
        score = run_benchmark_evaluation(benchmark, model_path)
        benchmark_scores[benchmark] = score
        if score is not None:
            print(f"{benchmark}: {score}", file=sys.stderr)

    # Calculate overall score
    overall_score = calculate_overall_score(benchmark_scores)

    # Restore the original working directory
    os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    # Print only the overall score for compatibility with the existing evaluation pipeline
    print(overall_score)


if __name__ == "__main__":
    main()
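
# Example invocation, run from the evaluation/ directory with a hypothetical
# checkpoint directory:
#     python eval.py ../checkpoints/step_100
# Per-benchmark scores go to stderr; only the overall weighted score is printed
# to stdout so the existing evaluation pipeline can consume it directly.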