import argparse
import os
import sys
import subprocess

# Add utils to path
sys.path.insert(0, os.path.dirname(__file__))

from utils.benchmark_utils import BENCHMARK_CALCULATORS

# List of all benchmark categories
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())
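
# Note: BENCHMARK_CALCULATORS is assumed to be a dict defined in
# utils/benchmark_utils.py that maps each benchmark category name
# (e.g. "math_reasoning") to its score-calculation helper; only its keys are
# used here to enumerate the categories to run.
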
def run_benchmark_evaluation(benchmark_name, model_path):
    """Run evaluation for a specific benchmark category."""
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")
    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None
    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
        score = float(result.stdout.strip())
        return score
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
        return None
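
# The subprocess call above defines the contract for per-benchmark scripts: each
# evaluation/benchmarks/<name>/eval.py receives the model path as its only
# argument and must print a single float score to stdout (anything else on
# stdout makes the float() parse fail). A minimal sketch of such a script,
# assuming a hypothetical compute_score() helper, could look like:
#
#     import sys
#
#     def compute_score(model_path):
#         # benchmark-specific scoring for the checkpoint at model_path
#         return 0.0
#
#     if __name__ == "__main__":
#         print(compute_score(sys.argv[1]))  # stdout must hold only the score
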
def calculate_overall_score(benchmark_scores):
    """Calculate the overall performance score from individual benchmarks."""
    valid_scores = [score for score in benchmark_scores.values() if score is not None]
    if not valid_scores:
        return None

    # Weighted average with slight emphasis on reasoning tasks
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1,
    }

    weighted_sum = 0
    total_weight = 0
    for benchmark, score in benchmark_scores.items():
        if score is not None:
            weight = weights.get(benchmark, 1.0)
            weighted_sum += score * weight
            total_weight += weight

    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None
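
# Worked example (hypothetical scores, for illustration only): with
# {"math_reasoning": 0.8, "sentiment_analysis": 0.6} the weighted sum is
# 0.8 * 1.2 + 0.6 * 0.9 = 1.50, the total weight is 1.2 + 0.9 = 2.1, and the
# overall score is round(1.50 / 2.1, 3) == 0.714.
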
def main():
    """
    Run comprehensive evaluation across all benchmark categories.

    Prints the overall weighted score for compatibility with the existing
    evaluation system.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    # Check if the provided path is a directory
    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the model path before changing directories so a relative path
    # still points at the right checkpoint after the chdir below.
    model_path = os.path.abspath(args.model_path)

    # Change to the repository root (the parent of this script's directory) so
    # the relative "evaluation/benchmarks/..." paths resolve correctly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}

    # Run evaluation for each benchmark category
    for benchmark in BENCHMARK_CATEGORIES:
        score = run_benchmark_evaluation(benchmark, model_path)
        benchmark_scores[benchmark] = score
        if score is not None:
            print(f"{benchmark}: {score}", file=sys.stderr)

    # Calculate overall score
    overall_score = calculate_overall_score(benchmark_scores)

    # Restore the original working directory
    os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    # Print only the overall score for compatibility with the existing evaluation pipeline
    print(overall_score)


if __name__ == "__main__":
    main()
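
# Example invocation, run from the evaluation/ directory with a hypothetical
# checkpoint directory:
#     python eval.py ../checkpoints/step_100
# Per-benchmark scores go to stderr; only the overall weighted score is printed
# to stdout so the existing evaluation pipeline can consume it directly.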