"""Run PrimoGreedy evaluators against the golden dataset.

Usage:
    python scripts/run_evals.py [--dataset primogreedy-golden-v1] [--experiment sprint9]

Runs all evaluators from ``scripts/evaluators.py`` against the specified
LangSmith dataset and posts results to the LangSmith Experiments dashboard.
"""
import argparse
import os
import sys
from dotenv import load_dotenv
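
# Load environment variables (e.g. LANGCHAIN_API_KEY / LANGSMITH_API_KEY) from a
# local .env file, if one is present.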
load_dotenv()


def _passthrough_predictor(inputs: dict) -> dict:
    """Identity function — we evaluate stored outputs, not re-run the pipeline."""
    return inputs


def run_evaluation(dataset_name: str = "primogreedy-golden-v1", experiment_prefix: str = "sprint9"):
    api_key = os.getenv("LANGCHAIN_API_KEY") or os.getenv("LANGSMITH_API_KEY")
    if not api_key:
        print("ERROR: Set LANGCHAIN_API_KEY or LANGSMITH_API_KEY env var.")
        sys.exit(1)
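
    # Import langsmith lazily so a missing install produces the friendly error
    # message below rather than an ImportError before the API-key check runs.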
    try:
        from langsmith import Client, evaluate
    except ImportError:
        print("ERROR: langsmith package not installed. pip install langsmith")
        sys.exit(1)
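
    # ALL_EVALUATORS is the collection of evaluators defined in scripts/evaluators.py.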
    from scripts.evaluators import ALL_EVALUATORS

    client = Client(api_key=api_key)
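
    # Confirm the golden dataset exists before running, so the failure message is clear.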
    try:
        client.read_dataset(dataset_name=dataset_name)
    except Exception:
        print(f"ERROR: Dataset '{dataset_name}' not found.")
        print("Run 'python scripts/build_golden_dataset.py' first.")
        sys.exit(1)

    print(f"Running {len(ALL_EVALUATORS)} evaluators against '{dataset_name}'...")
    print(f"Experiment prefix: {experiment_prefix}")
    print()
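
    # evaluate() runs the target over every example in the dataset, applies each
    # evaluator to the outputs, and posts the scores to LangSmith under the given
    # experiment prefix. The passthrough target means the stored outputs are
    # scored rather than re-generated by the pipeline.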
    results = evaluate(
        _passthrough_predictor,
        data=dataset_name,
        evaluators=ALL_EVALUATORS,
        experiment_prefix=experiment_prefix,
        client=client,
    )

    print("\n--- Evaluation Complete ---")
    print(f"Results posted to LangSmith under experiment prefix: {experiment_prefix}")
    print("View detailed results at: https://smith.langchain.com")
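
    # Best-effort local summary: the exact shape of each result item varies across
    # langsmith versions, so fall back to the dashboard link above if the structure
    # is not what we expect.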
    try:
        for result in results:
            example_id = result.get("example_id", "?")
            scores = result.get("evaluation_results", {})
            print(f"\n Example {example_id}:")
            if isinstance(scores, dict):
                for key, val in scores.items():
                    score = val.get("score", "?") if isinstance(val, dict) else val
                    print(f" {key}: {score}")
            elif isinstance(scores, list):
                for s in scores:
                    key = s.get("key", "?") if isinstance(s, dict) else "?"
                    score = s.get("score", "?") if isinstance(s, dict) else s
                    print(f" {key}: {score}")
    except Exception:
        print(" (Results streamed to LangSmith — check the dashboard for details)")
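

# CLI entry point: the flags mirror the usage line in the module docstring.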
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run PrimoGreedy evaluations")
    parser.add_argument("--dataset", default="primogreedy-golden-v1")
    parser.add_argument("--experiment", default="sprint9")
    args = parser.parse_args()
    run_evaluation(dataset_name=args.dataset, experiment_prefix=args.experiment)