Final_Assignment_Template / evaluate_agent.py
AlanRocha's picture
Update evaluate_agent.py
bd225e4 verified
Raw
History Blame Contribute Delete
4.26 kB
import argparse
import os
import time
from datetime import datetime
import requests
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langfuse import get_client
from langfuse.langchain import CallbackHandler
from agent import build_graph
from regexs import extract_last_ai_text, normalize_gaia_answer, strip_final_answer_prefix
# Base URL of the scoring API; "/questions" and "/submit" are appended to it.
# Any trailing "/" is removed so those f-strings never produce "//".
# NOTE(review): this is read at import time, i.e. before main() calls
# load_dotenv(); a SCORING_API_URL defined only in .env is seen here only if
# something imported earlier already loaded it - confirm.
BASE = os.getenv("SCORING_API_URL", "https://agents-course-unit4-scoring.hf.space").rstrip("/")
def file_name(item: dict) -> str:
    """Return the item's ``file_name`` value with surrounding whitespace removed.

    A missing key or a falsy value (``None``, ``""``) yields the empty string.
    """
    raw_name = item.get("file_name")
    if not raw_name:
        return ""
    return raw_name.strip()
def has_file(item: dict) -> bool:
    """Tell whether the item carries a non-blank ``file_name``."""
    attached = file_name(item)
    return True if attached else False
# Marker searched for (substring match) in the text of an exception raised by
# the agent; main() uses it to choose which retry message to print.
TOOL_USE_FAILED = os.getenv("TOOL_USE_FAILED", "tool_use_failed")
# Number of additional attempts per question: main() tries MAX_RETRIES + 1 times.
MAX_RETRIES = int(os.getenv("AGENT_MAX_RETRIES", "2"))
# Absolute path of the "gaia_files" directory located next to this script;
# attachment names are joined onto it in answer().
_GAIA_FILES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "gaia_files")
def answer(graph, item: dict, cfg: dict) -> str:
    """Run the agent graph on one question and return its cleaned-up answer.

    Args:
        graph: Object exposing ``invoke(state, config=...)``; its result must
            contain a ``"messages"`` entry.
        item: Question record; ``item["question"]`` is required, and a
            non-blank ``file_name`` adds a ``file_path`` line to the prompt.
        cfg: Configuration passed through to ``graph.invoke``.

    Returns:
        The last AI text, passed through ``normalize_gaia_answer`` and then
        ``strip_final_answer_prefix``.
    """
    prompt = f"question: {item['question']}"
    attached = file_name(item)
    if attached:
        # Point the agent at the local copy of the attachment.
        prompt += f"\nfile_path: {os.path.join(_GAIA_FILES_DIR, attached)}"

    state = graph.invoke({"messages": [HumanMessage(content=prompt)]}, config=cfg)
    last_text = extract_last_ai_text(state["messages"])
    normalized = normalize_gaia_answer(last_text)
    return strip_final_answer_prefix(normalized)
# Attachment extensions that main() lets through to the agent; a question with
# any other attachment gets a placeholder answer instead.
_SUPPORTED_ATTACHMENT_SUFFIXES = (".py", ".xlsx", ".mp3")


def _has_unsupported_file(item: dict) -> bool:
    """Return True when the item has an attachment whose extension is not supported.

    The extension is matched as a case-insensitive suffix, so names such as
    "notes.pyc" or "archive.py.zip" are not mistaken for supported files.
    """
    if not has_file(item):
        return False
    return not file_name(item).lower().endswith(_SUPPORTED_ATTACHMENT_SUFFIXES)


def _answer_with_retries(graph, item: dict, cfg: dict, pause: float) -> str:
    """Call answer() up to MAX_RETRIES + 1 times; return a fallback text if all fail.

    Every exception triggers another attempt while attempts remain. ``pause``
    seconds are slept between attempts only, never after the last one.
    """
    total_attempts = MAX_RETRIES + 1
    for attempt in range(1, total_attempts + 1):
        try:
            return answer(graph, item, cfg)
        except Exception as e:  # boundary: one failing question must not abort the run
            will_retry = attempt < total_attempts
            if TOOL_USE_FAILED in str(e) and will_retry:
                print(f" tentative {attempt}/{total_attempts} : erreur tool calling, retry…")
            else:
                # Also used for the final attempt, so the exception text is always shown once.
                print(f" tentative {attempt}/{total_attempts} : erreur non-tool-calling : {e}")
            if will_retry:
                time.sleep(pause)
    return "error when calling the agent"


def _submit_answers(answers: list) -> None:
    """POST the collected answers to ``{BASE}/submit`` and print the outcome.

    Raises:
        KeyError: if HF_USERNAME or AGENT_CODE_URL is not set in the environment.
    """
    resp = requests.post(
        f"{BASE}/submit",
        json={
            "username": os.environ["HF_USERNAME"],
            "agent_code": os.environ["AGENT_CODE_URL"],
            "answers": answers,
        },
        timeout=120,
    )
    try:
        r = resp.json()
    except requests.JSONDecodeError:
        print(f"Submit failed: HTTP {resp.status_code}\n{resp.text[:500]}")
        return
    if resp.status_code != 200 or "score" not in r:
        print(f"Submit error (HTTP {resp.status_code}): {r}")
        return
    print( f"SCORE: {r['score']}% ({r['correct_count']}/{r['total_attempted']})")
    if msg := r.get("message"):
        print(msg)


def main() -> None:
    """Fetch the questions, answer each with the agent graph, and submit the answers.

    Command-line options:
        --limit N   keep only the first N questions (0, the default, keeps all).
        --sleep S   seconds to wait between questions and between retries
                    (default: GROQ_EVAL_SLEEP_SECONDS, or 2).

    Environment variables read here: LANGGRAPH_RECURSION_LIMIT,
    TRACE_WITH_LANGFUSE, HF_USERNAME, AGENT_CODE_URL.

    Raises:
        requests.HTTPError: if the questions endpoint answers with an error status.
        KeyError: if HF_USERNAME or AGENT_CODE_URL is missing at submit time.
    """
    load_dotenv()
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=0)
    p.add_argument("--sleep", type=float, default=float(os.getenv("GROQ_EVAL_SLEEP_SECONDS", "2")))
    args = p.parse_args()

    questions_resp = requests.get(f"{BASE}/questions", timeout=30)
    # Fail here with a clear HTTP error rather than later on an unexpected payload.
    questions_resp.raise_for_status()
    questions = questions_resp.json()
    if args.limit:
        questions = questions[: args.limit]

    graph = build_graph()
    base_cfg = {"recursion_limit": int(os.getenv("LANGGRAPH_RECURSION_LIMIT", "80"))}
    answers = []
    run_id = datetime.now().strftime("run_%Y-%m-%d_%H-%M")
    lf = CallbackHandler() if os.getenv("TRACE_WITH_LANGFUSE") else None

    try:
        for i, item in enumerate(questions, 1):
            print(f"[{i}/{len(questions)}] {item['task_id'][:8]}…")
            if _has_unsupported_file(item):
                answers.append({"task_id": item["task_id"], "submitted_answer": "has file, not processed yet"})
                print(f"la question {i} a un fichier ({file_name(item)!r}), donc non traitée.")
                continue

            if lf:
                # Group every question of this run under one Langfuse session.
                cfg = {
                    **base_cfg,
                    "callbacks": [lf],
                    "run_name": f"{run_id} question {i:02d}",
                    "metadata": {"langfuse_session_id": run_id},
                }
            else:
                cfg = base_cfg

            result = _answer_with_retries(graph, item, cfg, args.sleep)
            answers.append({"task_id": item["task_id"], "submitted_answer": result})
            if args.sleep and i < len(questions):
                time.sleep(args.sleep)

        for i, ans in enumerate(answers, 1):
            print(f"[{i}/{len(answers)}], {ans['submitted_answer']} \n")

        _submit_answers(answers)
    finally:
        # Flush on every exit path (failed submit, exception), not only on success.
        if lf:
            get_client().flush()
# Run the evaluation only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()