Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / evaluate_agent.py

AlanRocha

Update evaluate_agent.py

bd225e4 verified 4 days ago

Raw

History Blame Contribute Delete

4.26 kB

	import argparse
	import os
	import time
	from datetime import datetime

	import requests
	from dotenv import load_dotenv
	from langchain_core.messages import HumanMessage
	from langfuse import get_client
	from langfuse.langchain import CallbackHandler

	from agent import build_graph
	from regexs import extract_last_ai_text, normalize_gaia_answer, strip_final_answer_prefix

	# Root URL of the scoring API (SCORING_API_URL override), without trailing slashes
	# so endpoint paths can be appended as f"{BASE}/...".
	# NOTE(review): evaluated at import time, i.e. before main() calls load_dotenv();
	# a value set only in a .env file may not be visible here — confirm.
	BASE = os.environ.get("SCORING_API_URL", "https://agents-course-unit4-scoring.hf.space").rstrip("/")


	def file_name(item: dict) -> str:
	    """Return the item's ``file_name`` value without surrounding whitespace.

	    A missing key or a falsy value (``None``, ``""``) yields ``""``.
	    """
	    raw_name = item.get("file_name")
	    if not raw_name:
	        return ""
	    return raw_name.strip()


	def has_file(item: dict) -> bool:
	    """Tell whether ``file_name(item)`` is non-empty, i.e. the item names a file."""
	    attached_name = file_name(item)
	    return True if attached_name else False


	# Substring searched for in an exception's text to recognise a tool-calling failure.
	TOOL_USE_FAILED = os.environ.get("TOOL_USE_FAILED", "tool_use_failed")
	# Extra attempts allowed after the first one (the retry loop runs MAX_RETRIES + 1 times).
	MAX_RETRIES = int(os.environ.get("AGENT_MAX_RETRIES", "2"))
	# NOTE(review): both are read at import time, i.e. before main() calls load_dotenv();
	# values set only in a .env file may not be visible here — confirm.


	# Absolute path of the "gaia_files" directory located next to this module.
	_GAIA_FILES_DIR = os.path.join(os.path.split(os.path.abspath(__file__))[0], "gaia_files")


	def answer(graph, item: dict, cfg: dict) -> str:
	    """Ask the graph one question and return its post-processed final text.

	    The prompt is a single human message: a ``question: ...`` line, followed
	    by a ``file_path: ...`` line when the item names a file (the path is built
	    under ``_GAIA_FILES_DIR``).

	    Args:
	        graph: Object exposing ``invoke(state, config=...)``.
	        item: Question record; ``item["question"]`` is required.
	        cfg: Configuration passed through unchanged as ``config``.

	    Returns:
	        The result of ``extract_last_ai_text`` on the output's ``"messages"``,
	        run through ``normalize_gaia_answer`` and then
	        ``strip_final_answer_prefix``.
	    """
	    prompt_lines = [f"question: {item['question']}"]
	    attachment = file_name(item)
	    if attachment:
	        prompt_lines.append(f"file_path: {os.path.join(_GAIA_FILES_DIR, attachment)}")
	    prompt = "\n".join(prompt_lines)

	    state = graph.invoke({"messages": [HumanMessage(content=prompt)]}, config=cfg)
	    last_text = extract_last_ai_text(state["messages"])
	    normalized = normalize_gaia_answer(last_text)
	    return strip_final_answer_prefix(normalized)


	def main() -> None:
	    """Fetch the questions, run the agent on each one, and submit the answers.

	    Command-line flags:
	        --limit  Keep only the first N questions (0, the default, keeps all).
	        --sleep  Seconds to pause between questions and between retries
	                 (default: ``GROQ_EVAL_SLEEP_SECONDS`` or 2).

	    Environment read here (after ``load_dotenv()``): ``LANGGRAPH_RECURSION_LIMIT``,
	    ``TRACE_WITH_LANGFUSE``, ``HF_USERNAME`` and ``AGENT_CODE_URL``.

	    Items whose file name does not end in ``.py``, ``.xlsx`` or ``.mp3`` are
	    not sent to the agent; a placeholder answer is submitted for them.

	    Raises:
	        requests.HTTPError: if the questions endpoint answers with an error status.
	        KeyError: if ``HF_USERNAME`` or ``AGENT_CODE_URL`` is unset at submit time.
	    """
	    load_dotenv()
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--limit", type=int, default=0)
	    parser.add_argument("--sleep", type=float, default=float(os.getenv("GROQ_EVAL_SLEEP_SECONDS", "2")))
	    args = parser.parse_args()

	    # Fail loudly on an HTTP error instead of iterating over an error payload.
	    questions_resp = requests.get(f"{BASE}/questions", timeout=30)
	    questions_resp.raise_for_status()
	    questions = questions_resp.json()
	    if args.limit:
	        questions = questions[: args.limit]

	    graph = build_graph()
	    base_cfg = {"recursion_limit": int(os.getenv("LANGGRAPH_RECURSION_LIMIT", "80"))}
	    answers = []
	    run_id = datetime.now().strftime("run_%Y-%m-%d_%H-%M")
	    lf = CallbackHandler() if os.getenv("TRACE_WITH_LANGFUSE") else None

	    # Match on the actual suffix, case-insensitively: a substring test would
	    # also accept names such as "notes.pyc" or "track.mp3.txt".
	    supported_suffixes = (".py", ".xlsx", ".mp3")
	    total_attempts = MAX_RETRIES + 1

	    try:
	        for i, item in enumerate(questions, 1):
	            print(f"[{i}/{len(questions)}] {item['task_id'][:8]}…")

	            name = file_name(item)
	            if has_file(item) and not name.lower().endswith(supported_suffixes):
	                answers.append({"task_id": item["task_id"], "submitted_answer": "has file, not processed yet"})
	                print(f"la question {i} a un fichier ({name!r}), donc non traitée.")
	                continue

	            # The config does not depend on the attempt, so build it once per question.
	            cfg = base_cfg
	            if lf:
	                cfg = {
	                    **base_cfg,
	                    "callbacks": [lf],
	                    "run_name": f"{run_id} question {i:02d}",
	                    "metadata": {"langfuse_session_id": run_id},
	                }

	            # Submitted as-is when every attempt fails.
	            result = "error when calling the agent"
	            for attempt in range(1, total_attempts + 1):
	                try:
	                    result = answer(graph, item, cfg)
	                    break
	                except Exception as e:
	                    # Every failure is retried; only the log line differs. The
	                    # second message is also used for the final attempt.
	                    is_last = attempt == total_attempts
	                    if TOOL_USE_FAILED in str(e) and not is_last:
	                        print(f" tentative {attempt}/{total_attempts} : erreur tool calling, retry…")
	                    else:
	                        print(f" tentative {attempt}/{total_attempts} : erreur non-tool-calling : {e}")
	                    # No pause after the last attempt: nothing is retried after it.
	                    if not is_last:
	                        time.sleep(args.sleep)
	            answers.append({"task_id": item["task_id"], "submitted_answer": result})

	            if args.sleep and i < len(questions):
	                time.sleep(args.sleep)

	        for i, ans in enumerate(answers, 1):
	            print(f"[{i}/{len(answers)}], {ans['submitted_answer']} \n")

	        resp = requests.post(
	            f"{BASE}/submit",
	            json={
	                "username": os.environ["HF_USERNAME"],
	                "agent_code": os.environ["AGENT_CODE_URL"],
	                "answers": answers,
	            },
	            timeout=120,
	        )
	        # Only the decoding step is guarded, so `resp` is always bound in the handler.
	        try:
	            r = resp.json()
	        except requests.JSONDecodeError:
	            print(f"Submit failed: HTTP {resp.status_code}\n{resp.text[:500]}")
	            return
	        if resp.status_code != 200 or "score" not in r:
	            print(f"Submit error (HTTP {resp.status_code}): {r}")
	            return

	        print(f"SCORE: {r['score']}% ({r['correct_count']}/{r['total_attempted']})")
	        if msg := r.get("message"):
	            print(msg)
	    finally:
	        # Flush traces on every exit path, including failed submissions and errors.
	        if lf:
	            get_client().flush()


	# Run the evaluation only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()