Spaces:
Sleeping
Sleeping
| import argparse | |
| import os | |
| import time | |
| from datetime import datetime | |
| import requests | |
| from dotenv import load_dotenv | |
| from langchain_core.messages import HumanMessage | |
| from langfuse import get_client | |
| from langfuse.langchain import CallbackHandler | |
| from agent import build_graph | |
| from regexs import extract_last_ai_text, normalize_gaia_answer, strip_final_answer_prefix | |
# Root URL of the scoring API, overridable through SCORING_API_URL.
# Trailing slashes are removed so endpoints can be appended as f"{BASE}/...".
# NOTE(review): this is evaluated at import time, i.e. before main() calls
# load_dotenv(); a value defined only in .env is presumably not seen here
# unless an earlier import already loaded it — confirm.
_DEFAULT_SCORING_API_URL = "https://agents-course-unit4-scoring.hf.space"
BASE = os.environ.get("SCORING_API_URL", _DEFAULT_SCORING_API_URL).rstrip("/")
def file_name(item: dict) -> str:
    """Return the item's ``file_name`` value without surrounding whitespace.

    A missing key or a falsy value (``None``, ``""``) yields ``""``.
    """
    raw_name = item.get("file_name")
    if not raw_name:
        return ""
    return raw_name.strip()
def has_file(item: dict) -> bool:
    """Tell whether ``file_name(item)`` is a non-empty string."""
    return file_name(item) != ""
# Marker searched for in the text of an exception raised by the agent to
# recognise a tool-calling failure (overridable through TOOL_USE_FAILED).
TOOL_USE_FAILED = os.environ.get("TOOL_USE_FAILED", "tool_use_failed")

# Number of additional attempts per question after the first one
# (overridable through AGENT_MAX_RETRIES).
MAX_RETRIES = int(os.environ.get("AGENT_MAX_RETRIES", "2"))

# "gaia_files" directory located next to this module; attached file names
# are joined onto it to build the path handed to the agent.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
_GAIA_FILES_DIR = os.path.join(_MODULE_DIR, "gaia_files")
def answer(graph, item: dict, cfg: dict) -> str:
    """Ask the agent graph one question and return its cleaned-up answer.

    The prompt is ``question: <text>``; when the item names a file, a second
    line ``file_path: <path inside _GAIA_FILES_DIR>`` is appended.

    Args:
        graph: Object exposing ``invoke(state, config=...)``.
        item: Question record; ``item["question"]`` is required and
            ``file_name`` is optional.
        cfg: Configuration passed unchanged to ``graph.invoke``.

    Returns:
        The text from ``extract_last_ai_text`` over the resulting messages,
        passed through ``normalize_gaia_answer`` and then
        ``strip_final_answer_prefix``.
    """
    prompt_lines = [f"question: {item['question']}"]
    attachment = file_name(item)
    if attachment:
        attachment_path = os.path.join(_GAIA_FILES_DIR, attachment)
        prompt_lines.append(f"file_path: {attachment_path}")

    initial_state = {"messages": [HumanMessage(content="\n".join(prompt_lines))]}
    final_state = graph.invoke(initial_state, config=cfg)

    last_ai_text = extract_last_ai_text(final_state["messages"])
    return strip_final_answer_prefix(normalize_gaia_answer(last_ai_text))
# File extensions of attachments the run attempts; any other attachment is skipped.
_SUPPORTED_FILE_SUFFIXES = (".py", ".xlsx", ".mp3")


def _is_supported_attachment(name: str) -> bool:
    """Return True when *name* ends (case-insensitively) with a supported extension."""
    return name.lower().endswith(_SUPPORTED_FILE_SUFFIXES)


def _answer_with_retries(graph, item: dict, cfg: dict, pause: float) -> str:
    """Call ``answer`` up to ``MAX_RETRIES + 1`` times and return the first result.

    Every exception is logged and retried; *pause* seconds are slept between
    attempts (not after the last one). When all attempts fail, the placeholder
    ``"error when calling the agent"`` is returned instead of raising.
    """
    total_attempts = MAX_RETRIES + 1
    for attempt in range(1, total_attempts + 1):
        try:
            return answer(graph, item, cfg)
        except Exception as exc:  # evaluation boundary: one bad question must not abort the run
            is_last = attempt == total_attempts
            if TOOL_USE_FAILED not in str(exc):
                print(f" tentative {attempt}/{total_attempts} : erreur non-tool-calling : {exc}")
            elif is_last:
                print(f" tentative {attempt}/{total_attempts} : erreur tool calling, abandon : {exc}")
            else:
                print(f" tentative {attempt}/{total_attempts} : erreur tool calling, retry…")
            if not is_last:
                time.sleep(pause)
    return "error when calling the agent"


def _submit(answers: list, username: str, agent_code: str) -> None:
    """POST *answers* to ``{BASE}/submit`` and print the score or the failure."""
    resp = requests.post(
        f"{BASE}/submit",
        json={
            "username": username,
            "agent_code": agent_code,
            "answers": answers,
        },
        timeout=120,
    )
    try:
        r = resp.json()
    except requests.JSONDecodeError:
        print(f"Submit failed: HTTP {resp.status_code}\n{resp.text[:500]}")
        return
    # A non-dict JSON body (list, number, ...) cannot carry a score either.
    if resp.status_code != 200 or not isinstance(r, dict) or "score" not in r:
        print(f"Submit error (HTTP {resp.status_code}): {r}")
        return
    print(f"SCORE: {r['score']}% ({r.get('correct_count', '?')}/{r.get('total_attempted', '?')})")
    if msg := r.get("message"):
        print(msg)


def main() -> None:
    """Run the agent over the scoring API's questions and submit the answers.

    Command-line flags:
        --limit: only process the first N questions (0 = all).
        --sleep: seconds to wait between questions and between retries
            (default taken from GROQ_EVAL_SLEEP_SECONDS, else 2).

    Environment variables read here: HF_USERNAME and AGENT_CODE_URL (required),
    LANGGRAPH_RECURSION_LIMIT, TRACE_WITH_LANGFUSE.

    Raises:
        KeyError: if HF_USERNAME or AGENT_CODE_URL is not set. This is checked
            before any question is processed so a misconfigured run fails
            immediately instead of after the whole evaluation.
        requests.HTTPError: if the questions endpoint answers with an error status.
    """
    load_dotenv()
    p = argparse.ArgumentParser()
    p.add_argument("--limit", type=int, default=0)
    p.add_argument("--sleep", type=float, default=float(os.getenv("GROQ_EVAL_SLEEP_SECONDS", "2")))
    args = p.parse_args()

    # Read the submission credentials up front (same KeyError as before, just earlier).
    username = os.environ["HF_USERNAME"]
    agent_code = os.environ["AGENT_CODE_URL"]

    questions_resp = requests.get(f"{BASE}/questions", timeout=30)
    questions_resp.raise_for_status()
    questions = questions_resp.json()
    if args.limit:
        questions = questions[: args.limit]

    graph = build_graph()
    base_cfg = {"recursion_limit": int(os.getenv("LANGGRAPH_RECURSION_LIMIT", "80"))}
    answers = []
    run_id = datetime.now().strftime("run_%Y-%m-%d_%H-%M")
    lf = CallbackHandler() if os.getenv("TRACE_WITH_LANGFUSE") else None

    try:
        for i, item in enumerate(questions, 1):
            print(f"[{i}/{len(questions)}] {item['task_id'][:8]}…")
            name = file_name(item)
            if name and not _is_supported_attachment(name):
                answers.append({"task_id": item["task_id"], "submitted_answer": "has file, not processed yet"})
                print(f"la question {i} a un fichier ({name!r}), donc non traitée.")
                continue

            # The config depends only on the question, so build it once per
            # question rather than once per attempt.
            cfg = base_cfg if not lf else {
                **base_cfg,
                "callbacks": [lf],
                "run_name": f"{run_id} question {i:02d}",
                "metadata": {"langfuse_session_id": run_id},
            }
            result = _answer_with_retries(graph, item, cfg, args.sleep)
            answers.append({"task_id": item["task_id"], "submitted_answer": result})
            if args.sleep and i < len(questions):
                time.sleep(args.sleep)

        for idx, ans in enumerate(answers, 1):
            print(f"[{idx}/{len(answers)}], {ans['submitted_answer']} \n")

        _submit(answers, username, agent_code)
    finally:
        # Flush traces on every exit path, including submit failures and errors.
        if lf:
            get_client().flush()
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()