KPatelis committed on
Commit
dfd1417
·
verified ·
1 Parent(s): 0bb2992

Upload 26 files

Browse files
config.yaml CHANGED
@@ -24,19 +24,15 @@ models:
24
  temperature: 0.6
25
  repetition_penalty: 1.3
26
  provider: "auto"
27
- thinking_enabled: true
 
 
28
  vlm:
29
  model_name: "Qwen/Qwen3-VL-32B-Instruct" # Hugging Face model ID
30
  asr:
31
  model_name: "openai/whisper-large-v3" # Hugging Face model ID — must have a provider on HF Inference Providers
32
- #device: "cuda" # cpu, cuda, or mps (for Mac)
33
- #parameters:
34
- # temperature: 0.7
35
- # max_new_tokens: 512
36
- # repetition_penalty: 1.1
37
-
38
  graph:
39
- recursion_limit: 40 # Max graph-node visits before bailing. ~5 + 2*(tool calls), so 40 ≈ 17 tool calls.
40
 
41
  api:
42
  base_url: "https://agents-course-unit4-scoring.hf.space"
 
24
  temperature: 0.6
25
  repetition_penalty: 1.3
26
  provider: "auto"
27
+ thinking_enabled: false
28
+ timeout: 300 # Read timeout (s) for the HF Inference call. Default 120 is too short under load.
29
+ max_new_tokens: 4096 # Output cap. Default 512 truncates long responses and breaks tool calls.
30
  vlm:
31
  model_name: "Qwen/Qwen3-VL-32B-Instruct" # Hugging Face model ID
32
  asr:
33
  model_name: "openai/whisper-large-v3" # Hugging Face model ID — must have a provider on HF Inference Providers
 
 
 
 
 
 
34
  graph:
35
+ recursion_limit: 40 # Max graph-node visits before bailing.
36
 
37
  api:
38
  base_url: "https://agents-course-unit4-scoring.hf.space"
gaia/__pycache__/agent.cpython-313.pyc CHANGED
Binary files a/gaia/__pycache__/agent.cpython-313.pyc and b/gaia/__pycache__/agent.cpython-313.pyc differ
 
gaia/agent.py CHANGED
@@ -64,11 +64,14 @@ if enable_vector_search:
64
  reranker = CrossEncoder(config["models"]["reranker"]["model_name"], cache_folder=config["models"]["cache_folder"])
65
 
66
  # LLM for Agent
 
67
  llm = HuggingFaceEndpoint(
68
  repo_id=config["models"]["llm"]["model_name"],
69
- temperature=config["models"]["llm"]["parameters"]["temperature"],
70
- repetition_penalty=config["models"]["llm"]["parameters"]["repetition_penalty"],
71
- provider=config["models"]["llm"]["parameters"]["provider"],
 
 
72
  huggingfacehub_api_token=hf_key
73
  )
74
 
 
64
  reranker = CrossEncoder(config["models"]["reranker"]["model_name"], cache_folder=config["models"]["cache_folder"])
65
 
66
  # LLM for Agent
67
+ _llm_params = config["models"]["llm"]["parameters"]
68
  llm = HuggingFaceEndpoint(
69
  repo_id=config["models"]["llm"]["model_name"],
70
+ temperature=_llm_params["temperature"],
71
+ repetition_penalty=_llm_params["repetition_penalty"],
72
+ provider=_llm_params["provider"],
73
+ timeout=_llm_params.get("timeout", 120),
74
+ max_new_tokens=_llm_params.get("max_new_tokens", 512),
75
  huggingfacehub_api_token=hf_key
76
  )
77
 
gaia/prompts/prompt.yaml CHANGED
@@ -14,6 +14,7 @@ prompt: |
14
  - If you can guess the exact page title from the question, call `wikipedia_page_fetch(title)` directly — this is the fastest path and returns the full page.
15
  - Otherwise, call `wiki_search` once to find candidate titles, then `wikipedia_page_fetch` on the best one.
16
  - Namespaced pages work too, e.g. `wikipedia_page_fetch("Wikipedia:Featured_article_candidates/Featured_log/November_2016")`.
 
17
  - **General web research**: prefer `tavily_web_search` (cleaner, LLM-optimised snippets). Use `duck_web_search` only if Tavily fails.
18
  - **When any search result gives you a specific URL**, call `fetch_webpage` to read the full page — do not loop on snippets.
19
  - **Do not repeat queries** with trivial rewording. If a search did not help, switch tools or pivot (try a different angle, fetch a referenced page, or go Wikipedia-direct).
 
14
  - If you can guess the exact page title from the question, call `wikipedia_page_fetch(title)` directly — this is the fastest path and returns the full page.
15
  - Otherwise, call `wiki_search` once to find candidate titles, then `wikipedia_page_fetch` on the best one.
16
  - Namespaced pages work too, e.g. `wikipedia_page_fetch("Wikipedia:Featured_article_candidates/Featured_log/November_2016")`.
17
+ - **For "as of <date>" questions** (rosters, statistics, member lists, records that may have drifted since), use `wikipedia_page_as_of(title, date)` with `date` in `YYYY-MM-DD` form — this fetches the article as it appeared at end of day UTC on that date, not the current version.
18
  - **General web research**: prefer `tavily_web_search` (cleaner, LLM-optimised snippets). Use `duck_web_search` only if Tavily fails.
19
  - **When any search result gives you a specific URL**, call `fetch_webpage` to read the full page — do not loop on snippets.
20
  - **Do not repeat queries** with trivial rewording. If a search did not help, switch tools or pivot (try a different angle, fetch a referenced page, or go Wikipedia-direct).
gaia/tools/__init__.py CHANGED
@@ -2,7 +2,8 @@
2
  from gaia.tools.basic import calculator, python_eval
3
  from gaia.tools.web import (
4
  duck_web_search, tavily_web_search, wiki_search, wikipedia_page_fetch,
5
- arxiv_search, fetch_webpage, youtube_transcript, retry_file_download,
 
6
  )
7
  from gaia.tools.files import (
8
  read_pdf, read_docx, read_pptx, read_text_file,
@@ -18,6 +19,7 @@ tools_list = [
18
  duck_web_search,
19
  wiki_search,
20
  wikipedia_page_fetch,
 
21
  arxiv_search,
22
  tavily_web_search,
23
  fetch_webpage,
 
2
  from gaia.tools.basic import calculator, python_eval
3
  from gaia.tools.web import (
4
  duck_web_search, tavily_web_search, wiki_search, wikipedia_page_fetch,
5
+ wikipedia_page_as_of, arxiv_search, fetch_webpage, youtube_transcript,
6
+ retry_file_download,
7
  )
8
  from gaia.tools.files import (
9
  read_pdf, read_docx, read_pptx, read_text_file,
 
19
  duck_web_search,
20
  wiki_search,
21
  wikipedia_page_fetch,
22
+ wikipedia_page_as_of,
23
  arxiv_search,
24
  tavily_web_search,
25
  fetch_webpage,
gaia/tools/__pycache__/__init__.cpython-313.pyc CHANGED
Binary files a/gaia/tools/__pycache__/__init__.cpython-313.pyc and b/gaia/tools/__pycache__/__init__.cpython-313.pyc differ
 
gaia/tools/__pycache__/web.cpython-313.pyc CHANGED
Binary files a/gaia/tools/__pycache__/web.cpython-313.pyc and b/gaia/tools/__pycache__/web.cpython-313.pyc differ
 
gaia/tools/web.py CHANGED
@@ -1,11 +1,27 @@
1
  """Web search and fetching tools: DuckDuckGo, Tavily, Wikipedia, Arxiv, webpage fetch, YouTube transcripts."""
 
 
 
 
 
 
 
2
  from langchain_community.tools import DuckDuckGoSearchRun
3
  from langchain_community.tools.tavily_search import TavilySearchResults
4
  from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
5
  from langchain_core.tools import tool
 
 
6
 
7
  from gaia.utils import extract_youtube_id, load_config, download_task_file
8
 
 
 
 
 
 
 
 
9
 
10
  _ddg_search = None
11
  _tavily_search = None
@@ -32,66 +48,241 @@ def duck_web_search(query: str) -> str:
32
  Args:
33
  query: The search query.
34
  """
35
- search = _get_ddg().invoke(input=query)
36
- return {"duckduckgo_web_search": search}
 
 
 
37
 
38
 
39
  @tool
40
  def wiki_search(query: str) -> str:
41
- """Search Wikipedia for a query and return maximum 3 results.
42
 
43
  Args:
44
  query: The search query."""
45
- documents = WikipediaLoader(query=query, load_max_docs=3, doc_content_chars_max=20000).load()
46
- processed_documents = "\n\n---\n\n".join(
47
- [
48
- f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}'
49
- for document in documents
50
- ])
51
- return {"wiki_results": processed_documents}
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
- @tool
55
- def wikipedia_page_fetch(title: str) -> str:
56
- """Fetch the full text of a Wikipedia page by its exact title.
57
 
58
- Use this when you can guess the page title from the question (e.g., "1928 Summer
59
- Olympics", "List of Featured Articles"). Faster than search + fetch_webpage when
60
- the title is obvious. Returns full page content, not a search summary.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
 
 
 
62
  Args:
63
- title: The exact Wikipedia page title (including namespace prefix if applicable,
64
- e.g., "Wikipedia:Featured_article_candidates/Featured_log/November_2016").
65
 
66
  Returns:
67
- The page content prefixed with title and URL, or a `[wikipedia_page_fetch] ...`
68
- error string.
 
 
 
69
  """
70
- import wikipedia
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  try:
72
  page = wikipedia.page(title, auto_suggest=False)
73
- return f"Wikipedia: {page.title}\nURL: {page.url}\n\n{page.content}"
74
  except wikipedia.exceptions.DisambiguationError as e:
75
  return f"[wikipedia_page_fetch] '{title}' is a disambiguation page. Options: {e.options[:10]}"
76
  except wikipedia.exceptions.PageError:
77
- return f"[wikipedia_page_fetch] page not found: '{title}'. Try wiki_search to find the correct title."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  except Exception as e:
79
  return f"[wikipedia_page_fetch] failed: {e}"
80
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  @tool
83
  def arxiv_search(query: str) -> str:
84
  """Search Arxiv for a query and return maximum 3 result.
85
 
86
  Args:
87
  query: The search query."""
88
- documents = ArxivLoader(query=query, load_max_docs=3).load()
89
- processed_documents = "\n\n---\n\n".join(
90
- [
91
- f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}'
92
- for document in documents
93
- ])
94
- return {"arxiv_results": processed_documents}
 
 
 
95
 
96
 
97
  @tool
@@ -100,13 +291,16 @@ def tavily_web_search(query: str) -> str:
100
 
101
  Args:
102
  query: The search query."""
103
- search_documents = _get_tavily().invoke(input=query)
104
- web_results = "\n\n---\n\n".join(
105
- [
106
- f'Document title: {document["title"]}. Contents: {document["content"]}. Relevance Score: {document["score"]}'
107
- for document in search_documents
108
- ])
109
- return {"web_results": web_results}
 
 
 
110
 
111
 
112
  @tool
@@ -121,7 +315,6 @@ def fetch_webpage(url: str) -> str:
121
  Returns:
122
  The extracted text content of the page.
123
  """
124
- import trafilatura
125
  try:
126
  downloaded = trafilatura.fetch_url(url)
127
  if downloaded is None:
@@ -137,11 +330,6 @@ def fetch_webpage(url: str) -> str:
137
  @tool
138
  def retry_file_download(task_id: str, file_name: str) -> str:
139
  """Retry downloading the task file from the GAIA scoring API.
140
-
141
- Use this when the initial automatic download failed (you will see a message like
142
- "the automatic download failed" in the question context). Returns the local path
143
- on success, or an error string starting with `[retry_file_download]`.
144
-
145
  Args:
146
  task_id: The task ID for the current question.
147
  file_name: The original file name from the question metadata.
@@ -164,25 +352,12 @@ def retry_file_download(task_id: str, file_name: str) -> str:
164
  @tool
165
  def youtube_transcript(url: str) -> str:
166
  """Fetch the transcript (captions) of a YouTube video as plain text.
167
-
168
- Use this whenever a question references a YouTube URL — the spoken content of
169
- the video is available via captions. Note: this returns text only; questions
170
- that require visual analysis of the frames cannot be answered from the
171
- transcript alone.
172
-
173
- Prefers manually-written English captions; falls back to auto-generated English,
174
- and finally to any available language.
175
-
176
  Args:
177
  url: The full YouTube URL (watch, youtu.be, embed, shorts) or a bare 11-char video ID.
178
 
179
  Returns:
180
  The concatenated transcript text, or an error string starting with `[youtube_transcript]`.
181
  """
182
- from youtube_transcript_api import YouTubeTranscriptApi
183
- from youtube_transcript_api._errors import (
184
- TranscriptsDisabled, NoTranscriptFound, VideoUnavailable,
185
- )
186
 
187
  video_id = extract_youtube_id(url)
188
  if not video_id:
 
1
  """Web search and fetching tools: DuckDuckGo, Tavily, Wikipedia, Arxiv, webpage fetch, YouTube transcripts."""
2
+ import re
3
+ from datetime import datetime
4
+
5
+ import requests
6
+ import trafilatura
7
+ import wikipedia
8
+ from bs4 import BeautifulSoup
9
  from langchain_community.tools import DuckDuckGoSearchRun
10
  from langchain_community.tools.tavily_search import TavilySearchResults
11
  from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
12
  from langchain_core.tools import tool
13
+ from youtube_transcript_api import YouTubeTranscriptApi
14
+ from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
15
 
16
  from gaia.utils import extract_youtube_id, load_config, download_task_file
17
 
18
+ # Wikipedia blocks/throttles requests with the default `wikipedia` package UA, which
19
+ # causes the API to return a non-JSON body and `requests.Response.json()` to raise a
20
+ # `JSONDecodeError: Expecting value: line 1 column 1 (char 0)`. Setting an identifying
21
+ # UA per Wikipedia's policy fixes this for both `wiki_search` and `wikipedia_page_fetch`.
22
+ _USER_AGENT = "gaia-agent/0.1 (https://huggingface.co/spaces/KPatelis/Agents_Course_Assignment)"
23
+ wikipedia.set_user_agent(_USER_AGENT)
24
+
25
 
26
  _ddg_search = None
27
  _tavily_search = None
 
48
  Args:
49
  query: The search query.
50
  """
51
+ try:
52
+ search = _get_ddg().invoke(input=query)
53
+ return {"duckduckgo_web_search": search}
54
+ except Exception as e:
55
+ return f"[duck_web_search] failed: {type(e).__name__}: {e}"
56
 
57
 
58
  @tool
59
  def wiki_search(query: str) -> str:
60
+ """Search Wikipedia for a query and return up to 3 distinct articles.
61
 
62
  Args:
63
  query: The search query."""
64
+ try:
65
+ documents = WikipediaLoader(query=query, load_max_docs=3, doc_content_chars_max=20000).load()
66
+ # Deduplicate by article title
67
+ seen_titles = set()
68
+ unique_documents = []
69
+ for d in documents:
70
+ title = d.metadata.get("title", "")
71
+ if title and title not in seen_titles:
72
+ seen_titles.add(title)
73
+ unique_documents.append(d)
74
+ processed_documents = "\n\n---\n\n".join(
75
+ [
76
+ f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}'
77
+ for document in unique_documents
78
+ ])
79
+ return {"wiki_results": processed_documents}
80
+ except Exception as e:
81
+ return f"[wiki_search] failed: {type(e).__name__}: {e}"
82
 
83
 
84
+ _NAVBOX_MIN_CHARS = 200 # return no navbox section when the combined navbox text has fewer than this many chars
85
+ _NAVBOX_MAX_CHARS = 15000 # cap navbox text to avoid blowing up context on huge pages
86
+
87
 
88
+ def _extract_navbox_text(html: str) -> str:
89
+ """Pull a flat-text dump of every ``.navbox`` div on a Wikipedia page.
90
+
91
+ Navboxes are the cross-link tables Wikipedia puts at the bottom of articles.
92
+ We collect every navbox on the page, flatten whitespace, and join with blank lines.
93
+ Returns ``""`` if no meaningful navbox content is present.
94
+ """
95
+ soup = BeautifulSoup(html, "html.parser")
96
+ parts = []
97
+ for nb in soup.find_all("div", class_="navbox"):
98
+ text = re.sub(r"\s+", " ", nb.get_text(" ", strip=True))
99
+ if text:
100
+ parts.append(text)
101
+ joined = "\n\n".join(parts).strip()
102
+ if len(joined) < _NAVBOX_MIN_CHARS:
103
+ return ""
104
+ return joined[:_NAVBOX_MAX_CHARS]
105
 
106
+
107
+ @tool
108
+ def wikipedia_page_fetch(title: str) -> str:
109
+ """Fetch a Wikipedia page by title and return its body + navbox text.
110
  Args:
111
+ title: The exact Wikipedia page title, optionally with a namespace prefix
112
+ (e.g. ``"Wikipedia:Featured article candidates/Featured log/November 2016"``).
113
 
114
  Returns:
115
+ On success: a multi-line string starting with ``"Wikipedia: <resolved title>"``,
116
+ a ``URL:`` line, a blank line, the extracted body, and (if present) a
117
+ ``--- Related (navbox) ---`` block.
118
+ On failure: a string starting with ``[wikipedia_page_fetch] …`` describing
119
+ the failure (page not found, disambiguation page, search fallback exhausted).
120
  """
121
+
122
+ def _render(page, resolved_from=None):
123
+ suffix = f" (resolved from '{resolved_from}')" if resolved_from else ""
124
+ header = f"Wikipedia: {page.title}{suffix}\nURL: {page.url}"
125
+
126
+ # Body: prefer trafilatura (preserves lists and tables — critical for
127
+ # counting-style questions). Fall back to page.content on failure.
128
+ body = None
129
+ downloaded = trafilatura.fetch_url(page.url)
130
+ if downloaded is not None:
131
+ body = trafilatura.extract(downloaded, include_tables=True, include_links=False)
132
+ if not body:
133
+ body = page.content
134
+
135
+ # Navbox: append the cross-link tables that body extractors strip.
136
+ navbox_section = ""
137
+ try:
138
+ navbox_text = _extract_navbox_text(page.html())
139
+ if navbox_text:
140
+ navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}"
141
+ except Exception:
142
+ pass
143
+
144
+ return f"{header}\n\n{body}{navbox_section}"
145
+
146
  try:
147
  page = wikipedia.page(title, auto_suggest=False)
148
+ return _render(page)
149
  except wikipedia.exceptions.DisambiguationError as e:
150
  return f"[wikipedia_page_fetch] '{title}' is a disambiguation page. Options: {e.options[:10]}"
151
  except wikipedia.exceptions.PageError:
152
+ # Recover from case-sensitivity / slight title mismatches by searching once and
153
+ # fetching the top hit.
154
+ try:
155
+ hits = wikipedia.search(title, results=1)
156
+ except Exception as e:
157
+ return f"[wikipedia_page_fetch] page not found: '{title}'; search fallback failed: {e}"
158
+ if not hits:
159
+ return f"[wikipedia_page_fetch] page not found: '{title}' and no search hits."
160
+ resolved = hits[0]
161
+ if resolved == title:
162
+ return f"[wikipedia_page_fetch] page not found: '{title}'. Try wiki_search to find the correct title."
163
+ try:
164
+ page = wikipedia.page(resolved, auto_suggest=False)
165
+ except Exception as e:
166
+ return f"[wikipedia_page_fetch] resolved title '{resolved}' but fetch failed: {e}"
167
+ return _render(page, resolved_from=title)
168
  except Exception as e:
169
  return f"[wikipedia_page_fetch] failed: {e}"
170
 
171
 
172
+ _WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
173
+
174
+
175
+ def _resolve_revision_at(title: str, iso_timestamp: str) -> tuple[int | None, str | None, str | None]:
176
+ """Look up the Wikipedia revision id active for ``title`` at ``iso_timestamp``.
177
+ """
178
+ params = {
179
+ "action": "query",
180
+ "format": "json",
181
+ "prop": "revisions",
182
+ "titles": title,
183
+ "rvprop": "ids|timestamp",
184
+ "rvlimit": 1,
185
+ "rvdir": "older",
186
+ "rvstart": iso_timestamp,
187
+ }
188
+ try:
189
+ r = requests.get(
190
+ _WIKI_API_ENDPOINT,
191
+ params=params,
192
+ headers={"User-Agent": _USER_AGENT},
193
+ timeout=30,
194
+ )
195
+ r.raise_for_status()
196
+ data = r.json()
197
+ except Exception as e:
198
+ return None, None, f"API request failed: {type(e).__name__}: {e}"
199
+
200
+ pages = data.get("query", {}).get("pages", {})
201
+ if not pages:
202
+ return None, None, "API returned no pages"
203
+ page = next(iter(pages.values()))
204
+ if "missing" in page:
205
+ return None, None, f"page not found: '{title}'"
206
+ revisions = page.get("revisions") or []
207
+ if not revisions:
208
+ return None, None, f"no revisions for '{title}' on or before {iso_timestamp}"
209
+ return revisions[0]["revid"], page.get("title", title), None
210
+
211
+
212
+ @tool
213
+ def wikipedia_page_as_of(title: str, date: str) -> str:
214
+ """Fetch a Wikipedia page as it existed at end of day UTC on a specific date.
215
+ Args:
216
+ title: Wikipedia page title (e.g. ``"Taishō Tamai"``,
217
+ ``"Hokkaido Nippon-Ham Fighters"``, ``"1928 Summer Olympics"``).
218
+ date: Target date in ISO ``"YYYY-MM-DD"`` format (e.g. ``"2023-07-31"``).
219
+ The page is fetched as it appeared at 23:59:59 UTC on that day.
220
+
221
+ Returns:
222
+ On success: a multi-line string ``"Wikipedia: <title> (as of <date>, revid <id>) / URL: <oldid URL> / <body> / --- Related (navbox) ---"``.
223
+ On failure: a string starting with ``[wikipedia_page_as_of] …`` describing
224
+ the failure (invalid date, page not found, revision lookup failure,
225
+ rendered-HTML fetch failure).
226
+ """
227
+ try:
228
+ dt = datetime.strptime(date, "%Y-%m-%d")
229
+ except ValueError:
230
+ return f"[wikipedia_page_as_of] invalid date '{date}'; expected YYYY-MM-DD."
231
+ iso_ts = dt.strftime("%Y-%m-%dT23:59:59Z")
232
+
233
+ revid, resolved_title, err = _resolve_revision_at(title, iso_ts)
234
+ if err and err.startswith("page not found"):
235
+ # Case-/spelling-tolerant fallback: search and retry the top hit.
236
+ try:
237
+ hits = wikipedia.search(title, results=1)
238
+ except Exception as e:
239
+ return f"[wikipedia_page_as_of] page not found and search failed: {e}"
240
+ if not hits or hits[0] == title:
241
+ return f"[wikipedia_page_as_of] page not found: '{title}'"
242
+ revid, resolved_title, err = _resolve_revision_at(hits[0], iso_ts)
243
+ if err:
244
+ return f"[wikipedia_page_as_of] {err}"
245
+
246
+ url = f"https://en.wikipedia.org/w/index.php?oldid={revid}"
247
+ try:
248
+ resp = requests.get(url, headers={"User-Agent": _USER_AGENT}, timeout=30)
249
+ resp.raise_for_status()
250
+ html = resp.text
251
+ except Exception as e:
252
+ return f"[wikipedia_page_as_of] could not fetch revision URL {url}: {type(e).__name__}: {e}"
253
+
254
+ body = trafilatura.extract(html, include_tables=True, include_links=False)
255
+ if not body:
256
+ return f"[wikipedia_page_as_of] no body extracted from {url}"
257
+
258
+ navbox_section = ""
259
+ try:
260
+ navbox_text = _extract_navbox_text(html)
261
+ if navbox_text:
262
+ navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}"
263
+ except Exception:
264
+ pass
265
+
266
+ header = f"Wikipedia: {resolved_title} (as of {date}, revid {revid})\nURL: {url}"
267
+ return f"{header}\n\n{body}{navbox_section}"
268
+
269
+
270
  @tool
271
  def arxiv_search(query: str) -> str:
272
  """Search Arxiv for a query and return maximum 3 result.
273
 
274
  Args:
275
  query: The search query."""
276
+ try:
277
+ documents = ArxivLoader(query=query, load_max_docs=3).load()
278
+ processed_documents = "\n\n---\n\n".join(
279
+ [
280
+ f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}'
281
+ for document in documents
282
+ ])
283
+ return {"arxiv_results": processed_documents}
284
+ except Exception as e:
285
+ return f"[arxiv_search] failed: {type(e).__name__}: {e}"
286
 
287
 
288
  @tool
 
291
 
292
  Args:
293
  query: The search query."""
294
+ try:
295
+ search_documents = _get_tavily().invoke(input=query)
296
+ web_results = "\n\n---\n\n".join(
297
+ [
298
+ f'Document title: {document["title"]}. Contents: {document["content"]}. Relevance Score: {document["score"]}'
299
+ for document in search_documents
300
+ ])
301
+ return {"web_results": web_results}
302
+ except Exception as e:
303
+ return f"[tavily_web_search] failed: {type(e).__name__}: {e}"
304
 
305
 
306
  @tool
 
315
  Returns:
316
  The extracted text content of the page.
317
  """
 
318
  try:
319
  downloaded = trafilatura.fetch_url(url)
320
  if downloaded is None:
 
330
  @tool
331
  def retry_file_download(task_id: str, file_name: str) -> str:
332
  """Retry downloading the task file from the GAIA scoring API.
 
 
 
 
 
333
  Args:
334
  task_id: The task ID for the current question.
335
  file_name: The original file name from the question metadata.
 
352
  @tool
353
  def youtube_transcript(url: str) -> str:
354
  """Fetch the transcript (captions) of a YouTube video as plain text.
 
 
 
 
 
 
 
 
 
355
  Args:
356
  url: The full YouTube URL (watch, youtu.be, embed, shorts) or a bare 11-char video ID.
357
 
358
  Returns:
359
  The concatenated transcript text, or an error string starting with `[youtube_transcript]`.
360
  """
 
 
 
 
361
 
362
  video_id = extract_youtube_id(url)
363
  if not video_id: