import os
from colorama import Fore, Style  # type: ignore[import]
from langchain_core.tools import tool
from tavily import TavilyClient  # type: ignore[import]


@tool
def web_search(query: str, depth: str = "advanced", max_results: int = 5) -> str:
    """
    Search the web using Tavily and extract full Markdown content from top results.
    Useful for in-depth analysis, table data, or detailed technical documentation.

    Args:
        query: The search query string.
        depth: Search depth, either "basic" or "advanced".
        max_results: Number of results to return (recommended 3-5 to save tokens).
    """
    print(f"{Fore.GREEN}[Search & Extract] {query}{Style.RESET_ALL}")
    client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

    # 1. Search and get URLs
    search_response = client.search(
        query=query, search_depth=depth, max_results=max_results
    )

    results = search_response.get("results", [])
    if not results:
        return "No results found."

    urls = [r["url"] for r in results]

    # 2. Extract full content via Extract API (max 20 URLs per call)
    try:
        extraction = client.extract(
            urls=urls,
            extract_depth="advanced",  # For tables and structured data
            format="markdown",  # Most readable format for LLMs
        )

        extracted_results = {
            item["url"]: item["raw_content"] for item in extraction.get("results", [])
        }
    except Exception as e:
        print(f"{Fore.RED}Extraction failed: {e}{Style.RESET_ALL}")
        extracted_results = {}

    # 3. Format output
    final_output = []
    for r in results:
        url = r["url"]
        title = r["title"]
        snippet = r["content"]  # Original search snippet
        full_content = extracted_results.get(url, "Full content extraction failed.")

        content_block = (
            f"### Title: {title}\n"
            f"**URL:** {url}\n"
            f"**Snippet:** {snippet}\n\n"
            f"**Full Markdown Content:**\n\n{full_content}\n"
            f"{'=' * 50}"
        )
        final_output.append(content_block)

    return "\n\n".join(final_output)


if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()

    # Test run
    test_query = "What is LangGraph?"
    result = web_search.invoke({"query": test_query})
    print(result)