File size: 9,394 Bytes
4176077
4412065
4176077
 
 
4412065
 
 
 
 
4176077
4412065
 
 
 
4176077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4412065
 
4176077
 
 
 
4412065
 
 
4176077
 
 
 
 
 
 
 
 
 
4412065
 
 
 
 
4176077
4412065
4176077
4412065
 
 
 
 
 
 
4176077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4412065
 
 
 
 
 
 
 
4176077
 
4412065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4176077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4412065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4176077
4412065
 
 
 
 
4176077
4412065
 
 
 
 
 
4176077
4412065
 
 
4176077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4412065
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""Web search via scraping — no API key needed.

Strategy:
1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
2. Fallback: Google search with robust multi-selector parsing
"""

from __future__ import annotations

import logging
import re
import urllib.parse

logger = logging.getLogger(__name__)

# Request headers imitating a desktop Chrome browser. The search helpers in
# this module merge them into every outgoing request to reduce the chance of
# being treated as a bot.
_BROWSER_HEADERS = {
    # The User-Agent is assembled from its three product tokens.
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/125.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Run a web search by scraping result pages; no API key is required.

    DuckDuckGo is queried first. Google is consulted only when DuckDuckGo
    produced no results.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results requested from each engine.

    Returns:
        A list of dicts, each with the keys ``title``, ``url`` and ``snippet``.
    """
    ddg_hits = _search_duckduckgo(query, num_results)
    return ddg_hits if ddg_hits else _search_google(query, num_results)


def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
    """Search DuckDuckGo's HTML-only endpoint and parse the result list.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return. Values below 1
            yield an empty list without issuing a request.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``snippet``.
        An empty list is returned (and a warning logged) when ``requests`` or
        ``beautifulsoup4`` is not installed, or when the request or the
        parsing fails for any reason.
    """
    if num_results <= 0:
        return []

    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []

    # Hosts that belong to DuckDuckGo itself (ads, settings, help pages).
    internal_hosts = ("duckduckgo.com", "duck.co")

    try:
        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"

        headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}

        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []

        # DuckDuckGo HTML uses .result blocks
        for result_div in soup.select(".result"):
            title_el = result_div.select_one(".result__title a, .result__a")
            if not title_el:
                continue

            title = title_el.get_text(strip=True)
            # DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
            real_url = _extract_ddg_url(title_el.get("href", ""))
            if not title or not real_url:
                continue

            # Skip DuckDuckGo-internal links. Compare the hostname rather than
            # searching the whole URL, so an external page whose path or query
            # merely mentions "duckduckgo.com" is not discarded.
            try:
                host = urllib.parse.urlparse(real_url).hostname or ""
            except ValueError:
                # Malformed URL: skip this one entry, keep the rest.
                continue
            if any(host == d or host.endswith("." + d) for d in internal_hosts):
                continue

            snippet_el = result_div.select_one(".result__snippet")
            snippet = snippet_el.get_text(strip=True) if snippet_el else ""

            results.append({
                "title": title,
                "url": real_url,
                "snippet": snippet,
            })

            if len(results) >= num_results:
                break

        logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
        return results

    except Exception as exc:
        # Best-effort scraper: any network/parsing failure degrades to
        # "no results" so the caller can fall back to another engine.
        logger.warning("DuckDuckGo search failed: %s", exc)
        return []


def _extract_ddg_url(href: str) -> str | None:
    """Extract the real URL from a DuckDuckGo redirect link."""
    if not href:
        return None

    # Direct HTTP URL
    if href.startswith("http"):
        return href

    # DDG redirect: //duckduckgo.com/l/?uddg=<encoded_url>&...
    if "uddg=" in href:
        parsed = urllib.parse.urlparse(href)
        params = urllib.parse.parse_qs(parsed.query)
        uddg = params.get("uddg", [])
        if uddg:
            return urllib.parse.unquote(uddg[0])

    # Sometimes it's a relative redirect
    if href.startswith("//"):
        return "https:" + href

    return None


def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
    """Search Google by scraping its results page (fallback engine).

    Three parsing strategies are tried in order; each later one runs only if
    the earlier ones found nothing:

    1. ``div.g`` result containers with an ``<h3>`` title and a link.
    2. Any ``<a>`` element that contains an ``<h3>``.
    3. Any ``<a data-ved>`` whose ``href`` is an absolute non-Google URL.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return. Values below 1
            yield an empty list without issuing a request.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``snippet``,
        without duplicate URLs. An empty list is returned (and a warning
        logged) when ``requests`` or ``beautifulsoup4`` is not installed, or
        when the request or the parsing fails.
    """
    if num_results <= 0:
        return []

    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []

    try:
        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"

        headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}

        # Use the session as a context manager so its pooled connections are
        # released even when a request raises.
        with requests.Session() as session:
            # First get a cookie from Google. This is only a warm-up: if it
            # fails, the actual search is still attempted.
            try:
                session.get("https://www.google.com/", headers=headers, timeout=5)
            except requests.RequestException as exc:
                logger.debug("Google cookie warm-up request failed: %s", exc)

            resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
            resp.raise_for_status()
            html = resp.text

        soup = BeautifulSoup(html, "html.parser")
        results: list[dict[str, str]] = []
        seen_urls: set[str] = set()

        def add_result(title: str, result_url: str, snippet: str) -> bool:
            """Store one result unless empty or duplicate; True once full."""
            if title and result_url and result_url not in seen_urls:
                seen_urls.add(result_url)
                results.append({
                    "title": title,
                    "url": result_url,
                    "snippet": snippet,
                })
            return len(results) >= num_results

        # Strategy 1: Modern Google layout — div.g > div.yuRUbf (title+link) + div.VwiC3b (snippet)
        for g_div in soup.select("div.g"):
            title_el = g_div.select_one("h3")
            link_el = g_div.select_one("a[href]")
            if not title_el or not link_el:
                continue

            real_url = _extract_google_url(link_el.get("href", ""))
            if not real_url:
                continue

            snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
            snippet = snippet_el.get_text(strip=True) if snippet_el else ""

            if add_result(title_el.get_text(strip=True), real_url, snippet):
                break

        # Strategy 2: Fallback — look for any <a> containing an <h3>
        if not results:
            for a_tag in soup.find_all("a", href=True):
                h3 = a_tag.find("h3")
                if not h3:
                    continue

                real_url = _extract_google_url(a_tag.get("href", ""))
                if not real_url:
                    continue

                # Climb from the anchor's parent three more levels up and look
                # for a snippet element inside that ancestor.
                container = a_tag.parent
                for _ in range(3):
                    if container is None:
                        break
                    container = container.parent

                snippet = ""
                if container is not None:
                    snippet_el = container.select_one("div.VwiC3b, span.aCOpRe, span.st")
                    if snippet_el:
                        snippet = snippet_el.get_text(strip=True)

                if add_result(h3.get_text(strip=True), real_url, snippet):
                    break

        # Strategy 3: Last resort — any <a data-ved> with external href
        if not results:
            for a_tag in soup.select("a[data-ved]"):
                href = a_tag.get("href", "")
                if not href.startswith(("http://", "https://")):
                    continue

                # Filter Google's own links by hostname; a substring test
                # would also drop external URLs that mention "google.com".
                try:
                    host = urllib.parse.urlparse(href).hostname or ""
                except ValueError:
                    continue
                if host == "google.com" or host.endswith(".google.com"):
                    continue

                title_el = a_tag.select_one("h3, span")
                if title_el:
                    title = title_el.get_text(strip=True)
                else:
                    # No dedicated title element: fall back to the (truncated)
                    # anchor text.
                    title = a_tag.get_text(strip=True)[:100]

                if add_result(title, href, ""):
                    break

        logger.info("Google search for '%s' returned %d results", query, len(results))
        return results

    except Exception as exc:
        # Best-effort scraper: any network/parsing failure degrades to
        # "no results" instead of propagating to the caller.
        logger.warning("Google search failed: %s", exc)
        return []


def _extract_google_url(href: str) -> str | None:
    """Extract the real URL from a Google search result link."""
    if not href:
        return None

    # Google redirect: /url?q=<real_url>&...
    if href.startswith("/url?q="):
        parsed = urllib.parse.urlparse(href)
        params = urllib.parse.parse_qs(parsed.query)
        q = params.get("q", [])
        if q:
            real_url = q[0]
            if real_url.startswith("http"):
                return real_url

    # Direct HTTP URL
    if href.startswith("http"):
        # Skip Google-internal URLs
        if any(domain in href for domain in [
            "google.com", "googleusercontent.com",
            "youtube.com", "gstatic.com",
        ]):
            return None
        return href

    return None


def format_search_results(results: list[dict[str, str]]) -> str:
    """Render search results as a numbered plain-text list for model context.

    Args:
        results: Result dicts; each must provide the keys ``title``, ``url``
            and ``snippet``.

    Returns:
        ``"No search results found."`` for an empty list. Otherwise a header
        line followed by one entry per result (title, URL and, when
        non-empty, the snippet), each entry terminated by a blank line.
    """
    if not results:
        return "No search results found."

    lines = ["Here are the web search results for reference:\n"]
    for position, hit in enumerate(results, start=1):
        entry = [f"{position}. {hit['title']}", f"   URL: {hit['url']}"]
        if hit["snippet"]:
            entry.append(f"   {hit['snippet']}")
        lines.extend(entry)
        lines.append("")  # blank separator after each entry

    return "\n".join(lines)