"""Web search via scraping — no API key needed.

Strategy:
1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
2. Fallback: Google search with robust multi-selector parsing
"""

from __future__ import annotations

import logging
import re
import urllib.parse

logger = logging.getLogger(__name__)

# Common browser-like headers to avoid bot detection
_BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# Hosts whose links are search-engine plumbing rather than real results.
_DDG_INTERNAL_DOMAINS = ("duckduckgo.com", "duck.co")
_GOOGLE_INTERNAL_DOMAINS = (
    "google.com",
    "googleusercontent.com",
    "youtube.com",
    "gstatic.com",
)


def _host_matches(url: str, domains: tuple[str, ...]) -> bool:
    """Return True if the host of *url* is one of *domains* or a subdomain.

    Matching is done on the parsed hostname, not on the raw URL text, so a
    domain name that merely appears in a path or query string does not match.
    Unparseable URLs never match.
    """
    try:
        host = urllib.parse.urlparse(url).hostname or ""
    except ValueError:
        return False
    host = host.lower()
    return any(host == d or host.endswith("." + d) for d in domains)


def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Search the web by scraping. No API key needed.

    Tries DuckDuckGo first (more scraper-friendly), then falls back to
    Google if DuckDuckGo returns nothing.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys: title, url, snippet. The list is empty
        when the query is blank, ``num_results`` is not positive, or both
        back ends fail or return nothing.
    """
    # Nothing sensible to fetch: avoid the network round trips entirely.
    if not query or not query.strip() or num_results <= 0:
        return []

    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)


def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
    """Search DuckDuckGo HTML version — very scraper-friendly.

    Returns up to *num_results* dicts (title, url, snippet). Any failure
    (missing dependency, network error, HTTP error status) is logged and
    results in an empty list rather than an exception.
    """
    try:
        # Imported lazily so the module can be imported without these packages.
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
        headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}

        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []
        seen_urls: set[str] = set()

        # DuckDuckGo HTML uses .result blocks
        for result_div in soup.select(".result"):
            if len(results) >= num_results:
                break

            title_el = result_div.select_one(".result__title a, .result__a")
            if not title_el:
                continue

            # DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
            real_url = _extract_ddg_url(title_el.get("href", ""))
            if not real_url or real_url in seen_urls:
                continue

            # Skip internal URLs (ads, settings, ...) based on the host only,
            # so external pages that merely mention the domain are kept.
            if _host_matches(real_url, _DDG_INTERNAL_DOMAINS):
                continue

            title = title_el.get_text(strip=True)
            if not title:
                continue

            snippet_el = result_div.select_one(".result__snippet")
            snippet = snippet_el.get_text(strip=True) if snippet_el else ""

            seen_urls.add(real_url)
            results.append({
                "title": title,
                "url": real_url,
                "snippet": snippet,
            })

        logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
        return results

    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        # Best-effort scraper: the caller falls back to Google on failure.
        logger.warning("DuckDuckGo search failed: %s", exc)
        return []


def _extract_ddg_url(href: str) -> str | None:
    """Extract the real URL from a DuckDuckGo redirect link.

    Returns the decoded target of a ``uddg`` redirect, the href itself for a
    direct HTTP(S) link, an ``https:``-prefixed URL for a protocol-relative
    link, and ``None`` for anything else.
    """
    if not href:
        return None

    # DDG redirect: //duckduckgo.com/l/?uddg=ENCODED_URL&...
    # Checked before the direct-URL case so that absolute redirect links
    # (https://duckduckgo.com/l/?uddg=...) are unwrapped as well.
    if "uddg=" in href:
        try:
            parsed = urllib.parse.urlparse(href)
        except ValueError:
            parsed = None
        if parsed is not None and (
            not parsed.netloc or _host_matches(href, _DDG_INTERNAL_DOMAINS)
        ):
            targets = urllib.parse.parse_qs(parsed.query).get("uddg", [])
            if targets:
                # parse_qs has already percent-decoded the value; decoding a
                # second time would corrupt escapes inside the target URL.
                return targets[0]

    # Direct HTTP URL
    if href.startswith("http"):
        return href

    # Sometimes it's a protocol-relative link
    if href.startswith("//"):
        return "https:" + href

    return None


def _find_nearby_snippet(a_tag) -> str:
    """Return snippet text from the closest ancestor of *a_tag* that has one.

    Inspects up to three ancestors, starting at the grandparent of the
    anchor, and stops at the first match so the snippet belongs to the
    tightest enclosing container. Returns "" when nothing is found.
    """
    ancestor = a_tag.parent
    for _ in range(3):
        if ancestor is None:
            break
        ancestor = ancestor.parent
        if ancestor is None:
            break
        snippet_el = ancestor.select_one("div.VwiC3b, span.aCOpRe, span.st")
        if snippet_el:
            return snippet_el.get_text(strip=True)
    return ""


def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
    """Search Google by scraping the results page. Fallback method.

    Three parsing strategies are tried in order; a later one runs only if
    the earlier ones produced nothing. Returns up to *num_results* dicts
    (title, url, snippet). Any failure is logged and yields an empty list.
    """
    try:
        # Imported lazily so the module can be imported without these packages.
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
        headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}

        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            # First get a cookie from Google
            session.get("https://www.google.com/", headers=headers, timeout=5)
            resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
            resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []
        seen_urls: set[str] = set()

        def add_result(title: str, link: str, snippet: str) -> None:
            """Record one result, skipping blanks and already-seen URLs."""
            if not title or not link or link in seen_urls:
                return
            seen_urls.add(link)
            results.append({
                "title": title,
                "url": link,
                "snippet": snippet,
            })

        # Strategy 1: Modern Google layout — div.g > div.yuRUbf (title+link)
        # + div.VwiC3b (snippet)
        for g_div in soup.select("div.g"):
            if len(results) >= num_results:
                break
            title_el = g_div.select_one("h3")
            link_el = g_div.select_one("a[href]")
            if not title_el or not link_el:
                continue

            real_url = _extract_google_url(link_el.get("href", ""))
            if not real_url:
                continue

            snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
            snippet = snippet_el.get_text(strip=True) if snippet_el else ""
            add_result(title_el.get_text(strip=True), real_url, snippet)

        # Strategy 2: Fallback — look for any anchor tag containing an h3 heading
        if not results:
            for a_tag in soup.find_all("a", href=True):
                if len(results) >= num_results:
                    break
                h3 = a_tag.find("h3")
                if not h3:
                    continue

                real_url = _extract_google_url(a_tag.get("href", ""))
                if not real_url:
                    continue

                add_result(h3.get_text(strip=True), real_url, _find_nearby_snippet(a_tag))

        # Strategy 3: Last resort — any anchor carrying data-ved with an external href
        if not results:
            for a_tag in soup.select("a[data-ved]"):
                if len(results) >= num_results:
                    break
                href = a_tag.get("href", "")
                if not href.startswith("http"):
                    continue
                if _host_matches(href, ("google.com",)):
                    continue

                title_el = a_tag.select_one("h3, span")
                if title_el:
                    title = title_el.get_text(strip=True)
                else:
                    title = a_tag.get_text(strip=True)[:100]
                add_result(title, href, "")

        logger.info("Google search for '%s' returned %d results", query, len(results))
        return results

    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        # Best-effort scraper: report the failure and return nothing.
        logger.warning("Google search failed: %s", exc)
        return []


def _extract_google_url(href: str) -> str | None:
    """Extract the real URL from a Google search result link.

    Returns the ``q`` target of a ``/url?q=...`` redirect when it is an
    HTTP(S) URL, the href itself for a direct HTTP(S) link whose host is not
    one of the Google-internal domains, and ``None`` otherwise.
    """
    if not href:
        return None

    # Google redirect: /url?q=TARGET_URL&...
    if href.startswith("/url?q="):
        parsed = urllib.parse.urlparse(href)
        targets = urllib.parse.parse_qs(parsed.query).get("q", [])
        if targets and targets[0].startswith("http"):
            return targets[0]

    # Direct HTTP URL
    if href.startswith("http"):
        # Skip Google-internal URLs; compare the host only so that external
        # pages mentioning one of these domains in their path are kept.
        if _host_matches(href, _GOOGLE_INTERNAL_DOMAINS):
            return None
        return href

    return None


def format_search_results(results: list[dict[str, str]]) -> str:
    """Format search results into a text block for model context.

    Args:
        results: Result dicts as returned by ``web_search_google``. Missing
            ``title``/``url`` keys render as empty text and a missing or
            empty ``snippet`` is omitted.

    Returns:
        A numbered, newline-separated listing, or a fixed message when
        *results* is empty.
    """
    if not results:
        return "No search results found."

    parts = ["Here are the web search results for reference:\n"]
    for i, r in enumerate(results, 1):
        parts.append(f"{i}. {r.get('title', '')}")
        parts.append(f" URL: {r.get('url', '')}")
        snippet = r.get("snippet")
        if snippet:
            parts.append(f" {snippet}")
        parts.append("")

    return "\n".join(parts)