| """Web search via scraping — no API key needed. |
| |
| Strategy: |
| 1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas) |
| 2. Fallback: Google search with robust multi-selector parsing |
| """ |
|
|
| from __future__ import annotations |
|
|
| import logging |
| import re |
| import urllib.parse |
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Request headers that imitate a desktop Chrome browser. The search helpers
# in this module copy this mapping and add their own "Referer" entry.
_BROWSER_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/125.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
|
|
|
|
def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Search the web by scraping. No API key needed.

    Despite the name, DuckDuckGo is queried first; Google is only used as a
    fallback when DuckDuckGo yields no results.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``title``, ``url`` and ``snippet``. The list
        is empty when the query is blank, when ``num_results`` is not
        positive, or when both backends return nothing.
    """
    # Nothing useful can come back for a blank query or a non-positive limit,
    # so skip the network round-trips entirely.
    if not query or not query.strip() or num_results <= 0:
        return []

    # An empty list from DuckDuckGo is falsy, which triggers the fallback.
    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)
|
|
|
|
def _is_external_ddg_result(url: str) -> bool:
    """Return True when *url* parses and is not hosted on a DuckDuckGo domain."""
    try:
        host = (urllib.parse.urlparse(url).hostname or "").lower()
    except ValueError:
        # Malformed netloc: not a usable result link.
        return False
    if not host:
        return False
    # Compare hostnames rather than substrings so that third-party pages that
    # merely mention "duckduckgo.com" in their path or query are kept.
    return not any(
        host == domain or host.endswith("." + domain)
        for domain in ("duckduckgo.com", "duck.co")
    )


def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
    """Search the DuckDuckGo HTML endpoint and parse the result list.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``title``, ``url`` and ``snippet``. An empty
        list is returned when ``num_results`` is not positive, when
        ``requests``/``bs4`` cannot be imported, or when the request or the
        parsing fails (the failure is logged, never raised).
    """
    if num_results <= 0:
        return []

    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []

    encoded_query = urllib.parse.quote_plus(query)
    url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
    headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}

    try:
        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []

        for result_div in soup.select(".result"):
            title_el = result_div.select_one(".result__title a, .result__a")
            if title_el is None:
                continue

            title = title_el.get_text(strip=True)
            # Result links are usually DuckDuckGo redirects; unwrap them.
            real_url = _extract_ddg_url(title_el.get("href", ""))
            if not title or not real_url:
                continue

            # Skip links that stay on DuckDuckGo's own hosts.
            if not _is_external_ddg_result(real_url):
                continue

            snippet_el = result_div.select_one(".result__snippet")
            results.append({
                "title": title,
                "url": real_url,
                "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
            })

            if len(results) >= num_results:
                break
    except Exception as exc:
        # Best-effort backend: any network or parsing failure means "no results".
        logger.warning("DuckDuckGo search failed: %s", exc)
        return []

    logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
    return results
|
|
|
|
def _extract_ddg_url(href: str) -> str | None:
    """Extract the real URL from a DuckDuckGo result link.

    Handles, in order:

    1. Redirect links on DuckDuckGo (relative, protocol-relative or absolute)
       that carry the destination in the ``uddg`` query parameter.
    2. Plain absolute ``http(s)`` links, returned unchanged.
    3. Protocol-relative links (``//host/path``), which get ``https:``.

    Args:
        href: The raw ``href`` attribute of a result anchor.

    Returns:
        The destination URL, or ``None`` when *href* is empty, malformed, or
        not one of the recognised forms.
    """
    if not href:
        return None

    try:
        parsed = urllib.parse.urlparse(href)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets).
        return None

    # The redirect check must run before the plain "http" check: otherwise an
    # absolute redirect link would be returned as-is instead of being unwrapped.
    on_duckduckgo = (
        not host or host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    )
    if on_duckduckgo:
        # parse_qs already percent-decodes values; decoding a second time
        # would corrupt targets that contain escapes such as %26 or %25.
        for target in urllib.parse.parse_qs(parsed.query).get("uddg", []):
            if target.startswith(("http://", "https://")):
                return target

    if href.startswith(("http://", "https://")):
        return href

    if href.startswith("//"):
        return "https:" + href

    return None
|
|
|
|
def _google_results_from_containers(soup, limit: int) -> list[dict[str, str]]:
    """Strategy 1: read title, link and snippet from each ``div.g`` container."""
    results: list[dict[str, str]] = []
    for container in soup.select("div.g"):
        title_el = container.select_one("h3")
        link_el = container.select_one("a[href]")
        if title_el is None or link_el is None:
            continue

        real_url = _extract_google_url(link_el.get("href", ""))
        title = title_el.get_text(strip=True)
        if not real_url or not title:
            continue

        snippet_el = container.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
        results.append({
            "title": title,
            "url": real_url,
            "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
        })
        if len(results) >= limit:
            break
    return results


def _google_results_from_heading_links(soup, limit: int) -> list[dict[str, str]]:
    """Strategy 2: use every anchor that wraps an ``<h3>`` as a result link."""
    results: list[dict[str, str]] = []
    for anchor in soup.find_all("a", href=True):
        heading = anchor.find("h3")
        if heading is None:
            continue

        real_url = _extract_google_url(anchor.get("href", ""))
        title = heading.get_text(strip=True)
        if not real_url or not title:
            continue

        # Look for a snippet in up to three ancestors, starting at the
        # grandparent. Stop at the first hit: a wider ancestor may contain
        # other results, whose snippet would otherwise overwrite this one.
        snippet = ""
        ancestor = anchor.parent
        for _ in range(3):
            ancestor = ancestor.parent if ancestor is not None else None
            if ancestor is None:
                break
            snippet_el = ancestor.select_one("div.VwiC3b, span.aCOpRe, span.st")
            if snippet_el is not None:
                snippet = snippet_el.get_text(strip=True)
                break

        results.append({"title": title, "url": real_url, "snippet": snippet})
        if len(results) >= limit:
            break
    return results


def _google_results_from_tracked_links(soup, limit: int) -> list[dict[str, str]]:
    """Strategy 3: take absolute, non-Google ``a[data-ved]`` links (no snippets)."""
    results: list[dict[str, str]] = []
    for anchor in soup.select("a[data-ved]"):
        href = anchor.get("href", "")
        if not href.startswith("http") or "google.com" in href:
            continue

        title_el = anchor.select_one("h3, span")
        if title_el is not None:
            title = title_el.get_text(strip=True)
        else:
            # No heading/span: fall back to the (truncated) anchor text.
            title = anchor.get_text(strip=True)[:100]
        if not title:
            continue

        results.append({"title": title, "url": href, "snippet": ""})
        if len(results) >= limit:
            break
    return results


def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
    """Search Google by scraping the results page. Fallback method.

    Three parsing strategies are tried in order; a later one only runs when
    the previous ones produced nothing.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``title``, ``url`` and ``snippet``. An empty
        list is returned when ``num_results`` is not positive, when
        ``requests``/``bs4`` cannot be imported, or when the request or the
        parsing fails (the failure is logged, never raised).
    """
    if num_results <= 0:
        return []

    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []

    encoded_query = urllib.parse.quote_plus(query)
    url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
    headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}

    try:
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            # Hit the home page first on the same session
            # (presumably to pick up cookies before searching).
            session.get("https://www.google.com/", headers=headers, timeout=5)

            resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
            resp.raise_for_status()
            html = resp.text

        soup = BeautifulSoup(html, "html.parser")
        results = (
            _google_results_from_containers(soup, num_results)
            or _google_results_from_heading_links(soup, num_results)
            or _google_results_from_tracked_links(soup, num_results)
        )
    except Exception as exc:
        # Best-effort backend: any network or parsing failure means "no results".
        logger.warning("Google search failed: %s", exc)
        return []

    logger.info("Google search for '%s' returned %d results", query, len(results))
    return results
|
|
|
|
def _host_matches_any(host: str, domains: tuple[str, ...]) -> bool:
    """Return True when *host* equals, or is a subdomain of, one of *domains*."""
    return any(host == domain or host.endswith("." + domain) for domain in domains)


def _extract_google_url(href: str) -> str | None:
    """Extract the real URL from a Google search result link.

    Handles two link forms:

    * Redirect links whose path is ``/url`` (relative, or absolute on a
      google.com host). The destination is read from the ``q`` parameter, or
      ``url`` as a fallback, wherever it appears in the query string. The
      destination is returned as-is, without applying the host blocklist.
    * Direct absolute ``http(s)`` links, which are rejected when their host is
      a Google-owned domain (google.com, googleusercontent.com, youtube.com,
      gstatic.com) or a subdomain of one.

    Args:
        href: The raw ``href`` attribute of an anchor on the results page.

    Returns:
        The destination URL, or ``None`` when *href* is empty, malformed,
        blocked, or not one of the recognised forms.
    """
    if not href:
        return None

    try:
        parsed = urllib.parse.urlparse(href)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets).
        return None

    # Redirect wrapper: "/url?...q=<target>" or "/url?...url=<target>".
    if parsed.path == "/url" and (
        not host or _host_matches_any(host, ("google.com",))
    ):
        params = urllib.parse.parse_qs(parsed.query)
        for key in ("q", "url"):
            for target in params.get(key, []):
                if target.startswith(("http://", "https://")):
                    return target
        return None

    if href.startswith(("http://", "https://")):
        if not host:
            return None
        # Match on the hostname, not on the whole URL, so that unrelated pages
        # mentioning e.g. "google.com" in their path are not discarded.
        blocked = (
            "google.com", "googleusercontent.com",
            "youtube.com", "gstatic.com",
        )
        if _host_matches_any(host, blocked):
            return None
        return href

    return None
|
|
|
|
def format_search_results(results: list[dict[str, str]]) -> str:
    """Format search results into a text block for model context.

    Args:
        results: Result dicts, normally with keys ``title``, ``url`` and
            ``snippet``. Missing keys are tolerated: a missing title or URL is
            rendered as an empty string, and a missing or empty snippet is
            omitted.

    Returns:
        ``"No search results found."`` when *results* is empty; otherwise a
        header line followed by one numbered entry per result, each entry
        terminated by a blank line.
    """
    if not results:
        return "No search results found."

    lines = ["Here are the web search results for reference:\n"]
    for index, item in enumerate(results, 1):
        lines.append(f"{index}. {item.get('title', '')}")
        lines.append(f" URL: {item.get('url', '')}")
        snippet = item.get("snippet")
        if snippet:
            lines.append(f" {snippet}")
        # Blank separator after every entry (also yields the trailing newline).
        lines.append("")

    return "\n".join(lines)
|
|