Spaces:
Running
Running
| """Web search via scraping — no API key needed. | |
| Strategy: | |
| 1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas) | |
| 2. Fallback: Google search with robust multi-selector parsing | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| import urllib.parse | |
| logger = logging.getLogger(__name__) | |
# Browser-like request headers shared by the scrapers in this module. They make
# each request resemble a desktop Chrome session to reduce bot detection.
_BROWSER_HEADERS = {
    # Assembled from its three product tokens; the joined value is one
    # space-separated User-Agent string.
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/125.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Search the web by scraping result pages; no API key is needed.

    DuckDuckGo's HTML endpoint is tried first. Google is queried only when
    DuckDuckGo yields no results.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``snippet``.
        The list is empty when the query is blank, when ``num_results`` is
        not positive, or when both engines yield nothing.
    """
    # A blank query or a non-positive limit cannot produce useful results,
    # so skip both network round-trips.
    if not query or not query.strip() or num_results <= 0:
        return []

    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)
def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
    """Search DuckDuckGo's HTML endpoint and parse its ``.result`` blocks.

    Args:
        query: Free-text search query; it is URL-encoded before sending.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``snippet``.
        An empty list is returned when ``num_results`` is not positive, when
        ``requests``/``beautifulsoup4`` are missing, or when the request or
        parsing fails (a warning is logged in the latter two cases).
    """
    if num_results <= 0:
        return []

    def _is_ddg_internal(url: str) -> bool:
        """Return True for links that point back at DuckDuckGo itself."""
        try:
            host = urllib.parse.urlparse(url).hostname or ""
        except ValueError:
            # A URL that cannot even be parsed is useless as a result.
            return True
        # Compare the hostname rather than the whole URL so that a page which
        # merely mentions "duckduckgo.com" in its path or query is kept.
        return any(
            host == domain or host.endswith("." + domain)
            for domain in ("duckduckgo.com", "duck.co")
        )

    try:
        # Imported lazily so the module loads without these optional packages.
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
        headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}
        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []

        # DuckDuckGo HTML wraps every hit in a .result block.
        for result_div in soup.select(".result"):
            title_el = result_div.select_one(".result__title a, .result__a")
            if not title_el:
                continue

            title = title_el.get_text(strip=True)
            # DDG links are redirects like //duckduckgo.com/l/?uddg=...
            real_url = _extract_ddg_url(title_el.get("href", ""))
            if not title or not real_url or _is_ddg_internal(real_url):
                continue

            snippet_el = result_div.select_one(".result__snippet")
            results.append({
                "title": title,
                "url": real_url,
                "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
            })
            if len(results) >= num_results:
                break

        logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
        return results
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        # Best-effort scraper: any network or parsing failure yields no results.
        logger.warning("DuckDuckGo search failed: %s", exc)
        return []
def _extract_ddg_url(href: str) -> str | None:
    """Extract the real target URL from a DuckDuckGo result link.

    Args:
        href: The ``href`` attribute of a result anchor. It may be a DDG
            redirect (``//duckduckgo.com/l/?uddg=<encoded_url>&...``, with or
            without a scheme), a direct ``http(s)`` URL, or a
            protocol-relative ``//host/path`` URL.

    Returns:
        The destination URL, or ``None`` when ``href`` is empty, malformed,
        or not an absolute/protocol-relative web link.
    """
    if not href:
        return None

    http_schemes = ("http://", "https://")

    # Check for a redirect before the "direct URL" case: absolute redirect
    # links (https://duckduckgo.com/l/?uddg=...) also start with "http".
    if "uddg=" in href:
        try:
            query = urllib.parse.urlparse(href).query
        except ValueError:
            return None
        targets = urllib.parse.parse_qs(query).get("uddg", [])
        # parse_qs has already percent-decoded the value; decoding it again
        # would corrupt targets containing encoded characters (e.g. %2520).
        if targets and targets[0].startswith(http_schemes):
            return targets[0]

    # Direct HTTP(S) URL.
    if href.startswith(http_schemes):
        return href

    # Protocol-relative link: assume HTTPS.
    if href.startswith("//"):
        return "https:" + href

    return None
def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
    """Search Google by scraping the results page. Fallback method.

    Three parsing strategies are tried in order, each used only when the
    previous one found nothing: standard ``div.g`` blocks, any ``<a>`` that
    wraps an ``<h3>``, and finally any ``<a data-ved>`` with an external href.

    Args:
        query: Free-text search query; it is URL-encoded before sending.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``snippet``.
        An empty list is returned when ``num_results`` is not positive, when
        ``requests``/``beautifulsoup4`` are missing, or when the request or
        parsing fails (a warning is logged in the latter two cases).
    """
    if num_results <= 0:
        return []

    try:
        # Imported lazily so the module loads without these optional packages.
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
        headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}

        # Use the session as a context manager so it is closed even when a
        # request raises.
        with requests.Session() as session:
            # First get a cookie from Google.
            session.get("https://www.google.com/", headers=headers, timeout=5)
            resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
            resp.raise_for_status()
            html = resp.text

        soup = BeautifulSoup(html, "html.parser")
        results = (
            _google_results_from_blocks(soup, num_results)
            or _google_results_from_heading_links(soup, num_results)
            or _google_results_from_ved_links(soup, num_results)
        )

        logger.info("Google search for '%s' returned %d results", query, len(results))
        return results
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        # Best-effort scraper: any network or parsing failure yields no results.
        logger.warning("Google search failed: %s", exc)
        return []


def _google_results_from_blocks(soup, limit: int) -> list[dict[str, str]]:
    """Strategy 1: parse ``div.g`` result blocks (h3 title, first link, snippet)."""
    results: list[dict[str, str]] = []
    seen_urls: set[str] = set()
    for block in soup.select("div.g"):
        title_el = block.select_one("h3")
        link_el = block.select_one("a[href]")
        if not title_el or not link_el:
            continue

        real_url = _extract_google_url(link_el.get("href", ""))
        title = title_el.get_text(strip=True)
        # Nested div.g blocks would otherwise yield the same hit twice.
        if not real_url or not title or real_url in seen_urls:
            continue
        seen_urls.add(real_url)

        snippet_el = block.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
        results.append({
            "title": title,
            "url": real_url,
            "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
        })
        if len(results) >= limit:
            break
    return results


def _google_results_from_heading_links(soup, limit: int) -> list[dict[str, str]]:
    """Strategy 2: treat every ``<a>`` that contains an ``<h3>`` as a result."""
    results: list[dict[str, str]] = []
    seen_urls: set[str] = set()
    for a_tag in soup.find_all("a", href=True):
        h3 = a_tag.find("h3")
        if not h3:
            continue

        real_url = _extract_google_url(a_tag.get("href", ""))
        title = h3.get_text(strip=True)
        if not real_url or not title or real_url in seen_urls:
            continue
        seen_urls.add(real_url)

        # Look for a snippet in up to three ancestors above the link's parent,
        # nearest first. Stop at the first hit: a wider ancestor may span
        # several results, and its first snippet could belong to another one.
        snippet = ""
        ancestor = a_tag.parent
        for _ in range(3):
            ancestor = ancestor.parent if ancestor is not None else None
            if ancestor is None:
                break
            snippet_el = ancestor.select_one("div.VwiC3b, span.aCOpRe, span.st")
            if snippet_el:
                snippet = snippet_el.get_text(strip=True)
                break

        results.append({
            "title": title,
            "url": real_url,
            "snippet": snippet,
        })
        if len(results) >= limit:
            break
    return results


def _google_results_from_ved_links(soup, limit: int) -> list[dict[str, str]]:
    """Strategy 3: last resort, any ``<a data-ved>`` with an external href."""
    results: list[dict[str, str]] = []
    seen_urls: set[str] = set()
    for a_tag in soup.select("a[data-ved]"):
        href = a_tag.get("href", "")
        if not href.startswith("http") or "google.com" in href:
            continue
        if href in seen_urls:
            continue

        title_el = a_tag.select_one("h3, span")
        # Without a heading or span, fall back to the link text, capped at 100 chars.
        title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
        if not title:
            continue
        seen_urls.add(href)

        results.append({
            "title": title,
            "url": href,
            "snippet": "",
        })
        if len(results) >= limit:
            break
    return results
def _extract_google_url(href: str) -> str | None:
    """Extract the real target URL from a Google search result link.

    Args:
        href: The ``href`` attribute of a result anchor. It may be a Google
            redirect (``/url?q=<real_url>&...`` or ``/url?...&url=<real_url>``)
            or a direct ``http(s)`` URL.

    Returns:
        The destination URL, or ``None`` when ``href`` is empty, malformed,
        not a web link, or a direct link to a Google-owned host
        (google.com, googleusercontent.com, youtube.com, gstatic.com).
    """
    if not href:
        return None

    try:
        parsed = urllib.parse.urlparse(href)
        host = parsed.hostname or ""
    except ValueError:
        # Malformed link (e.g. broken IPv6 literal): not a usable result.
        return None

    http_schemes = ("http://", "https://")

    # Google redirect. The target is not always the first query parameter,
    # so parse the whole query instead of matching the "/url?q=" prefix.
    if href.startswith("/url?"):
        params = urllib.parse.parse_qs(parsed.query)
        targets = params.get("q") or params.get("url") or []
        if targets and targets[0].startswith(http_schemes):
            return targets[0]
        return None

    # Direct HTTP(S) URL.
    if href.startswith(http_schemes):
        internal_domains = (
            "google.com", "googleusercontent.com",
            "youtube.com", "gstatic.com",
        )
        # Match on the hostname, not the whole URL, so external pages that
        # merely mention one of these domains in their path are kept.
        if any(host == domain or host.endswith("." + domain) for domain in internal_domains):
            return None
        return href

    return None
def format_search_results(results: list[dict[str, str]]) -> str:
    """Format search results into a text block for model context.

    Args:
        results: Result dicts with the keys ``title``, ``url`` and
            ``snippet``. A missing key is treated as an empty value rather
            than raising ``KeyError``.

    Returns:
        ``"No search results found."`` for an empty input. Otherwise a header
        line followed by one numbered entry per result (title, URL, optional
        snippet), each entry terminated by a blank line.
    """
    if not results:
        return "No search results found."

    lines = ["Here are the web search results for reference:\n"]
    for index, result in enumerate(results, 1):
        lines.append(f"{index}. {result.get('title', '')}")
        # NOTE(review): detail lines are indented by a single space; confirm
        # this matches the intended layout.
        lines.append(f" URL: {result.get('url', '')}")
        snippet = result.get("snippet")
        if snippet:
            lines.append(f" {snippet}")
        # Blank separator line after every entry.
        lines.append("")
    return "\n".join(lines)