# Source listing captured from the Hugging Face file viewer (page header, not code):
#   sonicoder / code /websearch /google_scraper.py
#   R-Kentaren's picture
#   Upload folder using huggingface_hub
#   ccb935d verified
#   raw
#   history blame contribute delete
#   9.39 kB
"""Web search via scraping — no API key needed.
Strategy:
1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
2. Fallback: Google search with robust multi-selector parsing
"""
from __future__ import annotations
import logging
import re
import urllib.parse
logger = logging.getLogger(__name__)
# Common browser-like headers to avoid bot detection
_BROWSER_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/125.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Run a web search for *query* by scraping; no API key is required.

    DuckDuckGo is queried first. Google is scraped only when DuckDuckGo
    yields an empty result list.

    Returns:
        A list of dicts, each with the keys ``title``, ``url`` and
        ``snippet``. The list is whatever the Google fallback returned when
        DuckDuckGo produced nothing.
    """
    # `or` hands back the DuckDuckGo list when it is non-empty and otherwise
    # evaluates (and returns) the Google fallback.
    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)
def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
"""Search DuckDuckGo HTML version — very scraper-friendly."""
try:
import requests
from bs4 import BeautifulSoup
encoded_query = urllib.parse.quote_plus(query)
url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}
resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
results: list[dict[str, str]] = []
# DuckDuckGo HTML uses .result blocks
for result_div in soup.select(".result"):
title_el = result_div.select_one(".result__title a, .result__a")
snippet_el = result_div.select_one(".result__snippet")
if not title_el:
continue
title = title_el.get_text(strip=True)
# DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
href = title_el.get("href", "")
real_url = _extract_ddg_url(href)
if not real_url:
continue
# Skip internal URLs
if any(domain in real_url for domain in ["duckduckgo.com", "duck.co"]):
continue
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
if title and real_url:
results.append({
"title": title,
"url": real_url,
"snippet": snippet,
})
if len(results) >= num_results:
break
logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
return results
except ImportError:
logger.warning("requests or beautifulsoup4 not installed for web search")
return []
except Exception as exc:
logger.warning("DuckDuckGo search failed: %s", exc)
return []
def _extract_ddg_url(href: str) -> str | None:
"""Extract the real URL from a DuckDuckGo redirect link."""
if not href:
return None
# Direct HTTP URL
if href.startswith("http"):
return href
# DDG redirect: //duckduckgo.com/l/?uddg=<encoded_url>&...
if "uddg=" in href:
parsed = urllib.parse.urlparse(href)
params = urllib.parse.parse_qs(parsed.query)
uddg = params.get("uddg", [])
if uddg:
return urllib.parse.unquote(uddg[0])
# Sometimes it's a relative redirect
if href.startswith("//"):
return "https:" + href
return None
def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
"""Search Google by scraping the results page. Fallback method."""
try:
import requests
from bs4 import BeautifulSoup
encoded_query = urllib.parse.quote_plus(query)
url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}
session = requests.Session()
# First get a cookie from Google
session.get("https://www.google.com/", headers=headers, timeout=5)
resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
results: list[dict[str, str]] = []
# Strategy 1: Modern Google layout — div.g > div.yuRUbf (title+link) + div.VwiC3b (snippet)
for g_div in soup.select("div.g"):
title_el = g_div.select_one("h3")
link_el = g_div.select_one("a[href]")
snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
if not title_el or not link_el:
continue
href = link_el.get("href", "")
real_url = _extract_google_url(href)
if not real_url:
continue
title = title_el.get_text(strip=True)
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
if title and real_url:
results.append({
"title": title,
"url": real_url,
"snippet": snippet,
})
if len(results) >= num_results:
break
# Strategy 2: Fallback — look for any <a> containing an <h3>
if not results:
for a_tag in soup.find_all("a", href=True):
h3 = a_tag.find("h3")
if not h3:
continue
href = a_tag.get("href", "")
real_url = _extract_google_url(href)
if not real_url:
continue
title = h3.get_text(strip=True)
# Try to find a sibling or nearby snippet
snippet = ""
parent = a_tag.parent
if parent:
for _ in range(3):
parent = parent.parent if parent else None
if parent:
snippet_el = parent.select_one("div.VwiC3b, span.aCOpRe, span.st")
if snippet_el:
snippet = snippet_el.get_text(strip=True)
if title and real_url:
results.append({
"title": title,
"url": real_url,
"snippet": snippet,
})
if len(results) >= num_results:
break
# Strategy 3: Last resort — any <a data-ved> with external href
if not results:
for a_tag in soup.select("a[data-ved]"):
href = a_tag.get("href", "")
if not href.startswith("http"):
continue
if "google.com" in href:
continue
title_el = a_tag.select_one("h3, span")
title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
if title and href:
results.append({
"title": title,
"url": href,
"snippet": "",
})
if len(results) >= num_results:
break
logger.info("Google search for '%s' returned %d results", query, len(results))
return results
except ImportError:
logger.warning("requests or beautifulsoup4 not installed for web search")
return []
except Exception as exc:
logger.warning("Google search failed: %s", exc)
return []
def _extract_google_url(href: str) -> str | None:
"""Extract the real URL from a Google search result link."""
if not href:
return None
# Google redirect: /url?q=<real_url>&...
if href.startswith("/url?q="):
parsed = urllib.parse.urlparse(href)
params = urllib.parse.parse_qs(parsed.query)
q = params.get("q", [])
if q:
real_url = q[0]
if real_url.startswith("http"):
return real_url
# Direct HTTP URL
if href.startswith("http"):
# Skip Google-internal URLs
if any(domain in href for domain in [
"google.com", "googleusercontent.com",
"youtube.com", "gstatic.com",
]):
return None
return href
return None
def format_search_results(results: list[dict[str, str]]) -> str:
    """Render search results as a numbered plain-text block for model context.

    Each result contributes a numbered title line, a URL line, an optional
    snippet line (omitted when the snippet is empty) and a blank separator
    line.

    Args:
        results: Result dicts; each must have the keys ``title``, ``url``
            and ``snippet``.

    Returns:
        The formatted text, or ``"No search results found."`` when *results*
        is empty.
    """
    if not results:
        return "No search results found."

    lines = ["Here are the web search results for reference:\n"]
    for index, entry in enumerate(results, start=1):
        lines.extend((f"{index}. {entry['title']}", f" URL: {entry['url']}"))
        snippet = entry["snippet"]
        if snippet:
            lines.append(f" {snippet}")
        # Blank separator after every entry (also yields the trailing newline).
        lines.append("")
    return "\n".join(lines)