Spaces:

sonic-coder
/

sonicoder

Running

App Files Files Community

sonicoder / code /websearch /google_scraper.py

R-Kentaren

Upload folder using huggingface_hub

ccb935d verified about 22 hours ago

raw

history blame contribute delete

9.39 kB

	"""Web search via scraping — no API key needed.

	Strategy:
	1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
	2. Fallback: Google search with robust multi-selector parsing
	"""

	from __future__ import annotations

	import logging
	import re
	import urllib.parse

	logger = logging.getLogger(__name__)

	# Browser-like request headers sent with every search request so the scraper
	# is less likely to be classified as a bot. Callers copy this dict and add a
	# per-site "Referer"; it is never mutated in place.
	_BROWSER_HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) "
	        "Chrome/125.0.0.0 Safari/537.36"
	    ),
	    # The final media range must be the "*/*" wildcard; a bare "/" is not a
	    # valid media type and makes the header malformed.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	    "Accept-Language": "en-US,en;q=0.5",
	    "Accept-Encoding": "gzip, deflate",
	    "DNT": "1",
	    "Connection": "keep-alive",
	    "Upgrade-Insecure-Requests": "1",
	}


	def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
	    """Search the web by scraping. No API key needed.

	    DuckDuckGo is tried first (more scraper-friendly); Google is used only
	    when DuckDuckGo returns nothing.

	    Args:
	        query: Search terms.
	        num_results: Maximum number of results to return.

	    Returns:
	        A list of dicts with keys ``title``, ``url`` and ``snippet``. The list
	        is empty when the query is blank, when ``num_results`` is not
	        positive, or when both back ends return nothing.
	    """
	    # Nothing useful can come back for a blank query or a non-positive limit,
	    # so skip the network round-trips entirely.
	    if not query or not query.strip() or num_results <= 0:
	        return []

	    # `or` falls through to Google only when DuckDuckGo produced an empty list.
	    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)


	def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
	    """Search the DuckDuckGo HTML endpoint and parse its ``.result`` blocks.

	    Args:
	        query: Search terms; URL-encoded before being sent.
	        num_results: Maximum number of results to return.

	    Returns:
	        Up to ``num_results`` dicts with keys ``title``, ``url`` and
	        ``snippet``. An empty list is returned when ``num_results`` is not
	        positive, when ``requests``/``bs4`` cannot be imported, or when the
	        request or parsing fails (the failure is logged, never raised).
	    """
	    if num_results <= 0:
	        return []

	    # Imported lazily so the module loads without the optional dependencies.
	    try:
	        import requests
	        from bs4 import BeautifulSoup
	    except ImportError:
	        logger.warning("requests or beautifulsoup4 not installed for web search")
	        return []

	    try:
	        encoded_query = urllib.parse.quote_plus(query)
	        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
	        headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}

	        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
	        resp.raise_for_status()

	        soup = BeautifulSoup(resp.text, "html.parser")
	        results: list[dict[str, str]] = []

	        # DuckDuckGo HTML uses .result blocks
	        for result_div in soup.select(".result"):
	            title_el = result_div.select_one(".result__title a, .result__a")
	            if not title_el:
	                continue

	            # DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
	            real_url = _extract_ddg_url(title_el.get("href", ""))
	            if not real_url or _is_ddg_internal_url(real_url):
	                continue

	            title = title_el.get_text(strip=True)
	            if not title:
	                continue

	            snippet_el = result_div.select_one(".result__snippet")
	            results.append({
	                "title": title,
	                "url": real_url,
	                "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
	            })

	            if len(results) >= num_results:
	                break

	        logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
	        return results

	    except Exception as exc:
	        # Best-effort scraper: any network or parsing failure yields no results
	        # so the caller can fall back to another engine.
	        logger.warning("DuckDuckGo search failed: %s", exc)
	        return []


	def _is_ddg_internal_url(url: str) -> bool:
	    """Return True when ``url`` is hosted on a DuckDuckGo-owned domain.

	    The host name is compared rather than the whole URL, so an external page
	    that merely mentions ``duckduckgo.com`` in its path or query, or whose
	    host only contains ``duck.co`` as a substring, is not discarded.
	    """
	    try:
	        host = (urllib.parse.urlparse(url).hostname or "").lower()
	    except ValueError:
	        # Unparseable host: it cannot be matched against the internal domains.
	        return False
	    return any(
	        host == domain or host.endswith("." + domain)
	        for domain in ("duckduckgo.com", "duck.co")
	    )


	def _extract_ddg_url(href: str) -> str | None:
	    """Extract the real URL from a DuckDuckGo result link.

	    Args:
	        href: Raw ``href`` attribute of a result anchor.

	    Returns:
	        The destination URL, or ``None`` when ``href`` is empty, cannot be
	        parsed, or is not one of the recognised link forms.
	    """
	    if not href:
	        return None

	    try:
	        parsed = urllib.parse.urlparse(href)
	        host = (parsed.hostname or "").lower()
	    except ValueError:
	        # Malformed host (e.g. an unbalanced IPv6 bracket): unusable link.
	        return None

	    # DDG redirect: //duckduckgo.com/l/?uddg=<encoded_url>&...
	    # Checked before the plain-URL case so an absolute
	    # https://duckduckgo.com/l/?uddg=... link is resolved to its target
	    # instead of being returned as a DuckDuckGo URL.
	    on_ddg = not host or host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
	    if on_ddg:
	        targets = urllib.parse.parse_qs(parsed.query).get("uddg")
	        if targets:
	            # parse_qs has already percent-decoded the value; decoding again
	            # would corrupt targets that contain a literal "%xx".
	            return targets[0]

	    # Direct HTTP URL
	    if href.startswith(("http://", "https://")):
	        return href

	    # Protocol-relative link: assume HTTPS.
	    if href.startswith("//"):
	        return "https:" + href

	    return None


	def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
	    """Search Google by scraping the results page. Fallback method.

	    Three parsing strategies are tried in order; each later one runs only
	    when the earlier ones produced nothing:

	    1. ``div.g`` containers holding an ``<h3>`` title and a link.
	    2. Any ``<a href>`` that wraps an ``<h3>``.
	    3. Any ``<a data-ved>`` whose href is an absolute non-Google URL.

	    Args:
	        query: Search terms; URL-encoded before being sent.
	        num_results: Maximum number of results to return.

	    Returns:
	        Up to ``num_results`` dicts with keys ``title``, ``url`` and
	        ``snippet``. An empty list is returned when ``num_results`` is not
	        positive, when ``requests``/``bs4`` cannot be imported, or when the
	        request or parsing fails (the failure is logged, never raised).
	    """
	    if num_results <= 0:
	        return []

	    # Imported lazily so the module loads without the optional dependencies.
	    try:
	        import requests
	        from bs4 import BeautifulSoup
	    except ImportError:
	        logger.warning("requests or beautifulsoup4 not installed for web search")
	        return []

	    try:
	        encoded_query = urllib.parse.quote_plus(query)
	        # Ask for a couple of extra hits because some entries get filtered out.
	        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
	        headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}

	        # The context manager closes the session so pooled connections are
	        # released even when a request raises.
	        with requests.Session() as session:
	            # First get a cookie from Google
	            session.get("https://www.google.com/", headers=headers, timeout=5)

	            resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
	            resp.raise_for_status()
	            html = resp.text

	        soup = BeautifulSoup(html, "html.parser")
	        results: list[dict[str, str]] = []

	        # Strategy 1: Modern Google layout — div.g > div.yuRUbf (title+link) + div.VwiC3b (snippet)
	        for g_div in soup.select("div.g"):
	            title_el = g_div.select_one("h3")
	            link_el = g_div.select_one("a[href]")
	            if not title_el or not link_el:
	                continue

	            real_url = _extract_google_url(link_el.get("href", ""))
	            if not real_url:
	                continue

	            title = title_el.get_text(strip=True)
	            if not title:
	                continue

	            snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
	            results.append({
	                "title": title,
	                "url": real_url,
	                "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
	            })

	            if len(results) >= num_results:
	                break

	        # Strategy 2: Fallback — look for any <a> containing an <h3>
	        if not results:
	            for a_tag in soup.find_all("a", href=True):
	                h3 = a_tag.find("h3")
	                if not h3:
	                    continue

	                real_url = _extract_google_url(a_tag.get("href", ""))
	                if not real_url:
	                    continue

	                title = h3.get_text(strip=True)
	                if not title:
	                    continue

	                # Look for a snippet in up to three ancestors above the
	                # link's parent, nearest first. Stop at the first hit: a
	                # wider ancestor may contain other results, and its first
	                # snippet would then belong to a different entry.
	                snippet = ""
	                ancestor = a_tag.parent
	                for _ in range(3):
	                    ancestor = ancestor.parent if ancestor is not None else None
	                    if ancestor is None:
	                        break
	                    snippet_el = ancestor.select_one("div.VwiC3b, span.aCOpRe, span.st")
	                    if snippet_el:
	                        snippet = snippet_el.get_text(strip=True)
	                        break

	                results.append({
	                    "title": title,
	                    "url": real_url,
	                    "snippet": snippet,
	                })

	                if len(results) >= num_results:
	                    break

	        # Strategy 3: Last resort — any <a data-ved> with external href
	        if not results:
	            for a_tag in soup.select("a[data-ved]"):
	                href = a_tag.get("href", "")
	                if not href.startswith("http"):
	                    continue
	                if "google.com" in href:
	                    continue

	                title_el = a_tag.select_one("h3, span")
	                # Without a dedicated title element, use the link text, capped
	                # at 100 characters.
	                title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
	                if not title:
	                    continue

	                results.append({
	                    "title": title,
	                    "url": href,
	                    "snippet": "",
	                })

	                if len(results) >= num_results:
	                    break

	        logger.info("Google search for '%s' returned %d results", query, len(results))
	        return results

	    except Exception as exc:
	        # Best-effort scraper: any network or parsing failure yields no results.
	        logger.warning("Google search failed: %s", exc)
	        return []


	def _extract_google_url(href: str) -> str | None:
	    """Extract the real URL from a Google search result link.

	    Args:
	        href: Raw ``href`` attribute of a result anchor.

	    Returns:
	        The external destination URL, or ``None`` when ``href`` is empty,
	        cannot be parsed, is not an absolute HTTP(S) link or Google redirect,
	        or points at a Google-owned host.
	    """
	    if not href:
	        return None

	    internal_domains = (
	        "google.com", "googleusercontent.com",
	        "youtube.com", "gstatic.com",
	    )

	    def is_internal(hostname: str) -> bool:
	        # Compare host names, not the whole URL, so an external page that
	        # merely mentions "google.com" in its path or query is kept.
	        return any(
	            hostname == domain or hostname.endswith("." + domain)
	            for domain in internal_domains
	        )

	    try:
	        parsed = urllib.parse.urlparse(href)
	        host = (parsed.hostname or "").lower()
	    except ValueError:
	        # Malformed host (e.g. an unbalanced IPv6 bracket): unusable link.
	        return None

	    # Google redirect: /url?q=<real_url>&... — accepted with the parameters in
	    # any order, and in relative or absolute (google.com-hosted) form.
	    if parsed.path == "/url" and (not host or is_internal(host)):
	        targets = urllib.parse.parse_qs(parsed.query).get("q")
	        if targets and targets[0].startswith(("http://", "https://")):
	            return targets[0]
	        return None

	    # Direct HTTP URL
	    if not href.startswith(("http://", "https://")):
	        return None

	    # Skip Google-internal URLs
	    if is_internal(host):
	        return None

	    return href


	def format_search_results(results: list[dict[str, str]]) -> str:
	    """Render search results as a numbered plain-text block for model context.

	    Each result contributes a ``"<n>. <title>"`` line, a ``" URL: <url>"``
	    line, an optional snippet line (skipped when the snippet is empty) and a
	    trailing blank line. A fixed message is returned when ``results`` is
	    empty.
	    """
	    if not results:
	        return "No search results found."

	    lines = ["Here are the web search results for reference:\n"]
	    for position, entry in enumerate(results, start=1):
	        block_lines = [
	            f"{position}. {entry['title']}",
	            f" URL: {entry['url']}",
	        ]
	        snippet = entry["snippet"]
	        if snippet:
	            block_lines.append(f" {snippet}")
	        # An empty string becomes the blank separator line after joining.
	        block_lines.append("")
	        lines.extend(block_lines)

	    return "\n".join(lines)