Spaces:

R-Kentaren
/

fullstack-code-builder

Running

File size: 9,394 Bytes

"""Web search via scraping — no API key needed.

Strategy:
1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
2. Fallback: Google search with robust multi-selector parsing
"""

from __future__ import annotations

import logging
import re
import urllib.parse

logger = logging.getLogger(__name__)

# Browser-like request headers, sent to make the scraper look less like a bot.
# The search helpers in this module copy this dict (``{**_BROWSER_HEADERS, ...}``)
# and add a site-specific "Referer" rather than modifying it in place.
_BROWSER_HEADERS = {
    # User-agent string of desktop Chrome 125 on 64-bit Windows 10.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    # Prefer HTML/XHTML responses, accept anything else at lower priority.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    # Ask for English pages; the result parsers below use English-layout selectors.
    "Accept-Language": "en-US,en;q=0.5",
    # NOTE(review): "br" is not advertised — presumably because decoding it
    # needs an optional dependency; confirm before adding it.
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Search the web by scraping. No API key needed.

    Despite the name, DuckDuckGo is tried first (it is more scraper-friendly);
    Google is only queried when DuckDuckGo yields no results.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``title``, ``url`` and ``snippet``. The list
        is empty when the query is blank, when ``num_results`` is not
        positive, or when both back-ends return nothing.
    """
    # A blank query or a non-positive limit cannot produce a useful answer,
    # so skip the (up to three) HTTP requests entirely.
    if not query or not query.strip() or num_results <= 0:
        return []

    # An empty list is falsy, so ``or`` falls through to the Google fallback.
    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)


def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
    """Search DuckDuckGo HTML version — very scraper-friendly."""
    try:
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"

        headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}

        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []

        # DuckDuckGo HTML uses .result blocks
        for result_div in soup.select(".result"):
            title_el = result_div.select_one(".result__title a, .result__a")
            snippet_el = result_div.select_one(".result__snippet")

            if not title_el:
                continue

            title = title_el.get_text(strip=True)
            # DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
            href = title_el.get("href", "")

            real_url = _extract_ddg_url(href)
            if not real_url:
                continue

            # Skip internal URLs
            if any(domain in real_url for domain in ["duckduckgo.com", "duck.co"]):
                continue

            snippet = snippet_el.get_text(strip=True) if snippet_el else ""

            if title and real_url:
                results.append({
                    "title": title,
                    "url": real_url,
                    "snippet": snippet,
                })

            if len(results) >= num_results:
                break

        logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
        return results

    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        logger.warning("DuckDuckGo search failed: %s", exc)
        return []


def _extract_ddg_url(href: str) -> str | None:
    """Extract the real URL from a DuckDuckGo redirect link."""
    if not href:
        return None

    # Direct HTTP URL
    if href.startswith("http"):
        return href

    # DDG redirect: //duckduckgo.com/l/?uddg=<encoded_url>&...
    if "uddg=" in href:
        parsed = urllib.parse.urlparse(href)
        params = urllib.parse.parse_qs(parsed.query)
        uddg = params.get("uddg", [])
        if uddg:
            return urllib.parse.unquote(uddg[0])

    # Sometimes it's a relative redirect
    if href.startswith("//"):
        return "https:" + href

    return None


def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
    """Search Google by scraping the results page. Fallback method."""
    try:
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"

        headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}

        session = requests.Session()
        # First get a cookie from Google
        session.get("https://www.google.com/", headers=headers, timeout=5)

        resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []

        # Strategy 1: Modern Google layout — div.g > div.yuRUbf (title+link) + div.VwiC3b (snippet)
        for g_div in soup.select("div.g"):
            title_el = g_div.select_one("h3")
            link_el = g_div.select_one("a[href]")
            snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")

            if not title_el or not link_el:
                continue

            href = link_el.get("href", "")
            real_url = _extract_google_url(href)
            if not real_url:
                continue

            title = title_el.get_text(strip=True)
            snippet = snippet_el.get_text(strip=True) if snippet_el else ""

            if title and real_url:
                results.append({
                    "title": title,
                    "url": real_url,
                    "snippet": snippet,
                })

            if len(results) >= num_results:
                break

        # Strategy 2: Fallback — look for any <a> containing an <h3>
        if not results:
            for a_tag in soup.find_all("a", href=True):
                h3 = a_tag.find("h3")
                if not h3:
                    continue

                href = a_tag.get("href", "")
                real_url = _extract_google_url(href)
                if not real_url:
                    continue

                title = h3.get_text(strip=True)
                # Try to find a sibling or nearby snippet
                snippet = ""
                parent = a_tag.parent
                if parent:
                    for _ in range(3):
                        parent = parent.parent if parent else None
                    if parent:
                        snippet_el = parent.select_one("div.VwiC3b, span.aCOpRe, span.st")
                        if snippet_el:
                            snippet = snippet_el.get_text(strip=True)

                if title and real_url:
                    results.append({
                        "title": title,
                        "url": real_url,
                        "snippet": snippet,
                    })

                if len(results) >= num_results:
                    break

        # Strategy 3: Last resort — any <a data-ved> with external href
        if not results:
            for a_tag in soup.select("a[data-ved]"):
                href = a_tag.get("href", "")
                if not href.startswith("http"):
                    continue
                if "google.com" in href:
                    continue

                title_el = a_tag.select_one("h3, span")
                title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]

                if title and href:
                    results.append({
                        "title": title,
                        "url": href,
                        "snippet": "",
                    })

                if len(results) >= num_results:
                    break

        logger.info("Google search for '%s' returned %d results", query, len(results))
        return results

    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        logger.warning("Google search failed: %s", exc)
        return []


def _extract_google_url(href: str) -> str | None:
    """Extract the real URL from a Google search result link."""
    if not href:
        return None

    # Google redirect: /url?q=<real_url>&...
    if href.startswith("/url?q="):
        parsed = urllib.parse.urlparse(href)
        params = urllib.parse.parse_qs(parsed.query)
        q = params.get("q", [])
        if q:
            real_url = q[0]
            if real_url.startswith("http"):
                return real_url

    # Direct HTTP URL
    if href.startswith("http"):
        # Skip Google-internal URLs
        if any(domain in href for domain in [
            "google.com", "googleusercontent.com",
            "youtube.com", "gstatic.com",
        ]):
            return None
        return href

    return None


def format_search_results(results: list[dict[str, str]]) -> str:
    """Render search results as a numbered plain-text list for model context.

    Each result contributes its numbered title, an indented ``URL:`` line, an
    indented snippet line (only when the snippet is non-empty) and a blank
    separator line. An empty *results* yields a fixed "no results" message.
    """
    if not results:
        return "No search results found."

    body: list[str] = []
    for i, r in enumerate(results, 1):
        entry = [f"{i}. {r['title']}", f"   URL: {r['url']}"]
        if r["snippet"]:
            entry.append(f"   {r['snippet']}")
        # Blank line between consecutive results.
        entry.append("")
        body.extend(entry)

    return "\n".join(["Here are the web search results for reference:\n", *body])