from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def fetch_html(url, timeout=15):
    """Fetch *url* with ``requests`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The response text.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` for 4xx/5xx
            responses. Other ``requests`` exceptions propagate unchanged.
    """
    resp = requests.get(
        url,
        timeout=timeout,
        headers={"User-Agent": "geo-pipeline/1.0"},
    )
    resp.raise_for_status()
    return resp.text


def fetch_html_playwright(url, timeout=30):
    """Fetch page HTML using Playwright (optional dependency).

    Args:
        url: Address of the page to load.
        timeout: Navigation timeout in seconds; converted to milliseconds
            for ``page.goto``.

    Returns:
        The page content string as reported by ``page.content()``.

    Raises:
        RuntimeError: If importing Playwright fails.
        Exception: Any error raised by Playwright while launching the
            browser or loading the page propagates to the caller.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception as e:
        raise RuntimeError('playwright not installed') from e

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, timeout=timeout * 1000)
            return page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            browser.close()


def extract_page(url, html):
    """Parse *html* and collect its title, headings, paragraphs and links.

    Args:
        url: Address the HTML was fetched from; used to resolve relative
            hrefs and to decide which links are on the same domain.
        html: Markup to parse.

    Returns:
        A dict with the keys ``'url'``, ``'title'``, ``'headings'`` (list of
        ``{'tag', 'text'}`` dicts for h1-h6), ``'paragraphs'`` (non-empty
        ``<p>`` texts) and ``'links'`` (sorted, de-duplicated absolute URLs
        whose netloc equals that of *url*).
    """
    soup = BeautifulSoup(html, "html.parser")

    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    headings = [
        {'tag': h.name, 'text': h.get_text(strip=True)}
        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    ]

    paragraphs = []
    for p in soup.find_all('p'):
        text = p.get_text(strip=True)
        if text:
            paragraphs.append(text)

    # collect internal links (same domain)
    domain = urlparse(url).netloc
    links = set()
    for a in soup.find_all('a', href=True):
        href = urljoin(url, a['href'])
        if urlparse(href).netloc == domain:
            links.add(href)

    return {
        'url': url,
        'title': title,
        'headings': headings,
        'paragraphs': paragraphs,
        # Sorted so the output (and the crawl order derived from it) is
        # reproducible instead of depending on set iteration order.
        'links': sorted(links),
    }


def crawl_seed(seed_url, max_pages=5):
    """Breadth-first crawl of same-domain pages starting at *seed_url*.

    Each page is fetched with ``fetch_html``; when the parsed page has no
    paragraphs, a Playwright render is attempted and used if it yields
    paragraphs. Pages that fail to fetch or parse are reported on stdout and
    skipped. Every URL is attempted at most once, and URLs that differ only
    by their ``#fragment`` are treated as the same page.

    Args:
        seed_url: First URL to visit.
        max_pages: Maximum number of successfully extracted pages to return.

    Returns:
        A list of page dicts as produced by ``extract_page``, in visit order.
    """
    queue = deque([seed_url])
    # Fragment-less form of every URL that has ever been queued, so that a
    # URL is never fetched twice -- including URLs whose fetch failed.
    seen = {urldefrag(seed_url).url}
    pages = []

    while queue and len(pages) < max_pages:
        current = queue.popleft()
        try:
            page = extract_page(current, fetch_html(current))
            # if page looks empty (no paragraphs) try Playwright rendering
            if not page.get('paragraphs'):
                try:
                    rendered = extract_page(
                        current, fetch_html_playwright(current)
                    )
                except Exception:
                    # Deliberately best-effort: Playwright is optional, so
                    # any failure here just keeps the statically fetched page.
                    rendered = None
                if rendered and rendered.get('paragraphs'):
                    page = rendered
        except Exception as e:
            # skip pages that fail
            print(f"[crawl] failed {current}: {e}")
            continue

        pages.append(page)
        for link in page['links']:
            target = urldefrag(link).url
            if target not in seen:
                seen.add(target)
                queue.append(target)

    return pages