Spaces:
Running
Running
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
def fetch_html(url, timeout=15):
    """Download the raw HTML of *url* with a plain GET request.

    Args:
        url: Page URL to download.
        timeout: Request timeout in seconds.

    Returns:
        Response body decoded as text.

    Raises:
        requests.HTTPError: On a non-2xx status code.
    """
    headers = {"User-Agent": "geo-pipeline/1.0"}
    response = requests.get(url, timeout=timeout, headers=headers)
    response.raise_for_status()
    return response.text
def fetch_html_playwright(url, timeout=30):
    """Fetch fully rendered page HTML using Playwright (optional dependency).

    Args:
        url: Page URL to load in a headless Chromium instance.
        timeout: Navigation timeout in seconds (Playwright expects milliseconds).

    Returns:
        The rendered page HTML as a string.

    Raises:
        RuntimeError: If Playwright is not installed.
        Exception: Any Playwright launch/navigation error is propagated.
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception as e:
        raise RuntimeError('playwright not installed') from e
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Playwright timeouts are in milliseconds.
            page.goto(url, timeout=timeout * 1000)
            return page.content()
        finally:
            # Fix: the original never closed the browser if goto()/content()
            # raised, leaking a headless Chromium process per failed fetch.
            browser.close()
def extract_page(url, html):
    """Parse *html* into a structured record of the page's text content.

    Args:
        url: The page's URL; used to resolve relative hrefs and to decide
            which links are internal (same netloc).
        html: Raw HTML string.

    Returns:
        dict with keys:
            'url': the input URL,
            'title': <title> text (stripped) or "",
            'headings': list of {'tag', 'text'} for h1-h6,
            'paragraphs': non-empty <p> texts,
            'links': sorted same-domain absolute URLs, fragments stripped.
    """
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    headings = [
        {'tag': h.name, 'text': h.get_text(strip=True)}
        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    ]
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
    # Collect internal links (same domain). Fix: drop URL fragments so that
    # "page#a" and "page#b" do not count as two distinct crawl targets —
    # previously the crawler could re-fetch the same page once per anchor.
    domain = urlparse(url).netloc
    links = set()
    for a in soup.find_all('a', href=True):
        href = urljoin(url, a['href']).split('#', 1)[0]
        if href and urlparse(href).netloc == domain:
            links.add(href)
    return {
        'url': url,
        'title': title,
        'headings': headings,
        'paragraphs': paragraphs,
        # sorted() makes the link order deterministic (a set has none).
        'links': sorted(links),
    }
def crawl_seed(seed_url, max_pages=5):
    """Breadth-first same-domain crawl starting from *seed_url*.

    Fetches pages with fetch_html(); when a page yields no paragraphs
    (likely JS-rendered), retries once with fetch_html_playwright() as a
    best-effort fallback. Pages that fail to fetch are logged and skipped.

    Args:
        seed_url: Starting URL.
        max_pages: Maximum number of successfully extracted pages.

    Returns:
        List of page dicts as produced by extract_page(), at most max_pages.
    """
    seen = set()
    # deque: popleft() is O(1), unlike list.pop(0) which shifts every element.
    to_visit = deque([seed_url])
    pages = []
    while to_visit and len(pages) < max_pages:
        url = to_visit.popleft()
        if url in seen:
            continue
        # Fix: mark the URL as seen BEFORE fetching. The original only added
        # it on success, so a failing URL re-queued via later pages' links
        # would be fetched (and fail) again and again.
        seen.add(url)
        try:
            page = extract_page(url, fetch_html(url))
            # Empty paragraph list suggests a JS-rendered page; try Playwright.
            if not page.get('paragraphs'):
                try:
                    rendered = extract_page(url, fetch_html_playwright(url))
                    if rendered.get('paragraphs'):
                        page = rendered
                except Exception:
                    # Fallback is best-effort; keep the static-HTML parse.
                    pass
            pages.append(page)
            for link in page['links']:
                if link not in seen and link not in to_visit:
                    to_visit.append(link)
        except Exception as e:
            # Best-effort crawl: skip pages that fail, keep going.
            print(f"[crawl] failed {url}: {e}")
    return pages