import logging
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup
|
|
| |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
| logger = logging.getLogger(__name__) |
|
|
class WebScraper:
    """Polite HTTP scraper: honours robots.txt, rate-limits per domain,
    retries transient failures with exponential backoff, and extracts
    readable text from fetched pages.

    Thread-safe: a single instance may be shared across worker threads
    (as ``scrape_multiple_pages`` does) — robots.txt parsers are cached
    per domain and the rate-limit bookkeeping is guarded by a lock.
    """

    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        """
        Args:
            user_agent: User-Agent header sent with every request, also used
                when evaluating robots.txt rules.
            rate_limit: minimum number of seconds between two requests to
                the same domain.
            timeout: per-request timeout in seconds.
            max_retries: number of attempts per URL before giving up.
        """
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        # Retained for backward compatibility with any external caller that
        # touches it; internally the per-domain cache below is used instead,
        # because one shared parser is racy under concurrent scraping.
        self.robot_parser = RobotFileParser()
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}
        self._robot_parsers = {}       # domain -> RobotFileParser (read once, cached)
        self._lock = threading.Lock()  # guards last_request_time and _robot_parsers

    def can_fetch(self, url):
        """Return True if robots.txt permits fetching ``url``.

        The parser for each domain is downloaded once and cached. On any
        error reading robots.txt we err on the side of allowing the fetch
        (and do not cache, so a later call can retry the download).
        """
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        with self._lock:
            parser = self._robot_parsers.get(domain)
        if parser is None:
            parser = RobotFileParser()
            parser.set_url(f"{parsed_url.scheme}://{domain}/robots.txt")
            try:
                parser.read()
            except Exception as e:
                logger.warning(f"Error reading robots.txt for {url}: {e}")
                return True  # permissive on failure, matching prior behavior
            with self._lock:
                # Another thread may have raced us; keep whichever landed first.
                parser = self._robot_parsers.setdefault(domain, parser)
        return parser.can_fetch(self.session.headers["User-Agent"], url)

    def respect_rate_limit(self, url):
        """Sleep so requests to one domain are >= ``rate_limit`` seconds apart.

        The next request slot for the domain is reserved while holding the
        lock, then the sleep happens outside it so requests to other
        domains are never blocked behind this one.
        """
        domain = urlparse(url).netloc
        with self._lock:
            now = time.time()
            last = self.last_request_time.get(domain)
            wait = 0.0 if last is None else max(0.0, self.rate_limit - (now - last))
            # Reserve our slot before sleeping so concurrent callers queue
            # up instead of all firing at once.
            self.last_request_time[domain] = now + wait
        if wait > 0:
            time.sleep(wait)

    def scrape_page(self, url):
        """Fetch ``url`` and return its extracted content, or None on failure.

        Returns:
            dict with keys 'url', 'title', 'content' (truncated text) and
            'links', or None when robots.txt disallows the URL or every
            retry attempt fails.
        """
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    def extract_content(self, html, url):
        """Parse ``html`` into a summary dict: url, title, text and links.

        Boilerplate elements are stripped, paragraph text from a semantic
        main container is preferred (falling back to the whole page),
        whitespace is collapsed, and both content and links are truncated
        to keep the payload small.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-content elements before extracting text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # soup.title.string is None for an empty or composite <title>;
        # coerce to "" so the result is always a plain string.
        title = (soup.title.string or "") if soup.title else ""

        # Prefer a semantic main-content container when one exists.
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

        paragraphs = (main_content or soup).find_all('p')

        text = ' '.join(p.get_text().strip() for p in paragraphs)

        # Fall back to the whole page when there are no <p> elements.
        if not text:
            text = soup.get_text()

        # Collapse all runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()

        # Resolve relative hrefs against the page URL.
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        return {
            "url": url,
            "title": title,
            "content": text[:2400],
            "links": links[:10]
        }
|
|
def scrape_multiple_pages(urls, max_workers=5):
    """Scrape several URLs concurrently with one shared WebScraper.

    Args:
        urls: iterable of page URLs to fetch.
        max_workers: size of the thread pool.

    Returns:
        dict mapping each successfully scraped URL to its result dict;
        URLs that fail or are disallowed are omitted.
    """
    scraper = WebScraper()
    collected = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(scraper.scrape_page, target): target for target in urls}
        for done in as_completed(pending):
            target = pending[done]
            try:
                page = done.result()
            except Exception as exc:
                logger.error(f"{target} generated an exception: {exc}")
                continue
            if page:
                collected[target] = page
                logger.info(f"Successfully scraped: {target}")
            else:
                logger.warning(f"Failed to scrape: {target}")

    return collected
|
|
| |
def get_web_content(urls):
    """Scrape ``urls`` and return {url: page text} for the ones that succeeded."""
    pages = scrape_multiple_pages(urls)
    contents = {}
    for link, page in pages.items():
        if page:
            contents[link] = page['content']
    return contents
|
|
| |
def can_fetch(url):
    """Check robots.txt for ``url`` using the wildcard ('*') user agent.

    Permissive on failure: if robots.txt cannot be read, the URL is
    treated as fetchable and a warning is logged.
    """
    parts = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    try:
        parser.read()
    except Exception as e:
        logger.warning(f"Error reading robots.txt for {url}: {e}")
        return True
    return parser.can_fetch("*", url)
|
|
if __name__ == "__main__":
    # Smoke test: scrape a few well-known pages and print a preview of each.
    sample_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    for page_url, page_text in get_web_content(sample_urls).items():
        print(f"Content from {page_url}:")
        print(page_text[:500])
        print("\n---\n")
|
|