"""Web scraping tools built on the Firecrawl v0 REST API."""

import asyncio
from typing import List, Optional

import aiohttp

from .utils import ToolResult
from ..config import config

FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v0"


async def scrape_url(
    url: str,
    extract_main_content: bool = True,
    include_html: bool = False
) -> ToolResult:
    """
    Scrape a URL using the Firecrawl API.

    Args:
        url: URL to scrape
        extract_main_content: Whether to extract the main content only
        include_html: Whether to include raw HTML in the response

    Returns:
        ToolResult with the scraped content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )
    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "url": url,
            "extractorOptions": {
                "mode": "markdown" if extract_main_content else "html"
            },
            "pageOptions": {
                "includeHtml": include_html,
                "onlyMainContent": extract_main_content
            }
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{FIRECRAWL_BASE_URL}/scrape",
                                    json=payload, headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    # The scraped payload lives under the top-level 'data' key.
                    result = data.get('data', {})
                    content = result.get('content', '')
                    html = result.get('html', '') if include_html else None
                    metadata = result.get('metadata', {})
                    return ToolResult(
                        success=True,
                        data={
                            'url': url,
                            'content': content,
                            'html': html,
                            'title': metadata.get('title', ''),
                            'description': metadata.get('description', ''),
                            'keywords': metadata.get('keywords', []),
                            'content_length': len(content)
                        }
                    )
                else:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Web scraping failed: {str(e)}"
        )
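

# Minimal usage sketch for scrape_url. This assumes the module is imported as
# part of its package (note the relative imports above) and that ToolResult
# exposes `success`, `data`, and `error` attributes as defined in .utils;
# the module name below is hypothetical:
#
#     import asyncio
#     from .web_tools import scrape_url  # hypothetical module name
#
#     async def demo():
#         result = await scrape_url("https://example.com", include_html=False)
#         if result.success:
#             print(result.data['title'], result.data['content_length'])
#         else:
#             print("Scrape failed:", result.error)
#
#     asyncio.run(demo())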


async def crawl_website(
    url: str,
    max_pages: int = 5,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> ToolResult:
    """
    Crawl a website using the Firecrawl API.

    Args:
        url: Base URL to crawl
        max_pages: Maximum number of pages to crawl
        include_paths: List of URL paths to include
        exclude_paths: List of URL paths to exclude

    Returns:
        ToolResult with the crawled content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )
    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "url": url,
            "crawlerOptions": {
                "limit": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or []
            },
            "pageOptions": {
                "onlyMainContent": True
            }
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{FIRECRAWL_BASE_URL}/crawl",
                                    json=payload, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
                data = await response.json()

            # The v0 crawl endpoint is asynchronous: rather than page data, a
            # successful POST normally returns a job id that must be polled via
            # the /crawl/status/{jobId} endpoint until the job completes. The
            # poll interval and attempt cap below are arbitrary choices.
            pages = data.get('data') or []
            job_id = data.get('jobId')
            if not pages and job_id:
                for _ in range(60):
                    await asyncio.sleep(5)
                    async with session.get(
                        f"{FIRECRAWL_BASE_URL}/crawl/status/{job_id}",
                        headers=headers
                    ) as status_response:
                        status_data = await status_response.json()
                    status = status_data.get('status')
                    if status == 'completed':
                        pages = status_data.get('data') or []
                        break
                    if status == 'failed':
                        return ToolResult(success=False, error="Crawl job failed")
                else:
                    return ToolResult(success=False, error="Crawl job timed out after 5 minutes")

        formatted_pages = []
        for page in pages:
            metadata = page.get('metadata', {})
            formatted_pages.append({
                # v0 crawl results carry the page URL in metadata.sourceURL.
                'url': page.get('url') or metadata.get('sourceURL', ''),
                'content': page.get('content', ''),
                'title': metadata.get('title', ''),
                'description': metadata.get('description', '')
            })
        return ToolResult(
            success=True,
            data={
                'base_url': url,
                'pages_crawled': len(formatted_pages),
                'pages': formatted_pages,
                'total_content_length': sum(len(p['content']) for p in formatted_pages)
            }
        )
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Website crawling failed: {str(e)}"
        )
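

# Minimal usage sketch for crawl_website, under the same assumptions as the
# scrape_url example above (hypothetical module name, ToolResult shape from
# .utils):
#
#     import asyncio
#     from .web_tools import crawl_website  # hypothetical module name
#
#     async def demo():
#         result = await crawl_website("https://example.com", max_pages=3,
#                                      exclude_paths=["/blog/*"])
#         if result.success:
#             for page in result.data['pages']:
#                 print(page['url'], '-', page['title'])
#         else:
#             print("Crawl failed:", result.error)
#
#     asyncio.run(demo())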