"""Web scraping tools built on the Firecrawl v0 REST API."""

import asyncio
from typing import List, Optional

import aiohttp

from .utils import ToolResult
from ..config import config

FIRECRAWL_BASE_URL = "https://api.firecrawl.dev/v0"


async def scrape_url(
    url: str,
    extract_main_content: bool = True,
    include_html: bool = False
) -> ToolResult:
    """
    Scrape a URL using the Firecrawl API.

    Args:
        url: URL to scrape
        extract_main_content: Whether to extract the main content only
        include_html: Whether to include raw HTML in the response

    Returns:
        ToolResult with the scraped content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )
    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "url": url,
            "extractorOptions": {
                "mode": "markdown" if extract_main_content else "html"
            },
            "pageOptions": {
                "includeHtml": include_html,
                "onlyMainContent": extract_main_content
            }
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{FIRECRAWL_BASE_URL}/scrape",
                                    json=payload, headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    # The scraped payload lives under the top-level 'data' key.
                    result = data.get('data', {})
                    content = result.get('content', '')
                    html = result.get('html', '') if include_html else None
                    metadata = result.get('metadata', {})
                    return ToolResult(
                        success=True,
                        data={
                            'url': url,
                            'content': content,
                            'html': html,
                            'title': metadata.get('title', ''),
                            'description': metadata.get('description', ''),
                            'keywords': metadata.get('keywords', []),
                            'content_length': len(content)
                        }
                    )
                else:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Web scraping failed: {str(e)}"
        )
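

# Minimal usage sketch for scrape_url. This assumes the module is imported as
# part of its package (note the relative imports above) and that ToolResult
# exposes `success`, `data`, and `error` attributes as defined in .utils;
# the module name below is hypothetical:
#
#     import asyncio
#     from .web_tools import scrape_url  # hypothetical module name
#
#     async def demo():
#         result = await scrape_url("https://example.com", include_html=False)
#         if result.success:
#             print(result.data['title'], result.data['content_length'])
#         else:
#             print("Scrape failed:", result.error)
#
#     asyncio.run(demo())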


async def crawl_website(
    url: str,
    max_pages: int = 5,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> ToolResult:
    """
    Crawl a website using the Firecrawl API.

    Args:
        url: Base URL to crawl
        max_pages: Maximum number of pages to crawl
        include_paths: List of URL paths to include
        exclude_paths: List of URL paths to exclude

    Returns:
        ToolResult with the crawled content
    """
    firecrawl_key = config.get_api_key('firecrawl')
    if not firecrawl_key:
        return ToolResult(
            success=False,
            error="FIRECRAWL_API_KEY not found in environment variables"
        )
    try:
        headers = {
            "Authorization": f"Bearer {firecrawl_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "url": url,
            "crawlerOptions": {
                "limit": max_pages,
                "includePaths": include_paths or [],
                "excludePaths": exclude_paths or []
            },
            "pageOptions": {
                "onlyMainContent": True
            }
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{FIRECRAWL_BASE_URL}/crawl",
                                    json=payload, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    return ToolResult(
                        success=False,
                        error=f"HTTP {response.status}: {error_text}"
                    )
                data = await response.json()

            # The v0 crawl endpoint is asynchronous: rather than page data, a
            # successful POST normally returns a job id that must be polled via
            # the /crawl/status/{jobId} endpoint until the job completes. The
            # poll interval and attempt cap below are arbitrary choices.
            pages = data.get('data') or []
            job_id = data.get('jobId')
            if not pages and job_id:
                for _ in range(60):
                    await asyncio.sleep(5)
                    async with session.get(
                        f"{FIRECRAWL_BASE_URL}/crawl/status/{job_id}",
                        headers=headers
                    ) as status_response:
                        status_data = await status_response.json()
                    status = status_data.get('status')
                    if status == 'completed':
                        pages = status_data.get('data') or []
                        break
                    if status == 'failed':
                        return ToolResult(success=False, error="Crawl job failed")
                else:
                    return ToolResult(success=False, error="Crawl job timed out after 5 minutes")

        formatted_pages = []
        for page in pages:
            metadata = page.get('metadata', {})
            formatted_pages.append({
                # v0 crawl results carry the page URL in metadata.sourceURL.
                'url': page.get('url') or metadata.get('sourceURL', ''),
                'content': page.get('content', ''),
                'title': metadata.get('title', ''),
                'description': metadata.get('description', '')
            })
        return ToolResult(
            success=True,
            data={
                'base_url': url,
                'pages_crawled': len(formatted_pages),
                'pages': formatted_pages,
                'total_content_length': sum(len(p['content']) for p in formatted_pages)
            }
        )
    except Exception as e:
        return ToolResult(
            success=False,
            error=f"Website crawling failed: {str(e)}"
        )
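

# Minimal usage sketch for crawl_website, under the same assumptions as the
# scrape_url example above (hypothetical module name, ToolResult shape from
# .utils):
#
#     import asyncio
#     from .web_tools import crawl_website  # hypothetical module name
#
#     async def demo():
#         result = await crawl_website("https://example.com", max_pages=3,
#                                      exclude_paths=["/blog/*"])
#         if result.success:
#             for page in result.data['pages']:
#                 print(page['url'], '-', page['title'])
#         else:
#             print("Crawl failed:", result.error)
#
#     asyncio.run(demo())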