"""WebFetchTool - Web content fetching and parsing for Stack 2.9""" import re from datetime import datetime from typing import Any, Dict, List, Optional from urllib.parse import urlparse try: import httpx HAS_HTTPX = True except ImportError: HAS_HTTPX = False from .base import BaseTool, ToolResult from .registry import tool_registry def _extract_readable_content(html: str) -> str: """Extract readable text from HTML.""" # Remove scripts and styles text = re.sub(r']*>.*?', '', html, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) # Remove HTML tags text = re.sub(r'<[^>]+>', ' ', text) # Clean whitespace text = re.sub(r'\s+', ' ', text).strip() return text class WebFetchTool(BaseTool): """Fetch and extract readable content from URLs.""" name = "web_fetch" description = "Fetch web page content and extract readable text" input_schema = { "type": "object", "properties": { "url": { "type": "string", "description": "URL to fetch" }, "max_chars": { "type": "number", "default": 10000, "description": "Maximum characters to return" }, "extract_links": { "type": "boolean", "default": False, "description": "Extract links from the page" } }, "required": ["url"] } async def execute(self, url: str, max_chars: int = 10000, extract_links: bool = False) -> ToolResult: """Fetch URL content.""" if not HAS_HTTPX: return ToolResult(success=False, error="httpx library not installed") parsed = urlparse(url) if not parsed.scheme: return ToolResult(success=False, error="Invalid URL - missing scheme") try: response = httpx.get(url, timeout=15.0, follow_redirects=True) response.raise_for_status() content_type = response.headers.get("content-type", "") if "text/html" not in content_type and "text/plain" not in content_type: return ToolResult(success=True, data={ "url": url, "content": response.text[:max_chars], "content_type": content_type, "status_code": response.status_code }) text = _extract_readable_content(response.text) text = text[:max_chars] result = { "url": url, "content": text, "content_type": content_type, "status_code": response.status_code, "fetched_at": datetime.now().isoformat() } if extract_links: links = re.findall(r'href=["\']([^"\']+)["\']', response.text) result["links"] = links[:50] return ToolResult(success=True, data=result) except httpx.TimeoutException: return ToolResult(success=False, error=f"Timeout fetching {url}") except httpx.HTTPError as e: return ToolResult(success=False, error=f"HTTP error: {e}") except Exception as e: return ToolResult(success=False, error=str(e)) class WebFetchMetaTool(BaseTool): """Get metadata from a URL without full content.""" name = "web_fetch_meta" description = "Get metadata (title, description, images) from a URL" input_schema = { "type": "object", "properties": { "url": { "type": "string", "description": "URL to analyze" } }, "required": ["url"] } async def execute(self, url: str) -> ToolResult: """Get URL metadata.""" if not HAS_HTTPX: return ToolResult(success=False, error="httpx library not installed") try: response = httpx.get(url, timeout=10.0, follow_redirects=True) response.raise_for_status() title = re.search(r']*>([^<]+)', response.text, re.IGNORECASE) description = re.search(r']+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']', response.text, re.IGNORECASE) og_image = re.search(r']+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']', response.text, re.IGNORECASE) return ToolResult(success=True, data={ "url": url, "title": title.group(1).strip() if title else None, "description": description.group(1).strip() if description else None, "og_image": og_image.group(1).strip() if og_image else None, "status_code": response.status_code }) except Exception as e: return ToolResult(success=False, error=str(e)) # Register tools tool_registry.register(WebFetchTool()) tool_registry.register(WebFetchMetaTool())