"""WebFetchTool - Web content fetching and parsing for Stack 2.9"""
import re
from datetime import datetime
from html import unescape
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

try:
    import httpx
    HAS_HTTPX = True
except ImportError:
    HAS_HTTPX = False

from .base import BaseTool, ToolResult
from .registry import tool_registry
def _extract_readable_content(html: str) -> str:
"""Extract readable text from HTML."""
# Remove scripts and styles
text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
# Remove HTML tags
text = re.sub(r'<[^>]+>', ' ', text)
# Clean whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
class WebFetchTool(BaseTool):
    """Fetch a URL and extract readable text content from it.

    HTML and plain-text responses are cleaned via
    ``_extract_readable_content``; other content types are returned
    raw (truncated) with their content type so callers can decide
    how to handle them.
    """

    name = "web_fetch"
    description = "Fetch web page content and extract readable text"
    input_schema = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL to fetch"
            },
            "max_chars": {
                "type": "number",
                "default": 10000,
                "description": "Maximum characters to return"
            },
            "extract_links": {
                "type": "boolean",
                "default": False,
                "description": "Extract links from the page"
            }
        },
        "required": ["url"]
    }

    async def execute(self, url: str, max_chars: int = 10000, extract_links: bool = False) -> ToolResult:
        """Fetch ``url`` and return readable text (plus optional links).

        Args:
            url: Absolute URL to fetch (scheme required).
            max_chars: Maximum characters of content to return.
            extract_links: When True, include up to 50 href values.

        Returns:
            ToolResult with content/content_type/status_code on success,
            or success=False with an error message on any failure.
        """
        if not HAS_HTTPX:
            return ToolResult(success=False, error="httpx library not installed")
        parsed = urlparse(url)
        if not parsed.scheme:
            return ToolResult(success=False, error="Invalid URL - missing scheme")
        try:
            # BUGFIX: the original called the synchronous httpx.get() inside
            # an async def, blocking the event loop for up to the full
            # timeout. Use the async client and await the request instead.
            async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
                response = await client.get(url)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")
            # Non-text payloads: return raw (truncated) without HTML cleanup.
            if "text/html" not in content_type and "text/plain" not in content_type:
                return ToolResult(success=True, data={
                    "url": url,
                    "content": response.text[:max_chars],
                    "content_type": content_type,
                    "status_code": response.status_code
                })
            text = _extract_readable_content(response.text)[:max_chars]
            result = {
                "url": url,
                "content": text,
                "content_type": content_type,
                "status_code": response.status_code,
                "fetched_at": datetime.now().isoformat()
            }
            if extract_links:
                # Capped at 50 links to bound the payload size.
                links = re.findall(r'href=["\']([^"\']+)["\']', response.text)
                result["links"] = links[:50]
            return ToolResult(success=True, data=result)
        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Timeout fetching {url}")
        except httpx.HTTPError as e:
            return ToolResult(success=False, error=f"HTTP error: {e}")
        except Exception as e:
            return ToolResult(success=False, error=str(e))
class WebFetchMetaTool(BaseTool):
    """Get page metadata (title, description, og:image) without full content."""

    name = "web_fetch_meta"
    description = "Get metadata (title, description, images) from a URL"
    input_schema = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL to analyze"
            }
        },
        "required": ["url"]
    }

    async def execute(self, url: str) -> ToolResult:
        """Fetch ``url`` and scrape title/description/og:image via regex.

        Args:
            url: URL to analyze.

        Returns:
            ToolResult with title/description/og_image (each None when
            absent) and status_code, or success=False with an error.
        """
        if not HAS_HTTPX:
            return ToolResult(success=False, error="httpx library not installed")
        try:
            # BUGFIX: same as WebFetchTool — the original used the blocking
            # httpx.get() inside an async def; use the async client.
            async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
                response = await client.get(url)
            response.raise_for_status()
            # NOTE(review): these patterns assume attribute order
            # (name/property before content) — common but not universal.
            title = re.search(r'<title[^>]*>([^<]+)</title>', response.text, re.IGNORECASE)
            description = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']', response.text, re.IGNORECASE)
            og_image = re.search(r'<meta[^>]+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']', response.text, re.IGNORECASE)
            return ToolResult(success=True, data={
                "url": url,
                "title": title.group(1).strip() if title else None,
                "description": description.group(1).strip() if description else None,
                "og_image": og_image.group(1).strip() if og_image else None,
                "status_code": response.status_code
            })
        # Consistent with WebFetchTool: report timeouts and HTTP errors
        # distinctly instead of collapsing everything into str(e).
        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Timeout fetching {url}")
        except httpx.HTTPError as e:
            return ToolResult(success=False, error=f"HTTP error: {e}")
        except Exception as e:
            return ToolResult(success=False, error=str(e))
# Register tools with the shared registry at import time so the module
# only needs to be imported for web_fetch / web_fetch_meta to be available.
tool_registry.register(WebFetchTool())
tool_registry.register(WebFetchMetaTool())
|