File size: 5,082 Bytes
e6853d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""WebFetchTool - Web content fetching and parsing for Stack 2.9"""

import re
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

try:
    import httpx
    HAS_HTTPX = True
except ImportError:
    HAS_HTTPX = False

from .base import BaseTool, ToolResult
from .registry import tool_registry


def _extract_readable_content(html: str) -> str:
    """Extract readable text from HTML."""
    # Remove scripts and styles
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Clean whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


class WebFetchTool(BaseTool):
    """Fetch a URL and extract readable text content from it.

    Follows redirects with a 15-second timeout. HTML/plain-text responses
    are stripped to readable text; other content types are returned as-is
    (truncated). Never raises: all failures are reported via a
    ``ToolResult`` with ``success=False``.
    """

    name = "web_fetch"
    description = "Fetch web page content and extract readable text"

    input_schema = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL to fetch"
            },
            "max_chars": {
                "type": "number",
                "default": 10000,
                "description": "Maximum characters to return"
            },
            "extract_links": {
                "type": "boolean",
                "default": False,
                "description": "Extract links from the page"
            }
        },
        "required": ["url"]
    }

    async def execute(self, url: str, max_chars: int = 10000, extract_links: bool = False) -> ToolResult:
        """Fetch *url* and return its readable content.

        Args:
            url: Absolute URL (must include a scheme).
            max_chars: Truncation limit applied to the returned content.
            extract_links: If True, include up to 50 href values found
                in the raw HTML under the ``links`` key.

        Returns:
            ToolResult with ``data`` containing url, content, content_type,
            status_code (and fetched_at / links for HTML), or an error.
        """
        if not HAS_HTTPX:
            return ToolResult(success=False, error="httpx library not installed")

        parsed = urlparse(url)
        if not parsed.scheme:
            return ToolResult(success=False, error="Invalid URL - missing scheme")

        try:
            # BUGFIX: the original called the synchronous httpx.get() inside
            # an async method, blocking the event loop for up to 15 s per
            # fetch. Use the async client so other tasks keep running.
            async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
                response = await client.get(url)
            response.raise_for_status()

            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "text/plain" not in content_type:
                # Non-text payloads (JSON, XML, etc.): return raw text,
                # truncated, without HTML stripping.
                return ToolResult(success=True, data={
                    "url": url,
                    "content": response.text[:max_chars],
                    "content_type": content_type,
                    "status_code": response.status_code
                })

            text = _extract_readable_content(response.text)
            text = text[:max_chars]

            result = {
                "url": url,
                "content": text,
                "content_type": content_type,
                "status_code": response.status_code,
                "fetched_at": datetime.now().isoformat()
            }

            if extract_links:
                # Best-effort href harvesting from the raw (unstripped) HTML,
                # capped at 50 to bound result size.
                links = re.findall(r'href=["\']([^"\']+)["\']', response.text)
                result["links"] = links[:50]

            return ToolResult(success=True, data=result)

        except httpx.TimeoutException:
            return ToolResult(success=False, error=f"Timeout fetching {url}")
        except httpx.HTTPError as e:
            return ToolResult(success=False, error=f"HTTP error: {e}")
        except Exception as e:
            # Last-resort guard: tool calls must never propagate exceptions.
            return ToolResult(success=False, error=str(e))


class WebFetchMetaTool(BaseTool):
    """Get page metadata (title, meta description, og:image) from a URL.

    Fetches the page with a 10-second timeout and extracts metadata via
    regex — best-effort only: the description/og:image patterns assume the
    name/property attribute appears before content. Never raises; failures
    are reported via ``ToolResult`` with ``success=False``.
    """

    name = "web_fetch_meta"
    description = "Get metadata (title, description, images) from a URL"

    input_schema = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL to analyze"
            }
        },
        "required": ["url"]
    }

    async def execute(self, url: str) -> ToolResult:
        """Fetch *url* and return its title/description/og:image metadata.

        Args:
            url: URL to analyze.

        Returns:
            ToolResult with title, description, og_image (each possibly
            None when not found) and status_code, or an error.
        """
        if not HAS_HTTPX:
            return ToolResult(success=False, error="httpx library not installed")

        try:
            # BUGFIX: the original used the blocking httpx.get() inside an
            # async method, stalling the event loop. Use the async client.
            async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
                response = await client.get(url)
            response.raise_for_status()

            # Regex extraction is a heuristic; patterns with attributes in a
            # different order (content before name/property) will miss.
            title = re.search(r'<title[^>]*>([^<]+)</title>', response.text, re.IGNORECASE)
            description = re.search(r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']', response.text, re.IGNORECASE)
            og_image = re.search(r'<meta[^>]+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']', response.text, re.IGNORECASE)

            return ToolResult(success=True, data={
                "url": url,
                "title": title.group(1).strip() if title else None,
                "description": description.group(1).strip() if description else None,
                "og_image": og_image.group(1).strip() if og_image else None,
                "status_code": response.status_code
            })

        except Exception as e:
            # Single broad guard: tool calls must never propagate exceptions.
            return ToolResult(success=False, error=str(e))


# Register tools
# Module-import side effect: instantiating both tools and adding them to the
# shared registry makes them discoverable by whatever consumes tool_registry.
tool_registry.register(WebFetchTool())
tool_registry.register(WebFetchMetaTool())