File size: 4,467 Bytes
a9dc537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
GPU Tools for SPARKNET
Tools for GPU monitoring and management
"""

from typing import Optional
from loguru import logger
from .base_tool import BaseTool, ToolResult
from ..utils.gpu_manager import get_gpu_manager


class GPUMonitorTool(BaseTool):
    """Report GPU status for a single device or for every device.

    The data comes from the manager object returned by ``get_gpu_manager()``;
    this class only turns those dictionaries into readable text.
    """

    def __init__(self):
        super().__init__(
            name="gpu_monitor",
            description="Monitor GPU status, memory usage, and utilization",
        )
        self.add_parameter(
            "gpu_id",
            "int",
            "Specific GPU ID to monitor (optional)",
            required=False,
            default=None,
        )
        self.gpu_manager = get_gpu_manager()

    async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult:
        """
        Collect GPU status and return it as formatted text.

        Args:
            gpu_id: ID of the GPU to inspect; ``None`` reports on all GPUs.
            **kwargs: Accepted but not used by this tool.

        Returns:
            ToolResult whose ``output`` is the formatted report and whose
            ``metadata`` carries the raw info (the dict itself for a single
            GPU, ``{"gpus": [...]}`` for all GPUs). A failed result is
            returned when the manager reports an error for the requested GPU
            or when any exception is raised.
        """
        try:
            if gpu_id is None:
                # All-GPU report: entries carrying an "error" key are left out
                # of the text but remain in the metadata.
                reports = self.gpu_manager.get_all_gpu_info()
                sections = [
                    self._format_gpu_info(entry)
                    for entry in reports
                    if "error" not in entry
                ]
                return ToolResult(
                    success=True,
                    output="\n\n".join(sections),
                    metadata={"gpus": reports},
                )

            # Single-GPU report.
            details = self.gpu_manager.get_gpu_info(gpu_id)
            if "error" in details:
                return ToolResult(
                    success=False,
                    output=None,
                    error=details["error"],
                )

            return ToolResult(
                success=True,
                output=self._format_gpu_info(details),
                metadata=details,
            )

        except Exception as exc:
            logger.error(f"GPU monitoring error: {exc}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Monitoring error: {exc!s}",
            )

    def _format_gpu_info(self, info: dict) -> str:
        """Render one GPU info dict as a multi-line summary.

        Memory figures are divided by 1024**3 and labelled "GB".
        NOTE(review): this presumes the manager reports memory in bytes;
        confirm against the GPU manager.
        """
        gib = 1024**3
        # Lines are built in display order so keys are read in the same
        # sequence as they appear in the report.
        title = f"GPU {info['gpu_id']}: {info['name']}"
        memory = (
            f"  Memory: {info['memory_used'] / gib:.2f} GB / {info['memory_total'] / gib:.2f} GB "
            f"({info['memory_percent']:.1f}% used)"
        )
        free = f"  Free Memory: {info['memory_free'] / gib:.2f} GB"
        utilization = f"  GPU Utilization: {info['gpu_utilization']}%"
        temperature = f"  Temperature: {info['temperature']}°C"
        return "\n".join([title, memory, free, utilization, temperature])


class GPUSelectTool(BaseTool):
    """Tool for selecting best available GPU.

    Selection is delegated to ``select_best_gpu`` on the manager returned by
    ``get_gpu_manager()``; this tool reports the chosen device's name and
    free memory.
    """

    def __init__(self):
        super().__init__(
            name="gpu_select",
            description="Select the best available GPU based on free memory",
        )
        self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0)
        self.gpu_manager = get_gpu_manager()

    async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult:
        """
        Select best GPU.

        Args:
            min_memory_gb: Minimum required memory in GB, passed through to
                the manager's ``select_best_gpu``.
            **kwargs: Accepted but not used by this tool.

        Returns:
            ToolResult with the selected GPU ID and its info in ``metadata``
            on success. A failed result is returned when no GPU is selected,
            when the manager reports an error for the selected GPU, or when
            any exception is raised.
        """
        try:
            gpu_id = self.gpu_manager.select_best_gpu(min_memory_gb)

            if gpu_id is None:
                return ToolResult(
                    success=False,
                    output=None,
                    error=f"No GPU found with {min_memory_gb} GB free memory",
                )

            info = self.gpu_manager.get_gpu_info(gpu_id)

            # get_gpu_info signals failure with an "error" key (the same
            # convention GPUMonitorTool checks). Surface that message instead
            # of failing below with an opaque KeyError on 'name'.
            if "error" in info:
                return ToolResult(
                    success=False,
                    output=None,
                    error=info["error"],
                )

            # NOTE(review): dividing by 1024**3 presumes memory_free is in
            # bytes; confirm against the GPU manager.
            output = (
                f"Selected GPU {gpu_id}: {info['name']}\n"
                f"Free Memory: {info['memory_free'] / 1024**3:.2f} GB"
            )

            return ToolResult(
                success=True,
                output=output,
                metadata={
                    "gpu_id": gpu_id,
                    "gpu_info": info,
                },
            )

        except Exception as e:
            # Boundary handler: tool callers receive a ToolResult rather than
            # an exception.
            logger.error(f"GPU selection error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Selection error: {str(e)}",
            )