|
|
""" |
|
|
GPU Tools for SPARKNET |
|
|
Tools for GPU monitoring and management |
|
|
""" |
|
|
|
|
|
from typing import Optional |
|
|
from loguru import logger |
|
|
from .base_tool import BaseTool, ToolResult |
|
|
from ..utils.gpu_manager import get_gpu_manager |
|
|
|
|
|
|
|
|
class GPUMonitorTool(BaseTool):
    """Tool for monitoring GPU status, memory usage, and utilization."""

    def __init__(self):
        super().__init__(
            name="gpu_monitor",
            description="Monitor GPU status, memory usage, and utilization",
        )
        self.add_parameter("gpu_id", "int", "Specific GPU ID to monitor (optional)", required=False, default=None)
        # Shared manager singleton; all GPU queries below go through it.
        self.gpu_manager = get_gpu_manager()

    async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult:
        """
        Monitor GPU status.

        Args:
            gpu_id: Specific GPU ID, or None to report on all GPUs.

        Returns:
            ToolResult whose ``output`` is a human-readable status report
            and whose ``metadata`` carries the raw info dict(s). On failure
            (unknown GPU, or no GPU information available) ``success`` is
            False and ``error`` describes the problem.
        """
        try:
            if gpu_id is not None:
                # Single-GPU query: the manager signals failure by putting
                # an "error" key into the returned dict.
                info = self.gpu_manager.get_gpu_info(gpu_id)
                if "error" in info:
                    return ToolResult(
                        success=False,
                        output=None,
                        error=info["error"],
                    )
                return ToolResult(
                    success=True,
                    output=self._format_gpu_info(info),
                    metadata=info,
                )

            # All-GPU query: skip entries that reported errors.
            all_info = self.gpu_manager.get_all_gpu_info()
            output_lines = [
                self._format_gpu_info(info)
                for info in all_info
                if "error" not in info
            ]

            if not output_lines:
                # Bug fix: previously this returned success=True with an
                # empty output string when no GPU could be queried (empty
                # list, or every entry had an "error" key). Report failure
                # explicitly instead of a silent false success.
                return ToolResult(
                    success=False,
                    output=None,
                    error="No GPU information available",
                )

            return ToolResult(
                success=True,
                output="\n\n".join(output_lines),
                metadata={"gpus": all_info},
            )

        except Exception as e:
            # Defensive boundary: monitoring must never raise into the caller.
            logger.error(f"GPU monitoring error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Monitoring error: {str(e)}",
            )

    def _format_gpu_info(self, info: dict) -> str:
        """Format a single GPU info dict as a multi-line display string.

        Memory values are reported by the manager in bytes; converted here
        to GB (1024**3) for display.
        """
        return (
            f"GPU {info['gpu_id']}: {info['name']}\n"
            f" Memory: {info['memory_used'] / 1024**3:.2f} GB / {info['memory_total'] / 1024**3:.2f} GB "
            f"({info['memory_percent']:.1f}% used)\n"
            f" Free Memory: {info['memory_free'] / 1024**3:.2f} GB\n"
            f" GPU Utilization: {info['gpu_utilization']}%\n"
            f" Temperature: {info['temperature']}°C"
        )
|
|
|
|
|
|
|
|
class GPUSelectTool(BaseTool):
    """Tool for selecting the best available GPU based on free memory."""

    def __init__(self):
        super().__init__(
            name="gpu_select",
            description="Select the best available GPU based on free memory",
        )
        self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0)
        # Shared manager singleton; selection and info queries go through it.
        self.gpu_manager = get_gpu_manager()

    async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult:
        """
        Select the best GPU with at least ``min_memory_gb`` GB free.

        Args:
            min_memory_gb: Minimum required free memory in GB.

        Returns:
            ToolResult with the selected GPU ID and its info dict in
            ``metadata`` on success; a failure result when no suitable GPU
            exists or its details cannot be read.
        """
        try:
            gpu_id = self.gpu_manager.select_best_gpu(min_memory_gb)

            if gpu_id is None:
                return ToolResult(
                    success=False,
                    output=None,
                    error=f"No GPU found with {min_memory_gb} GB free memory",
                )

            info = self.gpu_manager.get_gpu_info(gpu_id)

            # Bug fix: get_gpu_info can return an error dict (the manager
            # signals failure via an "error" key, as checked in
            # GPUMonitorTool). Previously this fell through to
            # info['name'] and raised KeyError, which was swallowed into a
            # generic "Selection error". Surface the real error instead.
            if "error" in info:
                return ToolResult(
                    success=False,
                    output=None,
                    error=info["error"],
                )

            output = (
                f"Selected GPU {gpu_id}: {info['name']}\n"
                f"Free Memory: {info['memory_free'] / 1024**3:.2f} GB"
            )

            return ToolResult(
                success=True,
                output=output,
                metadata={
                    "gpu_id": gpu_id,
                    "gpu_info": info,
                },
            )

        except Exception as e:
            # Defensive boundary: selection must never raise into the caller.
            logger.error(f"GPU selection error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Selection error: {str(e)}",
            )
|
|
|