""" GPU Tools for SPARKNET Tools for GPU monitoring and management """ from typing import Optional from loguru import logger from .base_tool import BaseTool, ToolResult from ..utils.gpu_manager import get_gpu_manager class GPUMonitorTool(BaseTool): """Tool for monitoring GPU status.""" def __init__(self): super().__init__( name="gpu_monitor", description="Monitor GPU status, memory usage, and utilization", ) self.add_parameter("gpu_id", "int", "Specific GPU ID to monitor (optional)", required=False, default=None) self.gpu_manager = get_gpu_manager() async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult: """ Monitor GPU status. Args: gpu_id: Specific GPU ID or None for all GPUs Returns: ToolResult with GPU information """ try: if gpu_id is not None: # Get info for specific GPU info = self.gpu_manager.get_gpu_info(gpu_id) if "error" in info: return ToolResult( success=False, output=None, error=info["error"], ) output = self._format_gpu_info(info) return ToolResult( success=True, output=output, metadata=info, ) else: # Get info for all GPUs all_info = self.gpu_manager.get_all_gpu_info() output_lines = [] for info in all_info: if "error" not in info: output_lines.append(self._format_gpu_info(info)) output = "\n\n".join(output_lines) return ToolResult( success=True, output=output, metadata={"gpus": all_info}, ) except Exception as e: logger.error(f"GPU monitoring error: {e}") return ToolResult( success=False, output=None, error=f"Monitoring error: {str(e)}", ) def _format_gpu_info(self, info: dict) -> str: """Format GPU info for display.""" return ( f"GPU {info['gpu_id']}: {info['name']}\n" f" Memory: {info['memory_used'] / 1024**3:.2f} GB / {info['memory_total'] / 1024**3:.2f} GB " f"({info['memory_percent']:.1f}% used)\n" f" Free Memory: {info['memory_free'] / 1024**3:.2f} GB\n" f" GPU Utilization: {info['gpu_utilization']}%\n" f" Temperature: {info['temperature']}°C" ) class GPUSelectTool(BaseTool): """Tool for selecting best available GPU.""" def __init__(self): super().__init__( name="gpu_select", description="Select the best available GPU based on free memory", ) self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0) self.gpu_manager = get_gpu_manager() async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult: """ Select best GPU. Args: min_memory_gb: Minimum required memory Returns: ToolResult with selected GPU ID """ try: gpu_id = self.gpu_manager.select_best_gpu(min_memory_gb) if gpu_id is None: return ToolResult( success=False, output=None, error=f"No GPU found with {min_memory_gb} GB free memory", ) info = self.gpu_manager.get_gpu_info(gpu_id) output = ( f"Selected GPU {gpu_id}: {info['name']}\n" f"Free Memory: {info['memory_free'] / 1024**3:.2f} GB" ) return ToolResult( success=True, output=output, metadata={ "gpu_id": gpu_id, "gpu_info": info, }, ) except Exception as e: logger.error(f"GPU selection error: {e}") return ToolResult( success=False, output=None, error=f"Selection error: {str(e)}", )