File size: 4,467 Bytes
a9dc537 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
"""
GPU Tools for SPARKNET
Tools for GPU monitoring and management
"""
from typing import Optional
from loguru import logger
from .base_tool import BaseTool, ToolResult
from ..utils.gpu_manager import get_gpu_manager
class GPUMonitorTool(BaseTool):
    """Tool for monitoring GPU status.

    Wraps the shared GPU manager and renders the info dicts it returns as
    human-readable text, for either a single GPU or every GPU it reports.
    """

    def __init__(self):
        """Register the tool metadata and its optional ``gpu_id`` parameter."""
        super().__init__(
            name="gpu_monitor",
            description="Monitor GPU status, memory usage, and utilization",
        )
        self.add_parameter("gpu_id", "int", "Specific GPU ID to monitor (optional)", required=False, default=None)
        self.gpu_manager = get_gpu_manager()

    async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult:
        """
        Monitor GPU status.

        Args:
            gpu_id: Specific GPU ID or None for all GPUs
            **kwargs: Accepted and ignored.

        Returns:
            ToolResult with GPU information. For a single GPU the result is
            a failure when the manager's info dict contains an "error" key.
            For all GPUs the result is always successful; GPUs whose info
            dict contains an "error" key are listed in the output as
            unavailable instead of being dropped, and the raw info dicts are
            returned under ``metadata["gpus"]``.
        """
        # Tool boundary: any unexpected failure is converted into a failed
        # ToolResult rather than propagated to the caller.
        try:
            if gpu_id is not None:
                # Get info for specific GPU
                info = self.gpu_manager.get_gpu_info(gpu_id)
                if "error" in info:
                    return ToolResult(
                        success=False,
                        output=None,
                        error=info["error"],
                    )
                return ToolResult(
                    success=True,
                    output=self._format_gpu_info(info),
                    metadata=info,
                )

            # Get info for all GPUs
            all_info = self.gpu_manager.get_all_gpu_info()
            sections = [
                self._format_gpu_error(info) if "error" in info else self._format_gpu_info(info)
                for info in all_info
            ]
            return ToolResult(
                success=True,
                output="\n\n".join(sections),
                metadata={"gpus": all_info},
            )
        except Exception as e:
            logger.error(f"GPU monitoring error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Monitoring error: {str(e)}",
            )

    def _format_gpu_info(self, info: dict) -> str:
        """Format GPU info for display.

        Args:
            info: Info dict providing the keys ``gpu_id``, ``name``,
                ``memory_used``, ``memory_total``, ``memory_percent``,
                ``memory_free``, ``gpu_utilization`` and ``temperature``.

        Returns:
            Multi-line summary of the GPU.
        """
        # Memory values are divided by 1024**3 before being labelled "GB";
        # presumably the manager reports them in bytes -- confirm against
        # gpu_manager.
        return (
            f"GPU {info['gpu_id']}: {info['name']}\n"
            f"  Memory: {info['memory_used'] / 1024**3:.2f} GB / {info['memory_total'] / 1024**3:.2f} GB "
            f"({info['memory_percent']:.1f}% used)\n"
            f"  Free Memory: {info['memory_free'] / 1024**3:.2f} GB\n"
            f"  GPU Utilization: {info['gpu_utilization']}%\n"
            f"  Temperature: {info['temperature']}°C"
        )

    def _format_gpu_error(self, info: dict) -> str:
        """Format an info dict that carries an "error" key for display.

        Args:
            info: Info dict containing an ``error`` entry; ``gpu_id`` is
                used for the label only when present.

        Returns:
            Single-line description of the unavailable GPU.
        """
        # The error dict's schema is not visible here, so do not assume it
        # identifies the GPU.
        label = f"GPU {info['gpu_id']}" if "gpu_id" in info else "GPU"
        return f"{label}: unavailable ({info['error']})"
class GPUSelectTool(BaseTool):
    """Tool for selecting best available GPU.

    Delegates the choice to the shared GPU manager and reports the selected
    GPU's name and free memory.
    """

    def __init__(self):
        """Register the tool metadata and its optional ``min_memory_gb`` parameter."""
        super().__init__(
            name="gpu_select",
            description="Select the best available GPU based on free memory",
        )
        self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0)
        self.gpu_manager = get_gpu_manager()

    async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult:
        """
        Select best GPU.

        Args:
            min_memory_gb: Minimum required memory
            **kwargs: Accepted and ignored.

        Returns:
            ToolResult with selected GPU ID. The result is a failure when
            the manager selects no GPU, when the selected GPU's info dict
            contains an "error" key, or when an unexpected exception occurs.
        """
        # Tool boundary: any unexpected failure is converted into a failed
        # ToolResult rather than propagated to the caller.
        try:
            gpu_id = self.gpu_manager.select_best_gpu(min_memory_gb)
            if gpu_id is None:
                return ToolResult(
                    success=False,
                    output=None,
                    error=f"No GPU found with {min_memory_gb} GB free memory",
                )

            info = self.gpu_manager.get_gpu_info(gpu_id)
            # get_gpu_info signals failure through an "error" key. Without
            # this check the lookups below raise KeyError and the caller
            # only sees "Selection error: 'name'".
            if "error" in info:
                return ToolResult(
                    success=False,
                    output=None,
                    error=info["error"],
                )

            output = (
                f"Selected GPU {gpu_id}: {info['name']}\n"
                f"Free Memory: {info['memory_free'] / 1024**3:.2f} GB"
            )
            return ToolResult(
                success=True,
                output=output,
                metadata={
                    "gpu_id": gpu_id,
                    "gpu_info": info,
                },
            )
        except Exception as e:
            logger.error(f"GPU selection error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Selection error: {str(e)}",
            )
|