# SPARKNET: src/utils/gpu_manager.py
"""
GPU Manager for SPARKNET
Handles GPU allocation, monitoring, and resource management
"""
import os
import torch
from typing import Any, Dict, List, Optional
from contextlib import contextmanager
import pynvml
from loguru import logger
class GPUManager:
"""Manages GPU resources for model deployment and monitoring."""
def __init__(self, primary_gpu: int = 0, fallback_gpus: Optional[List[int]] = None):
"""
Initialize GPU Manager.
Args:
primary_gpu: Primary GPU device ID (default: 0)
fallback_gpus: List of fallback GPU IDs (default: [1, 2, 3])
"""
self.primary_gpu = primary_gpu
self.fallback_gpus = fallback_gpus or [1, 2, 3]
self.initialized = False
# Initialize NVML for GPU monitoring
try:
pynvml.nvmlInit()
self.initialized = True
logger.info("GPU Manager initialized with NVML")
except Exception as e:
logger.warning(f"Failed to initialize NVML: {e}")
# Detect available GPUs
self.available_gpus = self._detect_gpus()
logger.info(f"Detected {len(self.available_gpus)} GPUs: {self.available_gpus}")
def _detect_gpus(self) -> List[int]:
"""Detect available CUDA GPUs."""
if not torch.cuda.is_available():
logger.warning("CUDA not available!")
return []
gpu_count = torch.cuda.device_count()
return list(range(gpu_count))
    def get_gpu_info(self, gpu_id: int) -> Dict[str, Any]:
"""
Get detailed information about a GPU.
Args:
gpu_id: GPU device ID
Returns:
Dictionary with GPU information
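        Example (illustrative; the full set of keys is present only when NVML is available):
            >>> info = GPUManager().get_gpu_info(0)
            >>> "memory_free" in info or "error" in info
            True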
"""
if not self.initialized:
return {"error": "NVML not initialized"}
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            name = pynvml.nvmlDeviceGetName(handle)
            # Older pynvml releases return the device name as bytes
            if isinstance(name, bytes):
                name = name.decode("utf-8")
return {
"gpu_id": gpu_id,
"name": name,
"memory_total": mem_info.total,
"memory_used": mem_info.used,
"memory_free": mem_info.free,
"memory_percent": (mem_info.used / mem_info.total) * 100,
"gpu_utilization": utilization.gpu,
"memory_utilization": utilization.memory,
"temperature": temperature,
}
except Exception as e:
logger.error(f"Error getting GPU {gpu_id} info: {e}")
return {"error": str(e)}
    def get_all_gpu_info(self) -> List[Dict[str, Any]]:
"""Get information for all available GPUs."""
return [self.get_gpu_info(gpu_id) for gpu_id in self.available_gpus]
def get_free_memory(self, gpu_id: int) -> int:
"""
Get free memory on a GPU in bytes.
Args:
gpu_id: GPU device ID
Returns:
Free memory in bytes
"""
info = self.get_gpu_info(gpu_id)
return info.get("memory_free", 0)
def select_best_gpu(self, min_memory_gb: float = 8.0) -> Optional[int]:
"""
Select the best available GPU based on free memory.
Args:
min_memory_gb: Minimum required free memory in GB
Returns:
GPU ID or None if no suitable GPU found
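        Example (illustrative; the result depends on current GPU load):
            >>> manager = GPUManager()
            >>> gpu_id = manager.select_best_gpu(min_memory_gb=16.0)
            >>> gpu_id is None or gpu_id in manager.available_gpus
            True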
"""
min_memory_bytes = min_memory_gb * 1024 ** 3
# Try primary GPU first
if self.primary_gpu in self.available_gpus:
free_mem = self.get_free_memory(self.primary_gpu)
if free_mem >= min_memory_bytes:
logger.info(f"Selected primary GPU {self.primary_gpu} ({free_mem / 1024**3:.2f} GB free)")
return self.primary_gpu
# Try fallback GPUs
for gpu_id in self.fallback_gpus:
if gpu_id in self.available_gpus:
free_mem = self.get_free_memory(gpu_id)
if free_mem >= min_memory_bytes:
logger.info(f"Selected fallback GPU {gpu_id} ({free_mem / 1024**3:.2f} GB free)")
return gpu_id
logger.warning(f"No GPU found with {min_memory_gb} GB free memory")
return None
def set_device(self, gpu_id: int):
"""
Set the CUDA device.
Args:
gpu_id: GPU device ID
"""
if gpu_id not in self.available_gpus:
raise ValueError(f"GPU {gpu_id} not available")
        # Bind torch to the physical device index first; changing
        # CUDA_VISIBLE_DEVICES afterwards does not remap an already-initialized
        # CUDA context and is exported only as a hint for child processes.
        torch.cuda.set_device(gpu_id)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
logger.info(f"Set CUDA device to GPU {gpu_id}")
@contextmanager
def gpu_context(self, gpu_id: Optional[int] = None, min_memory_gb: float = 8.0):
"""
Context manager for GPU allocation.
Args:
gpu_id: Specific GPU ID or None for auto-selection
min_memory_gb: Minimum required memory in GB
Yields:
GPU device ID
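        Example (illustrative; assumes a GPU with enough free memory):
            >>> manager = GPUManager()
            >>> with manager.gpu_context(min_memory_gb=8.0) as gpu_id:
            ...     tensor = torch.ones(4, device=f"cuda:{gpu_id}")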
"""
# Select GPU
if gpu_id is None:
gpu_id = self.select_best_gpu(min_memory_gb)
if gpu_id is None:
raise RuntimeError("No suitable GPU available")
# Store original device
original_device = os.environ.get("CUDA_VISIBLE_DEVICES", "")
try:
self.set_device(gpu_id)
yield gpu_id
finally:
            # Restore the original CUDA_VISIBLE_DEVICES value, unsetting it if
            # it was not set before entering the context
            if original_device:
                os.environ["CUDA_VISIBLE_DEVICES"] = original_device
            else:
                os.environ.pop("CUDA_VISIBLE_DEVICES", None)
# Clear CUDA cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
logger.debug("Cleared CUDA cache")
def clear_cache(self, gpu_id: Optional[int] = None):
"""
Clear CUDA cache for a specific GPU or all GPUs.
Args:
gpu_id: GPU device ID or None for all GPUs
"""
if gpu_id is not None:
with torch.cuda.device(gpu_id):
torch.cuda.empty_cache()
logger.info(f"Cleared cache for GPU {gpu_id}")
else:
torch.cuda.empty_cache()
logger.info("Cleared cache for all GPUs")
def monitor(self) -> str:
"""
Get a formatted monitoring string for all GPUs.
Returns:
Formatted string with GPU status
"""
info_list = self.get_all_gpu_info()
lines = ["GPU Status:"]
for info in info_list:
if "error" in info:
lines.append(f" GPU {info.get('gpu_id', '?')}: Error - {info['error']}")
else:
lines.append(
f" GPU {info['gpu_id']}: {info['name']} | "
f"Memory: {info['memory_used'] / 1024**3:.2f}/{info['memory_total'] / 1024**3:.2f} GB "
f"({info['memory_percent']:.1f}%) | "
f"Utilization: {info['gpu_utilization']}% | "
f"Temp: {info['temperature']}°C"
)
return "\n".join(lines)
def __del__(self):
"""Cleanup NVML on deletion."""
if self.initialized:
try:
pynvml.nvmlShutdown()
except Exception:
pass
# Global GPU manager instance
_gpu_manager: Optional[GPUManager] = None
def get_gpu_manager() -> GPUManager:
"""Get or create the global GPU manager instance."""
global _gpu_manager
if _gpu_manager is None:
_gpu_manager = GPUManager()
return _gpu_manager
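

# Minimal usage sketch (illustrative): prints GPU status and exercises the
# allocation context manager. Assumes a host with pynvml installed; prints a
# message instead when no CUDA GPU with enough free memory is present.
if __name__ == "__main__":
    manager = get_gpu_manager()
    print(manager.monitor())
    try:
        with manager.gpu_context(min_memory_gb=1.0) as gpu_id:
            # Allocate a small tensor on the selected device as a smoke test.
            x = torch.zeros(1024, device=f"cuda:{gpu_id}")
            print(f"Allocated {x.numel()} elements on GPU {gpu_id}")
    except RuntimeError as exc:
        print(f"No suitable GPU: {exc}")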