|
|
""" |
|
|
GPU Monitoring Example for SPARKNET |
|
|
Demonstrates GPU management and monitoring capabilities |
|
|
""" |
|
|
|
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Make the project root importable so `src.*` resolves when this example
# is run directly from the examples/ directory (not installed as a package).
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
|
from src.utils.gpu_manager import get_gpu_manager |
|
|
from src.utils.logging import setup_logging |
|
|
from loguru import logger |
|
|
import time |
|
|
|
|
|
|
|
|
# Width-70 separator used for all section banners.
BANNER = "=" * 70


def _banner(title: str, *, leading_newline: bool = True) -> None:
    """Log a three-line section banner around *title*."""
    logger.info(("\n" + BANNER) if leading_newline else BANNER)
    logger.info(title)
    logger.info(BANNER)


def _show_status(gpu_manager) -> None:
    """Print the formatted status table for all GPUs."""
    _banner("All GPUs Status")
    # monitor() appears to return a pre-formatted multi-line table;
    # print (not logger) keeps its layout intact on stdout.
    print(gpu_manager.monitor())


def _show_detailed_info(gpu_manager) -> None:
    """Log per-GPU memory, utilization and temperature details."""
    _banner("Detailed GPU Information")
    gib = 1024 ** 3  # bytes per GiB, for human-readable memory figures
    for info in gpu_manager.get_all_gpu_info():
        # Entries carrying an "error" key could not be queried; skip them.
        if "error" in info:
            continue
        logger.info(f"\nGPU {info['gpu_id']}: {info['name']}")
        logger.info(f" Total Memory: {info['memory_total'] / gib:.2f} GB")
        logger.info(f" Used Memory: {info['memory_used'] / gib:.2f} GB")
        logger.info(f" Free Memory: {info['memory_free'] / gib:.2f} GB")
        logger.info(f" Memory Usage: {info['memory_percent']:.1f}%")
        logger.info(f" GPU Utilization: {info['gpu_utilization']}%")
        logger.info(f" Memory Util: {info['memory_utilization']}%")
        logger.info(f" Temperature: {info['temperature']}°C")


def _demo_gpu_selection(gpu_manager) -> None:
    """Select the best GPU for a fixed free-memory requirement and report it."""
    _banner("GPU Selection")
    min_memory = 2.0  # required free memory, in GB
    best_gpu = gpu_manager.select_best_gpu(min_memory_gb=min_memory)

    if best_gpu is not None:
        logger.info(f"\nBest GPU for {min_memory} GB requirement: GPU {best_gpu}")
        gpu_info = gpu_manager.get_gpu_info(best_gpu)
        logger.info(f"Free memory: {gpu_info['memory_free'] / 1024**3:.2f} GB")
    else:
        logger.warning(f"\nNo GPU found with {min_memory} GB free memory")


def _demo_gpu_context(gpu_manager) -> None:
    """Acquire a GPU via the context manager, hold it briefly, release it."""
    _banner("GPU Context Manager Test")
    try:
        with gpu_manager.gpu_context(min_memory_gb=1.0) as gpu_id:
            logger.info(f"\nUsing GPU {gpu_id} in context")
            logger.info("This would be where you load and run your model")
            time.sleep(1)  # stand-in for real model work
        logger.info("GPU context released and cache cleared")
    except RuntimeError as e:
        # gpu_context raises RuntimeError when no GPU satisfies the request.
        logger.error(f"Could not allocate GPU: {e}")


def _show_summary(gpu_manager) -> None:
    """Log a summary of detected, primary and fallback GPUs."""
    _banner("Available GPUs Summary")
    available = gpu_manager.available_gpus
    logger.info(f"\nTotal GPUs detected: {len(available)}")
    logger.info(f"GPU IDs: {available}")
    logger.info(f"Primary GPU: {gpu_manager.primary_gpu}")
    logger.info(f"Fallback GPUs: {gpu_manager.fallback_gpus}")


def main():
    """Run the GPU monitoring example end-to-end.

    Walks through SPARKNET's GPU-management features in order:
    status table, per-device details, best-GPU selection, the
    GPU context manager, and a final availability summary.
    """
    setup_logging(log_level="INFO")

    _banner("SPARKNET GPU Monitoring Example", leading_newline=False)

    gpu_manager = get_gpu_manager()

    _show_status(gpu_manager)
    _show_detailed_info(gpu_manager)
    _demo_gpu_selection(gpu_manager)
    _demo_gpu_context(gpu_manager)
    _show_summary(gpu_manager)

    _banner("GPU Monitoring Example Completed")
|
|
# Standard script entry point: run the example only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|