""" GPU Monitoring Example for SPARKNET Demonstrates GPU management and monitoring capabilities """ import sys from pathlib import Path # Add parent directory to path sys.path.insert(0, str(Path(__file__).parent.parent)) from src.utils.gpu_manager import get_gpu_manager from src.utils.logging import setup_logging from loguru import logger import time def main(): """Run GPU monitoring example.""" # Setup logging setup_logging(log_level="INFO") logger.info("="*70) logger.info("SPARKNET GPU Monitoring Example") logger.info("="*70) # Get GPU manager gpu_manager = get_gpu_manager() # Show all GPU info logger.info("\n" + "="*70) logger.info("All GPUs Status") logger.info("="*70) print(gpu_manager.monitor()) # Show detailed info for each GPU logger.info("\n" + "="*70) logger.info("Detailed GPU Information") logger.info("="*70) all_info = gpu_manager.get_all_gpu_info() for info in all_info: if "error" not in info: logger.info(f"\nGPU {info['gpu_id']}: {info['name']}") logger.info(f" Total Memory: {info['memory_total'] / 1024**3:.2f} GB") logger.info(f" Used Memory: {info['memory_used'] / 1024**3:.2f} GB") logger.info(f" Free Memory: {info['memory_free'] / 1024**3:.2f} GB") logger.info(f" Memory Usage: {info['memory_percent']:.1f}%") logger.info(f" GPU Utilization: {info['gpu_utilization']}%") logger.info(f" Memory Util: {info['memory_utilization']}%") logger.info(f" Temperature: {info['temperature']}°C") # Select best GPU logger.info("\n" + "="*70) logger.info("GPU Selection") logger.info("="*70) min_memory = 2.0 # 2 GB minimum best_gpu = gpu_manager.select_best_gpu(min_memory_gb=min_memory) if best_gpu is not None: logger.info(f"\nBest GPU for {min_memory} GB requirement: GPU {best_gpu}") gpu_info = gpu_manager.get_gpu_info(best_gpu) logger.info(f"Free memory: {gpu_info['memory_free'] / 1024**3:.2f} GB") else: logger.warning(f"\nNo GPU found with {min_memory} GB free memory") # Test GPU context manager logger.info("\n" + "="*70) logger.info("GPU Context Manager Test") logger.info("="*70) try: with gpu_manager.gpu_context(min_memory_gb=1.0) as gpu_id: logger.info(f"\nUsing GPU {gpu_id} in context") logger.info("This would be where you load and run your model") time.sleep(1) logger.info("GPU context released and cache cleared") except RuntimeError as e: logger.error(f"Could not allocate GPU: {e}") # Show available GPUs logger.info("\n" + "="*70) logger.info("Available GPUs Summary") logger.info("="*70) available = gpu_manager.available_gpus logger.info(f"\nTotal GPUs detected: {len(available)}") logger.info(f"GPU IDs: {available}") logger.info(f"Primary GPU: {gpu_manager.primary_gpu}") logger.info(f"Fallback GPUs: {gpu_manager.fallback_gpus}") logger.info("\n" + "="*70) logger.info("GPU Monitoring Example Completed") logger.info("="*70) if __name__ == "__main__": main()