File size: 3,251 Bytes
a9dc537 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
"""
GPU Monitoring Example for SPARKNET
Demonstrates GPU management and monitoring capabilities
"""
import sys
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.utils.gpu_manager import get_gpu_manager
from src.utils.logging import setup_logging
from loguru import logger
import time
def main():
    """Run the GPU monitoring demonstration end-to-end.

    Walks through the GPU manager's capabilities in order: overall
    status, per-GPU details, best-GPU selection, the allocation
    context manager, and a summary of the detected devices.
    """
    bar = "=" * 70

    def section(title: str) -> None:
        # Emit a framed section header; the leading newline visually
        # separates it from the previous section's output.
        logger.info("\n" + bar)
        logger.info(title)
        logger.info(bar)

    # Configure logging before anything is emitted.
    setup_logging(log_level="INFO")

    # Opening banner (no leading newline on the very first frame).
    logger.info(bar)
    logger.info("SPARKNET GPU Monitoring Example")
    logger.info(bar)

    manager = get_gpu_manager()

    # High-level status table for every device.
    section("All GPUs Status")
    print(manager.monitor())

    # Per-device details; entries carrying an "error" key are skipped.
    section("Detailed GPU Information")
    for gpu in manager.get_all_gpu_info():
        if "error" in gpu:
            continue
        logger.info(f"\nGPU {gpu['gpu_id']}: {gpu['name']}")
        logger.info(f" Total Memory: {gpu['memory_total'] / 1024**3:.2f} GB")
        logger.info(f" Used Memory: {gpu['memory_used'] / 1024**3:.2f} GB")
        logger.info(f" Free Memory: {gpu['memory_free'] / 1024**3:.2f} GB")
        logger.info(f" Memory Usage: {gpu['memory_percent']:.1f}%")
        logger.info(f" GPU Utilization: {gpu['gpu_utilization']}%")
        logger.info(f" Memory Util: {gpu['memory_utilization']}%")
        logger.info(f" Temperature: {gpu['temperature']}°C")

    # Pick the best device that satisfies a free-memory floor.
    section("GPU Selection")
    min_memory = 2.0  # minimum free memory requirement, in GB
    best_gpu = manager.select_best_gpu(min_memory_gb=min_memory)
    if best_gpu is None:
        logger.warning(f"\nNo GPU found with {min_memory} GB free memory")
    else:
        logger.info(f"\nBest GPU for {min_memory} GB requirement: GPU {best_gpu}")
        chosen = manager.get_gpu_info(best_gpu)
        logger.info(f"Free memory: {chosen['memory_free'] / 1024**3:.2f} GB")

    # Demonstrate scoped allocation; RuntimeError signals no GPU available.
    section("GPU Context Manager Test")
    try:
        with manager.gpu_context(min_memory_gb=1.0) as gpu_id:
            logger.info(f"\nUsing GPU {gpu_id} in context")
            logger.info("This would be where you load and run your model")
            time.sleep(1)
        logger.info("GPU context released and cache cleared")
    except RuntimeError as e:
        logger.error(f"Could not allocate GPU: {e}")

    # Final summary of what the manager detected.
    section("Available GPUs Summary")
    available = manager.available_gpus
    logger.info(f"\nTotal GPUs detected: {len(available)}")
    logger.info(f"GPU IDs: {available}")
    logger.info(f"Primary GPU: {manager.primary_gpu}")
    logger.info(f"Fallback GPUs: {manager.fallback_gpus}")

    section("GPU Monitoring Example Completed")
# Standard entry-point guard: run the example only when executed directly.
if __name__ == "__main__":
    main()
|