File size: 3,251 Bytes
a9dc537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
GPU Monitoring Example for SPARKNET
Demonstrates GPU management and monitoring capabilities
"""

import sys
from pathlib import Path

# Add parent directory to path so the `src` package resolves when this example
# is run directly as a script. NOTE: this insert MUST happen before the
# `src.utils.*` imports below — do not reorder/regroup these imports.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.utils.gpu_manager import get_gpu_manager
from src.utils.logging import setup_logging
from loguru import logger
import time


def _section(title: str) -> None:
    """Log a section banner: a leading blank line, the title, and a closing rule.

    Factored out because this exact three-line pattern was repeated six
    times throughout ``main``.
    """
    logger.info("\n" + "=" * 70)
    logger.info(title)
    logger.info("=" * 70)


def main():
    """Run the GPU monitoring example.

    Walks through the GPU manager's capabilities in order: overall status,
    per-GPU details, best-GPU selection, the allocation context manager,
    and a final availability summary. Purely demonstrative — logs results,
    returns nothing.
    """
    setup_logging(log_level="INFO")

    # Opening banner intentionally has no leading newline (unlike sections).
    logger.info("=" * 70)
    logger.info("SPARKNET GPU Monitoring Example")
    logger.info("=" * 70)

    gpu_manager = get_gpu_manager()

    # Overall status table (monitor() returns a preformatted string).
    _section("All GPUs Status")
    print(gpu_manager.monitor())

    # Per-GPU details. Entries containing an "error" key are GPUs the
    # manager could not query; skip them.
    _section("Detailed GPU Information")
    for info in gpu_manager.get_all_gpu_info():
        if "error" in info:
            continue
        logger.info(f"\nGPU {info['gpu_id']}: {info['name']}")
        # memory_* values are divided by 1024**3, so they are presumably
        # reported in bytes — TODO confirm against gpu_manager.
        logger.info(f"  Total Memory:     {info['memory_total'] / 1024**3:.2f} GB")
        logger.info(f"  Used Memory:      {info['memory_used'] / 1024**3:.2f} GB")
        logger.info(f"  Free Memory:      {info['memory_free'] / 1024**3:.2f} GB")
        logger.info(f"  Memory Usage:     {info['memory_percent']:.1f}%")
        logger.info(f"  GPU Utilization:  {info['gpu_utilization']}%")
        logger.info(f"  Memory Util:      {info['memory_utilization']}%")
        logger.info(f"  Temperature:      {info['temperature']}°C")

    # Pick the best GPU with at least `min_memory` GB free.
    _section("GPU Selection")
    min_memory = 2.0  # 2 GB minimum
    best_gpu = gpu_manager.select_best_gpu(min_memory_gb=min_memory)

    if best_gpu is not None:
        logger.info(f"\nBest GPU for {min_memory} GB requirement: GPU {best_gpu}")
        gpu_info = gpu_manager.get_gpu_info(best_gpu)
        # Guard against an error dict (same convention as get_all_gpu_info),
        # which would otherwise raise KeyError on 'memory_free'.
        if "error" not in gpu_info:
            logger.info(f"Free memory: {gpu_info['memory_free'] / 1024**3:.2f} GB")
    else:
        logger.warning(f"\nNo GPU found with {min_memory} GB free memory")

    # Demonstrate the allocation context manager; RuntimeError signals that
    # no GPU satisfying the request could be allocated.
    _section("GPU Context Manager Test")
    try:
        with gpu_manager.gpu_context(min_memory_gb=1.0) as gpu_id:
            logger.info(f"\nUsing GPU {gpu_id} in context")
            logger.info("This would be where you load and run your model")
            time.sleep(1)
        logger.info("GPU context released and cache cleared")
    except RuntimeError as e:
        logger.error(f"Could not allocate GPU: {e}")

    # Final summary of what the manager detected.
    _section("Available GPUs Summary")
    available = gpu_manager.available_gpus
    logger.info(f"\nTotal GPUs detected: {len(available)}")
    logger.info(f"GPU IDs: {available}")
    logger.info(f"Primary GPU: {gpu_manager.primary_gpu}")
    logger.info(f"Fallback GPUs: {gpu_manager.fallback_gpus}")

    _section("GPU Monitoring Example Completed")


# Script entry point: only run the example when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()