File size: 4,872 Bytes
747451d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# /*---------------------------------------------------------------------------------------------
#  * Copyright (c) 2022 STMicroelectronics.
#  * All rights reserved.
#  * This software is licensed under terms that can be found in the LICENSE file in
#  * the root directory of this software component.
#  * If no LICENSE file comes with this software, it is provided AS-IS.
#  *--------------------------------------------------------------------------------------------*/

import os, sys, time
import numpy as np
import tensorflow as tf


def set_gpu_memory_limit(gigabytes):
    """
    Set an upper memory limit on the first visible GPU (gpu[0]).

    Args:
        gigabytes (int): The number of gigabytes to set as the upper memory limit.

    Raises:
        RuntimeError: If virtual devices are configured after the GPUs have
            already been initialized.

    Returns:
        None
    """
    # GPU memory usage configuration
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        # CPU-only host: nothing to configure.
        return
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * gigabytes)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print("{} physical GPUs, {} logical GPUs".format(len(gpus), len(logical_gpus)))
        print("[INFO] : Setting upper memory limit to {}GBytes on gpu[0]".format(gigabytes))
    except (ValueError, RuntimeError) as e:
        # set_logical_device_configuration raises once the runtime has touched
        # the GPUs; chain the original error instead of using a bare `except:`.
        raise RuntimeError("\nVirtual devices must be set before GPUs have been initialized.") from e


def inc_gpu_mode() -> None:
    """
    Enable on-demand (incremental) GPU memory allocation.

    Turns on memory growth for every physical GPU so TensorFlow grabs
    device memory only as it is actually needed, rather than reserving
    it all up front.

    Returns:
        None
    """
    devices = tf.config.experimental.list_physical_devices('GPU')
    if not devices:
        return

    try:
        for device in devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        # Memory growth must be set before the GPUs are initialized.
        print(f"Error setting memory growth: {e}")


def check_training_determinism(model: tf.keras.Model, sample_ds: tf.data.Dataset):
    """
    Probe one forward/backward pass to detect operations that would raise
    an exception during training.

    Args:
        model (tf.keras.Model): A compiled Keras model (loss must be set).
        sample_ds (tf.data.Dataset): Dataset yielding (x, y) batches; only
            the first batch is consumed.

    Returns:
        valid_training (bool): True if the forward/backward pass raised
            no exception.
    """
    x_sample, y_sample = next(iter(sample_ds))

    try:
        with tf.GradientTape() as tape:
            predictions = model(x_sample, training=True)
            loss_value = model.loss(y_sample, predictions)
        _ = tape.gradient(loss_value, model.trainable_variables)
    except Exception as error:
        # Any failing op is reported and treated as "training not valid";
        # the broad catch is intentional — this function is a smoke test.
        print(f"[WARN] {error}")
        return False

    return True


def get_mem_consumption(batchsize, input_shape, model):
    """
    Measure peak GPU memory and wall-clock time for one full training step
    (forward pass, loss, gradients, optimizer update) on synthetic data.

    Args:
        batchsize (int): The batch size for the input data.
        input_shape (tuple): The per-sample input shape (without batch dim).
        model (tf.keras.Model): Compiled model (loss and optimizer set).

    Returns:
        tuple: (peak memory consumption in GB, time consumed in seconds).
    """
    img = np.random.rand(batchsize, *input_shape)
    # model.output.shape usually has a dynamic batch dimension (None), which
    # would make np.random.rand(*shape) fail — substitute the real batch size.
    labels_shape = (batchsize,) + tuple(model.output.shape[1:])
    label = np.random.rand(*labels_shape)
    tf.config.experimental.reset_memory_stats("GPU:0")
    t1 = time.time()
    with tf.GradientTape(watch_accessed_variables=False) as g:
        g.watch(model.trainable_variables)
        model_output = model(img, training=True)
        loss = model.loss(label, model_output)
    gradients = g.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    t2 = time.time()
    info = tf.config.experimental.get_memory_info("GPU:0")
    tf.config.experimental.reset_memory_stats("GPU:0")
    time_consumed = t2 - t1
    # Convert peak bytes to GB.
    return (info["peak"] / 1024 / 1024 / 1024, time_consumed)


def gpu_benchmark(gpu_limit, batch_size, input_shape, model):
    """
    Benchmark the memory consumption of a given model with a given batch size
    and input shape.

    Args:
        gpu_limit (float): The maximum amount of GPU memory (in GB) that can be used.
        batch_size (int): The batch size for the input data.
        input_shape (tuple): The shape of the input data.
        model (tf.keras.Model): The model to benchmark.

    Returns:
        bool: True if an exception (e.g. OOM) was raised during benchmarking,
            False otherwise.
    """
    S_E = False
    try:
        info = get_mem_consumption(batch_size, input_shape, model)
        memory_with_tf_overhead = info[0]
        if gpu_limit > memory_with_tf_overhead:
            print("[INFO] : Model memory requirement: {:.2f} GB".format(memory_with_tf_overhead))
    except Exception as e:
        # Report the failure cause instead of silently discarding `e`;
        # callers still only see the boolean flag.
        print(f"[WARN] GPU benchmark raised an exception: {e}")
        S_E = True
    return S_E