stm32-modelzoo-app / common /utils /gpu_utils.py
FBAGSTM's picture
STM32 AI Experimentation Hub
747451d
# /*---------------------------------------------------------------------------------------------
# * Copyright (c) 2022 STMicroelectronics.
# * All rights reserved.
# * This software is licensed under terms that can be found in the LICENSE file in
# * the root directory of this software component.
# * If no LICENSE file comes with this software, it is provided AS-IS.
# *--------------------------------------------------------------------------------------------*/
import os, sys, time
import numpy as np
import tensorflow as tf
def set_gpu_memory_limit(gigabytes):
    """
    Sets the upper memory limit for the first GPU to the specified number of gigabytes.

    Nothing is done when TensorFlow reports no physical GPU.

    Args:
        gigabytes (int): The number of gigabytes to set as the upper memory limit.

    Raises:
        RuntimeError: If the logical device configuration could not be applied
            (typically because the GPUs have already been initialized). The
            underlying error is attached as the cause.

    Returns:
        None
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        return

    try:
        # The limit is handed to TensorFlow as 1024 * gigabytes (i.e. megabytes).
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * gigabytes)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print("{} physical GPUs, {} logical GPUs".format(len(gpus), len(logical_gpus)))
        print("[INFO] : Setting upper memory limit to {}GBytes on gpu[0]".format(gigabytes))
    except Exception as error:
        # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate instead of being reported as a GPU error.
        raise RuntimeError(
            "\nVirtual devices must be set before GPUs have been initialized.") from error
def inc_gpu_mode() -> None:
    """
    Turns on TensorFlow's memory-growth option for every physical GPU.

    When no physical GPU is listed the function returns without doing anything.
    A RuntimeError raised while enabling memory growth is printed, not propagated;
    GPUs that come after the failing one are left untouched.

    Returns:
        None
    """
    gpu_devices = tf.config.experimental.list_physical_devices('GPU')
    if gpu_devices:
        try:
            for device in gpu_devices:
                tf.config.experimental.set_memory_growth(device, True)
        except RuntimeError as e:
            print(f"Error setting memory growth: {e}")
def check_training_determinism(model: tf.keras.Model, sample_ds: tf.data.Dataset):
    """
    Runs a single forward/backward pass and reports whether it raised an exception.

    The first element of `sample_ds` is fetched (outside the guarded section, so
    an error while reading the dataset propagates to the caller), the model is
    called in training mode, `model.loss` is evaluated and the gradients with
    respect to the trainable variables are computed.

    Args:
        model (tf.keras.Model): A keras model exposing a callable `loss`.
        sample_ds (tf.data.Dataset): Dataset whose elements unpack into an
            (inputs, targets) pair.

    Returns:
        bool: True if the training step raised no exception, False otherwise
            (the exception is printed as a warning).
    """
    inputs, targets = next(iter(sample_ds))

    try:
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss_value = model.loss(targets, predictions)
        # Only the side effect (a possible exception) matters; the gradients are discarded.
        tape.gradient(loss_value, model.trainable_variables)
    except Exception as error:
        print(f"[WARN] {error}")
        return False

    return True
def get_mem_consumption(batchsize,input_shape,model):
    """
    Measure the peak GPU memory and the time consumed by one training step
    (forward pass, loss, gradients, optimizer update) on random data.

    Args:
        batchsize (int): The batch size for the input data.
        input_shape (tuple): The shape of one input sample (without the batch dimension).
        model (tf.keras.Model): The model to measure. It must expose a callable
            `loss` and an `optimizer`.

    Returns:
        tuple: (peak_memory, time_consumed) where peak_memory is the peak memory
            reported for "GPU:0" divided by 1024**3, and time_consumed is the
            wall-clock duration of the step in seconds.
    """
    img = np.random.rand(batchsize, *input_shape)

    # The batch dimension of `model.output.shape` is usually None, which
    # np.random.rand cannot handle; use the requested batch size instead so the
    # labels always match the number of generated inputs.
    per_sample_shape = tuple(model.output.shape)[1:]
    label = np.random.rand(batchsize, *per_sample_shape)

    # Start from clean statistics so the peak only reflects this training step.
    tf.config.experimental.reset_memory_stats("GPU:0")
    t1 = time.time()
    with tf.GradientTape(watch_accessed_variables=False) as g:
        g.watch(model.trainable_variables)
        model_output = model(img, training=True)
        loss = model.loss(label, model_output)
    gradients = g.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    t2 = time.time()

    info = tf.config.experimental.get_memory_info("GPU:0")
    tf.config.experimental.reset_memory_stats("GPU:0")
    time_consumed = t2 - t1
    return (info["peak"] / 1024 / 1024 / 1024, time_consumed)
def gpu_benchmark(gpu_limit,batch_size,input_shape,model):
    """
    Benchmark the memory consumption of a given model with a given batch size and input shape.

    The measured requirement is printed only when it is below `gpu_limit`.

    Args:
        gpu_limit (float): The maximum amount of GPU memory (in GB) that can be used.
        batch_size (int): The batch size for the input data.
        input_shape (tuple): The shape of the input data.
        model (tf.keras.Model): The model to benchmark.

    Returns:
        bool: True if an exception was raised during the benchmarking process,
            False otherwise.
    """
    try:
        memory_with_tf_overhead = get_mem_consumption(batch_size, input_shape, model)[0]
        if gpu_limit > memory_with_tf_overhead:
            print("[INFO] : Model memory requirement: {:.2f} GB".format(memory_with_tf_overhead))
    except Exception as e:
        # Report the cause instead of discarding it, so a failed benchmark
        # (e.g. out-of-memory) can be told apart from a programming error.
        print(f"[WARN] : GPU benchmark failed: {e}")
        return True
    return False