import time

import numpy as np
import tensorflow as tf


def set_gpu_memory_limit(gigabytes):
    """
    Sets the upper memory limit for the first GPU to the specified number of gigabytes.

    Args:
        gigabytes (int): The number of gigabytes to set as the upper memory limit.

    Raises:
        RuntimeError: If virtual devices are configured after the GPUs have been initialized.

    Returns:
        None
    """
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # memory_limit is expressed in MB, hence the GB-to-MB conversion.
            tf.config.set_logical_device_configuration(
                gpus[0],
                [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * gigabytes)])
            logical_gpus = tf.config.list_logical_devices('GPU')
            print("{} physical GPUs, {} logical GPUs".format(len(gpus), len(logical_gpus)))
            print("[INFO] : Setting upper memory limit to {}GBytes on gpu[0]".format(gigabytes))
        except RuntimeError:
            # set_logical_device_configuration() raises RuntimeError if the GPU
            # has already been initialized by an earlier TensorFlow operation.
            raise RuntimeError("\nVirtual devices must be set before GPUs are initialized.")
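# A minimal usage sketch (the 4 GB cap is illustrative): call this once at
# start-up, before any TensorFlow operation initializes the GPU.
#
#   set_gpu_memory_limit(4)              # cap gpu[0] at 4 GB
#   model = tf.keras.Sequential([...])   # build models only after the cap is set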


def inc_gpu_mode() -> None:
    """
    Enables memory growth so that GPU memory is allocated incrementally,
    as needed, instead of being reserved all at once.

    Returns:
        None
    """
    physical_gpus = tf.config.list_physical_devices('GPU')
    if not physical_gpus:
        return

    try:
        # Memory growth must be set consistently across all visible GPUs.
        for gpu in physical_gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # set_memory_growth() raises RuntimeError once the GPUs are initialized.
        print(f"Error setting memory growth: {e}")


def check_training_determinism(model: tf.keras.Model, sample_ds: tf.data.Dataset):
    """
    Checks whether any operation raises an exception during one training step,
    by running a forward and backward pass on a single sample batch.

    Args:
        model (tf.keras.Model): A compiled Keras model.
        sample_ds (tf.data.Dataset): A dataset yielding (inputs, labels) batches.

    Returns:
        valid_training (bool): True if the training step raises no exception.
    """
    valid_training = True
    x_sample, y_sample = next(iter(sample_ds))

    try:
        # Run one forward pass under the tape, then compute the gradients,
        # mirroring what a real training step does.
        with tf.GradientTape() as g:
            y = model(x_sample, training=True)
            loss = model.loss(y_sample, y)
        _ = g.gradient(loss, model.trainable_variables)
    except Exception as error:
        print(f"[WARN] {error}")
        valid_training = False

    return valid_training
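# A minimal usage sketch; the toy model and dataset below are illustrative
# only. Note that the model must be compiled with a callable loss, since the
# function invokes model.loss directly.
#
#   model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
#   model.compile(optimizer="adam", loss=tf.keras.losses.MeanSquaredError())
#   ds = tf.data.Dataset.from_tensor_slices(
#       (np.random.rand(32, 8).astype(np.float32),
#        np.random.rand(32, 10).astype(np.float32))).batch(8)
#   if not check_training_determinism(model, ds):
#       print("[WARN] training would fail with this configuration")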


def get_mem_consumption(batchsize, input_shape, model):
    """
    Measures the peak GPU memory consumption and the time taken by one
    training step for a given batch size, input shape, and model.

    Args:
        batchsize (int): The batch size for the input data.
        input_shape (tuple): The shape of a single input sample, without the batch dimension.
        model (tf.keras.Model): The compiled model to measure.

    Returns:
        tuple: The peak memory consumption (in GB) and the time consumed (in seconds).
    """
    # Build a random batch of inputs and labels matching the model's shapes.
    # The batch dimension of model.output.shape is None, so it is replaced
    # with the actual batch size.
    img = np.random.rand(batchsize, *input_shape).astype(np.float32)
    label = np.random.rand(batchsize, *model.output.shape[1:]).astype(np.float32)
    tf.config.experimental.reset_memory_stats("GPU:0")
    t1 = time.time()
    # Run one full training step: forward pass, loss, gradients, optimizer update.
    with tf.GradientTape(watch_accessed_variables=False) as g:
        g.watch(model.trainable_variables)
        model_output = model(img, training=True)
        loss = model.loss(label, model_output)
    gradients = g.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    t2 = time.time()
    info = tf.config.experimental.get_memory_info("GPU:0")
    tf.config.experimental.reset_memory_stats("GPU:0")
    time_consumed = t2 - t1
    return (info["peak"] / 1024 / 1024 / 1024, time_consumed)
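# A minimal usage sketch (assumes a compiled model with a callable loss and an
# optimizer, and a visible "GPU:0"; the shapes are illustrative):
#
#   peak_gb, step_time = get_mem_consumption(32, (224, 224, 3), model)
#   print("peak: {:.2f} GB, step time: {:.3f} s".format(peak_gb, step_time))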


def gpu_benchmark(gpu_limit, batch_size, input_shape, model):
    """
    Benchmarks the memory consumption of a given model with a given batch size
    and input shape.

    Args:
        gpu_limit (float): The maximum amount of GPU memory (in GB) that can be used.
        batch_size (int): The batch size for the input data.
        input_shape (tuple): The shape of a single input sample, without the batch dimension.
        model (tf.keras.Model): The model to benchmark.

    Returns:
        bool: True if an exception was raised during the benchmarking process, False otherwise.
    """
    raised_exception = False
    try:
        # Peak usage reported by get_mem_consumption() includes the TensorFlow overhead.
        memory_with_tf_overhead, _ = get_mem_consumption(batch_size, input_shape, model)
        if gpu_limit > memory_with_tf_overhead:
            print("[INFO] : Model memory requirement: {:.2f} GB".format(memory_with_tf_overhead))
    except Exception as e:
        print(f"[WARN] {e}")
        raised_exception = True
    return raised_exception
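# A minimal usage sketch (the 4 GB limit and shapes are illustrative; gpu_limit
# should match the cap passed to set_gpu_memory_limit at start-up):
#
#   failed = gpu_benchmark(gpu_limit=4, batch_size=32,
#                          input_shape=(224, 224, 3), model=model)
#   if failed:
#       print("[WARN] batch size 32 does not fit on this GPU")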