import time

import numpy as np
import tensorflow as tf


def set_gpu_memory_limit(gigabytes):
    """
    Sets the upper memory limit for the first GPU to the specified number of gigabytes.

    Args:
        gigabytes (int): The number of gigabytes to set as the upper memory limit.

    Raises:
        RuntimeError: If virtual devices are configured after the GPUs have been initialized.

    Returns:
        None
    """
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # memory_limit is expressed in MB, hence the GB-to-MB conversion.
            tf.config.set_logical_device_configuration(
                gpus[0],
                [tf.config.LogicalDeviceConfiguration(memory_limit=1024 * gigabytes)])
            logical_gpus = tf.config.list_logical_devices('GPU')
            print("{} physical GPUs, {} logical GPUs".format(len(gpus), len(logical_gpus)))
            print("[INFO] : Setting upper memory limit to {}GBytes on gpu[0]".format(gigabytes))
        except RuntimeError:
            # set_logical_device_configuration() raises RuntimeError if the GPU
            # has already been initialized by an earlier TensorFlow operation.
            raise RuntimeError("\nVirtual devices must be set before GPUs are initialized.")
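# A minimal usage sketch (the 4 GB cap is illustrative): call this once at
# start-up, before any TensorFlow operation initializes the GPU.
#
#   set_gpu_memory_limit(4)              # cap gpu[0] at 4 GB
#   model = tf.keras.Sequential([...])   # build models only after the cap is set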


def inc_gpu_mode() -> None:
    """
    Enables memory growth so that GPU memory is allocated incrementally,
    as needed, instead of being reserved all at once.

    Returns:
        None
    """
    physical_gpus = tf.config.list_physical_devices('GPU')
    if not physical_gpus:
        return

    try:
        # Memory growth must be set consistently across all visible GPUs.
        for gpu in physical_gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # set_memory_growth() raises RuntimeError once the GPUs are initialized.
        print(f"Error setting memory growth: {e}")


def check_training_determinism(model: tf.keras.Model, sample_ds: tf.data.Dataset):
    """
    Checks whether any operation raises an exception during one training step,
    by running a forward and backward pass on a single sample batch.

    Args:
        model (tf.keras.Model): A compiled Keras model.
        sample_ds (tf.data.Dataset): A dataset yielding (inputs, labels) batches.

    Returns:
        valid_training (bool): True if the training step raises no exception.
    """
    valid_training = True
    x_sample, y_sample = next(iter(sample_ds))

    try:
        # Run one forward pass under the tape, then compute the gradients,
        # mirroring what a real training step does.
        with tf.GradientTape() as g:
            y = model(x_sample, training=True)
            loss = model.loss(y_sample, y)
        _ = g.gradient(loss, model.trainable_variables)
    except Exception as error:
        print(f"[WARN] {error}")
        valid_training = False

    return valid_training
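# A minimal usage sketch; the toy model and dataset below are illustrative
# only. Note that the model must be compiled with a callable loss, since the
# function invokes model.loss directly.
#
#   model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
#   model.compile(optimizer="adam", loss=tf.keras.losses.MeanSquaredError())
#   ds = tf.data.Dataset.from_tensor_slices(
#       (np.random.rand(32, 8).astype(np.float32),
#        np.random.rand(32, 10).astype(np.float32))).batch(8)
#   if not check_training_determinism(model, ds):
#       print("[WARN] training would fail with this configuration")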


def get_mem_consumption(batchsize, input_shape, model):
    """
    Measures the peak GPU memory consumption and the time taken by one
    training step for a given batch size, input shape, and model.

    Args:
        batchsize (int): The batch size for the input data.
        input_shape (tuple): The shape of a single input sample, without the batch dimension.
        model (tf.keras.Model): The compiled model to measure.

    Returns:
        tuple: The peak memory consumption (in GB) and the time consumed (in seconds).
    """
    # Build a random batch of inputs and labels matching the model's shapes.
    # The batch dimension of model.output.shape is None, so it is replaced
    # with the actual batch size.
    img = np.random.rand(batchsize, *input_shape).astype(np.float32)
    label = np.random.rand(batchsize, *model.output.shape[1:]).astype(np.float32)
    tf.config.experimental.reset_memory_stats("GPU:0")
    t1 = time.time()
    # Run one full training step: forward pass, loss, gradients, optimizer update.
    with tf.GradientTape(watch_accessed_variables=False) as g:
        g.watch(model.trainable_variables)
        model_output = model(img, training=True)
        loss = model.loss(label, model_output)
    gradients = g.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    t2 = time.time()
    info = tf.config.experimental.get_memory_info("GPU:0")
    tf.config.experimental.reset_memory_stats("GPU:0")
    time_consumed = t2 - t1
    return (info["peak"] / 1024 / 1024 / 1024, time_consumed)
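# A minimal usage sketch (assumes a compiled model with a callable loss and an
# optimizer, and a visible "GPU:0"; the shapes are illustrative):
#
#   peak_gb, step_time = get_mem_consumption(32, (224, 224, 3), model)
#   print("peak: {:.2f} GB, step time: {:.3f} s".format(peak_gb, step_time))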


def gpu_benchmark(gpu_limit, batch_size, input_shape, model):
    """
    Benchmarks the memory consumption of a given model with a given batch size
    and input shape.

    Args:
        gpu_limit (float): The maximum amount of GPU memory (in GB) that can be used.
        batch_size (int): The batch size for the input data.
        input_shape (tuple): The shape of a single input sample, without the batch dimension.
        model (tf.keras.Model): The model to benchmark.

    Returns:
        bool: True if an exception was raised during the benchmarking process, False otherwise.
    """
    raised_exception = False
    try:
        # Peak usage reported by get_mem_consumption() includes the TensorFlow overhead.
        memory_with_tf_overhead, _ = get_mem_consumption(batch_size, input_shape, model)
        if gpu_limit > memory_with_tf_overhead:
            print("[INFO] : Model memory requirement: {:.2f} GB".format(memory_with_tf_overhead))
    except Exception as e:
        print(f"[WARN] {e}")
        raised_exception = True
    return raised_exception
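# A minimal usage sketch (the 4 GB limit and shapes are illustrative; gpu_limit
# should match the cap passed to set_gpu_memory_limit at start-up):
#
#   failed = gpu_benchmark(gpu_limit=4, batch_size=32,
#                          input_shape=(224, 224, 3), model=model)
#   if failed:
#       print("[WARN] batch size 32 does not fit on this GPU")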