from urllib.parse import urlparse

import gradio as gr
import torch
from accelerate.commands.estimate import check_has_model, create_empty_model
from accelerate.utils import calculate_maximum_sizes, convert_bytes
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from parallelism_utils import (
    estimate_zero1_model_states_mem_needs,
    estimate_zero2_model_states_mem_needs,
    estimate_zero3_model_states_mem_needs,
)

DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
PRECISION = {"Mixed precision": "mixed", "Single precision": "single"}
DTYPE = {"float32": "float32", "float16/bfloat16": "float16"}


def extract_from_url(name: str):
    "Checks if `name` is a URL, and if so converts it to a model name"
    is_url = False
    try:
        result = urlparse(name)
        is_url = all([result.scheme, result.netloc])
    except Exception:
        is_url = False

    if not is_url:
        return name
    # For a URL, the repo id is the path minus its leading slash
    return result.path[1:]
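
# Example behaviour (illustrative values): a full Hub URL such as
# "https://huggingface.co/mistralai/Mistral-7B-v0.1" reduces to the repo id
# "mistralai/Mistral-7B-v0.1", while a plain repo id passes through unchanged.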


def translate_llama(text):
    "Translates a Llama-2 or CodeLlama model name to its `-hf` counterpart"
    if not text.endswith("-hf"):
        return text + "-hf"
    return text


def get_model(model_name: str, library: str, access_token: str):
    "Finds and grabs model from the Hub, and initializes on `meta`"
    if "meta-llama/Llama-2-" in model_name or "meta-llama/CodeLlama-" in model_name:
        model_name = translate_llama(model_name)
    if library == "auto":
        library = None
    model_name = extract_from_url(model_name)
    access_token = access_token or None
    try:
        model = create_empty_model(
            model_name, library_name=library, trust_remote_code=True, access_token=access_token
        )
    except GatedRepoError:
        raise gr.Error(
            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here: https://huggingface.co/settings/tokens."
        )
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(
            f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)."
        )
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(
                f"Tried to load `{model_name}` with `{library}` but a loadable model was not found inside the repo."
            )
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    except ImportError:
        # Loading the custom modeling code failed, so retry without `trust_remote_code`
        model = create_empty_model(
            model_name, library_name=library, trust_remote_code=False, access_token=access_token
        )
    except Exception as e:
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    return model


def calculate_memory(model: torch.nn.Module, options: dict):
    "Calculates the memory usage for a model init on `meta` device"
    total_size, largest_layer = calculate_maximum_sizes(model)
    total_params = model.num_parameters()

    data = []
    for dtype in options["precision"]:
        dtype_total_size = total_size
        dtype_largest_layer = largest_layer[0]

        # Scale the measured sizes down by the selected dtype's divisor
        modifier = DTYPE_MODIFIER[dtype]
        dtype_total_size /= modifier
        dtype_largest_layer /= modifier

        dtype_largest_layer = convert_bytes(dtype_largest_layer)

        precision = PRECISION[options["training_regime"]]
        model_dtype = DTYPE[dtype]

        if options["zero_stage"] == 0:
            # No ZeRO partitioning: use a simple 4x-model-size heuristic on every device
            cpu_mem = dtype_total_size * 4
            gpu_mem = cpu_mem
        elif options["zero_stage"] == 1:
            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(
                total_params, options["num_gpus_per_node"], options["num_nodes"],
                options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype,
            )
        elif options["zero_stage"] == 2:
            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(
                total_params, options["num_gpus_per_node"], options["num_nodes"],
                options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype,
            )
        elif options["zero_stage"] == 3:
            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(
                total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"],
                options["cpu_offload"], options["cpu_offload_params"], options["zero_init"],
                options["additional_buffer_factor"], precision, model_dtype,
            )
        data.append(
            {
                "Model dtype": dtype,
                "Largest Layer or Residual Group": dtype_largest_layer,
                "Model Size": convert_bytes(dtype_total_size),
                "per CPU": convert_bytes(cpu_mem),
                "per GPU": convert_bytes(gpu_mem),
            }
        )

    return data
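

# A minimal usage sketch (illustrative, not part of the original app's Gradio wiring).
# The model name, GPU/node counts, and buffer factor below are placeholder assumptions;
# the `options` keys mirror the ones read by `calculate_memory` above.
if __name__ == "__main__":
    demo_model = get_model("bert-base-cased", library="transformers", access_token="")
    demo_options = {
        "precision": ["float32", "float16/bfloat16"],
        "training_regime": "Mixed precision",
        "zero_stage": 3,
        "num_gpus_per_node": 8,
        "num_nodes": 1,
        "cpu_offload": True,
        "cpu_offload_params": True,
        "zero_init": True,
        "additional_buffer_factor": 1.5,
    }
    for row in calculate_memory(demo_model, demo_options):
        print(row)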