from urllib.parse import urlparse

import gradio as gr
import torch
from accelerate.commands.estimate import check_has_model, create_empty_model
from accelerate.utils import calculate_maximum_sizes, convert_bytes
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from parallelism_utils import (
    estimate_zero1_model_states_mem_needs,
    estimate_zero2_model_states_mem_needs,
    estimate_zero3_model_states_mem_needs,
)

DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
PRECISION = {"Mixed precision": "mixed", "Single precision": "single"}
DTYPE = {"float32": "float32", "float16/bfloat16": "float16"}


def extract_from_url(name: str):
    "Checks if `name` is a URL, and if so converts it to a model name"
    is_url = False
    try:
        result = urlparse(name)
        is_url = all([result.scheme, result.netloc])
    except Exception:
        is_url = False

    if not is_url:
        return name
    # For a URL, the repo id is the path minus its leading slash
    return result.path[1:]
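
# Example behaviour (illustrative values): a full Hub URL such as
# "https://huggingface.co/mistralai/Mistral-7B-v0.1" reduces to the repo id
# "mistralai/Mistral-7B-v0.1", while a plain repo id passes through unchanged.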


def translate_llama(text):
    "Translates a Llama-2 or CodeLlama model name to its `-hf` counterpart"
    if not text.endswith("-hf"):
        return text + "-hf"
    return text


def get_model(model_name: str, library: str, access_token: str):
    "Finds and grabs model from the Hub, and initializes on `meta`"
    if "meta-llama/Llama-2-" in model_name or "meta-llama/CodeLlama-" in model_name:
        model_name = translate_llama(model_name)
    if library == "auto":
        library = None
    model_name = extract_from_url(model_name)
    access_token = access_token or None
    try:
        model = create_empty_model(
            model_name, library_name=library, trust_remote_code=True, access_token=access_token
        )
    except GatedRepoError:
        raise gr.Error(
            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here: https://huggingface.co/settings/tokens."
        )
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(
            f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)."
        )
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(
                f"Tried to load `{model_name}` with `{library}` but a loadable model was not found inside the repo."
            )
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    except ImportError:
        # Loading the custom modeling code failed, so retry without `trust_remote_code`
        model = create_empty_model(
            model_name, library_name=library, trust_remote_code=False, access_token=access_token
        )
    except Exception as e:
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    return model


def calculate_memory(model: torch.nn.Module, options: dict):
    "Calculates the memory usage for a model init on `meta` device"
    total_size, largest_layer = calculate_maximum_sizes(model)
    total_params = model.num_parameters()

    data = []
    for dtype in options["precision"]:
        dtype_total_size = total_size
        dtype_largest_layer = largest_layer[0]

        # Scale the measured sizes down by the selected dtype's divisor
        modifier = DTYPE_MODIFIER[dtype]
        dtype_total_size /= modifier
        dtype_largest_layer /= modifier

        dtype_largest_layer = convert_bytes(dtype_largest_layer)

        precision = PRECISION[options["training_regime"]]
        model_dtype = DTYPE[dtype]

        if options["zero_stage"] == 0:
            # No ZeRO partitioning: use a simple 4x-model-size heuristic on every device
            cpu_mem = dtype_total_size * 4
            gpu_mem = cpu_mem
        elif options["zero_stage"] == 1:
            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(
                total_params, options["num_gpus_per_node"], options["num_nodes"],
                options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype,
            )
        elif options["zero_stage"] == 2:
            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(
                total_params, options["num_gpus_per_node"], options["num_nodes"],
                options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype,
            )
        elif options["zero_stage"] == 3:
            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(
                total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"],
                options["cpu_offload"], options["cpu_offload_params"], options["zero_init"],
                options["additional_buffer_factor"], precision, model_dtype,
            )
        data.append(
            {
                "Model dtype": dtype,
                "Largest Layer or Residual Group": dtype_largest_layer,
                "Model Size": convert_bytes(dtype_total_size),
                "per CPU": convert_bytes(cpu_mem),
                "per GPU": convert_bytes(gpu_mem),
            }
        )

    return data
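

# A minimal usage sketch (illustrative, not part of the original app's Gradio wiring).
# The model name, GPU/node counts, and buffer factor below are placeholder assumptions;
# the `options` keys mirror the ones read by `calculate_memory` above.
if __name__ == "__main__":
    demo_model = get_model("bert-base-cased", library="transformers", access_token="")
    demo_options = {
        "precision": ["float32", "float16/bfloat16"],
        "training_regime": "Mixed precision",
        "zero_stage": 3,
        "num_gpus_per_node": 8,
        "num_nodes": 1,
        "cpu_offload": True,
        "cpu_offload_params": True,
        "zero_init": True,
        "additional_buffer_factor": 1.5,
    }
    for row in calculate_memory(demo_model, demo_options):
        print(row)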