from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class HCXVisionConfig(PretrainedConfig):
    """Configuration class for the HyperCLOVA X vision-language model.

    Holds the language and vision sub-configurations together with the
    image-grid and visual-projector options used by the multimodal model.
    """

    model_type = "hyperclovax_vlm"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Mapping from GPT-2 style attribute names to the standard Hugging Face
    # names; keys found in `language_config` are renamed accordingly.
    language_config_attribute_map = {
        "n_embd": "hidden_size",
        "n_positions": "max_position_embeddings",
        "n_head": "num_attention_heads",
        "n_layer": "num_hidden_layers",
    }

    def __init__(
        self,
        language_config=None,
        vision_config=None,
        use_nth_layer=-2,
        img_start_id=100009,
        decoder_max_length=4096,
        anyres=False,
        unpad=False,
        max_num_grids=-1,
        num_queries_vis_abstractor=-1,
        ignore_index=-100,
        proj_pos_emb=True,
        proj_prenorm=False,
        use_1x1_grid=False,
        **kwargs,
    ):
        # Rename GPT-2 style keys (e.g. "n_embd") in the language sub-config
        # to their standard Hugging Face equivalents (e.g. "hidden_size").
        for key, val in self.language_config_attribute_map.items():
            if language_config is not None and key in language_config:
                language_config[val] = language_config.pop(key)

        self.language_config = language_config
        self.vision_config = vision_config

        if language_config is not None:
            # "n_embd" was renamed to "hidden_size" above, so the first branch
            # is the one normally taken; the "n_embd" fallback is kept only
            # defensively.
            self.hidden_size = (
                language_config["hidden_size"] if "hidden_size" in language_config else language_config["n_embd"]
            )

        self.use_nth_layer = use_nth_layer
        self.decoder_max_length = decoder_max_length
        self.anyres = anyres
        self.unpad = unpad
        self.max_num_grids = max_num_grids
        self.num_queries_vis_abstractor = num_queries_vis_abstractor
        self.img_start_id = img_start_id
        self.ignore_index = ignore_index
        self.proj_pos_emb = proj_pos_emb
        self.proj_prenorm = proj_prenorm
        self.use_1x1_grid = use_1x1_grid
        super().__init__(**kwargs)
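

if __name__ == "__main__":
    # Minimal usage sketch. The hyperparameter values below are illustrative
    # placeholders, not those of a released checkpoint; it demonstrates that
    # GPT-2 style keys passed in `language_config` are renamed to standard
    # Hugging Face names before being stored on the config.
    config = HCXVisionConfig(
        language_config={"n_embd": 4096, "n_layer": 32, "n_head": 32},
        vision_config={"hidden_size": 1024},
    )
    print(config.hidden_size)  # 4096, read from the remapped language config
    print("n_embd" in config.language_config)  # False: renamed to "hidden_size"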