from dataclasses import dataclass
from typing import ClassVar, Optional, Union

import torch
from torch import nn

from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_new_task_model import NewTaskModelConfig


_CONFIG_FOR_DOC = "NewTaskModelConfig"

@dataclass
class NewTaskModelCausalLMOutputWithPast(ModelOutput):
    """
    Base class for NewTaskModel causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class NewTaskModelMultiModalProjector(nn.Module):
    def __init__(self, config: NewTaskModelConfig):
        super().__init__()
        self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True)

    def forward(self, image_features):
        hidden_states = self.linear(image_features)

        return hidden_states
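
# A minimal shape sketch for the projector above (values are illustrative only; the real sizes come from
# `config.vision_config.hidden_size` and `config.vision_config.projection_dim`):
#
#   projector = nn.Linear(1152, 2048, bias=True)     # what NewTaskModelMultiModalProjector wraps
#   features = torch.randn(1, 256, 1152)             # (batch_size, num_patches, vision_hidden_size)
#   print(projector(features).shape)                 # torch.Size([1, 256, 2048])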


NEW_TASK_MODEL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`NewTaskModelConfig`] or [`NewTaskModelVisionConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare NewTaskModel Model outputting raw hidden-states without any specific head on top.",
    NEW_TASK_MODEL_START_DOCSTRING,
)
class NewTaskModelPreTrainedModel(PreTrainedModel):
    config_class = NewTaskModelConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["NewTaskModelMultiModalProjector"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
|
| | def _init_weights(self, module): |
| | |
| | |
| | std = ( |
| | self.config.initializer_range |
| | if hasattr(self.config, "initializer_range") |
| | else self.config.text_config.initializer_range |
| | ) |
| |
|
| | if hasattr(module, "class_embedding"): |
| | module.class_embedding.data.normal_(mean=0.0, std=std) |
| |
|
| | if isinstance(module, (nn.Linear, nn.Conv2d)): |
| | module.weight.data.normal_(mean=0.0, std=std) |
| | if module.bias is not None: |
| | module.bias.data.zero_() |
| | elif isinstance(module, nn.Embedding): |
| | module.weight.data.normal_(mean=0.0, std=std) |
| | if module.padding_idx is not None: |
| | module.weight.data[module.padding_idx].zero_() |
| |
|
| |
|
| | NEW_TASK_MODEL_INPUTS_DOCSTRING = r""" |
| | Args: |
| | input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): |
| | Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide |
| | it. |
| | |
| | Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and |
| | [`PreTrainedTokenizer.__call__`] for details. |
| | |
| | [What are input IDs?](../glossary#input-ids) |
| | pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): |
| | The tensors corresponding to the input images. Pixel values can be obtained using |
| | [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([]`NewTaskModelProcessor`] uses |
| | [`SiglipImageProcessor`] for processing images). |
| | attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): |
| | Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: |
| | |
| | - 1 for tokens that are **not masked**, |
| | - 0 for tokens that are **masked**. |
| | |
| | [What are attention masks?](../glossary#attention-mask) |
| | |
| | Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and |
| | [`PreTrainedTokenizer.__call__`] for details. |
| | |
| | If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see |
| | `past_key_values`). |
| | |
| | If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] |
| | and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more |
| | information on the default strategy. |
| | |
| | - 1 indicates the head is **not masked**, |
| | - 0 indicates the head is **masked**. |
| | position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): |
| | Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, |
| | config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) |
| | past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): |
| | Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape |
| | `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape |
| | `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. |
| | |
| | Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention |
| | blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. |
| | |
| | If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that |
| | don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all |
| | `decoder_input_ids` of shape `(batch_size, sequence_length)`. |
| | inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): |
| | Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This |
| | is useful if you want more control over how to convert `input_ids` indices into associated vectors than the |
| | model's internal embedding lookup matrix. |
| | use_cache (`bool`, *optional*): |
| | If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see |
| | `past_key_values`). |
| | output_attentions (`bool`, *optional*): |
| | Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned |
| | tensors for more detail. |
| | output_hidden_states (`bool`, *optional*): |
| | Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for |
| | more detail. |
| | return_dict (`bool`, *optional*): |
| | Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. |
| | cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): |
| | Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, |
| | this tensor is not affected by padding. It is used to update the cache in the correct position and to infer |
| | the complete sequence length. |
| | """ |


@add_start_docstrings(
    """The NEW_TASK_MODEL model which consists of a vision backbone and a language model.""",
    NEW_TASK_MODEL_START_DOCSTRING,
)
class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
    main_input_name: ClassVar[str] = "doc_input_ids"

    def __init__(self, config):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config=config.vision_config)
        self.multi_modal_projector = NewTaskModelMultiModalProjector(config)
        self.vocab_size = config.text_config.vocab_size

        language_model = AutoModelForCausalLM.from_config(config=config.text_config)

        if language_model._tied_weights_keys is not None:
            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
        self.language_model = language_model

        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1

        self.embedding_dim = self.config.embedding_dim
        self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _update_causal_mask(
        self,
        attention_mask,
        token_type_ids,
        past_key_values,
        cache_position,
        input_tensor,
        is_training: bool = False,
    ):
        if self.config.text_config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        using_static_cache = isinstance(past_key_values, StaticCache)
        min_dtype = torch.finfo(self.dtype).min
        inputs_lead_dim, sequence_length = input_tensor.shape[:2]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        elif isinstance(past_key_values, HybridCache):
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else cache_position[0] + sequence_length + 1
            )

        if attention_mask is not None and attention_mask.dim() == 4:
            # A 4D mask was passed directly by the caller; assume it is already in its final (additive) form.
            return attention_mask

        causal_mask = torch.full(
            (sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device
        )
        # During training a standard causal (upper-triangular) mask is used; at inference the whole input block is
        # made attendable (bidirectional prefix).
        if sequence_length != 1:
            if is_training:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            else:
                causal_mask[:, :sequence_length] = 0.0

        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edits
            mask_length = attention_mask.shape[-1]

            # During training, unmask positions whose token_type_ids equal 0 (the image/prefix part of the prompt).
            if is_training:
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
                )

            # Then re-mask the padding positions indicated by the 2D attention mask.
            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                padding_mask, min_dtype
            )

        return causal_mask
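
    # Illustrative example of the mask built above (schematic values: 0 = attend, -inf = torch.finfo(dtype).min).
    # For a prefill step with sequence_length = 4, target_length = 4, cache_position = [0, 1, 2, 3] and no padding,
    # inference mode (is_training=False) yields a fully attendable prefix:
    #
    #     [[0, 0, 0, 0],
    #      [0, 0, 0, 0],
    #      [0, 0, 0, 0],
    #      [0, 0, 0, 0]]
    #
    # while training mode (is_training=True) keeps the causal pattern:
    #
    #     [[0, -inf, -inf, -inf],
    #      [0,    0, -inf, -inf],
    #      [0,    0,    0, -inf],
    #      [0,    0,    0,    0]]
    #
    # with prefix positions (token_type_ids == 0) additionally unmasked when a 2D attention_mask is passed.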

    def get_image_features(self, pixel_values: torch.FloatTensor):
        """
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        image_outputs = self.vision_tower(pixel_values)
        selected_image_feature = image_outputs.last_hidden_state
        image_features = self.multi_modal_projector(selected_image_feature)
        image_features = image_features / (self.config.text_config.hidden_size**0.5)
        return image_features
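
    # A minimal sketch of the flow in get_image_features (shapes are illustrative; the actual sizes come from
    # config.vision_config and config.text_config):
    #
    #   pixel_values:                          (batch_size, 3, image_size, image_size)
    #   vision_tower(...).last_hidden_state:   (batch_size, num_patches, vision_hidden_size)
    #   multi_modal_projector(...):            (batch_size, num_patches, projection_dim)
    #   final step divides by text_config.hidden_size ** 0.5 before the features are handed to the language model.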

    @add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        num_logits_to_keep: int = 0,
    ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
            config.text_config.vocab_size]`.

        num_logits_to_keep (`int`, *optional*):
            Compute logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all `input_ids`
            (special case). Only last token logits are needed for generation, and calculating them only for that token
            can save memory, which becomes pretty significant for long sequences or large vocabulary size.

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, NewTaskModelForNewTask

        >>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf")
        >>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf")

        >>> prompt = "answer en Where is the cow standing?"
        >>> url = "https://huggingface.co/gv-hf/NewTaskModel-test-224px-hf/resolve/main/cow_beach_1.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_length=30)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "answer en Where is the cow standing?\nbeach"
        ```
        """
        vlm_outputs = super().forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            token_type_ids=token_type_ids,
            cache_position=cache_position,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=True,
            num_logits_to_keep=num_logits_to_keep,
        )
        last_hidden_states = vlm_outputs.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
        proj = self.custom_text_proj(last_hidden_states)  # (batch_size, sequence_length, embedding_dim)

        # L2 normalization
        embeddings = proj / proj.norm(dim=-1, keepdim=True)

        # Zero out the embeddings of padding tokens
        embeddings = embeddings * attention_mask.unsqueeze(-1)

        return (embeddings,) + vlm_outputs

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        pixel_values=None,
        attention_mask=None,
        token_type_ids=None,
        use_cache=True,
        logits_to_keep=None,
        labels=None,
        **kwargs,
    ):
        # Delegate the generic cache/position handling to the language model, then adjust the
        # multimodal-specific inputs below.
        model_inputs = self.language_model.prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cache_position=cache_position,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # Position ids in this model are 1-indexed, so shift the ones prepared by the language model.
        if model_inputs.get("position_ids") is not None:
            model_inputs["position_ids"] += 1

        # Pixel values are only needed on the first (prefill) step; afterwards the image tokens live in the cache.
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values
        is_training = token_type_ids is not None and labels is not None
        if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
            input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
            causal_mask = self._update_causal_mask(
                attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training
            )
            model_inputs["attention_mask"] = causal_mask

        return model_inputs

    def resize_token_embeddings(
        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, mean_resizing=True
    ) -> nn.Embedding:
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)

        # Keep the config and the cached vocab size in sync with the resized embedding matrix
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings

        return model_embeds
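

# A minimal usage sketch for the embedding output of NewTaskModelForNewTask.forward (the checkpoint name is the
# placeholder used in the docstring example above; `forward` returns the L2-normalized per-token embeddings as the
# first element of the output tuple, with shape (batch_size, sequence_length, config.embedding_dim)):
#
#   from transformers import AutoProcessor
#   processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf")
#   model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf")
#   inputs = processor(images=image, text="answer en Where is the cow standing?", return_tensors="pt")
#   embeddings = model(**inputs)[0]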