from dataclasses import dataclass
from typing import Optional, Tuple

import paddle
from paddle import nn

from paddlenlp.transformers import RobertaConfig as XLMRobertaConfig
from paddlenlp.transformers import RobertaModel as XLMRobertaModel
from paddlenlp.transformers import RobertaPretrainedModel
from paddlenlp.transformers.model_outputs import ModelOutput


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
    symbols are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (paddle.Tensor): Tensor of token ids in which `padding_idx` marks the padding positions.
        padding_idx (int): Id of the padding token.
        past_key_values_length (int, optional): Length of previously cached key/value states. Defaults to 0.

    Returns:
        paddle.Tensor: Position ids with the same shape as `input_ids`.
    """
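    # Non-padding tokens get consecutive positions via a cumulative sum over the padding mask;
    # padding tokens are zeroed out by the mask and then mapped back to padding_idx.
    # Illustrative example: input_ids [[0, 31414, 232, 2, 1, 1]] with padding_idx=1
    # yields position ids [[2, 3, 4, 5, 1, 1]].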
    mask = (input_ids != padding_idx).cast("int64")
    incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask
    return incremental_indices + padding_idx


@dataclass
class TransformationModelOutput(ModelOutput):
    """
    Base class for text model outputs that also contains a projection of the last hidden states.

    Args:
        projection_state (`paddle.Tensor` of shape `(batch_size, sequence_length, project_dim)`, *optional*):
            The text embeddings obtained by applying the projection layer to the last hidden state.
        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    projection_state: Optional[paddle.Tensor] = None
    last_hidden_state: paddle.Tensor = None
    hidden_states: Optional[Tuple[paddle.Tensor]] = None
    attentions: Optional[Tuple[paddle.Tensor]] = None


class RobertaSeriesConfig(XLMRobertaConfig):
    model_type = "roberta"

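    # Extra fields on top of the base RoBERTa config: `project_dim` sets the output width of the
    # projection head. `pooler_fn`, `learn_encoder` and `use_attention_mask` are not read elsewhere
    # in this module; presumably they are kept so that checkpoints carrying these keys load cleanly
    # (an assumption, not stated in the original source).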
    def __init__(
        self,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        project_dim=512,
        pooler_fn="cls",
        learn_encoder=False,
        use_attention_mask=True,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.project_dim = project_dim
        self.pooler_fn = pooler_fn
        self.learn_encoder = learn_encoder
        self.use_attention_mask = use_attention_mask


class RobertaSeriesModelWithTransformation(RobertaPretrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
    base_model_prefix = "roberta"
    config_class = RobertaSeriesConfig

    def __init__(self, config: RobertaSeriesConfig):
        super().__init__(config)
        self.roberta = XLMRobertaModel(config)
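        # Linear head that maps the encoder's hidden states (hidden_size) to project_dim.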
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.apply(self.init_weights)

    def forward(
        self,
        input_ids: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        token_type_ids: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if position_ids is None:
            position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
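        # Run the underlying RoBERTa encoder (`self.base_model` resolves to `self.roberta`
        # through `base_model_prefix`).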
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

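        # Project every token's final hidden state into the project_dim space.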
        projection_state = self.transformation(outputs.last_hidden_state)

        return TransformationModelOutput(
            projection_state=projection_state,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
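

# Minimal usage sketch (illustrative only, kept as comments so the module stays import-safe).
# The checkpoint name below is a placeholder, and the tokenizer choice is an assumption: any
# XLM-RoBERTa-style tokenizer whose vocabulary matches the loaded weights would work.
#
#   from paddlenlp.transformers import XLMRobertaTokenizer
#
#   tokenizer = XLMRobertaTokenizer.from_pretrained("some/checkpoint")  # placeholder name
#   model = RobertaSeriesModelWithTransformation.from_pretrained("some/checkpoint")
#   inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd")
#   outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
#   print(outputs.projection_state.shape)  # [batch_size, seq_len, config.project_dim]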
|