Remove legacy remote-code files (now provided natively by transformers >= 5.9.0)

#13
added_tokens.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "<EMAIL>": 110521,
3
- "<KEY>": 110522,
4
- "<NAME>": 110520,
5
- "<PASSWORD>": 110523,
6
- "<code_to_intermediate>": 110502,
7
- "<empty_output>": 110501,
8
- "<file_sep>": 110492,
9
- "<intermediate_to_code>": 110503,
10
- "<issue_closed>": 110495,
11
- "<issue_comment>": 110494,
12
- "<issue_start>": 110493,
13
- "<jupyter_code>": 110498,
14
- "<jupyter_output>": 110499,
15
- "<jupyter_script>": 110500,
16
- "<jupyter_start>": 110496,
17
- "<jupyter_text>": 110497,
18
- "<pr>": 110504,
19
- "<pr_base>": 110507,
20
- "<pr_base_code>": 110509,
21
- "<pr_comment>": 110512,
22
- "<pr_diff>": 110510,
23
- "<pr_diff_hunk>": 110511,
24
- "<pr_diff_hunk_comment_line>": 110519,
25
- "<pr_event_id>": 110513,
26
- "<pr_file>": 110508,
27
- "<pr_in_reply_to_comment_id>": 110518,
28
- "<pr_in_reply_to_review_id>": 110517,
29
- "<pr_is_merged>": 110506,
30
- "<pr_review>": 110514,
31
- "<pr_review_comment>": 110516,
32
- "<pr_review_state>": 110515,
33
- "<pr_status>": 110505,
34
- "<repo_name>": 110491
35
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -29,7 +29,6 @@
29
  "num_hidden_layers": 38,
30
  "num_key_value_heads": 8,
31
  "pad_token_id": 100257,
32
- "pretraining_tp": 1,
33
  "resid_pdrop": 0.0,
34
  "residual_multiplier": 1.0,
35
  "rms_norm_eps": 1e-05,
 
29
  "num_hidden_layers": 38,
30
  "num_key_value_heads": 8,
31
  "pad_token_id": 100257,
 
32
  "resid_pdrop": 0.0,
33
  "residual_multiplier": 1.0,
34
  "rms_norm_eps": 1e-05,
configuration_hyperclovax.py DELETED
@@ -1,235 +0,0 @@
1
- # coding=utf-8
2
- # This file was created for the HyperCLOVA X SEED 14B Think architecture.
3
- # partially copied and modified from https://github.com/huggingface/transformers
4
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
5
- #
6
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
7
- # and OPT implementations in this library. It has been modified from its
8
- # original forms to accommodate minor architectural differences compared
9
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
10
- #
11
- # Licensed under the Apache License, Version 2.0 (the "License");
12
- # you may not use this file except in compliance with the License.
13
- # You may obtain a copy of the License at
14
- #
15
- # http://www.apache.org/licenses/LICENSE-2.0
16
- #
17
- # Unless required by applicable law or agreed to in writing, software
18
- # distributed under the License is distributed on an "AS IS" BASIS,
19
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
- # See the License for the specific language governing permissions and
21
- # limitations under the License.
22
- """HyperCLOVAX model configuration"""
23
-
24
- from transformers.configuration_utils import PretrainedConfig
25
-
26
- class HyperCLOVAXConfig(PretrainedConfig):
27
- r"""
28
- This is the configuration class to store the configuration of a [`HyperCLOVAXModel`]. It is used to instantiate an HyperCLOVAX
29
- model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
30
- defaults will yield a similar configuration to that of the HyperCLOVAX.
31
-
32
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
- documentation from [`PretrainedConfig`] for more information.
34
-
35
-
36
- Args:
37
- vocab_size (`int`, *optional*, defaults to 32000):
38
- Vocabulary size of the HyperCLOVAX model. Defines the number of different tokens that can be represented by the
39
- `input_ids` passed when calling [`HyperCLOVAXModel`]
40
- hidden_size (`int`, *optional*, defaults to 4096):
41
- Dimension of the hidden representations.
42
- intermediate_size (`int`, *optional*, defaults to 11008):
43
- Dimension of the MLP representations.
44
- num_hidden_layers (`int`, *optional*, defaults to 32):
45
- Number of hidden layers in the Transformer decoder.
46
- num_attention_heads (`int`, *optional*, defaults to 32):
47
- Number of attention heads for each attention layer in the Transformer decoder.
48
- num_key_value_heads (`int`, *optional*):
49
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
50
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
51
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
52
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
53
- by meanpooling all the original heads within that group. For more details checkout [this
54
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
55
- `num_attention_heads`.
56
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
57
- The non-linear activation function (function or string) in the decoder.
58
- max_position_embeddings (`int`, *optional*, defaults to 2048):
59
- The maximum sequence length that this model might ever be used with.
60
- initializer_range (`float`, *optional*, defaults to 0.02):
61
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
62
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
63
- The epsilon used by the rms normalization layers.
64
- use_cache (`bool`, *optional*, defaults to `True`):
65
- Whether or not the model should return the last key/values attentions (not used by all models). Only
66
- relevant if `config.is_decoder=True`.
67
- pad_token_id (`int`, *optional*):
68
- Padding token id.
69
- bos_token_id (`int`, *optional*, defaults to 1):
70
- Beginning of stream token id.
71
- eos_token_id (`int`, *optional*, defaults to 2):
72
- End of stream token id.
73
- pretraining_tp (`int`, *optional*, defaults to 1):
74
- Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
75
- document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
76
- understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
77
- results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
78
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
79
- Whether to tie weight embeddings
80
- rope_theta (`float`, *optional*, defaults to 10000.0):
81
- The base period of the RoPE embeddings.
82
- rope_scaling (`Dict`, *optional*):
83
- Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
84
- and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
85
- accordingly.
86
- Expected contents:
87
- `rope_type` (`str`):
88
- The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
89
- 'llama3'], with 'default' being the original RoPE implementation.
90
- `factor` (`float`, *optional*):
91
- Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
92
- most scaling types, a `factor` of x will enable the model to handle sequences of length x *
93
- original maximum pre-trained length.
94
- `original_max_position_embeddings` (`int`, *optional*):
95
- Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
96
- pretraining.
97
- `attention_factor` (`float`, *optional*):
98
- Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
99
- computation. If unspecified, it defaults to value recommended by the implementation, using the
100
- `factor` field to infer the suggested value.
101
- `beta_fast` (`float`, *optional*):
102
- Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
103
- ramp function. If unspecified, it defaults to 32.
104
- `beta_slow` (`float`, *optional*):
105
- Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
106
- ramp function. If unspecified, it defaults to 1.
107
- `short_factor` (`List[float]`, *optional*):
108
- Only used with 'longrope'. The scaling factor to be applied to short contexts (<
109
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
110
- size divided by the number of attention heads divided by 2
111
- `long_factor` (`List[float]`, *optional*):
112
- Only used with 'longrope'. The scaling factor to be applied to long contexts (>
113
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
114
- size divided by the number of attention heads divided by 2
115
- `low_freq_factor` (`float`, *optional*):
116
- Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
117
- `high_freq_factor` (`float`, *optional*):
118
- Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
119
- attention_bias (`bool`, *optional*, defaults to `False`):
120
- Whether to use a bias in the query, key, value and output projection layers during self-attention.
121
- attention_dropout (`float`, *optional*, defaults to 0.0):
122
- The dropout ratio for the attention probabilities.
123
- mlp_bias (`bool`, *optional*, defaults to `False`):
124
- Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
125
- head_dim (`int`, *optional*):
126
- The attention head dimension. If None, it will default to hidden_size // num_heads
127
- embedding_multiplier (`float`, *optional*, defaults to `None`):
128
- Multiplier applied to the embedding weights. If `None`, it is equivalent to `1.0`.
129
- logits_scaling (`float`, *optional*, defaults to `None`):
130
- Scaling factor for logits. If `None`, it is equivalent to `1.0`.
131
- attention_multiplier (`float`, *optional*, defaults to `None`):
132
- Multiplier applied to the attention weights. If `None`, it is equivalent to `self.head_dim ** -0.5`.
133
- residual_multiplier (`float`, *optional*, defaults to `None`):
134
- Scaling factor for residual connections. If `None`, it is equivalent to `1.0`.
135
- use_post_norm (`bool`, *optional*, defaults to `False`):
136
- Determines whether to apply Peri-Layer Normalization. Set to True to enable this feature.
137
-
138
- ```python
139
- >>> from transformers import HyperCLOVAXModel, HyperCLOVAXConfig
140
-
141
- >>> # Initializing a HyperCLOVAX style configuration
142
- >>> configuration = HyperCLOVAXConfig()
143
-
144
- >>> # Initializing a model from the HyperCLOVAX style configuration
145
- >>> model = HyperCLOVAXModel(configuration)
146
-
147
- >>> # Accessing the model configuration
148
- >>> configuration = model.config
149
- ```"""
150
-
151
- model_type = "hyperclovax"
152
- keys_to_ignore_at_inference = ["past_key_values"]
153
-
154
- def __init__(
155
- self,
156
- vocab_size=32000,
157
- hidden_size=4096,
158
- intermediate_size=11008,
159
- num_hidden_layers=32,
160
- num_attention_heads=32,
161
- num_key_value_heads=None,
162
- hidden_act="silu",
163
- max_position_embeddings=2048,
164
- initializer_range=0.02,
165
- rms_norm_eps=1e-6,
166
- use_cache=True,
167
- pad_token_id=None,
168
- bos_token_id=1,
169
- eos_token_id=2,
170
- pretraining_tp=1,
171
- tie_word_embeddings=False,
172
- rope_theta=10000.0,
173
- rope_scaling=None,
174
- attention_bias=False,
175
- attention_dropout=0.0,
176
- mlp_bias=False,
177
- head_dim=None,
178
- embedding_multiplier=None, # MuP
179
- logits_scaling=None, # MuP
180
- attention_multiplier=None, # MuP
181
- residual_multiplier=None, # MuP
182
- use_post_norm=False, # Peri-LN (post-norm)
183
- auto_map={
184
- "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
185
- "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
186
- "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM"
187
- },
188
- **kwargs,
189
- ):
190
- self.vocab_size = vocab_size
191
- self.max_position_embeddings = max_position_embeddings
192
- self.hidden_size = hidden_size
193
- self.intermediate_size = intermediate_size
194
- self.num_hidden_layers = num_hidden_layers
195
- self.num_attention_heads = num_attention_heads
196
-
197
- # for backward compatibility
198
- if num_key_value_heads is None:
199
- num_key_value_heads = num_attention_heads
200
-
201
- self.num_key_value_heads = num_key_value_heads
202
- self.hidden_act = hidden_act
203
- self.initializer_range = initializer_range
204
- self.rms_norm_eps = rms_norm_eps
205
- self.pretraining_tp = pretraining_tp
206
- self.use_cache = use_cache
207
- self.rope_theta = rope_theta
208
- self.rope_scaling = rope_scaling
209
- self.attention_bias = attention_bias
210
- self.attention_dropout = attention_dropout
211
- self.mlp_bias = mlp_bias
212
- self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
213
- # Validate the correctness of rotary position embeddings parameters
214
- # BC: if there is a 'type' field, copy it to 'rope_type'.
215
- if self.rope_scaling is not None and "type" in self.rope_scaling:
216
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
217
- # rope_config_validation(self)
218
-
219
- # MuP
220
- self.embedding_multiplier = embedding_multiplier if embedding_multiplier is not None else 1.0
221
- self.logits_scaling = logits_scaling if logits_scaling is not None else 1.0
222
- self.attention_multiplier = attention_multiplier if attention_multiplier is not None else self.head_dim ** -0.5
223
- self.residual_multiplier = residual_multiplier if residual_multiplier is not None else 1.0
224
-
225
- # Peri-LN (post-norm)
226
- self.use_post_norm = use_post_norm
227
-
228
- super().__init__(
229
- pad_token_id=pad_token_id,
230
- bos_token_id=bos_token_id,
231
- eos_token_id=eos_token_id,
232
- tie_word_embeddings=tie_word_embeddings,
233
- auto_map=auto_map,
234
- **kwargs,
235
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modeling_hyperclovax.py DELETED
@@ -1,987 +0,0 @@
1
- # coding=utf-8
2
- # This file was created for the HyperCLOVA X SEED 14B Think architecture.
3
- # partially copied and modified from
4
- # https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/llama/modeling_llama.py
5
- # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
6
- #
7
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
8
- # and OPT implementations in this library. It has been modified from its
9
- # original forms to accommodate minor architectural differences compared
10
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
11
- #
12
- # Licensed under the Apache License, Version 2.0 (the "License");
13
- # you may not use this file except in compliance with the License.
14
- # You may obtain a copy of the License at
15
- #
16
- # http://www.apache.org/licenses/LICENSE-2.0
17
- #
18
- # Unless required by applicable law or agreed to in writing, software
19
- # distributed under the License is distributed on an "AS IS" BASIS,
20
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
- # See the License for the specific language governing permissions and
22
- # limitations under the License.
23
- from typing import Callable, Optional, Union
24
-
25
- import torch
26
- import torch.utils.checkpoint
27
- from torch import nn
28
-
29
- from transformers.activations import ACT2FN
30
- from transformers.cache_utils import Cache, DynamicCache
31
- from transformers.generation import GenerationMixin
32
- from transformers.modeling_attn_mask_utils import AttentionMaskConverter
33
- from transformers.integrations import use_kernel_forward_from_hub
34
- from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
35
- from transformers.modeling_layers import GradientCheckpointingLayer
36
- from transformers.modeling_outputs import (
37
- BaseModelOutputWithPast,
38
- CausalLMOutputWithPast,
39
- QuestionAnsweringModelOutput,
40
- SequenceClassifierOutputWithPast,
41
- TokenClassifierOutput,
42
- )
43
- from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
44
- from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
45
- from transformers.processing_utils import Unpack
46
- from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
47
- from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
48
- try:
49
- from transformers.utils import LossKwargs
50
- loss_kwargs_class = LossKwargs
51
- except ImportError:
52
- from transformers.utils import TransformersKwargs
53
- loss_kwargs_class = TransformersKwargs
54
-
55
- from .configuration_hyperclovax import HyperCLOVAXConfig
56
- if is_torch_flex_attn_available():
57
- from torch.nn.attention.flex_attention import BlockMask
58
-
59
- from transformers.integrations.flex_attention import make_flex_block_causal_mask
60
-
61
- logger = logging.get_logger(__name__)
62
-
63
-
64
- @use_kernel_forward_from_hub("RMSNorm")
65
- class HyperCLOVAXRMSNorm(nn.Module):
66
- def __init__(self, hidden_size, eps=1e-6):
67
- """
68
- HyperCLOVAXRMSNorm is equivalent to T5LayerNorm
69
- """
70
- super().__init__()
71
- self.weight = nn.Parameter(torch.ones(hidden_size))
72
- self.variance_epsilon = eps
73
-
74
- def forward(self, hidden_states):
75
- input_dtype = hidden_states.dtype
76
- hidden_states = hidden_states.to(torch.float32)
77
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
78
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
79
- return self.weight * hidden_states.to(input_dtype)
80
-
81
- def extra_repr(self):
82
- return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
83
-
84
- ALL_LAYERNORM_LAYERS.append(HyperCLOVAXRMSNorm)
85
- class HyperCLOVAXRotaryEmbedding(nn.Module):
86
- def __init__(self, config: HyperCLOVAXConfig, device=None):
87
- super().__init__()
88
- # BC: "rope_type" was originally "type"
89
- if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
90
- self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
91
- else:
92
- self.rope_type = "default"
93
- self.max_seq_len_cached = config.max_position_embeddings
94
- self.original_max_seq_len = config.max_position_embeddings
95
-
96
- self.config = config
97
- self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
98
-
99
- inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
100
- self.register_buffer("inv_freq", inv_freq, persistent=False)
101
- self.original_inv_freq = self.inv_freq
102
-
103
- @torch.no_grad()
104
- @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
105
- def forward(self, x, position_ids):
106
- inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
107
- position_ids_expanded = position_ids[:, None, :].float()
108
-
109
- device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
110
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
111
- freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
112
- emb = torch.cat((freqs, freqs), dim=-1)
113
- cos = emb.cos() * self.attention_scaling
114
- sin = emb.sin() * self.attention_scaling
115
-
116
- return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
117
-
118
-
119
- def rotate_half(x):
120
- """Rotates half the hidden dims of the input."""
121
- x1 = x[..., : x.shape[-1] // 2]
122
- x2 = x[..., x.shape[-1] // 2 :]
123
- return torch.cat((-x2, x1), dim=-1)
124
-
125
-
126
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
127
- """Applies Rotary Position Embedding to the query and key tensors.
128
-
129
- Args:
130
- q (`torch.Tensor`): The query tensor.
131
- k (`torch.Tensor`): The key tensor.
132
- cos (`torch.Tensor`): The cosine part of the rotary embedding.
133
- sin (`torch.Tensor`): The sine part of the rotary embedding.
134
- position_ids (`torch.Tensor`, *optional*):
135
- Deprecated and unused.
136
- unsqueeze_dim (`int`, *optional*, defaults to 1):
137
- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
138
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
139
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
140
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
141
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
142
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
143
- Returns:
144
- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
145
- """
146
- cos = cos.unsqueeze(unsqueeze_dim)
147
- sin = sin.unsqueeze(unsqueeze_dim)
148
- q_embed = (q * cos) + (rotate_half(q) * sin)
149
- k_embed = (k * cos) + (rotate_half(k) * sin)
150
- return q_embed, k_embed
151
-
152
-
153
- class HyperCLOVAXMLP(nn.Module):
154
- def __init__(self, config):
155
- super().__init__()
156
- self.config = config
157
- self.hidden_size = config.hidden_size
158
- self.intermediate_size = config.intermediate_size
159
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
160
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
161
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
162
- self.act_fn = ACT2FN[config.hidden_act]
163
-
164
- def forward(self, x):
165
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
166
- return down_proj
167
-
168
-
169
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
170
- """
171
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
172
- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
173
- """
174
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
175
- if n_rep == 1:
176
- return hidden_states
177
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
178
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
179
-
180
-
181
- def eager_attention_forward(
182
- module: nn.Module,
183
- query: torch.Tensor,
184
- key: torch.Tensor,
185
- value: torch.Tensor,
186
- attention_mask: Optional[torch.Tensor],
187
- scaling: float,
188
- dropout: float = 0.0,
189
- **kwargs,
190
- ):
191
- key_states = repeat_kv(key, module.num_key_value_groups)
192
- value_states = repeat_kv(value, module.num_key_value_groups)
193
-
194
- attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
195
- if attention_mask is not None:
196
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
197
- attn_weights = attn_weights + causal_mask
198
-
199
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
200
- attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
201
- attn_output = torch.matmul(attn_weights, value_states)
202
- attn_output = attn_output.transpose(1, 2).contiguous()
203
-
204
- return attn_output, attn_weights
205
-
206
-
207
- class HyperCLOVAXAttention(nn.Module):
208
- """Multi-headed attention from 'Attention Is All You Need' paper"""
209
-
210
- def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
211
- super().__init__()
212
- self.config = config
213
- self.layer_idx = layer_idx
214
- self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
215
- self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
216
- self.scaling = getattr(config, "attention_multiplier", self.head_dim**-0.5) # MuP
217
- self.attention_dropout = config.attention_dropout
218
- self.is_causal = True
219
-
220
- self.q_proj = nn.Linear(
221
- config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
222
- )
223
- self.k_proj = nn.Linear(
224
- config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
225
- )
226
- self.v_proj = nn.Linear(
227
- config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
228
- )
229
- self.o_proj = nn.Linear(
230
- config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
231
- )
232
-
233
- def forward(
234
- self,
235
- hidden_states: torch.Tensor,
236
- position_embeddings: tuple[torch.Tensor, torch.Tensor],
237
- attention_mask: Optional[torch.Tensor],
238
- past_key_value: Optional[Cache] = None,
239
- cache_position: Optional[torch.LongTensor] = None,
240
- **kwargs: Unpack[FlashAttentionKwargs],
241
- ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
242
- input_shape = hidden_states.shape[:-1]
243
- hidden_shape = (*input_shape, -1, self.head_dim)
244
-
245
- query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
246
- key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
247
- value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
248
-
249
- cos, sin = position_embeddings
250
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
251
-
252
- if past_key_value is not None:
253
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
254
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
255
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
256
-
257
- attention_interface: Callable = eager_attention_forward
258
-
259
- if self.config._attn_implementation != "eager":
260
- if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
261
- logger.warning_once(
262
- "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
263
- 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
264
- )
265
- else:
266
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
267
-
268
- attn_output, attn_weights = attention_interface(
269
- self,
270
- query_states,
271
- key_states,
272
- value_states,
273
- attention_mask,
274
- dropout=0.0 if not self.training else self.attention_dropout,
275
- scaling=self.scaling,
276
- **kwargs,
277
- )
278
-
279
- attn_output = attn_output.reshape(*input_shape, -1).contiguous()
280
- attn_output = self.o_proj(attn_output)
281
- return attn_output, attn_weights
282
-
283
-
284
- class HyperCLOVAXDecoderLayer(GradientCheckpointingLayer):
285
- def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
286
- super().__init__()
287
- self.hidden_size = config.hidden_size
288
-
289
- self.self_attn = HyperCLOVAXAttention(config=config, layer_idx=layer_idx)
290
-
291
- self.mlp = HyperCLOVAXMLP(config)
292
- self.input_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
293
- self.post_attention_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
294
- self.use_post_norm = getattr(config, "use_post_norm", False)
295
-
296
- # Peri-LN (post-norm)
297
- if self.use_post_norm:
298
- self.post_norm1 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
299
- self.post_norm2 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
300
-
301
- self.residual_multiplier = getattr(config, "residual_multiplier", 1.0) # MuP
302
-
303
- def forward(
304
- self,
305
- hidden_states: torch.Tensor,
306
- attention_mask: Optional[torch.Tensor] = None,
307
- position_ids: Optional[torch.LongTensor] = None,
308
- past_key_value: Optional[Cache] = None,
309
- output_attentions: Optional[bool] = False,
310
- use_cache: Optional[bool] = False,
311
- cache_position: Optional[torch.LongTensor] = None,
312
- position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
313
- **kwargs: Unpack[FlashAttentionKwargs],
314
- ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
315
- residual = hidden_states
316
- hidden_states = self.input_layernorm(hidden_states)
317
-
318
- # Self Attention
319
- hidden_states, self_attn_weights = self.self_attn(
320
- hidden_states=hidden_states,
321
- attention_mask=attention_mask,
322
- position_ids=position_ids,
323
- past_key_value=past_key_value,
324
- output_attentions=output_attentions,
325
- use_cache=use_cache,
326
- cache_position=cache_position,
327
- position_embeddings=position_embeddings,
328
- **kwargs,
329
- )
330
-
331
- if self.use_post_norm: # Peri-LN
332
- hidden_states = self.post_norm1(hidden_states)
333
-
334
- hidden_states = residual + hidden_states * self.residual_multiplier # MuP
335
-
336
- # Fully Connected
337
- residual = hidden_states
338
- hidden_states = self.post_attention_layernorm(hidden_states)
339
- hidden_states = self.mlp(hidden_states)
340
-
341
- if self.use_post_norm: # Peri-LN
342
- hidden_states = self.post_norm2(hidden_states)
343
-
344
- hidden_states = residual + hidden_states * self.residual_multiplier # MuP
345
-
346
- outputs = (hidden_states,)
347
- if output_attentions:
348
- outputs += (self_attn_weights,)
349
-
350
- return outputs
351
-
352
-
353
- @auto_docstring
354
- class HyperCLOVAXPreTrainedModel(PreTrainedModel):
355
- config_class = HyperCLOVAXConfig
356
- base_model_prefix = "model"
357
- supports_gradient_checkpointing = True
358
- _no_split_modules = ["HyperCLOVAXDecoderLayer"]
359
- _skip_keys_device_placement = ["past_key_values"]
360
- _supports_flash_attn_2 = True
361
- _supports_sdpa = True
362
- _supports_flex_attn = True
363
- _supports_cache_class = True
364
- _supports_quantized_cache = True
365
- _supports_static_cache = True
366
- _supports_attention_backend = True
367
-
368
- def _init_weights(self, module):
369
- std = self.config.initializer_range
370
- if isinstance(module, nn.Linear):
371
- module.weight.data.normal_(mean=0.0, std=std)
372
- if module.bias is not None:
373
- module.bias.data.zero_()
374
- elif isinstance(module, nn.Embedding):
375
- module.weight.data.normal_(mean=0.0, std=std)
376
- if module.padding_idx is not None:
377
- module.weight.data[module.padding_idx].zero_()
378
- elif isinstance(module, HyperCLOVAXRMSNorm):
379
- module.weight.data.fill_(1.0)
380
-
381
-
382
- @auto_docstring
383
- class HyperCLOVAXModel(HyperCLOVAXPreTrainedModel):
384
- def __init__(self, config: HyperCLOVAXConfig):
385
- super().__init__(config)
386
- self.padding_idx = config.pad_token_id
387
- self.vocab_size = config.vocab_size
388
-
389
- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
390
- self.layers = nn.ModuleList(
391
- [HyperCLOVAXDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
392
- )
393
- self.norm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
394
- self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=config)
395
- self.gradient_checkpointing = False
396
-
397
- # Initialize weights and apply final processing
398
- self.post_init()
399
-
400
- # MuP
401
- self.embedding_multiplier = getattr(config, "embedding_multiplier", 1.0)
402
-
403
- def get_input_embeddings(self):
404
- return self.embed_tokens
405
-
406
- def set_input_embeddings(self, value):
407
- self.embed_tokens = value
408
-
409
- @can_return_tuple
410
- @auto_docstring
411
- def forward(
412
- self,
413
- input_ids: Optional[torch.LongTensor] = None,
414
- attention_mask: Optional[torch.Tensor] = None,
415
- position_ids: Optional[torch.LongTensor] = None,
416
- past_key_values: Optional[Cache] = None,
417
- inputs_embeds: Optional[torch.FloatTensor] = None,
418
- use_cache: Optional[bool] = None,
419
- output_attentions: Optional[bool] = None,
420
- output_hidden_states: Optional[bool] = None,
421
- cache_position: Optional[torch.LongTensor] = None,
422
- **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
423
- ) -> BaseModelOutputWithPast:
424
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
425
- output_hidden_states = (
426
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
427
- )
428
- use_cache = use_cache if use_cache is not None else self.config.use_cache
429
-
430
- if (input_ids is None) ^ (inputs_embeds is not None):
431
- raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
432
-
433
- if self.gradient_checkpointing and self.training and use_cache:
434
- logger.warning_once(
435
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
436
- )
437
- use_cache = False
438
-
439
- # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
440
- if not isinstance(past_key_values, (type(None), Cache)):
441
- raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
442
-
443
- if inputs_embeds is None:
444
- inputs_embeds = self.embed_tokens(input_ids)
445
-
446
- inputs_embeds = inputs_embeds * self.embedding_multiplier # MuP
447
-
448
- if use_cache and past_key_values is None:
449
- past_key_values = DynamicCache()
450
-
451
- if cache_position is None:
452
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
453
- cache_position = torch.arange(
454
- past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
455
- )
456
-
457
- if position_ids is None:
458
- position_ids = cache_position.unsqueeze(0)
459
-
460
- causal_mask = self._update_causal_mask(
461
- attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
462
- )
463
-
464
- hidden_states = inputs_embeds
465
-
466
- # create position embeddings to be shared across the decoder layers
467
- position_embeddings = self.rotary_emb(hidden_states, position_ids)
468
-
469
- # decoder layers
470
- all_hidden_states = () if output_hidden_states else None
471
- all_self_attns = () if output_attentions else None
472
-
473
- for decoder_layer in self.layers[: self.config.num_hidden_layers]:
474
- if output_hidden_states:
475
- all_hidden_states += (hidden_states,)
476
-
477
- layer_outputs = decoder_layer(
478
- hidden_states,
479
- attention_mask=causal_mask,
480
- position_ids=position_ids,
481
- past_key_value=past_key_values,
482
- output_attentions=output_attentions,
483
- use_cache=use_cache,
484
- cache_position=cache_position,
485
- position_embeddings=position_embeddings,
486
- **flash_attn_kwargs,
487
- )
488
-
489
- hidden_states = layer_outputs[0]
490
-
491
- if output_attentions:
492
- all_self_attns += (layer_outputs[1],)
493
-
494
- hidden_states = self.norm(hidden_states)
495
-
496
- # add hidden states from the last decoder layer
497
- if output_hidden_states:
498
- all_hidden_states += (hidden_states,)
499
-
500
- return BaseModelOutputWithPast(
501
- last_hidden_state=hidden_states,
502
- past_key_values=past_key_values if use_cache else None,
503
- hidden_states=all_hidden_states,
504
- attentions=all_self_attns,
505
- )
506
-
507
- def _update_causal_mask(
508
- self,
509
- attention_mask: Union[torch.Tensor, "BlockMask"],
510
- input_tensor: torch.Tensor,
511
- cache_position: torch.Tensor,
512
- past_key_values: Cache,
513
- output_attentions: bool = False,
514
- ):
515
- if self.config._attn_implementation == "flash_attention_2":
516
- if attention_mask is not None and (attention_mask == 0.0).any():
517
- return attention_mask
518
- return None
519
- if self.config._attn_implementation == "flex_attention":
520
- if isinstance(attention_mask, torch.Tensor):
521
- attention_mask = make_flex_block_causal_mask(attention_mask)
522
- return attention_mask
523
-
524
- # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
525
- # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
526
- # to infer the attention mask.
527
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
528
- using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
529
-
530
- # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
531
- if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
532
- if AttentionMaskConverter._ignore_causal_mask_sdpa(
533
- attention_mask,
534
- inputs_embeds=input_tensor,
535
- past_key_values_length=past_seen_tokens,
536
- is_training=self.training,
537
- ):
538
- return None
539
-
540
- dtype = input_tensor.dtype
541
- sequence_length = input_tensor.shape[1]
542
- if using_compilable_cache:
543
- target_length = past_key_values.get_max_cache_shape()
544
- else:
545
- target_length = (
546
- attention_mask.shape[-1]
547
- if isinstance(attention_mask, torch.Tensor)
548
- else past_seen_tokens + sequence_length + 1
549
- )
550
-
551
- # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
552
- causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
553
- attention_mask,
554
- sequence_length=sequence_length,
555
- target_length=target_length,
556
- dtype=dtype,
557
- cache_position=cache_position,
558
- batch_size=input_tensor.shape[0],
559
- )
560
-
561
- if (
562
- self.config._attn_implementation == "sdpa"
563
- and attention_mask is not None
564
- and attention_mask.device.type in ["cuda", "xpu", "npu"]
565
- and not output_attentions
566
- ):
567
- # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
568
- # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
569
- # Details: https://github.com/pytorch/pytorch/issues/110213
570
- min_dtype = torch.finfo(dtype).min
571
- causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
572
-
573
- return causal_mask
574
-
575
- @staticmethod
576
- def _prepare_4d_causal_attention_mask_with_cache_position(
577
- attention_mask: torch.Tensor,
578
- sequence_length: int,
579
- target_length: int,
580
- dtype: torch.dtype,
581
- cache_position: torch.Tensor,
582
- batch_size: int,
583
- **kwargs,
584
- ):
585
- """
586
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
587
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
588
-
589
- Args:
590
- attention_mask (`torch.Tensor`):
591
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
592
- `(batch_size, 1, query_length, key_value_length)`.
593
- sequence_length (`int`):
594
- The sequence length being processed.
595
- target_length (`int`):
596
- The target length: when generating with static cache, the mask should be as long as the static cache,
597
- to account for the 0 padding, the part of the cache that is not filled yet.
598
- dtype (`torch.dtype`):
599
- The dtype to use for the 4D attention mask.
600
- cache_position (`torch.Tensor`):
601
- Indices depicting the position of the input sequence tokens in the sequence.
602
- batch_size (`torch.Tensor`):
603
- Batch size.
604
- """
605
- if attention_mask is not None and attention_mask.dim() == 4:
606
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
607
- causal_mask = attention_mask
608
- else:
609
- min_dtype = torch.finfo(dtype).min
610
- causal_mask = torch.full(
611
- (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
612
- )
613
- if sequence_length != 1:
614
- causal_mask = torch.triu(causal_mask, diagonal=1)
615
- causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
616
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
617
- if attention_mask is not None:
618
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
619
- mask_length = attention_mask.shape[-1]
620
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
621
- causal_mask.device
622
- )
623
- padding_mask = padding_mask == 0
624
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
625
- padding_mask, min_dtype
626
- )
627
-
628
- return causal_mask
629
-
630
-
631
- class KwargsForCausalLM(FlashAttentionKwargs, loss_kwargs_class): ...
632
-
633
-
634
- @auto_docstring
635
- class HyperCLOVAXForCausalLM(HyperCLOVAXPreTrainedModel, GenerationMixin):
636
- _tied_weights_keys = ["lm_head.weight"]
637
- _tp_plan = {"lm_head": "colwise_rep"}
638
- _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
639
-
640
- def __init__(self, config):
641
- super().__init__(config)
642
- self.model = HyperCLOVAXModel(config)
643
- self.vocab_size = config.vocab_size
644
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
645
- self.logits_scaling = getattr(config, "logits_scaling", 1.0)
646
-
647
- # Initialize weights and apply final processing
648
- self.post_init()
649
-
650
- def get_input_embeddings(self):
651
- return self.model.embed_tokens
652
-
653
- def set_input_embeddings(self, value):
654
- self.model.embed_tokens = value
655
-
656
- def get_output_embeddings(self):
657
- return self.lm_head
658
-
659
- def set_output_embeddings(self, new_embeddings):
660
- self.lm_head = new_embeddings
661
-
662
- def set_decoder(self, decoder):
663
- self.model = decoder
664
-
665
- def get_decoder(self):
666
- return self.model
667
-
668
- @can_return_tuple
669
- @auto_docstring
670
- def forward(
671
- self,
672
- input_ids: Optional[torch.LongTensor] = None,
673
- attention_mask: Optional[torch.Tensor] = None,
674
- position_ids: Optional[torch.LongTensor] = None,
675
- past_key_values: Optional[Cache] = None,
676
- inputs_embeds: Optional[torch.FloatTensor] = None,
677
- labels: Optional[torch.LongTensor] = None,
678
- use_cache: Optional[bool] = None,
679
- output_attentions: Optional[bool] = None,
680
- output_hidden_states: Optional[bool] = None,
681
- cache_position: Optional[torch.LongTensor] = None,
682
- logits_to_keep: Union[int, torch.Tensor] = 0,
683
- **kwargs: Unpack[KwargsForCausalLM],
684
- ) -> CausalLMOutputWithPast:
685
- r"""
686
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
687
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
688
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
689
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
690
-
691
- Example:
692
-
693
- ```python
694
- >>> from transformers import AutoTokenizer, HyperCLOVAXForCausalLM
695
-
696
- >>> model = HyperCLOVAXForCausalLM.from_pretrained("naver-hyperclovax/{model_name}")
697
- >>> tokenizer = AutoTokenizer.from_pretrained("naver-hyperclovax/{model_name}")
698
-
699
- >>> prompt = "Hey, are you conscious? Can you talk to me?"
700
- >>> inputs = tokenizer(prompt, return_tensors="pt")
701
-
702
- >>> # Generate
703
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
704
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
705
- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
706
- ```"""
707
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
708
- output_hidden_states = (
709
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
710
- )
711
-
712
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
713
- outputs: BaseModelOutputWithPast = self.model(
714
- input_ids=input_ids,
715
- attention_mask=attention_mask,
716
- position_ids=position_ids,
717
- past_key_values=past_key_values,
718
- inputs_embeds=inputs_embeds,
719
- use_cache=use_cache,
720
- output_attentions=output_attentions,
721
- output_hidden_states=output_hidden_states,
722
- cache_position=cache_position,
723
- **kwargs,
724
- )
725
-
726
- hidden_states = outputs.last_hidden_state
727
- # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
728
- slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
729
- # MuP
730
- logits = self.lm_head(hidden_states[:, slice_indices, :]) * self.logits_scaling
731
-
732
- loss = None
733
- if labels is not None:
734
- loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
735
-
736
- return CausalLMOutputWithPast(
737
- loss=loss,
738
- logits=logits,
739
- past_key_values=outputs.past_key_values,
740
- hidden_states=outputs.hidden_states,
741
- attentions=outputs.attentions,
742
- )
743
-
744
-
745
- @auto_docstring(
746
- custom_intro="""
747
- The HyperCLOVAX Model transformer with a sequence classification head on top (linear layer).
748
-
749
- [`HyperCLOVAXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
750
- (e.g. GPT-2) do.
751
-
752
- Since it does classification on the last token, it requires to know the position of the last token. If a
753
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
754
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
755
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
756
- each row of the batch).
757
- """
758
- )
759
- class HyperCLOVAXForSequenceClassification(HyperCLOVAXPreTrainedModel):
760
- def __init__(self, config):
761
- super().__init__(config)
762
- self.num_labels = config.num_labels
763
- self.model = HyperCLOVAXModel(config)
764
- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
765
-
766
- # Initialize weights and apply final processing
767
- self.post_init()
768
-
769
- def get_input_embeddings(self):
770
- return self.model.embed_tokens
771
-
772
- def set_input_embeddings(self, value):
773
- self.model.embed_tokens = value
774
-
775
- @can_return_tuple
776
- @auto_docstring
777
- def forward(
778
- self,
779
- input_ids: Optional[torch.LongTensor] = None,
780
- attention_mask: Optional[torch.Tensor] = None,
781
- position_ids: Optional[torch.LongTensor] = None,
782
- past_key_values: Optional[Cache] = None,
783
- inputs_embeds: Optional[torch.FloatTensor] = None,
784
- labels: Optional[torch.LongTensor] = None,
785
- use_cache: Optional[bool] = None,
786
- output_attentions: Optional[bool] = None,
787
- output_hidden_states: Optional[bool] = None,
788
- ) -> SequenceClassifierOutputWithPast:
789
- r"""
790
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
791
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
792
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
793
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
794
- """
795
-
796
- transformer_outputs: BaseModelOutputWithPast = self.model(
797
- input_ids,
798
- attention_mask=attention_mask,
799
- position_ids=position_ids,
800
- past_key_values=past_key_values,
801
- inputs_embeds=inputs_embeds,
802
- use_cache=use_cache,
803
- output_attentions=output_attentions,
804
- output_hidden_states=output_hidden_states,
805
- )
806
- hidden_states = transformer_outputs.last_hidden_state
807
- logits = self.score(hidden_states)
808
-
809
- if input_ids is not None:
810
- batch_size = input_ids.shape[0]
811
- else:
812
- batch_size = inputs_embeds.shape[0]
813
-
814
- if self.config.pad_token_id is None and batch_size != 1:
815
- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
816
- if self.config.pad_token_id is None:
817
- last_non_pad_token = -1
818
- elif input_ids is not None:
819
- # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
820
- non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
821
- token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
822
- last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
823
- else:
824
- last_non_pad_token = -1
825
- logger.warning_once(
826
- f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
827
- "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
828
- )
829
-
830
- pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
831
-
832
- loss = None
833
- if labels is not None:
834
- loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
835
-
836
- return SequenceClassifierOutputWithPast(
837
- loss=loss,
838
- logits=pooled_logits,
839
- past_key_values=transformer_outputs.past_key_values,
840
- hidden_states=transformer_outputs.hidden_states,
841
- attentions=transformer_outputs.attentions,
842
- )
843
-
844
-
845
- @auto_docstring
846
- class HyperCLOVAXForQuestionAnswering(HyperCLOVAXPreTrainedModel):
847
- base_model_prefix = "transformer"
848
-
849
- # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->HyperCLOVAX
850
- def __init__(self, config):
851
- super().__init__(config)
852
- self.transformer = HyperCLOVAXModel(config)
853
- self.qa_outputs = nn.Linear(config.hidden_size, 2)
854
-
855
- # Initialize weights and apply final processing
856
- self.post_init()
857
-
858
- def get_input_embeddings(self):
859
- return self.transformer.embed_tokens
860
-
861
- def set_input_embeddings(self, value):
862
- self.transformer.embed_tokens = value
863
-
864
- @can_return_tuple
865
- @auto_docstring
866
- def forward(
867
- self,
868
- input_ids: Optional[torch.LongTensor] = None,
869
- attention_mask: Optional[torch.Tensor] = None,
870
- position_ids: Optional[torch.LongTensor] = None,
871
- past_key_values: Optional[Cache] = None,
872
- inputs_embeds: Optional[torch.FloatTensor] = None,
873
- start_positions: Optional[torch.LongTensor] = None,
874
- end_positions: Optional[torch.LongTensor] = None,
875
- output_attentions: Optional[bool] = None,
876
- output_hidden_states: Optional[bool] = None,
877
- **kwargs,
878
- ) -> QuestionAnsweringModelOutput:
879
- outputs: BaseModelOutputWithPast = self.transformer(
880
- input_ids,
881
- attention_mask=attention_mask,
882
- position_ids=position_ids,
883
- past_key_values=past_key_values,
884
- inputs_embeds=inputs_embeds,
885
- output_attentions=output_attentions,
886
- output_hidden_states=output_hidden_states,
887
- )
888
-
889
- sequence_output = outputs.last_hidden_state
890
-
891
- logits = self.qa_outputs(sequence_output)
892
- start_logits, end_logits = logits.split(1, dim=-1)
893
- start_logits = start_logits.squeeze(-1).contiguous()
894
- end_logits = end_logits.squeeze(-1).contiguous()
895
-
896
- loss = None
897
- if start_positions is not None and end_positions is not None:
898
- loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
899
-
900
- return QuestionAnsweringModelOutput(
901
- loss=loss,
902
- start_logits=start_logits,
903
- end_logits=end_logits,
904
- hidden_states=outputs.hidden_states,
905
- attentions=outputs.attentions,
906
- )
907
-
908
-
909
- @auto_docstring
910
- class HyperCLOVAXForTokenClassification(HyperCLOVAXPreTrainedModel):
911
- def __init__(self, config):
912
- super().__init__(config)
913
- self.num_labels = config.num_labels
914
- self.model = HyperCLOVAXModel(config)
915
- if getattr(config, "classifier_dropout", None) is not None:
916
- classifier_dropout = config.classifier_dropout
917
- elif getattr(config, "hidden_dropout", None) is not None:
918
- classifier_dropout = config.hidden_dropout
919
- else:
920
- classifier_dropout = 0.1
921
- self.dropout = nn.Dropout(classifier_dropout)
922
- self.score = nn.Linear(config.hidden_size, config.num_labels)
923
-
924
- # Initialize weights and apply final processing
925
- self.post_init()
926
-
927
- def get_input_embeddings(self):
928
- return self.model.embed_tokens
929
-
930
- def set_input_embeddings(self, value):
931
- self.model.embed_tokens = value
932
-
933
- @can_return_tuple
934
- @auto_docstring
935
- def forward(
936
- self,
937
- input_ids: Optional[torch.LongTensor] = None,
938
- attention_mask: Optional[torch.Tensor] = None,
939
- position_ids: Optional[torch.LongTensor] = None,
940
- past_key_values: Optional[Cache] = None,
941
- inputs_embeds: Optional[torch.FloatTensor] = None,
942
- labels: Optional[torch.LongTensor] = None,
943
- use_cache: Optional[bool] = None,
944
- output_attentions: Optional[bool] = None,
945
- output_hidden_states: Optional[bool] = None,
946
- ) -> TokenClassifierOutput:
947
- r"""
948
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
949
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
950
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
951
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
952
- """
953
-
954
- outputs: BaseModelOutputWithPast = self.model(
955
- input_ids,
956
- attention_mask=attention_mask,
957
- position_ids=position_ids,
958
- past_key_values=past_key_values,
959
- inputs_embeds=inputs_embeds,
960
- use_cache=use_cache,
961
- output_attentions=output_attentions,
962
- output_hidden_states=output_hidden_states,
963
- )
964
- sequence_output = outputs.last_hidden_state
965
- sequence_output = self.dropout(sequence_output)
966
- logits = self.score(sequence_output)
967
-
968
- loss = None
969
- if labels is not None:
970
- loss = self.loss_function(logits, labels, self.config)
971
-
972
- return TokenClassifierOutput(
973
- loss=loss,
974
- logits=logits,
975
- hidden_states=outputs.hidden_states,
976
- attentions=outputs.attentions,
977
- )
978
-
979
-
980
- __all__ = [
981
- "HyperCLOVAXForCausalLM",
982
- "HyperCLOVAXModel",
983
- "HyperCLOVAXPreTrainedModel",
984
- "HyperCLOVAXForSequenceClassification",
985
- "HyperCLOVAXForQuestionAnswering",
986
- "HyperCLOVAXForTokenClassification",
987
- ]