hfastino committed
Commit b674efb · verified · 1 Parent(s): a1e462d

Upload trained model to hfastino/broke-fish

added_tokens.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "[C]": 128004,
+   "[DESCRIPTION]": 128010,
+   "[EXAMPLE]": 128008,
+   "[E]": 128005,
+   "[L]": 128007,
+   "[MASK]": 128000,
+   "[OUTPUT]": 128009,
+   "[P]": 128003,
+   "[R]": 128006,
+   "[SEP_STRUCT]": 128001,
+   "[SEP_TEXT]": 128002
+ }
code/inference.py ADDED
@@ -0,0 +1,292 @@
+ """
+ SageMaker Multi-Model Endpoint inference script for GLiNER2.
+
+ This script handles model loading and inference for the GLiNER2 Multi-Model Endpoint.
+ Models are loaded dynamically based on the TargetModel header in the request.
+
+ Key differences from single-model inference:
+ - model_fn() receives the full path to the model directory (including model name)
+ - Models are cached automatically by SageMaker MME
+ - Multiple models can be loaded in memory simultaneously
+ - LRU eviction when memory is full
+ """
+
+ import json
+ import os
+ import sys
+ import subprocess
+
+
+ def _ensure_gliner2_installed():
+     """
+     Ensure gliner2 is installed. Install it dynamically if missing.
+
+     This is a workaround for SageMaker MME where requirements.txt
+     might not be installed automatically.
+     """
+     try:
+         import gliner2  # noqa: PLC0415
+
+         print(f"[MME] gliner2 version {gliner2.__version__} already installed")
+         return True
+     except ImportError:
+         print("[MME] gliner2 not found, installing...")
+         try:
+             # IMPORTANT: Use transformers<4.46 for compatibility with PyTorch 2.1.0
+             # (transformers 4.46+ requires PyTorch 2.3+ for torch.utils._pytree.register_pytree_node)
+             subprocess.check_call(
+                 [
+                     sys.executable,
+                     "-m",
+                     "pip",
+                     "install",
+                     "--quiet",
+                     "--no-cache-dir",
+                     "gliner2==1.0.1",
+                     "transformers>=4.30.0,<4.46.0",
+                 ]
+             )
+             print("[MME] ✓ gliner2 installed successfully")
+             return True
+         except subprocess.CalledProcessError as e:
+             print(f"[MME] ERROR: Failed to install gliner2: {e}")
+             return False
+
+
+ # Ensure gliner2 is installed before importing torch (to avoid conflicts)
+ _ensure_gliner2_installed()
+
+ import torch  # noqa: E402
+
+ # Add parent directory to path to potentially import from gliner_2_inference
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+ class DummyModel:
+     """Placeholder model for MME container initialization"""
+
+     def __call__(self, *args, **kwargs):
+         raise ValueError("Container model invoked directly. Use TargetModel header.")
+
+     def extract_entities(self, *args, **kwargs):
+         raise ValueError("Container model invoked directly. Use TargetModel header.")
+
+     def classify_text(self, *args, **kwargs):
+         raise ValueError("Container model invoked directly. Use TargetModel header.")
+
+     def extract_json(self, *args, **kwargs):
+         raise ValueError("Container model invoked directly. Use TargetModel header.")
+
+
+ def model_fn(model_dir):
+     """
+     Load the GLiNER2 model from the model directory.
+
+     For Multi-Model Endpoints, SageMaker passes the full path to the specific
+     model being loaded, e.g., /opt/ml/models/<model_name>/
+
+     Args:
+         model_dir: The directory where model artifacts are extracted
+
+     Returns:
+         The loaded GLiNER2 model
+     """
+     print(f"[MME] Loading model from: {model_dir}")
+     try:
+         print(f"[MME] Contents: {os.listdir(model_dir)}")
+     except Exception as e:
+         print(f"[MME] Could not list directory contents: {e}")
+
+     # Import GLiNER2 here (should be installed by _ensure_gliner2_installed)
+     try:
+         from gliner2 import GLiNER2  # noqa: PLC0415
+     except ImportError as e:
+         print(f"[MME] ERROR: gliner2 import failed: {e}")
+         print("[MME] Attempting to install gliner2...")
+         if _ensure_gliner2_installed():
+             from gliner2 import GLiNER2  # noqa: PLC0415
+         else:
+             GLiNER2 = None
+
+     # Detect device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"[MME] Using device: {device}")
+
+     if torch.cuda.is_available():
+         print(f"[MME] GPU: {torch.cuda.get_device_name(0)}")
+         print(f"[MME] CUDA version: {torch.version.cuda}")
+         mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+         print(f"[MME] GPU memory: {mem_gb:.2f} GB")
+
+     # Get HuggingFace token if available
+     hf_token = os.environ.get("HF_TOKEN")
+
+     # Check if this is the container model (placeholder)
+     if os.path.exists(os.path.join(model_dir, "mme_container.txt")):
+         print("[MME] Container model detected - returning dummy model")
+         return DummyModel()
+
+     if GLiNER2 is None:
+         raise ImportError("gliner2 package required but not found")
+
+     # Check if model is already extracted in model_dir
+     if os.path.exists(os.path.join(model_dir, "config.json")):
+         print("[MME] Loading model from extracted artifacts...")
+         model = GLiNER2.from_pretrained(model_dir, token=hf_token)
+     elif os.path.exists(os.path.join(model_dir, "download_at_runtime.txt")):
+         # Fallback: download from HuggingFace
+         print("[MME] Model not in archive, downloading from HuggingFace...")
+         model_name = os.environ.get("GLINER_MODEL", "fastino/gliner2-base-v1")
+         print(f"[MME] Downloading model: {model_name}")
+         model = GLiNER2.from_pretrained(model_name, token=hf_token)
+     else:
+         # Final fallback
+         model_name = os.environ.get("GLINER_MODEL", "fastino/gliner2-base-v1")
+         print(f"[MME] Model directory empty, downloading: {model_name}")
+         model = GLiNER2.from_pretrained(model_name, token=hf_token)
+
+     # Move model to GPU if available
+     print(f"[MME] Moving model to {device}...")
+     model = model.to(device)
+
+     # Enable half precision on GPU for memory efficiency
+     if torch.cuda.is_available():
+         print("[MME] Converting to fp16...")
+         model = model.half()
+
+     # Memory optimizations for GPU
+     if torch.cuda.is_available():
+         torch.backends.cuda.matmul.allow_tf32 = True
+         torch.backends.cudnn.allow_tf32 = True
+         torch.cuda.empty_cache()
+         # Reserve memory for multiple models in MME
+         torch.cuda.set_per_process_memory_fraction(0.85)
+         print("[MME] GPU memory optimizations enabled")
+
+     print(f"[MME] ✓ Model loaded successfully on {device}")
+     return model
+
+
+ def input_fn(request_body, request_content_type):
+     """
+     Deserialize and prepare the input data for prediction.
+
+     Args:
+         request_body: The request body
+         request_content_type: The content type of the request
+
+     Returns:
+         Parsed input data as a dictionary
+     """
+     if request_content_type == "application/json":
+         input_data = json.loads(request_body)
+         return input_data
+     else:
+         raise ValueError(f"Unsupported content type: {request_content_type}")
+
+
+ def predict_fn(input_data, model):
+     """
+     Run prediction on the input data using the loaded model.
+
+     Args:
+         input_data: Dictionary containing:
+             - task: One of 'extract_entities', 'classify_text', or 'extract_json'
+             - text: Text to process (string) or list of texts (for batch processing)
+             - schema: Schema for extraction (format depends on task)
+             - threshold: Optional confidence threshold (default: 0.5)
+         model: The loaded GLiNER2 model
+
+     Returns:
+         Task-specific results (single result or list of results for batch)
+     """
+     # Clear CUDA cache before processing
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     text = input_data.get("text")
+     task = input_data.get("task", "extract_entities")
+     schema = input_data.get("schema")
+     threshold = input_data.get("threshold", 0.5)
+
+     if not text:
+         raise ValueError("'text' field is required")
+     if not schema:
+         raise ValueError("'schema' field is required")
+
+     # Detect batch mode
+     is_batch = isinstance(text, list)
+
+     if is_batch and len(text) == 0:
+         raise ValueError("'text' list cannot be empty")
+
+     # Use inference_mode for faster inference
+     with torch.inference_mode():
+         if task == "extract_entities":
+             if is_batch:
+                 if hasattr(model, "batch_extract_entities"):
+                     result = model.batch_extract_entities(
+                         text, schema, threshold=threshold
+                     )
+                 elif hasattr(model, "batch_predict_entities"):
+                     result = model.batch_predict_entities(
+                         text, schema, threshold=threshold
+                     )
+                 else:
+                     result = [
+                         model.extract_entities(t, schema, threshold=threshold)
+                         for t in text
+                     ]
+             else:
+                 result = model.extract_entities(text, schema, threshold=threshold)
+             return result
+
+         elif task == "classify_text":
+             if is_batch:
+                 if hasattr(model, "batch_classify_text"):
+                     result = model.batch_classify_text(
+                         text, schema, threshold=threshold
+                     )
+                 else:
+                     result = [
+                         model.classify_text(t, schema, threshold=threshold)
+                         for t in text
+                     ]
+             else:
+                 result = model.classify_text(text, schema, threshold=threshold)
+             return result
+
+         elif task == "extract_json":
+             if is_batch:
+                 if hasattr(model, "batch_extract_json"):
+                     result = model.batch_extract_json(text, schema, threshold=threshold)
+                 else:
+                     result = [
+                         model.extract_json(t, schema, threshold=threshold) for t in text
+                     ]
+             else:
+                 result = model.extract_json(text, schema, threshold=threshold)
+             return result
+
+         else:
+             raise ValueError(
+                 f"Unsupported task: {task}. "
+                 "Must be one of: extract_entities, classify_text, extract_json"
+             )
+
+
+ def output_fn(prediction, response_content_type):
+     """
+     Serialize the prediction output.
+
+     Args:
+         prediction: The prediction result
+         response_content_type: The desired response content type
+
+     Returns:
+         Serialized prediction
+     """
+     if response_content_type == "application/json":
+         return json.dumps(prediction)
+     else:
+         raise ValueError(f"Unsupported response content type: {response_content_type}")
code/requirements.txt ADDED
@@ -0,0 +1,12 @@
+ # Requirements for GLiNER2 Multi-Model Endpoint
+ # NOTE: These are installed when the SageMaker container starts
+ #
+ # IMPORTANT: SageMaker PyTorch 2.1.0 container requires transformers<4.46
+ # (transformers 4.46+ uses torch.utils._pytree.register_pytree_node which needs PyTorch 2.3+)
+
+ # Core dependencies - gliner2 must be installed for model loading
+ gliner2==1.0.1
+ transformers>=4.30.0,<4.46.0
+
+ # JSON handling
+ orjson>=3.9.0
config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_attn_implementation_autoset": true,
+   "counting_layer": "count_lstm_v2",
+   "max_width": 8,
+   "model_name": "microsoft/deberta-v3-base",
+   "model_type": "extractor",
+   "token_pooling": "first",
+   "transformers_version": "4.57.6"
+ }
encoder_config/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_attn_implementation_autoset": true,
+   "attention_probs_dropout_prob": 0.1,
+   "dtype": "float32",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-07,
+   "legacy": true,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "norm_rel_ebd": "layer_norm",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 768,
+   "pos_att_type": [
+     "p2c",
+     "c2p"
+   ],
+   "position_biased_input": false,
+   "position_buckets": 256,
+   "relative_attention": true,
+   "share_att_key": true,
+   "transformers_version": "4.57.6",
+   "type_vocab_size": 0,
+   "vocab_size": 128011
+ }
labels.json ADDED
@@ -0,0 +1 @@
+ ["certificate_number", "date", "destination", "fish_species", "health_status", "inspector_name", "organization", "origin_location", "quantity", "weight"]
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2cdd9f766a4626b85c25e5000f3681049cd28c173b0e306172f405e8533cba1b
+ size 833938108
special_tokens_map.json ADDED
@@ -0,0 +1,123 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "[SEP_STRUCT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[SEP_TEXT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[P]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[C]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[E]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[R]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[L]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[EXAMPLE]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[OUTPUT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "[DESCRIPTION]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,151 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128000": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128001": {
+       "content": "[SEP_STRUCT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128002": {
+       "content": "[SEP_TEXT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128003": {
+       "content": "[P]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128004": {
+       "content": "[C]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128005": {
+       "content": "[E]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128006": {
+       "content": "[R]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128007": {
+       "content": "[L]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128008": {
+       "content": "[EXAMPLE]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128009": {
+       "content": "[OUTPUT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128010": {
+       "content": "[DESCRIPTION]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "[SEP_STRUCT]",
+     "[SEP_TEXT]",
+     "[P]",
+     "[C]",
+     "[E]",
+     "[R]",
+     "[L]",
+     "[EXAMPLE]",
+     "[OUTPUT]",
+     "[DESCRIPTION]"
+   ],
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "split_by_punct": false,
+   "tokenizer_class": "DebertaV2Tokenizer",
+   "unk_token": "[UNK]",
+   "vocab_type": "spm"
+ }
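As a quick sanity check, a sketch assuming this repository loads with transformers' AutoTokenizer (sentencepiece is required for DebertaV2Tokenizer): the structural tokens should resolve to the IDs declared in added_tokens.json.

from transformers import AutoTokenizer

# Load the uploaded tokenizer directly from the Hub repo.
tok = AutoTokenizer.from_pretrained("hfastino/broke-fish")

# Each added special token should map back to its ID from added_tokens.json.
for token in ["[MASK]", "[SEP_STRUCT]", "[SEP_TEXT]", "[P]", "[C]", "[E]",
              "[R]", "[L]", "[EXAMPLE]", "[OUTPUT]", "[DESCRIPTION]"]:
    print(token, tok.convert_tokens_to_ids(token))  # e.g. [MASK] -> 128000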