{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 75, "content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 76, "content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 77, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 78, "content": "<|mdm_mask|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 79, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 80, "content": "", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 81, "content": "<|arithmetic_start|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 82, "content": "<|arithmetic_end|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 83, "content": "<|number_start|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 84, "content": "<|number_end|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": null, "pre_tokenizer": { "type": "Sequence", "pretokenizers": [ { "type": "Split", "pattern": { "Regex": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+" }, "behavior": "Isolated", "invert": false }, { "type": "ByteLevel", "add_prefix_space": false, "trim_offsets": true, "use_regex": false } ] }, "post_processor": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": false, "use_regex": true }, "decoder": { "type": "ByteLevel", "add_prefix_space": true, "trim_offsets": true, "use_regex": true }, "model": { "type": "BPE", "dropout": null, "unk_token": null, "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "ignore_merges": false, "vocab": { "!": 0, "*": 1, "+": 2, ",": 3, "-": 4, ".": 5, "/": 6, "0": 7, "1": 8, "2": 9, "3": 10, "4": 11, "5": 12, "6": 13, "7": 14, "8": 15, "9": 16, ":": 17, ";": 18, "=": 19, "?": 20, "A": 21, "B": 22, "C": 23, "D": 24, "E": 25, "F": 26, "G": 27, "H": 28, "I": 29, "J": 30, "K": 31, "L": 32, "M": 33, "N": 34, "O": 35, "P": 36, "Q": 37, "R": 38, "S": 39, "T": 40, "U": 41, "V": 42, "W": 43, "X": 44, "Y": 45, "Z": 46, "a": 47, "b": 48, "c": 49, "d": 50, "e": 51, "f": 52, "g": 53, "h": 54, "i": 55, "j": 56, "k": 57, "l": 58, "m": 59, "n": 60, "o": 61, "p": 62, "q": 63, "r": 64, "s": 65, "t": 66, "u": 67, "v": 68, "w": 69, "x": 70, "y": 71, "z": 72, "|": 73, "Ċ": 74 }, "merges": [] } }