| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
import os
import unittest

from transformers import BertTokenizerFast
from transformers.models.bert.tokenization_bert import (
    VOCAB_FILES_NAMES,
    BasicTokenizer,
    BertTokenizer,
    WordpieceTokenizer,
    _is_control,
    _is_punctuation,
    _is_whitespace,
)
from transformers.testing_utils import require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
| |
|
| |
|
@require_tokenizers
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    """Test suite for the BERT tokenizers (slow/Python and fast/Rust).

    Covers:
      * end-to-end tokenization against a small hand-built vocabulary,
      * parity between the Python and Rust implementations,
      * the standalone ``BasicTokenizer`` and ``WordpieceTokenizer`` components,
      * the character-classification helpers (``_is_whitespace`` & co.),
      * offset mapping and Chinese-character handling of the fast tokenizer.
    """

    from_pretrained_id = "google-bert/bert-base-uncased"
    tokenizer_class = BertTokenizer
    rust_tokenizer_class = BertTokenizerFast
    test_rust_tokenizer = True
    space_between_special_tokens = True
    from_pretrained_filter = filter_non_english

    @classmethod
    def setUpClass(cls):
        """Write a minimal vocabulary file used by the local-tokenizer tests."""
        super().setUpClass()

        # Just enough tokens for the hand-written assertions below; the index of
        # each token is its id (e.g. "un" -> 9, "##want" -> 6).
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_input_output_texts(self, tokenizer):
        """Return an (input, expected decoded output) pair for round-trip tests."""
        input_text = "UNwant\u00e9d,running"
        output_text = "unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        """Tokenize with the local vocab: lower-casing, accent stripping, wordpieces."""
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize("UNwant\u00e9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

    def test_rust_and_python_full_tokenizers(self):
        """The slow and fast tokenizers must produce identical tokens and ids."""
        if not self.test_rust_tokenizer:
            self.skipTest(reason="test_rust_tokenizer is set to False")

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "UNwant\u00e9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

        # Same parity checks with lower-casing explicitly enabled.
        tokenizer = self.get_tokenizer(do_lower_case=True)
        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)

        sequence = "UNwant\u00e9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        # Fix: previously re-created without do_lower_case=True, unlike the
        # sibling branch above; behavior is unchanged (True is BERT's default).
        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    def test_chinese(self):
        """CJK characters are split into individual tokens."""
        tokenizer = BasicTokenizer()

        self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"])

    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"])

    def test_basic_tokenizer_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_default(self):
        # When strip_accents is unset, lower-casing implies accent stripping.
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])

    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_respects_never_split_tokens(self):
        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
        )

    def test_basic_tokenizer_splits_on_punctuation(self):
        tokenizer = BasicTokenizer()
        text = "a\n'll !!to?'d of, can't."
        expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
        self.assertListEqual(tokenizer.tokenize(text), expected)

    def test_wordpiece_tokenizer(self):
        """Greedy longest-match-first subword splitting; OOV pieces map to [UNK]."""
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

        vocab = {}
        for i, token in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])

        # A word with any un-matchable piece collapses entirely to [UNK].
        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])

    def test_is_whitespace(self):
        self.assertTrue(_is_whitespace(" "))
        self.assertTrue(_is_whitespace("\t"))
        self.assertTrue(_is_whitespace("\r"))
        self.assertTrue(_is_whitespace("\n"))
        self.assertTrue(_is_whitespace("\u00a0"))

        self.assertFalse(_is_whitespace("A"))
        self.assertFalse(_is_whitespace("-"))

    def test_is_control(self):
        self.assertTrue(_is_control("\u0005"))

        self.assertFalse(_is_control("A"))
        self.assertFalse(_is_control(" "))
        self.assertFalse(_is_control("\t"))
        self.assertFalse(_is_control("\r"))

    def test_is_punctuation(self):
        self.assertTrue(_is_punctuation("-"))
        self.assertTrue(_is_punctuation("$"))
        self.assertTrue(_is_punctuation("`"))
        self.assertTrue(_is_punctuation("."))

        self.assertFalse(_is_punctuation("A"))
        self.assertFalse(_is_punctuation(" "))

    def test_clean_text(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # Control/format characters such as soft hyphen (U+00AD) are removed.
        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])

        self.assertListEqual(
            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
        )

    @slow
    def test_sequence_builders(self):
        """Special-token templates: [CLS] A [SEP] and [CLS] A [SEP] B [SEP]."""
        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 101 = [CLS], 102 = [SEP] in the bert-base-uncased vocabulary.
        # (self.assertEqual rather than bare assert: asserts vanish under -O.)
        self.assertEqual(encoded_sentence, [101] + text + [102])
        self.assertEqual(encoded_pair, [101] + text + [102] + text_2 + [102])

    def test_offsets_with_special_characters(self):
        """Fast tokenizer returns character offsets even around accents/masks."""
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                tokens = tokenizer_r.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
                expected_results = (
                    [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "A"),
                        ((1, 2), ","),
                        ((3, 5), "na"),
                        ((5, 6), "##ï"),
                        ((6, 8), "##ve"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "Allen"),
                        ((21, 23), "##NL"),
                        ((23, 24), "##P"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                    if not do_lower_case
                    else [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "a"),
                        ((1, 2), ","),
                        ((3, 8), "naive"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "allen"),
                        ((21, 23), "##nl"),
                        ((23, 24), "##p"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                )

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])

    def test_change_tokenize_chinese_chars(self):
        """tokenize_chinese_chars toggles per-character splitting of CJK text."""
        list_of_commun_chinese_char = ["的", "人", "有"]
        text_with_chinese_char = "".join(list_of_commun_chinese_char)
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                kwargs["tokenize_chinese_chars"] = True
                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)

                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)

                # Each Chinese character becomes its own token, no "##" prefix.
                self.assertListEqual(tokens_without_spe_char_p, list_of_commun_chinese_char)
                self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)

                kwargs["tokenize_chinese_chars"] = False
                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)

                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)

                # Without CJK splitting the run is wordpiece-tokenized, so every
                # character after the first carries the "##" continuation prefix.
                expected_tokens = [
                    f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char)
                ]
                self.assertListEqual(tokens_without_spe_char_p, expected_tokens)
                self.assertListEqual(tokens_without_spe_char_r, expected_tokens)
|