| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import unittest |
| |
|
| | from transformers.models.cpm.tokenization_cpm import CpmTokenizer |
| | from transformers.testing_utils import custom_tokenizers |
| |
|
| |
|
@custom_tokenizers
class CpmTokenizationTest(unittest.TestCase):
    """Round-trip checks for ``CpmTokenizer`` on a mixed English/Chinese sentence.

    The tokenizer is loaded from the ``TsinghuaAI/CPM-Generate`` checkpoint.
    NOTE(review): ``custom_tokenizers`` is imported from
    ``transformers.testing_utils``; its gating behaviour is not visible in this
    file -- presumably it restricts the class to custom-tokenizer test runs.
    """

    def is_pipeline_test_to_skip(
        self,
        pipeline_test_case_name,
        config_class,
        model_architecture,
        tokenizer_name,
        image_processor_name,
        feature_extractor_name,
        processor_name,
    ):
        """Return ``True`` unconditionally; none of the arguments are inspected.

        NOTE(review): the name suggests this is a hook consulted by the shared
        pipeline test machinery -- confirm against the caller.
        """
        return True

    def test_pre_tokenization(self):
        """Check tokenize -> convert_tokens_to_ids -> decode against fixed values."""
        cpm_tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")

        raw_text = "Hugging Face大法好,谁用谁知道。"
        # Decoding is expected to reproduce the sentence followed by the
        # unknown-token marker that is appended to the id sequence below.
        expected_decoded = "Hugging Face大法好,谁用谁知道。<unk>"
        # Expected pieces, written space-separated for readability.
        expected_pieces = "▁Hu gg ing ▁ ▂ ▁F ace ▁大法 ▁好 ▁ , ▁谁 ▁用 ▁谁 ▁知 道 ▁ 。".split()

        # Step 1: text -> pieces.
        pieces = cpm_tokenizer.tokenize(raw_text)
        self.assertListEqual(pieces, expected_pieces)

        # Step 2: pieces (plus the unknown token) -> vocabulary ids.
        # The trailing 0 is the id expected for the unknown token.
        pieces_with_unk = [*pieces, cpm_tokenizer.unk_token]
        expected_ids = [
            13789, 13283, 1421, 8, 10, 1164, 13608, 16528, 63, 8,
            9, 440, 108, 440, 121, 90, 8, 12, 0,
        ]
        self.assertListEqual(cpm_tokenizer.convert_tokens_to_ids(pieces_with_unk), expected_ids)

        # Step 3: ids -> text.
        self.assertEqual(cpm_tokenizer.decode(expected_ids), expected_decoded)
| |
|