| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import json |
| | import os |
| | import shutil |
| | import sys |
| | import tempfile |
| | import unittest |
| | from pathlib import Path |
| |
|
| | import pytest |
| |
|
| | import transformers |
| | from transformers import ( |
| | AutoTokenizer, |
| | BertConfig, |
| | BertTokenizer, |
| | BertTokenizerFast, |
| | CTRLTokenizer, |
| | GPT2Tokenizer, |
| | GPT2TokenizerFast, |
| | PreTrainedTokenizerFast, |
| | RobertaTokenizer, |
| | RobertaTokenizerFast, |
| | is_tokenizers_available, |
| | ) |
| | from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig |
| | from transformers.models.auto.tokenization_auto import ( |
| | TOKENIZER_MAPPING, |
| | get_tokenizer_config, |
| | tokenizer_class_from_name, |
| | ) |
| | from transformers.models.roberta.configuration_roberta import RobertaConfig |
| | from transformers.testing_utils import ( |
| | DUMMY_DIFF_TOKENIZER_IDENTIFIER, |
| | DUMMY_UNKNOWN_IDENTIFIER, |
| | SMALL_MODEL_IDENTIFIER, |
| | RequestCounter, |
| | is_flaky, |
| | require_tokenizers, |
| | slow, |
| | ) |
| |
|
| |
|
| | sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) |
| |
|
| | from test_module.custom_configuration import CustomConfig |
| | from test_module.custom_tokenization import CustomTokenizer |
| |
|
| |
|
| | if is_tokenizers_available(): |
| | from test_module.custom_tokenization_fast import CustomTokenizerFast |
| |
|
| |
|
class AutoTokenizerTest(unittest.TestCase):
    """Tests for ``AutoTokenizer`` resolution, registration and remote-code loading."""

    def setUp(self):
        # Disable the interactive confirmation countdown for remote code so
        # tests that use trust_remote_code do not block waiting for user input.
        transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
| |
|
    @slow
    def test_tokenizer_from_pretrained(self):
        """AutoTokenizer resolves real Hub checkpoints to the expected classes."""
        # BERT checkpoints must map to BertTokenizer(Fast) with a non-empty vocab.
        for model_name in {"google-bert/bert-base-uncased", "google-bert/bert-base-cased"}:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
            self.assertGreater(len(tokenizer), 0)

        # GPT-2 checkpoints must map to GPT2Tokenizer(Fast).
        for model_name in ["openai-community/gpt2", "openai-community/gpt2-medium"]:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast))
            self.assertGreater(len(tokenizer), 0)
| |
|
    def test_tokenizer_from_pretrained_identifier(self):
        # A tiny test checkpoint resolves to BERT with its 12-token vocab.
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)
| |
|
    def test_tokenizer_from_model_type(self):
        # For a checkpoint with no declared tokenizer class, the tokenizer is
        # inferred from the config's model type (here a RoBERTa checkpoint).
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
        self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 20)
| |
|
    def test_tokenizer_from_tokenizer_class(self):
        # This checkpoint has a RoBERTa config but declares a BERT tokenizer
        # class; the explicitly declared tokenizer class must win.
        config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
        self.assertIsInstance(config, RobertaConfig)
        # Passing the config explicitly must not override the declared tokenizer class.
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)
| |
|
    def test_tokenizer_from_type(self):
        """``tokenizer_type=`` selects the slow tokenizer class directly from
        local vocab files, with no config lookup needed."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            # BERT only needs a vocab.txt.
            shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert", use_fast=False)
            self.assertIsInstance(tokenizer, BertTokenizer)

        with tempfile.TemporaryDirectory() as tmp_dir:
            # GPT-2 needs both a vocab.json and a merges.txt.
            shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
            shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2", use_fast=False)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
| |
|
    @require_tokenizers
    def test_tokenizer_from_type_fast(self):
        """Same as ``test_tokenizer_from_type`` but for the fast (default) classes."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            # BERT only needs a vocab.txt.
            shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert")
            self.assertIsInstance(tokenizer, BertTokenizerFast)

        with tempfile.TemporaryDirectory() as tmp_dir:
            # GPT-2 needs both a vocab.json and a merges.txt.
            shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
            shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))

            tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2")
            self.assertIsInstance(tokenizer, GPT2TokenizerFast)
| |
|
| | def test_tokenizer_from_type_incorrect_name(self): |
| | with pytest.raises(ValueError): |
| | AutoTokenizer.from_pretrained("./", tokenizer_type="xxx") |
| |
|
    @require_tokenizers
    def test_tokenizer_identifier_with_correct_config(self):
        # All three entry points must honor the checkpoint's tokenizer config
        # (do_lower_case=False, model_max_length=512).
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))

            # Slow tokenizers expose the flag on the basic tokenizer; fast ones directly.
            if isinstance(tokenizer, BertTokenizer):
                self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
            else:
                self.assertEqual(tokenizer.do_lower_case, False)

            self.assertEqual(tokenizer.model_max_length, 512)
| |
|
    @require_tokenizers
    @is_flaky()  # depends on live Hub error responses, which can be transient
    def test_tokenizer_identifier_non_existent(self):
        # A non-existent repo id must raise an EnvironmentError with a helpful message.
        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
            with self.assertRaisesRegex(
                EnvironmentError,
                "julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier",
            ):
                _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
| |
|
| | def test_model_name_edge_cases_in_mappings(self): |
| | |
| | |
| | |
| | tokenizers = TOKENIZER_MAPPING.values() |
| | tokenizer_names = [] |
| |
|
| | for slow_tok, fast_tok in tokenizers: |
| | if slow_tok is not None: |
| | tokenizer_names.append(slow_tok.__name__) |
| |
|
| | if fast_tok is not None: |
| | tokenizer_names.append(fast_tok.__name__) |
| |
|
| | for tokenizer_name in tokenizer_names: |
| | |
| | tokenizer_class_from_name(tokenizer_name) |
| |
|
    @require_tokenizers
    def test_from_pretrained_use_fast_toggle(self):
        # use_fast=False forces the slow tokenizer; the default yields the fast one.
        self.assertIsInstance(
            AutoTokenizer.from_pretrained("google-bert/bert-base-cased", use_fast=False), BertTokenizer
        )
        self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)
| |
|
    @require_tokenizers
    def test_do_lower_case(self):
        # With lower-casing disabled on an uncased checkpoint, cased words fall
        # outside the vocab and tokenize to [UNK].
        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)
        sample = "Hello, world. How are you?"
        tokens = tokenizer.tokenize(sample)
        self.assertEqual("[UNK]", tokens[0])

        # Same behavior through a non-BERT (MPNet) checkpoint.
        tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base", do_lower_case=False)
        tokens = tokenizer.tokenize(sample)
        self.assertEqual("[UNK]", tokens[0])
| |
|
    @require_tokenizers
    def test_PreTrainedTokenizerFast_from_pretrained(self):
        # A repo exposing only a tokenizer.json (no model-specific class) loads
        # as the generic PreTrainedTokenizerFast with its config values applied.
        tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
        self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)
        self.assertEqual(tokenizer.model_max_length, 512)
        self.assertEqual(tokenizer.vocab_size, 30000)
        self.assertEqual(tokenizer.unk_token, "[UNK]")
        self.assertEqual(tokenizer.padding_side, "right")
        self.assertEqual(tokenizer.truncation_side, "right")
| |
|
    def test_auto_tokenizer_from_local_folder(self):
        # Round-trip: save_pretrained then from_pretrained on a local folder
        # must restore the same tokenizer class and vocab.
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)

        self.assertIsInstance(tokenizer2, tokenizer.__class__)
        self.assertEqual(tokenizer2.vocab_size, 12)
| |
|
    def test_auto_tokenizer_fast_no_slow(self):
        # CTRL has no fast tokenizer implementation, so the slow class is
        # returned even though fast is the default.
        tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        self.assertIsInstance(tokenizer, CTRLTokenizer)
| |
|
    def test_get_tokenizer_config(self):
        # A remote checkpoint with a tokenizer_config.json; the commit-hash
        # metadata injected at download time is stripped before comparison.
        config = get_tokenizer_config("google-bert/bert-base-cased")
        _ = config.pop("_commit_hash", None)
        # NOTE(review): if this assert fails, the config on the Hub may have changed.
        self.assertEqual(config, {"do_lower_case": False, "model_max_length": 512})

        # A checkpoint without a tokenizer_config.json yields an empty dict.
        config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
        self.assertDictEqual(config, {})

        # Saving a tokenizer writes a tokenizer_config.json we can read back.
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            config = get_tokenizer_config(tmp_dir)

        # The saved config records the concrete tokenizer class.
        self.assertEqual(config["tokenizer_class"], "BertTokenizer")
| |
|
    def test_new_tokenizer_registration(self):
        """Custom (config, slow tokenizer) pairs can be registered with the Auto classes."""
        try:
            AutoConfig.register("custom", CustomConfig)

            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            # Trying to register something that already exists in the mapping must fail.
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

            # A saved custom tokenizer is resolved again through AutoTokenizer.
            tokenizer = CustomTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
                self.assertIsInstance(new_tokenizer, CustomTokenizer)

        finally:
            # Clean up the global registries so other tests are unaffected.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
| |
|
    @require_tokenizers
    def test_new_tokenizer_fast_registration(self):
        """Slow and fast tokenizer classes can be registered separately or together."""
        try:
            AutoConfig.register("custom", CustomConfig)

            # Registering the slow then the fast class fills the mapping incrementally.
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None))
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))

            del TOKENIZER_MAPPING._extra_content[CustomConfig]
            # Registering both classes in a single call works too.
            AutoTokenizer.register(
                CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
            )
            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))

            # Trying to register something that already exists in the mapping must fail.
            with self.assertRaises(ValueError):
                AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

            # Build a custom fast tokenizer from an existing BERT checkpoint,
            # then check that saving and reloading round-trips both classes.
            with tempfile.TemporaryDirectory() as tmp_dir:
                bert_tokenizer = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
                bert_tokenizer.save_pretrained(tmp_dir)
                tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)

            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
                self.assertIsInstance(new_tokenizer, CustomTokenizerFast)

                new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False)
                self.assertIsInstance(new_tokenizer, CustomTokenizer)

        finally:
            # Clean up the global registries so other tests are unaffected.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
| |
|
    def test_from_pretrained_dynamic_tokenizer(self):
        """A tokenizer defined only in remote (Hub-hosted) code loads with
        ``trust_remote_code=True`` and is refused otherwise."""
        # Loading remote code without explicitly opting in must be refused.
        with self.assertRaises(ValueError):
            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
        # Explicitly refusing remote code behaves the same way.
        with self.assertRaises(ValueError):
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
            )

        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True)
        self.assertTrue(tokenizer.special_attribute_present)

        # Loading the same repo twice reuses the same dynamically built class.
        reloaded_tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
        )
        self.assertIs(tokenizer.__class__, reloaded_tokenizer.__class__)

        # Saving locally and reloading keeps the remote-code marker attribute.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True)
        self.assertTrue(reloaded_tokenizer.special_attribute_present)

        if is_tokenizers_available():
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")

            # The slow counterpart can also be loaded and round-tripped.
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
            )
            self.assertTrue(tokenizer.special_attribute_present)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer.save_pretrained(tmp_dir)
                reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True, use_fast=False)
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
            self.assertTrue(reloaded_tokenizer.special_attribute_present)
        else:
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")

        # The module file is cached in the snapshot directory and unchanged by
        # the dump to a temp dir, so the dynamic module (and hence the class
        # object) is loaded only once when the file's revision is unchanged.
        self.assertIs(tokenizer.__class__, reloaded_tokenizer.__class__)

        # force_download re-fetches the module, producing a fresh class object.
        reloaded_tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, force_download=True
        )
        self.assertIsNot(tokenizer.__class__, reloaded_tokenizer.__class__)
        self.assertTrue(reloaded_tokenizer.special_attribute_present)
| |
|
    @require_tokenizers
    def test_from_pretrained_dynamic_tokenizer_conflict(self):
        """When a local registration and Hub remote code both match, the local
        classes win unless ``trust_remote_code=True`` is passed explicitly."""

        class NewTokenizer(BertTokenizer):
            # Marker distinguishing the local class from the remote-code one.
            special_attribute_present = False

        class NewTokenizerFast(BertTokenizerFast):
            slow_tokenizer_class = NewTokenizer
            special_attribute_present = False

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=NewTokenizerFast)
            # If remote code is not set, the default is the local classes.
            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertFalse(tokenizer.special_attribute_present)
            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", use_fast=False)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertFalse(tokenizer.special_attribute_present)

            # If remote code is disabled, the local classes are used.
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertFalse(tokenizer.special_attribute_present)
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False, use_fast=False
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertFalse(tokenizer.special_attribute_present)

            # With trust_remote_code=True the Hub-hosted classes win.
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
            self.assertTrue(tokenizer.special_attribute_present)
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
            )
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
            self.assertTrue(tokenizer.special_attribute_present)

        finally:
            # Clean up the global registries so other tests are unaffected.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
| |
|
    def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
        # Repos using the legacy (single-entry) auto_map format must still load.
        tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True
        )
        self.assertTrue(tokenizer.special_attribute_present)
        if is_tokenizers_available():
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")

            # The slow counterpart can also be loaded from the legacy format.
            tokenizer = AutoTokenizer.from_pretrained(
                "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True, use_fast=False
            )
            self.assertTrue(tokenizer.special_attribute_present)
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
        else:
            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
| |
|
    def test_repo_not_found(self):
        # A non-existent repo name raises an EnvironmentError with a helpful message.
        with self.assertRaisesRegex(
            EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
        ):
            _ = AutoTokenizer.from_pretrained("bert-base")
| |
|
    def test_revision_not_found(self):
        # An invalid revision on an existing repo raises an EnvironmentError.
        with self.assertRaisesRegex(
            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        ):
            _ = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
| |
|
    @unittest.skip("This test is failing on main")
    def test_cached_tokenizer_has_minimum_calls_to_head(self):
        # Once cached, reloading a tokenizer should make a single HEAD call
        # (the ETag freshness check) and no GET requests.
        _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
        with RequestCounter() as counter:
            _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
            self.assertEqual(counter["GET"], 0)
            self.assertEqual(counter["HEAD"], 1)
            self.assertEqual(counter.total_calls, 1)
| |
|
| | def test_init_tokenizer_with_trust(self): |
| | nop_tokenizer_code = """ |
| | import transformers |
| | |
| | class NopTokenizer(transformers.PreTrainedTokenizer): |
| | def get_vocab(self): |
| | return {} |
| | """ |
| |
|
| | nop_config_code = """ |
| | from transformers import PretrainedConfig |
| | |
| | class NopConfig(PretrainedConfig): |
| | model_type = "test_unregistered_dynamic" |
| | |
| | def __init__(self, **kwargs): |
| | super().__init__(**kwargs) |
| | """ |
| |
|
| | with tempfile.TemporaryDirectory() as tmp_dir: |
| | fake_model_id = "hf-internal-testing/test_unregistered_dynamic" |
| | fake_repo = os.path.join(tmp_dir, fake_model_id) |
| | os.makedirs(fake_repo) |
| |
|
| | tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py") |
| | with open(tokenizer_src_file, "w") as wfp: |
| | wfp.write(nop_tokenizer_code) |
| |
|
| | model_config_src_file = os.path.join(fake_repo, "config.py") |
| | with open(model_config_src_file, "w") as wfp: |
| | wfp.write(nop_config_code) |
| |
|
| | config = { |
| | "model_type": "test_unregistered_dynamic", |
| | "auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"}, |
| | } |
| |
|
| | config_file = os.path.join(fake_repo, "config.json") |
| | with open(config_file, "w") as wfp: |
| | json.dump(config, wfp, indent=2) |
| |
|
| | tokenizer_config = { |
| | "auto_map": { |
| | "AutoTokenizer": [ |
| | f"{fake_model_id}--tokenizer.NopTokenizer", |
| | None, |
| | ] |
| | } |
| | } |
| |
|
| | tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json") |
| | with open(tokenizer_config_file, "w") as wfp: |
| | json.dump(tokenizer_config, wfp, indent=2) |
| |
|
| | prev_dir = os.getcwd() |
| | try: |
| | |
| | os.chdir(tmp_dir) |
| |
|
| | |
| | _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True) |
| | try: |
| | |
| | _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False) |
| | self.fail("AutoTokenizer.from_pretrained with trust_remote_code=False should raise ValueException") |
| | except ValueError: |
| | pass |
| | finally: |
| | os.chdir(prev_dir) |
| |
|