| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | """Utils for tokenization."""
|
| |
|
| | import warnings
|
| |
|
| | __all__ = ["hf_tokenizer", "hf_processor"]
|
| |
|
| |
|
def set_pad_token_id(tokenizer):
    """Fill in the tokenizer's pad token from its eos token when unset.

    Both the string-valued ``pad_token`` and the integer ``pad_token_id`` are
    checked independently; each one that is ``None`` is replaced by the
    corresponding eos value, with a warning so the substitution is visible.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to be set.

    """
    # The two attributes are defaulted independently: a tokenizer may have
    # one populated but not the other.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        warnings.warn(f"tokenizer.pad_token is None. Now set to {tokenizer.eos_token}", stacklevel=1)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        warnings.warn(f"tokenizer.pad_token_id is None. Now set to {tokenizer.eos_token_id}", stacklevel=1)
|
| |
|
| |
|
def hf_tokenizer(name_or_path, correct_pad_token=True, correct_gemma2=True, **kwargs):
    """Create a huggingface pretrained tokenizer which correctly handles eos and pad tokens.

    Args:
        name_or_path (str): The name or path of the tokenizer.
        correct_pad_token (bool): Whether to correct the pad token id.
        correct_gemma2 (bool): Whether to correct the gemma2 tokenizer.
        **kwargs: Extra keyword arguments forwarded to ``AutoTokenizer.from_pretrained``.

    Returns:
        transformers.PreTrainedTokenizer: The pretrained tokenizer.

    """
    # Local import keeps transformers optional until a tokenizer is actually requested.
    from transformers import AutoTokenizer

    if correct_gemma2 and isinstance(name_or_path, str) and "gemma-2-2b-it" in name_or_path:
        # Pin eos to <end_of_turn> (id 107) for gemma-2-2b-it so generation
        # stops at the end of a chat turn, overriding any caller-supplied value.
        warnings.warn("Found gemma-2-2b-it tokenizer. Set eos_token and eos_token_id to <end_of_turn> and 107.", stacklevel=1)
        kwargs["eos_token"] = "<end_of_turn>"
        kwargs["eos_token_id"] = 107
    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
    if correct_pad_token:
        set_pad_token_id(tokenizer)
    return tokenizer
|
| |
|
| |
|
def hf_processor(name_or_path, **kwargs):
    """Create a huggingface processor to process multimodal data.

    Args:
        name_or_path (str): The name of the processor.
        **kwargs: Extra keyword arguments forwarded to ``AutoProcessor.from_pretrained``.

    Returns:
        transformers.ProcessorMixin or None: The pretrained processor, or ``None``
        when no multimodal processor is available for this model.

    """
    from transformers import AutoProcessor

    try:
        processor = AutoProcessor.from_pretrained(name_or_path, **kwargs)
    except Exception as e:
        # Best-effort: text-only models legitimately have no processor.
        # Warn (rather than swallow silently) so genuine loading failures
        # remain visible to the caller.
        warnings.warn(f"Failed to create processor: {e}. This may affect multimodal processing", stacklevel=1)
        processor = None
    # AutoProcessor may fall back to returning a plain tokenizer for
    # text-only models; treat anything whose class name lacks "Processor"
    # as "no processor" so callers get a consistent None.
    if processor is not None and "Processor" not in processor.__class__.__name__:
        processor = None
    return processor
|
| |
|