import builtins
import contextlib
import copy
import functools
import time
import weakref
from collections import OrderedDict
from types import FunctionType, MethodType
from typing import Any, Callable, Dict, List, Optional, Tuple

from .utils import is_paddle_available, is_paddlenlp_available


def copy_func(f):
    "Copy a non-builtin function (NB `copy.copy` does not work for this)"
    if not isinstance(f, FunctionType):
        return copy.copy(f)
    fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__)
    fn.__kwdefaults__ = f.__kwdefaults__
    fn.__dict__.update(f.__dict__)
    fn.__annotations__.update(f.__annotations__)
    fn.__qualname__ = f.__qualname__
    return fn


def patch_to(cls, as_prop=False, cls_method=False):
    "Decorator: add `f` to `cls`"
    if not isinstance(cls, (tuple, list)):
        cls = (cls,)

    def _inner(f):
        for c_ in cls:
            nf = copy_func(f)
            nm = f.__name__
            # copy the standard wrapper attributes (__module__, __doc__, ...) onto the copy
            for o in functools.WRAPPER_ASSIGNMENTS:
                setattr(nf, o, getattr(f, o))
            nf.__qualname__ = f"{c_.__name__}.{nm}"
            if cls_method:
                setattr(c_, nm, MethodType(nf, c_))
            else:
                setattr(c_, nm, property(nf) if as_prop else nf)
        # return any pre-existing global/builtin of the same name so the module-level
        # symbol is not shadowed by the patched copy
        return globals().get(nm, builtins.__dict__.get(nm, None))

    return _inner
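

# Example (illustrative): `patch_to` copies the decorated function onto the target
# class, optionally as a property or class method. `SomeClass` and `describe` are
# hypothetical names used only for this sketch:
#
#     @patch_to(SomeClass)
#     def describe(self):
#         return self.__class__.__name__
#
#     SomeClass().describe()
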
if is_paddle_available():
    import paddle
    import paddle.nn as nn

    @contextlib.contextmanager
    def device_scope(device="cpu"):
        new_device = device.replace("cuda", "gpu")
        old_device = paddle.get_device()
        if str(new_device) == str(old_device):
            yield
        else:
            try:
                paddle.set_device(new_device)
                yield
            finally:
                paddle.set_device(old_device)

    paddle.device_scope = device_scope
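
    # Example (illustrative): run a block on the GPU and restore the previous device
    # afterwards (assumes a GPU build of Paddle):
    #
    #     with paddle.device_scope("cuda"):
    #         noise = paddle.randn([2, 3])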

    class RNGStatesTracker:
        """Tracks named CUDA RNG states so that seeded "generators" can be replayed."""

        def __init__(self):
            self.states_ = {}

        def reset(self):
            self.states_ = {}

        def remove(self, generator_name=None):
            if generator_name is not None:
                del self.states_[generator_name]

        def manual_seed(self, seed, generator_name=None):
            if generator_name is None:
                generator_name = str(time.time())
            if generator_name in self.states_:
                raise ValueError("state {} already exists".format(generator_name))
            orig_rng_state = paddle.get_cuda_rng_state()
            paddle.seed(seed)
            self.states_[generator_name] = paddle.get_cuda_rng_state()
            paddle.set_cuda_rng_state(orig_rng_state)
            return generator_name

        @contextlib.contextmanager
        def rng_state(self, generator_name=None):
            if generator_name is not None:
                if generator_name not in self.states_:
                    raise ValueError("state {} does not exist".format(generator_name))
                orig_cuda_rng_state = paddle.get_cuda_rng_state()
                paddle.set_cuda_rng_state(self.states_[generator_name])
                try:
                    yield
                finally:
                    self.states_[generator_name] = paddle.get_cuda_rng_state()
                    paddle.set_cuda_rng_state(orig_cuda_rng_state)
            else:
                yield

    RNG_STATE_TRACKER = RNGStatesTracker()

    def get_rng_state_tracker(*args, **kwargs):
        return RNG_STATE_TRACKER

    # expose a torch-like `paddle.Generator` that hands back the global tracker
    paddle.Generator = get_rng_state_tracker

    randn = paddle.randn

    def randn_pt(shape, dtype=None, name=None, **kwargs):
        # `generator` is a state name registered with the RNG state tracker
        generator = kwargs.get("generator", None)
        if generator is None:
            return randn(shape, dtype=dtype, name=name)
        else:
            with get_rng_state_tracker().rng_state(generator):
                return randn(shape, dtype=dtype, name=name)

    paddle.randn = randn_pt

    rand = paddle.rand

    def rand_pt(shape, dtype=None, name=None, **kwargs):
        generator = kwargs.get("generator", None)
        if generator is None:
            return rand(shape, dtype=dtype, name=name)
        else:
            with get_rng_state_tracker().rng_state(generator):
                return rand(shape, dtype=dtype, name=name)

    paddle.rand = rand_pt
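
    # Example (illustrative): torch-style seeded sampling through the tracker
    # (uses the CUDA RNG state, so a GPU build is assumed):
    #
    #     generator = paddle.Generator().manual_seed(42)
    #     noise = paddle.randn([2, 3], generator=generator)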

    @patch_to(nn.Layer)
    def get_sublayer(self, target: str):
        if target == "":
            return self

        atoms: List[str] = target.split(".")
        mod: nn.Layer = self

        for item in atoms:
            if not hasattr(mod, item):
                raise AttributeError(mod.__class__.__name__ + " has no attribute `" + item + "`")

            mod = getattr(mod, item)

            if not isinstance(mod, nn.Layer):
                raise AttributeError("`" + item + "` is not an nn.Layer")
        return mod
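
    # Example (illustrative): `get_sublayer` mirrors torch's `Module.get_submodule`,
    # resolving a dotted path; the path below is hypothetical:
    #
    #     model.get_sublayer("encoder.layers.0")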

    class _WrappedHook:
        def __init__(self, hook: Callable, module: Optional["nn.Layer"] = None):
            self.hook: Callable = hook
            functools.update_wrapper(self, hook)

            self.with_module: bool = False

            if module is not None:
                # keep only a weak reference so the hook does not keep the layer alive
                self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(module)
                self.with_module = True

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            if self.with_module:
                module = self.module()
                if module is None:
                    raise RuntimeError("You are trying to call the hook of a dead Module!")
                return self.hook(module, *args, **kwargs)
            return self.hook(*args, **kwargs)

        def __getstate__(self) -> Dict:
            result = {"hook": self.hook, "with_module": self.with_module}
            if self.with_module:
                result["module"] = self.module()

            return result

        def __setstate__(self, state: Dict):
            self.hook = state["hook"]
            self.with_module = state["with_module"]

            if self.with_module:
                if state["module"] is None:
                    raise RuntimeError("You are trying to revive the hook of a dead Module!")
                self.module = weakref.ref(state["module"])

    from paddle.fluid.dygraph.layers import HookRemoveHelper

    @patch_to(nn.Layer)
    def register_load_state_dict_pre_hook(self, hook, with_module=False):
        handle = HookRemoveHelper(self.load_state_dict_pre_hooks)
        self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(hook, self if with_module else None)
        return handle

    raw_set_state_dict = nn.Layer.set_state_dict

    @patch_to(nn.Layer)
    def set_state_dict(self, state_dict, use_structured_name: bool = True):
        # run registered pre-hooks before delegating to the original implementation
        for hook in self.load_state_dict_pre_hooks.values():
            hook(state_dict)
        return raw_set_state_dict(self, state_dict, use_structured_name=use_structured_name)

    nn.Layer.load_dict = nn.Layer.set_state_dict
    nn.Layer.set_dict = nn.Layer.set_state_dict

    raw_init = nn.Layer.__init__

    @patch_to(nn.Layer)
    def __init__(self, name_scope=None, dtype="float32"):
        raw_init(self, name_scope=name_scope, dtype=dtype)
        self.load_state_dict_pre_hooks = OrderedDict()
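
    # Example (illustrative): adjust a checkpoint in place before it is loaded; the
    # hook receives the state dict (or `(layer, state_dict)` with `with_module=True`).
    # `legacy_key` is a hypothetical key used only for this sketch:
    #
    #     layer.register_load_state_dict_pre_hook(
    #         lambda state_dict: state_dict.pop("legacy_key", None)
    #     )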


if is_paddle_available() and is_paddlenlp_available():
    import paddle

    import paddlenlp.transformers
    from paddlenlp.transformers import PretrainedModel

    @patch_to(PretrainedModel, as_prop=True)
    def dtype(self):
        try:
            return next(self.named_parameters())[1].dtype
        except StopIteration:
            return paddle.get_default_dtype()

    @patch_to(PretrainedModel, as_prop=True)
    def device(self):
        try:
            return next(self.named_parameters())[1].place
        except StopIteration:
            return paddle.get_device()
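
    # Example (illustrative): the patched properties mimic the transformers-style API:
    #
    #     text_encoder.dtype   # paddle dtype of the first named parameter
    #     text_encoder.device  # place of the first named parameter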

    try:
        from paddlenlp.transformers import XLMRobertaTokenizer
    except ImportError:
        # PaddleNLP installations without XLMRobertaTokenizer fall back to this local
        # SentencePiece-based port
        """Tokenization classes for XLM-RoBERTa model."""
        import os
        from shutil import copyfile

        import sentencepiece as spm

        from paddlenlp.transformers.tokenizer_utils import (
            AddedToken,
            PretrainedTokenizer,
        )
        from paddlenlp.utils.log import logger

        SPIECE_UNDERLINE = "▁"

        class XLMRobertaTokenizer(PretrainedTokenizer):

            resource_files_names = {"vocab_file": "sentencepiece.bpe.model"}
            pretrained_resource_files_map = {}
            pretrained_init_configuration = {}
            max_model_input_sizes = {
                "xlm-roberta-base": 512,
                "xlm-roberta-large": 512,
                "xlm-roberta-large-finetuned-conll02-dutch": 512,
                "xlm-roberta-large-finetuned-conll02-spanish": 512,
                "xlm-roberta-large-finetuned-conll03-english": 512,
                "xlm-roberta-large-finetuned-conll03-german": 512,
            }
            model_input_names = ["input_ids", "attention_mask"]

            def __init__(
                self,
                vocab_file,
                bos_token="<s>",
                eos_token="</s>",
                sep_token="</s>",
                cls_token="<s>",
                unk_token="<unk>",
                pad_token="<pad>",
                mask_token="<mask>",
                sp_model_kwargs: Optional[Dict[str, Any]] = None,
                **kwargs
            ) -> None:
                # the mask token behaves like a normal word, i.e. it includes the space before it
                mask_token = (
                    AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
                )

                self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

                super().__init__(
                    bos_token=bos_token,
                    eos_token=eos_token,
                    unk_token=unk_token,
                    sep_token=sep_token,
                    cls_token=cls_token,
                    pad_token=pad_token,
                    mask_token=mask_token,
                    sp_model_kwargs=self.sp_model_kwargs,
                    **kwargs,
                )

                self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                self.sp_model.Load(str(vocab_file))
                self.vocab_file = vocab_file

                # The original fairseq vocab and the spm vocab must be "aligned": fairseq
                # reserves ids 0-3 for "<s>", "<pad>", "</s>" and "<unk>", while the spm
                # vocab starts with "<unk>", "<s>", "</s>". Mimic the fairseq mapping for
                # the first four tokens and shift every spm id by a fixed offset.
                self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

                # every "real" spm piece is shifted by 1 relative to its fairseq position
                self.fairseq_offset = 1

                self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
                self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

            def __getstate__(self):
                state = self.__dict__.copy()
                state["sp_model"] = None
                state["sp_model_proto"] = self.sp_model.serialized_model_proto()
                return state

            def __setstate__(self, d):
                self.__dict__ = d

                # for backward compatibility with objects pickled before `sp_model_kwargs` existed
                if not hasattr(self, "sp_model_kwargs"):
                    self.sp_model_kwargs = {}

                self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

            def build_inputs_with_special_tokens(
                self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
            ) -> List[int]:
                """
                Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
                concatenating and adding special tokens. An XLM-RoBERTa sequence has the following format:
                - single sequence: `<s> X </s>`
                - pair of sequences: `<s> A </s></s> B </s>`
                Args:
                    token_ids_0 (`List[int]`):
                        List of IDs to which the special tokens will be added.
                    token_ids_1 (`List[int]`, *optional*):
                        Optional second list of IDs for sequence pairs.
                Returns:
                    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
                """

                if token_ids_1 is None:
                    return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
                cls = [self.cls_token_id]
                sep = [self.sep_token_id]
                return cls + token_ids_0 + sep + sep + token_ids_1 + sep
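
            # Example (illustrative), assuming cls_token_id == 0 and sep_token_id == 2:
            #
            #     build_inputs_with_special_tokens([10, 11])        -> [0, 10, 11, 2]
            #     build_inputs_with_special_tokens([10, 11], [12])  -> [0, 10, 11, 2, 2, 12, 2]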

            def get_special_tokens_mask(
                self,
                token_ids_0: List[int],
                token_ids_1: Optional[List[int]] = None,
                already_has_special_tokens: bool = False,
            ) -> List[int]:
                """
                Retrieve sequence ids from a token list that has no special tokens added. This method is called when
                adding special tokens using the tokenizer `prepare_for_model` method.
                Args:
                    token_ids_0 (`List[int]`):
                        List of IDs.
                    token_ids_1 (`List[int]`, *optional*):
                        Optional second list of IDs for sequence pairs.
                    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                        Whether or not the token list is already formatted with special tokens for the model.
                Returns:
                    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
                """

                if already_has_special_tokens:
                    return super().get_special_tokens_mask(
                        token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
                    )

                if token_ids_1 is None:
                    return [1] + ([0] * len(token_ids_0)) + [1]
                return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

            def create_token_type_ids_from_sequences(
                self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
            ) -> List[int]:
                """
                Create a mask from the two sequences passed to be used in a sequence-pair classification task.
                XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
                Args:
                    token_ids_0 (`List[int]`):
                        List of IDs.
                    token_ids_1 (`List[int]`, *optional*):
                        Optional second list of IDs for sequence pairs.
                Returns:
                    `List[int]`: List of zeros.
                """

                sep = [self.sep_token_id]
                cls = [self.cls_token_id]

                if token_ids_1 is None:
                    return len(cls + token_ids_0 + sep) * [0]
                return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

            @property
            def vocab_size(self):
                # +1 for the <mask> token appended after the spm vocabulary
                return len(self.sp_model) + self.fairseq_offset + 1

            def get_vocab(self):
                vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
                vocab.update(self.added_tokens_encoder)
                return vocab

            def _tokenize(self, text: str) -> List[str]:
                return self.sp_model.encode(text, out_type=str)

            def _convert_token_to_id(self, token):
                """Converts a token (str) into an id using the vocab."""
                if token in self.fairseq_tokens_to_ids:
                    return self.fairseq_tokens_to_ids[token]
                spm_id = self.sp_model.PieceToId(token)

                # the SP model returns 0 for unknown pieces; return the unk token id in that case
                return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

            def _convert_id_to_token(self, index):
                """Converts an index (integer) into a token (str) using the vocab."""
                if index in self.fairseq_ids_to_tokens:
                    return self.fairseq_ids_to_tokens[index]
                return self.sp_model.IdToPiece(index - self.fairseq_offset)

            def convert_tokens_to_string(self, tokens):
                """Converts a sequence of tokens (strings for sub-words) into a single string."""
                out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
                return out_string

            def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
                if not os.path.isdir(save_directory):
                    logger.error(f"Vocabulary path ({save_directory}) should be a directory")
                    return
                out_vocab_file = os.path.join(
                    save_directory,
                    (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
                )

                if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(
                    self.vocab_file
                ):
                    copyfile(self.vocab_file, out_vocab_file)
                elif not os.path.isfile(self.vocab_file):
                    with open(out_vocab_file, "wb") as fi:
                        content_spiece_model = self.sp_model.serialized_model_proto()
                        fi.write(content_spiece_model)

                return (out_vocab_file,)

        paddlenlp.transformers.XLMRobertaTokenizer = XLMRobertaTokenizer

    # patch `BertModel.forward` so an attention mask is always passed explicitly
    from paddlenlp.transformers import BertModel

    raw_forward = BertModel.forward

    @patch_to(BertModel)
    def forward(
        self,
        input_ids: paddle.Tensor,
        token_type_ids: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # default to an all-ones mask (attend to every token) when none is provided
        if attention_mask is None:
            attention_mask = paddle.ones_like(input_ids)
        return raw_forward(
            self,
            input_ids,
            token_type_ids,
            position_ids,
            attention_mask,
            past_key_values,
            use_cache,
            output_hidden_states,
            output_attentions,
            return_dict,
        )