import collections
import logging
import os
import re
from typing import List, Optional

import torch
from transformers import PreTrainedTokenizer
from SmilesPE.tokenizer import SPE_Tokenizer

logger = logging.getLogger(__name__)

# Default vocabulary file name used by save_vocabulary. The original module
# referenced VOCAB_FILES_NAMES without defining it; "vocab.txt" follows the
# usual Hugging Face convention and is an assumption here.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
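
# Example (illustrative file name and contents, not from the original source):
#   vocab = load_vocab("vocab.txt")   # one token per line
#   vocab["[CLS]"]                    # -> the token's line number, e.g. 2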


class Atomwise_Tokenizer(object):
    """Run atom-level SMILES tokenization."""

    def __init__(self):
        """Constructs an atom-level tokenizer."""
        # Note: the first alternative also captures short parenthesized
        # fragments such as "(=O)" as single tokens.
        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Basic tokenization of a SMILES string."""
        return self.regex.findall(text)
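
# Usage sketch (illustrative SMILES string, not from the original source):
#   >>> Atomwise_Tokenizer().tokenize("CC(=O)Oc1ccccc1C(=O)O")
#   ['C', 'C', '(=O)', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'C', '(=O)', 'O']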


class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a SMILES tokenizer based on SMILES Pair Encoding
    (https://github.com/XinhaoLi74/SmilesPE).
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer`, which contains most of the methods. Users
    should refer to the superclass for more information regarding those methods.
    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        spe_file (:obj:`string`):
            File containing the trained SMILES Pair Encoding vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(self, vocab_file, spe_file,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
        if not os.path.isfile(vocab_file):
            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
        if not os.path.isfile(spe_file):
            raise ValueError("Can't find an SPE vocabulary file at path '{}'.".format(spe_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # SPE_Tokenizer consumes the merges file in its constructor, so the
        # handle can be closed immediately instead of being left open.
        with open(spe_file, "r", encoding="utf-8") as spe_vocab:
            self.spe_tokenizer = SPE_Tokenizer(spe_vocab)

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        # SPE_Tokenizer.tokenize returns a space-separated string of tokens.
        return self.spe_tokenizer.tokenize(text).split(' ')

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an ID using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def encode(self, token_array):
        """Converts a pre-tokenized list of tokens into a batch of size 1,
        wrapping the sequence in [CLS]/[SEP] and pairing it with an all-ones
        attention mask. Note that this overrides
        ``PreTrainedTokenizer.encode`` with a different signature and return
        type."""
        token_ids = [self.cls_token_id]
        for token in token_array:
            token_ids.append(self._convert_token_to_id(token))
        token_ids.append(self.sep_token_id)
        token_ids = torch.tensor([token_ids])
        attn_mask = torch.ones_like(token_ids)
        return {'input_ids': token_ids, 'attention_mask': attn_mask}

    def decode(self, token_ids, skip_special_tokens=True):
        """Converts a 1 x seq_len tensor of IDs back into a SMILES string,
        stopping at the first [SEP]."""
        token_ids = token_ids.squeeze(0).cpu().tolist()
        token_array = []
        for idx in token_ids:
            if idx == self.sep_token_id:
                break
            if skip_special_tokens and idx in self.all_special_ids:
                continue
            token_array.append(self._convert_id_to_token(idx))
        return "".join(token_array)

    def batch_decode(self, batch_token_ids, skip_special_tokens=True):
        return [self.decode(token_ids, skip_special_tokens=skip_special_tokens)
                for token_ids in batch_token_ids]
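
    # Roundtrip sketch (``tok`` is a hypothetical instance of this class):
    #   batch = tok.encode(tok._tokenize("CC(=O)Oc1ccccc1C(=O)O"))
    #   tok.decode(batch["input_ids"])  # -> "CC(=O)Oc1ccccc1C(=O)O"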

    def get_token_split(self, token_ids):
        """Converts a batch of IDs (tensor or nested list) into per-sequence
        lists of tokens, keeping special tokens."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.cpu().tolist()

        token_array = []
        for seq_ids in token_ids:
            seq_array = [self._convert_id_to_token(idx) for idx in seq_ids]
            token_array.append(seq_array)

        return token_array

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens into a single SMILES string."""
        # SMILES tokens concatenate without separators; the WordPiece-style
        # " ##" handling inherited from BERT does not apply here.
        return "".join(tokens).strip()

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:
        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
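
    # Example (IDs are illustrative): with cls_token_id=2 and sep_token_id=3,
    # build_inputs_with_special_tokens([7, 8]) -> [2, 7, 8, 3].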

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.
        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        ::
            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |
        If ``token_ids_1`` is None, only the first portion of the mask (0s) is returned.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """
        Save the tokenizer vocabulary (a copy of the original file) and special tokens file to a directory.
        Args:
            vocab_path (:obj:`str`):
                The directory in which to save the vocabulary.
        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
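

# Usage sketch for SMILES_SPE_Tokenizer (hypothetical file names; a vocab
# file and a trained SPE merges file are required):
#   tok = SMILES_SPE_Tokenizer("vocab.txt", "spe_codes.txt")
#   batch = tok.encode(tok._tokenize("CC(=O)Oc1ccccc1C(=O)O"))
#   batch["input_ids"].shape  # torch.Size([1, seq_len]), [CLS]/[SEP] included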


class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs an atom-level SMILES tokenizer, using the regex-based atomwise
    tokenization from SmilesPE (https://github.com/XinhaoLi74/SmilesPE).
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer`, which contains most of the methods. Users
    should refer to the superclass for more information regarding those methods.
    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file)
            )
        # Load the vocabulary before calling the superclass constructor, which
        # may consult get_vocab() while registering special tokens (matching
        # the ordering used in SMILES_SPE_Tokenizer above).
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.tokenizer = Atomwise_Tokenizer()

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an ID using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens into a single SMILES string."""
        # SMILES tokens concatenate without separators; the WordPiece-style
        # " ##" handling inherited from BERT does not apply here.
        return "".join(tokens).strip()

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:
        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.
        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        ::
            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |
        If ``token_ids_1`` is None, only the first portion of the mask (0s) is returned.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """
        Save the tokenizer vocabulary (a copy of the original file) and special tokens file to a directory.
        Args:
            vocab_path (:obj:`str`):
                The directory in which to save the vocabulary.
        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
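

# Usage sketch for SMILES_Atomwise_Tokenizer (hypothetical vocab path):
#   tok = SMILES_Atomwise_Tokenizer("vocab.txt")
#   enc = tok("CC(=O)O")  # PreTrainedTokenizer.__call__ adds [CLS]/[SEP]
#   tok.convert_ids_to_tokens(enc["input_ids"])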