File size: 509 Bytes
f724f93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import json

class Tokenizer:
    """Element-wise tokenizer driven by a token -> id lookup table.

    Two dictionaries are kept on the instance:

    * ``token_to_id`` -- the mapping passed to the constructor, stored by
      reference (it is not copied).
    * ``id_to_token`` -- the inverse mapping, computed once in ``__init__``.
    """

    def __init__(self, vocab):
        """Keep *vocab* and build the reverse (id -> token) table from it.

        Args:
            vocab: Mapping of token to id. If several tokens share one id,
                the reverse table keeps whichever token comes last in the
                mapping's iteration order.
        """
        self.token_to_id = vocab

        reverse = {}
        for token, token_id in vocab.items():
            reverse[token_id] = token
        self.id_to_token = reverse

    @classmethod
    def load(cls, path):
        """Create a tokenizer from a JSON file.

        The file is read as UTF-8 text and parsed as JSON; the value stored
        under the ``"token_to_id"`` key is handed to the constructor.
        Errors from opening, parsing, or the key lookup propagate unchanged.

        Args:
            path: Location of the JSON file to read.

        Returns:
            A new instance of ``cls``.
        """
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        vocab = payload["token_to_id"]
        return cls(vocab)

    def encode(self, text):
        """Translate each element of *text* into its id.

        *text* is iterated item by item (character by character for a
        ``str``). Items missing from ``token_to_id`` are encoded as ``0``.

        Returns:
            A list of ids, one per item of *text*.
        """
        ids = []
        for symbol in text:
            ids.append(self.token_to_id.get(symbol, 0))
        return ids

    def decode(self, tokens):
        """Turn a sequence of ids back into a single string.

        Ids missing from ``id_to_token`` contribute an empty string, i.e.
        they are dropped from the output.

        Returns:
            The concatenation of the tokens looked up for each id.
        """
        pieces = []
        for token_id in tokens:
            pieces.append(self.id_to_token.get(token_id, ""))
        return "".join(pieces)