File size: 509 Bytes
import json
class Tokenizer:
    """Character-level tokenizer backed by a token-to-id mapping.

    Text is encoded one character at a time by looking each character up in
    ``token_to_id``; ids are decoded through the inverse mapping
    ``id_to_token``.

    Attributes:
        token_to_id: The mapping passed to the constructor (stored by
            reference, not copied).
        id_to_token: Inverse of ``token_to_id``, built once at construction.
            If several tokens share an id, the last one iterated wins.
        unk_id: Id emitted by ``encode`` for characters missing from
            ``token_to_id``. Defaults to 0.
    """

    # Class-level fallback so instances created without running __init__
    # (or by subclasses that do not call it) still encode unknowns as 0.
    unk_id = 0

    def __init__(self, vocab, unk_id=0):
        """Build a tokenizer from a token-to-id mapping.

        Args:
            vocab: Mapping from token (single character) to id.
            unk_id: Id to emit for characters that are not in ``vocab``.
        """
        self.token_to_id = vocab
        self.id_to_token = {v: k for k, v in vocab.items()}
        self.unk_id = unk_id

    @classmethod
    def load(cls, path, unk_id=0):
        """Create a tokenizer from a JSON file.

        The file must contain an object with a ``"token_to_id"`` key holding
        the vocabulary mapping.

        Args:
            path: Path of the UTF-8 encoded JSON file.
            unk_id: Id to emit for characters that are not in the vocabulary.

        Returns:
            A new instance of ``cls``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the ``"token_to_id"`` key is missing.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        tokenizer = cls(data["token_to_id"])
        # Assigned after construction (rather than passed to cls) so that
        # subclasses keeping a one-argument constructor can still be loaded.
        tokenizer.unk_id = unk_id
        return tokenizer

    def encode(self, text):
        """Return the list of ids for each character of ``text``.

        Characters missing from the vocabulary map to ``unk_id``.
        """
        unk_id = self.unk_id
        return [self.token_to_id.get(c, unk_id) for c in text]

    def decode(self, tokens):
        """Return the string formed by the tokens for the ids in ``tokens``.

        Ids that are not in the vocabulary are skipped (they contribute an
        empty string).
        """
        return "".join(self.id_to_token.get(t, "") for t in tokens)