| import json | |
class Tokenizer:
    """Map between tokens and integer ids using a fixed vocabulary.

    ``encode`` looks up each item yielded by iterating the input (each
    character, when the input is a ``str``) in the vocabulary; ``decode``
    performs the reverse lookup and concatenates the results.
    """

    def __init__(self, vocab):
        """Build the forward and reverse lookup tables.

        Args:
            vocab: Mapping from token to id. It is stored by reference (not
                copied) as ``token_to_id``. If several tokens share an id,
                the reverse table keeps the one that comes last in
                iteration order.
        """
        self.token_to_id = vocab
        self.id_to_token = {v: k for k, v in vocab.items()}

    @classmethod
    def load(cls, path):
        """Create a tokenizer from a JSON vocabulary file.

        This is an alternate constructor, so it must be a classmethod:
        without the decorator ``Tokenizer.load(path)`` fails with a
        ``TypeError`` because ``path`` is bound to ``cls``.

        Args:
            path: Path to a UTF-8 encoded JSON file whose top-level object
                has a ``"token_to_id"`` entry holding the vocabulary mapping.

        Returns:
            A new instance of ``cls`` built from the stored vocabulary.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the ``"token_to_id"`` entry is missing.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls(data["token_to_id"])

    def encode(self, text):
        """Return the list of ids for each item in ``text``.

        Items that are not in the vocabulary are encoded as id ``0``.
        """
        return [self.token_to_id.get(c, 0) for c in text]

    def decode(self, tokens):
        """Return the string formed by joining the tokens for ``tokens``.

        Ids that are not in the reverse table contribute an empty string,
        i.e. they are silently dropped from the output.
        """
        return "".join(self.id_to_token.get(t, "") for t in tokens)