MiniAxion1-0.9M / tokenizer.py
AxionLab-official's picture
Create tokenizer.py
f724f93 verified
raw
history blame contribute delete
509 Bytes
import json
class Tokenizer:
    """Character-level tokenizer backed by a token<->id vocabulary.

    Encodes a string one character at a time via ``token_to_id`` and
    decodes id sequences via the inverse mapping ``id_to_token``.
    """

    def __init__(self, vocab, unk_id=0):
        """Build the tokenizer from a token -> id mapping.

        Args:
            vocab: dict mapping token (single character) to int id.
            unk_id: id used for characters missing from the vocab
                (generalizes the previously hard-coded ``0``).
        """
        self.token_to_id = vocab
        # Inverse mapping for decoding; assumes ids are unique.
        self.id_to_token = {v: k for k, v in vocab.items()}
        self.unk_id = unk_id

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file.

        The file must be a JSON object with a ``"token_to_id"`` key
        holding the vocabulary mapping.

        Raises:
            KeyError: if the file has no ``"token_to_id"`` entry.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls(data["token_to_id"])

    def encode(self, text):
        """Return the list of ids for each character of ``text``.

        Characters absent from the vocabulary map to ``self.unk_id``.
        """
        return [self.token_to_id.get(c, self.unk_id) for c in text]

    def decode(self, tokens):
        """Return the string for a sequence of ids.

        Ids absent from the vocabulary are silently dropped
        (replaced by the empty string), matching the original behavior.
        """
        return "".join(self.id_to_token.get(t, "") for t in tokens)