// tokenizer.h — minimal Qwen3 tokenizer.
//
// M2-phase1: decode() is native C++ (simple vocab lookup). encode() is a Python subprocess
// (one-time cost at prompt setup). Native BPE encode is a future item.
//
#pragma once

#include <string>
#include <utility>
#include <vector>
// Minimal tokenizer for Qwen3 models.
//
// Decoding (id -> text) is a native, in-process vocab lookup. Encoding
// (text -> ids) shells out to Python because Qwen3 needs full BPE; the
// subprocess cost (~200ms) is paid once per prompt, not per token.
class Tokenizer {
 public:
  // Loads the id -> raw-bytes vocabulary table from `vocab_bin_path`.
  // Returns true on success.
  bool load(const std::string& vocab_bin_path);

  // Returns the UTF-8 bytes for one token id.
  std::string decode(int token_id) const;

  // Returns the concatenation of the UTF-8 bytes of every id in `token_ids`.
  std::string decode(const std::vector<int>& token_ids) const;

  // Tokenizes `prompt` via a Python subprocess (Qwen3 requires proper BPE).
  // When `apply_chat_template` is true, the model's chat template is
  // presumably applied first (per the flag name — confirm against the
  // Python side). ~200ms per call; invoked once per prompt.
  std::vector<int> encode_via_python(const std::string& model_dir,
                                     const std::string& prompt,
                                     bool apply_chat_template = false) const;

  // Tokenizes a multi-turn conversation by applying the model's chat
  // template (Python subprocess). `conversation` holds (role, content)
  // pairs — typical roles: "system", "user", "assistant".
  // `add_generation_prompt` presumably appends the assistant-turn prefix,
  // matching the usual HF semantics — confirm against the Python side.
  std::vector<int> encode_conversation_via_python(
      const std::string& model_dir,
      const std::vector<std::pair<std::string, std::string>>& conversation,
      bool add_generation_prompt = true) const;

  // Number of vocabulary entries (0 until load() succeeds).
  size_t size() const { return id_to_bytes_.size(); }

 private:
  std::vector<std::string> id_to_bytes_;  // token id -> raw UTF-8 bytes
};