// tokenizer.h — minimal Qwen3 tokenizer.
//
// M2-phase1: decode() is native C++ (simple vocab lookup). encode() is a Python subprocess
// (one-time cost at prompt setup). Native BPE encode is a future item.
//
#pragma once

#include <string>
#include <utility>
#include <vector>

// Maps token ids <-> text for a Qwen3 vocabulary. Decoding is a plain
// id -> bytes table lookup; encoding shells out to Python for correct BPE.
class Tokenizer {
public:
    // Load the id -> bytes vocabulary table from a binary file.
    // Returns false on failure (file missing / malformed).
    bool load(const std::string& vocab_bin_path);

    // Decode a single token id to UTF-8 string.
    // NOTE(review): out-of-range ids presumably yield an empty string — confirm in the .cpp.
    std::string decode(int token_id) const;

    // Decode list of token ids to concatenated UTF-8 string.
    std::string decode(const std::vector<int>& token_ids) const;

    // Encode prompt to token ids. Uses a Python subprocess since Qwen3 needs proper BPE.
    // The subprocess call takes ~200ms but is only invoked once per prompt.
    std::vector<int> encode_via_python(const std::string& model_dir,
                                       const std::string& prompt,
                                       bool apply_chat_template = false) const;

    // Encode a multi-turn conversation by applying the model's chat template. Each pair is
    // (role, content) — typical roles: "system", "user", "assistant". Uses Python subprocess.
    std::vector<int> encode_conversation_via_python(
        const std::string& model_dir,
        const std::vector<std::pair<std::string, std::string>>& conversation,
        bool add_generation_prompt = true) const;

    // Number of entries in the loaded vocabulary (0 before a successful load()).
    size_t size() const { return id_to_bytes_.size(); }

private:
    std::vector<std::string> id_to_bytes_;  // id -> raw utf-8 bytes
};