// llm_mutil_npu/include/tokenizer.h
// History: initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on
// Ascend 910 × 16 NPU (commit 4b9fefd, author: xianglarry).
// tokenizer.h — minimal Qwen3 tokenizer.
//
// M2-phase1: decode() is native C++ (simple vocab lookup). encode() is a Python subprocess
// (one-time cost at prompt setup). Native BPE encode is a future item.
//
#pragma once
#include <cstdint>
#include <string>
#include <utility>  // std::pair (used in encode_conversation_via_python)
#include <vector>
// Minimal Qwen3 tokenizer: native C++ decode (vocab-table lookup); encode goes
// through a Python subprocess because Qwen3 needs proper BPE (see file header).
class Tokenizer {
public:
    // Load the binary vocab table (token id -> raw UTF-8 byte string) from disk.
    // Presumably returns false on read/parse failure — implementation not shown here.
    bool load(const std::string& vocab_bin_path);

    // Decode a single token id to its UTF-8 string.
    [[nodiscard]] std::string decode(int token_id) const;

    // Decode a list of token ids to one concatenated UTF-8 string.
    [[nodiscard]] std::string decode(const std::vector<int>& token_ids) const;

    // Encode `prompt` to token ids. Shells out to a Python subprocess since Qwen3
    // needs proper BPE; the call costs ~200ms but runs only once per prompt.
    [[nodiscard]] std::vector<int> encode_via_python(const std::string& model_dir,
                                                     const std::string& prompt,
                                                     bool apply_chat_template = false) const;

    // Encode a multi-turn conversation by applying the model's chat template.
    // Each pair is (role, content) — typical roles: "system", "user", "assistant".
    // Uses the same Python-subprocess path as encode_via_python().
    [[nodiscard]] std::vector<int> encode_conversation_via_python(
        const std::string& model_dir,
        const std::vector<std::pair<std::string, std::string>>& conversation,
        bool add_generation_prompt = true) const;

    // Number of entries in the loaded vocab (0 before a successful load()).
    [[nodiscard]] size_t size() const noexcept { return id_to_bytes_.size(); }

private:
    std::vector<std::string> id_to_bytes_;  // token id -> raw UTF-8 bytes
};