// llm_mutil_npu/include/tokenizer.h
// History: initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on
// Ascend 910 × 16 NPU (commit 4b9fefd, author: xianglarry).
// tokenizer.h — minimal Qwen3 tokenizer.
//
// M2-phase1: decode() is native C++ (simple vocab lookup). encode() is a Python subprocess
// (one-time cost at prompt setup). Native BPE encode is a future item.
//
#pragma once
#include <cstdint>
#include <string>
#include <utility>  // std::pair (used in encode_conversation_via_python)
#include <vector>
// Minimal Qwen3 tokenizer: native C++ decode (vocab-table lookup); encode goes
// through a Python subprocess because Qwen3 needs proper BPE (see file header).
class Tokenizer {
public:
    // Load the binary vocab table (token id -> raw UTF-8 byte string) from disk.
    // Presumably returns false on read/parse failure — implementation not shown here.
    bool load(const std::string& vocab_bin_path);

    // Decode a single token id to its UTF-8 string.
    [[nodiscard]] std::string decode(int token_id) const;

    // Decode a list of token ids to one concatenated UTF-8 string.
    [[nodiscard]] std::string decode(const std::vector<int>& token_ids) const;

    // Encode `prompt` to token ids. Shells out to a Python subprocess since Qwen3
    // needs proper BPE; the call costs ~200ms but runs only once per prompt.
    [[nodiscard]] std::vector<int> encode_via_python(const std::string& model_dir,
                                                     const std::string& prompt,
                                                     bool apply_chat_template = false) const;

    // Encode a multi-turn conversation by applying the model's chat template.
    // Each pair is (role, content) — typical roles: "system", "user", "assistant".
    // Uses the same Python-subprocess path as encode_via_python().
    [[nodiscard]] std::vector<int> encode_conversation_via_python(
        const std::string& model_dir,
        const std::vector<std::pair<std::string, std::string>>& conversation,
        bool add_generation_prompt = true) const;

    // Number of entries in the loaded vocab (0 before a successful load()).
    [[nodiscard]] size_t size() const noexcept { return id_to_bytes_.size(); }

private:
    std::vector<std::string> id_to_bytes_;  // token id -> raw UTF-8 bytes
};