#include "tokenizer.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <unistd.h>

/// Load the binary vocabulary file.
/// Format: uint32 token count, then per token {uint32 byte length, raw bytes}.
/// @param vocab_bin_path path to the vocab .bin file
/// @return true on success; false on open failure or a truncated/short file.
bool Tokenizer::load(const std::string& vocab_bin_path) {
    std::ifstream f(vocab_bin_path, std::ios::binary);
    if (!f) {
        fprintf(stderr, "Tokenizer: cannot open %s\n", vocab_bin_path.c_str());
        return false;
    }
    uint32_t num;
    f.read((char*)&num, 4);
    if (!f) return false;
    id_to_bytes_.resize(num);
    for (uint32_t i = 0; i < num; i++) {
        uint32_t len;
        f.read((char*)&len, 4);
        if (!f) return false;
        id_to_bytes_[i].resize(len);
        if (len > 0) {
            f.read(id_to_bytes_[i].data(), len);
            // Fix: detect a truncated file here instead of silently keeping
            // zero-filled token bytes for the remaining entries.
            if (!f) return false;
        }
    }
    return true;
}

/// Decode a single token id to its raw byte string.
/// Out-of-range ids decode to the empty string (graceful degradation).
std::string Tokenizer::decode(int id) const {
    if (id < 0 || (size_t)id >= id_to_bytes_.size()) return "";
    return id_to_bytes_[id];
}

/// Decode a sequence of token ids by concatenating each token's bytes.
std::string Tokenizer::decode(const std::vector<int>& ids) const {
    std::string out;
    for (int id : ids) out += decode(id);
    return out;
}

/// Tokenize `prompt` by invoking a Python subprocess running the HuggingFace
/// tokenizer for `model_dir`. The prompt itself is passed via a temp file on
/// stdin to avoid shell-escape bugs; only `model_dir` is embedded in the
/// command line (and is therefore validated below).
/// @param model_dir           HF model directory (must not contain shell metachars)
/// @param prompt              raw text to tokenize
/// @param apply_chat_template wrap the prompt as a single user message and
///                            apply the model's chat template
/// @return token ids, or an empty vector on any failure.
std::vector<int> Tokenizer::encode_via_python(const std::string& model_dir,
                                              const std::string& prompt,
                                              bool apply_chat_template) const {
    // Fix: model_dir is spliced into both a shell double-quoted string and a
    // Python single-quoted literal. Refuse inputs that could break out of
    // either context (command injection hardening).
    if (model_dir.find_first_of("'\"\\$`") != std::string::npos) {
        fprintf(stderr, "Tokenizer: model_dir contains shell metacharacters: %s\n",
                model_dir.c_str());
        return {};
    }

    // Call python subprocess to tokenize. Embed prompt via stdin to avoid
    // shell-escape bugs.
    std::string cmd;
    // Set QWEN3_PYENV_INIT to override the Python env activation sequence
    // (e.g., "source /opt/my_env/activate && ").
    // Default assumes conda at ~/miniconda3 with env 'qwen3' and Ascend
    // toolkit installed.
    if (const char* init = std::getenv("QWEN3_PYENV_INIT")) {
        cmd += init;
    } else {
        cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && ";
        cmd += "conda activate qwen3 2>/dev/null || true; ";
        cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; ";
    }
    cmd += "python3 -c \"";
    cmd += "import sys, json;";
    cmd += "from transformers import AutoTokenizer;";
    cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');";
    cmd += "p = sys.stdin.read();";
    if (apply_chat_template) {
        cmd += "msg = [{'role': 'user', 'content': p}];";
        cmd += "ids = t.apply_chat_template(msg, add_generation_prompt=True);";
    } else {
        cmd += "ids = t.encode(p);";
    }
    cmd += "print(' '.join(str(i) for i in ids));";
    cmd += "\"";

    // popen has no stdin pipe, so stage the prompt in a private temp file
    // (mkstemp creates it 0600) and redirect it into the subprocess.
    char tmpl[] = "/tmp/lca_prompt_XXXXXX";
    int fd = mkstemp(tmpl);
    if (fd < 0) { perror("mkstemp"); return {}; }
    // Fix: the original ignored write()'s result; a short or failed write
    // would silently tokenize a truncated prompt.
    ssize_t wr = write(fd, prompt.data(), prompt.size());
    close(fd);
    if (wr < 0 || (size_t)wr != prompt.size()) {
        perror("write");
        unlink(tmpl);
        return {};
    }

    std::string full = cmd + " < " + tmpl + " 2>/dev/null";
    FILE* pipe = popen(full.c_str(), "r");
    if (!pipe) { perror("popen"); unlink(tmpl); return {}; }
    std::string out;
    char buf[4096];
    while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n);
    pclose(pipe);
    unlink(tmpl);

    // Subprocess prints whitespace-separated token ids; parse them back.
    std::vector<int> ids;
    std::istringstream iss(out);
    int x;
    while (iss >> x) ids.push_back(x);
    return ids;
}
static std::string json_escape(const std::string& s) { std::string out; out.reserve(s.size() + 8); for (char c : s) { switch (c) { case '"': out += "\\\""; break; case '\\': out += "\\\\"; break; case '\n': out += "\\n"; break; case '\r': out += "\\r"; break; case '\t': out += "\\t"; break; default: if ((unsigned char)c < 0x20) { char buf[8]; snprintf(buf, sizeof(buf), "\\u%04x", (unsigned char)c); out += buf; } else { out += c; } } } return out; } std::vector Tokenizer::encode_conversation_via_python( const std::string& model_dir, const std::vector>& conversation, bool add_generation_prompt) const { // Build JSON array of messages. Pass via stdin to avoid shell-escape issues. std::string json_msgs = "["; for (size_t i = 0; i < conversation.size(); i++) { if (i > 0) json_msgs += ","; json_msgs += "{\"role\":\"" + json_escape(conversation[i].first) + "\","; json_msgs += "\"content\":\"" + json_escape(conversation[i].second) + "\"}"; } json_msgs += "]"; std::string cmd; // Set QWEN3_PYENV_INIT to override the Python env activation sequence (e.g., "source /opt/my_env/activate && ") // Default assumes conda at ~/miniconda3 with env 'qwen3' and Ascend toolkit installed. if (const char* init = std::getenv("QWEN3_PYENV_INIT")) { cmd += init; } else { cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && "; cmd += "conda activate qwen3 2>/dev/null || true; "; cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; "; } cmd += "python3 -c \""; cmd += "import sys, json;"; cmd += "from transformers import AutoTokenizer;"; cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');"; cmd += "msgs = json.loads(sys.stdin.read());"; cmd += "ids = t.apply_chat_template(msgs, add_generation_prompt="; cmd += add_generation_prompt ? 
"True" : "False"; cmd += ");"; cmd += "print(' '.join(str(i) for i in ids));"; cmd += "\""; char tmpl[] = "/tmp/lca_conv_XXXXXX"; int fd = mkstemp(tmpl); if (fd < 0) { perror("mkstemp"); return {}; } write(fd, json_msgs.data(), json_msgs.size()); close(fd); std::string full = cmd + " < " + tmpl + " 2>/dev/null"; FILE* pipe = popen(full.c_str(), "r"); if (!pipe) { perror("popen"); unlink(tmpl); return {}; } std::string out; char buf[4096]; while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n); pclose(pipe); unlink(tmpl); std::vector ids; std::istringstream iss(out); int x; while (iss >> x) ids.push_back(x); return ids; }