// test_tokenizer.cpp — verify encode + decode match Python reference.
#include "tokenizer.h"

#include <cstdio>
#include <string>
#include <vector>
/// Test driver: verifies that the C++ Tokenizer's decode (and the Python
/// subprocess-backed encode) agree with reference token ids produced by
/// export_vocab.py for the Qwen3 vocabulary.
///
/// Usage: test_tokenizer [model_dir] [vocab_path]
///   argv[1] — HF model directory used by encode_via_python
///             (defaults to a placeholder path; override on real machines)
///   argv[2] — path to the exported vocab.bin (default: tokenizer_data/vocab.bin)
///
/// Returns 0 if all four checks pass, 1 otherwise (including load failure).
int main(int argc, char** argv) {
  // Defaults are overridable from the command line so the test can run on
  // machines where the model checkout lives elsewhere.
  const std::string model_dir =
      argc > 1 ? argv[1] : "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
  const std::string vocab_path =
      argc > 2 ? argv[2] : "tokenizer_data/vocab.bin";

  Tokenizer tok;
  if (!tok.load(vocab_path)) return 1;  // cannot test anything without the vocab
  printf("Vocab size: %zu\n", tok.size());

  // Test 1: decode a fixed id sequence (reference values from export_vocab.py).
  const std::vector<int> ids = {785, 6722, 315, 9625, 374};
  const std::string decoded = tok.decode(ids);
  printf("decode({785,6722,315,9625,374}) = '%s'\n", decoded.c_str());
  const bool ok1 = (decoded == "The capital of France is");
  printf("  %s\n", ok1 ? "OK" : "FAIL");

  // Test 2: encode the same text via a Python subprocess and compare ids.
  printf("\nTesting encode_via_python...\n");
  const std::string prompt = "The capital of France is";
  const auto enc = tok.encode_via_python(model_dir, prompt, false);
  printf("encode('%s') -> [", prompt.c_str());
  for (size_t i = 0; i < enc.size(); i++) printf("%s%d", i ? "," : "", enc[i]);
  printf("]\n");
  const bool ok2 = (enc == std::vector<int>{785, 6722, 315, 9625, 374});
  printf("  %s (expected [785,6722,315,9625,374])\n", ok2 ? "OK" : "FAIL");

  // Test 3: encode followed by decode must reproduce the original prompt.
  const std::string rt = tok.decode(enc);
  printf("\nencode+decode roundtrip: '%s'\n", rt.c_str());
  const bool ok3 = (rt == prompt);
  printf("  %s\n", ok3 ? "OK" : "FAIL");

  // Test 4: chat-template encoding (apply_chat_template=true) should wrap the
  // user text in template tokens, so we only sanity-check size and content.
  printf("\nTesting chat template...\n");
  const auto chat_ids =
      tok.encode_via_python(model_dir, "What is the capital of France?", true);
  printf("chat encode size: %zu\n", chat_ids.size());
  const std::string chat_decoded = tok.decode(chat_ids);
  printf("chat decode: %s\n", chat_decoded.c_str());
  const bool ok4 = (chat_ids.size() > 10 &&
                    chat_decoded.find("capital") != std::string::npos);
  printf("  %s\n", ok4 ? "OK" : "FAIL");

  const bool all = ok1 && ok2 && ok3 && ok4;
  printf("\n%s\n", all ? "=== test_tokenizer PASS ===" : "=== test_tokenizer FAIL ===");
  return all ? 0 : 1;
}