// test_tokenizer.cpp — verify encode + decode match Python reference. #include "tokenizer.h" #include #include int main() { const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16"; const std::string vocab_path = "tokenizer_data/vocab.bin"; Tokenizer tok; if (!tok.load(vocab_path)) return 1; printf("Vocab size: %zu\n", tok.size()); // Test 1: decode known ids (from export_vocab.py) std::vector ids = {785, 6722, 315, 9625, 374}; std::string decoded = tok.decode(ids); printf("decode({785,6722,315,9625,374}) = '%s'\n", decoded.c_str()); bool ok1 = (decoded == "The capital of France is"); printf(" %s\n", ok1 ? "OK" : "FAIL"); // Test 2: encode roundtrip via Python subprocess printf("\nTesting encode_via_python...\n"); std::string prompt = "The capital of France is"; auto enc = tok.encode_via_python(model_dir, prompt, false); printf("encode('%s') -> [", prompt.c_str()); for (size_t i = 0; i < enc.size(); i++) printf("%s%d", i ? "," : "", enc[i]); printf("]\n"); bool ok2 = (enc == std::vector{785, 6722, 315, 9625, 374}); printf(" %s (expected [785,6722,315,9625,374])\n", ok2 ? "OK" : "FAIL"); // Test 3: encode then decode roundtrip std::string rt = tok.decode(enc); printf("\nencode+decode roundtrip: '%s'\n", rt.c_str()); bool ok3 = (rt == prompt); printf(" %s\n", ok3 ? "OK" : "FAIL"); // Test 4: chat template printf("\nTesting chat template...\n"); auto chat_ids = tok.encode_via_python(model_dir, "What is the capital of France?", true); printf("chat encode size: %zu\n", chat_ids.size()); std::string chat_decoded = tok.decode(chat_ids); printf("chat decode: %s\n", chat_decoded.c_str()); bool ok4 = (chat_ids.size() > 10 && chat_decoded.find("capital") != std::string::npos); printf(" %s\n", ok4 ? "OK" : "FAIL"); bool all = ok1 && ok2 && ok3 && ok4; printf("\n%s\n", all ? "=== test_tokenizer PASS ===" : "=== test_tokenizer FAIL ==="); return all ? 0 : 1; }