| |
#include "tokenizer.h"

#include <cstdio>
#include <string>
#include <vector>
|
|
| int main() { |
| const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16"; |
| const std::string vocab_path = "tokenizer_data/vocab.bin"; |
|
|
| Tokenizer tok; |
| if (!tok.load(vocab_path)) return 1; |
| printf("Vocab size: %zu\n", tok.size()); |
|
|
| |
| std::vector<int> ids = {785, 6722, 315, 9625, 374}; |
| std::string decoded = tok.decode(ids); |
| printf("decode({785,6722,315,9625,374}) = '%s'\n", decoded.c_str()); |
| bool ok1 = (decoded == "The capital of France is"); |
| printf(" %s\n", ok1 ? "OK" : "FAIL"); |
|
|
| |
| printf("\nTesting encode_via_python...\n"); |
| std::string prompt = "The capital of France is"; |
| auto enc = tok.encode_via_python(model_dir, prompt, false); |
| printf("encode('%s') -> [", prompt.c_str()); |
| for (size_t i = 0; i < enc.size(); i++) printf("%s%d", i ? "," : "", enc[i]); |
| printf("]\n"); |
| bool ok2 = (enc == std::vector<int>{785, 6722, 315, 9625, 374}); |
| printf(" %s (expected [785,6722,315,9625,374])\n", ok2 ? "OK" : "FAIL"); |
|
|
| |
| std::string rt = tok.decode(enc); |
| printf("\nencode+decode roundtrip: '%s'\n", rt.c_str()); |
| bool ok3 = (rt == prompt); |
| printf(" %s\n", ok3 ? "OK" : "FAIL"); |
|
|
| |
| printf("\nTesting chat template...\n"); |
| auto chat_ids = tok.encode_via_python(model_dir, "What is the capital of France?", true); |
| printf("chat encode size: %zu\n", chat_ids.size()); |
| std::string chat_decoded = tok.decode(chat_ids); |
| printf("chat decode: %s\n", chat_decoded.c_str()); |
| bool ok4 = (chat_ids.size() > 10 && chat_decoded.find("capital") != std::string::npos); |
| printf(" %s\n", ok4 ? "OK" : "FAIL"); |
|
|
| bool all = ok1 && ok2 && ok3 && ok4; |
| printf("\n%s\n", all ? "=== test_tokenizer PASS ===" : "=== test_tokenizer FAIL ==="); |
| return all ? 0 : 1; |
| } |
|
|