File size: 2,084 Bytes
4b9fefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// test_tokenizer.cpp — verify encode + decode match Python reference.
#include "tokenizer.h"
#include <cstdio>
#include <string>
#include <vector>

/// Smoke test for Tokenizer: checks decode of known ids, encode through the
/// Python helper, an encode+decode roundtrip, and the chat-template encode path.
///
/// Usage: test_tokenizer [model_dir [vocab_path]]
///
/// @param argc  Number of command-line arguments.
/// @param argv  argv[1] (optional) overrides the model directory handed to
///              Tokenizer::encode_via_python; argv[2] (optional) overrides the
///              vocab file handed to Tokenizer::load.
/// @return 0 when all four checks pass; 1 when the vocab cannot be loaded or
///         any check fails.
int main(int argc, char** argv) {
    // The defaults are the locations this test was originally hard-coded to;
    // the model directory default is a placeholder, so it is overridable.
    const std::string model_dir =
        (argc > 1) ? argv[1] : "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
    const std::string vocab_path = (argc > 2) ? argv[2] : "tokenizer_data/vocab.bin";

    // Shared fixture: the prompt and the ids the reference tokenizer is
    // expected to produce for it (from export_vocab.py).
    const std::string prompt = "The capital of France is";
    const std::vector<int> expected_ids = {785, 6722, 315, 9625, 374};

    Tokenizer tok;
    if (!tok.load(vocab_path)) {
        // Report which file failed so the non-zero exit is not silent.
        fprintf(stderr, "test_tokenizer: failed to load vocab from '%s'\n", vocab_path.c_str());
        return 1;
    }
    printf("Vocab size: %zu\n", tok.size());

    // Test 1: decode known ids (from export_vocab.py)
    const std::string decoded = tok.decode(expected_ids);
    printf("decode({785,6722,315,9625,374}) = '%s'\n", decoded.c_str());
    const bool ok1 = (decoded == prompt);
    printf("  %s\n", ok1 ? "OK" : "FAIL");

    // Test 2: encode roundtrip via Python subprocess
    // NOTE(review): the third argument looks like a chat-template switch
    // (false here, true in Test 4) — confirm against tokenizer.h.
    printf("\nTesting encode_via_python...\n");
    const auto enc = tok.encode_via_python(model_dir, prompt, false);
    printf("encode('%s') -> [", prompt.c_str());
    for (size_t i = 0; i < enc.size(); i++) printf("%s%d", i ? "," : "", enc[i]);
    printf("]\n");
    const bool ok2 = (enc == expected_ids);
    printf("  %s (expected [785,6722,315,9625,374])\n", ok2 ? "OK" : "FAIL");

    // Test 3: encode then decode roundtrip
    const std::string rt = tok.decode(enc);
    printf("\nencode+decode roundtrip: '%s'\n", rt.c_str());
    const bool ok3 = (rt == prompt);
    printf("  %s\n", ok3 ? "OK" : "FAIL");

    // Test 4: chat template
    printf("\nTesting chat template...\n");
    const auto chat_ids = tok.encode_via_python(model_dir, "What is the capital of France?", true);
    printf("chat encode size: %zu\n", chat_ids.size());
    const std::string chat_decoded = tok.decode(chat_ids);
    printf("chat decode: %s\n", chat_decoded.c_str());
    // Loose check only: more than 10 ids, and the decoded text still
    // contains a word from the user message.
    const bool ok4 = (chat_ids.size() > 10 && chat_decoded.find("capital") != std::string::npos);
    printf("  %s\n", ok4 ? "OK" : "FAIL");

    const bool all = ok1 && ok2 && ok3 && ok4;
    printf("\n%s\n", all ? "=== test_tokenizer PASS ===" : "=== test_tokenizer FAIL ===");
    return all ? 0 : 1;
}