File size: 2,884 Bytes
4b9fefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
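// test_runner.cpp — end-to-end Runner smoke test (prefill + one decode step).
//
// Runs Runner.prefill() on the tokens stored in tests/attn_data/token_ids.bin, with
// num_layers taken from the N_LAYERS environment variable (default: 2), and prints
// the argmax of the returned logits.
// NOTE: the comparison of the hidden state against moe_data/final_out.bin (layer-0
// post-residual output, N=1 layer) is not performed anywhere in this file.
//
// Then: Runner.decode(1 new token) is run on that argmax token; past_len and the
// argmax of the decode logits are printed.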
#include "runner.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

// Expands a raw bfloat16 bit pattern into a float: the 16 input bits become the
// upper half of a 32-bit word (lower half zero), and that word is reinterpreted
// as a float through memcpy.
static float bf16_to_float(uint16_t x) {
    const uint32_t widened = static_cast<uint32_t>(x) << 16;
    float result;
    std::memcpy(&result, &widened, sizeof(widened));
    return result;
}
// Reads the whole file at path `p` into a byte vector.
//
// Returns the file contents (empty vector for an empty file).
// Throws std::runtime_error if the file cannot be opened, its size cannot be
// determined, or fewer bytes than the reported size could be read. Without
// these checks a missing file makes tellg() return -1, which would be turned
// into an enormous allocation request.
static std::vector<uint8_t> read_file(const std::string& p) {
    // Open positioned at the end so tellg() reports the file size.
    std::ifstream f(p, std::ios::binary | std::ios::ate);
    if (!f) throw std::runtime_error("read_file: cannot open '" + p + "'");

    const std::streamoff end = f.tellg();
    if (end < 0) throw std::runtime_error("read_file: cannot determine size of '" + p + "'");

    std::vector<uint8_t> v(static_cast<size_t>(end));
    if (v.empty()) return v;

    f.seekg(0, std::ios::beg);
    const std::streamsize want = static_cast<std::streamsize>(v.size());
    f.read(reinterpret_cast<char*>(v.data()), want);
    if (f.gcount() != want) throw std::runtime_error("read_file: short read on '" + p + "'");
    return v;
}

// Returns the index of the largest bf16 value in `logits` (the first one wins
// on ties) and stores that value, converted to float, in `best_value`.
// `logits` must be non-empty.
static int argmax_bf16(const std::vector<uint16_t>& logits, float& best_value) {
    int best = 0;
    best_value = bf16_to_float(logits[0]);
    for (size_t i = 1; i < logits.size(); ++i) {
        const float v = bf16_to_float(logits[i]);
        if (v > best_value) { best_value = v; best = static_cast<int>(i); }
    }
    return best;
}

// Smoke test: init the Runner, prefill with the tokens from
// tests/attn_data/token_ids.bin, print the argmax of the returned logits, then
// decode that argmax token once and print the argmax of the decode logits.
//
// Environment:
//   N_LAYERS  - number of layers passed to Runner::init (default 2).
//   MODEL_DIR - model directory (default: the placeholder path below).
//
// Returns 0 when every step ran, 1 on any failure.
int main() {
    std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
    if (const char* env = std::getenv("MODEL_DIR")) model_dir = env;
    const std::string attn_data = "tests/attn_data";

    Runner runner;
    int N = 2;
    if (const char* env = std::getenv("N_LAYERS")) N = std::atoi(env);
    if (!runner.init(model_dir, /*tp_size=*/1, /*tp_rank=*/0,
                     /*num_layers=*/N, /*max_seq=*/128)) return 1;

    // Load the prefill tokens. As read here, the file is an int32 count S
    // followed by S int32 token ids.
    const auto tok_raw = read_file(attn_data + "/token_ids.bin");
    int32_t S = 0;
    if (tok_raw.size() < sizeof(S)) {
        fprintf(stderr, "token_ids.bin: too small (%zu bytes) to hold the token count\n",
                tok_raw.size());
        return 1;
    }
    // memcpy instead of a pointer cast: no alignment or aliasing assumptions.
    std::memcpy(&S, tok_raw.data(), sizeof(S));
    const size_t tokens_available = (tok_raw.size() - sizeof(S)) / sizeof(int32_t);
    if (S <= 0 || static_cast<size_t>(S) > tokens_available) {
        fprintf(stderr, "token_ids.bin: invalid token count %d (file holds %zu tokens)\n",
                S, tokens_available);
        return 1;
    }
    std::vector<int32_t> tokens(static_cast<size_t>(S));
    std::memcpy(tokens.data(), tok_raw.data() + sizeof(S), tokens.size() * sizeof(int32_t));
    printf("prefill %d tokens...\n", S);

    DeviceBuffer logits;
    if (!runner.prefill(tokens.data(), S, logits)) return 1;
    printf("prefill done. past_len=%lld, logits size=%zu bytes\n",
           static_cast<long long>(runner.past_len()), logits.size);

    // Copy the first vocab_size 16-bit logits back to the host.
    const int64_t V = runner.cfg().vocab_size;
    if (V <= 0) {
        fprintf(stderr, "invalid vocab_size %lld\n", static_cast<long long>(V));
        return 1;
    }
    const size_t logit_bytes = static_cast<size_t>(V) * sizeof(uint16_t);
    std::vector<uint16_t> logits_host(static_cast<size_t>(V));
    ACL_CHECK(aclrtMemcpy(logits_host.data(), logit_bytes, logits.get(), logit_bytes, ACL_MEMCPY_DEVICE_TO_HOST));

    // Find argmax
    float best_v = 0.0f;
    const int best = argmax_bf16(logits_host, best_v);
    printf("prefill argmax token_id = %d (logit=%.3f)\n", best, best_v);

    // Decode one new token
    DeviceBuffer logits2;
    if (!runner.decode(best, logits2)) return 1;
    printf("decode done. past_len=%lld\n", static_cast<long long>(runner.past_len()));

    std::vector<uint16_t> logits2_host(static_cast<size_t>(V));
    ACL_CHECK(aclrtMemcpy(logits2_host.data(), logit_bytes, logits2.get(), logit_bytes, ACL_MEMCPY_DEVICE_TO_HOST));
    float best2_v = 0.0f;
    const int best2 = argmax_bf16(logits2_host, best2_v);
    printf("decode argmax token_id = %d (logit=%.3f)\n", best2, best2_v);

    // Report the layer count that was actually used rather than a fixed "N=1".
    printf("\n=== test_runner (N=%d layer%s) PASS — prefill + decode ran cleanly ===\n",
           N, N == 1 ? "" : "s");
    return 0;
}