// test_runner.cpp — end-to-end Runner integration smoke test.
//
// Intended validation: Runner.prefill(5 tokens) with num_layers=1 produces the correct
// hidden state compared to moe_data/final_out.bin (layer-0 post-residual output).
// NOTE: the code below does not load or compare against final_out.bin; it only checks
// that prefill succeeds and prints the argmax of the returned logits. The layer count
// defaults to 2 and can be overridden with the N_LAYERS environment variable.
//
// Then: Runner.decode(1 new token) is run on the prefill argmax token, and the argmax
// of the logits it yields is printed.
#include "runner.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>
// Widens a bfloat16 bit pattern to a float: the 16 input bits become the upper
// half of the 32-bit float representation and the lower 16 bits are zero.
static float bf16_to_float(uint16_t x) {
    const uint32_t widened = static_cast<uint32_t>(x) << 16;
    float result;
    // memcpy reinterprets the bits without violating aliasing rules.
    std::memcpy(&result, &widened, sizeof(result));
    return result;
}
// Reads the whole file at path `p` in binary mode and returns its bytes.
// An empty file yields an empty vector.
// Throws std::runtime_error if the file cannot be opened, its size cannot be
// determined, or the full contents cannot be read.
static std::vector<uint8_t> read_file(const std::string& p) {
    // Open positioned at the end so tellg() reports the file size.
    std::ifstream f(p, std::ios::binary | std::ios::ate);
    if (!f) {
        throw std::runtime_error("read_file: cannot open '" + p + "'");
    }

    // tellg() returns -1 on failure; converting that to size_t unchecked would
    // request an enormous allocation.
    const std::streamoff end = f.tellg();
    if (end < 0) {
        throw std::runtime_error("read_file: cannot determine size of '" + p + "'");
    }

    std::vector<uint8_t> v(static_cast<size_t>(end));
    f.seekg(0);
    if (!v.empty() &&
        !f.read(reinterpret_cast<char*>(v.data()), static_cast<std::streamsize>(v.size()))) {
        throw std::runtime_error("read_file: short read on '" + p + "'");
    }
    return v;
}
// Returns the index of the largest bf16 logit in `logits` and stores its value
// (converted to float) in `best_val`. On ties the lowest index wins.
// `logits` must be non-empty.
static int argmax_bf16(const std::vector<uint16_t>& logits, float& best_val) {
    int best = 0;
    best_val = bf16_to_float(logits[0]);
    for (size_t i = 1; i < logits.size(); ++i) {
        const float v = bf16_to_float(logits[i]);
        if (v > best_val) {
            best_val = v;
            best = static_cast<int>(i);
        }
    }
    return best;
}

// Smoke test driver: initialises a Runner, prefills it with the tokens stored
// in tests/attn_data/token_ids.bin, prints the argmax of the returned logits,
// then decodes that argmax token once and prints the argmax again.
//
// The number of layers defaults to 2 and can be overridden through the
// N_LAYERS environment variable.
//
// Returns 0 when every step succeeds and 1 on any failure (a diagnostic is
// written to stderr).
int main() {
    const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
    const std::string attn_data = "tests/attn_data";

    int N = 2;
    if (const char* env = std::getenv("N_LAYERS")) N = std::atoi(env);

    Runner runner;
    if (!runner.init(model_dir, /*tp_size=*/1, /*tp_rank=*/0,
                     /*num_layers=*/N, /*max_seq=*/128)) {
        fprintf(stderr, "runner.init failed (model_dir=%s, num_layers=%d)\n",
                model_dir.c_str(), N);
        return 1;
    }

    // Load the prefill tokens. The file is read as an int32 count followed by
    // that many int32 token ids.
    const std::string token_path = attn_data + "/token_ids.bin";
    std::vector<uint8_t> tok_raw;
    try {
        tok_raw = read_file(token_path);
    } catch (const std::exception& e) {
        fprintf(stderr, "failed to read %s: %s\n", token_path.c_str(), e.what());
        return 1;
    }
    if (tok_raw.size() < sizeof(int32_t)) {
        fprintf(stderr, "%s is too small to hold a token count (%zu bytes)\n",
                token_path.c_str(), tok_raw.size());
        return 1;
    }
    int32_t S = 0;
    // memcpy avoids the misaligned / aliasing read of a cast-and-dereference.
    std::memcpy(&S, tok_raw.data(), sizeof(S));
    const size_t available_tokens = (tok_raw.size() - sizeof(int32_t)) / sizeof(int32_t);
    if (S <= 0 || static_cast<size_t>(S) > available_tokens) {
        fprintf(stderr, "%s: bad token count %d (file holds %zu tokens)\n",
                token_path.c_str(), S, available_tokens);
        return 1;
    }
    std::vector<int32_t> tokens(static_cast<size_t>(S));
    std::memcpy(tokens.data(), tok_raw.data() + sizeof(int32_t),
                tokens.size() * sizeof(int32_t));

    printf("prefill %d tokens...\n", S);
    DeviceBuffer logits;
    if (!runner.prefill(tokens.data(), S, logits)) {
        fprintf(stderr, "runner.prefill failed\n");
        return 1;
    }
    printf("prefill done. past_len=%lld, logits size=%zu bytes\n",
           static_cast<long long>(runner.past_len()), logits.size);

    const int64_t V = runner.cfg().vocab_size;
    if (V <= 0) {
        fprintf(stderr, "invalid vocab_size %lld\n", static_cast<long long>(V));
        return 1;
    }
    // Each logit is copied as a 2-byte bf16 value.
    const size_t logits_bytes = static_cast<size_t>(V) * sizeof(uint16_t);

    // Copy the first V logits of the prefill output to the host.
    // NOTE(review): this presumes prefill returns only the last position's
    // logits at the start of the buffer — confirm against Runner::prefill.
    if (logits.size < logits_bytes) {
        fprintf(stderr, "prefill logits buffer too small: %zu < %zu bytes\n",
                static_cast<size_t>(logits.size), logits_bytes);
        return 1;
    }
    std::vector<uint16_t> logits_host(static_cast<size_t>(V));
    ACL_CHECK(aclrtMemcpy(logits_host.data(), logits_bytes, logits.get(), logits_bytes,
                          ACL_MEMCPY_DEVICE_TO_HOST));
    float best_v = 0.0f;
    const int best = argmax_bf16(logits_host, best_v);
    printf("prefill argmax token_id = %d (logit=%.3f)\n", best, best_v);

    // Decode one new token, feeding back the prefill argmax.
    DeviceBuffer logits2;
    if (!runner.decode(best, logits2)) {
        fprintf(stderr, "runner.decode failed\n");
        return 1;
    }
    printf("decode done. past_len=%lld\n", static_cast<long long>(runner.past_len()));

    if (logits2.size < logits_bytes) {
        fprintf(stderr, "decode logits buffer too small: %zu < %zu bytes\n",
                static_cast<size_t>(logits2.size), logits_bytes);
        return 1;
    }
    std::vector<uint16_t> logits2_host(static_cast<size_t>(V));
    ACL_CHECK(aclrtMemcpy(logits2_host.data(), logits_bytes, logits2.get(), logits_bytes,
                          ACL_MEMCPY_DEVICE_TO_HOST));
    float best2_v = 0.0f;
    const int best2 = argmax_bf16(logits2_host, best2_v);
    printf("decode argmax token_id = %d (logit=%.3f)\n", best2, best2_v);

    // Report the layer count actually used rather than a hard-coded "N=1".
    printf("\n=== test_runner (N=%d layer(s)) PASS — prefill + decode ran cleanly ===\n", N);
    return 0;
}