// test_runner.cpp — end-to-end Runner integration with N=1 layer. // // Validates: Runner.prefill(5 tokens) with num_layers=1 produces correct hidden state // compared to moe_data/final_out.bin (layer-0 post-residual output). // // Then: Runner.decode(1 new token) successfully appends to cache and yields logits. #include "runner.h" #include #include #include #include #include #include static float bf16_to_float(uint16_t x) { uint32_t u = (uint32_t)x << 16; float f; std::memcpy(&f, &u, 4); return f; } static std::vector read_file(const std::string& p) { std::ifstream f(p, std::ios::binary | std::ios::ate); size_t s = f.tellg(); f.seekg(0); std::vector v(s); f.read((char*)v.data(), s); return v; } int main() { const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16"; const std::string attn_data = "tests/attn_data"; Runner runner; int N = 2; if (const char* env = std::getenv("N_LAYERS")) N = std::atoi(env); if (!runner.init(model_dir, /*tp_size=*/1, /*tp_rank=*/0, /*num_layers=*/N, /*max_seq=*/128)) return 1; // Load 5 prefill tokens auto tok_raw = read_file(attn_data + "/token_ids.bin"); int32_t S = *(int32_t*)tok_raw.data(); std::vector tokens(S); std::memcpy(tokens.data(), tok_raw.data() + 4, S * 4); printf("prefill %d tokens...\n", S); DeviceBuffer logits; if (!runner.prefill(tokens.data(), S, logits)) return 1; printf("prefill done. past_len=%ld, logits size=%zu bytes\n", runner.past_len(), logits.size); // Read last-position logits and print top-5 tokens const int64_t V = runner.cfg().vocab_size; std::vector logits_host(V); ACL_CHECK(aclrtMemcpy(logits_host.data(), V*2, logits.get(), V*2, ACL_MEMCPY_DEVICE_TO_HOST)); // Find argmax int best = 0; float best_v = bf16_to_float(logits_host[0]); for (int i = 1; i < V; i++) { float v = bf16_to_float(logits_host[i]); if (v > best_v) { best_v = v; best = i; } } printf("prefill argmax token_id = %d (logit=%.3f)\n", best, best_v); // Decode one new token DeviceBuffer logits2; if (!runner.decode(best, logits2)) return 1; printf("decode done. past_len=%ld\n", runner.past_len()); std::vector logits2_host(V); ACL_CHECK(aclrtMemcpy(logits2_host.data(), V*2, logits2.get(), V*2, ACL_MEMCPY_DEVICE_TO_HOST)); int best2 = 0; float best2_v = bf16_to_float(logits2_host[0]); for (int i = 1; i < V; i++) { float v = bf16_to_float(logits2_host[i]); if (v > best2_v) { best2_v = v; best2 = i; } } printf("decode argmax token_id = %d (logit=%.3f)\n", best2, best2_v); printf("\n=== test_runner (N=1 layer) PASS — prefill + decode ran cleanly ===\n"); return 0; }