| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
#include "runner.h"

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
|
|
| int main() { |
| const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16"; |
| Runner r; |
| int tp_rank = 0, tp_size = 1; |
| if (const char* v = std::getenv("TP_RANK")) tp_rank = std::atoi(v); |
| if (const char* v = std::getenv("TP_SIZE")) tp_size = std::atoi(v); |
| bool is_master = tp_rank == 0; |
|
|
| if (!r.init(model_dir, tp_size, tp_rank, 94, 512)) return 1; |
|
|
| |
| std::vector<int32_t> prompt = {785, 6722, 315, 9625, 374}; |
| DeviceBuffer logits; |
| r.prefill(prompt.data(), prompt.size(), logits); |
|
|
| auto now = []() { return std::chrono::steady_clock::now(); }; |
| auto ms = [](auto t0, auto t1) { return std::chrono::duration<double, std::milli>(t1 - t0).count(); }; |
|
|
| std::vector<int> batch_sizes = {1, 2, 4, 8}; |
| int N_ITERS = 20; |
|
|
| if (is_master) { |
| printf("\n=== Batched decode forward benchmark (94 layers, TP=%d) ===\n", tp_size); |
| printf("Each row: forward with S=K new tokens after prefill\n"); |
| printf("%-5s %-12s %-18s %-18s %s\n", |
| "S", "ms/forward", "ms/token (amort)", "tokens/sec", "vs S=1 efficiency"); |
| } |
|
|
| double base_per_token = 0; |
| for (int S : batch_sizes) { |
| |
| |
| |
|
|
| std::vector<double> times; |
| for (int iter = 0; iter < N_ITERS + 3; iter++) { |
| r.reset_cache(); |
| r.prefill(prompt.data(), prompt.size(), logits); |
|
|
| |
| std::vector<int32_t> new_tokens(S, 100); |
| auto t0 = now(); |
| DeviceBuffer logits2; |
| r.prefill(new_tokens.data(), S, logits2); |
| auto t1 = now(); |
| if (iter >= 3) times.push_back(ms(t0, t1)); |
| } |
| std::sort(times.begin(), times.end()); |
| double median_ms = times[times.size() / 2]; |
| double per_token = median_ms / S; |
| double tok_per_sec = 1000.0 / per_token; |
| if (S == 1) base_per_token = per_token; |
| double efficiency = base_per_token / per_token * 100.0; |
|
|
| if (is_master) { |
| printf("%-5d %-12.2f %-18.2f %-18.2f %.1f%%\n", |
| S, median_ms, per_token, tok_per_sec, efficiency); |
| } |
| } |
|
|
| if (is_master) { |
| printf("\n=== Interpretation ===\n"); |
| printf("If S=4 forward ~ S=1 (efficiency high), spec decoding with accept_rate=70%%\n"); |
| printf("gives TG = 0.7*4 / cost(S=5) vs baseline 1 / cost(S=1) = up to 2.8× speedup.\n"); |
| } |
| return 0; |
| } |
|
|