File size: 3,492 Bytes
4b9fefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// test_batch_decode.cpp — benchmark decode with different batch sizes S = 1, 2, 4, 8.
//
// Purpose: quantify the cost of "batched decode" (a.k.a. the ingredient speculative decoding
// relies on). If Runner.prefill(S=K) forward-pass is only a small overhead over S=1, then
// spec-decoding with K draft tokens gives ~K× speedup at high accept rate.
//
// Per-token amortized cost:
//   cost(S) / S
// Speculative decoding benefit:
//   expected_accept_rate * K = effective tokens per forward
//   → TG ≈ (expected_accept_rate * K) / cost(S=K+1), with cost measured in seconds
#include "runner.h"

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

// Benchmark entry point: measures one decode forward pass with S new tokens
// (S = 1, 2, 4, 8) after a fixed short prefill, and reports amortized
// per-token cost. High "vs S=1 efficiency" at larger S means speculative
// decoding would pay off. Returns 0 on success, 1 if Runner::init fails.
int main() {
    // NOTE(review): hard-coded model path — edit before running.
    const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
    Runner r;

    // Tensor-parallel rank/size come from the environment; defaults are a
    // single-rank run. Only the master rank (rank 0) prints results.
    int tp_rank = 0, tp_size = 1;
    if (const char* v = std::getenv("TP_RANK")) tp_rank = std::atoi(v);
    if (const char* v = std::getenv("TP_SIZE")) tp_size = std::atoi(v);
    const bool is_master = tp_rank == 0;

    if (!r.init(model_dir, tp_size, tp_rank, /*num_layers=*/94, /*max_seq=*/512)) return 1;

    // Prefill a short context so decode has some KV cache
    const std::vector<int32_t> prompt = {785, 6722, 315, 9625, 374};  // "The capital of France is"
    DeviceBuffer logits;
    r.prefill(prompt.data(), prompt.size(), logits);

    auto now = []() { return std::chrono::steady_clock::now(); };
    auto ms = [](auto t0, auto t1) { return std::chrono::duration<double, std::milli>(t1 - t0).count(); };

    const std::vector<int> batch_sizes = {1, 2, 4, 8};
    constexpr int N_ITERS = 20;   // measured iterations per batch size
    constexpr int N_WARMUP = 3;   // extra leading iterations, discarded

    if (is_master) {
        printf("\n=== Batched decode forward benchmark (94 layers, TP=%d) ===\n", tp_size);
        printf("Each row: forward with S=K new tokens after prefill\n");
        printf("%-5s %-12s %-18s %-18s %s\n",
               "S", "ms/forward", "ms/token (amort)", "tokens/sec", "vs S=1 efficiency");
    }

    double base_per_token = 0;  // amortized ms/token at S=1, baseline for the efficiency column
    for (int S : batch_sizes) {
        // Reset the cache before every measurement so each forward sees the
        // same past_len: re-prefill the prompt, then time ONE forward with S
        // new tokens — exactly what a speculative-decoding verify step does.
        std::vector<double> times;
        times.reserve(N_ITERS);
        for (int iter = 0; iter < N_ITERS + N_WARMUP; iter++) {
            r.reset_cache();
            r.prefill(prompt.data(), prompt.size(), logits);  // re-prefill

            // New forward with S tokens (as if doing speculative verify)
            std::vector<int32_t> new_tokens(S, 100);  // dummy token ids
            auto t0 = now();
            DeviceBuffer logits2;
            r.prefill(new_tokens.data(), S, logits2);
            auto t1 = now();
            if (iter >= N_WARMUP) times.push_back(ms(t0, t1));
        }
        // Median rather than mean: robust to scheduler jitter and stray slow runs.
        std::sort(times.begin(), times.end());
        const double median_ms = times[times.size() / 2];
        const double per_token = median_ms / S;
        const double tok_per_sec = 1000.0 / per_token;
        if (S == 1) base_per_token = per_token;  // batch_sizes starts at 1, so the baseline is set first
        const double efficiency = base_per_token / per_token * 100.0;

        if (is_master) {
            printf("%-5d %-12.2f %-18.2f %-18.2f %.1f%%\n",
                   S, median_ms, per_token, tok_per_sec, efficiency);
        }
    }

    if (is_master) {
        printf("\n=== Interpretation ===\n");
        printf("If S=4 forward ~ S=1 (efficiency high), spec decoding with accept_rate=70%%\n");
        printf("gives TG = 0.7*4 / cost(S=5) vs baseline 1 / cost(S=1) = up to 2.8× speedup.\n");
    }
    return 0;
}