File size: 8,896 Bytes
4b9fefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// test_op_support.cpp — smoke test which aclnn ops actually RUN on first-gen Ascend 910.
// Just call each candidate op with small tensors; report SUCCESS/FAILURE.
// Guides optimization feasibility analysis.
#include "acl_common.h"
#include "acl_runtime.h"
#include "aclnn_ops.h"

#include <acl/acl.h>
#include <aclnnop/aclnn_add_rms_norm.h>
#include <aclnnop/aclnn_inplace_add_rms_norm.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_npu_format_cast.h>

#include <cmath>
#include <cstdio>
#include <cstring>
#include <vector>

// Widen a bfloat16 bit pattern to float32: bf16 is exactly the top 16 bits
// of an IEEE-754 binary32, so shifting left by 16 reconstructs the value.
static float bf16_to_float(uint16_t x) {
    const uint32_t bits = static_cast<uint32_t>(x) << 16;
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}
// Convert float32 → bfloat16 with round-to-nearest-even.
// Fix: the naive rounding add can carry a NaN payload into the exponent and
// turn NaN into ±Inf (e.g. bits 0x7F800001 rounded → 0x7F80 = +Inf). NaN
// inputs are now truncated with a forced quiet bit instead of rounded.
static uint16_t f_to_bf16(float f) {
    uint32_t u;
    std::memcpy(&u, &f, 4);
    if (std::isnan(f)) return (uint16_t)((u >> 16) | 0x0040);  // keep sign/payload, force quiet-NaN bit
    return (uint16_t)((u + 0x7FFF + ((u >> 16) & 1)) >> 16);   // round-to-nearest-even
}

// Probe the fused Add+RmsNorm kernel: y, rstd, x_out = AddRmsNorm(x1+x2, gamma)
// on tiny [1, 16] BF16 tensors. Returns a static status string.
static const char* test_add_rms_norm(AclRuntime& rt) {
    const int64_t dim = 16;
    std::vector<uint16_t> host_a(dim, f_to_bf16(0.5f));
    std::vector<uint16_t> host_b(dim, f_to_bf16(0.3f));
    std::vector<uint16_t> host_gamma(dim, f_to_bf16(1.0f));

    // Device buffers: BF16 payloads are 2 bytes/elem; rstd is a single float.
    DeviceBuffer d_a(dim*2), d_b(dim*2), d_gamma(dim*2), d_y(dim*2), d_rstd(4), d_xout(dim*2);
    ACL_CHECK(aclrtMemcpy(d_a.get(),     dim*2, host_a.data(),     dim*2, ACL_MEMCPY_HOST_TO_DEVICE));
    ACL_CHECK(aclrtMemcpy(d_b.get(),     dim*2, host_b.data(),     dim*2, ACL_MEMCPY_HOST_TO_DEVICE));
    ACL_CHECK(aclrtMemcpy(d_gamma.get(), dim*2, host_gamma.data(), dim*2, ACL_MEMCPY_HOST_TO_DEVICE));

    auto t_a    = make_contig_tensor(d_a.get(),     ACL_BF16,  {1, dim});
    auto t_b    = make_contig_tensor(d_b.get(),     ACL_BF16,  {1, dim});
    auto t_g    = make_contig_tensor(d_gamma.get(), ACL_BF16,  {dim});
    auto t_y    = make_contig_tensor(d_y.get(),     ACL_BF16,  {1, dim});
    auto t_rstd = make_contig_tensor(d_rstd.get(),  ACL_FLOAT, {1});
    auto t_xout = make_contig_tensor(d_xout.get(),  ACL_BF16,  {1, dim});

    // Two-phase aclnn call: size the workspace, then launch on rt's stream.
    uint64_t ws_size = 0;
    aclOpExecutor* executor = nullptr;
    aclnnStatus st = aclnnAddRmsNormGetWorkspaceSize(t_a.get(), t_b.get(), t_g.get(), 1e-6,
                                                     t_y.get(), t_rstd.get(), t_xout.get(),
                                                     &ws_size, &executor);
    if (st != 0) return "GetWorkspaceSize FAILED";
    DeviceBuffer workspace;
    if (ws_size > 0) workspace.alloc(ws_size);
    st = aclnnAddRmsNorm(workspace.get(), ws_size, executor, rt.stream());
    if (st != 0) return "aclnnAddRmsNorm FAILED (kernel not available on 910?)";
    if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED";
    return "OK";
}

static const char* test_npu_format_cast_nz(AclRuntime& rt) {
    // Probe: can this device cast an ND tensor into FRACTAL_NZ layout at all?
    // Transform a small [16, 16] BF16 tensor from ND to NZ format.
    const int64_t H = 16, W = 16;
    std::vector<uint16_t> h(H * W, f_to_bf16(1.0f));
    DeviceBuffer src(H * W * 2);  // 2 bytes per BF16 element
    ACL_CHECK(aclrtMemcpy(src.get(), H*W*2, h.data(), H*W*2, ACL_MEMCPY_HOST_TO_DEVICE));
    auto tsrc = make_contig_tensor(src.get(), ACL_BF16, {H, W});

    // Step 1: calculate NZ shape
    // The op reports the destination shape (dst_shape / dst_shape_size) and
    // the format it actually selected (actual_fmt) for the requested cast.
    int64_t* dst_shape = nullptr;
    uint64_t dst_shape_size = 0;
    int actual_fmt = 0;
    // NOTE(review): 29 / 27 are presumably the numeric values of
    // ACL_FORMAT_FRACTAL_NZ and ACL_BF16 — confirm against the acl enum headers.
    aclnnStatus s = aclnnNpuFormatCastCalculateSizeAndFormat(
        tsrc.get(), /*dstFormat=*/29 /* FRACTAL_NZ */,
        /*additionalDtype=*/27 /* BF16 */,
        &dst_shape, &dst_shape_size, &actual_fmt);
    if (s != 0) return "CalculateSizeAndFormat FAILED";
    // NOTE(review): dst_shape is allocated by the call above; its free function
    // is not visible here — confirm whether it leaks (harmless in a smoke test).

    // Step 2: alloc dst and call cast
    int64_t total = 1;  // element count of the NZ-shaped destination
    std::vector<int64_t> shape_vec(dst_shape, dst_shape + dst_shape_size);
    for (auto d : shape_vec) total *= d;
    DeviceBuffer dst(total * 2);
    // Empty strides {} — presumably means contiguous; the tensor is tagged
    // with whatever format the calculate step chose.
    auto tdst = make_acl_tensor(dst.get(), ACL_BF16, shape_vec, {}, (aclFormat)actual_fmt);

    uint64_t ws = 0; aclOpExecutor* exec = nullptr;
    s = aclnnNpuFormatCastGetWorkspaceSize(tsrc.get(), tdst.get(), &ws, &exec);
    if (s != 0) return "FormatCast GetWorkspaceSize FAILED";
    DeviceBuffer ws_buf; if (ws > 0) ws_buf.alloc(ws);
    s = aclnnNpuFormatCast(ws_buf.get(), ws, exec, rt.stream());
    if (s != 0) return "aclnnNpuFormatCast FAILED (NZ not supported on 910?)";
    if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED";
    return "OK";
}

// Probe whether aclnnMatmul accepts an NZ-format weight operand:
// x [M,K] (ND) × w [K,N] (FRACTAL_NZ) → y [M,N]. Returns a status string.
static const char* test_matmul_nz(AclRuntime& rt) {
    const int64_t M = 16, K = 32, N = 16;
    std::vector<uint16_t> h_x(M * K, f_to_bf16(0.1f));
    std::vector<uint16_t> h_w(K * N, f_to_bf16(0.1f));
    DeviceBuffer x(M*K*2), w(K*N*2), y(M*N*2);
    ACL_CHECK(aclrtMemcpy(x.get(), M*K*2, h_x.data(), M*K*2, ACL_MEMCPY_HOST_TO_DEVICE));
    ACL_CHECK(aclrtMemcpy(w.get(), K*N*2, h_w.data(), K*N*2, ACL_MEMCPY_HOST_TO_DEVICE));

    auto tx = make_contig_tensor(x.get(), ACL_BF16, {M, K});
    auto tw_nd = make_contig_tensor(w.get(), ACL_BF16, {K, N});

    // Convert W to NZ first (named constants instead of bare 29/27 magic numbers).
    constexpr int kFormatFractalNz = 29;  // aclFormat ACL_FORMAT_FRACTAL_NZ
    constexpr int kDtypeBf16       = 27;  // aclDataType ACL_BF16
    int64_t* dst_shape = nullptr; uint64_t dst_size = 0; int fmt = 0;
    if (aclnnNpuFormatCastCalculateSizeAndFormat(tw_nd.get(), kFormatFractalNz, kDtypeBf16,
                                                 &dst_shape, &dst_size, &fmt) != 0)
        return "calc NZ FAILED";
    // NOTE(review): dst_shape is allocated by the call above; its free function
    // is not visible here — acceptable for a one-shot smoke test.
    int64_t total = 1;
    std::vector<int64_t> sh(dst_shape, dst_shape + dst_size);
    for (auto d : sh) total *= d;
    DeviceBuffer w_nz(total * 2);
    auto tw_nz = make_acl_tensor(w_nz.get(), ACL_BF16, sh, {}, (aclFormat)fmt);

    uint64_t ws = 0; aclOpExecutor* e = nullptr;
    aclnnStatus s = aclnnNpuFormatCastGetWorkspaceSize(tw_nd.get(), tw_nz.get(), &ws, &e);
    if (s != 0) return "NZ cast ws FAILED";
    DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
    if (aclnnNpuFormatCast(wb.get(), ws, e, rt.stream()) != 0) return "NZ cast EXEC FAILED";
    if (aclrtSynchronizeStream(rt.stream()) != 0) return "NZ cast sync FAILED";

    // Now try MatMul with x (ND) × w_nz (NZ).
    auto ty = make_contig_tensor(y.get(), ACL_BF16, {M, N});
    ws = 0; e = nullptr;
    // Fix: the 4th argument of aclnnMatmulGetWorkspaceSize is cubeMathType
    // (0 = KEEP_DTYPE), not a transpose flag — the old /*trans*/ comment was wrong.
    s = aclnnMatmulGetWorkspaceSize(tx.get(), tw_nz.get(), ty.get(), /*cubeMathType=*/0, &ws, &e);
    if (s != 0) return "MatMul NZ GetWorkspaceSize FAILED";
    DeviceBuffer mwb; if (ws > 0) mwb.alloc(ws);
    if (aclnnMatmul(mwb.get(), ws, e, rt.stream()) != 0) return "MatMul NZ EXEC FAILED (MatMul doesn't accept NZ on 910?)";
    if (aclrtSynchronizeStream(rt.stream()) != 0) return "MatMul NZ sync FAILED";
    return "OK";
}

// Verify a SECOND stream can be created and used — the prerequisite for
// compute/communication overlap. rt is unused; kept for a uniform test signature.
static const char* test_multi_stream(AclRuntime& rt) {
    (void)rt;
    aclrtStream s2 = nullptr;
    if (aclrtCreateStream(&s2) != 0) return "aclrtCreateStream FAILED";
    const char* result = "OK";
    // Simple dummy op on s2. The async H2D copy is safe: hx stays alive
    // until the synchronize below completes.
    DeviceBuffer x(16 * 2);
    std::vector<uint16_t> hx(16, 0);
    if (aclrtMemcpyAsync(x.get(), 16*2, hx.data(), 16*2, ACL_MEMCPY_HOST_TO_DEVICE, s2) != 0)
        result = "memcpy on s2 FAILED";
    else if (aclrtSynchronizeStream(s2) != 0)
        result = "sync s2 FAILED";
    // Fix: the stream used to leak on the two failure paths above — destroy
    // it on every path before returning.
    aclrtDestroyStream(s2);
    return result;
}

int main() {
    AclRuntime rt;
    rt.init(0);

    printf("=== 910 op support smoke test ===\n");

    const char* r1 = test_add_rms_norm(rt);
    printf("  aclnnAddRmsNorm (fused Add+RmsNorm):     %s\n", r1);

    const char* r2 = test_npu_format_cast_nz(rt);
    printf("  aclnnNpuFormatCast → FRACTAL_NZ:         %s\n", r2);

    const char* r3 = test_matmul_nz(rt);
    printf("  aclnnMatmul with NZ weight:              %s\n", r3);

    const char* r4 = test_multi_stream(rt);
    printf("  Multi-stream (compute/comm overlap):     %s\n", r4);

    // More candidates
    printf("\n=== Additional 910 op candidates ===\n");

    // InplaceAddRmsNorm
    #include <aclnnop/aclnn_inplace_add_rms_norm.h>
    {
        const int64_t D = 16;
        std::vector<uint16_t> h(D, f_to_bf16(0.5f)), hg(D, f_to_bf16(1.0f));
        DeviceBuffer x1(D*2), x2(D*2), g(D*2), rstd(4);
        ACL_CHECK(aclrtMemcpy(x1.get(), D*2, h.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
        ACL_CHECK(aclrtMemcpy(x2.get(), D*2, h.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
        ACL_CHECK(aclrtMemcpy(g.get(),  D*2, hg.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
        auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D});
        auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D});
        auto tg  = make_contig_tensor(g.get(), ACL_BF16, {D});
        auto tr  = make_contig_tensor(rstd.get(), ACL_FLOAT, {1});
        uint64_t ws = 0; aclOpExecutor* e = nullptr;
        aclnnStatus s = aclnnInplaceAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6,
                                                                tr.get(), &ws, &e);
        printf("  aclnnInplaceAddRmsNorm:  %s\n", s == 0 ? "GetWS OK" : "GetWS FAILED");
        if (s == 0) {
            DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
            s = aclnnInplaceAddRmsNorm(wb.get(), ws, e, rt.stream());
            printf("    exec: %s\n", s == 0 ? "OK" : "FAILED");
        }
    }

    // Test HCCL AllReduce on a separate stream
    printf("  HCCL AllReduce on stream2:  requires TP>1, skipped in this smoke test\n");

    printf("\n=== FINAL Feasibility Summary ===\n");
    printf("  Optimization A (FRACTAL_NZ):       INFEASIBLE (910 不支持)\n");
    printf("  Optimization B (multi-stream):     FEASIBLE\n");
    printf("  Optimization C (Add+RmsNorm):      INFEASIBLE (910 无 kernel)\n");
    return 0;
}