File size: 8,896 Bytes
4b9fefd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | // test_op_support.cpp — smoke test which aclnn ops actually RUN on 910 初代.
// Just call each candidate op with small tensors; report SUCCESS/FAILURE.
// Guides optimization feasibility analysis.
#include "acl_common.h"
#include "acl_runtime.h"
#include "aclnn_ops.h"
#include <acl/acl.h>
#include <aclnnop/aclnn_add_rms_norm.h>
#include <aclnnop/aclnn_npu_format_cast.h>
#include <aclnnop/aclnn_matmul.h>
#include <cstdio>
#include <cstring>
#include <vector>
static float bf16_to_float(uint16_t x) { uint32_t u = (uint32_t)x << 16; float f; std::memcpy(&f, &u, 4); return f; }
static uint16_t f_to_bf16(float f) { uint32_t u; std::memcpy(&u, &f, 4); return (uint16_t)((u + 0x7FFF + ((u >> 16) & 1)) >> 16); }
static const char* test_add_rms_norm(AclRuntime& rt) {
// Inputs: x1 [1, 16], x2 [1, 16] BF16; gamma [16] BF16
const int64_t D = 16;
std::vector<uint16_t> h_x1(D, f_to_bf16(0.5f));
std::vector<uint16_t> h_x2(D, f_to_bf16(0.3f));
std::vector<uint16_t> h_gamma(D, f_to_bf16(1.0f));
DeviceBuffer x1(D*2), x2(D*2), g(D*2), y(D*2), rstd(1*4), x_out(D*2);
ACL_CHECK(aclrtMemcpy(x1.get(), D*2, h_x1.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy(x2.get(), D*2, h_x2.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy(g.get(), D*2, h_gamma.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D});
auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D});
auto tg = make_contig_tensor(g.get(), ACL_BF16, {D});
auto ty = make_contig_tensor(y.get(), ACL_BF16, {1, D});
auto trs = make_contig_tensor(rstd.get(), ACL_FLOAT, {1});
auto tout= make_contig_tensor(x_out.get(), ACL_BF16, {1, D});
uint64_t ws = 0; aclOpExecutor* exec = nullptr;
aclnnStatus s = aclnnAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6,
ty.get(), trs.get(), tout.get(), &ws, &exec);
if (s != 0) return "GetWorkspaceSize FAILED";
DeviceBuffer ws_buf;
if (ws > 0) ws_buf.alloc(ws);
s = aclnnAddRmsNorm(ws_buf.get(), ws, exec, rt.stream());
if (s != 0) return "aclnnAddRmsNorm FAILED (kernel not available on 910?)";
if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED";
return "OK";
}
static const char* test_npu_format_cast_nz(AclRuntime& rt) {
// Transform a small [16, 16] BF16 tensor from ND to NZ format.
const int64_t H = 16, W = 16;
std::vector<uint16_t> h(H * W, f_to_bf16(1.0f));
DeviceBuffer src(H * W * 2);
ACL_CHECK(aclrtMemcpy(src.get(), H*W*2, h.data(), H*W*2, ACL_MEMCPY_HOST_TO_DEVICE));
auto tsrc = make_contig_tensor(src.get(), ACL_BF16, {H, W});
// Step 1: calculate NZ shape
int64_t* dst_shape = nullptr;
uint64_t dst_shape_size = 0;
int actual_fmt = 0;
aclnnStatus s = aclnnNpuFormatCastCalculateSizeAndFormat(
tsrc.get(), /*dstFormat=*/29 /* FRACTAL_NZ */,
/*additionalDtype=*/27 /* BF16 */,
&dst_shape, &dst_shape_size, &actual_fmt);
if (s != 0) return "CalculateSizeAndFormat FAILED";
// Step 2: alloc dst and call cast
int64_t total = 1;
std::vector<int64_t> shape_vec(dst_shape, dst_shape + dst_shape_size);
for (auto d : shape_vec) total *= d;
DeviceBuffer dst(total * 2);
auto tdst = make_acl_tensor(dst.get(), ACL_BF16, shape_vec, {}, (aclFormat)actual_fmt);
uint64_t ws = 0; aclOpExecutor* exec = nullptr;
s = aclnnNpuFormatCastGetWorkspaceSize(tsrc.get(), tdst.get(), &ws, &exec);
if (s != 0) return "FormatCast GetWorkspaceSize FAILED";
DeviceBuffer ws_buf; if (ws > 0) ws_buf.alloc(ws);
s = aclnnNpuFormatCast(ws_buf.get(), ws, exec, rt.stream());
if (s != 0) return "aclnnNpuFormatCast FAILED (NZ not supported on 910?)";
if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED";
return "OK";
}
static const char* test_matmul_nz(AclRuntime& rt) {
// Try a MatMul with NZ-format weight.
const int64_t M = 16, K = 32, N = 16;
std::vector<uint16_t> h_x(M * K, f_to_bf16(0.1f));
std::vector<uint16_t> h_w(K * N, f_to_bf16(0.1f));
DeviceBuffer x(M*K*2), w(K*N*2), y(M*N*2);
ACL_CHECK(aclrtMemcpy(x.get(), M*K*2, h_x.data(), M*K*2, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy(w.get(), K*N*2, h_w.data(), K*N*2, ACL_MEMCPY_HOST_TO_DEVICE));
auto tx = make_contig_tensor(x.get(), ACL_BF16, {M, K});
auto tw_nd = make_contig_tensor(w.get(), ACL_BF16, {K, N});
// Convert W to NZ
int64_t* dst_shape = nullptr; uint64_t dst_size = 0; int fmt = 0;
if (aclnnNpuFormatCastCalculateSizeAndFormat(tw_nd.get(), 29, 27, &dst_shape, &dst_size, &fmt) != 0)
return "calc NZ FAILED";
int64_t total = 1;
std::vector<int64_t> sh(dst_shape, dst_shape + dst_size);
for (auto d : sh) total *= d;
DeviceBuffer w_nz(total * 2);
auto tw_nz = make_acl_tensor(w_nz.get(), ACL_BF16, sh, {}, (aclFormat)fmt);
uint64_t ws = 0; aclOpExecutor* e = nullptr;
aclnnStatus s = aclnnNpuFormatCastGetWorkspaceSize(tw_nd.get(), tw_nz.get(), &ws, &e);
if (s != 0) return "NZ cast ws FAILED";
DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
if (aclnnNpuFormatCast(wb.get(), ws, e, rt.stream()) != 0) return "NZ cast EXEC FAILED";
if (aclrtSynchronizeStream(rt.stream()) != 0) return "NZ cast sync FAILED";
// Now try MatMul with x (ND) × w_nz (NZ)
auto ty = make_contig_tensor(y.get(), ACL_BF16, {M, N});
ws = 0; e = nullptr;
s = aclnnMatmulGetWorkspaceSize(tx.get(), tw_nz.get(), ty.get(), 0 /*trans*/, &ws, &e);
if (s != 0) return "MatMul NZ GetWorkspaceSize FAILED";
DeviceBuffer mwb; if (ws > 0) mwb.alloc(ws);
if (aclnnMatmul(mwb.get(), ws, e, rt.stream()) != 0) return "MatMul NZ EXEC FAILED (MatMul doesn't accept NZ on 910?)";
if (aclrtSynchronizeStream(rt.stream()) != 0) return "MatMul NZ sync FAILED";
return "OK";
}
static const char* test_multi_stream(AclRuntime& rt) {
// Allocate a SECOND stream and check it works.
aclrtStream s2 = nullptr;
if (aclrtCreateStream(&s2) != 0) return "aclrtCreateStream FAILED";
// Simple dummy op on s2
DeviceBuffer x(16 * 2);
std::vector<uint16_t> hx(16, 0);
if (aclrtMemcpyAsync(x.get(), 16*2, hx.data(), 16*2, ACL_MEMCPY_HOST_TO_DEVICE, s2) != 0) return "memcpy on s2 FAILED";
if (aclrtSynchronizeStream(s2) != 0) return "sync s2 FAILED";
aclrtDestroyStream(s2);
return "OK";
}
int main() {
AclRuntime rt;
rt.init(0);
printf("=== 910 op support smoke test ===\n");
const char* r1 = test_add_rms_norm(rt);
printf(" aclnnAddRmsNorm (fused Add+RmsNorm): %s\n", r1);
const char* r2 = test_npu_format_cast_nz(rt);
printf(" aclnnNpuFormatCast → FRACTAL_NZ: %s\n", r2);
const char* r3 = test_matmul_nz(rt);
printf(" aclnnMatmul with NZ weight: %s\n", r3);
const char* r4 = test_multi_stream(rt);
printf(" Multi-stream (compute/comm overlap): %s\n", r4);
// More candidates
printf("\n=== Additional 910 op candidates ===\n");
// InplaceAddRmsNorm
#include <aclnnop/aclnn_inplace_add_rms_norm.h>
{
const int64_t D = 16;
std::vector<uint16_t> h(D, f_to_bf16(0.5f)), hg(D, f_to_bf16(1.0f));
DeviceBuffer x1(D*2), x2(D*2), g(D*2), rstd(4);
ACL_CHECK(aclrtMemcpy(x1.get(), D*2, h.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy(x2.get(), D*2, h.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy(g.get(), D*2, hg.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE));
auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D});
auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D});
auto tg = make_contig_tensor(g.get(), ACL_BF16, {D});
auto tr = make_contig_tensor(rstd.get(), ACL_FLOAT, {1});
uint64_t ws = 0; aclOpExecutor* e = nullptr;
aclnnStatus s = aclnnInplaceAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6,
tr.get(), &ws, &e);
printf(" aclnnInplaceAddRmsNorm: %s\n", s == 0 ? "GetWS OK" : "GetWS FAILED");
if (s == 0) {
DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
s = aclnnInplaceAddRmsNorm(wb.get(), ws, e, rt.stream());
printf(" exec: %s\n", s == 0 ? "OK" : "FAILED");
}
}
// Test HCCL AllReduce on a separate stream
printf(" HCCL AllReduce on stream2: requires TP>1, skipped in this smoke test\n");
printf("\n=== FINAL Feasibility Summary ===\n");
printf(" Optimization A (FRACTAL_NZ): INFEASIBLE (910 不支持)\n");
printf(" Optimization B (multi-stream): FEASIBLE\n");
printf(" Optimization C (Add+RmsNorm): INFEASIBLE (910 无 kernel)\n");
return 0;
}
|