| |
| |
| |
| #include "acl_common.h" |
| #include "acl_runtime.h" |
| #include "aclnn_ops.h" |
| #include <acl/acl.h> |
| #include <aclnnop/aclnn_add_rms_norm.h> |
| #include <aclnnop/aclnn_npu_format_cast.h> |
| #include <aclnnop/aclnn_matmul.h> |
|
|
| #include <cstdio> |
| #include <cstring> |
| #include <vector> |
|
|
| static float bf16_to_float(uint16_t x) { uint32_t u = (uint32_t)x << 16; float f; std::memcpy(&f, &u, 4); return f; } |
| static uint16_t f_to_bf16(float f) { uint32_t u; std::memcpy(&u, &f, 4); return (uint16_t)((u + 0x7FFF + ((u >> 16) & 1)) >> 16); } |
|
|
| static const char* test_add_rms_norm(AclRuntime& rt) { |
| |
| const int64_t D = 16; |
| std::vector<uint16_t> h_x1(D, f_to_bf16(0.5f)); |
| std::vector<uint16_t> h_x2(D, f_to_bf16(0.3f)); |
| std::vector<uint16_t> h_gamma(D, f_to_bf16(1.0f)); |
| DeviceBuffer x1(D*2), x2(D*2), g(D*2), y(D*2), rstd(1*4), x_out(D*2); |
| ACL_CHECK(aclrtMemcpy(x1.get(), D*2, h_x1.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| ACL_CHECK(aclrtMemcpy(x2.get(), D*2, h_x2.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| ACL_CHECK(aclrtMemcpy(g.get(), D*2, h_gamma.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
|
|
| auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D}); |
| auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D}); |
| auto tg = make_contig_tensor(g.get(), ACL_BF16, {D}); |
| auto ty = make_contig_tensor(y.get(), ACL_BF16, {1, D}); |
| auto trs = make_contig_tensor(rstd.get(), ACL_FLOAT, {1}); |
| auto tout= make_contig_tensor(x_out.get(), ACL_BF16, {1, D}); |
|
|
| uint64_t ws = 0; aclOpExecutor* exec = nullptr; |
| aclnnStatus s = aclnnAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6, |
| ty.get(), trs.get(), tout.get(), &ws, &exec); |
| if (s != 0) return "GetWorkspaceSize FAILED"; |
| DeviceBuffer ws_buf; |
| if (ws > 0) ws_buf.alloc(ws); |
| s = aclnnAddRmsNorm(ws_buf.get(), ws, exec, rt.stream()); |
| if (s != 0) return "aclnnAddRmsNorm FAILED (kernel not available on 910?)"; |
| if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED"; |
| return "OK"; |
| } |
|
|
| static const char* test_npu_format_cast_nz(AclRuntime& rt) { |
| |
| const int64_t H = 16, W = 16; |
| std::vector<uint16_t> h(H * W, f_to_bf16(1.0f)); |
| DeviceBuffer src(H * W * 2); |
| ACL_CHECK(aclrtMemcpy(src.get(), H*W*2, h.data(), H*W*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| auto tsrc = make_contig_tensor(src.get(), ACL_BF16, {H, W}); |
|
|
| |
| int64_t* dst_shape = nullptr; |
| uint64_t dst_shape_size = 0; |
| int actual_fmt = 0; |
| aclnnStatus s = aclnnNpuFormatCastCalculateSizeAndFormat( |
| tsrc.get(), 29 , |
| 27 , |
| &dst_shape, &dst_shape_size, &actual_fmt); |
| if (s != 0) return "CalculateSizeAndFormat FAILED"; |
|
|
| |
| int64_t total = 1; |
| std::vector<int64_t> shape_vec(dst_shape, dst_shape + dst_shape_size); |
| for (auto d : shape_vec) total *= d; |
| DeviceBuffer dst(total * 2); |
| auto tdst = make_acl_tensor(dst.get(), ACL_BF16, shape_vec, {}, (aclFormat)actual_fmt); |
|
|
| uint64_t ws = 0; aclOpExecutor* exec = nullptr; |
| s = aclnnNpuFormatCastGetWorkspaceSize(tsrc.get(), tdst.get(), &ws, &exec); |
| if (s != 0) return "FormatCast GetWorkspaceSize FAILED"; |
| DeviceBuffer ws_buf; if (ws > 0) ws_buf.alloc(ws); |
| s = aclnnNpuFormatCast(ws_buf.get(), ws, exec, rt.stream()); |
| if (s != 0) return "aclnnNpuFormatCast FAILED (NZ not supported on 910?)"; |
| if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED"; |
| return "OK"; |
| } |
|
|
| static const char* test_matmul_nz(AclRuntime& rt) { |
| |
| const int64_t M = 16, K = 32, N = 16; |
| std::vector<uint16_t> h_x(M * K, f_to_bf16(0.1f)); |
| std::vector<uint16_t> h_w(K * N, f_to_bf16(0.1f)); |
| DeviceBuffer x(M*K*2), w(K*N*2), y(M*N*2); |
| ACL_CHECK(aclrtMemcpy(x.get(), M*K*2, h_x.data(), M*K*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| ACL_CHECK(aclrtMemcpy(w.get(), K*N*2, h_w.data(), K*N*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
|
|
| auto tx = make_contig_tensor(x.get(), ACL_BF16, {M, K}); |
| auto tw_nd = make_contig_tensor(w.get(), ACL_BF16, {K, N}); |
|
|
| |
| int64_t* dst_shape = nullptr; uint64_t dst_size = 0; int fmt = 0; |
| if (aclnnNpuFormatCastCalculateSizeAndFormat(tw_nd.get(), 29, 27, &dst_shape, &dst_size, &fmt) != 0) |
| return "calc NZ FAILED"; |
| int64_t total = 1; |
| std::vector<int64_t> sh(dst_shape, dst_shape + dst_size); |
| for (auto d : sh) total *= d; |
| DeviceBuffer w_nz(total * 2); |
| auto tw_nz = make_acl_tensor(w_nz.get(), ACL_BF16, sh, {}, (aclFormat)fmt); |
|
|
| uint64_t ws = 0; aclOpExecutor* e = nullptr; |
| aclnnStatus s = aclnnNpuFormatCastGetWorkspaceSize(tw_nd.get(), tw_nz.get(), &ws, &e); |
| if (s != 0) return "NZ cast ws FAILED"; |
| DeviceBuffer wb; if (ws > 0) wb.alloc(ws); |
| if (aclnnNpuFormatCast(wb.get(), ws, e, rt.stream()) != 0) return "NZ cast EXEC FAILED"; |
| if (aclrtSynchronizeStream(rt.stream()) != 0) return "NZ cast sync FAILED"; |
|
|
| |
| auto ty = make_contig_tensor(y.get(), ACL_BF16, {M, N}); |
| ws = 0; e = nullptr; |
| s = aclnnMatmulGetWorkspaceSize(tx.get(), tw_nz.get(), ty.get(), 0 , &ws, &e); |
| if (s != 0) return "MatMul NZ GetWorkspaceSize FAILED"; |
| DeviceBuffer mwb; if (ws > 0) mwb.alloc(ws); |
| if (aclnnMatmul(mwb.get(), ws, e, rt.stream()) != 0) return "MatMul NZ EXEC FAILED (MatMul doesn't accept NZ on 910?)"; |
| if (aclrtSynchronizeStream(rt.stream()) != 0) return "MatMul NZ sync FAILED"; |
| return "OK"; |
| } |
|
|
| static const char* test_multi_stream(AclRuntime& rt) { |
| |
| aclrtStream s2 = nullptr; |
| if (aclrtCreateStream(&s2) != 0) return "aclrtCreateStream FAILED"; |
| |
| DeviceBuffer x(16 * 2); |
| std::vector<uint16_t> hx(16, 0); |
| if (aclrtMemcpyAsync(x.get(), 16*2, hx.data(), 16*2, ACL_MEMCPY_HOST_TO_DEVICE, s2) != 0) return "memcpy on s2 FAILED"; |
| if (aclrtSynchronizeStream(s2) != 0) return "sync s2 FAILED"; |
| aclrtDestroyStream(s2); |
| return "OK"; |
| } |
|
|
| int main() { |
| AclRuntime rt; |
| rt.init(0); |
|
|
| printf("=== 910 op support smoke test ===\n"); |
|
|
| const char* r1 = test_add_rms_norm(rt); |
| printf(" aclnnAddRmsNorm (fused Add+RmsNorm): %s\n", r1); |
|
|
| const char* r2 = test_npu_format_cast_nz(rt); |
| printf(" aclnnNpuFormatCast → FRACTAL_NZ: %s\n", r2); |
|
|
| const char* r3 = test_matmul_nz(rt); |
| printf(" aclnnMatmul with NZ weight: %s\n", r3); |
|
|
| const char* r4 = test_multi_stream(rt); |
| printf(" Multi-stream (compute/comm overlap): %s\n", r4); |
|
|
| |
| printf("\n=== Additional 910 op candidates ===\n"); |
|
|
| |
| #include <aclnnop/aclnn_inplace_add_rms_norm.h> |
| { |
| const int64_t D = 16; |
| std::vector<uint16_t> h(D, f_to_bf16(0.5f)), hg(D, f_to_bf16(1.0f)); |
| DeviceBuffer x1(D*2), x2(D*2), g(D*2), rstd(4); |
| ACL_CHECK(aclrtMemcpy(x1.get(), D*2, h.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| ACL_CHECK(aclrtMemcpy(x2.get(), D*2, h.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| ACL_CHECK(aclrtMemcpy(g.get(), D*2, hg.data(), D*2, ACL_MEMCPY_HOST_TO_DEVICE)); |
| auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D}); |
| auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D}); |
| auto tg = make_contig_tensor(g.get(), ACL_BF16, {D}); |
| auto tr = make_contig_tensor(rstd.get(), ACL_FLOAT, {1}); |
| uint64_t ws = 0; aclOpExecutor* e = nullptr; |
| aclnnStatus s = aclnnInplaceAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6, |
| tr.get(), &ws, &e); |
| printf(" aclnnInplaceAddRmsNorm: %s\n", s == 0 ? "GetWS OK" : "GetWS FAILED"); |
| if (s == 0) { |
| DeviceBuffer wb; if (ws > 0) wb.alloc(ws); |
| s = aclnnInplaceAddRmsNorm(wb.get(), ws, e, rt.stream()); |
| printf(" exec: %s\n", s == 0 ? "OK" : "FAILED"); |
| } |
| } |
|
|
| |
| printf(" HCCL AllReduce on stream2: requires TP>1, skipped in this smoke test\n"); |
|
|
| printf("\n=== FINAL Feasibility Summary ===\n"); |
| printf(" Optimization A (FRACTAL_NZ): INFEASIBLE (910 不支持)\n"); |
| printf(" Optimization B (multi-stream): FEASIBLE\n"); |
| printf(" Optimization C (Add+RmsNorm): INFEASIBLE (910 无 kernel)\n"); |
| return 0; |
| } |
|
|