llm_mutil_npu / tests /test_op_support.cpp

Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU

4b9fefd 11 days ago

8.9 kB

	// test_op_support.cpp — smoke test which aclnn ops actually RUN on 910 初代.
	// Just call each candidate op with small tensors; report SUCCESS/FAILURE.
	// Guides optimization feasibility analysis.
	#include "acl_common.h"
	#include "acl_runtime.h"
	#include "aclnn_ops.h"
	#include <acl/acl.h>
	#include <aclnnop/aclnn_add_rms_norm.h>
	#include <aclnnop/aclnn_npu_format_cast.h>
	#include <aclnnop/aclnn_matmul.h>

	#include <cstdio>
	#include <cstring>
	#include <vector>

	static float bf16_to_float(uint16_t x) { uint32_t u = (uint32_t)x << 16; float f; std::memcpy(&f, &u, 4); return f; }
	static uint16_t f_to_bf16(float f) { uint32_t u; std::memcpy(&u, &f, 4); return (uint16_t)((u + 0x7FFF + ((u >> 16) & 1)) >> 16); }

	static const char* test_add_rms_norm(AclRuntime& rt) {
	// Inputs: x1 [1, 16], x2 [1, 16] BF16; gamma [16] BF16
	const int64_t D = 16;
	std::vector<uint16_t> h_x1(D, f_to_bf16(0.5f));
	std::vector<uint16_t> h_x2(D, f_to_bf16(0.3f));
	std::vector<uint16_t> h_gamma(D, f_to_bf16(1.0f));
	DeviceBuffer x1(D2), x2(D2), g(D2), y(D2), rstd(14), x_out(D2);
	ACL_CHECK(aclrtMemcpy(x1.get(), D2, h_x1.data(), D2, ACL_MEMCPY_HOST_TO_DEVICE));
	ACL_CHECK(aclrtMemcpy(x2.get(), D2, h_x2.data(), D2, ACL_MEMCPY_HOST_TO_DEVICE));
	ACL_CHECK(aclrtMemcpy(g.get(), D2, h_gamma.data(), D2, ACL_MEMCPY_HOST_TO_DEVICE));

	auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D});
	auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D});
	auto tg = make_contig_tensor(g.get(), ACL_BF16, {D});
	auto ty = make_contig_tensor(y.get(), ACL_BF16, {1, D});
	auto trs = make_contig_tensor(rstd.get(), ACL_FLOAT, {1});
	auto tout= make_contig_tensor(x_out.get(), ACL_BF16, {1, D});

	uint64_t ws = 0; aclOpExecutor* exec = nullptr;
	aclnnStatus s = aclnnAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6,
	ty.get(), trs.get(), tout.get(), &ws, &exec);
	if (s != 0) return "GetWorkspaceSize FAILED";
	DeviceBuffer ws_buf;
	if (ws > 0) ws_buf.alloc(ws);
	s = aclnnAddRmsNorm(ws_buf.get(), ws, exec, rt.stream());
	if (s != 0) return "aclnnAddRmsNorm FAILED (kernel not available on 910?)";
	if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED";
	return "OK";
	}

	static const char* test_npu_format_cast_nz(AclRuntime& rt) {
	// Transform a small [16, 16] BF16 tensor from ND to NZ format.
	const int64_t H = 16, W = 16;
	std::vector<uint16_t> h(H * W, f_to_bf16(1.0f));
	DeviceBuffer src(H * W * 2);
	ACL_CHECK(aclrtMemcpy(src.get(), HW2, h.data(), HW2, ACL_MEMCPY_HOST_TO_DEVICE));
	auto tsrc = make_contig_tensor(src.get(), ACL_BF16, {H, W});

	// Step 1: calculate NZ shape
	int64_t* dst_shape = nullptr;
	uint64_t dst_shape_size = 0;
	int actual_fmt = 0;
	aclnnStatus s = aclnnNpuFormatCastCalculateSizeAndFormat(
	tsrc.get(), /dstFormat=/29 /* FRACTAL_NZ */,
	/additionalDtype=/27 /* BF16 */,
	&dst_shape, &dst_shape_size, &actual_fmt);
	if (s != 0) return "CalculateSizeAndFormat FAILED";

	// Step 2: alloc dst and call cast
	int64_t total = 1;
	std::vector<int64_t> shape_vec(dst_shape, dst_shape + dst_shape_size);
	for (auto d : shape_vec) total *= d;
	DeviceBuffer dst(total * 2);
	auto tdst = make_acl_tensor(dst.get(), ACL_BF16, shape_vec, {}, (aclFormat)actual_fmt);

	uint64_t ws = 0; aclOpExecutor* exec = nullptr;
	s = aclnnNpuFormatCastGetWorkspaceSize(tsrc.get(), tdst.get(), &ws, &exec);
	if (s != 0) return "FormatCast GetWorkspaceSize FAILED";
	DeviceBuffer ws_buf; if (ws > 0) ws_buf.alloc(ws);
	s = aclnnNpuFormatCast(ws_buf.get(), ws, exec, rt.stream());
	if (s != 0) return "aclnnNpuFormatCast FAILED (NZ not supported on 910?)";
	if (aclrtSynchronizeStream(rt.stream()) != 0) return "sync FAILED";
	return "OK";
	}

	static const char* test_matmul_nz(AclRuntime& rt) {
	// Try a MatMul with NZ-format weight.
	const int64_t M = 16, K = 32, N = 16;
	std::vector<uint16_t> h_x(M * K, f_to_bf16(0.1f));
	std::vector<uint16_t> h_w(K * N, f_to_bf16(0.1f));
	DeviceBuffer x(MK2), w(KN2), y(MN2);
	ACL_CHECK(aclrtMemcpy(x.get(), MK2, h_x.data(), MK2, ACL_MEMCPY_HOST_TO_DEVICE));
	ACL_CHECK(aclrtMemcpy(w.get(), KN2, h_w.data(), KN2, ACL_MEMCPY_HOST_TO_DEVICE));

	auto tx = make_contig_tensor(x.get(), ACL_BF16, {M, K});
	auto tw_nd = make_contig_tensor(w.get(), ACL_BF16, {K, N});

	// Convert W to NZ
	int64_t* dst_shape = nullptr; uint64_t dst_size = 0; int fmt = 0;
	if (aclnnNpuFormatCastCalculateSizeAndFormat(tw_nd.get(), 29, 27, &dst_shape, &dst_size, &fmt) != 0)
	return "calc NZ FAILED";
	int64_t total = 1;
	std::vector<int64_t> sh(dst_shape, dst_shape + dst_size);
	for (auto d : sh) total *= d;
	DeviceBuffer w_nz(total * 2);
	auto tw_nz = make_acl_tensor(w_nz.get(), ACL_BF16, sh, {}, (aclFormat)fmt);

	uint64_t ws = 0; aclOpExecutor* e = nullptr;
	aclnnStatus s = aclnnNpuFormatCastGetWorkspaceSize(tw_nd.get(), tw_nz.get(), &ws, &e);
	if (s != 0) return "NZ cast ws FAILED";
	DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
	if (aclnnNpuFormatCast(wb.get(), ws, e, rt.stream()) != 0) return "NZ cast EXEC FAILED";
	if (aclrtSynchronizeStream(rt.stream()) != 0) return "NZ cast sync FAILED";

	// Now try MatMul with x (ND) × w_nz (NZ)
	auto ty = make_contig_tensor(y.get(), ACL_BF16, {M, N});
	ws = 0; e = nullptr;
	s = aclnnMatmulGetWorkspaceSize(tx.get(), tw_nz.get(), ty.get(), 0 /trans/, &ws, &e);
	if (s != 0) return "MatMul NZ GetWorkspaceSize FAILED";
	DeviceBuffer mwb; if (ws > 0) mwb.alloc(ws);
	if (aclnnMatmul(mwb.get(), ws, e, rt.stream()) != 0) return "MatMul NZ EXEC FAILED (MatMul doesn't accept NZ on 910?)";
	if (aclrtSynchronizeStream(rt.stream()) != 0) return "MatMul NZ sync FAILED";
	return "OK";
	}

	static const char* test_multi_stream(AclRuntime& rt) {
	// Allocate a SECOND stream and check it works.
	aclrtStream s2 = nullptr;
	if (aclrtCreateStream(&s2) != 0) return "aclrtCreateStream FAILED";
	// Simple dummy op on s2
	DeviceBuffer x(16 * 2);
	std::vector<uint16_t> hx(16, 0);
	if (aclrtMemcpyAsync(x.get(), 162, hx.data(), 162, ACL_MEMCPY_HOST_TO_DEVICE, s2) != 0) return "memcpy on s2 FAILED";
	if (aclrtSynchronizeStream(s2) != 0) return "sync s2 FAILED";
	aclrtDestroyStream(s2);
	return "OK";
	}

	int main() {
	AclRuntime rt;
	rt.init(0);

	printf("=== 910 op support smoke test ===\n");

	const char* r1 = test_add_rms_norm(rt);
	printf(" aclnnAddRmsNorm (fused Add+RmsNorm): %s\n", r1);

	const char* r2 = test_npu_format_cast_nz(rt);
	printf(" aclnnNpuFormatCast → FRACTAL_NZ: %s\n", r2);

	const char* r3 = test_matmul_nz(rt);
	printf(" aclnnMatmul with NZ weight: %s\n", r3);

	const char* r4 = test_multi_stream(rt);
	printf(" Multi-stream (compute/comm overlap): %s\n", r4);

	// More candidates
	printf("\n=== Additional 910 op candidates ===\n");

	// InplaceAddRmsNorm
	#include <aclnnop/aclnn_inplace_add_rms_norm.h>
	{
	const int64_t D = 16;
	std::vector<uint16_t> h(D, f_to_bf16(0.5f)), hg(D, f_to_bf16(1.0f));
	DeviceBuffer x1(D2), x2(D2), g(D*2), rstd(4);
	ACL_CHECK(aclrtMemcpy(x1.get(), D2, h.data(), D2, ACL_MEMCPY_HOST_TO_DEVICE));
	ACL_CHECK(aclrtMemcpy(x2.get(), D2, h.data(), D2, ACL_MEMCPY_HOST_TO_DEVICE));
	ACL_CHECK(aclrtMemcpy(g.get(), D2, hg.data(), D2, ACL_MEMCPY_HOST_TO_DEVICE));
	auto tx1 = make_contig_tensor(x1.get(), ACL_BF16, {1, D});
	auto tx2 = make_contig_tensor(x2.get(), ACL_BF16, {1, D});
	auto tg = make_contig_tensor(g.get(), ACL_BF16, {D});
	auto tr = make_contig_tensor(rstd.get(), ACL_FLOAT, {1});
	uint64_t ws = 0; aclOpExecutor* e = nullptr;
	aclnnStatus s = aclnnInplaceAddRmsNormGetWorkspaceSize(tx1.get(), tx2.get(), tg.get(), 1e-6,
	tr.get(), &ws, &e);
	printf(" aclnnInplaceAddRmsNorm: %s\n", s == 0 ? "GetWS OK" : "GetWS FAILED");
	if (s == 0) {
	DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
	s = aclnnInplaceAddRmsNorm(wb.get(), ws, e, rt.stream());
	printf(" exec: %s\n", s == 0 ? "OK" : "FAILED");
	}
	}

	// Test HCCL AllReduce on a separate stream
	printf(" HCCL AllReduce on stream2: requires TP>1, skipped in this smoke test\n");

	printf("\n=== FINAL Feasibility Summary ===\n");
	printf(" Optimization A (FRACTAL_NZ): INFEASIBLE (910 不支持)\n");
	printf(" Optimization B (multi-stream): FEASIBLE\n");
	printf(" Optimization C (Add+RmsNorm): INFEASIBLE (910 无 kernel)\n");
	return 0;
	}