// test_moe_layer.cpp: Full MoE layer forward (Qwen3-235B layer 0), TP=1.
//
// Pipeline:
// 1. Post-attention RmsNorm (input from attn_data/final_out.bin)
// 2. Router: xn @ W_router.T → logits [S, E]
// 3. TopK softmax → weights [S, K], expert_ids [S, K]
// 4. Host-normalize top_k weights (Qwen3 norm_topk_prob)
// 5. MoeInitRoutingV3 → expanded_x [S*K, D], expanded_row_idx, tokens_per_expert
// 6. GMM gate: expanded_x × gate_exps → [S*K, I]
// 7. GMM up: same → [S*K, I]
// 8. silu(gate) * up → [S*K, I]
// 9. GMM down: act × down_exps → [S*K, D]
// 10. MoeFinalizeRouting (weighted sum) → [S, D] (done manually below: gather + mul + reduce)
// 11. + residual
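//
// Shape legend (values are read from config.json at runtime; the parenthesized
// numbers are what this test exercises): S = tokens in the batch (5),
// D = hidden_size, I = moe_intermediate_size, E = num_experts (128),
// K = num_experts_per_tok, TOTAL = S*K expanded rows.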
#include "acl_common.h"
#include "acl_runtime.h"
#include "aclnn_ops.h"
#include "device_weights.h"
#include "model_config.h"
#include "safetensors_loader.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <tuple>
#include <vector>
static float bf16_to_float(uint16_t x) {
uint32_t u = (uint32_t)x << 16; float f; std::memcpy(&f, &u, 4); return f;
}
static uint16_t float_to_bf16(float x) {
uint32_t u; std::memcpy(&u, &x, 4);
return (uint16_t)((u + 0x7FFF + ((u >> 16) & 1)) >> 16);
}
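// float_to_bf16 implements round-to-nearest-even: adding 0x7FFF carries into the
// kept high half exactly when the discarded low 16 bits exceed half an ULP
// (0x8000), and the extra ((u >> 16) & 1) breaks exact ties toward the even
// mantissa. Worked example: 1.00390625f = 0x3F808000 is an exact tie with kept
// lsb 0, so 0x3F808000 + 0x7FFF + 0 = 0x3F80FFFF and >>16 gives 0x3F80, i.e.
// bf16(1.0). (NaNs are not special-cased; fine for the finite test data here.)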
static std::vector<uint8_t> read_file(const std::string& p) {
std::ifstream f(p, std::ios::binary | std::ios::ate);
if (!f) { fprintf(stderr, "read_file: cannot open %s\n", p.c_str()); return {}; }
size_t s = f.tellg();
f.seekg(0); std::vector<uint8_t> v(s); f.read((char*)v.data(), s); return v;
}
int main() {
const std::string model_dir = "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
const std::string data_dir = "tests/moe_data";
ModelConfig cfg;
if (!cfg.load_from_json(model_dir + "/config.json")) return 1;
cfg.compute_derived(1, 0); // TP=1
const int64_t D = cfg.hidden_size;
const int64_t I = cfg.moe_intermediate_size;
const int64_t E = cfg.num_experts;
const int64_t K = cfg.num_experts_per_tok;
const double eps = cfg.rms_norm_eps;
AclRuntime rt;
rt.init(0);
printf("[dbg] rt init ok\n"); fflush(stdout);
SafetensorsLoader st;
if (!st.open(model_dir)) return 1;
// ---- Load weights ----
printf("Loading layer 0 attention weights (for post_attention_layernorm)...\n");
DeviceWeightsLoader dw(st, cfg);
LayerAttnWeights attn;
if (!dw.load_attention(0, attn)) return 1;
printf("Loading layer 0 MoE weights (128 experts Γ— 3 projections, stacking + permute)...\n"); fflush(stdout);
LayerMoEWeights moe;
if (!dw.load_moe(0, rt.stream(), moe)) return 1;
rt.sync();
printf("[dbg] moe load ok\n"); fflush(stdout);
printf(" router %.1f MB gate_exps %.0f MB up_exps %.0f MB down_exps %.0f MB\n",
moe.router.size / 1e6, moe.gate_exps.size / 1e6, moe.up_exps.size / 1e6, moe.down_exps.size / 1e6);
// ---- Load input & Python reference ----
int S = 5;
auto x_in_host = read_file(data_dir + "/x_in.bin");
auto ref_out_host = read_file(data_dir + "/final_out.bin");
DeviceBuffer x_dev(S * D * 2);
ACL_CHECK(aclrtMemcpy(x_dev.get(), x_in_host.size(), x_in_host.data(), x_in_host.size(), ACL_MEMCPY_HOST_TO_DEVICE));
// Residual snapshot
DeviceBuffer residual_dev(S * D * 2);
ACL_CHECK(aclrtMemcpy(residual_dev.get(), S*D*2, x_dev.get(), S*D*2, ACL_MEMCPY_DEVICE_TO_DEVICE));
printf("[dbg] loaded data and residual ok, TOTAL=%ld\n", S * K); fflush(stdout);
// ---- Step 1: Post-attention RmsNorm ----
DeviceBuffer xn_dev(S * D * 2);
DeviceBuffer rstd_dev(S * 4);
auto t_x = make_contig_tensor(x_dev.get(), ACL_BF16, {S, D});
auto t_xn = make_contig_tensor(xn_dev.get(), ACL_BF16, {S, D});
auto t_ln = make_contig_tensor(attn.post_attention_layernorm.get(), ACL_BF16, {D});
auto t_rstd = make_contig_tensor(rstd_dev.get(), ACL_FLOAT, {S});
rms_norm(rt.stream(), t_x.get(), t_ln.get(), eps, t_xn.get(), t_rstd.get());
rt.sync();
printf("[dbg] rms_norm ok\n"); fflush(stdout);
// ---- Step 2: Router (gate matmul) ----
DeviceBuffer logits_dev(S * E * 2);
auto t_logits = make_contig_tensor(logits_dev.get(), ACL_BF16, {S, E});
// router is [E, D] (HF). logits = xn @ router.T
linear_hf(rt.stream(), t_xn.get(), moe.router.get(), ACL_BF16, E, D, t_logits.get());
rt.sync();
printf("[dbg] router linear ok\n"); fflush(stdout);
// ---- Step 3: TopK softmax ----
DeviceBuffer topk_w_dev(S * K * 2); // BF16
DeviceBuffer topk_idx_dev(S * K * 4); // int32
DeviceBuffer row_idx_dev(S * K * 4); // int32 (from gating op, unused for our routing)
auto t_topk_w = make_contig_tensor(topk_w_dev.get(), ACL_BF16, {S, K});
auto t_topk_idx = make_contig_tensor(topk_idx_dev.get(), ACL_INT32, {S, K});
auto t_row_idx = make_contig_tensor(row_idx_dev.get(), ACL_INT32, {S, K});
moe_gating_topk_softmax(rt.stream(), t_logits.get(), K, t_topk_w.get(), t_topk_idx.get(), t_row_idx.get());
rt.sync();
printf("[dbg] topk_softmax ok\n"); fflush(stdout);
// ---- Step 4: Host-normalize top_k weights (norm_topk_prob=true) ----
std::vector<uint16_t> tw_bf(S * K);
ACL_CHECK(aclrtMemcpy(tw_bf.data(), S*K*2, topk_w_dev.get(), S*K*2, ACL_MEMCPY_DEVICE_TO_HOST));
for (int s = 0; s < S; s++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) sum += bf16_to_float(tw_bf[s*K + k]);
sum += 1e-20f;
for (int k = 0; k < K; k++) {
float v = bf16_to_float(tw_bf[s*K + k]) / sum;
tw_bf[s*K + k] = float_to_bf16(v);
}
}
ACL_CHECK(aclrtMemcpy(topk_w_dev.get(), S*K*2, tw_bf.data(), S*K*2, ACL_MEMCPY_HOST_TO_DEVICE));
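// This mirrors HF Qwen3 MoE with norm_topk_prob=true: the K selected softmax
// weights are rescaled to sum to 1, i.e. w[s,k] <- w[s,k] / sum_j w[s,j].
// (The 1e-20 guard is ours, to avoid 0/0 in degenerate cases.)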
// ---- Step 5: MoE init routing ----
int64_t TOTAL = S * K;
DeviceBuffer expanded_x_dev(TOTAL * D * 2);
DeviceBuffer expanded_row_idx_dev(TOTAL * 4);
DeviceBuffer tokens_per_expert_dev(E * 8);
auto t_ex_x = make_contig_tensor(expanded_x_dev.get(), ACL_BF16, {TOTAL, D});
auto t_ex_ri = make_contig_tensor(expanded_row_idx_dev.get(), ACL_INT32, {TOTAL});
auto t_tpe = make_contig_tensor(tokens_per_expert_dev.get(), ACL_INT64, {E});
moe_init_routing_v3(rt.stream(),
t_xn.get(), t_topk_idx.get(),
E, TOTAL,
t_ex_x.get(), t_ex_ri.get(), t_tpe.get());
rt.sync();
printf("[dbg] moe_init_routing ok\n"); fflush(stdout);
// Convert tokens_per_expert from counts to cumsum (on host) for GMM groupListType=0.
DeviceBuffer tpe_cumsum_dev(E * 8);
{
std::vector<int64_t> h_counts(E), h_cum(E);
ACL_CHECK(aclrtMemcpy(h_counts.data(), E*8, tokens_per_expert_dev.get(), E*8, ACL_MEMCPY_DEVICE_TO_HOST));
int64_t acc = 0;
for (int i = 0; i < E; i++) { acc += h_counts[i]; h_cum[i] = acc; }
ACL_CHECK(aclrtMemcpy(tpe_cumsum_dev.get(), E*8, h_cum.data(), E*8, ACL_MEMCPY_HOST_TO_DEVICE));
}
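// Example with hypothetical counts over 3 experts: counts [2, 0, 3] become the
// cumulative group_list [2, 2, 5]; with groupListType=0, GMM multiplies rows
// [0,2) by expert 0's weight, the empty range [2,2) by expert 1's, and [2,5)
// by expert 2's, so zero-count experts are skipped cleanly.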
auto t_tpe_cum = make_contig_tensor(tpe_cumsum_dev.get(), ACL_INT64, {E});
// ---- Step 6/7: GMM gate and up ----
DeviceBuffer gate_out_dev(TOTAL * I * 2);
DeviceBuffer up_out_dev(TOTAL * I * 2);
auto t_gate_out = make_contig_tensor(gate_out_dev.get(), ACL_BF16, {TOTAL, I});
auto t_up_out = make_contig_tensor(up_out_dev.get(), ACL_BF16, {TOTAL, I});
// gate/up_exps loaded as [E, D, I] row-major
auto t_w_gate = make_contig_tensor(moe.gate_exps.get(), ACL_BF16, {E, D, I});
auto t_w_up = make_contig_tensor(moe.up_exps.get(), ACL_BF16, {E, D, I});
// Use cumsum group_list (groupListType=0): empirically more reliable with many zero-count experts.
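// Each expert slice w[e] is [D, I], already transposed from HF's [I, D] at load
// time, so the GMM computes out[rows_e] = x[rows_e] @ w[e] with no per-call
// transpose (see the gate_exps layout check near the end of this test).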
grouped_matmul_v4(rt.stream(), t_ex_x.get(), t_w_gate.get(), t_tpe_cum.get(), t_gate_out.get(), 0);
rt.sync();
printf("[dbg] gmm gate ok\n"); fflush(stdout);
grouped_matmul_v4(rt.stream(), t_ex_x.get(), t_w_up.get(), t_tpe_cum.get(), t_up_out.get(), 0);
rt.sync();
printf("[dbg] gmm up ok\n"); fflush(stdout);
// ---- Step 8: SwiGLU ----
// act = silu(gate) * up (inplace on gate_out)
silu(rt.stream(), t_gate_out.get(), t_gate_out.get());
rt.sync(); printf("[dbg] silu ok\n"); fflush(stdout);
mul(rt.stream(), t_gate_out.get(), t_up_out.get(), t_gate_out.get());
rt.sync(); printf("[dbg] mul ok\n"); fflush(stdout);
// now gate_out_dev contains the activated intermediate
// ---- Step 9: GMM down ----
DeviceBuffer down_out_dev(TOTAL * D * 2);
auto t_down_out = make_contig_tensor(down_out_dev.get(), ACL_BF16, {TOTAL, D});
auto t_w_down = make_contig_tensor(moe.down_exps.get(), ACL_BF16, {E, I, D});
grouped_matmul_v4(rt.stream(), t_gate_out.get(), t_w_down.get(), t_tpe_cum.get(), t_down_out.get(), 0);
rt.sync();
printf("[dbg] gmm down ok\n"); fflush(stdout);
// ---- Step 10: Device-side manual finalize (replacement for buggy MoeFinalizeRoutingV2) ----
// Compute forward permutation fwd[n*K + k] = p where token n's k-th expert's output is at
// expanded position p. We use tokens_per_expert (cumsum) + topk_idx to resolve this correctly,
// regardless of the exact rowIdxType semantics returned by MoeInitRoutingV3.
DeviceBuffer fwd_dev(TOTAL * 8);
{
std::vector<int64_t> h_tpe2(E);
std::vector<int32_t> h_tidx3(S * K);
ACL_CHECK(aclrtMemcpy(h_tpe2.data(), E*8, tokens_per_expert_dev.get(), E*8, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_tidx3.data(), S*K*4, topk_idx_dev.get(), S*K*4, ACL_MEMCPY_DEVICE_TO_HOST));
// MoeInitRoutingV3 groups token copies by expert, stable within each expert:
// expanded positions 0..tpe[0]-1 belong to expert 0 (tokens picking e=0, in
// n-ascending order), the next tpe[1] positions to expert 1, and so on.
// To build fwd: for each (n, k), form the triple (e, n, k) with e = topk_idx[n, k],
// sort by (e, n) (unique per pair, since a token's top-k experts are distinct);
// a triple's sorted rank is its expanded position p, and fwd[n*K + k] = p.
std::vector<int64_t> fwd(TOTAL);
std::vector<std::tuple<int, int, int>> triples;
triples.reserve(TOTAL);
for (int n = 0; n < S; n++) for (int k = 0; k < K; k++) {
triples.emplace_back(h_tidx3[n * K + k], n, k);
}
std::sort(triples.begin(), triples.end(), [](const auto& a, const auto& b){
if (std::get<0>(a) != std::get<0>(b)) return std::get<0>(a) < std::get<0>(b);
return std::get<1>(a) < std::get<1>(b);
});
for (int64_t p = 0; p < TOTAL; p++) {
auto [e, n, k] = triples[p];
fwd[n * K + k] = p;
}
ACL_CHECK(aclrtMemcpy(fwd_dev.get(), TOTAL*8, fwd.data(), TOTAL*8, ACL_MEMCPY_HOST_TO_DEVICE));
}
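// Tiny worked example of fwd (hypothetical S=2, K=2): topk_idx = [[1, 0], [0, 1]]
// gives triples sorted by (e, n): (0,0,1) p=0, (0,1,0) p=1, (1,0,0) p=2,
// (1,1,1) p=3, hence fwd = [2, 0, 1, 3]: token 0's first pick (expert 1)
// lives at expanded row 2.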
auto t_fwd = make_contig_tensor(fwd_dev.get(), ACL_INT64, {TOTAL});
// Gather: packed [S*K, D] = down_out[fwd, :]
DeviceBuffer packed_dev(TOTAL * D * 2);
auto t_packed = make_contig_tensor(packed_dev.get(), ACL_BF16, {TOTAL, D});
index_select(rt.stream(), t_down_out.get(), 0, t_fwd.get(), t_packed.get());
rt.sync();
// Broadcast-multiply by topk_w: view packed as [S, K, D], topk_w as [S, K, 1].
auto t_packed_3d = make_contig_tensor(packed_dev.get(), ACL_BF16, {S, K, D});
auto t_topk_w_3d = make_contig_tensor(topk_w_dev.get(), ACL_BF16, {S, K, 1});
DeviceBuffer weighted_dev(S * K * D * 2);
auto t_weighted = make_contig_tensor(weighted_dev.get(), ACL_BF16, {S, K, D});
mul(rt.stream(), t_packed_3d.get(), t_topk_w_3d.get(), t_weighted.get());
rt.sync();
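// In formulas: weighted[s,k,d] = topk_w[s,k] * down_out[fwd[s*K+k], d], and the
// ReduceSum below yields moe_out[s,d] = sum_k weighted[s,k,d], i.e. the K-expert
// weighted mixture that MoeFinalizeRouting would produce.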
// Verify broadcast mul + sum by dumping all k entries and summing on host.
{
std::vector<uint16_t> h_pk_all(S * K * D);
std::vector<uint16_t> h_wt_all(S * K * D);
std::vector<uint16_t> h_tw_all(S * K);
ACL_CHECK(aclrtMemcpy(h_pk_all.data(), S*K*D*2, packed_dev.get(), S*K*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_wt_all.data(), S*K*D*2, weighted_dev.get(), S*K*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_tw_all.data(), S*K*2, topk_w_dev.get(), S*K*2, ACL_MEMCPY_DEVICE_TO_HOST));
printf(" verify weighted[0, k, 0] = packed[0, k, 0] * topk_w[0, k] for all k:\n");
float host_sum = 0;
for (int k = 0; k < K; k++) {
float p = bf16_to_float(h_pk_all[k * D]); // packed[0, k, 0] = offset s*K*D + k*D + 0 = k*D (for s=0)
float w = bf16_to_float(h_tw_all[k]); // topk_w[0, k]
float wt = bf16_to_float(h_wt_all[k * D]); // weighted[0, k, 0]
host_sum += p * w;
printf(" k=%d: packed=%.5f * topk_w=%.5f = expect=%.5f dev=%.5f\n",
k, p, w, p*w, wt);
}
printf(" host_sum_of_weighted[0, :, 0] = %.5f (expected moe_out[0,0] = -0.02466)\n", host_sum);
}
// ReduceSum over K axis → [S, D]
DeviceBuffer moe_out_dev(S * D * 2);
auto t_moe_out = make_contig_tensor(moe_out_dev.get(), ACL_BF16, {S, D});
reduce_sum(rt.stream(), t_weighted.get(), {1}, /*keep_dims=*/false, ACL_BF16, t_moe_out.get());
rt.sync();
printf("[dbg] device-side finalize (gather+mul+reduce) ok\n"); fflush(stdout);
// Residual add to produce final_out
float alpha_v = 1.0f; aclScalar* alpha = aclCreateScalar(&alpha_v, ACL_FLOAT);
DeviceBuffer final_dev(S * D * 2);
auto t_final = make_contig_tensor(final_dev.get(), ACL_BF16, {S, D});
auto t_res = make_contig_tensor(residual_dev.get(), ACL_BF16, {S, D});
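// aclnn ops follow a two-phase contract: GetWorkspaceSize builds the executor
// and reports the scratch size, then the op is launched with that workspace.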
{
uint64_t ws = 0; aclOpExecutor* e = nullptr;
ACLNN_CHECK(aclnnAddGetWorkspaceSize(t_res.get(), t_moe_out.get(), alpha, t_final.get(), &ws, &e));
DeviceBuffer wb; if (ws > 0) wb.alloc(ws);
ACLNN_CHECK(aclnnAdd(wb.get(), ws, e, rt.stream()));
}
aclDestroyScalar(alpha);
rt.sync();
// ---- Compare (intermediate + final) ----
auto compare_bf16 = [&](const char* label, void* dev_ptr, int64_t nelem,
const std::string& ref_file) {
std::vector<uint16_t> cxx(nelem);
ACL_CHECK(aclrtMemcpy(cxx.data(), nelem*2, dev_ptr, nelem*2, ACL_MEMCPY_DEVICE_TO_HOST));
auto refbuf = read_file(data_dir + "/" + ref_file);
auto* ref = (const uint16_t*)refbuf.data();
double l2d = 0, l2r = 0, maxd = 0;
for (int64_t i = 0; i < nelem; i++) {
float a = bf16_to_float(cxx[i]), b = bf16_to_float(ref[i]);
l2d += (a-b)*(a-b); l2r += b*b;
if (std::abs(a-b) > maxd) maxd = std::abs(a-b);
}
double rel = std::sqrt(l2d) / (std::sqrt(l2r) + 1e-10);
printf(" [cmp] %-12s rel=%.4e max_abs=%.4f cxx[:4]=%.5f %.5f %.5f %.5f ref[:4]=%.5f %.5f %.5f %.5f\n",
label, rel, maxd,
bf16_to_float(cxx[0]), bf16_to_float(cxx[1]), bf16_to_float(cxx[2]), bf16_to_float(cxx[3]),
bf16_to_float(ref[0]), bf16_to_float(ref[1]), bf16_to_float(ref[2]), bf16_to_float(ref[3]));
return rel;
};
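// rel is the relative L2 error ||cxx - ref||_2 / (||ref||_2 + 1e-10); max_abs is
// the worst elementwise deviation.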
printf("\n=== Intermediate diagnostics ===\n");
compare_bf16("xn", xn_dev.get(), S * D, "xn.bin");
compare_bf16("topk_w", topk_w_dev.get(), S * K, "topk_w.bin");
// Dump topk_idx (int32) to compare
{
std::vector<int32_t> cxx_idx(S*K);
ACL_CHECK(aclrtMemcpy(cxx_idx.data(), S*K*4, topk_idx_dev.get(), S*K*4, ACL_MEMCPY_DEVICE_TO_HOST));
auto refbuf = read_file(data_dir + "/topk_idx.bin");
auto* ref = (const int32_t*)refbuf.data();
int mismatches = 0;
for (int i = 0; i < S*K; i++) if (cxx_idx[i] != ref[i]) mismatches++;
printf(" [cmp] topk_idx mismatches=%d/%d cxx[0,:4]=%d %d %d %d ref[0,:4]=%d %d %d %d\n",
mismatches, S*K,
cxx_idx[0], cxx_idx[1], cxx_idx[2], cxx_idx[3],
ref[0], ref[1], ref[2], ref[3]);
}
printf("\n=== MoE-only (before residual) ===\n");
compare_bf16("moe_out", moe_out_dev.get(), S * D, "out_flat.bin");
// Manual host-side finalize: verify what down_out + expanded_row_idx + topk_w produce.
{
std::vector<uint16_t> h_down(TOTAL * D);
std::vector<int32_t> h_ri(TOTAL);
std::vector<uint16_t> h_tw(S * K);
ACL_CHECK(aclrtMemcpy(h_down.data(), TOTAL*D*2, down_out_dev.get(), TOTAL*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_ri.data(), TOTAL*4, expanded_row_idx_dev.get(), TOTAL*4, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_tw.data(), S*K*2, topk_w_dev.get(), S*K*2, ACL_MEMCPY_DEVICE_TO_HOST));
printf(" expanded_row_idx (all %ld):\n ", TOTAL);
for (int i = 0; i < TOTAL; i++) {
printf("%d ", h_ri[i]);
if ((i+1) % 10 == 0) printf("\n ");
}
printf("\n");
// count unique and check bijection
std::vector<int> count(TOTAL, 0);
int out_of_range = 0;
for (int i = 0; i < TOTAL; i++) {
int v = h_ri[i];
if (v >= 0 && v < TOTAL) count[v]++;
else out_of_range++;
}
int bijection_ok = (out_of_range == 0);
for (int i = 0; i < TOTAL && bijection_ok; i++) if (count[i] != 1) bijection_ok = 0;
printf(" bijection=%s out_of_range=%d\n", bijection_ok ? "YES" : "NO", out_of_range);
// Also dump tokens_per_expert (int64); it should sum to TOTAL
std::vector<int64_t> h_tpe(E);
ACL_CHECK(aclrtMemcpy(h_tpe.data(), E*8, tokens_per_expert_dev.get(), E*8, ACL_MEMCPY_DEVICE_TO_HOST));
int64_t tpe_sum = 0, nonzero = 0;
int64_t tpe_max = 0;
for (int i = 0; i < E; i++) { tpe_sum += h_tpe[i]; if (h_tpe[i]>0) nonzero++; if (h_tpe[i]>tpe_max) tpe_max=h_tpe[i]; }
printf(" tokens_per_expert: sum=%ld nonzero=%ld max=%ld (expected sum=%ld if counts, or last=%ld if cumsum)\n",
tpe_sum, nonzero, tpe_max, TOTAL, TOTAL);
printf(" tpe[last 4]: %ld %ld %ld %ld\n", h_tpe[E-4], h_tpe[E-3], h_tpe[E-2], h_tpe[E-1]);
std::vector<float> manual(S * D, 0.0f);
for (int64_t p = 0; p < TOTAL; p++) {
int32_t src = h_ri[p];
int s = src / K;
int k = src % K;
if (s < 0 || s >= S || k < 0 || k >= K) { printf(" bad idx p=%ld src=%d\n", p, src); continue; }
float w = bf16_to_float(h_tw[s * K + k]);
for (int d = 0; d < D; d++) {
manual[s * D + d] += w * bf16_to_float(h_down[p * D + d]);
}
}
// Convert to bf16 and compare to Python out_flat
auto refbuf = read_file(data_dir + "/out_flat.bin");
auto* ref = (const uint16_t*)refbuf.data();
double l2d=0, l2r=0, maxd=0;
for (int64_t i = 0; i < S*D; i++) {
float a = manual[i], b = bf16_to_float(ref[i]);
l2d += (a-b)*(a-b); l2r += b*b;
if (std::abs(a-b) > maxd) maxd = std::abs(a-b);
}
double rel_manual = std::sqrt(l2d) / (std::sqrt(l2r) + 1e-10);
printf(" [cmp] MANUAL(row_idx=src→flat) rel=%.4e max_abs=%.4f m[:4]=%.5f %.5f %.5f %.5f r[:4]=%.5f %.5f %.5f %.5f\n",
rel_manual, maxd,
manual[0], manual[1], manual[2], manual[3],
bf16_to_float(ref[0]), bf16_to_float(ref[1]), bf16_to_float(ref[2]), bf16_to_float(ref[3]));
// Alternative semantic: row_idx[p] = destination position
// In that case: p=src_row, dst=h_ri[p]
std::vector<float> manual2(S * D, 0.0f);
for (int64_t p = 0; p < TOTAL; p++) {
int32_t dst = h_ri[p];
int s = dst / K;
int k = dst % K;
if (s < 0 || s >= S || k < 0 || k >= K) continue;
float w = bf16_to_float(h_tw[s * K + k]);
for (int d = 0; d < D; d++) {
manual2[s * D + d] += w * bf16_to_float(h_down[p * D + d]);
}
}
double l2d2=0, l2r2=0, maxd2=0;
for (int64_t i = 0; i < S*D; i++) {
float a = manual2[i], b = bf16_to_float(ref[i]);
l2d2 += (a-b)*(a-b); l2r2 += b*b;
if (std::abs(a-b) > maxd2) maxd2 = std::abs(a-b);
}
double rel_manual2 = std::sqrt(l2d2) / (std::sqrt(l2r2) + 1e-10);
printf(" [cmp] MANUAL(row_idx=p→dst_flat) rel=%.4e max_abs=%.4f m[:4]=%.5f %.5f %.5f %.5f\n",
rel_manual2, maxd2,
manual2[0], manual2[1], manual2[2], manual2[3]);
}
// Manual finalize using cumsum (semantics-independent):
// For each (n, k), find p such that actual_s(p)=n AND expert(p)=topk_idx[n,k], then
// out[n] += topk_w[n,k] * down_out[p].
{
std::vector<uint16_t> h_down(TOTAL * D);
std::vector<int64_t> h_tpe(E);
std::vector<int32_t> h_tidx(S * K);
std::vector<uint16_t> h_tw(S * K);
std::vector<uint16_t> h_xn_all(S * D);
std::vector<uint16_t> h_ex_all(TOTAL * D);
ACL_CHECK(aclrtMemcpy(h_down.data(), TOTAL*D*2, down_out_dev.get(), TOTAL*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_tpe.data(), E*8, tokens_per_expert_dev.get(), E*8, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_tidx.data(), S*K*4, topk_idx_dev.get(), S*K*4, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_tw.data(), S*K*2, topk_w_dev.get(), S*K*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_xn_all.data(), S*D*2, xn_dev.get(), S*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_ex_all.data(), TOTAL*D*2, expanded_x_dev.get(), TOTAL*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
// Build p → (actual_s, actual_expert).
// actual_s: find s with xn[s,0] == expanded_x[p,0]
// actual_expert: find e such that cumsum_tpe[e-1] <= p < cumsum_tpe[e]
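// NOTE: matching on a single leading bf16 element is a heuristic; it can
// mis-assign p_to_s if two tokens share the same xn[s, 0] value. Acceptable
// for this 5-token fixture, not for general inputs.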
std::vector<int> p_to_s(TOTAL), p_to_e(TOTAL);
int64_t cum = 0;
int cursor_e = 0;
for (int64_t p = 0; p < TOTAL; p++) {
while (cursor_e < E && p >= cum + h_tpe[cursor_e]) { cum += h_tpe[cursor_e]; cursor_e++; }
p_to_e[p] = cursor_e;
float ev = bf16_to_float(h_ex_all[p * D]);
int best = -1; float bd = 1e30f;
for (int s = 0; s < S; s++) {
float df = std::abs(bf16_to_float(h_xn_all[s * D]) - ev);
if (df < bd) { bd = df; best = s; }
}
p_to_s[p] = best;
}
// Build (n, k) → p lookup via (n, expert) → p
std::vector<float> manual_cum(S * D, 0.0f);
int found_count = 0;
for (int n = 0; n < S; n++) {
for (int k = 0; k < K; k++) {
int e = h_tidx[n * K + k];
float w = bf16_to_float(h_tw[n * K + k]);
// search p with p_to_s[p]==n and p_to_e[p]==e
int found_p = -1;
for (int64_t p = 0; p < TOTAL; p++) {
if (p_to_s[p] == n && p_to_e[p] == e) { found_p = p; break; }
}
if (found_p < 0) {
printf(" [!!!] not found: n=%d k=%d expert=%d\n", n, k, e);
continue;
}
found_count++;
for (int d = 0; d < D; d++)
manual_cum[n * D + d] += w * bf16_to_float(h_down[found_p * D + d]);
}
}
auto refbuf = read_file(data_dir + "/out_flat.bin");
auto* ref = (const uint16_t*)refbuf.data();
double l2d=0, l2r=0, maxd=0;
for (int64_t i = 0; i < S*D; i++) {
float a = manual_cum[i], b = bf16_to_float(ref[i]);
l2d += (a-b)*(a-b); l2r += b*b;
if (std::abs(a-b) > maxd) maxd = std::abs(a-b);
}
double rel_cum = std::sqrt(l2d) / (std::sqrt(l2r) + 1e-10);
printf(" [cmp] MANUAL_CUMSUM (p via expert cumsum) rel=%.4e max=%.4f found=%d/40 m[:4]=%.5f %.5f %.5f %.5f\n",
rel_cum, maxd, found_count, manual_cum[0], manual_cum[1], manual_cum[2], manual_cum[3]);
}
// Dump all expanded_x[p, 0] and all xn[s, 0] to determine the mapping.
{
std::vector<uint16_t> h_xn_all(S * D);
ACL_CHECK(aclrtMemcpy(h_xn_all.data(), S*D*2, xn_dev.get(), S*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
std::vector<uint16_t> h_ex_all(TOTAL * D);
ACL_CHECK(aclrtMemcpy(h_ex_all.data(), TOTAL*D*2, expanded_x_dev.get(), TOTAL*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
printf(" xn[s, 0]: ");
for (int s = 0; s < S; s++) printf("%.5f ", bf16_to_float(h_xn_all[s * D]));
printf("\n expanded_x[p, 0]: ");
for (int p = 0; p < TOTAL; p++) printf("%.5f ", bf16_to_float(h_ex_all[p * D]));
printf("\n mapping p→s (by matching expanded_x[p,0] to xn[s,0]): ");
for (int p = 0; p < TOTAL; p++) {
float e = bf16_to_float(h_ex_all[p * D]);
int match = -1; float best = 1e30f;
for (int s = 0; s < S; s++) {
float df = std::abs(bf16_to_float(h_xn_all[s * D]) - e);
if (df < best) { best = df; match = s; }
}
printf("%d ", match);
}
printf("\n");
}
// Spot-check down_out[p=4, :4], the down projection of xn[0] through expert 10.
// (The raw gate activation is unavailable: gate_out_dev was overwritten in place
// by silu+mul, so we inspect the down output instead.)
{
std::vector<uint16_t> h_d(D);
ACL_CHECK(aclrtMemcpy(h_d.data(), D*2, (char*)down_out_dev.get() + 4*D*2, D*2, ACL_MEMCPY_DEVICE_TO_HOST));
printf(" down_out[p=4, :4] (s=0, k=0, expert=10): %.5f %.5f %.5f %.5f\n",
bf16_to_float(h_d[0]), bf16_to_float(h_d[1]), bf16_to_float(h_d[2]), bf16_to_float(h_d[3]));
// If GMM is correct, down_out[4] ~ ref[0] / topk_w[0,0]. ref[0,:4]=[-0.025, -0.007, 0.005, -0.008] / 0.224 ~ [-0.113, -0.031, 0.024, -0.036].
// But it's just ONE contribution so hard to compare directly.
}
// Single-expert verification using linear_hf: compute gate/up/down for (xn[0], expert=10)
// and compare with GMM's down_out at the corresponding position.
// linear_hf expects HF-layout weight [out_features, in_features]; our stacked gate_exps/up_exps
// are [E, D, I], i.e. per-expert shape [D, I] (K, N), NOT HF [I, D]. So we cannot directly
// linear_hf from gate_exps. Instead, load the expert-10 weight fresh and use linear_hf.
{
std::vector<int32_t> h_tidx_local(S * K);
ACL_CHECK(aclrtMemcpy(h_tidx_local.data(), S*K*4, topk_idx_dev.get(), S*K*4, ACL_MEMCPY_DEVICE_TO_HOST));
int target_expert = h_tidx_local[0 * K + 0]; // topk_idx[0, 0] should be 10 from Python ref
printf("\n === Single-expert linear_hf vs GMM sanity (token 0, expert %d) ===\n", target_expert);
// Recompute p_to_s and p_to_e from host data (scoped locally).
std::vector<int64_t> h_tpe2(E);
std::vector<uint16_t> h_xn_all2(S * D);
std::vector<uint16_t> h_ex_all2(TOTAL * D);
ACL_CHECK(aclrtMemcpy(h_tpe2.data(), E*8, tokens_per_expert_dev.get(), E*8, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_xn_all2.data(), S*D*2, xn_dev.get(), S*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
ACL_CHECK(aclrtMemcpy(h_ex_all2.data(), TOTAL*D*2, expanded_x_dev.get(), TOTAL*D*2, ACL_MEMCPY_DEVICE_TO_HOST));
std::vector<int> p_to_s(TOTAL), p_to_e(TOTAL);
{
int64_t cum = 0; int ce = 0;
for (int64_t p = 0; p < TOTAL; p++) {
while (ce < E && p >= cum + h_tpe2[ce]) { cum += h_tpe2[ce]; ce++; }
p_to_e[p] = ce;
float ev = bf16_to_float(h_ex_all2[p * D]);
int best = -1; float bd = 1e30f;
for (int s = 0; s < S; s++) {
float df = std::abs(bf16_to_float(h_xn_all2[s * D]) - ev);
if (df < bd) { bd = df; best = s; }
}
p_to_s[p] = best;
}
}
DeviceBuffer g_w, u_w, d_w;
char ename[256];
snprintf(ename, sizeof(ename), "model.layers.0.mlp.experts.%d.gate_proj.weight", target_expert);
if (!dw.st().get(ename)) { printf(" missing %s\n", ename); goto after_sanity; }
// Load the full per-expert weight straight from the safetensors loader,
// bypassing the stacking/permute path used for moe.gate_exps above.
{
auto* m_gate = dw.st().get(ename);
DeviceBuffer gw_buf(m_gate->nbytes);
ACL_CHECK(aclrtMemcpy(gw_buf.get(), m_gate->nbytes, dw.st().data_ptr(*m_gate), m_gate->nbytes, ACL_MEMCPY_HOST_TO_DEVICE));
g_w = std::move(gw_buf);
snprintf(ename, sizeof(ename), "model.layers.0.mlp.experts.%d.up_proj.weight", target_expert);
auto* m_up = dw.st().get(ename);
DeviceBuffer uw_buf(m_up->nbytes);
ACL_CHECK(aclrtMemcpy(uw_buf.get(), m_up->nbytes, dw.st().data_ptr(*m_up), m_up->nbytes, ACL_MEMCPY_HOST_TO_DEVICE));
u_w = std::move(uw_buf);
snprintf(ename, sizeof(ename), "model.layers.0.mlp.experts.%d.down_proj.weight", target_expert);
auto* m_down = dw.st().get(ename);
DeviceBuffer dw_buf(m_down->nbytes);
ACL_CHECK(aclrtMemcpy(dw_buf.get(), m_down->nbytes, dw.st().data_ptr(*m_down), m_down->nbytes, ACL_MEMCPY_HOST_TO_DEVICE));
d_w = std::move(dw_buf);
}
// Compute gate = xn[0] @ gate_w.T → [I]; up = xn[0] @ up_w.T → [I]; act; down = act @ down_w.T → [D]
DeviceBuffer xn0_dev(D * 2);
ACL_CHECK(aclrtMemcpy(xn0_dev.get(), D*2, xn_dev.get(), D*2, ACL_MEMCPY_DEVICE_TO_DEVICE));
DeviceBuffer gate_v(I * 2), up_v(I * 2), act_v(I * 2), down_v(D * 2);
auto t_xn0 = make_contig_tensor(xn0_dev.get(), ACL_BF16, {1, D});
auto t_gate = make_contig_tensor(gate_v.get(), ACL_BF16, {1, I});
auto t_up = make_contig_tensor(up_v.get(), ACL_BF16, {1, I});
auto t_act = make_contig_tensor(act_v.get(), ACL_BF16, {1, I});
auto t_down = make_contig_tensor(down_v.get(), ACL_BF16, {1, D});
linear_hf(rt.stream(), t_xn0.get(), g_w.get(), ACL_BF16, I, D, t_gate.get()); // gate_proj HF [I, D]
linear_hf(rt.stream(), t_xn0.get(), u_w.get(), ACL_BF16, I, D, t_up.get());
rt.sync();
silu(rt.stream(), t_gate.get(), t_act.get());
mul(rt.stream(), t_act.get(), t_up.get(), t_act.get());
rt.sync();
linear_hf(rt.stream(), t_act.get(), d_w.get(), ACL_BF16, D, I, t_down.get()); // down_proj HF [D, I]
rt.sync();
std::vector<uint16_t> h_down_lin(D);
ACL_CHECK(aclrtMemcpy(h_down_lin.data(), D*2, down_v.get(), D*2, ACL_MEMCPY_DEVICE_TO_HOST));
// Find the p in GMM output that corresponds to (s=0, expert=target_expert)
int found_p = -1;
for (int64_t p = 0; p < TOTAL; p++) {
if (p_to_s[p] == 0 && p_to_e[p] == target_expert) { found_p = p; break; }
}
if (found_p >= 0) {
std::vector<uint16_t> h_down_gmm(D);
ACL_CHECK(aclrtMemcpy(h_down_gmm.data(), D*2, (char*)down_out_dev.get() + found_p*D*2, D*2, ACL_MEMCPY_DEVICE_TO_HOST));
double l2d=0, l2r=0, maxd=0;
for (int i = 0; i < D; i++) {
float a = bf16_to_float(h_down_gmm[i]), b = bf16_to_float(h_down_lin[i]);
l2d += (a-b)*(a-b); l2r += b*b;
if (std::abs(a-b) > maxd) maxd = std::abs(a-b);
}
double rel = std::sqrt(l2d) / (std::sqrt(l2r) + 1e-10);
printf(" GMM down_out[p=%d] vs linear_hf down: rel=%.4e max=%.4f\n", found_p, rel, maxd);
printf(" GMM[:4]: %.5f %.5f %.5f %.5f\n",
bf16_to_float(h_down_gmm[0]), bf16_to_float(h_down_gmm[1]), bf16_to_float(h_down_gmm[2]), bf16_to_float(h_down_gmm[3]));
printf(" linear[:4]: %.5f %.5f %.5f %.5f\n",
bf16_to_float(h_down_lin[0]), bf16_to_float(h_down_lin[1]), bf16_to_float(h_down_lin[2]), bf16_to_float(h_down_lin[3]));
} else {
printf(" not found p for (s=0, expert=%d)\n", target_expert);
}
}
after_sanity:;
// Direct verification: gate_exps[expert_10, :4, :4] vs HF gate_proj_10 (transposed).
{
int expert_id = 10;
std::vector<uint16_t> h_stacked(4 * 4);
// gate_exps shape [E, D, I]. Expert 10 starts at offset expert_id * D * I * 2.
// Read the first 4 rows (d=0..3), first 4 cols (i=0..3). Row stride = I * 2 bytes.
for (int d = 0; d < 4; d++) {
ACL_CHECK(aclrtMemcpy(h_stacked.data() + d*4, 8,
(char*)moe.gate_exps.get() + (expert_id * D * I + d * I) * 2, 8,
ACL_MEMCPY_DEVICE_TO_HOST));
}
char ename[256];
snprintf(ename, sizeof(ename), "model.layers.0.mlp.experts.%d.gate_proj.weight", expert_id);
auto* m = dw.st().get(ename);
// HF gate_proj [I, D] row-major. Element at (i, d) is at offset (i*D + d)*2.
// Expected gate_exps[10, d, i] == HF_gate_proj[10][i, d].
// So for d in 0..3, i in 0..3: expected is HF[i, d].
std::vector<uint16_t> h_expected(4 * 4);
auto* hf = (const uint16_t*)dw.st().data_ptr(*m);
for (int d = 0; d < 4; d++) {
for (int i = 0; i < 4; i++) {
h_expected[d*4 + i] = hf[i * D + d]; // HF[i, d]
}
}
printf("\n === gate_exps[10, :4, :4] layout check ===\n");
printf(" stacked: ");
for (int i = 0; i < 16; i++) printf("%.5f ", bf16_to_float(h_stacked[i]));
printf("\n expected: ");
for (int i = 0; i < 16; i++) printf("%.5f ", bf16_to_float(h_expected[i]));
printf("\n");
int mism = 0;
for (int i = 0; i < 16; i++) if (h_stacked[i] != h_expected[i]) mism++;
printf(" mismatches: %d / 16\n", mism);
}
printf("\n=== Final (with residual) ===\n");
double rel = compare_bf16("final_out", final_dev.get(), S * D, "final_out.bin");
bool pass = rel < 5e-2;
printf("\n%s\n", pass ? "=== test_moe_layer PASS ===" : "=== test_moe_layer FAIL ===");
return pass ? 0 : 1;
}