// llm_mutil_npu/src/device_weights.cpp
// Initial C++ aclnn eager-mode inference for Qwen3-235B-A22B MoE on Ascend 910 x 16 NPUs.
#include "device_weights.h"
#include "aclnn_ops.h"
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
bool DeviceWeightsLoader::load_tensor_full_(const std::string& name, DeviceBuffer& buf) {
const auto* m = st_.get(name);
if (!m) { fprintf(stderr, "load_tensor_full_: missing %s\n", name.c_str()); return false; }
const void* host = st_.data_ptr(*m);
if (!host) { fprintf(stderr, "load_tensor_full_: null host ptr %s\n", name.c_str()); return false; }
buf.alloc(m->nbytes);
ACL_CHECK(aclrtMemcpy(buf.get(), m->nbytes, host, m->nbytes, ACL_MEMCPY_HOST_TO_DEVICE));
return true;
}
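// Note: aclrtMemcpy (the stream-less variant) is ACL's synchronous copy, so every
// host-to-device transfer in this file blocks until the bytes have landed on the
// device; the only asynchronous work is the permute launched in load_moe below.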
bool DeviceWeightsLoader::load_tensor_row_slice_(const std::string& name,
int64_t row_lo, int64_t row_hi,
DeviceBuffer& buf) {
const auto* m = st_.get(name);
if (!m) { fprintf(stderr, "load_tensor_row_slice_: missing %s\n", name.c_str()); return false; }
if (m->shape.empty()) { fprintf(stderr, "%s: empty shape\n", name.c_str()); return false; }
int64_t D0 = m->shape[0];
if (row_hi > D0 || row_lo < 0 || row_hi <= row_lo) {
fprintf(stderr, "load_tensor_row_slice_: %s bad range [%ld,%ld) vs D0=%ld\n",
name.c_str(), row_lo, row_hi, D0);
return false;
}
size_t elem = sdtype_size(m->dtype);
size_t inner = 1;
for (size_t i = 1; i < m->shape.size(); i++) inner *= m->shape[i];
size_t row_bytes = inner * elem;
size_t slice_bytes = (row_hi - row_lo) * row_bytes;
    const auto* host = (const char*)st_.data_ptr(*m);
    if (!host) { fprintf(stderr, "load_tensor_row_slice_: null host ptr %s\n", name.c_str()); return false; }
buf.alloc(slice_bytes);
ACL_CHECK(aclrtMemcpy(buf.get(), slice_bytes,
host + row_lo * row_bytes, slice_bytes,
ACL_MEMCPY_HOST_TO_DEVICE));
return true;
}
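// Worked example of the byte math above (shapes illustrative, matching a
// Qwen3-style gate_proj of [1536, 4096] in BF16): slicing rows [96, 192) gives
//   row_bytes   = 4096 * 2    = 8192 bytes
//   slice_bytes = 96 * 8192   = 786432 bytes
// with the host source offset by row_lo * row_bytes = 786432 bytes. Row slices
// are contiguous in the row-major safetensors layout, so one memcpy suffices.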
bool DeviceWeightsLoader::load_tensor_col_slice_(const std::string& name,
int64_t col_lo, int64_t col_hi,
DeviceBuffer& buf) {
const auto* m = st_.get(name);
    if (!m) { fprintf(stderr, "load_tensor_col_slice_: missing %s\n", name.c_str()); return false; }
    if (m->shape.size() < 2) {
        fprintf(stderr, "load_tensor_col_slice_: %s needs rank >= 2, got %zu\n",
                name.c_str(), m->shape.size());
        return false;
    }
int64_t D0 = m->shape[0];
int64_t D1 = m->shape[1];
if (col_hi > D1 || col_lo < 0 || col_hi <= col_lo) {
fprintf(stderr, "load_tensor_col_slice_: bad range %ld-%ld D1=%ld\n",
col_lo, col_hi, D1); return false;
}
size_t elem = sdtype_size(m->dtype);
int64_t new_cols = col_hi - col_lo;
size_t slice_bytes = D0 * new_cols * elem;
buf.alloc(slice_bytes);
    // Strided copy via a host staging buffer: source rows have stride D1 elements,
    // destination rows have stride new_cols, so a single flat memcpy won't work.
    const auto* host = (const char*)st_.data_ptr(*m);
    if (!host) { fprintf(stderr, "load_tensor_col_slice_: null host ptr %s\n", name.c_str()); return false; }
    std::vector<char> staging(slice_bytes);
size_t src_row = D1 * elem;
size_t dst_row = new_cols * elem;
size_t col_off = col_lo * elem;
for (int64_t r = 0; r < D0; r++) {
std::memcpy(staging.data() + r * dst_row, host + r * src_row + col_off, dst_row);
}
ACL_CHECK(aclrtMemcpy(buf.get(), slice_bytes, staging.data(), slice_bytes,
ACL_MEMCPY_HOST_TO_DEVICE));
return true;
}
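// Design note: the column slice is gathered into a host staging buffer and shipped
// as a single H2D memcpy instead of D0 small per-row transfers, since one large
// copy amortizes per-transfer overhead. A strided/2D memcpy variant, if the target
// CANN runtime offers one, could avoid the staging buffer entirely.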
bool DeviceWeightsLoader::load_shared(SharedWeights& out) {
if (!load_tensor_full_("model.embed_tokens.weight", out.embed_tokens)) return false;
if (!load_tensor_full_("lm_head.weight", out.lm_head)) return false;
if (!load_tensor_full_("model.norm.weight", out.final_norm)) return false;
return true;
}
bool DeviceWeightsLoader::load_moe(int L, aclrtStream stream, LayerMoEWeights& out) {
const int64_t E = cfg_.num_experts;
const int64_t D = cfg_.hidden_size;
    const int64_t I_full = cfg_.moe_intermediate_size;
    const int64_t I_rank = cfg_.i_per_rank;
    if (I_rank * cfg_.tp_size != I_full) {
        fprintf(stderr, "load_moe: i_per_rank * tp_size (%" PRId64 ") != moe_intermediate_size (%" PRId64 ")\n",
                (int64_t)(I_rank * cfg_.tp_size), I_full);
        return false;
    }
    const size_t elem = 2; // bytes per element; expert weights are assumed to be BF16
auto base = "model.layers." + std::to_string(L);
// 1. Router [E, D] β€” small, fully replicated
if (!load_tensor_full_(base + ".mlp.gate.weight", out.router)) return false;
    // 2. MoE expert weights: stack all E experts, TP-slice each, then permute.
    //    HF gate/up: each expert [I_full, D] -> TP slice rows to [I_rank, D]
    //    HF down:    each expert [D, I_full] -> TP slice cols to [D, I_rank]
auto load_and_stack = [&](const std::string& proj_name,
bool is_down, DeviceBuffer& final_buf) -> bool {
// HF shape for gate/up: [I_full, D]; for down: [D, I_full]
// After TP slice: gate/up rows [I_rank, D]; down cols [D, I_rank]
// Stacked:
        //   gate/up: [E, I_rank, D] -> permute to [E, D, I_rank]
        //   down:    [E, D, I_rank] -> permute to [E, I_rank, D]
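        // Worked example, assuming the published Qwen3-235B-A22B config
        // (E = 128, D = 4096, I_full = 1536) and tp_size = 16, so I_rank = 96:
        //   gate/up: HF [1536, 4096] -> rank slice [96, 4096]
        //            -> stacked [128, 96, 4096] -> permuted [128, 4096, 96]
        //   down:    HF [4096, 1536] -> rank slice [4096, 96]
        //            -> stacked [128, 4096, 96] -> permuted [128, 96, 4096]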
int64_t K_in, N_out;
bool row_slice;
if (!is_down) {
K_in = I_rank; // HF first dim after row-slice
N_out = D;
row_slice = true;
} else {
K_in = D;
N_out = I_rank;
row_slice = false; // col slice
}
// Stage: stacked HF-layout [E, K_in, N_out] on device (before permute)
        size_t bytes_per_expert = K_in * N_out * elem;
        DeviceBuffer stacked_hf(E * bytes_per_expert);
// For each expert, load + TP slice + memcpy to stacked_hf[e]
// We use the existing row_slice/col_slice helpers on a per-expert basis.
DeviceBuffer tmp;
        for (int64_t e = 0; e < E; e++) {
std::string name = base + ".mlp.experts." + std::to_string(e) + "." + proj_name + ".weight";
if (row_slice) {
int64_t lo = cfg_.tp_rank * I_rank;
int64_t hi = lo + I_rank;
if (!load_tensor_row_slice_(name, lo, hi, tmp)) return false;
} else {
int64_t lo = cfg_.tp_rank * I_rank;
int64_t hi = lo + I_rank;
if (!load_tensor_col_slice_(name, lo, hi, tmp)) return false;
}
            if (tmp.size != bytes_per_expert) {
                fprintf(stderr, "load_moe: expert %" PRId64 " %s slice size %zu != expected %zu\n",
                        e, name.c_str(), tmp.size, bytes_per_expert);
                return false;
            }
// Synchronous D2D: tmp is about to be reallocated in the next iteration,
// so we cannot enqueue an async copy that would still reference it.
            ACL_CHECK(aclrtMemcpy(
                (char*)stacked_hf.get() + e * bytes_per_expert, bytes_per_expert,
                tmp.get(), bytes_per_expert,
                ACL_MEMCPY_DEVICE_TO_DEVICE));
}
        // Now permute stacked_hf [E, K_in, N_out] -> final [E, N_out, K_in] row-major
// (swap last two dims)
        final_buf.alloc(E * bytes_per_expert);
const int64_t d0 = E, d1 = K_in, d2 = N_out;
// View stacked_hf with permuted strides pointing to same data:
// logical shape [E, N_out, K_in], strides [K_in*N_out, 1, N_out]
// (since physical is [E, K_in, N_out] row-major with strides [K_in*N_out, N_out, 1])
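        // Sanity check on the view: logical element (e, n, k) resolves to offset
        // e*(K_in*N_out) + n*1 + k*N_out, which is exactly where physical element
        // (e, k, n) lives in the [E, K_in, N_out] row-major layout, so the
        // contiguous copy below materializes the swap of the last two dims.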
auto t_src = make_acl_tensor(stacked_hf.get(), ACL_BF16,
{d0, d2, d1}, // [E, N_out, K_in]
{d1 * d2, 1, d2});
auto t_dst = make_contig_tensor(final_buf.get(), ACL_BF16, {d0, d2, d1});
inplace_copy(stream, t_dst.get(), t_src.get());
// Must sync before stacked_hf goes out of scope β€” the inplace_copy is async and
// reads from stacked_hf's memory. If we return without syncing, DeviceBuffer's
// destructor frees stacked_hf while the permute kernel is still running, producing
// garbage in final_buf.
ACL_CHECK(aclrtSynchronizeStream(stream));
return true;
};
if (!load_and_stack("gate_proj", false, out.gate_exps)) return false;
if (!load_and_stack("up_proj", false, out.up_exps)) return false;
if (!load_and_stack("down_proj", true, out.down_exps)) return false;
return true;
}
bool DeviceWeightsLoader::load_attention(int L, LayerAttnWeights& out) {
auto base = "model.layers." + std::to_string(L);
if (!load_tensor_full_(base + ".input_layernorm.weight", out.input_layernorm)) return false;
if (!load_tensor_full_(base + ".post_attention_layernorm.weight", out.post_attention_layernorm)) return false;
if (!load_tensor_full_(base + ".self_attn.q_norm.weight", out.q_norm)) return false;
if (!load_tensor_full_(base + ".self_attn.k_norm.weight", out.k_norm)) return false;
    const int64_t head_dim = cfg_.head_dim;
    const int64_t q_full = cfg_.num_attention_heads * head_dim; // e.g. 64 * 128 = 8192 for Qwen3-235B-A22B
    // q_proj: [q_full, D], shard rows by head. Each rank gets n_heads_per_rank heads.
    int64_t q_rows_per_rank = cfg_.n_heads_per_rank * head_dim;
    if (q_rows_per_rank * cfg_.tp_size != q_full) {
        fprintf(stderr, "load_attention: n_heads_per_rank * tp_size does not cover all %" PRId64 " q rows\n", q_full);
        return false;
    }
int64_t q_row_lo = cfg_.tp_rank * q_rows_per_rank;
int64_t q_row_hi = q_row_lo + q_rows_per_rank;
if (!load_tensor_row_slice_(base + ".self_attn.q_proj.weight",
q_row_lo, q_row_hi, out.q_proj)) return false;
// k_proj, v_proj: HF shape [num_kv * head_dim, D].
// Case A (tp <= n_kv): split rows across ranks, each rank gets n_kv/tp KV heads.
// Case B (tp > n_kv): each rank gets exactly ONE KV head; group of (tp/n_kv) ranks share it.
// kv_head_idx = tp_rank / (tp_size / n_kv)
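    // Worked example, assuming Qwen3-235B-A22B attention (n_kv = 4, head_dim = 128)
    // and tp_size = 16: Case B applies, ranks_per_kv = 16 / 4 = 4, and e.g. rank 5
    // maps to kv_head_idx = 5 / 4 = 1, i.e. rows [128, 256) of k_proj/v_proj;
    // ranks 4..7 all load that same KV head as replicated copies.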
if (cfg_.tp_size <= cfg_.num_key_value_heads) {
int64_t kv_rows_per_rank = cfg_.n_kv_heads_per_rank * head_dim;
int64_t kv_row_lo = cfg_.tp_rank * kv_rows_per_rank;
int64_t kv_row_hi = kv_row_lo + kv_rows_per_rank;
if (!load_tensor_row_slice_(base + ".self_attn.k_proj.weight", kv_row_lo, kv_row_hi, out.k_proj)) return false;
if (!load_tensor_row_slice_(base + ".self_attn.v_proj.weight", kv_row_lo, kv_row_hi, out.v_proj)) return false;
} else {
        // GQA replicated-group mode: 1 KV head per rank, selected by group.
        if (cfg_.tp_size % cfg_.num_key_value_heads != 0) {
            fprintf(stderr, "load_attention: tp_size %ld not divisible by num_key_value_heads %ld\n",
                    (long)cfg_.tp_size, (long)cfg_.num_key_value_heads);
            return false;
        }
        int64_t ranks_per_kv = cfg_.tp_size / cfg_.num_key_value_heads;
        int64_t kv_head_idx = cfg_.tp_rank / ranks_per_kv;
int64_t kv_row_lo = kv_head_idx * head_dim;
int64_t kv_row_hi = kv_row_lo + head_dim;
if (!load_tensor_row_slice_(base + ".self_attn.k_proj.weight", kv_row_lo, kv_row_hi, out.k_proj)) return false;
if (!load_tensor_row_slice_(base + ".self_attn.v_proj.weight", kv_row_lo, kv_row_hi, out.v_proj)) return false;
}
    // o_proj: [D, q_full], row-parallel -> shard cols (input dim) by head.
int64_t o_col_lo = q_row_lo; // same slicing as q rows
int64_t o_col_hi = q_row_hi;
if (!load_tensor_col_slice_(base + ".self_attn.o_proj.weight",
o_col_lo, o_col_hi, out.o_proj)) return false;
return true;
}
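// Usage sketch (illustrative, not part of the original file: the loader's
// constructor signature and config field names such as num_layers are assumptions
// based on device_weights.h; `stream` is an aclrtStream created by the caller):
//
//   DeviceWeightsLoader loader(st, cfg);
//   SharedWeights shared;
//   if (!loader.load_shared(shared)) return false;
//   std::vector<LayerAttnWeights> attn(cfg.num_layers);
//   std::vector<LayerMoEWeights>  moe(cfg.num_layers);
//   for (int L = 0; L < cfg.num_layers; L++) {
//       if (!loader.load_attention(L, attn[L])) return false;
//       if (!loader.load_moe(L, stream, moe[L])) return false;
//   }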