// llm_mutil_npu/src/device_weights.cpp
// Initial C++ aclnn eager-mode inference for Qwen3-235B-A22B MoE on Ascend 910 x 16 NPUs.
#include "device_weights.h"
#include "aclnn_ops.h"
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
bool DeviceWeightsLoader::load_tensor_full_(const std::string& name, DeviceBuffer& buf) {
const auto* m = st_.get(name);
if (!m) { fprintf(stderr, "load_tensor_full_: missing %s\n", name.c_str()); return false; }
const void* host = st_.data_ptr(*m);
if (!host) { fprintf(stderr, "load_tensor_full_: null host ptr %s\n", name.c_str()); return false; }
buf.alloc(m->nbytes);
ACL_CHECK(aclrtMemcpy(buf.get(), m->nbytes, host, m->nbytes, ACL_MEMCPY_HOST_TO_DEVICE));
return true;
}
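// Note: aclrtMemcpy (the stream-less variant) is ACL's synchronous copy, so every
// host-to-device transfer in this file blocks until the bytes have landed on the
// device; the only asynchronous work is the permute launched in load_moe below.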
bool DeviceWeightsLoader::load_tensor_row_slice_(const std::string& name,
int64_t row_lo, int64_t row_hi,
DeviceBuffer& buf) {
const auto* m = st_.get(name);
if (!m) { fprintf(stderr, "load_tensor_row_slice_: missing %s\n", name.c_str()); return false; }
if (m->shape.empty()) { fprintf(stderr, "%s: empty shape\n", name.c_str()); return false; }
int64_t D0 = m->shape[0];
if (row_hi > D0 || row_lo < 0 || row_hi <= row_lo) {
fprintf(stderr, "load_tensor_row_slice_: %s bad range [%ld,%ld) vs D0=%ld\n",
name.c_str(), row_lo, row_hi, D0);
return false;
}
size_t elem = sdtype_size(m->dtype);
size_t inner = 1;
for (size_t i = 1; i < m->shape.size(); i++) inner *= m->shape[i];
size_t row_bytes = inner * elem;
size_t slice_bytes = (row_hi - row_lo) * row_bytes;
    const auto* host = (const char*)st_.data_ptr(*m);
    if (!host) { fprintf(stderr, "load_tensor_row_slice_: null host ptr %s\n", name.c_str()); return false; }
buf.alloc(slice_bytes);
ACL_CHECK(aclrtMemcpy(buf.get(), slice_bytes,
host + row_lo * row_bytes, slice_bytes,
ACL_MEMCPY_HOST_TO_DEVICE));
return true;
}
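// Worked example of the byte math above (shapes illustrative, matching a
// Qwen3-style gate_proj of [1536, 4096] in BF16): slicing rows [96, 192) gives
//   row_bytes   = 4096 * 2    = 8192 bytes
//   slice_bytes = 96 * 8192   = 786432 bytes
// with the host source offset by row_lo * row_bytes = 786432 bytes. Row slices
// are contiguous in the row-major safetensors layout, so one memcpy suffices.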
bool DeviceWeightsLoader::load_tensor_col_slice_(const std::string& name,
int64_t col_lo, int64_t col_hi,
DeviceBuffer& buf) {
const auto* m = st_.get(name);
    if (!m) { fprintf(stderr, "load_tensor_col_slice_: missing %s\n", name.c_str()); return false; }
    if (m->shape.size() < 2) {
        fprintf(stderr, "load_tensor_col_slice_: %s needs rank >= 2, got %zu\n",
                name.c_str(), m->shape.size());
        return false;
    }
int64_t D0 = m->shape[0];
int64_t D1 = m->shape[1];
if (col_hi > D1 || col_lo < 0 || col_hi <= col_lo) {
fprintf(stderr, "load_tensor_col_slice_: bad range %ld-%ld D1=%ld\n",
col_lo, col_hi, D1); return false;
}
size_t elem = sdtype_size(m->dtype);
int64_t new_cols = col_hi - col_lo;
size_t slice_bytes = D0 * new_cols * elem;
buf.alloc(slice_bytes);
    // Strided copy via a host staging buffer: source rows have stride D1 elements,
    // destination rows have stride new_cols, so a single flat memcpy won't work.
    const auto* host = (const char*)st_.data_ptr(*m);
    if (!host) { fprintf(stderr, "load_tensor_col_slice_: null host ptr %s\n", name.c_str()); return false; }
    std::vector<char> staging(slice_bytes);
size_t src_row = D1 * elem;
size_t dst_row = new_cols * elem;
size_t col_off = col_lo * elem;
for (int64_t r = 0; r < D0; r++) {
std::memcpy(staging.data() + r * dst_row, host + r * src_row + col_off, dst_row);
}
ACL_CHECK(aclrtMemcpy(buf.get(), slice_bytes, staging.data(), slice_bytes,
ACL_MEMCPY_HOST_TO_DEVICE));
return true;
}
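// Design note: the column slice is gathered into a host staging buffer and shipped
// as a single H2D memcpy instead of D0 small per-row transfers, since one large
// copy amortizes per-transfer overhead. A strided/2D memcpy variant, if the target
// CANN runtime offers one, could avoid the staging buffer entirely.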
bool DeviceWeightsLoader::load_shared(SharedWeights& out) {
if (!load_tensor_full_("model.embed_tokens.weight", out.embed_tokens)) return false;
if (!load_tensor_full_("lm_head.weight", out.lm_head)) return false;
if (!load_tensor_full_("model.norm.weight", out.final_norm)) return false;
return true;
}
bool DeviceWeightsLoader::load_moe(int L, aclrtStream stream, LayerMoEWeights& out) {
const int64_t E = cfg_.num_experts;
const int64_t D = cfg_.hidden_size;
    const int64_t I_full = cfg_.moe_intermediate_size;
    const int64_t I_rank = cfg_.i_per_rank;
    if (I_rank * cfg_.tp_size != I_full) {
        fprintf(stderr, "load_moe: i_per_rank * tp_size (%" PRId64 ") != moe_intermediate_size (%" PRId64 ")\n",
                (int64_t)(I_rank * cfg_.tp_size), I_full);
        return false;
    }
    const size_t elem = 2; // bytes per element; expert weights are assumed to be BF16
auto base = "model.layers." + std::to_string(L);
// 1. Router [E, D] β€” small, fully replicated
if (!load_tensor_full_(base + ".mlp.gate.weight", out.router)) return false;
    // 2. MoE expert weights: stack all E experts, TP-slice each, then permute.
    //    HF gate/up: each expert [I_full, D] -> TP slice rows to [I_rank, D]
    //    HF down:    each expert [D, I_full] -> TP slice cols to [D, I_rank]
auto load_and_stack = [&](const std::string& proj_name,
bool is_down, DeviceBuffer& final_buf) -> bool {
// HF shape for gate/up: [I_full, D]; for down: [D, I_full]
// After TP slice: gate/up rows [I_rank, D]; down cols [D, I_rank]
// Stacked:
        //   gate/up: [E, I_rank, D] -> permute to [E, D, I_rank]
        //   down:    [E, D, I_rank] -> permute to [E, I_rank, D]
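        // Worked example, assuming the published Qwen3-235B-A22B config
        // (E = 128, D = 4096, I_full = 1536) and tp_size = 16, so I_rank = 96:
        //   gate/up: HF [1536, 4096] -> rank slice [96, 4096]
        //            -> stacked [128, 96, 4096] -> permuted [128, 4096, 96]
        //   down:    HF [4096, 1536] -> rank slice [4096, 96]
        //            -> stacked [128, 4096, 96] -> permuted [128, 96, 4096]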
int64_t K_in, N_out;
bool row_slice;
if (!is_down) {
K_in = I_rank; // HF first dim after row-slice
N_out = D;
row_slice = true;
} else {
K_in = D;
N_out = I_rank;
row_slice = false; // col slice
}
// Stage: stacked HF-layout [E, K_in, N_out] on device (before permute)
        size_t bytes_per_expert = K_in * N_out * elem;
        DeviceBuffer stacked_hf(E * bytes_per_expert);
// For each expert, load + TP slice + memcpy to stacked_hf[e]
// We use the existing row_slice/col_slice helpers on a per-expert basis.
DeviceBuffer tmp;
        for (int64_t e = 0; e < E; e++) {
std::string name = base + ".mlp.experts." + std::to_string(e) + "." + proj_name + ".weight";
if (row_slice) {
int64_t lo = cfg_.tp_rank * I_rank;
int64_t hi = lo + I_rank;
if (!load_tensor_row_slice_(name, lo, hi, tmp)) return false;
} else {
int64_t lo = cfg_.tp_rank * I_rank;
int64_t hi = lo + I_rank;
if (!load_tensor_col_slice_(name, lo, hi, tmp)) return false;
}
            if (tmp.size != bytes_per_expert) {
                fprintf(stderr, "load_moe: expert %" PRId64 " %s slice size %zu != expected %zu\n",
                        e, name.c_str(), tmp.size, bytes_per_expert);
                return false;
            }
// Synchronous D2D: tmp is about to be reallocated in the next iteration,
// so we cannot enqueue an async copy that would still reference it.
            ACL_CHECK(aclrtMemcpy(
                (char*)stacked_hf.get() + e * bytes_per_expert, bytes_per_expert,
                tmp.get(), bytes_per_expert,
                ACL_MEMCPY_DEVICE_TO_DEVICE));
}
        // Now permute stacked_hf [E, K_in, N_out] -> final [E, N_out, K_in] row-major
// (swap last two dims)
        final_buf.alloc(E * bytes_per_expert);
const int64_t d0 = E, d1 = K_in, d2 = N_out;
// View stacked_hf with permuted strides pointing to same data:
// logical shape [E, N_out, K_in], strides [K_in*N_out, 1, N_out]
// (since physical is [E, K_in, N_out] row-major with strides [K_in*N_out, N_out, 1])
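        // Sanity check on the view: logical element (e, n, k) resolves to offset
        // e*(K_in*N_out) + n*1 + k*N_out, which is exactly where physical element
        // (e, k, n) lives in the [E, K_in, N_out] row-major layout, so the
        // contiguous copy below materializes the swap of the last two dims.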
auto t_src = make_acl_tensor(stacked_hf.get(), ACL_BF16,
{d0, d2, d1}, // [E, N_out, K_in]
{d1 * d2, 1, d2});
auto t_dst = make_contig_tensor(final_buf.get(), ACL_BF16, {d0, d2, d1});
inplace_copy(stream, t_dst.get(), t_src.get());
// Must sync before stacked_hf goes out of scope β€” the inplace_copy is async and
// reads from stacked_hf's memory. If we return without syncing, DeviceBuffer's
// destructor frees stacked_hf while the permute kernel is still running, producing
// garbage in final_buf.
ACL_CHECK(aclrtSynchronizeStream(stream));
return true;
};
if (!load_and_stack("gate_proj", false, out.gate_exps)) return false;
if (!load_and_stack("up_proj", false, out.up_exps)) return false;
if (!load_and_stack("down_proj", true, out.down_exps)) return false;
return true;
}
bool DeviceWeightsLoader::load_attention(int L, LayerAttnWeights& out) {
auto base = "model.layers." + std::to_string(L);
if (!load_tensor_full_(base + ".input_layernorm.weight", out.input_layernorm)) return false;
if (!load_tensor_full_(base + ".post_attention_layernorm.weight", out.post_attention_layernorm)) return false;
if (!load_tensor_full_(base + ".self_attn.q_norm.weight", out.q_norm)) return false;
if (!load_tensor_full_(base + ".self_attn.k_norm.weight", out.k_norm)) return false;
    const int64_t head_dim = cfg_.head_dim;
    const int64_t q_full = cfg_.num_attention_heads * head_dim; // e.g. 64 * 128 = 8192 for Qwen3-235B-A22B
    // q_proj: [q_full, D], shard rows by head. Each rank gets n_heads_per_rank heads.
    int64_t q_rows_per_rank = cfg_.n_heads_per_rank * head_dim;
    if (q_rows_per_rank * cfg_.tp_size != q_full) {
        fprintf(stderr, "load_attention: n_heads_per_rank * tp_size does not cover all %" PRId64 " q rows\n", q_full);
        return false;
    }
int64_t q_row_lo = cfg_.tp_rank * q_rows_per_rank;
int64_t q_row_hi = q_row_lo + q_rows_per_rank;
if (!load_tensor_row_slice_(base + ".self_attn.q_proj.weight",
q_row_lo, q_row_hi, out.q_proj)) return false;
// k_proj, v_proj: HF shape [num_kv * head_dim, D].
// Case A (tp <= n_kv): split rows across ranks, each rank gets n_kv/tp KV heads.
// Case B (tp > n_kv): each rank gets exactly ONE KV head; group of (tp/n_kv) ranks share it.
// kv_head_idx = tp_rank / (tp_size / n_kv)
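    // Worked example, assuming Qwen3-235B-A22B attention (n_kv = 4, head_dim = 128)
    // and tp_size = 16: Case B applies, ranks_per_kv = 16 / 4 = 4, and e.g. rank 5
    // maps to kv_head_idx = 5 / 4 = 1, i.e. rows [128, 256) of k_proj/v_proj;
    // ranks 4..7 all load that same KV head as replicated copies.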
if (cfg_.tp_size <= cfg_.num_key_value_heads) {
int64_t kv_rows_per_rank = cfg_.n_kv_heads_per_rank * head_dim;
int64_t kv_row_lo = cfg_.tp_rank * kv_rows_per_rank;
int64_t kv_row_hi = kv_row_lo + kv_rows_per_rank;
if (!load_tensor_row_slice_(base + ".self_attn.k_proj.weight", kv_row_lo, kv_row_hi, out.k_proj)) return false;
if (!load_tensor_row_slice_(base + ".self_attn.v_proj.weight", kv_row_lo, kv_row_hi, out.v_proj)) return false;
} else {
        // GQA replicated-group mode: 1 KV head per rank, selected by group.
        if (cfg_.tp_size % cfg_.num_key_value_heads != 0) {
            fprintf(stderr, "load_attention: tp_size %ld not divisible by num_key_value_heads %ld\n",
                    (long)cfg_.tp_size, (long)cfg_.num_key_value_heads);
            return false;
        }
        int64_t ranks_per_kv = cfg_.tp_size / cfg_.num_key_value_heads;
        int64_t kv_head_idx = cfg_.tp_rank / ranks_per_kv;
int64_t kv_row_lo = kv_head_idx * head_dim;
int64_t kv_row_hi = kv_row_lo + head_dim;
if (!load_tensor_row_slice_(base + ".self_attn.k_proj.weight", kv_row_lo, kv_row_hi, out.k_proj)) return false;
if (!load_tensor_row_slice_(base + ".self_attn.v_proj.weight", kv_row_lo, kv_row_hi, out.v_proj)) return false;
}
    // o_proj: [D, q_full], row-parallel -> shard cols (input dim) by head.
int64_t o_col_lo = q_row_lo; // same slicing as q rows
int64_t o_col_hi = q_row_hi;
if (!load_tensor_col_slice_(base + ".self_attn.o_proj.weight",
o_col_lo, o_col_hi, out.o_proj)) return false;
return true;
}
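// Usage sketch (illustrative, not part of the original file: the loader's
// constructor signature and config field names such as num_layers are assumptions
// based on device_weights.h; `stream` is an aclrtStream created by the caller):
//
//   DeviceWeightsLoader loader(st, cfg);
//   SharedWeights shared;
//   if (!loader.load_shared(shared)) return false;
//   std::vector<LayerAttnWeights> attn(cfg.num_layers);
//   std::vector<LayerMoEWeights>  moe(cfg.num_layers);
//   for (int L = 0; L < cfg.num_layers; L++) {
//       if (!loader.load_attention(L, attn[L])) return false;
//       if (!loader.load_moe(L, stream, moe[L])) return false;
//   }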