#include "device_weights.h" #include "aclnn_ops.h" #include #include #include bool DeviceWeightsLoader::load_tensor_full_(const std::string& name, DeviceBuffer& buf) { const auto* m = st_.get(name); if (!m) { fprintf(stderr, "load_tensor_full_: missing %s\n", name.c_str()); return false; } const void* host = st_.data_ptr(*m); if (!host) { fprintf(stderr, "load_tensor_full_: null host ptr %s\n", name.c_str()); return false; } buf.alloc(m->nbytes); ACL_CHECK(aclrtMemcpy(buf.get(), m->nbytes, host, m->nbytes, ACL_MEMCPY_HOST_TO_DEVICE)); return true; } bool DeviceWeightsLoader::load_tensor_row_slice_(const std::string& name, int64_t row_lo, int64_t row_hi, DeviceBuffer& buf) { const auto* m = st_.get(name); if (!m) { fprintf(stderr, "load_tensor_row_slice_: missing %s\n", name.c_str()); return false; } if (m->shape.empty()) { fprintf(stderr, "%s: empty shape\n", name.c_str()); return false; } int64_t D0 = m->shape[0]; if (row_hi > D0 || row_lo < 0 || row_hi <= row_lo) { fprintf(stderr, "load_tensor_row_slice_: %s bad range [%ld,%ld) vs D0=%ld\n", name.c_str(), row_lo, row_hi, D0); return false; } size_t elem = sdtype_size(m->dtype); size_t inner = 1; for (size_t i = 1; i < m->shape.size(); i++) inner *= m->shape[i]; size_t row_bytes = inner * elem; size_t slice_bytes = (row_hi - row_lo) * row_bytes; const auto* host = (const char*)st_.data_ptr(*m); buf.alloc(slice_bytes); ACL_CHECK(aclrtMemcpy(buf.get(), slice_bytes, host + row_lo * row_bytes, slice_bytes, ACL_MEMCPY_HOST_TO_DEVICE)); return true; } bool DeviceWeightsLoader::load_tensor_col_slice_(const std::string& name, int64_t col_lo, int64_t col_hi, DeviceBuffer& buf) { const auto* m = st_.get(name); if (!m || m->shape.size() < 2) { fprintf(stderr, "load_tensor_col_slice_: bad shape %s\n", name.c_str()); return false; } int64_t D0 = m->shape[0]; int64_t D1 = m->shape[1]; if (col_hi > D1 || col_lo < 0 || col_hi <= col_lo) { fprintf(stderr, "load_tensor_col_slice_: bad range %ld-%ld D1=%ld\n", col_lo, col_hi, D1); return false; } size_t elem = sdtype_size(m->dtype); int64_t new_cols = col_hi - col_lo; size_t slice_bytes = D0 * new_cols * elem; buf.alloc(slice_bytes); // Need to copy row-by-row since source has stride D1 but dest has stride new_cols. const auto* host = (const char*)st_.data_ptr(*m); std::vector staging(slice_bytes); size_t src_row = D1 * elem; size_t dst_row = new_cols * elem; size_t col_off = col_lo * elem; for (int64_t r = 0; r < D0; r++) { std::memcpy(staging.data() + r * dst_row, host + r * src_row + col_off, dst_row); } ACL_CHECK(aclrtMemcpy(buf.get(), slice_bytes, staging.data(), slice_bytes, ACL_MEMCPY_HOST_TO_DEVICE)); return true; } bool DeviceWeightsLoader::load_shared(SharedWeights& out) { if (!load_tensor_full_("model.embed_tokens.weight", out.embed_tokens)) return false; if (!load_tensor_full_("lm_head.weight", out.lm_head)) return false; if (!load_tensor_full_("model.norm.weight", out.final_norm)) return false; return true; } bool DeviceWeightsLoader::load_moe(int L, aclrtStream stream, LayerMoEWeights& out) { const int64_t E = cfg_.num_experts; const int64_t D = cfg_.hidden_size; const int64_t I_full = cfg_.moe_intermediate_size; const int64_t I_rank = cfg_.i_per_rank; const size_t elem = 2; // BF16 auto base = "model.layers." + std::to_string(L); // 1. Router [E, D] — small, fully replicated if (!load_tensor_full_(base + ".mlp.gate.weight", out.router)) return false; // 2. 
bool DeviceWeightsLoader::load_moe(int L, aclrtStream stream, LayerMoEWeights& out) {
    const int64_t E = cfg_.num_experts;
    const int64_t D = cfg_.hidden_size;
    const int64_t I_full = cfg_.moe_intermediate_size;
    const int64_t I_rank = cfg_.i_per_rank;
    const size_t elem = 2;  // BF16
    auto base = "model.layers." + std::to_string(L);

    // 1. Router [E, D] — small, fully replicated.
    if (!load_tensor_full_(base + ".mlp.gate.weight", out.router)) return false;

    // 2. MoE expert weights: need to stack 128 experts + TP slice + permute.
    //    HF gate/up: each expert [I_full, D] → TP slice rows to [I_rank, D]
    //    HF down:    each expert [D, I_full] → TP slice cols to [D, I_rank]
    auto load_and_stack = [&](const std::string& proj_name, bool is_down,
                              DeviceBuffer& final_buf) -> bool {
        // HF shape for gate/up: [I_full, D]; for down: [D, I_full]
        // After TP slice: gate/up rows [I_rank, D]; down cols [D, I_rank]
        // Stacked:
        //   gate/up: [E, I_rank, D] → permute to [E, D, I_rank]
        //   down:    [E, D, I_rank] → permute to [E, I_rank, D]
        int64_t K_in, N_out;
        bool row_slice;
        if (!is_down) {
            K_in = I_rank;  // HF first dim after row-slice
            N_out = D;
            row_slice = true;
        } else {
            K_in = D;
            N_out = I_rank;
            row_slice = false;  // col slice
        }

        // Stage: stacked HF-layout [E, K_in, N_out] on device (before permute).
        size_t bytes_per_expert = K_in * N_out * elem;
        DeviceBuffer stacked_hf(E * bytes_per_expert);

        // For each expert, load + TP slice + memcpy to stacked_hf[e].
        // We use the existing row_slice/col_slice helpers on a per-expert basis.
        DeviceBuffer tmp;
        for (int64_t e = 0; e < E; e++) {
            std::string name = base + ".mlp.experts." + std::to_string(e) + "." +
                               proj_name + ".weight";
            int64_t lo = cfg_.tp_rank * I_rank;
            int64_t hi = lo + I_rank;
            if (row_slice) {
                if (!load_tensor_row_slice_(name, lo, hi, tmp)) return false;
            } else {
                if (!load_tensor_col_slice_(name, lo, hi, tmp)) return false;
            }
            if (tmp.size != bytes_per_expert) {
                fprintf(stderr, "load_moe: expert %ld %s slice size %zu != expected %zu\n",
                        e, name.c_str(), tmp.size, bytes_per_expert);
                return false;
            }
            // Synchronous D2D: tmp is about to be reallocated in the next iteration,
            // so we cannot enqueue an async copy that would still reference it.
            ACL_CHECK(aclrtMemcpy((char*)stacked_hf.get() + e * bytes_per_expert,
                                  bytes_per_expert, tmp.get(), bytes_per_expert,
                                  ACL_MEMCPY_DEVICE_TO_DEVICE));
        }

        // Now permute stacked_hf [E, K_in, N_out] → final [E, N_out, K_in] row-major
        // (swap last two dims).
        final_buf.alloc(E * bytes_per_expert);
        const int64_t d0 = E, d1 = K_in, d2 = N_out;
        // View stacked_hf with permuted strides pointing to same data:
        //   logical shape [E, N_out, K_in], strides [K_in*N_out, 1, N_out]
        // (since physical is [E, K_in, N_out] row-major with strides
        // [K_in*N_out, N_out, 1]).
        auto t_src = make_acl_tensor(stacked_hf.get(), ACL_BF16,
                                     {d0, d2, d1},       // [E, N_out, K_in]
                                     {d1 * d2, 1, d2});
        auto t_dst = make_contig_tensor(final_buf.get(), ACL_BF16, {d0, d2, d1});
        inplace_copy(stream, t_dst.get(), t_src.get());
        // Must sync before stacked_hf goes out of scope — the inplace_copy is async
        // and reads from stacked_hf's memory. If we return without syncing,
        // DeviceBuffer's destructor frees stacked_hf while the permute kernel is
        // still running, producing garbage in final_buf.
        ACL_CHECK(aclrtSynchronizeStream(stream));
        return true;
    };

    if (!load_and_stack("gate_proj", false, out.gate_exps)) return false;
    if (!load_and_stack("up_proj", false, out.up_exps)) return false;
    if (!load_and_stack("down_proj", true, out.down_exps)) return false;
    return true;
}
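// The strided-view permute in load_and_stack, in miniature (illustrative
// numbers): a physical row-major [E=2, K=3, N=4] buffer has element strides
// [K*N, N, 1] = [12, 4, 1]. Viewing the same memory as logical [E, N, K] with
// strides [12, 1, 4] maps logical index (e, n, k) to element offset
// e*12 + n + k*4, which is exactly physical element (e, k, n), i.e. a
// transpose of the last two dims with zero data movement. The device-side
// copy into a contiguous [E, N, K] destination then materializes the permute,
// so the host never has to transpose E*K*N elements itself.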
bool DeviceWeightsLoader::load_attention(int L, LayerAttnWeights& out) {
    auto base = "model.layers." + std::to_string(L);

    if (!load_tensor_full_(base + ".input_layernorm.weight", out.input_layernorm))
        return false;
    if (!load_tensor_full_(base + ".post_attention_layernorm.weight",
                           out.post_attention_layernorm))
        return false;
    if (!load_tensor_full_(base + ".self_attn.q_norm.weight", out.q_norm)) return false;
    if (!load_tensor_full_(base + ".self_attn.k_norm.weight", out.k_norm)) return false;

    const int64_t head_dim = cfg_.head_dim;

    // q_proj: [q_full, D] with q_full = num_attention_heads * head_dim
    // (64 * 128 = 8192). Shard rows by head: each rank gets n_heads_per_rank heads.
    int64_t q_rows_per_rank = cfg_.n_heads_per_rank * head_dim;
    int64_t q_row_lo = cfg_.tp_rank * q_rows_per_rank;
    int64_t q_row_hi = q_row_lo + q_rows_per_rank;
    if (!load_tensor_row_slice_(base + ".self_attn.q_proj.weight", q_row_lo, q_row_hi,
                                out.q_proj))
        return false;

    // k_proj, v_proj: HF shape [num_kv * head_dim, D].
    // Case A (tp <= n_kv): split rows across ranks, each rank gets n_kv/tp KV heads.
    // Case B (tp > n_kv):  each rank gets exactly ONE KV head; a group of
    //                      (tp/n_kv) ranks share it.
    //                      kv_head_idx = tp_rank / (tp_size / n_kv)
    if (cfg_.tp_size <= cfg_.num_key_value_heads) {
        int64_t kv_rows_per_rank = cfg_.n_kv_heads_per_rank * head_dim;
        int64_t kv_row_lo = cfg_.tp_rank * kv_rows_per_rank;
        int64_t kv_row_hi = kv_row_lo + kv_rows_per_rank;
        if (!load_tensor_row_slice_(base + ".self_attn.k_proj.weight", kv_row_lo,
                                    kv_row_hi, out.k_proj))
            return false;
        if (!load_tensor_row_slice_(base + ".self_attn.v_proj.weight", kv_row_lo,
                                    kv_row_hi, out.v_proj))
            return false;
    } else {
        // GQA replicated-group mode: 1 KV head per rank, selected by group.
        int64_t ranks_per_kv = cfg_.tp_size / cfg_.num_key_value_heads;
        int64_t kv_head_idx = cfg_.tp_rank / ranks_per_kv;
        int64_t kv_row_lo = kv_head_idx * head_dim;
        int64_t kv_row_hi = kv_row_lo + head_dim;
        if (!load_tensor_row_slice_(base + ".self_attn.k_proj.weight", kv_row_lo,
                                    kv_row_hi, out.k_proj))
            return false;
        if (!load_tensor_row_slice_(base + ".self_attn.v_proj.weight", kv_row_lo,
                                    kv_row_hi, out.v_proj))
            return false;
    }

    // o_proj: [D, q_full], row-parallel → shard cols (the input dim) by head,
    // using the same slicing as the q rows.
    if (!load_tensor_col_slice_(base + ".self_attn.o_proj.weight", q_row_lo, q_row_hi,
                                out.o_proj))
        return false;
    return true;
}
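// KV sharding arithmetic above, with concrete numbers (an illustrative config,
// not necessarily this model's): with num_key_value_heads = 4 and tp_size = 8,
// ranks_per_kv = 8 / 4 = 2, so ranks {0,1} load KV head 0, ranks {2,3} head 1,
// and so on; each KV head is replicated across its group of 2 ranks, and the
// per-rank slice is a single head_dim-row band of k_proj/v_proj. With
// tp_size = 2 instead, Case A applies: each rank loads
// n_kv_heads_per_rank = 4 / 2 = 2 contiguous KV heads. Both cases assume the
// relevant divisibility (tp_size % n_kv == 0 or n_kv % tp_size == 0) was
// validated when cfg_ was built.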