#include "model_config.h"
#include <cstdio>
#include <fstream>
#include <sstream>
#include <string>
#include "json.hpp"
using json = nlohmann::json;
bool ModelConfig::load_from_json(const std::string& path) {
  std::ifstream f(path);
  if (!f) {
    fprintf(stderr, "ModelConfig: cannot open %s\n", path.c_str());
    return false;
  }
  json j;
  try { f >> j; } catch (const std::exception& e) {
    fprintf(stderr, "ModelConfig: bad json: %s\n", e.what());
    return false;
  }
  // Read key k if present and non-null; otherwise fall back to def.
  // The return type is deduced from the default value's type.
  auto get = [&](const char* k, auto def) {
    if (j.contains(k) && !j[k].is_null()) return j[k].get<decltype(def)>();
    return def;
  };
  vocab_size = get("vocab_size", (int64_t)0);
  hidden_size = get("hidden_size", (int64_t)0);
  intermediate_size = get("intermediate_size", (int64_t)0);
  moe_intermediate_size = get("moe_intermediate_size", (int64_t)0);
  num_hidden_layers = get("num_hidden_layers", (int64_t)0);
  num_attention_heads = get("num_attention_heads", (int64_t)0);
  num_key_value_heads = get("num_key_value_heads", (int64_t)0);
  head_dim = get("head_dim", (int64_t)0);
  num_experts = get("num_experts", (int64_t)0);
  num_experts_per_tok = get("num_experts_per_tok", (int64_t)0);
  max_position_embeddings = get("max_position_embeddings", (int64_t)0);
  rope_theta = (float)get("rope_theta", (double)10000.0);
  rms_norm_eps = (float)get("rms_norm_eps", (double)1e-6);
  norm_topk_prob = get("norm_topk_prob", true);
  tie_word_embeddings = get("tie_word_embeddings", false);
  bos_token_id = get("bos_token_id", (int64_t)0);
  eos_token_id = get("eos_token_id", (int64_t)0);
  // Sanity check: these fields must be present and non-zero.
  if (num_attention_heads == 0 || head_dim == 0 || hidden_size == 0) {
    fprintf(stderr, "ModelConfig: required fields missing\n");
    return false;
  }
  return true;
}
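// For reference, a minimal config.json that this parser accepts might look like the
// excerpt below. The keys are exactly the ones read above; the values are illustrative
// Qwen3-MoE-style numbers, not taken from any particular checkpoint.
//   {
//     "vocab_size": 151936,
//     "hidden_size": 2048,
//     "intermediate_size": 6144,
//     "moe_intermediate_size": 768,
//     "num_hidden_layers": 48,
//     "num_attention_heads": 32,
//     "num_key_value_heads": 4,
//     "head_dim": 128,
//     "num_experts": 128,
//     "num_experts_per_tok": 8,
//     "max_position_embeddings": 40960,
//     "rope_theta": 1000000.0,
//     "rms_norm_eps": 1e-6,
//     "norm_topk_prob": true,
//     "tie_word_embeddings": false,
//     "bos_token_id": 151643,
//     "eos_token_id": 151645
//   }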
void ModelConfig::compute_derived(int tps, int tpr) {
  tp_size = tps;
  tp_rank = tpr;
  // Attention Q: split by head.
  if (num_attention_heads % tp_size != 0) {
    fprintf(stderr, "WARN: num_attention_heads=%ld not divisible by tp_size=%d\n",
            (long)num_attention_heads, tp_size);
  }
  n_heads_per_rank = num_attention_heads / tp_size;
  q_dim_per_rank = n_heads_per_rank * head_dim;
  // Attention KV: GQA sharding.
  // Case A (tp_size <= num_kv_heads): split KV heads across ranks.
  //   n_kv_heads_per_rank = num_kv_heads / tp_size
  // Case B (tp_size > num_kv_heads): each rank holds ONE kv head, and ranks in the same
  //   "group" share that head (group size = tp_size / num_kv_heads).
  //   n_kv_heads_per_rank = 1
  //   kv_head_idx_for_rank = tp_rank / (tp_size / num_kv_heads)
  // This matches the GQA semantics: each group of (num_q_heads / num_kv_heads) Q heads
  // shares one KV head. FIAS is given matched Hq (rank-local Q heads) and Hkv=1.
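  // Illustrative numbers (hypothetical config, not read from any file):
  //   32 Q heads, 4 KV heads, head_dim = 128.
  //   tp_size = 2 -> Case A: n_heads_per_rank = 16, n_kv_heads_per_rank = 2,
  //                  q_dim_per_rank = 2048, kv_dim_per_rank = 256.
  //   tp_size = 8 -> Case B: n_heads_per_rank = 4, n_kv_heads_per_rank = 1,
  //                  q_dim_per_rank = 512, kv_dim_per_rank = 128;
  //                  ranks {0,1} share kv head 0, ranks {2,3} share kv head 1, etc.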
  if (tp_size <= num_key_value_heads && num_key_value_heads % tp_size == 0) {
    n_kv_heads_per_rank = num_key_value_heads / tp_size;
  } else if (tp_size % num_key_value_heads == 0) {
    n_kv_heads_per_rank = 1;
  } else {
    fprintf(stderr, "WARN: non-standard TP/KV head ratio: tp=%d kv=%ld — falling back to replicate-all\n",
            tp_size, (long)num_key_value_heads);
    n_kv_heads_per_rank = num_key_value_heads;
  }
  kv_dim_per_rank = n_kv_heads_per_rank * head_dim;
  // MoE intermediate dim split
  if (moe_intermediate_size % tp_size != 0) {
    fprintf(stderr, "WARN: moe_intermediate_size=%ld not divisible by tp_size=%d\n",
            (long)moe_intermediate_size, tp_size);
  }
  i_per_rank = moe_intermediate_size / tp_size;
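  // Illustrative (hypothetical values): moe_intermediate_size = 768 with tp_size = 4
  // gives i_per_rank = 192, i.e. each rank holds a 192-wide slice of the expert FFN
  // intermediate dimension.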
}
std::string ModelConfig::describe() const {
  std::ostringstream os;
  os << "Qwen3MoE config:\n"
     << " vocab_size = " << vocab_size << "\n"
     << " hidden_size = " << hidden_size << "\n"
     << " num_hidden_layers = " << num_hidden_layers << "\n"
     << " num_attention_heads = " << num_attention_heads << "\n"
     << " num_key_value_heads = " << num_key_value_heads << "\n"
     << " head_dim = " << head_dim << "\n"
     << " num_experts = " << num_experts << "\n"
     << " num_experts_per_tok = " << num_experts_per_tok << "\n"
     << " moe_intermediate_size = " << moe_intermediate_size << "\n"
     << " rope_theta = " << rope_theta << "\n"
     << " rms_norm_eps = " << rms_norm_eps << "\n"
     << " max_pos_embeddings = " << max_position_embeddings << "\n"
     << "TP rank " << tp_rank << " / " << tp_size << " derived:\n"
     << " n_heads_per_rank = " << n_heads_per_rank << "\n"
     << " q_dim_per_rank = " << q_dim_per_rank << "\n"
     << " n_kv_heads_per_rank = " << n_kv_heads_per_rank << "\n"
     << " kv_dim_per_rank = " << kv_dim_per_rank << "\n"
     << " i_per_rank = " << i_per_rank << "\n";
  return os.str();
}
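// Sketch of how the three methods above fit together (illustrative only; the actual
// call sites live elsewhere in the engine, and the path, tp values, and my_rank
// variable below are made up):
//
//   ModelConfig cfg;
//   if (!cfg.load_from_json("checkpoints/qwen3-moe/config.json")) {
//     return 1;
//   }
//   cfg.compute_derived(/*tps=*/8, /*tpr=*/my_rank);
//   fputs(cfg.describe().c_str(), stderr);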