// File-local flag, initially false.
// NOTE(review): presumably a one-shot latch for a warning about legacy Mixtral models;
// nothing in this part of the file reads or writes it - confirm where it is used.
static bool old_mixtral_warning_showed{false};
|
|
| |
| #include "llama-impl.cpp" |
| #include "llama-chat.cpp" |
| #include "llama-mmap.cpp" |
| #include "llama-context.cpp" |
| #include "llama-adapter.cpp" |
| #include "llama-arch.cpp" |
| #include "llama-batch.cpp" |
| #include "llama-vocab.cpp" |
| #include "llama-grammar.cpp" |
| #include "llama-sampling.cpp" |
| #include "llama-kv-cache.cpp" |
| #include "llama-model-loader.cpp" |
| #include "llama-model.cpp" |
| #include "llama-quant.cpp" |
| #include "llama-hparams.cpp" |
|
|
| #include "ggml.h" |
| #include "ggml-alloc.h" |
| #include "ggml-backend.h" |
| #include "ggml-cpp.h" |
|
|
| #include <algorithm> |
| #include <array> |
| #include <cassert> |
| #include <cfloat> |
| #include <cmath> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstdio> |
| #include <cstring> |
| #include <ctime> |
| #include <functional> |
| #include <numeric> |
| #include <type_traits> |
| #include <iostream> |
|
|
| #ifdef GGML_USE_CUDA |
| # include "ggml-cuda.h" |
| #elif defined(GGML_USE_CLBLAST) |
| # include "ggml_v3b-opencl.h" |
| #endif |
|
|
|
|
| #if defined(_MSC_VER) |
| #pragma warning(disable: 4244 4267) |
| #endif |
|
|
| |
| static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) { |
| |
| |
| model.t_load_us = 0; |
| time_meas tm(model.t_load_us); |
|
|
| model.t_start_us = tm.t_start_us; |
|
|
| try { |
| llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); |
|
|
| ml.print_info(); |
|
|
| model.hparams.vocab_only = params.vocab_only; |
|
|
| try { |
| model.load_arch(ml); |
| } catch(const std::exception & e) { |
| throw std::runtime_error("error loading model architecture: " + std::string(e.what())); |
| } |
| try { |
| model.load_hparams(ml); |
| } catch(const std::exception & e) { |
| throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); |
| } |
| try { |
| model.load_vocab(ml); |
| } catch(const std::exception & e) { |
| throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); |
| } |
|
|
| model.load_stats(ml); |
| model.print_info(); |
|
|
| if (params.vocab_only) { |
| LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); |
| return 0; |
| } |
|
|
| if (!model.load_tensors(ml)) { |
| return -2; |
| } |
| } catch (const std::exception & err) { |
| LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); |
| return -1; |
| } |
|
|
| return 0; |
| } |
|
|
| |
| |
| |
|
|
// Callback the graph builders invoke on the tensors they create: it receives the tensor,
// a name for it, and a layer index (the builders in this file pass -1 for the input tensors).
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

// Activation applied inside a feed-forward block.
enum llm_ffn_op_type {
    LLM_FFN_SILU     = 0,
    LLM_FFN_GELU     = 1,
    LLM_FFN_RELU     = 2,
    LLM_FFN_RELU_SQR = 3,
    LLM_FFN_SWIGLU   = 4,
};

// How the gate projection is wired relative to the up projection.
enum llm_ffn_gate_type {
    LLM_FFN_SEQ = 0, // gate is applied to the output of the up projection
    LLM_FFN_PAR = 1, // gate is applied to the same input as the up projection
};

// Normalization variant selected in llm_build_norm.
enum llm_norm_type {
    LLM_NORM       = 0,
    LLM_NORM_RMS   = 1,
    LLM_NORM_GROUP = 2,
};
|
|
| static struct ggml_tensor * llm_build_inp_embd( |
| struct ggml_context * ctx, |
| struct llama_context & lctx, |
| const llama_hparams & hparams, |
| const llama_ubatch & ubatch, |
| struct ggml_tensor * tok_embd, |
| const llm_build_cb & cb) { |
| const int64_t n_embd = hparams.n_embd; |
|
|
| struct ggml_tensor * inpL; |
|
|
| if (ubatch.token) { |
| lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); |
| cb(lctx.inp_tokens, "inp_tokens", -1); |
| ggml_set_input(lctx.inp_tokens); |
|
|
| inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); |
|
|
| |
| for (auto & it : lctx.lora) { |
| struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd); |
| if (lw == nullptr) { |
| continue; |
| } |
| const float adapter_scale = it.second; |
| const float scale = lw->get_scale(it.first->alpha, adapter_scale); |
| struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( |
| ctx, lw->b, |
| ggml_get_rows(ctx, lw->a, lctx.inp_tokens) |
| ), scale); |
| inpL = ggml_add(ctx, inpL, inpL_delta); |
| } |
| } else { |
| lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); |
| inpL = lctx.inp_embd; |
| ggml_set_input(lctx.inp_embd); |
| } |
|
|
| |
| if (hparams.f_embedding_scale != 0.0f) { |
| inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); |
| } |
|
|
| cb(inpL, "inp_embd", -1); |
|
|
| return inpL; |
| } |
|
|
| static void llm_build_kv_store( |
| struct ggml_context * ctx, |
| const llama_hparams & hparams, |
| const llama_cparams & cparams, |
| const llama_kv_cache & kv, |
| struct ggml_cgraph * graph, |
| struct ggml_tensor * k_cur, |
| struct ggml_tensor * v_cur, |
| int32_t n_tokens, |
| int32_t kv_head, |
| const llm_build_cb & cb, |
| int64_t il) { |
| const int64_t n_ctx = cparams.n_ctx; |
|
|
| const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); |
| const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); |
|
|
| GGML_ASSERT(kv.size == n_ctx); |
|
|
| struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); |
| cb(k_cache_view, "k_cache_view", il); |
|
|
| |
| ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); |
|
|
| assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); |
|
|
| struct ggml_tensor * v_cache_view = nullptr; |
|
|
| if (cparams.flash_attn) { |
| v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); |
| } else { |
| |
| v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, |
| ( n_ctx)*ggml_element_size(kv.v_l[il]), |
| (kv_head)*ggml_element_size(kv.v_l[il])); |
|
|
| v_cur = ggml_transpose(ctx, v_cur); |
| } |
| cb(v_cache_view, "v_cache_view", il); |
|
|
| ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); |
| } |
|
|
| |
| static struct ggml_tensor * llm_build_lora_mm( |
| struct llama_context & lctx, |
| struct ggml_context * ctx0, |
| struct ggml_tensor * w, |
| struct ggml_tensor * cur) { |
| struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); |
| for (auto & it : lctx.lora) { |
| struct llama_adapter_lora_weight * lw = it.first->get_weight(w); |
| if (lw == nullptr) { |
| continue; |
| } |
| const float adapter_scale = it.second; |
| const float scale = lw->get_scale(it.first->alpha, adapter_scale); |
| struct ggml_tensor * ab_cur = ggml_mul_mat( |
| ctx0, lw->b, |
| ggml_mul_mat(ctx0, lw->a, cur) |
| ); |
| ab_cur = ggml_scale(ctx0, ab_cur, scale); |
| res = ggml_add(ctx0, res, ab_cur); |
| } |
| return res; |
| } |
|
|
| |
| static struct ggml_tensor * llm_build_lora_mm_id( |
| struct llama_context & lctx, |
| struct ggml_context * ctx0, |
| struct ggml_tensor * w, |
| struct ggml_tensor * cur, |
| struct ggml_tensor * ids) { |
| struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); |
| for (auto & it : lctx.lora) { |
| struct llama_adapter_lora_weight * lw = it.first->get_weight(w); |
| if (lw == nullptr) { |
| continue; |
| } |
| const float alpha = it.first->alpha; |
| const float rank = (float) lw->b->ne[0]; |
| const float scale = alpha ? it.second * alpha / rank : it.second; |
| struct ggml_tensor * ab_cur = ggml_mul_mat_id( |
| ctx0, lw->b, |
| ggml_mul_mat_id(ctx0, lw->a, cur, ids), |
| ids |
| ); |
| ab_cur = ggml_scale(ctx0, ab_cur, scale); |
| res = ggml_add(ctx0, res, ab_cur); |
| } |
| return res; |
| } |
|
|
| static struct ggml_tensor * llm_build_norm( |
| struct ggml_context * ctx, |
| struct ggml_tensor * cur, |
| const llama_hparams & hparams, |
| struct ggml_tensor * mw, |
| struct ggml_tensor * mb, |
| llm_norm_type type, |
| const llm_build_cb & cb, |
| int il) { |
| switch (type) { |
| case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; |
| case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; |
| case LLM_NORM_GROUP: |
| { |
| cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); |
| cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); |
| cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); |
| } break; |
| } |
|
|
| if (mw || mb) { |
| cb(cur, "norm", il); |
| } |
|
|
| if (mw) { |
| cur = ggml_mul(ctx, cur, mw); |
| if (mb) { |
| cb(cur, "norm_w", il); |
| } |
| } |
|
|
| if (mb) { |
| cur = ggml_add(ctx, cur, mb); |
| } |
|
|
| return cur; |
| } |
|
|
| static struct ggml_tensor * llm_build_ffn( |
| struct ggml_context * ctx, |
| struct llama_context & lctx, |
| struct ggml_tensor * cur, |
| struct ggml_tensor * up, |
| struct ggml_tensor * up_b, |
| struct ggml_tensor * up_s, |
| struct ggml_tensor * gate, |
| struct ggml_tensor * gate_b, |
| struct ggml_tensor * gate_s, |
| struct ggml_tensor * down, |
| struct ggml_tensor * down_b, |
| struct ggml_tensor * down_s, |
| struct ggml_tensor * act_scales, |
| llm_ffn_op_type type_op, |
| llm_ffn_gate_type type_gate, |
| const llm_build_cb & cb, |
| int il) { |
| struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; |
| cb(tmp, "ffn_up", il); |
|
|
| if (up_b) { |
| tmp = ggml_add(ctx, tmp, up_b); |
| cb(tmp, "ffn_up_b", il); |
| } |
|
|
| if (up_s) { |
| tmp = ggml_mul(ctx, tmp, up_s); |
| cb(tmp, "ffn_up_s", il); |
| } |
|
|
| if (gate) { |
| switch (type_gate) { |
| case LLM_FFN_SEQ: |
| { |
| cur = llm_build_lora_mm(lctx, ctx, gate, tmp); |
| cb(cur, "ffn_gate", il); |
| } break; |
| case LLM_FFN_PAR: |
| { |
| cur = llm_build_lora_mm(lctx, ctx, gate, cur); |
| cb(cur, "ffn_gate", il); |
| } break; |
| } |
|
|
| if (gate_b) { |
| cur = ggml_add(ctx, cur, gate_b); |
| cb(cur, "ffn_gate_b", il); |
| } |
|
|
| if (gate_s) { |
| cur = ggml_mul(ctx, cur, gate_s); |
| cb(cur, "ffn_gate_s", il); |
| } |
|
|
| } else { |
| cur = tmp; |
| } |
|
|
| switch (type_op) { |
| case LLM_FFN_SILU: |
| { |
| cur = ggml_silu(ctx, cur); |
| cb(cur, "ffn_silu", il); |
| } break; |
| case LLM_FFN_GELU: |
| { |
| cur = ggml_gelu(ctx, cur); |
| cb(cur, "ffn_gelu", il); |
| if (act_scales != NULL) { |
| cur = ggml_div(ctx, cur, act_scales); |
| cb(cur, "ffn_act", il); |
| } |
| } break; |
| case LLM_FFN_RELU: |
| { |
| cur = ggml_relu(ctx, cur); |
| cb(cur, "ffn_relu", il); |
| } break; |
| case LLM_FFN_RELU_SQR: |
| { |
| cur = ggml_relu(ctx, cur); |
| cb(cur, "ffn_relu", il); |
|
|
| cur = ggml_sqr(ctx, cur); |
| cb(cur, "ffn_sqr(relu)", il); |
| } break; |
| case LLM_FFN_SWIGLU: |
| { |
| |
| int64_t split_point = cur->ne[0] / 2; |
| struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); |
| struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); |
|
|
| x0 = ggml_silu(ctx, x0); |
| cb(cur, "ffn_silu", il); |
|
|
| cur = ggml_mul(ctx, x0, x1); |
| cb(cur, "ffn_mul", il); |
| } break; |
| } |
|
|
| if (type_gate == LLM_FFN_PAR) { |
| cur = ggml_mul(ctx, cur, tmp); |
| cb(cur, "ffn_gate_par", il); |
| } |
|
|
| if (down) { |
| cur = llm_build_lora_mm(lctx, ctx, down, cur); |
| } |
|
|
| if (down_b) { |
| cb(cur, "ffn_down", il); |
| } |
|
|
| if (down_b) { |
| cur = ggml_add(ctx, cur, down_b); |
| } |
|
|
| if (down_s) { |
| cur = ggml_mul(ctx, cur, down_s); |
| cb(cur, "ffn_down_s", il); |
| } |
|
|
| return cur; |
| } |
|
|
| static struct ggml_tensor * llm_build_moe_ffn( |
| struct ggml_context * ctx, |
| struct llama_context & lctx, |
| struct ggml_tensor * cur, |
| struct ggml_tensor * gate_inp, |
| struct ggml_tensor * up_exps, |
| struct ggml_tensor * gate_exps, |
| struct ggml_tensor * down_exps, |
| struct ggml_tensor * exp_probs_b, |
| int64_t n_expert, |
| int64_t n_expert_used, |
| llm_ffn_op_type type_op, |
| bool norm_w, |
| bool scale_w, |
| float w_scale, |
| llama_expert_gating_func_type gating_op, |
| const llm_build_cb & cb, |
| int il) { |
| int64_t n_embd = cur->ne[0]; |
| int64_t n_tokens = cur->ne[1]; |
|
|
| ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); |
| cb(logits, "ffn_moe_logits", il); |
|
|
| ggml_tensor * probs = nullptr; |
| switch (gating_op) { |
| case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: |
| { |
| probs = ggml_soft_max(ctx, logits); |
| } break; |
| case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: |
| { |
| probs = ggml_sigmoid(ctx, logits); |
| } break; |
| default: |
| GGML_ABORT("fatal error"); |
| } |
| cb(probs, "ffn_moe_probs", il); |
|
|
| |
| |
| ggml_tensor * selection_probs = probs; |
| if (exp_probs_b != nullptr) { |
| selection_probs = ggml_add(ctx, probs, exp_probs_b); |
| cb(selection_probs, "ffn_moe_probs_biased", il); |
| } |
|
|
| |
| ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); |
| cb(selected_experts->src[0], "ffn_moe_argsort", il); |
| cb(selected_experts, "ffn_moe_topk", il); |
|
|
| ggml_tensor * weights = ggml_get_rows(ctx, |
| ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); |
| cb(weights, "ffn_moe_weights", il); |
|
|
| if (norm_w) { |
| weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); |
|
|
| ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); |
| cb(weights_sum, "ffn_moe_weights_sum", il); |
|
|
| weights = ggml_div(ctx, weights, weights_sum); |
| cb(weights, "ffn_moe_weights_norm", il); |
|
|
| weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); |
| } |
| if (scale_w) { |
| weights = ggml_scale(ctx, weights, w_scale); |
| cb(weights, "ffn_moe_weights_scaled", il); |
| } |
|
|
| cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); |
| ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); |
| cb(up, "ffn_moe_up", il); |
|
|
| ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); |
| cb(gate, "ffn_moe_gate", il); |
|
|
| switch (type_op) { |
| case LLM_FFN_SILU: |
| { |
| gate = ggml_silu(ctx, gate); |
| cb(gate, "ffn_moe_silu", il); |
| } break; |
| case LLM_FFN_GELU: |
| { |
| gate = ggml_gelu(ctx, gate); |
| cb(gate, "ffn_moe_gelu", il); |
| } break; |
| default: |
| GGML_ABORT("fatal error"); |
| } |
|
|
| ggml_tensor * par = ggml_mul(ctx, up, gate); |
| cb(par, "ffn_moe_gate_par", il); |
|
|
| ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); |
| cb(experts, "ffn_moe_down", il); |
|
|
| experts = ggml_mul(ctx, experts, weights); |
|
|
| |
| ggml_tensor * moe_out = nullptr; |
| for (int i = 0; i < n_expert_used; ++i) { |
| ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, |
| experts->nb[2], i*experts->nb[1]); |
|
|
| if (i == 0) { |
| moe_out = cur_expert; |
| } else { |
| moe_out = ggml_add(ctx, moe_out, cur_expert); |
| } |
| } |
|
|
| if (n_expert_used == 1) { |
| |
| moe_out = ggml_cont(ctx, moe_out); |
| } |
|
|
| return moe_out; |
| } |
|
|
| static struct ggml_tensor * llm_build_kqv( |
| struct ggml_context * ctx, |
| struct llama_context & lctx, |
| const llama_kv_cache & kv, |
| struct ggml_cgraph * graph, |
| struct ggml_tensor * wo, |
| struct ggml_tensor * wo_b, |
| struct ggml_tensor * q_cur, |
| struct ggml_tensor * kq_mask, |
| int32_t n_tokens, |
| int32_t n_kv, |
| float kq_scale, |
| const llm_build_cb & cb, |
| int il) { |
| const llama_model & model = lctx.model; |
| const llama_hparams & hparams = lctx.model.hparams; |
| const llama_cparams & cparams = lctx.cparams; |
|
|
| const int64_t n_ctx = cparams.n_ctx; |
| const int64_t n_head = hparams.n_head(il); |
| const int64_t n_head_kv = hparams.n_head_kv(il); |
| const int64_t n_embd_head_k = hparams.n_embd_head_k; |
| const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); |
| const int64_t n_embd_head_v = hparams.n_embd_head_v; |
| const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); |
|
|
| struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); |
| cb(q, "q", il); |
|
|
| struct ggml_tensor * k = |
| ggml_view_3d(ctx, kv.k_l[il], |
| n_embd_head_k, n_kv, n_head_kv, |
| ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), |
| ggml_row_size(kv.k_l[il]->type, n_embd_head_k), |
| 0); |
| cb(k, "k", il); |
|
|
| struct ggml_tensor * cur; |
|
|
| if (cparams.flash_attn) { |
| GGML_UNUSED(model); |
| GGML_UNUSED(n_ctx); |
|
|
| |
| struct ggml_tensor * v = |
| ggml_view_3d(ctx, kv.v_l[il], |
| n_embd_head_v, n_kv, n_head_kv, |
| ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), |
| ggml_row_size(kv.v_l[il]->type, n_embd_head_v), |
| 0); |
| cb(v, "v", il); |
|
|
| cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, |
| hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); |
|
|
| #if defined(GGML_USE_HIP) |
| if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2 || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { |
| ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); |
| } |
| #else |
| ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); |
| #endif |
|
|
| cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); |
| } else { |
| struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); |
| cb(kq, "kq", il); |
|
|
| #if defined(GGML_USE_HIP) |
| if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { |
| |
| |
| ggml_mul_mat_set_prec(kq, GGML_PREC_F32); |
| } |
| #else |
| |
| |
| ggml_mul_mat_set_prec(kq, GGML_PREC_F32); |
| #endif |
|
|
| if (model.arch == LLM_ARCH_GROK) { |
| |
| |
| |
| |
| |
|
|
| kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); |
| kq = ggml_scale(ctx, kq, 30); |
| } |
|
|
| if (hparams.attn_soft_cap) { |
| kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); |
| kq = ggml_tanh(ctx, kq); |
| kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); |
| } |
|
|
| kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); |
| cb(kq, "kq_soft_max_ext", il); |
|
|
| GGML_ASSERT(kv.size == n_ctx); |
|
|
| |
| struct ggml_tensor * v = |
| ggml_view_3d(ctx, kv.v_l[il], |
| n_kv, n_embd_head_v, n_head_kv, |
| ggml_element_size(kv.v_l[il])*n_ctx, |
| ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, |
| 0); |
| cb(v, "v", il); |
|
|
| struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); |
| cb(kqv, "kqv", il); |
|
|
| struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); |
| cb(kqv_merged, "kqv_merged", il); |
|
|
| cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); |
| cb(cur, "kqv_merged_cont", il); |
| } |
|
|
| ggml_build_forward_expand(graph, cur); |
|
|
| if (wo) { |
| cur = llm_build_lora_mm(lctx, ctx, wo, cur); |
| } |
|
|
| if (wo_b) { |
| cb(cur, "kqv_wo", il); |
| } |
|
|
| if (wo_b) { |
| cur = ggml_add(ctx, cur, wo_b); |
| } |
|
|
| return cur; |
| } |
|
|
| static struct ggml_tensor * llm_build_kv( |
| struct ggml_context * ctx, |
| struct llama_context & lctx, |
| const llama_kv_cache & kv, |
| struct ggml_cgraph * graph, |
| struct ggml_tensor * wo, |
| struct ggml_tensor * wo_b, |
| struct ggml_tensor * k_cur, |
| struct ggml_tensor * v_cur, |
| struct ggml_tensor * q_cur, |
| struct ggml_tensor * kq_mask, |
| int32_t n_tokens, |
| int32_t kv_head, |
| int32_t n_kv, |
| float kq_scale, |
| const llm_build_cb & cb, |
| int il) { |
| const llama_hparams & hparams = lctx.model.hparams; |
| const llama_cparams & cparams = lctx.cparams; |
|
|
| |
| |
| ggml_build_forward_expand(graph, q_cur); |
| ggml_build_forward_expand(graph, k_cur); |
| ggml_build_forward_expand(graph, v_cur); |
|
|
| llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); |
|
|
| struct ggml_tensor * cur; |
|
|
| cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); |
| cb(cur, "kqv_out", il); |
|
|
| return cur; |
| } |
|
|
| static struct ggml_tensor * llm_build_copy_mask_state( |
| struct ggml_context * ctx, |
| struct ggml_cgraph * graph, |
| struct ggml_tensor * s, |
| struct ggml_tensor * state_copy, |
| struct ggml_tensor * state_mask, |
| int32_t n_state, |
| int32_t kv_size, |
| int32_t kv_head, |
| int32_t n_kv, |
| int32_t n_seqs) { |
| struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size); |
|
|
| |
| |
| |
| states = ggml_get_rows(ctx, states, state_copy); |
|
|
| |
| |
| states = ggml_mul(ctx, states, state_mask); |
|
|
| |
| ggml_build_forward_expand(graph, |
| ggml_cpy(ctx, |
| ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), |
| ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); |
|
|
| |
| return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); |
| } |
|
|
| |
| static struct ggml_tensor * llm_build_mamba( |
| struct ggml_context * ctx, |
| struct llama_context & lctx, |
| const llama_ubatch & ubatch, |
| struct ggml_cgraph * graph, |
| struct ggml_tensor * cur, |
| struct ggml_tensor * state_copy, |
| struct ggml_tensor * state_mask, |
| int32_t kv_head, |
| int32_t n_kv, |
| const llm_build_cb & cb, |
| int il) { |
| const llama_model & model = lctx.model; |
| const llama_hparams & hparams = model.hparams; |
| const llama_kv_cache & kv = lctx.kv_self; |
| const int64_t d_conv = hparams.ssm_d_conv; |
| const int64_t d_inner = hparams.ssm_d_inner; |
| const int64_t d_state = hparams.ssm_d_state; |
| const int64_t dt_rank = hparams.ssm_dt_rank; |
| const int64_t n_seqs = ubatch.n_seqs; |
| |
| const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; |
| |
| const float norm_rms_eps = hparams.f_norm_rms_eps; |
|
|
| const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
|
|
| GGML_ASSERT(n_seqs != 0); |
| GGML_ASSERT(ubatch.equal_seqs); |
| GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); |
|
|
| struct ggml_tensor * conv_states_all = kv.k_l[il]; |
| struct ggml_tensor * ssm_states_all = kv.v_l[il]; |
|
|
| |
| struct ggml_tensor * conv = llm_build_copy_mask_state(ctx, |
| graph, conv_states_all, state_copy, state_mask, |
| hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); |
| conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); |
| struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx, |
| graph, ssm_states_all, state_copy, state_mask, |
| hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); |
| ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); |
|
|
| |
| cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); |
|
|
| |
| struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); |
| |
| |
| struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); |
| struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); |
|
|
| |
| { |
| |
| struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); |
|
|
| |
| struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); |
|
|
| ggml_build_forward_expand(graph, |
| ggml_cpy(ctx, last_conv, |
| ggml_view_1d(ctx, conv_states_all, |
| (d_conv - 1)*(d_inner)*(n_seqs), |
| kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); |
|
|
| |
| x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); |
|
|
| x = ggml_silu(ctx, x); |
| } |
|
|
| |
| { |
| |
| struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); |
| |
| struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); |
| struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); |
| struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); |
|
|
| |
| if (ssm_dt_b_c_rms) { |
| dt = ggml_rms_norm(ctx, dt, norm_rms_eps); |
| B = ggml_rms_norm(ctx, B, norm_rms_eps); |
| C = ggml_rms_norm(ctx, C, norm_rms_eps); |
| } |
|
|
| |
| dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); |
| dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b); |
|
|
| |
| |
| |
| struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); |
|
|
| |
| ggml_build_forward_expand(graph, |
| ggml_cpy(ctx, |
| ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), |
| ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); |
|
|
| struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); |
|
|
| |
|
|
| |
| y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d)); |
| y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z))); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); |
| } |
|
|
| |
| cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); |
| cb(cur, "mamba_out", il); |
|
|
| return cur; |
| } |
|
|
| static struct ggml_tensor * llm_build_rwkv6_time_mix( |
| struct llama_context & lctx, |
| struct ggml_context * ctx, |
| const struct llama_layer * layer, |
| struct ggml_tensor * cur, |
| struct ggml_tensor * x_prev, |
| struct ggml_tensor ** wkv_state, |
| size_t wkv_head_size, |
| size_t head_count_kv) { |
| size_t n_embd = cur->ne[0]; |
| size_t n_seq_tokens = cur->ne[1]; |
| size_t n_seqs = cur->ne[2]; |
|
|
| size_t head_size = wkv_head_size; |
| size_t head_count = n_embd / head_size; |
|
|
| size_t n_tokens = n_seqs * n_seq_tokens; |
|
|
| bool is_qrwkv = layer->time_mix_first == nullptr; |
|
|
| struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); |
|
|
| sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); |
| cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); |
|
|
| struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); |
|
|
| xxx = ggml_reshape_4d( |
| ctx, |
| ggml_tanh( |
| ctx, |
| ggml_mul_mat(ctx, layer->time_mix_w1, xxx) |
| ), |
| layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens |
| ); |
|
|
| xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); |
|
|
| xxx = ggml_mul_mat( |
| ctx, |
| ggml_reshape_4d( |
| ctx, |
| layer->time_mix_w2, |
| layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 |
| ), |
| xxx |
| ); |
|
|
| struct ggml_tensor *xw, *xk, *xv, *xr, *xg; |
| if (layer->time_mix_lerp_fused) { |
| |
| sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); |
| cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); |
| xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); |
| xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); |
| xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); |
| xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); |
| xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); |
| xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); |
| } else { |
| |
| xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); |
| xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); |
| xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); |
| xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); |
| xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); |
|
|
| xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); |
| xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); |
| xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); |
| xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); |
| xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); |
| } |
|
|
| struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); |
| struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); |
| struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); |
| if (layer->time_mix_receptance_b) { |
| r = ggml_add(ctx, r, layer->time_mix_receptance_b); |
| } |
| if (layer->time_mix_key_b) { |
| k = ggml_add(ctx, k, layer->time_mix_key_b); |
| } |
| if (layer->time_mix_value_b) { |
| v = ggml_add(ctx, v, layer->time_mix_value_b); |
| } |
|
|
| struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); |
| if (is_qrwkv) { |
| g = ggml_sigmoid(ctx, g); |
| } else { |
| g = ggml_silu(ctx, g); |
| } |
|
|
| if (head_count_kv != head_count) { |
| GGML_ASSERT(head_count % head_count_kv == 0); |
| k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); |
| v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); |
| struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); |
| k = ggml_repeat(ctx, k, tmp); |
| v = ggml_repeat(ctx, v, tmp); |
| } |
|
|
| k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); |
| v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); |
| r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); |
|
|
| struct ggml_tensor * w = ggml_mul_mat( |
| ctx, |
| layer->time_mix_decay_w2, |
| ggml_tanh( |
| ctx, |
| ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) |
| ) |
| ); |
|
|
| w = ggml_add(ctx, w, layer->time_mix_decay); |
| w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); |
| w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); |
|
|
| if (is_qrwkv) { |
| |
| k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); |
| } |
|
|
| struct ggml_tensor * wkv_output; |
| if (!layer->time_mix_first) { |
| wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); |
| } else { |
| wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); |
| } |
| cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); |
| *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); |
|
|
| if (!is_qrwkv) { |
| |
| cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); |
| cur = ggml_norm(ctx, cur, 64e-5f); |
|
|
| |
| cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); |
| cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); |
| } else { |
| cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); |
| } |
|
|
| cur = ggml_mul(ctx, cur, g); |
| cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); |
|
|
| return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); |
| } |
|
|
| static struct ggml_tensor * llm_build_rwkv6_channel_mix( |
| struct llama_context & lctx, |
| struct ggml_context * ctx, |
| const struct llama_layer * layer, |
| struct ggml_tensor * cur, |
| struct ggml_tensor * x_prev) { |
| struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); |
| struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); |
| struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); |
|
|
| struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr)); |
| struct ggml_tensor * k = ggml_sqr( |
| ctx, |
| ggml_relu( |
| ctx, |
| llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk) |
| ) |
| ); |
|
|
| return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); |
| } |
|
|
| struct llm_build_context { |
| const llama_model & model; |
| llama_context & lctx; |
| const llama_hparams & hparams; |
| const llama_cparams & cparams; |
| const llama_ubatch & ubatch; |
| const llama_kv_cache & kv_self; |
|
|
| const int64_t n_embd; |
| const int64_t n_layer; |
| const int64_t n_rot; |
| const int64_t n_ctx; |
| const int64_t n_head; |
| const int64_t n_head_kv; |
| const int64_t n_embd_head_k; |
| const int64_t n_embd_k_gqa; |
| const int64_t n_embd_head_v; |
| const int64_t n_embd_v_gqa; |
| const int64_t n_expert; |
| const int64_t n_expert_used; |
|
|
| const float freq_base; |
| const float freq_scale; |
| const float ext_factor; |
| const float attn_factor; |
| const float beta_fast; |
| const float beta_slow; |
| const float norm_eps; |
| const float norm_rms_eps; |
|
|
| const int32_t n_tokens; |
| const int32_t n_kv; |
| const int32_t n_outputs; |
| const int32_t n_outputs_enc; |
| const int32_t kv_head; |
| const int32_t n_ctx_orig; |
|
|
| const bool flash_attn; |
|
|
| const enum llama_pooling_type pooling_type; |
| const enum llama_rope_type rope_type; |
|
|
| const llm_build_cb & cb; |
|
|
| std::vector<uint8_t> & buf_compute_meta; |
|
|
| struct ggml_context * ctx0 = nullptr; |
|
|
| |
| llm_build_context( |
| llama_context & lctx, |
| const llama_ubatch & ubatch, |
| const llm_build_cb & cb, |
| bool worst_case) : |
| model (lctx.model), |
| lctx (lctx), |
| hparams (model.hparams), |
| cparams (lctx.cparams), |
| ubatch (ubatch), |
| kv_self (lctx.kv_self), |
| n_embd (hparams.n_embd), |
| n_layer (hparams.n_layer), |
| n_rot (hparams.n_rot), |
| n_ctx (cparams.n_ctx), |
| n_head (hparams.n_head()), |
| n_head_kv (hparams.n_head_kv()), |
| n_embd_head_k (hparams.n_embd_head_k), |
| n_embd_k_gqa (hparams.n_embd_k_gqa()), |
| n_embd_head_v (hparams.n_embd_head_v), |
| n_embd_v_gqa (hparams.n_embd_v_gqa()), |
| n_expert (hparams.n_expert), |
| n_expert_used (hparams.n_expert_used), |
| freq_base (cparams.rope_freq_base), |
| freq_scale (cparams.rope_freq_scale), |
| ext_factor (cparams.yarn_ext_factor), |
| attn_factor (cparams.yarn_attn_factor), |
| beta_fast (cparams.yarn_beta_fast), |
| beta_slow (cparams.yarn_beta_slow), |
| norm_eps (hparams.f_norm_eps), |
| norm_rms_eps (hparams.f_norm_rms_eps), |
| n_tokens (ubatch.n_tokens), |
| n_kv (worst_case ? kv_self.size : kv_self.n), |
| n_outputs (worst_case ? n_tokens : lctx.n_outputs), |
| n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), |
| kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), |
| n_ctx_orig (cparams.n_ctx_orig_yarn), |
| flash_attn (cparams.flash_attn), |
| pooling_type (cparams.pooling_type), |
| rope_type (hparams.rope_type), |
| cb (cb), |
| buf_compute_meta (lctx.buf_compute_meta) { |
| |
| } |
|
|
| void init() { |
| struct ggml_init_params params = { |
| buf_compute_meta.size(), |
| buf_compute_meta.data(), |
| true, |
| }; |
|
|
| ctx0 = ggml_init(params); |
|
|
| lctx.inp_tokens = nullptr; |
| lctx.inp_embd = nullptr; |
| lctx.inp_pos = nullptr; |
| lctx.inp_out_ids = nullptr; |
| lctx.inp_KQ_mask = nullptr; |
| lctx.inp_KQ_mask_swa = nullptr; |
| lctx.inp_K_shift = nullptr; |
| lctx.inp_mean = nullptr; |
| lctx.inp_cls = nullptr; |
| lctx.inp_s_copy = nullptr; |
| lctx.inp_s_mask = nullptr; |
| lctx.inp_s_seq = nullptr; |
| lctx.inp_pos_bucket = nullptr; |
| lctx.inp_embd_enc = nullptr; |
| lctx.inp_KQ_mask_cross = nullptr; |
| } |
|
|
| void free() { |
| ggml_free(ctx0); |
| ctx0 = nullptr; |
| } |
|
|
| struct ggml_cgraph * build_k_shift() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| GGML_ASSERT(kv_self.size == n_ctx); |
|
|
| lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); |
| cb(lctx.inp_K_shift, "K_shift", -1); |
| ggml_set_input(lctx.inp_K_shift); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| const int64_t n_head_kv = hparams.n_head_kv(il); |
| const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
| struct ggml_tensor * k = |
| ggml_view_3d(ctx0, kv_self.k_l[il], |
| n_embd_head_k, n_head_kv, n_ctx, |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), |
| 0); |
|
|
| struct ggml_tensor * tmp; |
| if (ggml_is_quantized(k->type)) { |
| |
| tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); |
| cb(tmp, "K_f32", il); |
| for (auto & backend : lctx.backends) { |
| |
| if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { |
| ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); |
| break; |
| } |
| } |
| tmp = ggml_rope_ext_inplace(ctx0, tmp, |
| lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow); |
| cb(tmp, "K_shifted_f32", il); |
| tmp = ggml_cpy(ctx0, tmp, k); |
| } else { |
| |
| tmp = ggml_rope_ext_inplace(ctx0, k, |
| lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow); |
| } |
| cb(tmp, "K_shifted", il); |
| ggml_build_forward_expand(gf, tmp); |
| } |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| for (uint32_t i = 0; i < ids.size(); ++i) { |
| const uint32_t id = ids[i]; |
|
|
| if (i == id || id == ids.size()) { |
| continue; |
| } |
|
|
| uint32_t nm = 1; |
|
|
| while (i + nm < ids.size() && ids[i + nm] == id + nm) { |
| nm++; |
| } |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); |
| const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); |
|
|
| ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], |
| n_embd_k_gqa, nm, |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); |
|
|
| ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], |
| n_embd_k_gqa, nm, |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); |
|
|
| ggml_tensor * view_v_src; |
| ggml_tensor * view_v_dst; |
|
|
| if (flash_attn) { |
| |
| view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], |
| n_embd_v_gqa, nm, |
| ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), |
| ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); |
|
|
| view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], |
| n_embd_v_gqa, nm, |
| ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), |
| ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); |
| } else { |
| view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], |
| nm, n_embd_v_gqa, |
| ggml_row_size(kv_self.v_l[il]->type, kv_self.size), |
| ggml_row_size(kv_self.v_l[il]->type, i)); |
|
|
| view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], |
| nm, n_embd_v_gqa, |
| ggml_row_size(kv_self.v_l[il]->type, kv_self.size), |
| ggml_row_size(kv_self.v_l[il]->type, id)); |
| } |
|
|
| ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); |
| ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); |
| } |
|
|
| i += nm - 1; |
| } |
|
|
| |
|
|
| return gf; |
| } |
|
|
| struct ggml_tensor * build_inp_pos() { |
| lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); |
| cb(lctx.inp_pos, "inp_pos", -1); |
| ggml_set_input(lctx.inp_pos); |
| return lctx.inp_pos; |
| } |
|
|
| struct ggml_tensor * build_rope_factors(int il) { |
| |
| const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; |
|
|
| if (model.layers[il].rope_freqs != nullptr) { |
| return model.layers[il].rope_freqs; |
| } |
|
|
| if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { |
| return model.layers[il].rope_long; |
| } |
|
|
| return model.layers[il].rope_short; |
| } |
|
|
| struct ggml_tensor * build_inp_out_ids() { |
| lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); |
| cb(lctx.inp_out_ids, "inp_out_ids", -1); |
| ggml_set_input(lctx.inp_out_ids); |
| return lctx.inp_out_ids; |
| } |
|
|
| struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { |
| lctx.inp_KQ_mask = causal |
| ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) |
| : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); |
| cb(lctx.inp_KQ_mask, "KQ_mask", -1); |
| ggml_set_input(lctx.inp_KQ_mask); |
|
|
| return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; |
| } |
|
|
| struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { |
| GGML_ASSERT(hparams.n_swa > 0); |
|
|
| lctx.inp_KQ_mask_swa = causal |
| ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) |
| : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); |
| cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); |
| ggml_set_input(lctx.inp_KQ_mask_swa); |
|
|
| return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; |
| } |
|
|
| struct ggml_tensor * build_inp_mean() { |
| lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); |
| cb(lctx.inp_mean, "inp_mean", -1); |
| ggml_set_input(lctx.inp_mean); |
| return lctx.inp_mean; |
| } |
|
|
| struct ggml_tensor * build_inp_cls() { |
| lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); |
| cb(lctx.inp_cls, "inp_cls", -1); |
| ggml_set_input(lctx.inp_cls); |
| return lctx.inp_cls; |
| } |
|
|
| struct ggml_tensor * build_inp_s_copy() { |
| lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); |
| cb(lctx.inp_s_copy, "inp_s_copy", -1); |
| ggml_set_input(lctx.inp_s_copy); |
| return lctx.inp_s_copy; |
| } |
|
|
| struct ggml_tensor * build_inp_s_mask() { |
| lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); |
| cb(lctx.inp_s_mask, "inp_s_mask", -1); |
| ggml_set_input(lctx.inp_s_mask); |
| return lctx.inp_s_mask; |
| } |
|
|
| struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { |
| |
| struct ggml_tensor * inp = nullptr; |
| for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { |
| inp = ggml_graph_node(gf, i); |
| if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { |
| break; |
| } else { |
| inp = nullptr; |
| } |
| } |
| GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); |
|
|
| struct ggml_tensor * cur; |
|
|
| switch (pooling_type) { |
| case LLAMA_POOLING_TYPE_NONE: |
| { |
| cur = inp; |
| } break; |
| case LLAMA_POOLING_TYPE_MEAN: |
| { |
| struct ggml_tensor * inp_mean = build_inp_mean(); |
| cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean); |
| } break; |
| case LLAMA_POOLING_TYPE_CLS: |
| case LLAMA_POOLING_TYPE_LAST: |
| { |
| struct ggml_tensor * inp_cls = build_inp_cls(); |
| cur = ggml_get_rows(ctx0, inp, inp_cls); |
| } break; |
| case LLAMA_POOLING_TYPE_RANK: |
| { |
| struct ggml_tensor * inp_cls = build_inp_cls(); |
| inp = ggml_get_rows(ctx0, inp, inp_cls); |
|
|
| |
| |
| GGML_ASSERT(model.cls != nullptr); |
| GGML_ASSERT(model.cls_b != nullptr); |
|
|
| cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b); |
| cur = ggml_tanh(ctx0, cur); |
|
|
| |
| |
| if (model.cls_out) { |
| GGML_ASSERT(model.cls_out_b != nullptr); |
|
|
| cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b); |
| } |
| } break; |
| default: |
| { |
| GGML_ABORT("unknown pooling type"); |
| } |
| } |
|
|
| cb(cur, "result_embd_pooled", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_tensor * llm_build_pos_bucket(bool causal) { |
| if (causal) { |
| lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); |
| } else { |
| lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); |
| } |
|
|
| ggml_set_input(lctx.inp_pos_bucket); |
| cb(lctx.inp_pos_bucket, "pos_bucket", -1); |
|
|
| return lctx.inp_pos_bucket; |
| } |
|
|
| struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { |
| struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); |
| cb(pos_bucket_1d, "pos_bucket_1d", -1); |
|
|
| struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); |
| cb(pos_bias, "pos_bias", -1); |
|
|
| pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); |
| cb(pos_bias, "pos_bias", -1); |
|
|
| pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); |
| cb(pos_bias, "pos_bias", -1); |
|
|
| pos_bias = ggml_cont(ctx0, pos_bias); |
| cb(pos_bias, "pos_bias", -1); |
|
|
| return pos_bias; |
| } |
|
|
| struct ggml_tensor * llm_build_inp_embd_enc() { |
| const int64_t n_embd = hparams.n_embd; |
| lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); |
| ggml_set_input(lctx.inp_embd_enc); |
| cb(lctx.inp_embd_enc, "embd_enc", -1); |
| return lctx.inp_embd_enc; |
| } |
|
|
| struct ggml_tensor * llm_build_inp_KQ_mask_cross() { |
| lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); |
| ggml_set_input(lctx.inp_KQ_mask_cross); |
| cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); |
| return lctx.inp_KQ_mask_cross; |
| } |
|
|
| struct ggml_cgraph * build_llama() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| #if defined(GGML_USE_CLBLAST) |
| struct ggml_tensor * rope_factors = nullptr; |
| #else |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
| #endif |
|
|
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| |
| if (hparams.f_residual_scale) { |
| cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| if (model.layers[il].ffn_gate_inp == nullptr) { |
|
|
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } else { |
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, true, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
| } |
|
|
| |
| if (hparams.f_residual_scale) { |
| cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| |
| if (hparams.f_logit_scale) { |
| cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); |
| } |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_deci() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
| const int64_t n_head_kv = hparams.n_head_kv(il); |
| const int64_t n_head = hparams.n_head(il); |
|
|
| if (n_head == 0) { |
| |
| cur = inpL; |
| } else { |
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
| } |
|
|
| if (n_head > 0 && n_head_kv == 0) { |
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); |
| cb(cur, "wo", il); |
| } else if (n_head > 0) { |
| |
| |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
|
|
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| |
| if (hparams.f_residual_scale) { |
| cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = cur; |
| if (n_head > 0) { |
| ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
| } |
|
|
| |
| if (model.layers[il].ffn_gate_inp == nullptr) { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| |
| if (hparams.f_residual_scale) { |
| cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| |
| if (hparams.f_logit_scale) { |
| cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); |
| } |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_baichuan() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| switch (model.type) { |
| case LLM_TYPE_7B: |
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| break; |
| case LLM_TYPE_13B: |
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); |
| break; |
| default: |
| GGML_ABORT("fatal error"); |
| } |
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_xverse() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_falcon() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * attn_norm; |
|
|
| attn_norm = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(attn_norm, "attn_norm", il); |
|
|
| |
| { |
| if (model.layers[il].attn_norm_2) { |
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm_2, |
| model.layers[il].attn_norm_2_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm_2", il); |
| } else { |
| cur = attn_norm; |
| } |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| |
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = cur; |
|
|
| |
| { |
| cur = llm_build_ffn(ctx0, lctx, attn_norm, |
| model.layers[il].ffn_up, NULL, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = ggml_add(ctx0, cur, inpL); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_grok() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| |
| |
| if (model.layers[il].attn_out_norm) { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].attn_out_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_out_norm", il); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_GELU, true, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
|
|
| |
| |
| |
| if (model.layers[il].layer_out_norm) { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].layer_out_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "layer_out_norm", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| |
| |
|
|
| cur = ggml_scale(ctx0, cur, 0.5773502691896257f); |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_dbrx() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = nullptr; |
| struct ggml_tensor * Kcur = nullptr; |
| struct ggml_tensor * Vcur = nullptr; |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); |
| cb(cur, "wqkv_clamped", il); |
|
|
| Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].attn_out_norm, NULL, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_out_norm", il); |
|
|
| cur = llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, true, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_starcoder() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); |
| cb(pos, "pos_embd", -1); |
|
|
| inpL = ggml_add(ctx0, inpL, pos); |
| cb(inpL, "inpL", -1); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_refact() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
| cb(Kcur, "Kcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| cb(Qcur, "Qcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_bert() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
|
|
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
| struct ggml_tensor * inp_pos = nullptr; |
|
|
| if (model.arch != LLM_ARCH_JINA_BERT_V2) { |
| inp_pos = build_inp_pos(); |
| } |
|
|
| |
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); |
| inpL = ggml_add(ctx0, inpL, type_row0); |
| if (model.arch == LLM_ARCH_BERT) { |
| inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); |
| } |
| cb(inpL, "inp_embd", -1); |
|
|
| |
| inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); |
| cb(inpL, "inp_norm", -1); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); |
|
|
| |
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * cur = inpL; |
|
|
| struct ggml_tensor * Qcur; |
| struct ggml_tensor * Kcur; |
| struct ggml_tensor * Vcur; |
|
|
| |
| if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { |
| Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
|
|
| if (model.layers[il].attn_q_norm) { |
| Qcur = llm_build_norm(ctx0, Qcur, hparams, |
| model.layers[il].attn_q_norm, |
| model.layers[il].attn_q_norm_b, |
| LLM_NORM, cb, il); |
| } |
|
|
| Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
|
|
| if (model.layers[il].attn_k_norm) { |
| Kcur = llm_build_norm(ctx0, Kcur, hparams, |
| model.layers[il].attn_k_norm, |
| model.layers[il].attn_k_norm_b, |
| LLM_NORM, cb, il); |
| } |
| Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
| } else { |
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); |
| struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); |
|
|
| struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); |
| cb(kq, "kq", il); |
|
|
| kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); |
| cb(kq, "kq_soft_max_ext", il); |
|
|
| struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); |
| cb(v, "v", il); |
|
|
| struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); |
| cb(kqv, "kqv", il); |
|
|
| struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); |
| cb(kqv_merged, "kqv_merged", il); |
|
|
| cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); |
| cb(cur, "kqv_merged_cont", il); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); |
| if (model.layers[il].bo) { |
| cb(cur, "kqv_wo", il); |
| } |
|
|
| if (model.layers[il].bo) { |
| cur = ggml_add(ctx0, cur, model.layers[il].bo); |
| } |
| cb(cur, "kqv_out", il); |
|
|
| if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| cur = ggml_add(ctx0, cur, inpL); |
|
|
| |
| cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); |
|
|
| if (model.layers[il].attn_norm_2 != nullptr) { |
| cur = ggml_add(ctx0, cur, inpL); |
| cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); |
| } |
|
|
| struct ggml_tensor * ffn_inp = cur; |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| if (model.arch == LLM_ARCH_BERT) { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_PAR, cb, il); |
| } else { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| } |
| cb(cur, "ffn_out", il); |
|
|
| |
| cur = ggml_add(ctx0, cur, ffn_inp); |
|
|
| |
| cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cb(cur, "result_embd", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_bloom() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| inpL = llm_build_norm(ctx0, inpL, hparams, |
| model.tok_norm, |
| model.tok_norm_b, |
| LLM_NORM, cb, -1); |
| cb(inpL, "inp_norm", -1); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_mpt() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * pos; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| if (model.pos_embd) { |
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
| pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); |
| cb(pos, "pos_embd", -1); |
|
|
| inpL = ggml_add(ctx0, inpL, pos); |
| cb(inpL, "inpL", -1); |
| } |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * attn_norm; |
|
|
| attn_norm = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(attn_norm, "attn_norm", il); |
|
|
| |
| { |
| cur = attn_norm; |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| if (model.layers[il].bqkv){ |
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
| } |
|
|
| if (hparams.f_clamp_kqv > 0.0f) { |
| cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); |
| cb(cur, "wqkv_clamped", il); |
| } |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| |
| if (model.layers[il].attn_q_norm) { |
| Qcur = llm_build_norm(ctx0, Qcur, hparams, |
| model.layers[il].attn_q_norm, |
| model.layers[il].attn_q_norm_b, |
| LLM_NORM, cb, il); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = llm_build_norm(ctx0, Kcur, hparams, |
| model.layers[il].attn_k_norm, |
| model.layers[il].attn_k_norm_b, |
| LLM_NORM, cb, il); |
| cb(Kcur, "Kcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } else { |
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| model.layers[il].ffn_act, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_stablelm() { |
| struct ggml_cgraph * gf = ggml_new_graph(ctx0); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
|
|
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| struct ggml_tensor * inpSA = cur; |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| cb(Qcur, "Qcur", il); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
| cb(Kcur, "Kcur", il); |
|
|
| if (model.layers[il].attn_q_norm) { |
| Qcur = llm_build_norm(ctx0, Qcur, hparams, |
| model.layers[il].attn_q_norm, |
| NULL, |
| LLM_NORM, cb, il); |
| cb(Qcur, "Qcur", il); |
| } |
| if (model.layers[il].attn_k_norm) { |
| Kcur = llm_build_norm(ctx0, Kcur, hparams, |
| model.layers[il].attn_k_norm, |
| NULL, |
| LLM_NORM, cb, il); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
|
|
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| if (model.layers[il].ffn_norm) { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
| } else { |
| |
| cur = inpSA; |
| } |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_qwen() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| |
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_qwen2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_qwen2vl() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); |
| cb(lctx.inp_pos, "inp_pos", -1); |
| ggml_set_input(lctx.inp_pos); |
| struct ggml_tensor * inp_pos = lctx.inp_pos; |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
| int sections[4]; |
| std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_multi( |
| ctx0, |
| ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_multi( |
| ctx0, |
| ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_qwen2moe() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| ggml_tensor * moe_out = |
| llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, false, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
|
|
| |
| { |
| ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); |
| cb(cur_gate_inp, "ffn_shexp_gate_inp", il); |
|
|
| |
| ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); |
| cb(cur_gate, "ffn_shexp_gate", il); |
|
|
| ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up_shexp, NULL, NULL, |
| model.layers[il].ffn_gate_shexp, NULL, NULL, |
| model.layers[il].ffn_down_shexp, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur_ffn, "ffn_shexp", il); |
|
|
| ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate); |
| cb(ffn_shexp_out, "ffn_shexp_out", il); |
|
|
| moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); |
| cb(moe_out, "ffn_out", il); |
|
|
| cur = moe_out; |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_phi2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * attn_norm_output; |
| struct ggml_tensor * ffn_output; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| attn_norm_output = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(attn_norm_output, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = nullptr; |
| struct ggml_tensor * Kcur = nullptr; |
| struct ggml_tensor * Vcur = nullptr; |
|
|
| if (model.layers[il].wqkv) { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
| } else { |
| Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); |
| Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); |
| Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); |
| } |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| |
| |
| Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); |
| } |
|
|
| |
| { |
| ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(ffn_output, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_output); |
| cur = ggml_add(ctx0, cur, inpL); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output_no_bias", -1); |
|
|
| cur = ggml_add(ctx0, cur, model.output_b); |
| cb(cur, "result_output", -1); |
| ggml_build_forward_expand(gf, cur); |
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_phi3() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = nullptr; |
| if (hparams.n_swa == 0) { |
| |
| KQ_mask = build_inp_KQ_mask(); |
| } else { |
| KQ_mask = build_inp_KQ_mask_swa(); |
| } |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| auto residual = inpL; |
|
|
| |
| { |
| |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
|
|
| struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM_RMS, cb, il); |
| cb(attn_norm_output, "attn_norm", il); |
|
|
| struct ggml_tensor * Qcur = nullptr; |
| struct ggml_tensor * Kcur = nullptr; |
| struct ggml_tensor * Vcur = nullptr; |
|
|
| if (model.layers[il].wqkv) { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); |
| cb(cur, "wqkv", il); |
|
|
| Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); |
| Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); |
| Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); |
| } else { |
| Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); |
| Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); |
| Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); |
| } |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor* inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| residual = ggml_get_rows(ctx0, residual, inp_out_ids); |
| } |
|
|
| cur = ggml_add(ctx0, cur, residual); |
| residual = cur; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| |
| if (model.layers[il].ffn_gate_inp == nullptr) { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } else { |
| |
| cur = llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, true, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, residual, cur); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| if (model.output_b != nullptr) { |
| cb(cur, "result_output_no_bias", -1); |
| cur = ggml_add(ctx0, cur, model.output_b); |
| } |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
|
|
| struct ggml_cgraph * build_plamo() { |
| struct ggml_cgraph * gf = ggml_new_graph(ctx0); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| struct ggml_tensor * attention_norm = cur; |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, |
| n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
| struct ggml_tensor * sa_out = cur; |
|
|
| cur = attention_norm; |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, sa_out); |
| cur = ggml_add(ctx0, cur, inpL); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_gpt2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * pos; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); |
| cb(pos, "pos_embd", -1); |
|
|
| inpL = ggml_add(ctx0, inpL, pos); |
| cb(inpL, "inpL", -1); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_codeshell() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(tmpq, "tmpq", il); |
| cb(tmpk, "tmpk", il); |
| cb(Vcur, "Vcur", il); |
|
|
| struct ggml_tensor * Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| // Builds the forward graph for the Orion architecture. |
| // |
| // Every layer runs: LLM_NORM (weight + bias) -> bias-free Q/K/V projections -> RoPE |
| // on Q and K -> llm_build_kv attention -> residual add -> LLM_NORM -> llm_build_ffn |
| // in LLM_FFN_SILU / LLM_FFN_PAR mode -> residual add -> lctx.cvec.apply_to. |
| // The last hidden state is normalised with model.output_norm and projected through |
| // model.output. Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_orion() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     // a single head size has to serve K, V and the rotary dimension |
|     const int64_t n_embd_head = hparams.n_embd_head_v; |
|     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|     GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|     // attention scale is identical for every layer |
|     const float attn_scale = 1.0f/sqrtf(float(n_embd_head)); |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|     ggml_tensor * attn_mask = build_inp_KQ_mask(); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         // remembered for the residual connection around attention |
|         ggml_tensor * skip = layer_in; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, layer.attn_norm_b, |
|                 LLM_NORM, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             ggml_tensor * q = llm_build_lora_mm(lctx, ctx0, layer.wq, h); |
|             cb(q, "Qcur", il); |
|
|             ggml_tensor * k = llm_build_lora_mm(lctx, ctx0, layer.wk, h); |
|             cb(k, "Kcur", il); |
|
|             ggml_tensor * v = llm_build_lora_mm(lctx, ctx0, layer.wv, h); |
|             cb(v, "Vcur", il); |
|
|             // reshape to (head size, heads, tokens) and rotate |
|             q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); |
|             q = ggml_rope_ext( |
|                     ctx0, q, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q, "Qcur", il); |
|
|             k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); |
|             k = ggml_rope_ext( |
|                     ctx0, k, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k, "Kcur", il); |
|
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, nullptr, |
|                     k, v, q, attn_mask, n_tokens, kv_head, n_kv, attn_scale, cb, il); |
|         } |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h    = ggml_get_rows(ctx0, h,    out_ids); |
|             skip = ggml_get_rows(ctx0, skip, out_ids); |
|         } |
|
|         ggml_tensor * ffn_in = ggml_add(ctx0, h, skip); |
|         cb(ffn_in, "ffn_inp", il); |
|
|         // feed-forward |
|         h = llm_build_norm(ctx0, ffn_in, hparams, |
|                 layer.ffn_norm, layer.ffn_norm_b, |
|                 LLM_NORM, cb, il); |
|         cb(h, "ffn_norm", il); |
|
|         h = llm_build_ffn(ctx0, lctx, h, |
|                 layer.ffn_up,   nullptr, nullptr, |
|                 layer.ffn_gate, nullptr, nullptr, |
|                 layer.ffn_down, nullptr, nullptr, |
|                 nullptr, |
|                 LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
|         cb(h, "ffn_out", il); |
|
|         h = ggml_add(ctx0, h, ffn_in); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, model.output_norm_b, |
|             LLM_NORM, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the InternLM2 architecture. |
| // |
| // Every layer runs: LLM_NORM_RMS -> Q/K/V projections (a bias is added only when the |
| // layer carries one) -> RoPE on Q and K -> llm_build_kv attention with wo/bo -> |
| // residual add -> LLM_NORM_RMS -> llm_build_ffn in LLM_FFN_SILU / LLM_FFN_PAR mode -> |
| // residual add -> lctx.cvec.apply_to. The last hidden state is normalised with |
| // model.output_norm and projected through model.output. |
| // Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_internlm2() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     // a single head size has to serve K, V and the rotary dimension |
|     const int64_t n_embd_head = hparams.n_embd_head_v; |
|     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|     GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|     const float attn_scale = 1.0f/sqrtf(float(n_embd_head)); |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|     ggml_tensor * attn_mask = build_inp_KQ_mask(); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         // remembered for the residual connection around attention |
|         ggml_tensor * skip = layer_in; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             // multiplies the normed input by `w`; adds `b` only if it is present |
|             const auto project = [&](ggml_tensor * w, ggml_tensor * b, const char * name) { |
|                 ggml_tensor * t = llm_build_lora_mm(lctx, ctx0, w, h); |
|                 cb(t, name, il); |
|                 if (b) { |
|                     t = ggml_add(ctx0, t, b); |
|                     cb(t, name, il); |
|                 } |
|                 return t; |
|             }; |
|
|             ggml_tensor * q = project(layer.wq, layer.bq, "Qcur"); |
|             ggml_tensor * k = project(layer.wk, layer.bk, "Kcur"); |
|             ggml_tensor * v = project(layer.wv, layer.bv, "Vcur"); |
|
|             // reshape to (head size, heads, tokens) and rotate |
|             q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); |
|             q = ggml_rope_ext( |
|                     ctx0, q, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q, "Qcur", il); |
|
|             k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); |
|             k = ggml_rope_ext( |
|                     ctx0, k, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k, "Kcur", il); |
|
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, layer.bo, |
|                     k, v, q, attn_mask, n_tokens, kv_head, n_kv, attn_scale, cb, il); |
|         } |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h    = ggml_get_rows(ctx0, h,    out_ids); |
|             skip = ggml_get_rows(ctx0, skip, out_ids); |
|         } |
|
|         ggml_tensor * ffn_in = ggml_add(ctx0, h, skip); |
|         cb(ffn_in, "ffn_inp", il); |
|
|         // feed-forward |
|         h = llm_build_norm(ctx0, ffn_in, hparams, |
|                 layer.ffn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "ffn_norm", il); |
|
|         h = llm_build_ffn(ctx0, lctx, h, |
|                 layer.ffn_up,   nullptr, nullptr, |
|                 layer.ffn_gate, nullptr, nullptr, |
|                 layer.ffn_down, nullptr, nullptr, |
|                 nullptr, |
|                 LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
|         cb(h, "ffn_out", il); |
|
|         h = ggml_add(ctx0, h, ffn_in); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, nullptr, |
|             LLM_NORM_RMS, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the MiniCPM3 architecture. |
| // |
| // Queries go through a two-stage projection (wq_a -> RMS norm -> wq_b). Keys and |
| // values come from one compressed projection (wkv_a_mqa) that is split into a |
| // kv_lora_rank-wide part (normed, then expanded by wkv_b) and a single rotary key |
| // slice. Only the rotary slices of Q and K pass through RoPE; they are concatenated |
| // with the non-rotary slices before llm_build_kv. Embeddings, both residual branches |
| // and the final hidden state are multiplied by fixed scale factors. |
| // Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_minicpm3() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     // NOTE(review): these scaling constants are hard-coded rather than read from the model |
|     const int64_t n_embd_base = 256; |
|     const float   scale_embd  = 12.0f; |
|     const float   scale_depth = 1.4f; |
|     const float   kq_scale    = 1.0f / sqrtf(float(hparams.n_embd_head_k)); |
|
|     // per-head widths: rotary slice, non-rotary slice, and the (non-rotary K + V) pair in `kv` |
|     const uint32_t rope_dim     = hparams.n_rot; |
|     const uint32_t nope_dim     = hparams.n_embd_head_k - hparams.n_rot; |
|     const uint32_t kv_lora_rank = hparams.n_lora_kv; |
|     const uint32_t kv_head_dim  = nope_dim + hparams.n_embd_head_v; |
|
|     // factor applied to the attention and FFN outputs before each residual add |
|     const float scale_res = scale_depth/sqrtf(float(n_layer)); |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     // scale the input embeddings |
|     layer_in = ggml_scale(ctx0, layer_in, scale_embd); |
|     cb(layer_in, "inp_scaled", -1); |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|     ggml_tensor * attn_mask = build_inp_KQ_mask(); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         // remembered for the residual connection around attention |
|         ggml_tensor * skip = layer_in; |
|
|         ggml_tensor * rope_factors = build_rope_factors(il); |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             // two-stage query projection with an RMS norm in between |
|             ggml_tensor * q = ggml_mul_mat(ctx0, layer.wq_a, h); |
|             cb(q, "q", il); |
|
|             q = llm_build_norm(ctx0, q, hparams, |
|                     layer.attn_q_a_norm, nullptr, |
|                     LLM_NORM_RMS, cb, il); |
|             cb(q, "q", il); |
|
|             q = ggml_mul_mat(ctx0, layer.wq_b, q); |
|             cb(q, "q", il); |
|
|             // byte strides of one head and of one token inside `q` |
|             const size_t q_head_bytes  = ggml_row_size(q->type, hparams.n_embd_head_k); |
|             const size_t q_token_bytes = ggml_row_size(q->type, hparams.n_embd_head_k * n_head); |
|
|             // leading nope_dim elements of every query head |
|             ggml_tensor * q_nope = ggml_view_3d(ctx0, q, nope_dim, n_head, n_tokens, |
|                     q_head_bytes, |
|                     q_token_bytes, |
|                     0); |
|             cb(q_nope, "q_nope", il); |
|
|             // trailing rope_dim elements of every query head |
|             ggml_tensor * q_pe = ggml_view_3d(ctx0, q, rope_dim, n_head, n_tokens, |
|                     q_head_bytes, |
|                     q_token_bytes, |
|                     ggml_row_size(q->type, nope_dim)); |
|             cb(q_pe, "q_pe", il); |
|
|             // joint projection holding the compressed KV plus the rotary key slice |
|             // (the tensor name below keeps its existing spelling) |
|             ggml_tensor * kv_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, h); |
|             cb(kv_pe, "kv_pe_compresseed", il); |
|
|             // first kv_lora_rank elements of each row |
|             ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_pe, kv_lora_rank, n_tokens, |
|                     kv_pe->nb[1], |
|                     0); |
|             cb(kv_cmpr, "kv_compressed", il); |
|
|             // rope_dim elements that follow it, viewed as a single head |
|             ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe, rope_dim, 1, n_tokens, |
|                     kv_pe->nb[1], |
|                     kv_pe->nb[1], |
|                     ggml_row_size(kv_pe->type, kv_lora_rank)); |
|             cb(k_pe, "k_pe", il); |
|
|             // make the compressed part contiguous before normalising it |
|             kv_cmpr = ggml_cont(ctx0, kv_cmpr); |
|             kv_cmpr = llm_build_norm(ctx0, kv_cmpr, hparams, |
|                     layer.attn_kv_a_norm, nullptr, |
|                     LLM_NORM_RMS, cb, il); |
|             cb(kv_cmpr, "kv_compressed", il); |
|
|             // expand to the per-head (non-rotary K, V) pairs |
|             ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); |
|             cb(kv, "kv", il); |
|
|             // byte strides of one head and of one token inside `kv` |
|             const size_t kv_head_bytes  = ggml_row_size(kv->type, kv_head_dim); |
|             const size_t kv_token_bytes = ggml_row_size(kv->type, n_head * kv_head_dim); |
|
|             // leading nope_dim elements of every kv head |
|             ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, nope_dim, n_head, n_tokens, |
|                     kv_head_bytes, |
|                     kv_token_bytes, |
|                     0); |
|             cb(k_nope, "k_nope", il); |
|
|             // the n_embd_head_v elements after them are the values |
|             ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, |
|                     kv_head_bytes, |
|                     kv_token_bytes, |
|                     ggml_row_size(kv->type, nope_dim)); |
|             cb(v_states, "v_states", il); |
|
|             v_states = ggml_cont(ctx0, v_states); |
|             cb(v_states, "v_states", il); |
|
|             // flatten the heads: one row of n_embd_head_v * n_head values per token |
|             v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, |
|                     ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), |
|                     0); |
|             cb(v_states, "v_states", il); |
|
|             // RoPE is applied to contiguous copies of the rotary slices |
|             q_pe = ggml_cont(ctx0, q_pe); |
|             q_pe = ggml_rope_ext( |
|                     ctx0, q_pe, positions, rope_factors, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q_pe, "q_pe", il); |
|
|             k_pe = ggml_cont(ctx0, k_pe); |
|             k_pe = ggml_rope_ext( |
|                     ctx0, k_pe, positions, rope_factors, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k_pe, "k_pe", il); |
|
|             ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); |
|             cb(q_states, "q_states", il); |
|
|             // repeat the single rotary key slice to the shape of q_pe, then join with k_nope |
|             ggml_tensor * k_pe_rep = ggml_repeat(ctx0, k_pe, q_pe); |
|             ggml_tensor * k_states = ggml_concat(ctx0, k_nope, k_pe_rep, 0); |
|             cb(k_states, "k_states", il); |
|
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, nullptr, |
|                     k_states, v_states, q_states, attn_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); |
|         } |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h    = ggml_get_rows(ctx0, h,    out_ids); |
|             skip = ggml_get_rows(ctx0, skip, out_ids); |
|         } |
|
|         // scale the attention output before the residual add |
|         h = ggml_scale(ctx0, h, scale_res); |
|         cb(h, "hidden_scaled", il); |
|
|         ggml_tensor * ffn_in = ggml_add(ctx0, h, skip); |
|         cb(ffn_in, "ffn_inp", il); |
|
|         // feed-forward |
|         { |
|             h = llm_build_norm(ctx0, ffn_in, hparams, |
|                     layer.ffn_norm, nullptr, |
|                     LLM_NORM_RMS, cb, il); |
|             cb(h, "ffn_norm", il); |
|
|             h = llm_build_ffn(ctx0, lctx, h, |
|                     layer.ffn_up,   nullptr, nullptr, |
|                     layer.ffn_gate, nullptr, nullptr, |
|                     layer.ffn_down, nullptr, nullptr, |
|                     nullptr, |
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
|             cb(h, "ffn_out", il); |
|         } |
|
|         // scale the FFN output before the residual add |
|         h = ggml_scale(ctx0, h, scale_res); |
|         cb(h, "hidden_scaled_ffn", il); |
|
|         h = ggml_add(ctx0, h, ffn_in); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, nullptr, |
|             LLM_NORM_RMS, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // scale the hidden state by n_embd_base / n_embd before the output projection |
|     const float scale_lmhead = float(n_embd_base)/float(n_embd); |
|     h = ggml_scale(ctx0, h, scale_lmhead); |
|     cb(h, "lmhead_scaling", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the Gemma architecture. |
| // |
| // Input embeddings are multiplied by sqrt(n_embd). Every layer runs: LLM_NORM_RMS -> |
| // bias-free Q/K/V projections -> RoPE on Q and K -> Q scaled by 1/sqrt(head size) -> |
| // llm_build_kv attention (called with scale 1.0f because Q is already scaled) -> |
| // residual add -> LLM_NORM_RMS -> llm_build_ffn in LLM_FFN_GELU / LLM_FFN_PAR mode -> |
| // residual add -> lctx.cvec.apply_to. The last hidden state is normalised with |
| // model.output_norm and projected through model.output. |
| // Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_gemma() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     const int64_t head_dim = hparams.n_embd_head_k; |
|     const float   q_scale  = 1.0f / sqrtf(float(head_dim)); |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     layer_in = ggml_scale(ctx0, layer_in, sqrtf(n_embd)); |
|     cb(layer_in, "inp_scaled", -1); |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|     ggml_tensor * attn_mask = build_inp_KQ_mask(); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             ggml_tensor * q = llm_build_lora_mm(lctx, ctx0, layer.wq, h); |
|             cb(q, "Qcur", il); |
|
|             ggml_tensor * k = llm_build_lora_mm(lctx, ctx0, layer.wk, h); |
|             cb(k, "Kcur", il); |
|
|             ggml_tensor * v = llm_build_lora_mm(lctx, ctx0, layer.wv, h); |
|             cb(v, "Vcur", il); |
|
|             q = ggml_reshape_3d(ctx0, q, head_dim, n_head, n_tokens); |
|             q = ggml_rope_ext( |
|                     ctx0, q, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q, "Qcur", il); |
|
|             // the attention scale is applied to Q here instead of inside llm_build_kv |
|             q = ggml_scale(ctx0, q, q_scale); |
|             cb(q, "Qcur_scaled", il); |
|
|             k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens); |
|             k = ggml_rope_ext( |
|                     ctx0, k, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k, "Kcur", il); |
|
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, nullptr, |
|                     k, v, q, attn_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); |
|         } |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h        = ggml_get_rows(ctx0, h,        out_ids); |
|             layer_in = ggml_get_rows(ctx0, layer_in, out_ids); |
|         } |
|
|         // residual around attention |
|         ggml_tensor * attn_res = ggml_add(ctx0, h, layer_in); |
|         cb(attn_res, "sa_out", il); |
|
|         h = llm_build_norm(ctx0, attn_res, hparams, |
|                 layer.ffn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "ffn_norm", il); |
|
|         // feed-forward |
|         { |
|             h = llm_build_ffn(ctx0, lctx, h, |
|                     layer.ffn_up,   nullptr, nullptr, |
|                     layer.ffn_gate, nullptr, nullptr, |
|                     layer.ffn_down, nullptr, nullptr, |
|                     nullptr, |
|                     LLM_FFN_GELU, LLM_FFN_PAR, cb, il); |
|             cb(h, "ffn_out", il); |
|         } |
|
|         h = ggml_add(ctx0, h, attn_res); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, nullptr, |
|             LLM_NORM_RMS, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the Gemma 2 architecture. |
| // |
| // Input embeddings are multiplied by sqrt(n_embd). Even-indexed layers use the mask |
| // from build_inp_KQ_mask_swa, odd-indexed layers the one from build_inp_KQ_mask. |
| // Q is scaled before attention by a factor that depends on model.type; any type other |
| // than 2B, 9B or 27B aborts. Attention and FFN outputs each pass through an extra RMS |
| // norm (attn_post_norm / ffn_post_norm) before their residual add. The final logits are |
| // soft-capped: scaled down by f_final_logit_softcapping, passed through tanh, scaled back. |
| // Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_gemma2() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     const int64_t head_dim = hparams.n_embd_head_k; |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     layer_in = ggml_scale(ctx0, layer_in, sqrtf(n_embd)); |
|     cb(layer_in, "inp_scaled", -1); |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|
|     // two masks; each layer picks one of them below |
|     ggml_tensor * mask_full = build_inp_KQ_mask(true); |
|     ggml_tensor * mask_swa  = build_inp_KQ_mask_swa(true); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         // even layers take the SWA mask, odd layers the regular one |
|         ggml_tensor * layer_mask = (il % 2 == 0) ? mask_swa : mask_full; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             ggml_tensor * q = llm_build_lora_mm(lctx, ctx0, layer.wq, h); |
|             cb(q, "Qcur", il); |
|
|             ggml_tensor * k = llm_build_lora_mm(lctx, ctx0, layer.wk, h); |
|             cb(k, "Kcur", il); |
|
|             ggml_tensor * v = llm_build_lora_mm(lctx, ctx0, layer.wv, h); |
|             cb(v, "Vcur", il); |
|
|             q = ggml_reshape_3d(ctx0, q, head_dim, n_head, n_tokens); |
|             q = ggml_rope_ext( |
|                     ctx0, q, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q, "Qcur", il); |
|
|             // query scale depends on the model size; 27B divides n_embd by n_head |
|             // (integer division) instead of using the key head size |
|             float q_scale = 0.0f; |
|             switch (model.type) { |
|                 case LLM_TYPE_2B: |
|                 case LLM_TYPE_9B: |
|                     q_scale = 1.0f / sqrtf(float(head_dim)); |
|                     break; |
|                 case LLM_TYPE_27B: |
|                     q_scale = 1.0f / sqrtf(float(n_embd / n_head)); |
|                     break; |
|                 default: |
|                     GGML_ABORT("fatal error"); |
|             } |
|             q = ggml_scale(ctx0, q, q_scale); |
|             cb(q, "Qcur_scaled", il); |
|
|             k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens); |
|             k = ggml_rope_ext( |
|                     ctx0, k, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k, "Kcur", il); |
|
|             // scale 1.0f: Q has already been scaled above |
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, nullptr, |
|                     k, v, q, layer_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); |
|         } |
|
|         h = llm_build_norm(ctx0, h, hparams, |
|                 layer.attn_post_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_post_norm", il); |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h        = ggml_get_rows(ctx0, h,        out_ids); |
|             layer_in = ggml_get_rows(ctx0, layer_in, out_ids); |
|         } |
|
|         // residual around attention |
|         ggml_tensor * attn_res = ggml_add(ctx0, h, layer_in); |
|         cb(attn_res, "sa_out", il); |
|
|         h = llm_build_norm(ctx0, attn_res, hparams, |
|                 layer.ffn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "ffn_norm", il); |
|
|         // feed-forward |
|         { |
|             h = llm_build_ffn(ctx0, lctx, h, |
|                     layer.ffn_up,   nullptr, nullptr, |
|                     layer.ffn_gate, nullptr, nullptr, |
|                     layer.ffn_down, nullptr, nullptr, |
|                     nullptr, |
|                     LLM_FFN_GELU, LLM_FFN_PAR, cb, il); |
|             cb(h, "ffn_out", il); |
|         } |
|
|         // NOTE(review): this norm is tagged with layer index -1 while attn_post_norm |
|         // uses il - confirm that is intended |
|         h = llm_build_norm(ctx0, h, hparams, |
|                 layer.ffn_post_norm, nullptr, |
|                 LLM_NORM_RMS, cb, -1); |
|         cb(h, "ffn_post_norm", -1); |
|
|         h = ggml_add(ctx0, h, attn_res); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, nullptr, |
|             LLM_NORM_RMS, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|
|     // soft-cap the logits: cap * tanh(logits / cap) |
|     h = ggml_scale(ctx0, h, 1.0f / hparams.f_final_logit_softcapping); |
|     h = ggml_tanh(ctx0, h); |
|     h = ggml_scale(ctx0, h, hparams.f_final_logit_softcapping); |
|
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the Gemma 3 architecture. |
| // |
| // Input embeddings are multiplied by sqrt(n_embd) only when ubatch.token is set. |
| // Every sixth layer (il + 1 divisible by 6) uses the mask from build_inp_KQ_mask with |
| // the configured freq_base / freq_scale; all other layers use the mask from |
| // build_inp_KQ_mask_swa with a RoPE base of 10000 and scale 1. Q and K are RMS-normed |
| // per head before RoPE, and attention is scaled by hparams.f_attention_scale. |
| // Attention and FFN outputs each pass through an extra RMS norm before their |
| // residual add. Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_gemma3() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     const int64_t head_dim = hparams.n_embd_head_k; |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     // scaled only for token input; presumably raw embedding input arrives pre-scaled - confirm |
|     if (ubatch.token) { |
|         layer_in = ggml_scale(ctx0, layer_in, sqrtf(n_embd)); |
|         cb(layer_in, "inp_scaled", -1); |
|     } |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|
|     // two masks; each layer picks one of them below |
|     ggml_tensor * mask_full = build_inp_KQ_mask(true); |
|     ggml_tensor * mask_swa  = build_inp_KQ_mask_swa(true); |
|
|     // NOTE(review): the layer pattern is hard-coded here rather than read from hparams |
|     const int swa_period = 6; |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         // every swa_period-th layer is a non-sliding layer |
|         const bool  use_swa    = ((il + 1) % swa_period) != 0; |
|         const float rope_base  = use_swa ? 10000.0f : freq_base; |
|         const float rope_scale = use_swa ? 1.0f     : freq_scale; |
|
|         ggml_tensor * layer_mask = use_swa ? mask_swa : mask_full; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             ggml_tensor * q = llm_build_lora_mm(lctx, ctx0, layer.wq, h); |
|             cb(q, "Qcur", il); |
|
|             ggml_tensor * k = llm_build_lora_mm(lctx, ctx0, layer.wk, h); |
|             cb(k, "Kcur", il); |
|
|             ggml_tensor * v = llm_build_lora_mm(lctx, ctx0, layer.wv, h); |
|             cb(v, "Vcur", il); |
|
|             // Q: split into heads, RMS-norm, then rotate |
|             q = ggml_reshape_3d(ctx0, q, head_dim, n_head, n_tokens); |
|             q = llm_build_norm(ctx0, q, hparams, |
|                     layer.attn_q_norm, nullptr, |
|                     LLM_NORM_RMS, cb, il); |
|             cb(q, "Qcur_normed", il); |
|
|             q = ggml_rope_ext( |
|                     ctx0, q, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, rope_base, rope_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q, "Qcur", il); |
|
|             // K: same treatment with the KV head count |
|             k = ggml_reshape_3d(ctx0, k, head_dim, n_head_kv, n_tokens); |
|             k = llm_build_norm(ctx0, k, hparams, |
|                     layer.attn_k_norm, nullptr, |
|                     LLM_NORM_RMS, cb, il); |
|             cb(k, "Kcur_normed", il); |
|
|             k = ggml_rope_ext( |
|                     ctx0, k, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, rope_base, rope_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k, "Kcur", il); |
|
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, nullptr, |
|                     k, v, q, layer_mask, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il); |
|         } |
|
|         h = llm_build_norm(ctx0, h, hparams, |
|                 layer.attn_post_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_post_norm", il); |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h        = ggml_get_rows(ctx0, h,        out_ids); |
|             layer_in = ggml_get_rows(ctx0, layer_in, out_ids); |
|         } |
|
|         // residual around attention |
|         ggml_tensor * attn_res = ggml_add(ctx0, h, layer_in); |
|         cb(attn_res, "sa_out", il); |
|
|         h = llm_build_norm(ctx0, attn_res, hparams, |
|                 layer.ffn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "ffn_norm", il); |
|
|         // feed-forward |
|         { |
|             h = llm_build_ffn(ctx0, lctx, h, |
|                     layer.ffn_up,   nullptr, nullptr, |
|                     layer.ffn_gate, nullptr, nullptr, |
|                     layer.ffn_down, nullptr, nullptr, |
|                     nullptr, |
|                     LLM_FFN_GELU, LLM_FFN_PAR, cb, il); |
|             cb(h, "ffn_out", il); |
|         } |
|
|         // NOTE(review): this norm is tagged with layer index -1 while attn_post_norm |
|         // uses il - confirm that is intended |
|         h = llm_build_norm(ctx0, h, hparams, |
|                 layer.ffn_post_norm, nullptr, |
|                 LLM_NORM_RMS, cb, -1); |
|         cb(h, "ffn_post_norm", -1); |
|
|         h = ggml_add(ctx0, h, attn_res); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, nullptr, |
|             LLM_NORM_RMS, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the StarCoder2 architecture. |
| // |
| // Every layer runs: LLM_NORM (weight + bias) -> Q/K/V projections (a bias is added |
| // only when the layer carries one) -> RoPE on Q and K -> llm_build_kv attention with |
| // wo/bo -> residual add -> LLM_NORM -> llm_build_ffn in LLM_FFN_GELU / LLM_FFN_SEQ mode |
| // (up and down weights with biases, no gate) -> residual add -> lctx.cvec.apply_to. |
| // The last hidden state is normalised with model.output_norm (+ bias) and projected |
| // through model.output. Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_starcoder2() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     // a single head size has to serve K, V and the rotary dimension |
|     const int64_t n_embd_head = hparams.n_embd_head_v; |
|     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|     GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|     const float attn_scale = 1.0f/sqrtf(float(n_embd_head)); |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     // graph inputs shared by all layers |
|     ggml_tensor * positions = build_inp_pos(); |
|     ggml_tensor * attn_mask = build_inp_KQ_mask(); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         // remembered for the residual connection around attention |
|         ggml_tensor * skip = layer_in; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, layer.attn_norm_b, |
|                 LLM_NORM, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         // self-attention |
|         { |
|             // multiplies the normed input by `w`; adds `b` only if it is present |
|             const auto project = [&](ggml_tensor * w, ggml_tensor * b, const char * name) { |
|                 ggml_tensor * t = llm_build_lora_mm(lctx, ctx0, w, h); |
|                 cb(t, name, il); |
|                 if (b) { |
|                     t = ggml_add(ctx0, t, b); |
|                     cb(t, name, il); |
|                 } |
|                 return t; |
|             }; |
|
|             ggml_tensor * q = project(layer.wq, layer.bq, "Qcur"); |
|             ggml_tensor * k = project(layer.wk, layer.bk, "Kcur"); |
|             ggml_tensor * v = project(layer.wv, layer.bv, "Vcur"); |
|
|             // reshape to (head size, heads, tokens) and rotate |
|             q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); |
|             q = ggml_rope_ext( |
|                     ctx0, q, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(q, "Qcur", il); |
|
|             k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); |
|             k = ggml_rope_ext( |
|                     ctx0, k, positions, nullptr, |
|                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
|                     ext_factor, attn_factor, beta_fast, beta_slow); |
|             cb(k, "Kcur", il); |
|
|             h = llm_build_kv(ctx0, lctx, kv_self, graph, |
|                     layer.wo, layer.bo, |
|                     k, v, q, attn_mask, n_tokens, kv_head, n_kv, attn_scale, cb, il); |
|         } |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h    = ggml_get_rows(ctx0, h,    out_ids); |
|             skip = ggml_get_rows(ctx0, skip, out_ids); |
|         } |
|
|         ggml_tensor * ffn_in = ggml_add(ctx0, h, skip); |
|         cb(ffn_in, "ffn_inp", il); |
|
|         // feed-forward |
|         h = llm_build_norm(ctx0, ffn_in, hparams, |
|                 layer.ffn_norm, layer.ffn_norm_b, |
|                 LLM_NORM, cb, il); |
|         cb(h, "ffn_norm", il); |
|
|         h = llm_build_ffn(ctx0, lctx, h, |
|                 layer.ffn_up,   layer.ffn_up_b,   nullptr, |
|                 nullptr,        nullptr,          nullptr, |
|                 layer.ffn_down, layer.ffn_down_b, nullptr, |
|                 nullptr, |
|                 LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
|         cb(h, "ffn_out", il); |
|
|         h = ggml_add(ctx0, h, ffn_in); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, model.output_norm_b, |
|             LLM_NORM, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| // Builds the forward graph for the Mamba architecture. |
| // |
| // Every layer runs: LLM_NORM_RMS -> llm_build_mamba (given the shared state-copy and |
| // state-mask inputs) -> residual add -> lctx.cvec.apply_to. The last hidden state is |
| // normalised with model.output_norm and projected through model.output. |
| // Returns the graph expanded from the "result_output" tensor. |
| struct ggml_cgraph * build_mamba() { |
|     ggml_cgraph * graph = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|     ggml_tensor * h        = nullptr; |
|     ggml_tensor * layer_in = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|     // state inputs shared by all layers |
|     ggml_tensor * s_copy = build_inp_s_copy(); |
|     ggml_tensor * s_mask = build_inp_s_mask(); |
|
|     for (int il = 0; il < n_layer; ++il) { |
|         const auto & layer = model.layers[il]; |
|
|         h = llm_build_norm(ctx0, layer_in, hparams, |
|                 layer.attn_norm, nullptr, |
|                 LLM_NORM_RMS, cb, il); |
|         cb(h, "attn_norm", il); |
|
|         h = llm_build_mamba(ctx0, lctx, ubatch, graph, h, |
|                 s_copy, s_mask, |
|                 kv_head, n_kv, cb, il); |
|
|         if (il == n_layer - 1) { |
|             // final layer: keep only the rows selected by build_inp_out_ids() |
|             // (presumably the tokens whose outputs are requested - confirm) |
|             ggml_tensor * out_ids = build_inp_out_ids(); |
|             h        = ggml_get_rows(ctx0, h,        out_ids); |
|             layer_in = ggml_get_rows(ctx0, layer_in, out_ids); |
|         } |
|
|         // residual |
|         h = ggml_add(ctx0, h, layer_in); |
|         h = lctx.cvec.apply_to(ctx0, h, il); |
|         cb(h, "l_out", il); |
|
|         // this layer's output is the next layer's input |
|         layer_in = h; |
|     } |
|
|     // final RMS norm |
|     h = llm_build_norm(ctx0, layer_in, hparams, |
|             model.output_norm, nullptr, |
|             LLM_NORM_RMS, cb, -1); |
|     cb(h, "result_norm", -1); |
|
|     // output projection |
|     h = llm_build_lora_mm(lctx, ctx0, model.output, h); |
|     cb(h, "result_output", -1); |
|
|     ggml_build_forward_expand(graph, h); |
|
|     return graph; |
| } |
|
|
| struct ggml_cgraph * build_command_r() { |
|
|
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| const float f_logit_scale = hparams.f_logit_scale; |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
| struct ggml_tensor * ffn_inp = cur; |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| if (model.layers[il].attn_q_norm) { |
| Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, |
| ggml_element_size(Qcur) * n_embd_head, |
| ggml_element_size(Qcur) * n_embd_head * n_head, |
| 0); |
| cb(Qcur, "Qcur", il); |
| Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, |
| ggml_element_size(Kcur) * n_embd_head, |
| ggml_element_size(Kcur) * n_embd_head * n_head_kv, |
| 0); |
| cb(Kcur, "Kcur", il); |
|
|
| Qcur = llm_build_norm(ctx0, Qcur, hparams, |
| model.layers[il].attn_q_norm, |
| NULL, |
| LLM_NORM, cb, il); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = llm_build_norm(ctx0, Kcur, hparams, |
| model.layers[il].attn_k_norm, |
| NULL, |
| LLM_NORM, cb, il); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * attn_out = cur; |
|
|
| |
| { |
| cur = llm_build_ffn(ctx0, lctx, ffn_inp, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| |
| cur = ggml_add(ctx0, cur, inpL); |
| cur = ggml_add(ctx0, cur, attn_out); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| if (f_logit_scale) { |
| cur = ggml_scale(ctx0, cur, f_logit_scale); |
| } |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
|
|
| } |
|
|
| struct ggml_cgraph * build_cohere2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| const float f_logit_scale = hparams.f_logit_scale; |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
| struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); |
|
|
| |
| const int32_t sliding_window_pattern = 4; |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| |
| |
| const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); |
| struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
| struct ggml_tensor * ffn_inp = cur; |
|
|
| |
| { |
| |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
|
|
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| if (is_sliding) { |
| Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, |
| beta_fast, beta_slow); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, |
| rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, |
| attn_factor, beta_fast, beta_slow); |
| cb(Kcur, "Kcur", il); |
| } else { |
| |
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, |
| KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * attn_out = cur; |
|
|
| |
| { |
| cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, |
| NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, |
| cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| |
| cur = ggml_add(ctx0, cur, inpL); |
| cur = ggml_add(ctx0, cur, attn_out); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| if (f_logit_scale) { |
| cur = ggml_scale(ctx0, cur, f_logit_scale); |
| } |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| struct ggml_cgraph * build_olmo() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| NULL, NULL, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (hparams.f_clamp_kqv > 0.0f) { |
| Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (hparams.f_clamp_kqv > 0.0f) { |
| Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (hparams.f_clamp_kqv > 0.0f) { |
| Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, nullptr, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| NULL, NULL, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| NULL, NULL, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_olmo2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = inpL; |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(Qcur, "Qcur_normed", il); |
|
|
| Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(Kcur, "Kcur_normed", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur_rope", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur_rope", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].attn_post_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_post_norm", il); |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_ffn(ctx0, lctx, ffn_inp, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].ffn_post_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "ffn_post_norm", -1); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| |
| |
| |
| |
| struct ggml_cgraph * build_olmoe() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(Qcur, "Qcur_normed", il); |
|
|
| Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(Kcur, "Kcur_normed", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur_rope", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur_rope", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, false, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_openelm() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| const int64_t n_head = hparams.n_head(il); |
| const int64_t n_head_kv = hparams.n_head_kv(il); |
| const int64_t n_head_qkv = 2*n_head_kv + n_head; |
|
|
| cur = inpL; |
| struct ggml_tensor * residual = cur; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = llm_build_norm(ctx0, Qcur, hparams, |
| model.layers[il].attn_q_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = llm_build_norm(ctx0, Kcur, hparams, |
| model.layers[il].attn_k_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(Kcur, "Kcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, |
| freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); |
| cb(Qcur, "Vcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| residual = ggml_get_rows(ctx0, residual, inp_out_ids); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_gptneox() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| if (hparams.use_par_res) { |
| |
| |
|
|
| struct ggml_tensor * attn_out = cur; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, inpL); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, attn_out); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } else { |
| |
| |
| |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_arctic() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); |
| cb(ffn_out, "ffn_out", il); |
|
|
| |
| cur = llm_build_norm(ctx0, inpSA, hparams, |
| model.layers[il].ffn_norm_exps, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm_exps", il); |
|
|
| cur = llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, true, |
| false, 0.0, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(cur, "ffn_moe_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_out); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_deepseek() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
| const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; |
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
|
|
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| if ((uint32_t) il < hparams.n_layer_dense_lead) { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } else { |
| |
| ggml_tensor * moe_out = |
| llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| nullptr, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, false, |
| false, hparams.expert_weights_scale, |
| LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, |
| cb, il); |
| cb(moe_out, "ffn_moe_out", il); |
|
|
| |
| { |
| ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up_shexp, NULL, NULL, |
| model.layers[il].ffn_gate_shexp, NULL, NULL, |
| model.layers[il].ffn_down_shexp, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(ffn_shexp, "ffn_shexp", il); |
|
|
| cur = ggml_add(ctx0, moe_out, ffn_shexp); |
| cb(cur, "ffn_out", il); |
| } |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_deepseek2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| bool is_lite = (hparams.n_layer == 27); |
|
|
| |
| |
| const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); |
| const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); |
| const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); |
|
|
| const uint32_t n_embd_head_qk_rope = hparams.n_rot; |
| const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; |
| const uint32_t kv_lora_rank = hparams.n_lora_kv; |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| |
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * q = NULL; |
| if (!is_lite) { |
| |
| q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); |
| cb(q, "q", il); |
|
|
| q = llm_build_norm(ctx0, q, hparams, |
| model.layers[il].attn_q_a_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(q, "q", il); |
|
|
| |
| q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); |
| cb(q, "q", il); |
| } else { |
| q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); |
| cb(q, "q", il); |
| } |
|
|
| |
| struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, |
| ggml_row_size(q->type, hparams.n_embd_head_k), |
| ggml_row_size(q->type, hparams.n_embd_head_k * n_head), |
| 0); |
| cb(q_nope, "q_nope", il); |
|
|
| |
| struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, |
| ggml_row_size(q->type, hparams.n_embd_head_k), |
| ggml_row_size(q->type, hparams.n_embd_head_k * n_head), |
| ggml_row_size(q->type, n_embd_head_qk_nope)); |
| cb(q_pe, "q_pe", il); |
|
|
| |
| struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); |
| cb(kv_pe_compresseed, "kv_pe_compresseed", il); |
|
|
| |
| struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, |
| kv_pe_compresseed->nb[1], |
| 0); |
| cb(kv_compressed, "kv_compressed", il); |
|
|
| |
| struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, |
| kv_pe_compresseed->nb[1], |
| kv_pe_compresseed->nb[1], |
| ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); |
| cb(k_pe, "k_pe", il); |
|
|
| |
| kv_compressed = ggml_cont(ctx0, kv_compressed); |
| kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, |
| model.layers[il].attn_kv_a_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(kv_compressed, "kv_compressed", il); |
|
|
| |
| struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); |
| cb(kv, "kv", il); |
|
|
| |
| struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, |
| ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), |
| ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), |
| 0); |
| cb(k_nope, "k_nope", il); |
|
|
| |
| struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, |
| ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), |
| ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), |
| ggml_row_size(kv->type, (n_embd_head_qk_nope))); |
| cb(v_states, "v_states", il); |
|
|
| v_states = ggml_cont(ctx0, v_states); |
| cb(v_states, "v_states", il); |
|
|
| v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, |
| ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), |
| 0); |
| cb(v_states, "v_states", il); |
|
|
| q_pe = ggml_cont(ctx0, q_pe); |
| q_pe = ggml_rope_ext( |
| ctx0, q_pe, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor_scaled, beta_fast, beta_slow |
| ); |
| cb(q_pe, "q_pe", il); |
|
|
| |
| k_pe = ggml_cont(ctx0, k_pe); |
| k_pe = ggml_rope_ext( |
| ctx0, k_pe, inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor_scaled, beta_fast, beta_slow |
| ); |
| cb(k_pe, "k_pe", il); |
|
|
| struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); |
| cb(q_states, "q_states", il); |
|
|
| struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); |
| cb(k_states, "k_states", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| if ((uint32_t) il < hparams.n_layer_dense_lead) { |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } else { |
| |
| ggml_tensor * moe_out = |
| llm_build_moe_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_gate_inp, |
| model.layers[il].ffn_up_exps, |
| model.layers[il].ffn_gate_exps, |
| model.layers[il].ffn_down_exps, |
| model.layers[il].ffn_exp_probs_b, |
| n_expert, n_expert_used, |
| LLM_FFN_SILU, hparams.expert_weights_norm, |
| true, hparams.expert_weights_scale, |
| (enum llama_expert_gating_func_type) hparams.expert_gating_func, |
| cb, il); |
| cb(moe_out, "ffn_moe_out", il); |
|
|
| |
| { |
| ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up_shexp, NULL, NULL, |
| model.layers[il].ffn_gate_shexp, NULL, NULL, |
| model.layers[il].ffn_down_shexp, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(ffn_shexp, "ffn_shexp", il); |
|
|
| cur = ggml_add(ctx0, moe_out, ffn_shexp); |
| cb(cur, "ffn_out", il); |
| } |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = ggml_mul_mat(ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_bitnet() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| if (model.layers[il].wq_scale) { |
| Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); |
| } |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| |
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| if (model.layers[il].wk_scale) { |
| Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); |
| } |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| |
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| if (model.layers[il].wv_scale) { |
| Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); |
| } |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| NULL, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].attn_sub_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_sub_norm", il); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); |
| if (model.layers[il].wo_scale) { |
| cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); |
| } |
| if (model.layers[il].bo) { |
| cur = ggml_add(ctx0, cur, model.layers[il].bo); |
| } |
| cb(cur, "attn_o_out", il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, |
| model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, |
| NULL, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_sub_out", il); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].ffn_sub_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_sub_norm", il); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); |
| if (model.layers[il].ffn_down_scale) { |
| cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); |
| } |
| cb(cur, "ffn_down", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_t5_enc() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| GGML_ASSERT(lctx.is_encoding); |
| struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); |
|
|
| |
| struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm_enc, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
|
|
| struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); |
| struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); |
|
|
| struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); |
| cb(kq, "kq", il); |
|
|
| struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; |
| struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); |
| struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); |
| cb(kq_b, "kq_b", il); |
|
|
| kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); |
| cb(kq, "kq_soft_max_ext", il); |
|
|
| struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); |
| cb(v, "v", il); |
|
|
| struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); |
| cb(kqv, "kqv", il); |
|
|
| struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); |
| cb(kqv_merged, "kqv_merged", il); |
|
|
| cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); |
| cb(cur, "kqv_merged_cont", il); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); |
| cb(cur, "kqv_out", il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm_enc, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up_enc, NULL, NULL, |
| model.layers[il].ffn_gate_enc, NULL, NULL, |
| model.layers[il].ffn_down_enc, NULL, NULL, |
| NULL, |
| model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, |
| model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, |
| cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); |
| if (layer_dir != nullptr) { |
| cur = ggml_add(ctx0, cur, layer_dir); |
| } |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
| cb(cur, "result_embd", -1); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm_enc, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_t5_dec() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| GGML_ASSERT(!lctx.is_encoding); |
| GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); |
|
|
| struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); |
| struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); |
|
|
| struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); |
| struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); |
|
|
| struct ggml_tensor * k = |
| ggml_view_3d(ctx0, kv_self.k_l[il], |
| n_embd_head_k, n_kv, n_head_kv, |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), |
| ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), |
| 0); |
| cb(k, "k", il); |
|
|
| struct ggml_tensor * v = |
| ggml_view_3d(ctx0, kv_self.v_l[il], |
| n_kv, n_embd_head_v, n_head_kv, |
| ggml_element_size(kv_self.v_l[il])*n_ctx, |
| ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, |
| 0); |
| cb(v, "v", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
|
|
| struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); |
|
|
| struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); |
| cb(kq, "kq", il); |
|
|
| struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; |
| struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); |
| struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); |
| cb(kq_b, "kq_b", il); |
|
|
| kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); |
| cb(kq, "kq_soft_max_ext", il); |
|
|
| struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); |
| cb(kqv, "kqv", il); |
|
|
| struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); |
| cb(kqv_merged, "kqv_merged", il); |
|
|
| cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); |
| cb(cur, "kqv_merged_cont", il); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); |
| cb(cur, "kqv_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, inpSA); |
| cb(cur, "cross_inp", il); |
|
|
| struct ggml_tensor * inpCA = cur; |
|
|
| |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].attn_norm_cross, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm_cross", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); |
|
|
| struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); |
| struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); |
|
|
| struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); |
| cb(kq, "kq", il); |
|
|
| kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); |
| cb(kq, "kq_soft_max_ext", il); |
|
|
| struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); |
| cb(v, "v", il); |
|
|
| struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); |
| cb(kqv, "kqv", il); |
|
|
| struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); |
| cb(kqv_merged, "kqv_merged", il); |
|
|
| cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); |
| cb(cur, "kqv_merged_cont", il); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); |
| cb(cur, "kqv_out", il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| |
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, |
| model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, |
| cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); |
| if (layer_dir != nullptr) { |
| cur = ggml_add(ctx0, cur, layer_dir); |
| } |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
| cb(cur, "result_embd", -1); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_jais() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
|
|
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
|
|
| struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); |
| struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); |
| struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); |
|
|
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
|
|
| Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
| } |
|
|
| inpL = ggml_add(ctx0, cur, ffn_inp); |
| cb(inpL, "l_out", il); |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_chatglm() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| struct ggml_tensor * Qcur = nullptr; |
| struct ggml_tensor * Kcur = nullptr; |
| struct ggml_tensor * Vcur = nullptr; |
| if (model.layers[il].wqkv == nullptr) { |
| Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| } |
| Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| } |
| Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| } |
| } else { |
| cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); |
| cb(cur, "wqkv", il); |
| if (model.layers[il].bqkv) { |
| cur = ggml_add(ctx0, cur, model.layers[il].bqkv); |
| cb(cur, "bqkv", il); |
| } |
| Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); |
| Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); |
| Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); |
| } |
| cb(Qcur, "Qcur", il); |
| cb(Kcur, "Kcur", il); |
| cb(Vcur, "Vcur", il); |
| |
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur_rope", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur_rope", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, NULL, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
|
|
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| |
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| } |
|
|
| inpL = ggml_add(ctx0, cur, ffn_inp); |
| cb(inpL, "l_out", il); |
| } |
|
|
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.output_norm, |
| NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_nemotron() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, |
| model.layers[il].attn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, |
| model.layers[il].ffn_norm_b, |
| LLM_NORM, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, |
| NULL, NULL, NULL, |
| model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, |
| NULL, |
| LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, model.output_norm_b, |
| LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_exaone() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
|
|
| |
| { |
| |
| struct ggml_tensor * rope_factors = build_rope_factors(il); |
|
|
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
| if (model.layers[il].bq) { |
| Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
| if (model.layers[il].bk) { |
| Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
| if (model.layers[il].bv) { |
| Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); |
| cb(Vcur, "Vcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, model.layers[il].bo, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| ggml_cgraph * build_rwkv6() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); |
|
|
| const int64_t n_seqs = ubatch.n_seqs; |
| const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
| const int64_t n_tokens = ubatch.n_tokens; |
| GGML_ASSERT(n_seqs != 0); |
| GGML_ASSERT(ubatch.equal_seqs); |
| GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
| struct ggml_tensor * state_copy = build_inp_s_copy(); |
| struct ggml_tensor * state_mask = build_inp_s_mask(); |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
| inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| const llama_layer * layer = &model.layers[il]; |
|
|
| |
| struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, |
| gf, kv_self.k_l[il], state_copy, state_mask, |
| hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); |
| struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, |
| gf, kv_self.v_l[il], state_copy, state_mask, |
| hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); |
|
|
| cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); |
| token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); |
|
|
| struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); |
| struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); |
|
|
| struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); |
| struct ggml_tensor * x_prev = ggml_concat( |
| ctx0, |
| att_shift, |
| ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), |
| 1 |
| ); |
|
|
| cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); |
| ggml_build_forward_expand(gf, cur); |
| ggml_build_forward_expand( |
| gf, |
| ggml_cpy( |
| ctx0, |
| wkv_states, |
| ggml_view_1d( |
| ctx0, |
| kv_self.v_l[il], |
| hparams.n_embd_v_s() * n_seqs, |
| hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) |
| ) |
| ) |
| ); |
|
|
| struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il); |
| x_prev = ggml_concat( |
| ctx0, |
| ffn_shift, |
| ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), |
| 1 |
| ); |
| cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev)); |
| ggml_build_forward_expand(gf, cur); |
|
|
| struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); |
| struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); |
|
|
| token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); |
|
|
| ggml_build_forward_expand( |
| gf, |
| ggml_cpy( |
| ctx0, |
| ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), |
| ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) |
| ) |
| ); |
|
|
| if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { |
| cur = ggml_scale(ctx0, cur, 0.5F); |
| } |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| |
| ggml_cgraph * build_rwkv6qwen2() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| GGML_ASSERT(n_embd == hparams.n_embd_k_s()); |
|
|
| const int64_t n_seqs = ubatch.n_seqs; |
| const int64_t n_seq_tokens = ubatch.n_seq_tokens; |
| const int64_t n_tokens = ubatch.n_tokens; |
| GGML_ASSERT(n_seqs != 0); |
| GGML_ASSERT(ubatch.equal_seqs); |
| GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
| struct ggml_tensor * state_copy = build_inp_s_copy(); |
| struct ggml_tensor * state_mask = build_inp_s_mask(); |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| const llama_layer * layer = &model.layers[il]; |
|
|
| |
| struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, |
| gf, kv_self.k_l[il], state_copy, state_mask, |
| hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); |
| struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, |
| gf, kv_self.v_l[il], state_copy, state_mask, |
| hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); |
|
|
| cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); |
| token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); |
|
|
| struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); |
| struct ggml_tensor * x_prev = ggml_concat( |
| ctx0, |
| token_shift, |
| ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), |
| 1 |
| ); |
|
|
| struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); |
| ggml_build_forward_expand( |
| gf, |
| ggml_cpy( |
| ctx0, |
| ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), |
| ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) |
| ) |
| ); |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); |
| ggml_build_forward_expand(gf, ffn_inp); |
| ggml_build_forward_expand( |
| gf, |
| ggml_cpy( |
| ctx0, |
| wkv_states, |
| ggml_view_1d( |
| ctx0, |
| kv_self.v_l[il], |
| hparams.n_embd_v_s() * n_seqs, |
| hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) |
| ) |
| ) |
| ); |
|
|
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| struct ggml_cgraph * build_chameleon() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| |
| int32_t n_tokens = this->n_tokens; |
|
|
| const int64_t n_embd_head = hparams.n_embd_head_v; |
| GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); |
| GGML_ASSERT(n_embd_head == hparams.n_rot); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| |
| struct ggml_tensor * inp_pos = build_inp_pos(); |
|
|
| |
| struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); |
|
|
| for (int il = 0; il < n_layer; ++il) { |
| struct ggml_tensor * inpSA = inpL; |
|
|
| |
| if (hparams.swin_norm) { |
| cur = inpL; |
| } else { |
| cur = llm_build_norm(ctx0, inpL, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "attn_norm", il); |
| } |
|
|
| |
| { |
| |
| struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); |
| cb(Qcur, "Qcur", il); |
|
|
| struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); |
| cb(Kcur, "Kcur", il); |
|
|
| struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); |
| cb(Vcur, "Vcur", il); |
|
|
| if (model.layers[il].attn_q_norm) { |
| Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, |
| ggml_element_size(Qcur) * n_embd_head, |
| ggml_element_size(Qcur) * n_embd_head * n_head, |
| 0); |
| cb(Qcur, "Qcur", il); |
|
|
| Qcur = llm_build_norm(ctx0, Qcur, hparams, |
| model.layers[il].attn_q_norm, |
| model.layers[il].attn_q_norm_b, |
| LLM_NORM, cb, il); |
| cb(Qcur, "Qcur", il); |
| } |
|
|
| if (model.layers[il].attn_k_norm) { |
| Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, |
| ggml_element_size(Kcur) * n_embd_head, |
| ggml_element_size(Kcur) * n_embd_head * n_head_kv, |
| 0); |
| cb(Kcur, "Kcur", il); |
|
|
| Kcur = llm_build_norm(ctx0, Kcur, hparams, |
| model.layers[il].attn_k_norm, |
| model.layers[il].attn_k_norm_b, |
| LLM_NORM, cb, il); |
| cb(Kcur, "Kcur", il); |
| } |
|
|
| Qcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Qcur, "Qcur", il); |
|
|
| Kcur = ggml_rope_ext( |
| ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, |
| n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, |
| ext_factor, attn_factor, beta_fast, beta_slow |
| ); |
| cb(Kcur, "Kcur", il); |
|
|
| cur = llm_build_kv(ctx0, lctx, kv_self, gf, |
| model.layers[il].wo, nullptr, |
| Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); |
|
|
| if (hparams.swin_norm) { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].attn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| } |
| } |
|
|
| if (il == n_layer - 1) { |
| |
| struct ggml_tensor * inp_out_ids = build_inp_out_ids(); |
| n_tokens = n_outputs; |
| cur = ggml_get_rows(ctx0, cur, inp_out_ids); |
| inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); |
| } |
|
|
| struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); |
| cb(ffn_inp, "ffn_inp", il); |
|
|
| |
| if (!hparams.swin_norm) { |
| cur = llm_build_norm(ctx0, ffn_inp, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
| } |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| model.layers[il].ffn_up, NULL, NULL, |
| model.layers[il].ffn_gate, NULL, NULL, |
| model.layers[il].ffn_down, NULL, NULL, |
| NULL, |
| LLM_FFN_SILU, LLM_FFN_PAR, cb, il); |
| cb(cur, "ffn_out", il); |
|
|
| if (hparams.swin_norm) { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.layers[il].ffn_norm, NULL, |
| LLM_NORM_RMS, cb, il); |
| cb(cur, "ffn_norm", il); |
| } |
|
|
| cur = ggml_add(ctx0, cur, ffn_inp); |
| cb(cur, "ffn_out", il); |
|
|
| cur = lctx.cvec.apply_to(ctx0, cur, il); |
| cb(cur, "l_out", il); |
|
|
| |
| inpL = cur; |
| } |
|
|
| cur = inpL; |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, NULL, |
| LLM_NORM_RMS, cb, -1); |
| cb(cur, "result_norm", -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
| cb(cur, "result_output_with_img_logits", -1); |
|
|
| |
| |
| int img_token_end_idx = 8196; |
| int img_token_start_idx = 4; |
| int num_img_tokens = img_token_end_idx - img_token_start_idx; |
| |
| |
| struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); |
| img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); |
| cb(img_logits, "img_logits", -1); |
| cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); |
| cb(cur, "result_output", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
|
|
| struct ggml_cgraph * build_wavtokenizer_dec() { |
| struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); |
|
|
| struct ggml_tensor * cur; |
| struct ggml_tensor * inpL; |
|
|
| inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); |
|
|
| cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); |
|
|
| cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); |
| cur = ggml_add(ctx0, cur, model.conv1d_b); |
|
|
| |
| for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { |
| const auto & layer = model.layers[il].posnet; |
|
|
| inpL = cur; |
|
|
| switch (il) { |
| case 0: |
| case 1: |
| case 3: |
| case 4: |
| { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| layer.norm1, |
| layer.norm1_b, |
| LLM_NORM_GROUP, cb, 0); |
|
|
| cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); |
|
|
| cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); |
| cur = ggml_add(ctx0, cur, layer.conv1_b); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| layer.norm2, |
| layer.norm2_b, |
| LLM_NORM_GROUP, cb, 0); |
|
|
| cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); |
|
|
| cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); |
| cur = ggml_add(ctx0, cur, layer.conv2_b); |
|
|
| cur = ggml_add(ctx0, cur, inpL); |
| } break; |
| case 2: |
| { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| layer.attn_norm, |
| layer.attn_norm_b, |
| LLM_NORM_GROUP, cb, 0); |
|
|
| struct ggml_tensor * q; |
| struct ggml_tensor * k; |
| struct ggml_tensor * v; |
|
|
| q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); |
| k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); |
| v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); |
|
|
| q = ggml_add(ctx0, q, layer.attn_q_b); |
| k = ggml_add(ctx0, k, layer.attn_k_b); |
| v = ggml_add(ctx0, v, layer.attn_v_b); |
|
|
| q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); |
| k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); |
|
|
| struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); |
|
|
| kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); |
|
|
| cur = ggml_mul_mat(ctx0, kq, v); |
|
|
| cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); |
| cur = ggml_add(ctx0, cur, layer.attn_o_b); |
|
|
| cur = ggml_add(ctx0, cur, inpL); |
| } break; |
| case 5: |
| { |
| cur = llm_build_norm(ctx0, cur, hparams, |
| layer.norm, |
| layer.norm_b, |
| LLM_NORM_GROUP, cb, 0); |
| } break; |
| default: GGML_ABORT("unknown posnet layer"); |
| }; |
| } |
|
|
| cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.tok_norm, |
| model.tok_norm_b, |
| LLM_NORM, cb, -1); |
|
|
| cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); |
|
|
| inpL = cur; |
|
|
| |
| for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { |
| const auto & layer = model.layers[il].convnext; |
|
|
| cur = inpL; |
|
|
| cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); |
| cur = ggml_add(ctx0, cur, layer.dw_b); |
|
|
| cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| layer.norm, |
| layer.norm_b, |
| LLM_NORM, cb, -1); |
|
|
| cur = llm_build_ffn(ctx0, lctx, cur, |
| layer.pw1, layer.pw1_b, NULL, |
| NULL, NULL, NULL, |
| layer.pw2, layer.pw2_b, NULL, |
| NULL, |
| LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); |
|
|
| cur = ggml_mul(ctx0, cur, layer.gamma); |
|
|
| cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); |
|
|
| inpL = ggml_add(ctx0, cur, inpL); |
| } |
|
|
| cur = inpL; |
|
|
| cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); |
|
|
| cur = llm_build_norm(ctx0, cur, hparams, |
| model.output_norm, |
| model.output_norm_b, |
| LLM_NORM, cb, -1); |
|
|
| |
| cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); |
|
|
| cur = ggml_add(ctx0, cur, model.output_b); |
| cb(cur, "result_embd", -1); |
|
|
| ggml_build_forward_expand(gf, cur); |
|
|
| return gf; |
| } |
| }; |
|
|
| static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) { |
| llama_ubatch dummy = {}; |
| dummy.equal_seqs = true; |
|
|
| llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; |
|
|
| struct llm_build_context llm(lctx, dummy, cb, false); |
|
|
| llm.init(); |
|
|
| struct ggml_cgraph * result = llm.build_defrag(ids); |
|
|
| llm.free(); |
|
|
| return result; |
| } |
|
|
| static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { |
| llama_ubatch dummy = {}; |
| dummy.equal_seqs = true; |
|
|
| llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; |
|
|
| struct llm_build_context llm(lctx, dummy, cb, false); |
|
|
| llm.init(); |
|
|
| struct ggml_cgraph * result = llm.build_k_shift(); |
|
|
| llm.free(); |
|
|
| return result; |
| } |
|
|
| static struct ggml_cgraph * llama_build_graph( |
| llama_context & lctx, |
| const llama_ubatch & ubatch, |
| bool worst_case) { |
| const auto & model = lctx.model; |
|
|
| |
| llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { |
| if (il >= 0) { |
| ggml_format_name(cur, "%s-%d", name, il); |
| } else { |
| ggml_set_name(cur, name); |
| } |
|
|
| if (!lctx.cparams.offload_kqv) { |
| if (strcmp(name, "kqv_merged_cont") == 0) { |
| |
| ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); |
| } |
| } |
|
|
| |
| |
| const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer; |
| if (ubatch.n_tokens < 32 || full_offload) { |
| if (il != -1 && strcmp(name, "norm") == 0) { |
| const auto & dev_layer = lctx.model.dev_layer(il); |
| for (auto & backend : lctx.backends) { |
| if (ggml_backend_get_device(backend.get()) == dev_layer) { |
| if (ggml_backend_supports_op(backend.get(), cur)) { |
| ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get()); |
| } |
| } |
| } |
| } |
| } |
| }; |
|
|
| struct ggml_cgraph * result = NULL; |
|
|
| struct llm_build_context llm(lctx, ubatch, cb, worst_case); |
|
|
| llm.init(); |
|
|
| switch (model.arch) { |
| case LLM_ARCH_LLAMA: |
| case LLM_ARCH_MINICPM: |
| case LLM_ARCH_GRANITE: |
| case LLM_ARCH_GRANITE_MOE: |
| { |
| result = llm.build_llama(); |
| } break; |
| case LLM_ARCH_DECI: |
| { |
| result = llm.build_deci(); |
| } break; |
| case LLM_ARCH_BAICHUAN: |
| { |
| result = llm.build_baichuan(); |
| } break; |
| case LLM_ARCH_FALCON: |
| { |
| result = llm.build_falcon(); |
| } break; |
| case LLM_ARCH_GROK: |
| { |
| result = llm.build_grok(); |
| } break; |
| case LLM_ARCH_STARCODER: |
| { |
| result = llm.build_starcoder(); |
| } break; |
| case LLM_ARCH_REFACT: |
| { |
| result = llm.build_refact(); |
| } break; |
| case LLM_ARCH_BERT: |
| case LLM_ARCH_JINA_BERT_V2: |
| case LLM_ARCH_NOMIC_BERT: |
| { |
| result = llm.build_bert(); |
| } break; |
| case LLM_ARCH_BLOOM: |
| { |
| result = llm.build_bloom(); |
| } break; |
| case LLM_ARCH_MPT: |
| { |
| result = llm.build_mpt(); |
| } break; |
| case LLM_ARCH_STABLELM: |
| { |
| result = llm.build_stablelm(); |
| } break; |
| case LLM_ARCH_QWEN: |
| { |
| result = llm.build_qwen(); |
| } break; |
| case LLM_ARCH_QWEN2: |
| { |
| result = llm.build_qwen2(); |
| } break; |
| case LLM_ARCH_QWEN2VL: |
| { |
| lctx.n_pos_per_token = 4; |
| result = llm.build_qwen2vl(); |
| } break; |
| case LLM_ARCH_QWEN2MOE: |
| { |
| result = llm.build_qwen2moe(); |
| } break; |
| case LLM_ARCH_PHI2: |
| { |
| result = llm.build_phi2(); |
| } break; |
| case LLM_ARCH_PHI3: |
| case LLM_ARCH_PHIMOE: |
| { |
| result = llm.build_phi3(); |
| } break; |
| case LLM_ARCH_PLAMO: |
| { |
| result = llm.build_plamo(); |
| } break; |
| case LLM_ARCH_GPT2: |
| { |
| result = llm.build_gpt2(); |
| } break; |
| case LLM_ARCH_CODESHELL: |
| { |
| result = llm.build_codeshell(); |
| } break; |
| case LLM_ARCH_ORION: |
| { |
| result = llm.build_orion(); |
| } break; |
| case LLM_ARCH_INTERNLM2: |
| { |
| result = llm.build_internlm2(); |
| } break; |
| case LLM_ARCH_MINICPM3: |
| { |
| result = llm.build_minicpm3(); |
| } break; |
| case LLM_ARCH_GEMMA: |
| { |
| result = llm.build_gemma(); |
| } break; |
| case LLM_ARCH_GEMMA2: |
| { |
| result = llm.build_gemma2(); |
| } break; |
| case LLM_ARCH_GEMMA3: |
| { |
| result = llm.build_gemma3(); |
| } break; |
| case LLM_ARCH_STARCODER2: |
| { |
| result = llm.build_starcoder2(); |
| } break; |
| case LLM_ARCH_MAMBA: |
| { |
| result = llm.build_mamba(); |
| } break; |
| case LLM_ARCH_XVERSE: |
| { |
| result = llm.build_xverse(); |
| } break; |
| case LLM_ARCH_COMMAND_R: |
| { |
| result = llm.build_command_r(); |
| } break; |
| case LLM_ARCH_COHERE2: |
| { |
| result = llm.build_cohere2(); |
| } break; |
| case LLM_ARCH_DBRX: |
| { |
| result = llm.build_dbrx(); |
| } break; |
| case LLM_ARCH_OLMO: |
| { |
| result = llm.build_olmo(); |
| } break; |
| case LLM_ARCH_OLMO2: |
| { |
| result = llm.build_olmo2(); |
| } break; |
| case LLM_ARCH_OLMOE: |
| { |
| result = llm.build_olmoe(); |
| } break; |
| case LLM_ARCH_OPENELM: |
| { |
| result = llm.build_openelm(); |
| } break; |
| case LLM_ARCH_GPTNEOX: |
| { |
| result = llm.build_gptneox(); |
| } break; |
| case LLM_ARCH_ARCTIC: |
| { |
| result = llm.build_arctic(); |
| } break; |
| case LLM_ARCH_DEEPSEEK: |
| { |
| result = llm.build_deepseek(); |
| } break; |
| case LLM_ARCH_DEEPSEEK2: |
| { |
| result = llm.build_deepseek2(); |
| } break; |
| case LLM_ARCH_CHATGLM: |
| { |
| result = llm.build_chatglm(); |
| } break; |
| case LLM_ARCH_BITNET: |
| { |
| result = llm.build_bitnet(); |
| } break; |
| case LLM_ARCH_T5: |
| { |
| if (lctx.is_encoding) { |
| result = llm.build_t5_enc(); |
| } else { |
| result = llm.build_t5_dec(); |
| } |
| } break; |
| case LLM_ARCH_T5ENCODER: |
| { |
| result = llm.build_t5_enc(); |
| } break; |
| case LLM_ARCH_JAIS: |
| { |
| result = llm.build_jais(); |
| } break; |
| case LLM_ARCH_NEMOTRON: |
| { |
| result = llm.build_nemotron(); |
| } break; |
| case LLM_ARCH_EXAONE: |
| { |
| result = llm.build_exaone(); |
| } break; |
| case LLM_ARCH_RWKV6: |
| { |
| result = llm.build_rwkv6(); |
| } break; |
| case LLM_ARCH_RWKV6QWEN2: |
| { |
| result = llm.build_rwkv6qwen2(); |
| } break; |
| case LLM_ARCH_CHAMELEON: |
| { |
| result = llm.build_chameleon(); |
| } break; |
| case LLM_ARCH_WAVTOKENIZER_DEC: |
| { |
| result = llm.build_wavtokenizer_dec(); |
| } break; |
| default: |
| GGML_ABORT("fatal error"); |
| } |
|
|
| |
| if (lctx.cparams.embeddings) { |
| result = llm.append_pooling(result); |
| } |
|
|
| llm.free(); |
|
|
| return result; |
| } |
|
|
| |
| static enum ggml_status llama_graph_compute( |
| llama_context & lctx, |
| ggml_cgraph * gf, |
| int n_threads, |
| ggml_threadpool * threadpool) { |
| if (lctx.backend_cpu != nullptr) { |
| auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); |
| auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); |
| set_threadpool_fn(lctx.backend_cpu, threadpool); |
| } |
|
|
| |
| for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) { |
| set_n_threads_fn.second(set_n_threads_fn.first, n_threads); |
| } |
|
|
| auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); |
| if (status != GGML_STATUS_SUCCESS) { |
| LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); |
| } |
|
|
| |
|
|
| return status; |
| } |
|
|
| static int llama_prepare_sbatch( |
| llama_context & lctx, |
| const llama_batch & batch, |
| uint32_t & n_outputs) { |
| const auto & model = lctx.model; |
| const auto & hparams = model.hparams; |
| const auto & cparams = lctx.cparams; |
|
|
| const uint32_t n_tokens_all = batch.n_tokens; |
| const int64_t n_embd = hparams.n_embd; |
|
|
| |
| const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; |
|
|
| GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); |
| if (batch.token) { |
| for (uint32_t i = 0; i < n_tokens_all; ++i) { |
| if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) { |
| LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); |
| return -1; |
| } |
| } |
| } |
| GGML_ASSERT_CONTINUE(n_tokens_all <= cparams.n_batch); |
| |
|
|
| lctx.n_queued_tokens += n_tokens_all; |
| lctx.embd_seq.clear(); |
|
|
| |
| if (batch.logits && !embd_pooled) { |
| for (uint32_t i = 0; i < n_tokens_all; ++i) { |
| n_outputs += batch.logits[i] != 0; |
| } |
| } else if (lctx.logits_all || embd_pooled) { |
| n_outputs = n_tokens_all; |
| } else { |
| |
| n_outputs = 1; |
| } |
|
|
| lctx.sbatch.from_batch(batch, n_embd, |
| !lctx.kv_self.recurrent, |
| n_outputs == n_tokens_all); |
|
|
| |
| if (llama_output_reserve(lctx, n_outputs) < n_outputs) { |
| LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); |
| return -2; |
| }; |
|
|
| return 0; |
| } |
|
|
| static int llama_prepare_ubatch( |
| llama_context & lctx, |
| llama_kv_slot_restorer & kv_slot_restorer, |
| llama_ubatch & ubatch, |
| const uint32_t n_outputs, |
| const uint32_t n_tokens_all) { |
| GGML_ASSERT(lctx.sbatch.n_tokens > 0); |
|
|
| auto & kv_self = lctx.kv_self; |
| const auto & cparams = lctx.cparams; |
| const auto & hparams = lctx.model.hparams; |
|
|
| |
| const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; |
|
|
| if (lctx.kv_self.recurrent) { |
| if (embd_pooled) { |
| |
| ubatch = lctx.sbatch.split_seq(cparams.n_ubatch); |
| } else { |
| |
| |
| ubatch = lctx.sbatch.split_equal(cparams.n_ubatch); |
| } |
| } else { |
| ubatch = lctx.sbatch.split_simple(cparams.n_ubatch); |
| } |
|
|
| |
| { |
| int32_t n_outputs_new = 0; |
|
|
| if (n_outputs == n_tokens_all) { |
| n_outputs_new = ubatch.n_tokens; |
| } else { |
| GGML_ASSERT(ubatch.output); |
| for (uint32_t i = 0; i < ubatch.n_tokens; i++) { |
| n_outputs_new += int32_t(ubatch.output[i] != 0); |
| } |
| } |
|
|
| |
| lctx.n_outputs = n_outputs_new; |
| } |
|
|
| |
| if (hparams.causal_attn) { |
| llama_kv_cache_update(&lctx); |
|
|
| |
| |
| if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { |
| kv_self.head = 0; |
| } |
|
|
| const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); |
| if (!slot) { |
| return 1; |
| } |
| kv_slot_restorer.save(slot); |
|
|
| if (!kv_self.recurrent) { |
| |
| |
| |
| const uint32_t pad = llama_kv_cache_get_padding(cparams); |
| kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); |
| |
| } |
| } |
|
|
| return 0; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static int llama_decode_impl( |
| llama_context & lctx, |
| llama_batch inp_batch) { |
|
|
| lctx.is_encoding = false; |
|
|
| if (inp_batch.n_tokens == 0) { |
| LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); |
| return -1; |
| } |
|
|
| |
| llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); |
| const llama_batch & batch = batch_allocr.batch; |
|
|
| const auto & model = lctx.model; |
| const auto & vocab = model.vocab; |
| const auto & hparams = model.hparams; |
| const auto & cparams = lctx.cparams; |
|
|
| if (lctx.t_compute_start_us == 0) { |
| lctx.t_compute_start_us = ggml_time_us(); |
| } |
| auto & kv_self = lctx.kv_self; |
| llama_kv_slot_restorer kv_slot_restorer(kv_self); |
|
|
| const int64_t n_embd = hparams.n_embd; |
| const int64_t n_vocab = vocab.n_tokens(); |
|
|
| uint32_t n_outputs = 0; |
| uint32_t n_outputs_prev = 0; |
|
|
| { |
| const int ret = llama_prepare_sbatch(lctx, batch, n_outputs); |
| if (ret != 0) { |
| return ret; |
| } |
| } |
|
|
| while (lctx.sbatch.n_tokens > 0) { |
| llama_ubatch ubatch; |
| { |
| const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens); |
| if (ret != 0) { |
| return ret; |
| } |
| } |
|
|
| const int n_threads = ubatch.n_tokens < 32 ? cparams.n_threads : cparams.n_threads_batch; |
| ggml_threadpool_t threadpool = ubatch.n_tokens < 32 ? lctx.threadpool : lctx.threadpool_batch; |
|
|
| GGML_ASSERT(n_threads > 0); |
|
|
| |
|
|
| ggml_backend_sched_reset(lctx.sched.get()); |
| ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); |
|
|
| ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); |
|
|
| |
| struct ggml_tensor * res = ggml_graph_node(gf, -1); |
| struct ggml_tensor * embd = ggml_graph_node(gf, -2); |
|
|
| if (lctx.n_outputs == 0) { |
| |
| res = nullptr; |
| embd = nullptr; |
| } else if (cparams.embeddings) { |
| res = nullptr; |
| embd = nullptr; |
| for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { |
| if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { |
| embd = ggml_graph_node(gf, i); |
| break; |
| } |
| } |
| GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); |
| } else { |
| embd = nullptr; |
| GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); |
| } |
|
|
| |
|
|
| ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); |
|
|
| llama_set_inputs(lctx, ubatch); |
|
|
| const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); |
| if (compute_status != GGML_STATUS_SUCCESS) { |
| kv_slot_restorer.restore(kv_self); |
| switch (compute_status) { |
| case GGML_STATUS_ABORTED: |
| return 2; |
| case GGML_STATUS_ALLOC_FAILED: |
| return -2; |
| case GGML_STATUS_FAILED: |
| default: |
| return -3; |
| } |
| } |
|
|
| |
| { |
| kv_self.head += ubatch.n_tokens; |
|
|
| |
| if (kv_self.head >= kv_self.size) { |
| kv_self.head = 0; |
| } |
| } |
|
|
| |
| |
| |
| |
|
|
| |
| if (res) { |
| ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); |
| GGML_ASSERT(backend_res != nullptr); |
| GGML_ASSERT(lctx.logits != nullptr); |
|
|
| float * logits_out = lctx.logits + n_outputs_prev*n_vocab; |
| const int32_t n_outputs_new = lctx.n_outputs; |
|
|
| if (n_outputs_new) { |
| GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); |
| GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); |
| ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); |
| } |
| } |
|
|
| |
| if (embd) { |
| ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); |
| GGML_ASSERT(backend_embd != nullptr); |
|
|
| switch (cparams.pooling_type) { |
| case LLAMA_POOLING_TYPE_NONE: |
| { |
| |
| GGML_ASSERT(lctx.embd != nullptr); |
| float * embd_out = lctx.embd + n_outputs_prev*n_embd; |
| const int32_t n_outputs_new = lctx.n_outputs; |
|
|
| if (n_outputs_new) { |
| GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); |
| GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); |
| ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); |
| } |
| } break; |
| case LLAMA_POOLING_TYPE_MEAN: |
| case LLAMA_POOLING_TYPE_CLS: |
| case LLAMA_POOLING_TYPE_LAST: |
| { |
| |
| auto & embd_seq_out = lctx.embd_seq; |
|
|
| for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { |
| const llama_seq_id seq_id = ubatch.seq_id[s][0]; |
| if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { |
| continue; |
| } |
| embd_seq_out[seq_id].resize(n_embd); |
| ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); |
| } |
| } break; |
| case LLAMA_POOLING_TYPE_RANK: |
| { |
| |
| auto & embd_seq_out = lctx.embd_seq; |
|
|
| for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { |
| const llama_seq_id seq_id = ubatch.seq_id[s][0]; |
| if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { |
| continue; |
| } |
| embd_seq_out[seq_id].resize(1); |
| ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); |
| } |
| } break; |
| case LLAMA_POOLING_TYPE_UNSPECIFIED: |
| { |
| GGML_ABORT("unknown pooling type"); |
| } |
| } |
| } |
| n_outputs_prev += lctx.n_outputs; |
| } |
|
|
| |
| { |
| bool sorted_output = true; |
|
|
| GGML_ASSERT(lctx.sbatch.out_ids.size() == n_outputs); |
|
|
| for (size_t i = 0; i < n_outputs; ++i) { |
| size_t out_id = lctx.sbatch.out_ids[i]; |
| lctx.output_ids[out_id] = i; |
| if (out_id != i) { |
| sorted_output = false; |
| } |
| } |
|
|
| if (sorted_output) { |
| lctx.sbatch.out_ids.clear(); |
| } |
| } |
|
|
| |
| lctx.n_outputs = n_outputs; |
|
|
| |
| |
|
|
| |
| if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { |
| |
| |
| const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f; |
|
|
| |
| if (fragmentation > cparams.defrag_thold) { |
| LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); |
|
|
| llama_kv_cache_defrag(kv_self); |
| } |
| } |
|
|
| |
| |
| ggml_backend_sched_reset(lctx.sched.get()); |
|
|
| return 0; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static int llama_encode_impl( |
| llama_context & lctx, |
| llama_batch inp_batch) { |
|
|
| lctx.is_encoding = true; |
|
|
| if (inp_batch.n_tokens == 0) { |
| LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); |
| return -1; |
| } |
|
|
| |
| llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); |
|
|
| const llama_batch & batch = batch_allocr.batch; |
| const uint32_t n_tokens = batch.n_tokens; |
|
|
| const auto & model = lctx.model; |
| const auto & hparams = model.hparams; |
| const auto & cparams = lctx.cparams; |
|
|
| GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); |
|
|
| if (batch.token) { |
| for (uint32_t i = 0; i < n_tokens; ++i) { |
| if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { |
| LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); |
| return -1; |
| } |
| } |
| } |
|
|
| |
| GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); |
|
|
| if (lctx.t_compute_start_us == 0) { |
| lctx.t_compute_start_us = ggml_time_us(); |
| } |
|
|
| lctx.n_queued_tokens += n_tokens; |
|
|
| const int64_t n_embd = hparams.n_embd; |
|
|
| lctx.sbatch.from_batch(batch, n_embd, true, true); |
|
|
| const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); |
|
|
| |
| if (llama_output_reserve(lctx, n_tokens) < n_tokens) { |
| LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); |
| return -2; |
| }; |
|
|
| for (uint32_t i = 0; i < n_tokens; ++i) { |
| lctx.output_ids[i] = i; |
| } |
|
|
| lctx.inp_embd_enc = NULL; |
| lctx.n_outputs = n_tokens; |
|
|
| int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch; |
| ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; |
|
|
| GGML_ASSERT(n_threads > 0); |
|
|
| ggml_backend_sched_reset(lctx.sched.get()); |
| ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); |
|
|
| ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); |
|
|
| |
| struct ggml_tensor * embd = nullptr; |
|
|
| |
| if (llama_model_has_decoder(&lctx.model)) { |
| |
| embd = ggml_graph_node(gf, -1); |
| GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); |
| } else { |
| |
| if (cparams.embeddings) { |
| |
| embd = ggml_graph_node(gf, -1); |
| if (strcmp(embd->name, "result_embd_pooled") != 0) { |
| embd = ggml_graph_node(gf, -2); |
| } |
| GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); |
| } |
| } |
|
|
| ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); |
|
|
| llama_set_inputs(lctx, ubatch); |
|
|
| const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); |
| switch (compute_status) { |
| case GGML_STATUS_SUCCESS: |
| break; |
| case GGML_STATUS_ABORTED: |
| return 2; |
| case GGML_STATUS_ALLOC_FAILED: |
| return -2; |
| case GGML_STATUS_FAILED: |
| default: |
| return -3; |
| } |
|
|
| |
| if (embd) { |
| ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); |
| GGML_ASSERT(backend_embd != nullptr); |
|
|
| if (llama_model_has_decoder(&lctx.model)) { |
| lctx.embd_enc.resize(n_tokens*n_embd); |
| float * embd_out = lctx.embd_enc.data(); |
|
|
| ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); |
| GGML_ASSERT(!ubatch.equal_seqs); |
|
|
| |
| lctx.seq_ids_enc.resize(n_tokens); |
| for (uint32_t i = 0; i < n_tokens; i++) { |
| for (int s = 0; s < ubatch.n_seq_id[i]; s++) { |
| llama_seq_id seq_id = ubatch.seq_id[i][s]; |
| lctx.seq_ids_enc[i].insert(seq_id); |
| } |
| } |
| } else { |
| GGML_ASSERT(lctx.embd != nullptr); |
|
|
| switch (cparams.pooling_type) { |
| case LLAMA_POOLING_TYPE_NONE: |
| { |
| |
| GGML_ASSERT(lctx.embd != nullptr); |
| float * embd_out = lctx.embd; |
|
|
| GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); |
| ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); |
| } break; |
| case LLAMA_POOLING_TYPE_MEAN: |
| case LLAMA_POOLING_TYPE_CLS: |
| case LLAMA_POOLING_TYPE_LAST: |
| { |
| |
| auto & embd_seq_out = lctx.embd_seq; |
| embd_seq_out.clear(); |
|
|
| GGML_ASSERT(!ubatch.equal_seqs); |
|
|
| for (uint32_t i = 0; i < n_tokens; i++) { |
| const llama_seq_id seq_id = ubatch.seq_id[i][0]; |
| if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { |
| continue; |
| } |
| embd_seq_out[seq_id].resize(n_embd); |
| ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); |
| } |
| } break; |
| case LLAMA_POOLING_TYPE_RANK: |
| { |
| |
| |
| |
| GGML_ABORT("RANK pooling not implemented yet"); |
| } |
| case LLAMA_POOLING_TYPE_UNSPECIFIED: |
| { |
| GGML_ABORT("unknown pooling type"); |
| } |
| } |
| } |
| } |
|
|
| |
| |
| ggml_backend_sched_reset(lctx.sched.get()); |
|
|
| return 0; |
| } |
|
|
| |
| static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { |
| auto & kv_self = lctx.kv_self; |
|
|
| const auto & hparams = lctx.model.hparams; |
|
|
| const uint32_t n_layer = hparams.n_layer; |
|
|
| const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); |
| const uint32_t n_used = kv_self.used; |
|
|
| assert(n_used <= n_kv); |
|
|
| |
|
|
| |
| uint32_t n_moves = 0; |
|
|
| |
| |
| |
| |
| |
| const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer); |
|
|
| |
| |
| |
| |
| |
| |
| std::vector<uint32_t> ids(n_kv, n_kv); |
|
|
| for (uint32_t i0 = 0; i0 < n_used; ++i0) { |
| const auto & cell0 = kv_self.cells[i0]; |
|
|
| if (!cell0.is_empty()) { |
| ids[i0] = i0; |
|
|
| continue; |
| } |
|
|
| |
|
|
| uint32_t nh = 1; |
|
|
| |
| while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { |
| nh++; |
| } |
|
|
| uint32_t nf = 0; |
| uint32_t is = n_kv - 1; |
|
|
| |
| for (; is > i0; --is) { |
| const auto & cell1 = kv_self.cells[is]; |
|
|
| if (cell1.is_empty() || ids[is] != n_kv) { |
| continue; |
| } |
|
|
| |
| nf++; |
|
|
| if (nf == nh) { |
| break; |
| } |
| } |
|
|
| |
| GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); |
|
|
| nf = 0; |
|
|
| uint32_t i1 = is; |
|
|
| |
| bool cont = false; |
|
|
| |
| bool stop = false; |
|
|
| |
| for (; i1 < n_kv; ++i1) { |
| auto & cell1 = kv_self.cells[i1]; |
|
|
| if (cell1.is_empty() || ids[i1] != n_kv) { |
| if (n_moves == max_moves) { |
| stop = true; |
| break; |
| } |
|
|
| cont = false; |
| continue; |
| } |
|
|
| |
| ids[i1] = i0 + nf; |
|
|
| |
| kv_self.cells[i0 + nf] = cell1; |
|
|
| |
| cell1 = llama_kv_cell(); |
| kv_self.head = n_used; |
|
|
| if (!cont) { |
| n_moves++; |
| cont = true; |
| } |
|
|
| nf++; |
|
|
| if (nf == nh) { |
| break; |
| } |
| } |
|
|
| if (stop || n_moves == max_moves) { |
| break; |
| } |
|
|
| |
|
|
| i0 += nh - 1; |
| } |
|
|
| if (n_moves == 0) { |
| return; |
| } |
|
|
| |
|
|
| |
|
|
| #if 0 |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); |
| const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); |
|
|
| const uint32_t kv_size = kv_self.size; |
|
|
| std::vector<uint8_t> buf_k; |
| std::vector<uint8_t> buf_v; |
|
|
| for (uint32_t il = 0; il < n_layer; ++il) { |
| const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); |
| const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); |
|
|
| const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); |
| const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); |
|
|
| buf_k.resize(k_size); |
| buf_v.resize(v_size); |
|
|
| ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); |
| ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); |
|
|
| |
| |
| for (uint32_t i = 0; i < n_kv; ++i) { |
| const uint32_t id = ids[i]; |
|
|
| if (i == id || id == n_kv) { |
| continue; |
| } |
|
|
| uint32_t nm = 1; |
|
|
| while (i + nm < n_kv && ids[i + nm] == id + nm) { |
| nm++; |
| } |
|
|
| |
| { |
| const int64_t os = i*k_size_row; |
| const int64_t od = id*k_size_row; |
|
|
| memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); |
| } |
|
|
| |
| { |
| const int64_t os = i; |
| const int64_t od = id; |
|
|
| for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { |
| memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); |
| } |
| } |
|
|
| i += nm - 1; |
| } |
|
|
| ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); |
| ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); |
| } |
| #else |
| |
|
|
| ggml_backend_sched_reset(lctx.sched.get()); |
|
|
| ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); |
|
|
| llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); |
| #endif |
|
|
| |
|
|
| |
| } |
|
|
| static void llama_kv_cache_update_impl(struct llama_context & lctx) { |
| bool need_reserve = false; |
|
|
| if (lctx.kv_self.has_shift) { |
| if (!llama_kv_cache_can_shift(&lctx)) { |
| printf("\nWARNING: The current context does not support K-shift!\n"); |
| } else { |
|
|
| |
| if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { |
| ggml_backend_sched_reset(lctx.sched.get()); |
|
|
| ggml_cgraph * gf = llama_build_graph_k_shift(lctx); |
|
|
| ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); |
|
|
| llama_set_k_shift(lctx); |
|
|
| llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); |
|
|
| need_reserve = true; |
| } |
|
|
| { |
| auto & kv_self = lctx.kv_self; |
|
|
| kv_self.has_shift = false; |
|
|
| for (uint32_t i = 0; i < kv_self.size; ++i) { |
| kv_self.cells[i].delta = 0; |
| } |
| } |
| } |
| } |
|
|
| |
| if (lctx.kv_self.do_defrag) { |
| llama_kv_cache_defrag_impl(lctx); |
|
|
| need_reserve = true; |
|
|
| lctx.kv_self.do_defrag = false; |
| } |
|
|
| |
| if (need_reserve) { |
| |
| |
| uint32_t n_seqs = 1; |
| uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); |
| llama_token token = lctx.model.vocab.token_bos(); |
| llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; |
| ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); |
|
|
| |
| ggml_backend_sched_reset(lctx.sched.get()); |
| if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { |
| LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); |
| } |
| } |
| } |
|
|
| int32_t llama_set_adapter_lora( |
| struct llama_context * ctx, |
| struct llama_adapter_lora * adapter, |
| float scale) { |
| ctx->lora[adapter] = scale; |
| return 0; |
| } |
|
|
| int32_t llama_rm_adapter_lora( |
| struct llama_context * ctx, |
| struct llama_adapter_lora * adapter) { |
| auto pos = ctx->lora.find(adapter); |
| if (pos != ctx->lora.end()) { |
| ctx->lora.erase(pos); |
| return 0; |
| } |
|
|
| return -1; |
| } |
|
|
| void llama_clear_adapter_lora(struct llama_context * ctx) { |
| ctx->lora.clear(); |
| } |
|
|
| int32_t llama_apply_adapter_cvec( |
| struct llama_context * ctx, |
| const float * data, |
| size_t len, |
| int32_t n_embd, |
| int32_t il_start, |
| int32_t il_end) { |
| return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); |
| } |
|
|
| |
| |
| |
|
|
| struct llama_context_params llama_context_default_params() { |
| struct llama_context_params result = { |
| 512, |
| 2048, |
| 512, |
| 1, |
| GGML_DEFAULT_N_THREADS, |
| GGML_DEFAULT_N_THREADS, |
| LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, |
| LLAMA_POOLING_TYPE_UNSPECIFIED, |
| LLAMA_ATTENTION_TYPE_UNSPECIFIED, |
| 0.0f, |
| 0.0f, |
| -1.0f, |
| 1.0f, |
| 32.0f, |
| 1.0f, |
| 0, |
| -1.0f, |
| nullptr, |
| nullptr, |
| GGML_TYPE_F16, |
| GGML_TYPE_F16, |
| false, |
| false, |
| true, |
| false, |
| true, |
| nullptr, |
| nullptr, |
| }; |
|
|
| return result; |
| } |
|
|
| struct llama_sampler_chain_params llama_sampler_chain_default_params() { |
| struct llama_sampler_chain_params result = { |
| true, |
| }; |
|
|
| return result; |
| } |
|
|
// Returns the fixed device limit reported by the library.
size_t llama_max_devices(void) {
    const size_t max_devices = 16;
    return max_devices;
}
|
|
| bool llama_supports_mmap(void) { |
| return llama_mmap::SUPPORTED; |
| } |
|
|
| bool llama_supports_mlock(void) { |
| return llama_mlock::SUPPORTED; |
| } |
|
|
| bool llama_supports_gpu_offload(void) { |
| #if defined(GGML_USE_CLBLAST) |
| return true; |
| #else |
| return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || |
| llama_supports_rpc(); |
| #endif |
| } |
|
|
| bool llama_supports_rpc(void) { |
| return ggml_backend_reg_by_name("RPC") != nullptr; |
| } |
|
|
| void llama_backend_init(void) { |
| ggml_time_init(); |
|
|
| |
| { |
| struct ggml_init_params params = { 0, NULL, false }; |
| struct ggml_context * ctx = ggml_init(params); |
| ggml_free(ctx); |
| } |
| } |
|
|
| void llama_numa_init(enum ggml_numa_strategy numa) { |
| if (numa != GGML_NUMA_STRATEGY_DISABLED) { |
| auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); |
| GGML_ASSERT(dev && "CPU backend is not loaded"); |
| auto * reg = ggml_backend_dev_backend_reg(dev); |
| auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); |
| numa_init_fn(numa); |
| } |
| } |
|
|
| void llama_backend_free(void) { |
| ggml_quantize_free(); |
| } |
|
|
| int64_t llama_time_us(void) { |
| return ggml_time_us(); |
| } |
|
|
| static struct llama_model * llama_model_load_from_file_impl( |
| const std::string & path_model, |
| std::vector<std::string> & splits, |
| struct llama_model_params params) { |
| ggml_time_init(); |
|
|
| unsigned cur_percentage = 0; |
| if (params.progress_callback == NULL) { |
| params.progress_callback_user_data = &cur_percentage; |
| params.progress_callback = [](float progress, void * ctx) { |
| unsigned * cur_percentage_p = (unsigned *) ctx; |
| unsigned percentage = (unsigned) (100 * progress); |
| while (percentage > *cur_percentage_p) { |
| *cur_percentage_p = percentage; |
| LLAMA_LOG_CONT("."); |
| if (percentage >= 100) { |
| LLAMA_LOG_CONT("\n"); |
| } |
| } |
| return true; |
| }; |
| } |
|
|
| llama_model * model = new llama_model(params); |
|
|
| |
| if (params.devices) { |
| for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { |
| model->devices.push_back(*dev); |
| } |
| } else { |
| std::vector<ggml_backend_dev_t> rpc_servers; |
| |
| for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
| ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
| switch (ggml_backend_dev_type(dev)) { |
| case GGML_BACKEND_DEVICE_TYPE_CPU: |
| case GGML_BACKEND_DEVICE_TYPE_ACCEL: |
| |
| break; |
|
|
| case GGML_BACKEND_DEVICE_TYPE_GPU: |
| ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); |
| if (ggml_backend_reg_name(reg) == std::string("RPC")) { |
| rpc_servers.push_back(dev); |
| } else { |
| model->devices.push_back(dev); |
| } |
| break; |
| } |
| } |
| |
| if (!rpc_servers.empty()) { |
| model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); |
| } |
| } |
|
|
| |
| if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { |
| if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) { |
| LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size()); |
| llama_model_free(model); |
| return nullptr; |
| } |
| ggml_backend_dev_t main_gpu = model->devices[params.main_gpu]; |
| model->devices.clear(); |
| model->devices.push_back(main_gpu); |
| } |
|
|
| for (auto * dev : model->devices) { |
| size_t free, total; |
| ggml_backend_dev_memory(dev, &free, &total); |
| LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); |
| } |
|
|
| const int status = llama_model_load(path_model, splits, *model, params); |
| GGML_ASSERT(status <= 0); |
| if (status < 0) { |
| if (status == -1) { |
| LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); |
| } else if (status == -2) { |
| LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); |
| } |
|
|
| llama_model_free(model); |
| return nullptr; |
| } |
|
|
| return model; |
| } |
|
|
| |
| struct llama_model * llama_load_model_from_file( |
| const char * path_model, |
| struct llama_model_params params) { |
| return llama_model_load_from_file(path_model, params); |
| } |
|
|
| struct llama_model * llama_model_load_from_file( |
| const char * path_model, |
| struct llama_model_params params) { |
| std::vector<std::string> splits = {}; |
| return llama_model_load_from_file_impl(path_model, splits, params); |
| } |
|
|
| struct llama_model * llama_model_load_from_splits( |
| const char ** paths, |
| size_t n_paths, |
| struct llama_model_params params) { |
| std::vector<std::string> splits; |
| if (n_paths == 0) { |
| LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); |
| return nullptr; |
| } |
| for (size_t i = 0; i < n_paths; ++i) { |
| splits.push_back(paths[i]); |
| } |
| return llama_model_load_from_file_impl(splits.front(), splits, params); |
| } |
|
|
| struct llama_context * llama_init_from_model( |
| struct llama_model * model, |
| struct llama_context_params params) { |
|
|
| if (!model) { |
| LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__); |
| return nullptr; |
| } |
|
|
| if (params.n_batch == 0 && params.n_ubatch == 0) { |
| LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__); |
| return nullptr; |
| } |
|
|
| if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) { |
| LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__); |
| return nullptr; |
| } |
|
|
| if (params.flash_attn && model->arch == LLM_ARCH_GROK) { |
| LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__); |
| params.flash_attn = false; |
| } |
|
|
| if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { |
| LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__); |
| params.flash_attn = false; |
| } |
|
|
| if (ggml_is_quantized(params.type_v) && !params.flash_attn) { |
| LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__); |
| return nullptr; |
| } |
|
|
| llama_context * ctx = new llama_context(*model); |
|
|
| const auto & hparams = model->hparams; |
| auto & cparams = ctx->cparams; |
|
|
| cparams.n_seq_max = std::max(1u, params.n_seq_max); |
| cparams.n_threads = params.n_threads; |
| cparams.n_threads_batch = params.n_threads_batch; |
| cparams.yarn_ext_factor = params.yarn_ext_factor; |
| cparams.yarn_attn_factor = params.yarn_attn_factor; |
| cparams.yarn_beta_fast = params.yarn_beta_fast; |
| cparams.yarn_beta_slow = params.yarn_beta_slow; |
| cparams.defrag_thold = params.defrag_thold; |
| cparams.embeddings = params.embeddings; |
| cparams.offload_kqv = params.offload_kqv; |
| cparams.flash_attn = params.flash_attn; |
| cparams.no_perf = params.no_perf; |
| cparams.pooling_type = params.pooling_type; |
|
|
| cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; |
| cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; |
| cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; |
|
|
| |
| cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams)); |
|
|
| |
| cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; |
|
|
| |
| |
| |
| if (cparams.n_batch < GGML_KQ_MASK_PAD) { |
| LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); |
| cparams.n_batch = GGML_KQ_MASK_PAD; |
| } |
|
|
| cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); |
|
|
| cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : |
| hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : |
| hparams.n_ctx_train; |
|
|
| cparams.cb_eval = params.cb_eval; |
| cparams.cb_eval_user_data = params.cb_eval_user_data; |
|
|
| auto rope_scaling_type = params.rope_scaling_type; |
| if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { |
| rope_scaling_type = hparams.rope_scaling_type_train; |
| } |
|
|
| if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { |
| cparams.rope_freq_scale = 1.0f; |
| } |
|
|
| if (cparams.yarn_ext_factor < 0.0f) { |
| cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; |
| } |
|
|
| cparams.yarn_attn_factor *= hparams.rope_attn_factor; |
|
|
| if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { |
| if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { |
| cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; |
| } else { |
| cparams.pooling_type = hparams.pooling_type; |
| } |
| } |
|
|
| if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { |
| cparams.causal_attn = hparams.causal_attn; |
| } else { |
| cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; |
| } |
|
|
| const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; |
|
|
| LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); |
| LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); |
| LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); |
| LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); |
| LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); |
| LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); |
| LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); |
| LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); |
|
|
| if (n_ctx_per_seq < hparams.n_ctx_train) { |
| LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", |
| __func__, n_ctx_per_seq, hparams.n_ctx_train); |
| } |
|
|
| if (n_ctx_per_seq > hparams.n_ctx_train) { |
| LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", |
| __func__, n_ctx_per_seq, hparams.n_ctx_train); |
| } |
|
|
| ctx->logits_all = params.logits_all; |
|
|
| |
| ctx->is_encoding = llama_model_has_encoder(model); |
|
|
| uint32_t kv_size = cparams.n_ctx; |
| ggml_type type_k = params.type_k; |
| ggml_type type_v = params.type_v; |
|
|
| |
| if (llama_model_is_recurrent(model)) { |
| |
| kv_size = std::max((uint32_t) 1, params.n_seq_max); |
| |
| type_k = GGML_TYPE_F32; |
| type_v = GGML_TYPE_F32; |
| } |
|
|
| GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); |
| GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); |
|
|
| if (!hparams.vocab_only) { |
| |
| for (auto * dev : model->devices) { |
| ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
| if (backend == nullptr) { |
| LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); |
| llama_free(ctx); |
| return nullptr; |
| } |
| ctx->backends.emplace_back(backend); |
| } |
|
|
| |
| for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { |
| ggml_backend_dev_t dev = ggml_backend_dev_get(i); |
| if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { |
| ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); |
| if (backend == nullptr) { |
| LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); |
| llama_free(ctx); |
| return nullptr; |
| } |
| ctx->backends.emplace_back(backend); |
| } |
| } |
|
|
| |
| ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); |
| if (ctx->backend_cpu == nullptr) { |
| LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); |
| llama_free(ctx); |
| return nullptr; |
| } |
| ctx->backends.emplace_back(ctx->backend_cpu); |
|
|
| |
| for (auto & backend : ctx->backends) { |
| ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); |
| ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; |
| if (reg) { |
| auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); |
| if (ggml_backend_set_n_threads_fn) { |
| ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); |
| } |
| } |
| } |
|
|
| llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); |
|
|
| if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { |
| LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); |
| llama_free(ctx); |
| return nullptr; |
| } |
|
|
| { |
| size_t memory_size_k = 0; |
| size_t memory_size_v = 0; |
|
|
| for (auto & k : ctx->kv_self.k_l) { |
| memory_size_k += ggml_nbytes(k); |
| } |
|
|
| for (auto & v : ctx->kv_self.v_l) { |
| memory_size_v += ggml_nbytes(v); |
| } |
|
|
| LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, |
| (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), |
| ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), |
| ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); |
| } |
|
|
| |
| { |
| |
| if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { |
| LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); |
| llama_free(ctx); |
| return nullptr; |
| } |
|
|
| LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, |
| ggml_backend_buffer_name(ctx->buf_output.get()), |
| ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0); |
| } |
|
|
| |
| { |
| |
| std::vector<ggml_backend_buffer_type_t> backend_buft; |
| std::vector<ggml_backend_t> backend_ptrs; |
| for (auto & backend : ctx->backends) { |
| auto * buft = ggml_backend_get_default_buffer_type(backend.get()); |
| auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); |
| if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { |
| |
| auto * dev = model->devices[0]; |
| auto * host_buft = ggml_backend_dev_host_buffer_type(dev); |
| if (host_buft) { |
| buft = host_buft; |
| } |
| } |
| backend_buft.push_back(buft); |
| backend_ptrs.push_back(backend.get()); |
| } |
|
|
| const size_t max_nodes = model->max_nodes(); |
|
|
| |
| ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); |
|
|
| |
| |
| bool pipeline_parallel = |
| model->n_devices() > 1 && |
| model->params.n_gpu_layers > (int)model->hparams.n_layer && |
| model->params.split_mode == LLAMA_SPLIT_MODE_LAYER && |
| params.offload_kqv; |
|
|
| |
| if (pipeline_parallel) { |
| for (auto & backend : ctx->backends) { |
| auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); |
| if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { |
| |
| continue; |
| } |
| auto * dev = ggml_backend_get_device(backend.get()); |
| ggml_backend_dev_props props; |
| ggml_backend_dev_get_props(dev, &props); |
| if (!props.caps.async || !props.caps.events) { |
| |
| pipeline_parallel = false; |
| break; |
| } |
| } |
| } |
|
|
| ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); |
|
|
| if (pipeline_parallel) { |
| LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); |
| } |
|
|
| |
| uint32_t n_seqs = 1; |
| uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); |
| llama_token token = ctx->model.vocab.token_bos(); |
|
|
| llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; |
| ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); |
|
|
| |
| ggml_backend_sched_reserve(ctx->sched.get(), gf_pp); |
| int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get()); |
| int n_nodes_pp = ggml_graph_n_nodes(gf_pp); |
|
|
| |
| llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; |
| ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true); |
| ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); |
| int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); |
| int n_nodes_tg = ggml_graph_n_nodes(gf_tg); |
|
|
| |
| gf_pp = llama_build_graph(*ctx, ubatch_pp, true); |
| if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) { |
| LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); |
| llama_free(ctx); |
| return nullptr; |
| } |
|
|
| for (size_t i = 0; i < backend_ptrs.size(); ++i) { |
| ggml_backend_t backend = backend_ptrs[i]; |
| ggml_backend_buffer_type_t buft = backend_buft[i]; |
| size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend); |
| if (size > 1) { |
| LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, |
| ggml_backend_buft_name(buft), |
| size / 1024.0 / 1024.0); |
| } |
| } |
|
|
| if (n_nodes_pp == n_nodes_tg) { |
| LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); |
| } else { |
| LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); |
| } |
| if (n_splits_pp == n_splits_tg) { |
| LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); |
| } else { |
| LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); |
| } |
| } |
| } |
|
|
| return ctx; |
| } |
|
|
| struct llama_context * llama_new_context_with_model( |
| struct llama_model * model, |
| struct llama_context_params params) { |
| return llama_init_from_model(model, params); |
| } |
|
|
| |
| |
| |
|
|
| |
|
|
| struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { |
| return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); |
| } |
|
|
| void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { |
| llama_kv_cache_view_update(view, ctx->kv_self); |
| } |
|
|
| int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) { |
| return llama_get_kv_cache_token_count(ctx->kv_self); |
| } |
|
|
| int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { |
| return llama_get_kv_cache_used_cells(ctx->kv_self); |
| } |
|
|
| void llama_kv_cache_clear(struct llama_context * ctx) { |
| llama_kv_cache_clear(ctx->kv_self); |
| } |
|
|
| bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { |
| return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); |
| } |
|
|
| void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { |
| if (seq_id_src == seq_id_dst) { |
| return; |
| } |
| llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); |
| } |
|
|
| void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { |
| llama_kv_cache_seq_keep(ctx->kv_self, seq_id); |
| } |
|
|
| void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { |
| if (delta == 0) { |
| return; |
| } |
|
|
| llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); |
| } |
|
|
| void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { |
| if (d == 1) { |
| return; |
| } |
|
|
| llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); |
| } |
|
|
| llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { |
| return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); |
| } |
|
|
| void llama_kv_cache_defrag(struct llama_context * ctx) { |
| llama_kv_cache_defrag(ctx->kv_self); |
| } |
|
|
| void llama_kv_cache_update(struct llama_context * ctx) { |
| llama_kv_cache_update_impl(*ctx); |
| } |
|
|
| bool llama_kv_cache_can_shift(struct llama_context * ctx) { |
| return llama_kv_cache_can_shift(ctx->kv_self); |
| } |
|
|
| |
|
|
| int32_t llama_encode( |
| struct llama_context * ctx, |
| struct llama_batch batch) { |
| const int ret = llama_encode_impl(*ctx, batch); |
| if (ret != 0) { |
| LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); |
| } |
|
|
| return ret; |
| } |
|
|
| int32_t llama_decode( |
| struct llama_context * ctx, |
| struct llama_batch batch) { |
| const int ret = llama_decode_impl(*ctx, batch); |
| if (ret != 0) { |
| LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); |
| } |
|
|
| return ret; |
| } |
|
|
| |
| |
| |
|
|
| int32_t llama_chat_apply_template( |
| const char * tmpl, |
| const struct llama_chat_message * chat, |
| size_t n_msg, |
| bool add_ass, |
| char * buf, |
| int32_t length) { |
| const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl); |
|
|
| |
| std::vector<const llama_chat_message *> chat_vec; |
| chat_vec.resize(n_msg); |
| for (size_t i = 0; i < n_msg; i++) { |
| chat_vec[i] = &chat[i]; |
| } |
|
|
| std::string formatted_chat; |
| llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl); |
| if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { |
| return -1; |
| } |
| int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass); |
| if (res < 0) { |
| return res; |
| } |
| if (buf && length > 0) { |
| strncpy(buf, formatted_chat.c_str(), length); |
| } |
| return res; |
| } |
|
|
| |
| |
| |
|
|
// Builds the file name of one shard of a split model:
//   "<path_prefix>-<split_no + 1>-of-<split_count>.gguf"
// where both numbers are zero-padded to at least five digits.
//
//   split_path  - output buffer
//   maxlen      - capacity of `split_path` in bytes, including the terminator
//   path_prefix - text placed in front of the shard suffix
//   split_no    - zero-based shard index (printed one-based)
//   split_count - total number of shards
//
// Returns the length of the string actually stored in `split_path`, which is
// shorter than the full name when the buffer is too small. Returns 0 when
// nothing could be written: null buffer, zero capacity, null prefix, or a
// formatting error.
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // Without a writable buffer or a prefix there is nothing to format, and
    // measuring the output afterwards would read memory that was never written.
    if (split_path == nullptr || maxlen == 0 || path_prefix == nullptr) {
        return 0;
    }

    const int n_needed = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n_needed < 0) {
        // snprintf reports errors with a negative value; leave a valid empty string behind
        split_path[0] = '\0';
        return 0;
    }

    // snprintf returns the untruncated length; at most maxlen - 1 characters were stored
    const size_t n_stored = std::min(static_cast<size_t>(n_needed), maxlen - 1);
    return static_cast<int>(n_stored);
}
|
|
// Extracts the prefix of a shard file name, i.e. the inverse of the
// "<prefix>-<split_no + 1>-of-<split_count>.gguf" naming scheme.
//
//   split_prefix - output buffer; may be null when only the length is wanted
//   maxlen       - capacity of `split_prefix` in bytes, including the terminator
//   split_path   - shard file name to analyse
//   split_no     - zero-based shard index expected in the name
//   split_count  - total shard count expected in the name
//
// Returns the length of the prefix when `split_path` ends with the expected
// suffix and the prefix is non-empty; the stored string is truncated to fit
// `maxlen` while the return value still reports the full prefix length.
// Returns 0 when `split_path` is null or does not match.
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    if (split_path == nullptr) {
        return 0;
    }
    const std::string str_split_path(split_path);

    char postfix[32];
    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string str_postfix(postfix);

    // Compare sizes before subtracting so the unsigned difference cannot wrap;
    // an empty prefix is rejected as well.
    if (str_split_path.size() <= str_postfix.size()) {
        return 0;
    }
    const size_t size_prefix = str_split_path.size() - str_postfix.size();

    // the path must end with exactly the expected suffix
    if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) != 0) {
        return 0;
    }

    if (split_prefix != nullptr && maxlen > 0) {
        // limiting the size to size_prefix + 1 makes snprintf stop right after the prefix
        snprintf(split_prefix, std::min(size_prefix + 1, maxlen), "%s", split_path);
    }
    return static_cast<int>(size_prefix);
}
|
|
| const char * llama_print_system_info(void) { |
| static std::string s; |
| s.clear(); |
|
|
|
|
| for (size_t i = 0; i < ggml_backend_reg_count(); i++) { |
| auto * reg = ggml_backend_reg_get(i); |
| auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); |
| if (get_features_fn) { |
| ggml_backend_feature * features = get_features_fn(reg); |
| s += ggml_backend_reg_name(reg); |
| s += " : "; |
| for (; features->name; features++) { |
| s += features->name; |
| s += " = "; |
| s += features->value; |
| s += " | "; |
| } |
| } |
| } |
|
|
| return s.c_str(); |
| } |
|
|
| |
| |
| |
|
|
| struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { |
| struct llama_perf_context_data data = {}; |
|
|
| if (ctx == nullptr) { |
| return data; |
| } |
|
|
| data.t_start_ms = 1e-3 * ctx->t_start_us; |
| data.t_load_ms = 1e-3 * ctx->t_load_us; |
| data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; |
| data.t_eval_ms = 1e-3 * ctx->t_eval_us; |
| data.n_p_eval = std::max(1, ctx->n_p_eval); |
| data.n_eval = std::max(1, ctx->n_eval); |
|
|
| return data; |
| } |
|
|
| void llama_perf_context_print(const struct llama_context * ctx) { |
| const auto data = llama_perf_context(ctx); |
|
|
| const double t_end_ms = 1e-3 * ggml_time_us(); |
|
|
| LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); |
| LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", |
| __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); |
| LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", |
| __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); |
| LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); |
| } |
|
|
| void llama_perf_context_reset(struct llama_context * ctx) { |
| ctx->t_start_us = ggml_time_us(); |
| ctx->t_eval_us = ctx->n_eval = 0; |
| ctx->t_p_eval_us = ctx->n_p_eval = 0; |
| } |
|
|