@@ -78,7 +78,13 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
+#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
 
 //
 // helpers
@@ -282,6 +288,8 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
+    LLM_KV_EXPERT_WEIGHTS_NORM,
+    LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -398,6 +406,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
+    { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -534,6 +544,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -1338,6 +1349,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
     {
@@ -2442,6 +2454,7 @@ enum e_model {
     MODEL_70B,
     MODEL_236B,
     MODEL_314B,
+    MODEL_671B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,
@@ -2491,6 +2504,8 @@ struct llama_hparams {
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
+    bool expert_weights_norm = false;
+    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -2790,6 +2805,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
     struct ggml_tensor * ffn_act;
+    struct ggml_tensor * ffn_exp_probs_b;
 
     // mamba proj
     struct ggml_tensor * ssm_in;
@@ -5376,6 +5392,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }
 
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+    switch (type) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+        default: return "unknown";
+    }
+}
+
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_14M: return "14M";
@@ -5427,6 +5451,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_70B: return "70B";
         case MODEL_236B: return "236B";
         case MODEL_314B: return "314B";
+        case MODEL_671B: return "671B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
@@ -6109,6 +6134,13 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                    // that have no expert_gating_func model parameter set
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
 
                 switch (hparams.n_layer) {
@@ -6430,6 +6462,10 @@ static void llm_load_vocab(
                     tokenizer_pre == "deepseek-coder") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-v3") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                vocab.tokenizer_clean_spaces = false;
             } else if (
                     tokenizer_pre == "falcon") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -7103,6 +7139,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
 
@@ -7250,6 +7288,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
 };
@@ -8961,6 +9000,7 @@ static bool llm_load_tensors(
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 } else {
                     layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     if (n_expert == 0) {
                         throw std::runtime_error("n_expert must be > 0");
@@ -9831,12 +9871,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
         struct ggml_tensor * up_exps,
         struct ggml_tensor * gate_exps,
         struct ggml_tensor * down_exps,
+        struct ggml_tensor * exp_probs_b,
         int64_t n_expert,
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
         bool scale_w,
         float w_scale,
+        llama_expert_gating_func_type gating_op,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];
@@ -9845,11 +9887,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
     ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
 
-    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    ggml_tensor * probs = nullptr;
+    switch (gating_op) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
+            {
+                probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
+            {
+                probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
     cb(probs, "ffn_moe_probs", il);
 
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
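
The hunk above is the core of the change: the gating function is now selectable, and the optional `exp_probs_b` bias only influences which experts are picked, while the unbiased `probs` still produce the mixing weights. Below is a minimal scalar sketch of that flow for a single token (plain C++, no ggml); the numbers are made up, and the final renormalization over the selected experts is an assumption matching what `llm_build_moe_ffn` does when `norm_w` is set.

```cpp
// Illustrative sketch only (not part of llama.cpp): DeepSeek V3-style expert
// selection for a single token. The bias decides which experts win a top-k
// slot, but the mixing weights come from the unbiased sigmoid probabilities,
// renormalized over the selected experts.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<float> logits = { 0.8f, -1.2f, 0.1f,  2.0f, -0.3f, 0.5f };
    const std::vector<float> bias   = { 0.0f,  1.5f, 0.0f, -2.5f,  0.0f, 0.0f }; // plays the role of exp_probs_b
    const int n_expert_used = 2;

    // sigmoid gating (LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID)
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = 1.0f / (1.0f + std::exp(-logits[i]));
    }

    // biased scores are used only for selection (ggml_top_k on selection_probs)
    std::vector<float> selection_probs(probs.size());
    for (size_t i = 0; i < probs.size(); ++i) {
        selection_probs[i] = probs[i] + bias[i];
    }
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return selection_probs[a] > selection_probs[b]; });

    // expert weights come from the unbiased probs, normalized over the top-k set
    float sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) { sum += probs[idx[k]]; }
    for (int k = 0; k < n_expert_used; ++k) {
        printf("expert %d: weight %.3f\n", idx[k], probs[idx[k]] / sum);
    }
    return 0;
}
```

With the bias values above, expert 1 wins a top-k slot even though its unbiased probability is small, and expert 3 is pushed out despite having the largest probability; that is exactly why the selection scores and the weighting probabilities are kept separate.
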
@@ -10970,9 +11032,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
         }
@@ -11461,9 +11525,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -11602,9 +11668,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -12732,9 +12800,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -14726,9 +14796,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -15123,9 +15195,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -15338,9 +15412,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        model.layers[il].ffn_exp_probs_b,
                         n_expert, n_expert_used,
-                        LLM_FFN_SILU, false,
-                        true, hparams.expert_weights_scale,
+                        LLM_FFN_SILU, hparams.expert_weights_norm,
+                        false, 0.0,
+                        (enum llama_expert_gating_func_type) hparams.expert_gating_func,
                         cb, il);
             cb(moe_out, "ffn_moe_out", il);
 
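
For reference, the two new hyperparameter keys follow the `%s.`-prefixed naming added earlier, so in a converted GGUF they appear under the architecture prefix; treating the `deepseek2` prefix and the value types (bool and u32) as assumptions inferred from the loader code, the standalone sketch below dumps them with the plain gguf C API. Both keys are optional: as the hparams hunk shows, the loader falls back to softmax gating when `expert_gating_func` is absent.

```cpp
// Illustrative sketch only (not part of this change): inspect the new
// expert-gating metadata in a converted GGUF file.
// Assumptions: the keys live under the "deepseek2" architecture prefix and
// expert_gating_func is stored as a u32 llama_expert_gating_func_type value.
#include <cstdint>
#include <cstdio>

#include "gguf.h" // older ggml trees declare the gguf API in ggml.h instead

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to read %s\n", argv[1]);
        return 1;
    }

    const int64_t k_norm = gguf_find_key(ctx, "deepseek2.expert_weights_norm");
    const int64_t k_func = gguf_find_key(ctx, "deepseek2.expert_gating_func");

    if (k_norm >= 0) {
        printf("expert_weights_norm = %s\n", gguf_get_val_bool(ctx, k_norm) ? "true" : "false");
    } else {
        printf("expert_weights_norm = <missing>\n"); // loader defaults to false
    }

    if (k_func >= 0) {
        const uint32_t v = gguf_get_val_u32(ctx, k_func);
        printf("expert_gating_func  = %u (%s)\n", v, v == 2 ? "sigmoid" : v == 1 ? "softmax" : "none");
    } else {
        printf("expert_gating_func  = <missing>\n"); // loader falls back to softmax gating
    }

    gguf_free(ctx);
    return 0;
}
```
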