@@ -286,10 +286,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
-    LLM_KV_SSM_D_INNER,
-    LLM_KV_SSM_D_CONV,
-    LLM_KV_SSM_D_STATE,
-    LLM_KV_SSM_DT_RANK,
+    LLM_KV_SSM_INNER_SIZE,
+    LLM_KV_SSM_CONV_KERNEL,
+    LLM_KV_SSM_STATE_SIZE,
+    LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
@@ -349,10 +349,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned" },
 
-    { LLM_KV_SSM_D_CONV,                    "%s.ssm.d_conv"  },
-    { LLM_KV_SSM_D_INNER,                   "%s.ssm.d_inner" },
-    { LLM_KV_SSM_D_STATE,                   "%s.ssm.d_state" },
-    { LLM_KV_SSM_DT_RANK,                   "%s.ssm.dt_rank" },
+    { LLM_KV_SSM_CONV_KERNEL,               "%s.ssm.conv_kernel"    },
+    { LLM_KV_SSM_INNER_SIZE,                "%s.ssm.inner_size"     },
+    { LLM_KV_SSM_STATE_SIZE,                "%s.ssm.state_size"     },
+    { LLM_KV_SSM_TIME_STEP_RANK,            "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"  },
     { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens" },
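
For context: the "%s" in each format string above is filled in with the architecture name, so after this rename a Mamba GGUF carries keys such as mamba.ssm.conv_kernel rather than mamba.ssm.d_conv. Below is a minimal sketch of that expansion; the "mamba" prefix is an assumption about the usual per-architecture prefix and is not shown in this hunk.

    // Illustrative sketch only: expand the renamed format strings by hand.
    #include <cstdio>

    int main() {
        const char * arch = "mamba"; // assumed architecture prefix
        const char * fmts[] = {
            "%s.ssm.conv_kernel",    // was "%s.ssm.d_conv"
            "%s.ssm.inner_size",     // was "%s.ssm.d_inner"
            "%s.ssm.state_size",     // was "%s.ssm.d_state"
            "%s.ssm.time_step_rank", // was "%s.ssm.dt_rank"
        };
        for (const char * fmt : fmts) {
            char key[128];
            snprintf(key, sizeof(key), fmt, arch);
            printf("%s\n", key); // e.g. "mamba.ssm.conv_kernel"
        }
        return 0;
    }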
@@ -3599,10 +3599,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_MAMBA:
             {
-                ml.get_key(LLM_KV_SSM_D_CONV,  hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_D_INNER, hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_D_STATE, hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_DT_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
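
These get_key() calls load the renamed metadata into the unchanged hparams fields (ssm_d_conv, ssm_d_inner, ssm_d_state, ssm_dt_rank), so only the serialized key names change, not the internal names. As a cross-check, the same keys can be read with ggml's public gguf C API; the sketch below assumes a placeholder file name, a "mamba." prefix, and that the values were written as 32-bit unsigned integers (the usual convention for these fields).

    // Minimal sketch: read the renamed SSM keys straight from a GGUF file.
    #include "ggml.h" // the gguf_* declarations lived here at the time of this change

    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        struct gguf_context * gctx = gguf_init_from_file("mamba-model.gguf", params);
        if (!gctx) {
            fprintf(stderr, "failed to open GGUF file\n");
            return 1;
        }

        const char * keys[] = {
            "mamba.ssm.conv_kernel",
            "mamba.ssm.inner_size",
            "mamba.ssm.state_size",
            "mamba.ssm.time_step_rank",
        };
        for (const char * key : keys) {
            const int id = gguf_find_key(gctx, key);
            if (id >= 0) {
                printf("%s = %u\n", key, gguf_get_val_u32(gctx, id));
            } else {
                printf("%s missing\n", key);
            }
        }

        gguf_free(gctx);
        return 0;
    }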
@@ -4864,8 +4864,15 @@ static bool llm_load_tensors(
 
                 // output
                 {
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
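
The last hunk makes the output (lm_head) tensor optional: the trailing false passed to create_tensor is understood here to mean "not required". When the tensor is absent, the loader reuses the token-embedding matrix as the output head (tied embeddings); n_created is decremented because the reference is not a newly created tensor, and size_data grows because, per the comment, the data is duplicated so it can be offloaded. A small sketch of checking from the outside whether a given file ships a separate head; the tensor names "output.weight" and "token_embd.weight" follow the usual GGUF conventions, and the file name is a placeholder.

    // Sketch: does this GGUF carry a dedicated output head, or will the loader
    // fall back to the tied token embeddings as in the hunk above?
    #include "ggml.h"

    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        struct gguf_context * gctx = gguf_init_from_file("mamba-model.gguf", params);
        if (!gctx) {
            return 1;
        }

        if (gguf_find_tensor(gctx, "output.weight") < 0) {
            printf("tied embeddings: output head will be duplicated from token_embd.weight\n");
        } else {
            printf("dedicated output.weight tensor present\n");
        }

        gguf_free(gctx);
        return 0;
    }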