Commit 9a9de40

fix: Only allocate attention cache for attention layers (not non-recurrent)
https://github.com/ggml-org/llama.cpp/issues/nemotron-nano-15409

Branch: gabe-l-hart/nvidia-nemotron-nano-15409

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 37c42c9 commit 9a9de40

File tree: 1 file changed (+19, -2 lines)


src/llama-model.cpp

Lines changed: 19 additions & 2 deletions
@@ -18316,6 +18316,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         std::max((uint32_t) 1, cparams.n_seq_max),
                         cparams.n_seq_max);
                 } else if (llm_arch_is_hybrid(arch)) {
+
+                    // The main difference between hybrid architectures is the
+                    // layer filters, so pick the right one here
+                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+                    if (arch == LLM_ARCH_FALCON_H1) {
+                        filter_attn = [&](int32_t) { return true; };
+                        filter_recr = [&](int32_t) { return true; };
+                    } else if (arch == LLM_ARCH_NEMOTRONH) {
+                        filter_attn = [&](int32_t il) {
+                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                        };
+                        filter_recr = [&](int32_t il) {
+                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                        };
+                    }
+
                     const auto padding = llama_kv_cache::get_padding(cparams);

                     cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18335,8 +18352,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             /* n_seq_max */ cparams.n_seq_max,
                             /* offload */ cparams.offload_kqv,
                             /* unified */ cparams.kv_unified,
-                            /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
-                            /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
+                            /* filter_attn */ std::move(filter_attn),
+                            /* filter_recr */ std::move(filter_recr));
                 } else {
                     const auto padding = llama_kv_cache::get_padding(cparams);
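Background note (not part of the commit): the standalone sketch below illustrates how layer-filter callbacks of this shape partition a hybrid layer stack, so attention KV cells are allocated only for attention layers and recurrent state only for recurrent layers. The local layer_filter_cb alias mirrors llama_memory_hybrid::layer_filter_cb, and the hparams_stub layer layout is a made-up example, not NemotronH's actual configuration.

// Standalone illustration only -- not llama.cpp source. The 6-layer layout in
// hparams_stub (Mamba, Mamba, Attention, MLP, Mamba, Attention) is hypothetical.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// Mirrors the shape of llama_memory_hybrid::layer_filter_cb: returns true if
// layer `il` should be covered by the cache being constructed.
using layer_filter_cb = std::function<bool(int32_t)>;

struct hparams_stub {
    std::vector<bool>     recurrent = {true, true, false, false, true, false};
    std::vector<uint32_t> ff        = {0,    0,    0,     4096,  0,    0};

    bool     is_recurrent(int32_t il) const { return recurrent[il]; }
    uint32_t n_ff(int32_t il) const { return ff[il]; }
};

int main() {
    hparams_stub hparams;

    // Same shape as the NemotronH filters in this commit: attention cells only
    // for non-recurrent layers with no FFN, recurrent state only for recurrent layers.
    layer_filter_cb filter_attn = [&](int32_t il) {
        return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
    };
    layer_filter_cb filter_recr = [&](int32_t il) {
        return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
    };

    for (int32_t il = 0; il < (int32_t) hparams.recurrent.size(); ++il) {
        std::printf("layer %d: attn_cache=%d recr_cache=%d\n",
                    il, (int) filter_attn(il), (int) filter_recr(il));
    }
    return 0;
}

In this example the MLP-only layer (index 3, with n_ff > 0) ends up in neither cache; per the commit title, the point of the change is that attention cells are now allocated only for true attention layers rather than for every non-recurrent layer.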