@@ -101,8 +101,8 @@ llama_context::llama_context(
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
-    cparams.op_offload   = params.op_offload;
-    cparams.attn_streams = params.attn_streams;
+    cparams.op_offload = params.op_offload;
+    cparams.kv_unified = params.kv_unified;
 
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
@@ -113,7 +113,7 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
     LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
-    LLAMA_LOG_INFO("%s: attn_streams  = %s\n",   __func__, cparams.attn_streams ? "true" : "false");
+    LLAMA_LOG_INFO("%s: kv_unified    = %s\n",   __func__, cparams.kv_unified ? "true" : "false");
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
 
@@ -269,7 +269,7 @@ llama_context::llama_context(
 
     // reserve worst-case graph
     if (!hparams.vocab_only && memory) {
-        const uint32_t n_seqs   = cparams.attn_streams ? cparams.n_seq_max : 1;
+        const uint32_t n_seqs   = cparams.kv_unified ? 1 : cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
@@ -481,7 +481,7 @@ bool llama_context::kv_self_update(bool optimize) {
            throw std::runtime_error("failed to initialize memory context");
        }
 
-        const uint32_t n_seqs   = cparams.attn_streams ? cparams.n_seq_max : 1;
+        const uint32_t n_seqs   = cparams.kv_unified ? 1 : cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -740,7 +740,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const int64_t n_embd = hparams.n_embd;
 
     // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.attn_streams ? cparams.n_seq_max : LLAMA_MAX_SEQ, true)) {
+    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -907,7 +907,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;
 
-    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.attn_streams ? cparams.n_seq_max : LLAMA_MAX_SEQ, output_all)) {
+    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -2036,7 +2036,7 @@ void llama_context::opt_epoch_iter(
             batch.logits [pos_batch] = true;
         }
 
-        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.attn_streams ? cparams.n_seq_max : LLAMA_MAX_SEQ, true)) {
+        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
             LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
             return;
         }
@@ -2195,7 +2195,7 @@ llama_context_params llama_context_default_params() {
         /*.no_perf      =*/ true,
         /*.op_offload   =*/ true,
         /*.swa_full     =*/ true,
-        /*.attn_streams =*/ false,
+        /*.kv_unified   =*/ true,
     };
 
     return result;
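
For orientation, a minimal caller-side sketch (not part of this commit) of what the rename means for users of the public API: the old attn_streams flag is replaced by kv_unified with the opposite sense, so per-sequence KV streams are now the kv_unified = false case, and the default flips to a unified cache. The entry points used below (llama_context_default_params, llama_init_from_model) are assumed from llama.h; "model" is a previously loaded llama_model *.

    #include "llama.h"

    // sketch: build a context with per-sequence KV streams after the rename
    static llama_context * make_ctx(llama_model * model) {
        llama_context_params params = llama_context_default_params();

        params.n_seq_max  = 4;     // four independent sequences
        params.kv_unified = false; // split KV cache, i.e. what the old
                                   // attn_streams = true used to request

        // with kv_unified = true (the new default) the worst-case graph is
        // reserved with n_seqs = 1; with false it is reserved with n_seq_max
        return llama_init_from_model(model, params);
    }

This matches the flipped ternaries in the diff: everywhere the code previously read cparams.attn_streams ? cparams.n_seq_max : X, it now reads cparams.kv_unified ? X : cparams.n_seq_max.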