Merged
Changes from all commits
9 changes: 0 additions & 9 deletions ggml/src/ggml-cann/common.h
@@ -374,7 +374,6 @@ struct ggml_backend_cann_context {
#endif
cann_task_queue task_queue;
bool async_mode;
bool support_set_rows;
// Rope Cache
void* rope_init_ptr = nullptr;
void* rope_sin_ptr = nullptr;
@@ -400,14 +399,6 @@ struct ggml_backend_cann_context {
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
device, async_mode ? "ON" : "OFF");

support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");

if (!support_set_rows) {
GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
"Falling back to eager mode.\n", __func__);
}
}

/**
5 changes: 0 additions & 5 deletions ggml/src/ggml-cann/ggml-cann.cpp
@@ -2251,11 +2251,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(
bool use_cann_graph = true;
bool cann_graph_update_required = false;

// check environment LLAMA_SET_ROWS
if (!cann_ctx->support_set_rows) {
use_cann_graph = false;
}

if (use_cann_graph) {
if (cann_ctx->cann_graph == nullptr) {
cann_ctx->cann_graph.reset(new ggml_cann_graph());
22 changes: 0 additions & 22 deletions src/llama-context.cpp
@@ -102,16 +102,6 @@ llama_context::llama_context(
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;

{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;

if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
cparams.kv_unified = true;
}
}

{
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
@@ -890,12 +880,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}

if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}

// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;
@@ -1226,12 +1210,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();

if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}

return 0;
}

4 changes: 0 additions & 4 deletions src/llama-context.h
@@ -283,10 +283,6 @@ struct llama_context {

bool has_evaluated_once = false;

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;

// env: LLAMA_GRAPH_REUSE_DISABLE
bool graph_reuse_disable = false;

4 changes: 0 additions & 4 deletions src/llama-graph.cpp
@@ -314,8 +314,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= mctx->get_supports_set_rows(); // TODO: tmp

return res;
}

@@ -350,8 +348,6 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp

return res;
}

104 changes: 20 additions & 84 deletions src/llama-kv-cache.cpp
@@ -197,18 +197,6 @@ llama_kv_cache::llama_kv_cache(

const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;

const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;

if (!supports_set_rows) {
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
}

if (!supports_set_rows) {
LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
}
}

void llama_kv_cache::clear(bool data) {
@@ -551,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
bool success = true;

for (const auto & ubatch : ubatches) {
// non-continuous slots require support for ggml_set_rows()
const bool cont = supports_set_rows ? false : true;

// only find a suitable slot for the ubatch. don't modify the cells yet
const auto sinfo_new = find_slot(ubatch, cont);
const auto sinfo_new = find_slot(ubatch, true);
Member Author: This should have been false - fixing ..

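A minimal sketch of the follow-up that comment describes (hypothetical, not part of this diff), assuming find_slot(ubatch, cont) keeps its two-argument form: with the LLAMA_SET_ROWS gate removed, ggml_set_rows() is always used, so the slot search should not require a contiguous slot:

```cpp
// hypothetical follow-up per the comment above: ggml_set_rows() handles
// non-contiguous slots, so do not force a contiguous slot here
const auto sinfo_new = find_slot(ubatch, /*cont =*/ false);
```
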
if (sinfo_new.empty()) {
success = false;
break;
@@ -976,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
return result;
}

bool llama_kv_cache::get_supports_set_rows() const {
return supports_set_rows;
}

ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
const int32_t ikv = map_layer_ids.at(il);

@@ -1033,36 +1014,26 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
}

ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
GGML_UNUSED(sinfo);

const int32_t ikv = map_layer_ids.at(il);

auto * k = layers[ikv].k;

const int64_t n_embd_k_gqa = k->ne[0];
const int64_t n_tokens = k_cur->ne[2];

k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);

if (k_idxs && supports_set_rows) {
if (k->ne[2] > 1) {
k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
}

return ggml_set_rows(ctx, k, k_cur, k_idxs);
if (k->ne[2] > 1) {
k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
}

// TODO: fallback to old ggml_cpy() method for backwards compatibility
// will be removed when ggml_set_rows() is adopted by all backends

GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");

ggml_tensor * k_view = ggml_view_1d(ctx, k,
n_tokens*n_embd_k_gqa,
ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());

return ggml_cpy(ctx, k_cur, k_view);
return ggml_set_rows(ctx, k, k_cur, k_idxs);
}

ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
GGML_UNUSED(sinfo);

const int32_t ikv = map_layer_ids.at(il);

auto * v = layers[ikv].v;
@@ -1072,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm

v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);

if (v_idxs && supports_set_rows) {
if (!v_trans) {
if (v->ne[2] > 1) {
v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
}

return ggml_set_rows(ctx, v, v_cur, v_idxs);
}

// [TAG_V_CACHE_VARIABLE]
if (n_embd_v_gqa < v->ne[0]) {
v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
if (!v_trans) {
if (v->ne[2] > 1) {
v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
}

// the row becomes a single element
ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);

v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);

return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
return ggml_set_rows(ctx, v, v_cur, v_idxs);
}

// TODO: fallback to old ggml_cpy() method for backwards compatibility
// will be removed when ggml_set_rows() is adopted by all backends

GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
// [TAG_V_CACHE_VARIABLE]
if (n_embd_v_gqa < v->ne[0]) {
v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
}

ggml_tensor * v_view = nullptr;
// the row becomes a single element
ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);

if (!v_trans) {
v_view = ggml_view_1d(ctx, v,
n_tokens*n_embd_v_gqa,
ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
} else {
v_cur = ggml_transpose(ctx, v_cur);
v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);

v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
(v->ne[1] )*ggml_element_size(v),
(sinfo.head())*ggml_element_size(v));
}

return ggml_cpy(ctx, v_cur, v_view);
return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
}

ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
@@ -1143,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
}

void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
if (!supports_set_rows) {
return;
}

const uint32_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());

Expand All @@ -1163,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
}

void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
if (!supports_set_rows) {
return;
}

const uint32_t n_tokens = ubatch->n_tokens;
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());

@@ -2004,10 +1944,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
return n_kv;
}

bool llama_kv_cache_context::get_supports_set_rows() const {
return kv->get_supports_set_rows();
}

ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
}
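
Since every K/V write in this file now goes through ggml_set_rows(), here is a minimal conceptual sketch of the row-scatter semantics that cpy_k/cpy_v rely on (plain C++, not the ggml implementation): each source row is written to the destination row named by an index vector, which is what makes non-contiguous KV slots workable:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Conceptual stand-in for the row scatter performed by ggml_set_rows(dst, src, idxs):
// copy row r of `src` (n_rows x n_cols) into row idxs[r] of `dst` (dst_rows x n_cols).
// Not the ggml implementation, a reference sketch only.
static void set_rows_ref(std::vector<float> & dst, int64_t dst_rows,
                         const std::vector<float> & src, int64_t n_rows,
                         int64_t n_cols, const std::vector<int64_t> & idxs) {
    (void) dst_rows; // indices are assumed to be in range for this sketch
    for (int64_t r = 0; r < n_rows; ++r) {
        const int64_t dr = idxs[r]; // destination row, e.g. a KV cell index
        for (int64_t c = 0; c < n_cols; ++c) {
            dst[dr*n_cols + c] = src[r*n_cols + c];
        }
    }
}

int main() {
    const int64_t n_cols = 4, dst_rows = 8, n_rows = 2;
    std::vector<float>   dst(dst_rows*n_cols, 0.0f); // the "cache": 8 rows
    std::vector<float>   src(n_rows*n_cols, 1.0f);   // 2 new rows to store
    std::vector<int64_t> idxs = {5, 2};              // non-contiguous destination cells
    set_rows_ref(dst, dst_rows, src, n_rows, n_cols, idxs);
    std::printf("row 2 starts with %.1f, row 5 starts with %.1f\n",
                (double) dst[2*n_cols], (double) dst[5*n_cols]);
    return 0;
}
```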
10 changes: 0 additions & 10 deletions src/llama-kv-cache.h
@@ -141,9 +141,6 @@ class llama_kv_cache : public llama_memory_i {

uint32_t get_n_kv(const slot_info & sinfo) const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -215,10 +212,6 @@ class llama_kv_cache : public llama_memory_i {
// env: LLAMA_KV_CACHE_DEBUG
int debug = 0;

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;

const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

std::vector<ggml_context_ptr> ctxs;
@@ -318,9 +311,6 @@ class llama_kv_cache_context : public llama_memory_context_i {

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;