From 6167c263c7c822c8f3512593d8e3a026ae0ee441 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Mon, 22 Jan 2024 03:02:40 -0600 Subject: [PATCH 1/4] Softmax exp & sum in one pass + temp returns if 1 --- llama.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 909ad4ad854c4..fb8e7b6a346fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7955,11 +7955,14 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c float max_l = candidates->data[0].logit; float cum_sum = 0.0f; + + // Calculate the exp and sum in one pass for (size_t i = 0; i < candidates->size; ++i) { - float p = expf(candidates->data[i].logit - max_l); - candidates->data[i].p = p; - cum_sum += p; + candidates->data[i].p = expf(candidates->data[i].logit - max_l); + cum_sum += candidates->data[i].p; } + + // Normalize the probabilities for (size_t i = 0; i < candidates->size; ++i) { candidates->data[i].p /= cum_sum; } @@ -8178,6 +8181,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); + if (temp == 1.0f) { + return; // No adjustment needed as dividing by 1 leaves the values unchanged + } + for (size_t i = 0; i < candidates_p->size; ++i) { candidates_p->data[i].logit /= temp; } From 4779d994fc3d9d66014c99d159fa6385726c3044 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Mon, 22 Jan 2024 03:58:59 -0600 Subject: [PATCH 2/4] tiny min p return check tweak Number of sampleable candidates can never be zero in the first place --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index fb8e7b6a346fc..59ab8f8682c6d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8030,7 +8030,7 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can } void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { - if (p <= 0.0f || !candidates->size) { + if (p <= 0.0f) { return; } From feea528addddcc8633b9a2d544dd55735b67a1b8 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Mon, 22 Jan 2024 05:11:50 -0600 Subject: [PATCH 3/4] Standardize top_k sorting --- llama.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index 59ab8f8682c6d..e128cea6f30a6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7973,23 +7973,26 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c } void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) { - const int64_t t_start_sample_us = ggml_time_us(); + if (candidates->sorted) { + candidates->size = k; + return; + } + const int64_t t_start_sample_us = ggml_time_us(); + k = std::max(k, (int) min_keep); k = std::min(k, (int) candidates->size); // Sort scores in descending order - if (!candidates->sorted) { - auto comp = [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }; - if (k == (int) candidates->size) { - std::sort(candidates->data, candidates->data + candidates->size, comp); - } else { - std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); - } - candidates->sorted = true; + auto comp = [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }; + if (k == (int) candidates->size) { + std::sort(candidates->data, candidates->data + candidates->size, comp); + } else { + std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); } + candidates->sorted = true; candidates->size = k; if (ctx) { From bbb578b09d1adc4e59739a2e5daeb5b531a8758f Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Mon, 22 Jan 2024 05:24:54 -0600 Subject: [PATCH 4/4] Capture softmax operations for sampler profiling --- llama.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index e128cea6f30a6..4e912e84ca0b0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8005,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can return; } - llama_sample_softmax(ctx, candidates); - const int64_t t_start_sample_us = ggml_time_us(); + llama_sample_softmax(ctx, candidates); + // Compute the cumulative probabilities float cum_sum = 0.0f; size_t last_idx = candidates->size; @@ -8036,11 +8036,11 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can if (p <= 0.0f) { return; } + + const int64_t t_start_sample_us = ggml_time_us(); llama_sample_softmax(ctx, candidates); - const int64_t t_start_sample_us = ggml_time_us(); - float scale = candidates->data[0].p; // scale by max prob size_t i = 1; // first token always matches @@ -8063,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * return; } - llama_sample_softmax(nullptr, candidates); const int64_t t_start_sample_us = ggml_time_us(); + llama_sample_softmax(nullptr, candidates); + // Compute the first and second derivatives std::vector first_derivatives(candidates->size - 1); std::vector second_derivatives(candidates->size - 2); @@ -8124,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c return; } + const int64_t t_start_sample_us = ggml_time_us(); + // Compute the softmax of logits and calculate entropy llama_sample_softmax(nullptr, candidates); - const int64_t t_start_sample_us = ggml_time_us(); - float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { entropy += -candidates->data[i].p * logf(candidates->data[i].p);