diff --git a/llama.cpp b/llama.cpp
index 909ad4ad854c4..4e912e84ca0b0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7955,11 +7955,14 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
     float max_l = candidates->data[0].logit;
     float cum_sum = 0.0f;
+
+    // Calculate the exp and sum in one pass
     for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
-        cum_sum += p;
+        candidates->data[i].p = expf(candidates->data[i].logit - max_l);
+        cum_sum += candidates->data[i].p;
     }
+
+    // Normalize the probabilities
     for (size_t i = 0; i < candidates->size; ++i) {
         candidates->data[i].p /= cum_sum;
     }
 
@@ -7970,23 +7973,26 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
 }
 
 void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
     k = std::max(k, (int) min_keep);
     k = std::min(k, (int) candidates->size);
 
+    if (candidates->sorted) {
+        candidates->size = k;
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Sort scores in descending order
-    if (!candidates->sorted) {
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k == (int) candidates->size) {
-            std::sort(candidates->data, candidates->data + candidates->size, comp);
-        } else {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
-        }
-        candidates->sorted = true;
+    auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    };
+    if (k == (int) candidates->size) {
+        std::sort(candidates->data, candidates->data + candidates->size, comp);
+    } else {
+        std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
     }
+    candidates->sorted = true;
     candidates->size = k;
 
     if (ctx) {
@@ -7999,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(ctx, candidates);
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -8027,14 +8033,14 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
 }
 
 void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
     if (p <= 0.0f || !candidates->size) {
         return;
     }
+
+    const int64_t t_start_sample_us = ggml_time_us();
 
     llama_sample_softmax(ctx, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float scale = candidates->data[0].p; // scale by max prob
     size_t i = 1; // first token always matches
 
@@ -8057,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    llama_sample_softmax(nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(nullptr, candidates);
+
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
     std::vector<float> second_derivatives(candidates->size - 2);
@@ -8118,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -8178,6 +8185,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
 
 void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();
+    if (temp == 1.0f) {
+        return; // No adjustment needed as dividing by 1 leaves the values unchanged
+    }
+
     for (size_t i = 0; i < candidates_p->size; ++i) {
         candidates_p->data[i].logit /= temp;
     }