From 6167c263c7c822c8f3512593d8e3a026ae0ee441 Mon Sep 17 00:00:00 2001
From: kalomaze <66376113+kalomaze@users.noreply.github.com>
Date: Mon, 22 Jan 2024 03:02:40 -0600
Subject: [PATCH 1/4] Softmax exp & sum in one pass + temp sampler returns early if temp == 1

---
 llama.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 909ad4ad854c4..fb8e7b6a346fc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7955,11 +7955,14 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
 
     float max_l = candidates->data[0].logit;
     float cum_sum = 0.0f;
+    
+    // Calculate the exp and sum in one pass
     for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
-        cum_sum += p;
+        candidates->data[i].p = expf(candidates->data[i].logit - max_l);
+        cum_sum += candidates->data[i].p;
     }
+    
+    // Normalize the probabilities
     for (size_t i = 0; i < candidates->size; ++i) {
         candidates->data[i].p /= cum_sum;
     }
@@ -8178,6 +8181,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
 void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();
 
+    if (temp == 1.0f) {
+        return; // No adjustment needed as dividing by 1 leaves the values unchanged
+    }
+
     for (size_t i = 0; i < candidates_p->size; ++i) {
         candidates_p->data[i].logit /= temp;
     }

From 4779d994fc3d9d66014c99d159fa6385726c3044 Mon Sep 17 00:00:00 2001
From: kalomaze <66376113+kalomaze@users.noreply.github.com>
Date: Mon, 22 Jan 2024 03:58:59 -0600
Subject: [PATCH 2/4] Tiny min_p early-return check tweak

Drop the `!candidates->size` check: the number of sampleable candidates can never be zero in the first place.
---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index fb8e7b6a346fc..59ab8f8682c6d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8030,7 +8030,7 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
 }
 
 void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p <= 0.0f || !candidates->size) {
+    if (p <= 0.0f) {
         return;
     }
 

From feea528addddcc8633b9a2d544dd55735b67a1b8 Mon Sep 17 00:00:00 2001
From: kalomaze <66376113+kalomaze@users.noreply.github.com>
Date: Mon, 22 Jan 2024 05:11:50 -0600
Subject: [PATCH 3/4] Standardize top_k sorting

---
 llama.cpp | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 59ab8f8682c6d..e128cea6f30a6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7973,23 +7973,26 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
 }
 
 void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    const int64_t t_start_sample_us = ggml_time_us();
+    if (candidates->sorted) {
+       candidates->size = k;
+       return;
+    }
 
+    const int64_t t_start_sample_us = ggml_time_us();
+    
     k = std::max(k, (int) min_keep);
     k = std::min(k, (int) candidates->size);
 
     // Sort scores in descending order
-    if (!candidates->sorted) {
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k == (int) candidates->size) {
-            std::sort(candidates->data, candidates->data + candidates->size, comp);
-        } else {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
-        }
-        candidates->sorted = true;
+    auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    };
+    if (k == (int) candidates->size) {
+        std::sort(candidates->data, candidates->data + candidates->size, comp);
+    } else {
+        std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
     }
+    candidates->sorted = true;
     candidates->size = k;
 
     if (ctx) {

From bbb578b09d1adc4e59739a2e5daeb5b531a8758f Mon Sep 17 00:00:00 2001
From: kalomaze <66376113+kalomaze@users.noreply.github.com>
Date: Mon, 22 Jan 2024 05:24:54 -0600
Subject: [PATCH 4/4] Capture softmax operations for sampler profiling

---
 llama.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e128cea6f30a6..4e912e84ca0b0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8005,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(ctx, candidates);
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -8036,11 +8036,11 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
     if (p <= 0.0f) {
         return;
     }
+    
+    const int64_t t_start_sample_us = ggml_time_us();
 
     llama_sample_softmax(ctx, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float scale = candidates->data[0].p; // scale by max prob
     size_t i = 1; // first token always matches
 
@@ -8063,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    llama_sample_softmax(nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(nullptr, candidates);
+
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
     std::vector<float> second_derivatives(candidates->size - 2);
@@ -8124,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);