-
Notifications
You must be signed in to change notification settings - Fork 13.3k
Small sampling optimizations #5074
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7955,11 +7955,14 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c | |
|
||
float max_l = candidates->data[0].logit; | ||
float cum_sum = 0.0f; | ||
|
||
// Calculate the exp and sum in one pass | ||
for (size_t i = 0; i < candidates->size; ++i) { | ||
float p = expf(candidates->data[i].logit - max_l); | ||
candidates->data[i].p = p; | ||
cum_sum += p; | ||
candidates->data[i].p = expf(candidates->data[i].logit - max_l); | ||
cum_sum += candidates->data[i].p; | ||
} | ||
|
||
// Normalize the probabilities | ||
for (size_t i = 0; i < candidates->size; ++i) { | ||
candidates->data[i].p /= cum_sum; | ||
} | ||
|
@@ -7970,23 +7973,26 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c | |
} | ||
|
||
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) { | ||
const int64_t t_start_sample_us = ggml_time_us(); | ||
if (candidates->sorted) { | ||
candidates->size = k; | ||
return; | ||
} | ||
|
||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
k = std::max(k, (int) min_keep); | ||
k = std::min(k, (int) candidates->size); | ||
|
||
// Sort scores in descending order | ||
if (!candidates->sorted) { | ||
auto comp = [](const llama_token_data & a, const llama_token_data & b) { | ||
return a.logit > b.logit; | ||
}; | ||
if (k == (int) candidates->size) { | ||
std::sort(candidates->data, candidates->data + candidates->size, comp); | ||
} else { | ||
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); | ||
} | ||
candidates->sorted = true; | ||
auto comp = [](const llama_token_data & a, const llama_token_data & b) { | ||
return a.logit > b.logit; | ||
}; | ||
if (k == (int) candidates->size) { | ||
std::sort(candidates->data, candidates->data + candidates->size, comp); | ||
} else { | ||
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); | ||
} | ||
candidates->sorted = true; | ||
candidates->size = k; | ||
|
||
if (ctx) { | ||
|
@@ -7999,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can | |
return; | ||
} | ||
|
||
llama_sample_softmax(ctx, candidates); | ||
|
||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
llama_sample_softmax(ctx, candidates); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment about counting the time 2 times |
||
|
||
// Compute the cumulative probabilities | ||
float cum_sum = 0.0f; | ||
size_t last_idx = candidates->size; | ||
|
@@ -8027,14 +8033,14 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can | |
} | ||
|
||
void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { | ||
if (p <= 0.0f || !candidates->size) { | ||
if (p <= 0.0f) { | ||
return; | ||
} | ||
|
||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
llama_sample_softmax(ctx, candidates); | ||
|
||
Comment on lines
+8039
to
8043
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will count the |
||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
float scale = candidates->data[0].p; // scale by max prob | ||
size_t i = 1; // first token always matches | ||
|
||
|
@@ -8057,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * | |
return; | ||
} | ||
|
||
llama_sample_softmax(nullptr, candidates); | ||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
llama_sample_softmax(nullptr, candidates); | ||
|
||
// Compute the first and second derivatives | ||
std::vector<float> first_derivatives(candidates->size - 1); | ||
std::vector<float> second_derivatives(candidates->size - 2); | ||
|
@@ -8118,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c | |
return; | ||
} | ||
|
||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
// Compute the softmax of logits and calculate entropy | ||
llama_sample_softmax(nullptr, candidates); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is OK because it does not take a context |
||
|
||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
float entropy = 0.0f; | ||
for (size_t i = 0; i < candidates->size; ++i) { | ||
entropy += -candidates->data[i].p * logf(candidates->data[i].p); | ||
|
@@ -8178,6 +8185,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c | |
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { | ||
const int64_t t_start_sample_us = ggml_time_us(); | ||
|
||
if (temp == 1.0f) { | ||
return; // No adjustment needed as dividing by 1 leaves the values unchanged | ||
} | ||
Comment on lines
8186
to
+8190
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should make it consistent and return before |
||
|
||
for (size_t i = 0; i < candidates_p->size; ++i) { | ||
candidates_p->data[i].logit /= temp; | ||
} | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm realizing now that these sanity checks will not run if we return early. Does that matter at all / should I revert the changes of the "Standardize top k" commit (I assume they are not actually necessary)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think they need to remain — simply move them before the "if sorted" early return at the top.