Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 33 additions & 22 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7955,11 +7955,14 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c

float max_l = candidates->data[0].logit;
float cum_sum = 0.0f;

// Calculate the exp and sum in one pass
for (size_t i = 0; i < candidates->size; ++i) {
float p = expf(candidates->data[i].logit - max_l);
candidates->data[i].p = p;
cum_sum += p;
candidates->data[i].p = expf(candidates->data[i].logit - max_l);
cum_sum += candidates->data[i].p;
}

// Normalize the probabilities
for (size_t i = 0; i < candidates->size; ++i) {
candidates->data[i].p /= cum_sum;
}
Expand All @@ -7970,23 +7973,26 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
}

void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
const int64_t t_start_sample_us = ggml_time_us();
if (candidates->sorted) {
candidates->size = k;
return;
}

const int64_t t_start_sample_us = ggml_time_us();

k = std::max(k, (int) min_keep);
k = std::min(k, (int) candidates->size);
Copy link
Contributor Author

@kalomaze kalomaze Jan 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm realizing now that these sanity checks will not run if we return early. Does that matter at all / should I revert the changes of the "Standardize top k" commit (I assume they are not actually necessary)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they need to remain - simply move them before the "if sorted" early return at the top


// Sort scores in descending order
if (!candidates->sorted) {
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (k == (int) candidates->size) {
std::sort(candidates->data, candidates->data + candidates->size, comp);
} else {
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
}
candidates->sorted = true;
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (k == (int) candidates->size) {
std::sort(candidates->data, candidates->data + candidates->size, comp);
} else {
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
}
candidates->sorted = true;
candidates->size = k;

if (ctx) {
Expand All @@ -7999,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
return;
}

llama_sample_softmax(ctx, candidates);

const int64_t t_start_sample_us = ggml_time_us();

llama_sample_softmax(ctx, candidates);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment about counting the time twice


// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = candidates->size;
Expand All @@ -8027,14 +8033,14 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
}

void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
if (p <= 0.0f || !candidates->size) {
if (p <= 0.0f) {
return;
}

const int64_t t_start_sample_us = ggml_time_us();

llama_sample_softmax(ctx, candidates);

Comment on lines +8039 to 8043
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will count the llama_sample_softmax() time twice. Either change it to llama_sample_softmax(nullptr, candidates); or keep it before the ggml_time_us() call

const int64_t t_start_sample_us = ggml_time_us();

float scale = candidates->data[0].p; // scale by max prob
size_t i = 1; // first token always matches

Expand All @@ -8057,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
return;
}

llama_sample_softmax(nullptr, candidates);
const int64_t t_start_sample_us = ggml_time_us();

llama_sample_softmax(nullptr, candidates);

// Compute the first and second derivatives
std::vector<float> first_derivatives(candidates->size - 1);
std::vector<float> second_derivatives(candidates->size - 2);
Expand Down Expand Up @@ -8118,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
return;
}

const int64_t t_start_sample_us = ggml_time_us();

// Compute the softmax of logits and calculate entropy
llama_sample_softmax(nullptr, candidates);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is OK because it does not take a context


const int64_t t_start_sample_us = ggml_time_us();

float entropy = 0.0f;
for (size_t i = 0; i < candidates->size; ++i) {
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
Expand Down Expand Up @@ -8178,6 +8185,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
const int64_t t_start_sample_us = ggml_time_us();

if (temp == 1.0f) {
return; // No adjustment needed as dividing by 1 leaves the values unchanged
}
Comment on lines 8186 to +8190
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should make it consistent and return before ggml_time_us() like in the other calls


for (size_t i = 0; i < candidates_p->size; ++i) {
candidates_p->data[i].logit /= temp;
}
Expand Down