@@ -1905,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
19051905 return ;
19061906 }
19071907
1908- const int64_t t_start_sample_us = ggml_time_us ();
1909-
19101908 llama_sample_softmax (ctx, candidates);
19111909
1910+ const int64_t t_start_sample_us = ggml_time_us ();
1911+
19121912 // Compute the cumulative probabilities
19131913 float cum_sum = 0 .0f ;
19141914 size_t last_idx = candidates->size ;
@@ -1937,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
19371937 return ;
19381938 }
19391939
1940- const int64_t t_start_sample_us = ggml_time_us ();
1941-
19421940 llama_sample_softmax (nullptr , candidates);
1941+ const int64_t t_start_sample_us = ggml_time_us ();
19431942
19441943 // Compute the first and second derivatives
19451944 std::vector<float > first_derivatives (candidates->size - 1 );
@@ -1991,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
19911990 return ;
19921991 }
19931992
1994- const int64_t t_start_sample_us = ggml_time_us ();
1995-
19961993 // Compute the softmax of logits and calculate entropy
19971994 llama_sample_softmax (nullptr , candidates);
19981995
1996+ const int64_t t_start_sample_us = ggml_time_us ();
1997+
19991998 float entropy = 0 .0f ;
20001999 for (size_t i = 0 ; i < candidates->size ; ++i) {
20012000 entropy += -candidates->data [i].p * logf (candidates->data [i].p );
@@ -2164,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
21642163
21652164 if (ctx) {
21662165 ctx->t_sample_us += ggml_time_us () - t_start_sample_us;
2167- ctx->n_sample ++;
21682166 }
21692167 return X;
21702168}
21712169
21722170llama_token llama_sample_token_mirostat_v2 (struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
2173- assert (ctx);
21742171 int64_t t_start_sample_us;
21752172 t_start_sample_us = ggml_time_us ();
21762173
@@ -2185,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
21852182 candidates->size = 1 ;
21862183 }
21872184
2185+ if (ctx) {
2186+ ctx->t_sample_us += ggml_time_us () - t_start_sample_us;
2187+ }
2188+
21882189 // Normalize the probabilities of the remaining words
21892190 llama_sample_softmax (ctx, candidates);
21902191
21912192 // Sample the next word X from the remaining words
2192- if (ctx) {
2193- ctx->t_sample_us += ggml_time_us () - t_start_sample_us;
2194- }
21952193 llama_token X = llama_sample_token (ctx, candidates);
21962194 t_start_sample_us = ggml_time_us ();
21972195
0 commit comments