@@ -351,6 +351,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
     const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
     // This is needed as usual for LLaMA models
     const bool add_bos = is_spm;
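(The is_spm flag feeds the add_bos decision just above, so the added debug print makes it easy to confirm which tokenizer path a given model takes.)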
@@ -406,18 +407,30 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     double acc = 0.0f;
     const int n_vocab = llama_n_vocab(ctx);
 
+    std::vector<std::vector<int>> ending_tokens(4);
+
     std::vector<float> tok_logits(n_vocab);
 
     for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
         // Tokenize the context to count tokens
         std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
         size_t context_size = context_embd.size();
 
+        for (int i = 0; i < 4; ++i) {
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
+            for (int k = 0; k < int(context_size); ++k) {
+                if (ending_tokens[i][k] != context_embd[k]) {
+                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n", i, int(task_idx), k);
+                    break;
+                }
+            }
+        }
+
         // Do the 1st ending
         // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        auto query_embd = ending_tokens[0];
         auto query_size = query_embd.size();
-        //printf("First query: %d\n",(int)query_size);
 
         // Stop if query wont fit the ctx window
         if (query_size > (size_t)params.n_ctx) {
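The prefix check added in this hunk exists because SentencePiece-style tokenizers are not prefix-stable: tokenize(context + ending) does not necessarily begin with tokenize(context), since tokens can merge across the boundary between the two strings. Below is a minimal, self-contained sketch of the same check, with a hypothetical toy_tokenize standing in for ::llama_tokenize (the token values are made up for illustration):

#include <cstdio>
#include <vector>

// Hypothetical stand-in for ::llama_tokenize. It mimics an SPM-style merge:
// the last token of the context fuses with the start of the ending when the
// two strings are tokenized together.
static std::vector<int> toy_tokenize(bool joint) {
    if (joint) {
        return {1, 42, 77, 13}; // tokenize(context + ending): 99 and 7 merged into 77
    }
    return {1, 42, 99};         // tokenize(context) alone
}

int main() {
    const std::vector<int> context_embd  = toy_tokenize(false);
    const std::vector<int> ending_tokens = toy_tokenize(true);

    // Same check as in the diff: report the first position where the joint
    // tokenization stops matching the context-only tokenization.
    for (int k = 0; k < int(context_embd.size()); ++k) {
        if (ending_tokens[k] != context_embd[k]) {
            fprintf(stderr, "Oops: joint tokenization differs from context at position %d\n", k);
            break;
        }
    }
    return 0;
}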
@@ -462,7 +475,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
 
             // Tokenize the query
-            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
+            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
             query_size = query_embd.size();
 
             // Stop if query wont fit the ctx window
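With the endings tokenized jointly with the context, the per-ending query is simply the tail of ending_tokens[ending_idx] past context_size, so boundary merges are attributed to the right ending. A standalone sketch of that slice with toy token values (the names and values here are illustrative, not from the source; std::vector's range constructor would work equally well in place of resize + memcpy):

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    // Toy values: the joint tokenization of context + ending, and the
    // context tokenization it is assumed to start with as a strict prefix.
    const std::vector<int> joint_tokens = {1, 42, 99, 7, 13, 5}; // tokenize(context + ending)
    const size_t context_size = 3;                               // tokenize(context).size()

    // Slice off the ending's tokens exactly as the diff does.
    std::vector<int> query_embd(joint_tokens.size() - context_size);
    std::memcpy(query_embd.data(), joint_tokens.data() + context_size,
                query_embd.size()*sizeof(int));

    for (int t : query_embd) {
        printf("%d ", t); // prints: 7 13 5
    }
    printf("\n");
    return 0;
}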