Skip to content

Commit 4cfd4bb

Browse files
committed
Add tests
1 parent 17888b8 commit 4cfd4bb

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

tests/test-tokenizer-0.cpp

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
1414
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
1515
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
1616
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
17+
{ ">>>>ANSWER<<", { 1, 6778, 6778, 2190, 23066, 1001, 9314,}, },
1718
};
1819
return _k_tests;
1920
};
@@ -94,6 +95,38 @@ int main(int argc, char **argv) {
9495
}
9596
}
9697

98+
#if 0
99+
// how many tokens would not tokenize to themselves
100+
for (llama_token i = 1; i < llama_n_vocab(ctx); i++)
101+
{
102+
const char* str = llama_token_to_str(ctx, i);
103+
std::vector<llama_token> res(100);
104+
105+
const int n = llama_tokenize(ctx, str, res.data(), int(res.size()), false);
106+
res.resize(n);
107+
108+
for (const auto & t : res)
109+
{
110+
//if (t == 1) continue;
111+
112+
if (t != i) {
113+
fprintf(stderr, "%s : failed test: '%s'\n", __func__, str);
114+
fprintf(stderr, "%s : expected tokens: %d\n", __func__, i);
115+
fprintf(stderr, "%s : got tokens: ", __func__);
116+
for (const auto & t : res) {
117+
fprintf(stderr, "%6d, ", t);
118+
}
119+
for (const auto & t : res) {
120+
fprintf(stderr, "%s|", llama_token_to_str(ctx, t));
121+
}
122+
123+
fprintf(stderr, "\n");
124+
}
125+
}
126+
127+
}
128+
#endif
129+
97130
llama_free_model(model);
98131
llama_free(ctx);
99132

0 commit comments

Comments
 (0)