@@ -14,6 +14,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
14
14
{ " this is 🦙.cpp" , { 1 , 445 , 338 , 29871 , 243 , 162 , 169 , 156 , 29889 , 8223 , }, },
15
15
{ " w048 7tuijk dsdfhu" , { 1 , 29893 , 29900 , 29946 , 29947 , 29871 , 29955 , 9161 , 13535 , 18031 , 2176 , 6905 , }, },
16
16
{ " нещо на Български" , { 1 , 821 , 4851 , 665 , 1386 , 29713 , 1305 , }, },
17
+ { " >>>>ANSWER<<" , { 1 , 6778 , 6778 , 2190 , 23066 , 1001 , 9314 ,}, },
17
18
};
18
19
return _k_tests;
19
20
};
@@ -94,6 +95,38 @@ int main(int argc, char **argv) {
94
95
}
95
96
}
96
97
98
+ #if 0
99
+ // disabled diagnostic: report tokens whose string form does not tokenize back to themselves
100
+ for (llama_token i = 1; i < llama_n_vocab(ctx); i++)
101
+ {
102
+ const char* str = llama_token_to_str(ctx, i);
103
+ std::vector<llama_token> res(100);
104
+
105
+ const int n = llama_tokenize(ctx, str, res.data(), int(res.size()), false);
106
+ res.resize(n);
107
+
108
+ for (const auto & t : res)
109
+ {
110
+ //if (t == 1) continue;
111
+
112
+ if (t != i) {
113
+ fprintf(stderr, "%s : failed test: '%s'\n", __func__, str);
114
+ fprintf(stderr, "%s : expected tokens: %d\n", __func__, i);
115
+ fprintf(stderr, "%s : got tokens: ", __func__);
116
+ for (const auto & t : res) {
117
+ fprintf(stderr, "%6d, ", t);
118
+ }
119
+ for (const auto & t : res) {
120
+ fprintf(stderr, "%s|", llama_token_to_str(ctx, t));
121
+ }
122
+
123
+ fprintf(stderr, "\n");
124
+ }
125
+ }
126
+
127
+ }
128
+ #endif
129
+
97
130
llama_free_model (model);
98
131
llama_free (ctx);
99
132
0 commit comments