sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+ def bytes_to_unicode():
+     """
+     Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+     The reversible bpe codes work on unicode strings.
+     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+     This is a significant percentage of your normal, say, 32K bpe vocab.
+     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+     This also avoids mapping to whitespace/control characters that the bpe code barfs on.
+     """
+     bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+     cs = bs[:]
+     n = 0
+     for b in range(2**8):
+         if b not in bs:
+             bs.append(b)
+             cs.append(2**8 + n)
+             n += 1
+     return dict(zip(bs, (chr(n) for n in cs)))
+

def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
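
bytes_to_unicode() builds the standard GPT-2 byte-level table: the printable ranges map to themselves, and the remaining byte values are shifted past U+00FF so no byte ever turns into whitespace or a control character. A small sanity-check sketch (illustrative only, not part of the patch):

byte_encoder = bytes_to_unicode()
assert len(byte_encoder) == 256              # every byte value gets its own character
assert byte_encoder[ord("A")] == "A"         # printable ASCII maps to itself
assert byte_encoder[ord(" ")] == "\u0120"    # space becomes "Ġ", the familiar GPT-2 word prefix
assert byte_encoder[0] == "\u0100"           # non-printable bytes are shifted past U+00FF
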
@@ -107,8 +130,6 @@ def parse_args() -> argparse.Namespace:
print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
- scores: list[float] = []
- toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
@@ -124,15 +145,28 @@ def parse_args() -> argparse.Namespace:
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}

for i in range(vocab_size):
-     tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-     scores.append(0.0)  # dummy
-     toktypes.append(gguf.TokenType.NORMAL)
+     if i in reverse_vocab:
+         try:
+             text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+         except KeyError:
+             text = bytearray()
+             for c in reverse_vocab[i]:
+                 if ord(c) < 256:  # single byte character
+                     text.append(byte_decoder[ord(c)])
+                 else:  # multibyte special token character
+                     text.extend(c.encode('utf-8'))
+     else:
+         print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+         pad_token = f"[PAD{i}]".encode("utf8")
+         text = bytearray(pad_token)
+
+     tokens.append(text)

gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)
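
For context on the new loop: a GPT-2 style tokenizer stores each vocab entry in the bytes_to_unicode() alphabet, so every character has to be run back through byte_decoder to recover the raw bytes written into the GGUF token list. A minimal round-trip sketch (illustrative only; byte_decoder as defined in the patch):

token = "\u0120hello"                              # "Ġhello", i.e. " hello" as the tokenizer stores it
raw = bytearray([byte_decoder[c] for c in token])
assert bytes(raw) == b" hello"

# Added special tokens stored as plain unicode contain characters outside the 256-entry table
# and raise KeyError here, which is why the patch falls back to utf-8 encoding those characters.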