@@ -234,14 +234,21 @@ def load(model_plus: 'ModelPlus') -> 'Params':
 
 
 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+        self.vocabtype = vocabtype
+        if self.vocabtype == "bpe":
+          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+        else:
+          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens))
         else:
             added_tokens = {}
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        if self.vocabtype == "bpe":
+          vocab_size: int = len(self.sentencepiece_tokenizer)
+        else:
+          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
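For context, here is a rough standalone sketch of the two vocab sources the constructor now accepts (not part of the patch; file names are placeholders): a SentencePiece `tokenizer.model` reports its size through the processor, while a GPT-2-style `vocab.json` is just a token-to-id mapping whose length is the vocab size.

```python
# Illustrative only -- not part of the diff. Assumes the `sentencepiece`
# package and a hypothetical directory containing tokenizer.model / vocab.json.
import json
from sentencepiece import SentencePieceProcessor

spm = SentencePieceProcessor("tokenizer.model")    # binary SentencePiece model
print(spm.vocab_size())                            # size comes from the processor

bpe_vocab = json.loads(open("vocab.json").read())  # plain {token string: id} dict
print(len(bpe_vocab))                              # size is just the dict length
```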
@@ -255,22 +262,32 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
+        if self.vocabtype == "bpe":
+          from transformers.models.gpt2 import tokenization_gpt2
+          byte_encoder = tokenization_gpt2.bytes_to_unicode()
+          byte_decoder = {v: k for k, v in byte_encoder.items()}
+          for i, item in enumerate(tokenizer):
             text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            score: float = -i
             yield text, score
+        else:
+          for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
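The BPE branch above relies on GPT-2's byte-to-unicode trick: every raw byte of a token is represented by a printable unicode character in `vocab.json`, so inverting `bytes_to_unicode()` recovers the original bytes of each token string. A small sketch of that round trip (illustrative, not part of the patch; assumes the `transformers` package is installed and the sample token is hypothetical):

```python
from transformers.models.gpt2 import tokenization_gpt2

byte_encoder = tokenization_gpt2.bytes_to_unicode()     # {byte value -> unicode char}
byte_decoder = {v: k for k, v in byte_encoder.items()}  # {unicode char -> byte value}

token = "Ġhello"  # hypothetical vocab.json entry; "Ġ" encodes 0x20 (a leading space)
raw = bytes(byte_decoder[ch] for ch in token)
print(raw)        # b' hello'
```

Since a plain `vocab.json` carries no per-token scores, the patch assigns `score = -i`, so lower-id tokens simply rank higher.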
@@ -1196,14 +1213,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}
 
 
-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+    print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory. Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
-        path2 = path / "tokenizer.model"
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+          vocab_file = "vocab.json"
+        path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / "tokenizer.model"
+        path3 = path.parent / vocab_file
         if path2.exists():
             path = path2
         elif path3.exists():
@@ -1214,7 +1235,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+                              vocabtype)
 
 
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
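A hypothetical direct call showing how the extra argument changes file resolution (the path is a placeholder): with `vocabtype="bpe"` the loader looks for `vocab.json` instead of `tokenizer.model` in the given directory or its parent.

```python
# Illustrative only -- not part of the diff.
vocab = load_vocab(Path("/path/to/hf-model"), "bpe")  # resolves /path/to/hf-model/vocab.json
```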
@@ -1252,14 +1274,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path,
                         help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
     args = parser.parse_args(args_in)
 
     vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
     elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         assert args.outfile, "need --outfile if using --vocab-only"
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, vocab)
@@ -1273,7 +1296,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
             vocab = model_plus.vocab
         else:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-            vocab = load_vocab(vocab_dir)
+            vocab = load_vocab(vocab_dir, args.vocabtype)
         params = Params.load(model_plus)
         model = model_plus.model
         model = do_necessary_conversions(model, params)
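Since `main()` takes an explicit argument list, the new flag can be exercised end to end without a shell. A hedged usage sketch (paths and output name are placeholders; only the flags shown come from the diff):

```python
main(["/path/to/hf-model",    # positional `model` argument; also used as the vocab directory here
      "--vocab-only",
      "--vocabtype", "bpe",
      "--outfile", "ggml-vocab.bin"])
```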