@@ -373,6 +373,29 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
        except KeyError:
            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
    # used for GPT-2 BPE and WordPiece vocabs
    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        tokens: list[str] = []
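As a quick illustration of the heuristic above — a standalone sketch, not part of the patch, omitting the bytes/memoryview handling and the fullwidth-bar deepseek-coder variant; the example tokens are made up:

    def looks_special(token_text: str) -> bool:
        # literal names used by deepseek-coder and gemma{,-2}
        if token_text in ("<pad>", "<mask>", "<2mass>", "[@BOS@]"):
            return True
        # ChatML-style markers such as <|im_start|>
        if token_text.startswith("<|") and token_text.endswith("|>"):
            return True
        # gemma-style unused slots such as <unused0>
        if token_text.startswith("<unused") and token_text.endswith(">"):
            return True
        return False

    assert looks_special("<|im_start|>")   # classified as a control token
    assert looks_special("<unused12>")     # classified as a control token
    assert not looks_special("hello")      # remains a plain added token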
@@ -391,16 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        for i in range(vocab_size):
            if i not in reverse_vocab:
                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)

        return tokens, toktypes, tokpre

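The b"\xe2\x96\x81" byte string above is the UTF-8 encoding of "▁" (U+2581), the marker SentencePiece-style tokenizers use in place of a leading space; non-special user-defined tokens get it mapped back to a real space before being written. A tiny sketch of that pre-normalization (the token is hypothetical):

    sp_space = b"\xe2\x96\x81".decode("utf-8")   # "▁", the SentencePiece space marker
    token = sp_space + "Hello"                   # hypothetical user-defined token "▁Hello"
    print(token.replace(sp_space, " "))          # -> " Hello"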
@@ -559,7 +584,7 @@ def _set_vocab_qwen(self):
        for i in range(vocab_size):
            if i not in reverse_vocab:
                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
@@ -609,7 +634,7 @@ def _create_vocab_sentencepiece(self):

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(token_id)
@@ -644,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                    scores[token_id] = -1000.0
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+            added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+            for token_id, token_data in added_tokens_decoder.items():
+                token_id = int(token_id)
+                token: str = token_data["content"]
+                if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                    assert tokens[token_id] == token.encode("utf-8")
+                if token_data.get("special") or self.does_token_look_special(token):
+                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                else:
+                    token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                scores[token_id] = -1000.0
+                tokens[token_id] = token.encode("utf-8")
+
        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
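added_tokens_decoder in tokenizer_config.json maps token ids to their text and a "special" flag; the loop above trusts that flag but also falls back to does_token_look_special for tokens that merely look like control markers. A rough sketch of how two hypothetical entries would be classified (ids and contents are made up):

    # hypothetical excerpt of added_tokens_decoder from a tokenizer_config.json
    added_tokens_decoder = {
        "32000": {"content": "<|assistant|>", "special": False},
        "32001": {"content": "\u2581extra_token", "special": False},
    }

    for token_id, token_data in added_tokens_decoder.items():
        token = token_data["content"]
        looks_control = token.startswith("<|") and token.endswith("|>")
        kind = "CONTROL" if token_data.get("special") or looks_control else "USER_DEFINED"
        print(token_id, kind)
    # 32000 CONTROL       (looks like a control token despite special=False)
    # 32001 USER_DEFINED  (its "▁" would be pre-normalized to a space)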
@@ -1266,7 +1310,7 @@ def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
            self._set_vocab_qwen()

    def set_gguf_parameters(self):
@@ -1578,7 +1622,6 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])

        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)

        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1872,7 +1915,7 @@ def set_vocab(self):

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

        for token_id in range(tokenizer.vocab_size()):

@@ -1917,7 +1960,7 @@ def set_vocab(self):
                for token_id, foken_data in added_tokens_decoder.items():
                    token_id = int(token_id)
                    token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        assert tokens[token_id] == token
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
@@ -1933,7 +1976,7 @@ def set_vocab(self):
                for foken_data in added_tokens:
                    token_id = int(foken_data["id"])
                    token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        assert tokens[token_id] == token
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
@@ -2145,7 +2188,7 @@ def set_vocab(self):
                toktype = SentencePieceTokenTypes.BYTE
            # take care of unused raw token
            if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED

            tokens.append(text)
            scores.append(score)
@@ -2175,7 +2218,7 @@ def set_vocab(self):
                    if token == chat_eos_token:
                        chat_eos_token_id = token_id
                    token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        assert(tokens[token_id] == token)
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
@@ -2194,7 +2237,7 @@ def set_vocab(self):
                    if token == chat_eos_token:
                        chat_eos_token_id = token_id
                    token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                        assert(tokens[token_id] == token)
                    tokens[token_id] = token
                    scores[token_id] = -1000.0
@@ -2434,19 +2477,7 @@ class Gemma2Model(Model):
    model_arch = gguf.MODEL_ARCH.GEMMA2

    def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()

        self.gguf_writer.add_add_space_prefix(False)

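Gemma 2 now goes through the shared _set_vocab_sentencepiece() path instead of force-marking the first 108 token ids as CONTROL. Under that path, name-based matches such as <unusedX> and <mask> are caught by does_token_look_special, while <start_of_turn>/<end_of_turn> are expected to carry special=true in the model's tokenizer_config.json (an assumption about the upstream Gemma files, not something this diff shows). A sketch of the name-based part only:

    def control_by_name(tok: str) -> bool:
        # same name checks as does_token_look_special, without the special flag
        return tok in ("<pad>", "<mask>", "<2mass>", "[@BOS@]") \
            or (tok.startswith("<|") and tok.endswith("|>")) \
            or (tok.startswith("<unused") and tok.endswith(">"))

    for tok in ("<unused0>", "<mask>", "<start_of_turn>"):
        print(tok, control_by_name(tok))
    # <unused0> True
    # <mask> True
    # <start_of_turn> False  (relies on the special flag from tokenizer_config.json)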
@@ -2770,7 +2801,7 @@ def set_vocab(self):

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

        for token_id in range(tokenizer.vocab_size()):

@@ -3025,7 +3056,7 @@ def set_vocab(self):

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(token_id)
@@ -3243,15 +3274,14 @@ def set_vocab_chatglm3(self):
            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                score = tokenizer.tokenizer.sp_model.get_score(token_id)

-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
                tokens.append(text)
                scores.append(score)
                toktypes.append(toktype)
@@ -3340,7 +3370,7 @@ def set_vocab(self):
        for i in range(vocab_size):
            if i not in reverse_vocab:
                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special: