@@ -2145,6 +2145,9 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of unused raw tokens
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN

             tokens.append(text)
             scores.append(score)
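Note on the hunk above: InternLM2 checkpoints ship raw placeholder pieces named '[UNUSED_TOKEN_*]' in their SentencePiece vocabulary; retyping them as UNKNOWN lets the added-token passes below overwrite them with the real chat tokens. A minimal standalone sketch of how to list such pieces, assuming the sentencepiece package and a local tokenizer.model (the path here is hypothetical):

from sentencepiece import SentencePieceProcessor

# Load the checkpoint's SentencePiece vocabulary (hypothetical path).
sp = SentencePieceProcessor(model_file='tokenizer.model')

# Print every raw placeholder piece that the hunk above retypes as UNKNOWN.
for token_id in range(sp.vocab_size()):
    piece = sp.IdToPiece(token_id)
    if piece.startswith('[UNUSED'):
        print(token_id, piece)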
@@ -2160,6 +2163,47 @@ def set_vocab(self):
                     scores.append(-1000.0)
                     toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = token_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if token_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for token_data in added_tokens:
+                    token_id = int(token_data["id"])
+                    token = token_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if token_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
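For reference, the lookup added above walks the added_tokens_decoder map of tokenizer_config.json (and the added_tokens list of tokenizer.json) to resolve the id of '<|im_end|>'. A self-contained sketch against an illustrative excerpt; the ids and entries are examples, not values from a real checkpoint:

# Illustrative added_tokens_decoder excerpt: real files key each entry by a
# string token id and carry a 'content' string plus a 'special' flag.
added_tokens_decoder = {
    '92541': {'content': '<|im_start|>', 'special': True},
    '92542': {'content': '<|im_end|>', 'special': True},
}

chat_eos_token = '<|im_end|>'
chat_eos_token_id = None
for token_id, token_data in added_tokens_decoder.items():
    if token_data['content'] == chat_eos_token:
        chat_eos_token_id = int(token_id)

print(chat_eos_token_id)  # 92542 with this example data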
@@ -2169,28 +2213,16 @@ def set_vocab(self):

         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")

         special_vocab.add_to_gguf(self.gguf_writer)

-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
     def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
         if n_head_kv is not None and n_head != n_head_kv:
             n_head = n_head_kv
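The hunk above also drops the directory-name heuristic and the _try_get_sft_eos helper it called: the helper's XOR assertion aborts whenever the tokenizer encodes both '[UNUSED_TOKEN_145]' and '<|im_end|>' to single ids (or neither), and the substring check misses chat checkpoints stored under folders without 'chat' in the name. A quick sketch of the latter failure mode, with illustrative paths:

import os

# EOS replacement used to trigger on the directory name alone, so a chat
# checkpoint in a neutrally named folder silently kept the base-model EOS.
for path in ('/models/internlm2-chat-7b', '/models/my-sft-checkpoint'):
    print(path, '->', 'chat' in os.path.basename(os.path.abspath(path)))
# /models/internlm2-chat-7b -> True
# /models/my-sft-checkpoint -> False  (chat model missed by the old check)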
@@ -2209,7 +2241,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
-
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
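The last hunk propagates linear RoPE scaling from the HF config into the GGUF metadata instead of dropping it. A sketch of the guard against an illustrative config.json fragment; the factor value is an example:

# Illustrative hparams fragment: long-context configs carry a rope_scaling
# dict with a scaling 'type' and a numeric 'factor'.
hparams = {'rope_scaling': {'type': 'linear', 'factor': 2.0}}

rope_scaling = hparams.get('rope_scaling')
if rope_scaling is not None and 'factor' in rope_scaling:
    # Only linear scaling is written out; other types fall through untouched,
    # mirroring the add_rope_scaling_type / add_rope_scaling_factor calls above.
    if rope_scaling.get('type') == 'linear':
        print('rope scaling: linear, factor =', rope_scaling['factor'])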