@@ -132,15 +132,19 @@ def create_data(self, examples):
132132 )
133133 else :
134134 new_messages = [
135+ {
136+ "role" : "system" ,
137+ "content" : INTRO_BLURB + "\n " ,
138+ },
135139 {
136140 "role" : "user" ,
137141 "content" : examples ["instruction" ]
138- + "\n \n "
142+ + "\n "
139143 + INPUT_KEY
140144 + examples ["context" ]
141- + "\n \n " ,
145+ + "\n " ,
142146 },
143- {"role" : "assistant" , "content" : examples ["response" ] + "\n \n " },
147+ {"role" : "assistant" , "content" : examples ["response" ] + "\n " },
144148 ]
145149
146150 return new_messages
@@ -162,7 +166,6 @@ def tokenize_func(self, tokenizer, message):
162166 message ,
163167 tokenize = False ,
164168 )
165- print (new_tokenizer )
166169 return tokenizer (
167170 new_tokenizer , add_special_tokens = False , max_length = self .config .get ("max_length" )
168171 )
@@ -243,21 +246,9 @@ def group_texts(examples):
243246
244247
245248class SlimOrcaDataPreprocess (ChatDataPreprocess ):
246- chat_template = (
247- "{% for message in messages %}"
248- "{% if message['role'] == 'system' %}"
249- "{{ '### System: ' + message['content'] }}"
250- "{% elif message['role'] == 'user' %}"
251- "{{ '### User: ' + message['content'] }}"
252- "{% elif message['role'] == 'assistant' %}"
253- "{{ '### Assistant: ' + message['content'] }}"
254- "{% endif %}"
255- "{% endfor %}"
256- )
257249
258250 def __init__ (self , config ):
259251 super ().__init__ (config )
260- self .config ["chat_template" ] = self .chat_template
261252 self .default_system = "You are a helpful, respectful and honest assistant."
262253
263254 def create_data (self , data ):
@@ -278,22 +269,26 @@ def create_data(self, data):
278269 examples [conv [j ]["from" ]] = conv [j ]["value" ]
279270 examples [conv [j + 1 ]["from" ]] = conv [j + 1 ]["value" ]
280271
281- new_messages = [
282- {"role" : "system" , "content" : examples ["system" ] + "\n " },
283- {
284- "role" : "user" ,
285- "content" : examples ["human" ] + "\n " ,
286- },
287- {"role" : "assistant" , "content" : examples ["gpt" ] + "\n " },
288- ]
289272 if self .config .get ("gpt_base_model" ):
290273 if examples ["human" ]:
291- return SLIMORCA_PROMPT_DICT [ "prompt_with_input" ] .format (
292- system = examples ["system" ], user = examples ["human " ], gpt = examples ["gpt " ]
274+ return PROMPT_WITH_INPUT_FORMAT .format (
275+ instruction = examples ["system" ], response = examples ["gpt " ], input = examples ["human " ]
293276 )
294277 else :
295- return SLIMORCA_PROMPT_DICT [ "prompt_with_input" ] .format (
296- system = examples ["human " ], gpt = examples ["gpt" ]
278+ return PROMPT_NO_INPUT_FORMAT .format (
279+ instruction = examples ["system " ], response = examples ["gpt" ]
297280 )
298281 else :
282+ new_messages = [
283+ {"role" : "system" , "content" : INTRO_BLURB + "\n " },
284+ {
285+ "role" : "user" ,
286+ "content" : examples ["system" ]
287+ + "\n "
288+ + INPUT_KEY
289+ + examples ["human" ]
290+ + "\n " ,
291+ },
292+ {"role" : "assistant" , "content" : examples ["gpt" ] + "\n " },
293+ ]
299294 return new_messages
0 commit comments