@@ -95,8 +95,7 @@ class CompletionRequest:
9595 """
9696
9797 model : str
98- prompt : str
99- messages : Optional [List [_AbstractMessage ]]
98+ messages : List [_AbstractMessage ]
10099 frequency_penalty : float = 0.0
101100 temperature : float = 0.0
102101 stop : Optional [List [str ]] = None
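With the standalone `prompt` field gone, callers pass the conversation history directly. A minimal sketch of building a request, assuming `_AbstractMessage` is dict-like (the `.get("content")` calls later in this diff suggest as much):

request = CompletionRequest(
    model="llama3",  # hypothetical model name
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about autumn."},
    ],
    temperature=0.0,
)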
@@ -121,10 +120,10 @@ class CompletionChoice:
     See the "The chat completion object >>> choices" section of the OpenAI API docs for more details.
     """
 
-    finish_reason: str
     index: int
     message: AssistantMessage
-    logprobs: Optional[List[Any]]
+    finish_reason: Optional[str] = None
+    logprobs: Optional[List[Any]] = None
 
 
 @dataclass
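Moving `finish_reason` below `message` is not cosmetic: dataclass fields with defaults must come after fields without them. A minimal illustration of the constraint:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Example:
    index: int                           # no default: must come first
    finish_reason: Optional[str] = None  # defaulted fields trail
    # Reversing these two raises:
    # TypeError: non-default argument 'index' follows default argument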
@@ -151,7 +150,7 @@ class CompletionResponse:
     created: int
     model: str
     system_fingerprint: str
-    usage: UsageStats
+    usage: Optional[UsageStats] = None
     object: str = "chat.completion"
     service_tier: Optional[str] = None
 
@@ -220,8 +219,11 @@ def __init__(self, *args, **kwargs):
             if self.draft_model is not None
             else self.model.config.max_seq_length
         )
+        self.system_fingerprint = (
+            self.builder_args.device + type(self.builder_args.precision).__name__
+        )
 
-    def completion(self, completion_request: CompletionRequest):
+    def chunked_completion(self, completion_request: CompletionRequest):
         """Handle a chat completion request and yield a chunked response.
 
         **Warning**: Not all arguments of the CompletionRequest are consumed as the server isn't completely implemented.
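What the new fingerprint evaluates to depends on how `builder_args.precision` is stored; a hedged sketch assuming it is a `torch.dtype`:

import torch

device = "cuda"             # assumed builder_args.device
precision = torch.bfloat16  # assumed builder_args.precision
# Every torch.dtype instance has type name "dtype", so this yields "cudadtype".
# str(precision) ("torch.bfloat16") would be used if the concrete dtype mattered.
print(device + type(precision).__name__)  # -> "cudadtype"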
@@ -246,13 +248,16 @@ def completion(self, completion_request: CompletionRequest):
 
         # Initialize counters for chunk responses and encode the prompt.
         id = str(uuid.uuid4())
+
         idx = 0
         buffer = []
         encoded = self.encode_tokens(
-            completion_request.prompt, bos=True, device=self.builder_args.device
+            completion_request.messages[-1].get("content"),
+            bos=True,
+            device=self.builder_args.device,
         )
         generator_args = GeneratorArgs(
-            completion_request.prompt,
+            completion_request.messages[-1].get("content"),
             encoded_prompt=encoded,
             chat_mode=False,
         )
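Note that only the newest message's content reaches the tokenizer; earlier turns in `messages` are not encoded. A small sketch of the lookup, again assuming dict-like messages:

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about autumn."},
]
prompt = messages[-1].get("content")  # -> "Write a haiku about autumn."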
@@ -302,21 +307,45 @@ def callback(x, *, done_generating=False):
                 choices=[choice_chunk],
                 created=int(time.time()),
                 model=completion_request.model,
-                system_fingerprint=uuid.UUID(int=uuid.getnode()),
+                system_fingerprint=self.system_fingerprint,
             )
             yield chunk_response
             self.start_pos += y.size(0)
             idx += 1
 
         # Yield an ending chunk indicating the generation has completed.
-        end_chunk = CompletionChoiceChunk(ChunkDelta(None, None, None), idx, "eos")
+        end_chunk = CompletionChoiceChunk(
+            ChunkDelta(None, None, None), idx, finish_reason="stop"
+        )
 
         yield CompletionResponseChunk(
             id=str(id),
             choices=[end_chunk],
             created=int(time.time()),
             model=completion_request.model,
-            system_fingerprint=uuid.UUID(int=uuid.getnode()),
+            system_fingerprint=self.system_fingerprint,
+        )
+
+    def sync_completion(self, request: CompletionRequest):
+        """Handle a chat completion request and return a single, non-chunked response."""
+        output = ""
+        for chunk in self.chunked_completion(request):
+            if not chunk.choices[0].finish_reason:
+                output += chunk.choices[0].delta.content
+
+        message = AssistantMessage(content=output)
+        return CompletionResponse(
+            id=str(uuid.uuid4()),
+            choices=[
+                CompletionChoice(
+                    finish_reason="stop",
+                    index=0,
+                    message=message,
+                )
+            ],
+            created=int(time.time()),
+            model=request.model,
+            system_fingerprint=self.system_fingerprint,
         )
 
     def _callback(self, x, *, buffer, done_generating):
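A hedged end-to-end sketch contrasting the two entry points; `gen` stands in for an instance of the generator class this diff modifies:

req = CompletionRequest(
    model="llama3",  # hypothetical model name
    messages=[{"role": "user", "content": "Hello!"}],
)

# Streaming: consume chunks as they are produced.
for chunk in gen.chunked_completion(req):
    if not chunk.choices[0].finish_reason:
        print(chunk.choices[0].delta.content, end="")

# Blocking: sync_completion drains chunked_completion internally and
# wraps the accumulated text in a single CompletionResponse.
resp = gen.sync_completion(req)
print(resp.choices[0].message.content)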