@@ -95,8 +95,7 @@ class CompletionRequest:
9595 """
9696
9797 model : str
98- prompt : str
99- messages : Optional [List [_AbstractMessage ]]
98+ messages : List [_AbstractMessage ]
10099 frequency_penalty : float = 0.0
101100 temperature : float = 0.0
102101 stop : Optional [List [str ]] = None
@@ -121,10 +120,10 @@ class CompletionChoice:
121120 See the "The chat completion object >>> choices" section of the OpenAI API docs for more details.
122121 """
123122
124- finish_reason : str
125123 index : int
126124 message : AssistantMessage
127- logprobs : Optional [List [Any ]]
125+ finish_reason : str = None
126+ logprobs : Optional [List [Any ]] = None
128127
129128
130129@dataclass
@@ -151,7 +150,7 @@ class CompletionResponse:
151150 created : int
152151 model : str
153152 system_fingerprint : str
154- usage : UsageStats
153+ usage : Optional [ UsageStats ] = None
155154 object : str = "chat.completion"
156155 service_tier : Optional [str ] = None
157156
@@ -220,8 +219,13 @@ def __init__(self, *args, **kwargs):
220219 if self .draft_model is not None
221220 else self .model .config .max_seq_length
222221 )
222+ # The System fingerprint is a unique identifier for the model and its configuration.
224+ # Currently, this is not implemented in a stable way and is subject to change.
224+ self .system_fingerprint = (
225+ self .builder_args .device + type (self .builder_args .precision ).__name__
226+ )
223227
224- def completion (self , completion_request : CompletionRequest ):
228+ def chunked_completion (self , completion_request : CompletionRequest ):
225229 """Handle a chat completion request and yield a chunked response.
226230
227231 ** Warning ** : Not all arguments of the CompletionRequest are consumed as the server isn't completely implemented.
@@ -230,7 +234,8 @@ def completion(self, completion_request: CompletionRequest):
230234 - messages: The server consumes the final element of the array as the prompt.
231235 - model: This has no impact on the server state, i.e. changing the model in the request
232236 will not change which model is responding. Instead, use the --model flag to select the model when starting the server.
233- - temperature: This is used to control the randomness of the response. The server will use the temperature
237+ - temperature: This is used to control the randomness of the response.
238+ - system_fingerprint: A unique identifier for the model and its configuration. Currently unimplemented - subject to change.
234239
235240 See https://github.com/pytorch/torchchat/issues/973 for more details.
236241
@@ -246,13 +251,16 @@ def completion(self, completion_request: CompletionRequest):
246251
247252 # Initialize counters for chunk responses and encode the prompt.
248253 id = str (uuid .uuid4 ())
254+
249255 idx = 0
250256 buffer = []
251257 encoded = self .encode_tokens (
252- completion_request .prompt , bos = True , device = self .builder_args .device
258+ completion_request .messages [- 1 ].get ("content" ),
259+ bos = True ,
260+ device = self .builder_args .device ,
253261 )
254262 generator_args = GeneratorArgs (
255- completion_request .prompt ,
263+ completion_request .messages [ - 1 ]. get ( "content" ) ,
256264 encoded_prompt = encoded ,
257265 chat_mode = False ,
258266 )
@@ -302,21 +310,45 @@ def callback(x, *, done_generating=False):
302310 choices = [choice_chunk ],
303311 created = int (time .time ()),
304312 model = completion_request .model ,
305- system_fingerprint = uuid . UUID ( int = uuid . getnode ()) ,
313+ system_fingerprint = self . system_fingerprint ,
306314 )
307315 yield chunk_response
308316 self .start_pos += y .size (0 )
309317 idx += 1
310318
311319 # Yield an ending chunk indicating the generation has completed.
312- end_chunk = CompletionChoiceChunk (ChunkDelta (None , None , None ), idx , "eos" )
320+ end_chunk = CompletionChoiceChunk (
321+ ChunkDelta (None , None , None ), idx , finish_reason = "stop"
322+ )
313323
314324 yield CompletionResponseChunk (
315325 id = str (id ),
316326 choices = [end_chunk ],
317327 created = int (time .time ()),
318328 model = completion_request .model ,
319- system_fingerprint = uuid .UUID (int = uuid .getnode ()),
329+ system_fingerprint = self .system_fingerprint ,
330+ )
331+
332+ def sync_completion (self , request : CompletionRequest ):
333+ """Handle a chat completion request and yield a single, non-chunked response"""
334+ output = ""
335+ for chunk in self .chunked_completion (request ):
336+ if not chunk .choices [0 ].finish_reason :
337+ output += chunk .choices [0 ].delta .content
338+
339+ message = AssistantMessage (content = output )
340+ return CompletionResponse (
341+ id = str (uuid .uuid4 ()),
342+ choices = [
343+ CompletionChoice (
344+ finish_reason = "stop" ,
345+ index = 0 ,
346+ message = message ,
347+ )
348+ ],
349+ created = int (time .time ()),
350+ model = request .model ,
351+ system_fingerprint = self .system_fingerprint ,
320352 )
321353
322354 def _callback (self , x , * , buffer , done_generating ):
0 commit comments