@@ -164,7 +164,7 @@ def add_streaming_line(self, line: str) -> int | None:
164164 """
165165 if not (data := self .extract_line_data (line )):
166166 return None if data is None else 0
167-
167+
168168 if "id" in data and self .streaming_response_id is None :
169169 self .streaming_response_id = data ["id" ]
170170
@@ -312,6 +312,7 @@ def compile_non_streaming(
312312 request_args = str (
313313 request .arguments .model_dump () if request .arguments else None
314314 ),
315+ response_id = response .get ("id" ), # use vLLM ID if available
315316 text = text ,
316317 input_metrics = input_metrics ,
317318 output_metrics = output_metrics ,
@@ -330,6 +331,9 @@ def add_streaming_line(self, line: str) -> int | None:
330331 if not (data := self .extract_line_data (line )):
331332 return None if data is None else 0
332333
334+ if "id" in data and self .streaming_response_id is None :
335+ self .streaming_response_id = data ["id" ]
336+
333337 updated = False
334338 choices , usage = self .extract_choices_and_usage (data )
335339 choice : dict [str , dict ] = choices [0 ] if choices else {}
@@ -358,6 +362,7 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
358362 request_args = str (
359363 request .arguments .model_dump () if request .arguments else None
360364 ),
365+ response_id = self .streaming_response_id , # use vLLM ID if available
361366 text = text ,
362367 input_metrics = input_metrics ,
363368 output_metrics = output_metrics ,
@@ -391,6 +396,8 @@ def __init__(self):
391396 self .streaming_buffer : bytearray = bytearray ()
392397 self .streaming_texts : list [str ] = []
393398 self .streaming_usage : dict [str , int | dict [str , int ]] | None = None
399+ self .streaming_response_id : str | None = None
400+
394401
395402 def compile_non_streaming (
396403 self , request : GenerationRequest , response : dict
@@ -414,6 +421,7 @@ def compile_non_streaming(
414421 request_args = str (
415422 request .arguments .model_dump () if request .arguments else None
416423 ),
424+ response_id = response .get ("id" ), # use vLLM ID if available
417425 text = text ,
418426 input_metrics = input_metrics ,
419427 output_metrics = output_metrics ,
@@ -438,6 +446,9 @@ def add_streaming_line(self, line: str) -> int | None:
438446 data : dict [str , Any ] = json .loads (line )
439447 updated = False
440448
449+ if "id" in data and self .streaming_response_id is None :
450+ self .streaming_response_id = data ["id" ]
451+
441452 if text := data .get ("text" ):
442453 self .streaming_texts .append (text )
443454 updated = True
@@ -462,6 +473,7 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
462473 request_args = str (
463474 request .arguments .model_dump () if request .arguments else None
464475 ),
476+ response_id = self .streaming_response_id ,
465477 text = text ,
466478 input_metrics = input_metrics ,
467479 output_metrics = output_metrics ,
0 commit comments