Skip to content

Commit ee029e8

Browse files
committed
Add support for streaming in openai
1 parent aaedae6 commit ee029e8

File tree

4 files changed

+522
-13
lines changed

4 files changed

+522
-13
lines changed

newrelic/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2052,6 +2052,12 @@ def _process_module_builtin_defaults():
20522052
"newrelic.hooks.mlmodel_openai",
20532053
"instrument_openai_util",
20542054
)
2055+
_process_module_definition(
2056+
"openai.api_requestor",
2057+
"newrelic.hooks.mlmodel_openai",
2058+
"instrument_openai_api_requestor_api_requestor",
2059+
)
2060+
20552061
_process_module_definition(
20562062
"asyncio.base_events",
20572063
"newrelic.hooks.coroutines_asyncio",

newrelic/hooks/mlmodel_openai.py

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,129 @@ def wrap_embedding_create(wrapped, instance, args, kwargs):
9595
return response
9696

9797

98+
def wrap_interpret_response_line(wrapped, instance, args, kwargs):
    """Wrap ``APIRequestor._interpret_response_line`` to capture streamed chat completions.

    While a streamed response is consumed, each chunk's ``delta`` content/role
    is accumulated onto the active transaction. When the stream is exhausted —
    signalled by ``StopIteration`` escaping the wrapped call — summary and
    message events are recorded from the accumulated state and the exception is
    re-raised so iteration terminates normally for the caller.
    """
    # If there is no active transaction, do nothing but execute the wrapped call.
    transaction = current_transaction()
    if not transaction:
        return wrapped(*args, **kwargs)

    try:
        response = wrapped(*args, **kwargs)
    except StopIteration:
        # End of the streamed response: record events, then re-raise.
        #
        # NOTE(review): the original code referenced ``response``, ``ft``, and
        # request kwargs (``messages``, ``model``, ``temperature``, ...) in this
        # branch. None of those names are defined here — the wrapped call raised
        # before ``response`` was bound, ``ft`` belongs to
        # wrap_chat_completion_create, and this wrapper's kwargs are
        # ``_interpret_response_line``'s parameters, not the user's request —
        # so this branch raised NameError. Response/request metadata is
        # therefore reported with safe defaults until the request-side
        # instrumentation persists it on the transaction. TODO: confirm against
        # the request-side hook which attributes should be stashed.
        content = getattr(transaction, "_nr_openai_content", "")
        role = getattr(transaction, "_nr_openai_role", None)

        custom_attrs_dict = transaction._custom_params
        conversation_id = custom_attrs_dict.get("conversation_id", "")

        chat_completion_id = str(uuid.uuid4())
        available_metadata = get_trace_linking_metadata()
        span_id = available_metadata.get("span.id", "")
        trace_id = available_metadata.get("trace.id", "")

        settings = transaction.settings if transaction.settings is not None else global_settings()

        # Not recoverable in this scope (the response object is gone); keep the
        # event schema stable with empty defaults.
        response_headers = {}
        response_model = ""
        response_id = None
        request_id = ""
        response_usage = {}
        messages = []
        choices = []

        chat_completion_summary_dict = {
            "id": chat_completion_id,
            "appName": settings.app_name,
            "conversation_id": conversation_id,
            "span_id": span_id,
            "trace_id": trace_id,
            "transaction_id": transaction._transaction_id,
            "request_id": request_id,
            "api_key_last_four_digits": "",
            # FunctionTrace is not available in this scope; duration of the
            # streamed response is unknown here.
            "duration": None,
            "request.model": "",
            "response.model": response_model,
            "response.organization": "",
            "response.usage.completion_tokens": "",
            "response.usage.total_tokens": "",
            "response.usage.prompt_tokens": "",
            "request.temperature": "",
            "request.max_tokens": "",
            "response.choices.finish_reason": "",
            "response.api_type": "",
            "response.headers.llmVersion": response_headers.get("openai-version", ""),
            "response.headers.ratelimitLimitRequests": check_rate_limit_header(
                response_headers, "x-ratelimit-limit-requests", True
            ),
            "response.headers.ratelimitLimitTokens": check_rate_limit_header(
                response_headers, "x-ratelimit-limit-tokens", True
            ),
            "response.headers.ratelimitResetTokens": check_rate_limit_header(
                response_headers, "x-ratelimit-reset-tokens", False
            ),
            "response.headers.ratelimitResetRequests": check_rate_limit_header(
                response_headers, "x-ratelimit-reset-requests", False
            ),
            "response.headers.ratelimitRemainingTokens": check_rate_limit_header(
                response_headers, "x-ratelimit-remaining-tokens", True
            ),
            "response.headers.ratelimitRemainingRequests": check_rate_limit_header(
                response_headers, "x-ratelimit-remaining-requests", True
            ),
            "vendor": "openAI",
            "response.number_of_messages": len(messages) + (1 if (content or role) else 0),
        }

        transaction.record_ml_event("LlmChatCompletionSummary", chat_completion_summary_dict)

        # Report the message assembled from the accumulated streamed deltas.
        message_list = list(messages)
        if content or role:
            message_list.append({"role": role, "content": content})

        if message_list:
            create_chat_completion_message_event(
                transaction,
                settings.app_name,
                message_list,
                chat_completion_id,
                span_id,
                trace_id,
                response_model,
                response_id,
                request_id,
                conversation_id,
            )
        raise

    # Not end-of-stream: accumulate this chunk's delta onto the transaction.
    rbody, rcode, rheaders, stream = bind_interpret_response_line_params(*args, **kwargs)
    if not response or not stream:
        return response

    data = getattr(response, "data", {})
    if data:
        choices = data.get("choices", [])
        if choices:
            delta = choices[0].get("delta", {})
            if delta:
                # ``content`` may be present but null in role-only/terminal
                # chunks; coalesce to "" so concatenation never sees None.
                transaction._nr_openai_content = getattr(transaction, "_nr_openai_content", "") + (
                    delta.get("content") or ""
                )
                # Role arrives once, in the first delta; keep the first seen.
                transaction._nr_openai_role = getattr(transaction, "_nr_openai_role", None) or delta.get("role")
    return response
215+
216+
217+
def bind_interpret_response_line_params(rbody, rcode, rheaders, stream):
    """Normalize the args of ``APIRequestor._interpret_response_line``.

    Accepts the wrapped call's positional/keyword arguments and returns them
    as a fixed ``(rbody, rcode, rheaders, stream)`` tuple.
    """
    bound = (rbody, rcode, rheaders, stream)
    return bound
219+
220+
98221
def wrap_chat_completion_create(wrapped, instance, args, kwargs):
99222
transaction = current_transaction()
100223

@@ -107,7 +230,8 @@ def wrap_chat_completion_create(wrapped, instance, args, kwargs):
107230
with FunctionTrace(ft_name) as ft:
108231
response = wrapped(*args, **kwargs)
109232

110-
if not response:
233+
stream = kwargs.get("stream", False)
234+
if not response or stream:
111235
return response
112236

113237
custom_attrs_dict = transaction._custom_params
@@ -444,3 +568,8 @@ def instrument_openai_api_resources_chat_completion(module):
444568
wrap_function_wrapper(module, "ChatCompletion.create", wrap_chat_completion_create)
445569
if hasattr(module.ChatCompletion, "acreate"):
446570
wrap_function_wrapper(module, "ChatCompletion.acreate", wrap_chat_completion_acreate)
571+
572+
573+
def instrument_openai_api_requestor_api_requestor(module):
    """Instrument ``openai.api_requestor.APIRequestor`` for streaming capture."""
    requestor = module.APIRequestor
    # Older openai releases may lack the private parser; instrument only when present.
    if hasattr(requestor, "_interpret_response_line"):
        wrap_function_wrapper(module, "APIRequestor._interpret_response_line", wrap_interpret_response_line)

tests/mlmodel_openai/conftest.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -111,20 +111,21 @@ def wrap_openai_api_requestor_request(wrapped, instance, args, kwargs):
111111
# Send request
112112
result = wrapped(*args, **kwargs)
113113

114-
# Clean up data
115-
data = result[0].data
116-
headers = result[0]._headers
117-
headers = dict(
118-
filter(
119-
lambda k: k[0].lower() in RECORDED_HEADERS
120-
or k[0].lower().startswith("openai")
121-
or k[0].lower().startswith("x-ratelimit"),
122-
headers.items(),
114+
if hasattr(result[0], "data"):
115+
# Clean up data
116+
data = result[0].data
117+
headers = result[0]._headers
118+
headers = dict(
119+
filter(
120+
lambda k: k[0].lower() in RECORDED_HEADERS
121+
or k[0].lower().startswith("openai")
122+
or k[0].lower().startswith("x-ratelimit"),
123+
headers.items(),
124+
)
123125
)
124-
)
125126

126-
# Log response
127-
OPENAI_AUDIT_LOG_CONTENTS[prompt] = headers, data # Append response data to audit log
127+
# Log response
128+
OPENAI_AUDIT_LOG_CONTENTS[prompt] = headers, data # Append response data to audit log
128129
return result
129130

130131

0 commit comments

Comments
 (0)