From a2f9f0bc89327b2d321c1922330575122702aae7 Mon Sep 17 00:00:00 2001 From: Anton Pirker Date: Thu, 10 Jul 2025 09:47:35 +0200 Subject: [PATCH] Use span.data instead of measurements for token usage --- sentry_sdk/ai/monitoring.py | 10 ++-- .../integrations/anthropic/test_anthropic.py | 46 +++++++++---------- tests/integrations/cohere/test_cohere.py | 16 +++---- .../huggingface_hub/test_huggingface_hub.py | 4 +- .../integrations/langchain/test_langchain.py | 4 +- tests/integrations/openai/test_openai.py | 32 ++++++------- 6 files changed, 58 insertions(+), 54 deletions(-) diff --git a/sentry_sdk/ai/monitoring.py b/sentry_sdk/ai/monitoring.py index ed33acd0f1..d3154f0631 100644 --- a/sentry_sdk/ai/monitoring.py +++ b/sentry_sdk/ai/monitoring.py @@ -102,15 +102,19 @@ def record_token_usage( ai_pipeline_name = get_ai_pipeline_name() if ai_pipeline_name: span.set_data(SPANDATA.AI_PIPELINE_NAME, ai_pipeline_name) + if prompt_tokens is not None: - span.set_measurement("ai_prompt_tokens_used", value=prompt_tokens) + span.set_data(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens) + if completion_tokens is not None: - span.set_measurement("ai_completion_tokens_used", value=completion_tokens) + span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens) + if ( total_tokens is None and prompt_tokens is not None and completion_tokens is not None ): total_tokens = prompt_tokens + completion_tokens + if total_tokens is not None: - span.set_measurement("ai_total_tokens_used", total_tokens) + span.set_data(SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, total_tokens) diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py index 9ab0f879d1..e6e1a40aa9 100644 --- a/tests/integrations/anthropic/test_anthropic.py +++ b/tests/integrations/anthropic/test_anthropic.py @@ -125,9 +125,9 @@ def test_nonstreaming_create_message( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 10 + assert span["data"]["gen_ai.usage.output_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 assert span["data"][SPANDATA.AI_STREAMING] is False @@ -193,9 +193,9 @@ async def test_nonstreaming_create_message_async( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 10 + assert span["data"]["gen_ai.usage.output_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 assert span["data"][SPANDATA.AI_STREAMING] is False @@ -293,9 +293,9 @@ def test_streaming_create_message( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 30 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 40 + assert span["data"]["gen_ai.usage.input_tokens"] == 10 + assert span["data"]["gen_ai.usage.output_tokens"] == 30 + assert span["data"]["gen_ai.usage.total_tokens"] == 40 assert span["data"][SPANDATA.AI_STREAMING] is True @@ -396,9 +396,9 @@ async def test_streaming_create_message_async( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 30 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 40 + assert span["data"]["gen_ai.usage.input_tokens"] == 10 + assert span["data"]["gen_ai.usage.output_tokens"] == 30 + assert span["data"]["gen_ai.usage.total_tokens"] == 40 assert span["data"][SPANDATA.AI_STREAMING] is True @@ -525,9 +525,9 @@ def test_streaming_create_message_with_input_json_delta( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 366 - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 51 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 417 + assert span["data"]["gen_ai.usage.input_tokens"] == 366 + assert span["data"]["gen_ai.usage.output_tokens"] == 51 + assert span["data"]["gen_ai.usage.total_tokens"] == 417 assert span["data"][SPANDATA.AI_STREAMING] is True @@ -662,9 +662,9 @@ async def test_streaming_create_message_with_input_json_delta_async( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 366 - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 51 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 417 + assert span["data"]["gen_ai.usage.input_tokens"] == 366 + assert span["data"]["gen_ai.usage.output_tokens"] == 51 + assert span["data"]["gen_ai.usage.total_tokens"] == 417 assert span["data"][SPANDATA.AI_STREAMING] is True @@ -807,10 +807,10 @@ def test_add_ai_data_to_span_with_input_json_delta(sentry_init): content_blocks=["{'test': 'data',", "'more': 'json'}"], ) - assert span._data.get(SPANDATA.AI_RESPONSES) == [ + assert span._data.get("ai.responses") == [ {"type": "text", "text": "{'test': 'data','more': 'json'}"} ] - assert span._data.get(SPANDATA.AI_STREAMING) is True - assert span._measurements.get("ai_prompt_tokens_used")["value"] == 10 - assert span._measurements.get("ai_completion_tokens_used")["value"] == 20 - assert span._measurements.get("ai_total_tokens_used")["value"] == 30 + assert span._data.get("ai.streaming") is True + assert span._data.get("gen_ai.usage.input_tokens") == 10 + assert span._data.get("gen_ai.usage.output_tokens") == 20 + assert span._data.get("gen_ai.usage.total_tokens") == 30 diff --git a/tests/integrations/cohere/test_cohere.py b/tests/integrations/cohere/test_cohere.py index 6c1185a28e..f13a77ae90 100644 --- a/tests/integrations/cohere/test_cohere.py +++ b/tests/integrations/cohere/test_cohere.py @@ -64,9 +64,9 @@ def test_nonstreaming_chat( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 # noinspection PyTypeChecker @@ -135,9 +135,9 @@ def test_streaming_chat(sentry_init, capture_events, send_default_pii, include_p assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 def test_bad_chat(sentry_init, capture_events): @@ -199,8 +199,8 @@ def test_embed(sentry_init, capture_events, send_default_pii, include_prompts): else: assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 10 def test_span_origin_chat(sentry_init, capture_events): diff --git a/tests/integrations/huggingface_hub/test_huggingface_hub.py b/tests/integrations/huggingface_hub/test_huggingface_hub.py index ee47cc7e56..540fd675b9 100644 --- a/tests/integrations/huggingface_hub/test_huggingface_hub.py +++ b/tests/integrations/huggingface_hub/test_huggingface_hub.py @@ -75,7 +75,7 @@ def test_nonstreaming_chat_completion( assert SPANDATA.AI_RESPONSES not in span["data"] if details_arg: - assert span["measurements"]["ai_total_tokens_used"]["value"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 10 @pytest.mark.parametrize( @@ -134,7 +134,7 @@ def test_streaming_chat_completion( assert SPANDATA.AI_RESPONSES not in span["data"] if details_arg: - assert span["measurements"]["ai_total_tokens_used"]["value"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 10 def test_bad_chat_completion(sentry_init, capture_events): diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py index 8ace6d4821..a50a2849c3 100644 --- a/tests/integrations/langchain/test_langchain.py +++ b/tests/integrations/langchain/test_langchain.py @@ -186,8 +186,8 @@ def test_langchain_agent( assert len(list(x for x in tx["spans"] if x["op"] == "ai.run.langchain")) > 0 if use_unknown_llm_type: - assert "ai_prompt_tokens_used" in chat_spans[0]["measurements"] - assert "ai_total_tokens_used" in chat_spans[0]["measurements"] + assert "gen_ai.usage.input_tokens" in chat_spans[0]["data"] + assert "gen_ai.usage.total_tokens" in chat_spans[0]["data"] else: # important: to avoid double counting, we do *not* measure # tokens used if we have an explicit integration (e.g. OpenAI) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 3fdc138f39..39195de277 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -90,9 +90,9 @@ def test_nonstreaming_chat_completion( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -132,9 +132,9 @@ async def test_nonstreaming_chat_completion_async( assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] assert SPANDATA.AI_RESPONSES not in span["data"] - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 10 - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 def tiktoken_encoding_if_installed(): @@ -228,9 +228,9 @@ def test_streaming_chat_completion( try: import tiktoken # type: ignore # noqa # pylint: disable=unused-import - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 2 - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 1 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 3 + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 1 + assert span["data"]["gen_ai.usage.total_tokens"] == 3 except ImportError: pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly @@ -324,9 +324,9 @@ async def test_streaming_chat_completion_async( try: import tiktoken # type: ignore # noqa # pylint: disable=unused-import - assert span["measurements"]["ai_completion_tokens_used"]["value"] == 2 - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 1 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 3 + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 1 + assert span["data"]["gen_ai.usage.total_tokens"] == 3 except ImportError: pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly @@ -409,8 +409,8 @@ def test_embeddings_create( else: assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.asyncio @@ -457,8 +457,8 @@ async def test_embeddings_create_async( else: assert SPANDATA.AI_INPUT_MESSAGES not in span["data"] - assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20 - assert span["measurements"]["ai_total_tokens_used"]["value"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 @pytest.mark.parametrize(