From 01bbf460a05672ad76c0150fe9337c53b4ffca3d Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 10 Jun 2025 16:53:19 -0700 Subject: [PATCH 01/15] Fix a regex bug This bug was breaking the parsing of reasons and chains-of-thought present within LLM generated evaluation responses --- .../EvaluationMetricExtensions.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs index 51acfa37d10..ee3cf6488cc 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs @@ -81,7 +81,7 @@ internal static bool TryParseEvaluationResponseWithTags( static bool TryParseTag(string text, string tagName, [NotNullWhen(true)] out string? tagValue) { - const RegexOptions Options = RegexOptions.Multiline; + const RegexOptions Options = RegexOptions.Singleline; Match match = Regex.Match(text, $@"<{tagName}>(?.*?)", Options); if (!match.Success || match.Groups["value"] is not Group valueGroup || !valueGroup.Success) From 6eade5b72ca4917a25ec06e22fc69bc7c729118f Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Thu, 5 Jun 2025 11:20:52 -0700 Subject: [PATCH 02/15] Update some diagnostic messages --- .../CompletenessEvaluator.cs | 2 +- .../EquivalenceEvaluator.cs | 2 +- .../GroundednessEvaluator.cs | 2 +- .../RelevanceEvaluator.cs | 2 +- .../RelevanceTruthAndCompletenessEvaluator.cs | 2 +- .../RetrievalEvaluator.cs | 4 ++-- .../GroundednessProEvaluator.cs | 2 +- .../UngroundedAttributesEvaluator.cs | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs index a5f04300f70..3bd57cf322b 
100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CompletenessEvaluator.cs @@ -87,7 +87,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(CompletenessEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(CompletenessEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs index 714a027b4a1..ced79652bc7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs @@ -86,7 +86,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(EquivalenceEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs index bf9b499ebc7..a52fbcf2ad9 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs @@ -85,7 +85,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(GroundednessEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type 
{nameof(GroundednessEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs index 86fb950e720..cf436961307 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs @@ -80,7 +80,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The ${messages} supplied for evaluation did not contain a user request as the last message.")); + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index d10c259a4de..95cc64fd3dc 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -97,7 +97,7 @@ public async ValueTask EvaluateAsync( { result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( - $"The ${messages} supplied for evaluation did not contain a user request as the last message.")); + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs index 9ecfbb182f5..209919e30a3 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs +++ 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs @@ -85,7 +85,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The ${messages} supplied for evaluation did not contain a user request as the last message.")); + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); return result; } @@ -95,7 +95,7 @@ public async ValueTask EvaluateAsync( { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"A value of type '{nameof(RetrievalEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.")); + $"A value of type {nameof(RetrievalEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs index f65ddae4662..24408d5a1ad 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs @@ -87,6 +87,6 @@ private static GroundednessProEvaluatorContext GetRelevantContext( } throw new InvalidOperationException( - $"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + $"A value of type {nameof(GroundednessProEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."); } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs index 06019969345..4b3fe84cb4e 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs @@ 
-91,6 +91,6 @@ private static UngroundedAttributesEvaluatorContext GetRelevantContext( } throw new InvalidOperationException( - $"A value of type '{nameof(UngroundedAttributesEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + $"A value of type {nameof(UngroundedAttributesEvaluatorContext)} was not found in the {nameof(additionalContext)} collection."); } } From b0518819c7da4fa1d075b3c6425373b2fe55274d Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Thu, 5 Jun 2025 11:23:33 -0700 Subject: [PATCH 03/15] Report a diagnostic if no context chunks are supplied when invoking RetrievalEvaluator --- .../RetrievalEvaluator.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs index 209919e30a3..c6f59cceee8 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs @@ -100,6 +100,15 @@ public async ValueTask EvaluateAsync( return result; } + if (context.RetrievedContextChunks.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(RetrievalEvaluatorContext)} did not contain any {nameof(RetrievalEvaluatorContext.RetrievedContextChunks)}.")); + + return result; + } + List evaluationInstructions = GetEvaluationInstructions(userRequest, context); (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = From cd0a2fdfb87f044ca5ae96b3f8dcc57863c71ccf Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Fri, 6 Jun 2025 12:34:07 -0700 Subject: [PATCH 04/15] Introduce ToolCallAccuracyEvaluator --- .../AIToolExtensions.cs | 34 +++ .../ChatMessageExtensions.cs | 36 +++ .../ChatResponseExtensions.cs | 52 ++++ .../EvaluationMetricExtensions.cs | 24 ++ .../ToolCallAccuracyEvaluator.cs | 234 ++++++++++++++++++ 
.../ToolCallAccuracyEvaluatorContext.cs | 92 +++++++ .../EvaluationMetricExtensions.cs | 2 +- 7 files changed, 473 insertions(+), 1 deletion(-) create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs new file mode 100644 index 00000000000..ec6a35d59d3 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs @@ -0,0 +1,34 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Linq; +using System.Text.Json; +using System.Text.Json.Nodes; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal static class AIToolExtensions +{ + internal static string RenderAsJson( + this IEnumerable toolDefinitions, + JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(toolDefinitions); + + var toolDefinitionsJsonArray = new JsonArray(); + + foreach (AIFunction function in toolDefinitions.OfType()) + { + JsonNode? 
functionJsonNode = JsonNode.Parse(function.JsonSchema.GetRawText()); + if (functionJsonNode is not null) + { + toolDefinitionsJsonArray.Add(functionJsonNode); + } + } + + string renderedToolDefinitions = toolDefinitionsJsonArray.ToJsonString(options); + return renderedToolDefinitions; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs new file mode 100644 index 00000000000..2ad8e113f71 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs @@ -0,0 +1,36 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Text.Json; +using System.Text.Json.Nodes; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal static class ChatMessageExtensions +{ + internal static string RenderAsJson(this IEnumerable messages, JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(messages); + + var messagesJsonArray = new JsonArray(); + + foreach (ChatMessage message in messages) + { + string messageJsonString = + JsonSerializer.Serialize( + message, + AIJsonUtilities.DefaultOptions.GetTypeInfo(typeof(ChatMessage))); + + JsonNode? 
messageJsonNode = JsonNode.Parse(messageJsonString); + if (messageJsonNode is not null) + { + messagesJsonArray.Add(messageJsonNode); + } + } + + string renderedMessages = messagesJsonArray.ToJsonString(options); + return renderedMessages; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs new file mode 100644 index 00000000000..fa4f30fd45e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs @@ -0,0 +1,52 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Linq; +using System.Text.Json; +using System.Text.Json.Nodes; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal static class ChatResponseExtensions +{ + internal static string RenderAsJson(this ChatResponse modelResponse, JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(modelResponse); + + return modelResponse.Messages.RenderAsJson(options); + } + + internal static string RenderToolCallsAndResultsAsJson( + this ChatResponse modelResponse, + JsonSerializerOptions? options = null) + { + _ = Throw.IfNull(modelResponse); + + var toolCallsAndResultsJsonArray = new JsonArray(); + + foreach (AIContent content in modelResponse.Messages.SelectMany(m => m.Contents)) + { + if (content is FunctionCallContent or FunctionResultContent) + { + Type contentType = + content is FunctionCallContent ? typeof(FunctionCallContent) : typeof(FunctionResultContent); + + string toolCallOrResultJsonString = + JsonSerializer.Serialize( + content, + AIJsonUtilities.DefaultOptions.GetTypeInfo(contentType)); + + JsonNode? 
toolCallOrResultJsonNode = JsonNode.Parse(toolCallOrResultJsonString); + if (toolCallOrResultJsonNode is not null) + { + toolCallsAndResultsJsonArray.Add(toolCallOrResultJsonNode); + } + } + } + + string renderedToolCallsAndResults = toolCallsAndResultsJsonArray.ToJsonString(options); + return renderedToolCallsAndResults; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs index ee3cf6488cc..792db414d1c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs @@ -33,6 +33,25 @@ internal static EvaluationMetricInterpretation InterpretScore(this NumericMetric : new EvaluationMetricInterpretation(rating); } + internal static EvaluationMetricInterpretation InterpretScore( + this BooleanMetric metric, + bool passValue = true) + { + EvaluationRating rating = metric.Value switch + { + null => EvaluationRating.Inconclusive, + true => passValue ? EvaluationRating.Exceptional : EvaluationRating.Unacceptable, + false => passValue ? EvaluationRating.Unacceptable : EvaluationRating.Exceptional, + }; + + return metric.Value is bool value && value == passValue + ? 
new EvaluationMetricInterpretation(rating) + : new EvaluationMetricInterpretation( + rating, + failed: true, + reason: $"{metric.Name} is not {passValue}."); + } + internal static bool TryParseEvaluationResponseWithValue( this EvaluationMetric metric, ChatResponse evaluationResponse, @@ -131,6 +150,11 @@ private static bool TryParseValue(this EvaluationMetric metric, string val booleanMetric.Value = booleanValue; return true; } + else if (int.TryParse(valueText, out int intValue) && (intValue is 0 or 1)) + { + booleanMetric.Value = intValue is 1; + return true; + } else { metric.AddDiagnostics( diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs new file mode 100644 index 00000000000..83781fb1f7b --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs @@ -0,0 +1,234 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// An that evaluates an AI system's effectiveness at using the tools supplied to it. +/// +/// +/// +/// measures how accurately an AI system uses tools by examining tool calls +/// (i.e., s) present in the supplied response to assess the relevance of these tool +/// calls to the conversation, the parameter correctness for these tool calls with regard to the tool definitions +/// supplied via , and the accuracy of the parameter +/// value extraction from the supplied conversation. 
+/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +/// returns a that contains a score for 'Tool Call +/// Accuracy'. The score is if the tool call is irrelevant or contains information not present +/// in the conversation and if the tool call is relevant with properly extracted parameters +/// from the conversation. +/// +/// +/// Note: is an AI-based evaluator that uses an AI model to perform its +/// evaluation. While the prompt that this evaluator uses to perform its evaluation is designed to be model-agnostic, +/// the performance of this prompt (and the resulting evaluation) can vary depending on the model used, and can be +/// especially poor when a smaller / local model is used. +/// +/// +/// The prompt that uses has been tested against (and tuned to work well with) +/// the following models. So, using this evaluator with a model from the following list is likely to produce the best +/// results. (The model to be used can be configured via .) +/// +/// +/// GPT-4o +/// +/// +[Experimental("AIEVAL001")] +public sealed class ToolCallAccuracyEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . + /// + public static string ToolCallAccuracyMetricName => "Tool Call Accuracy"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [ToolCallAccuracyMetricName]; + + private static readonly ChatOptions _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + MaxOutputTokens = 800, + TopP = 1.0f, + PresencePenalty = 0.0f, + FrequencyPenalty = 0.0f, + ResponseFormat = ChatResponseFormat.Text + }; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? 
additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + _ = Throw.IfNull(chatConfiguration); + + var metric = new BooleanMetric(ToolCallAccuracyMetricName); + var result = new EvaluationResult(metric); + + if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || + string.IsNullOrWhiteSpace(userRequest.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); + + return result; + } + + if (string.IsNullOrWhiteSpace(modelResponse.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + + return result; + } + + if (additionalContext?.OfType().FirstOrDefault() + is not ToolCallAccuracyEvaluatorContext context) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"A value of type {nameof(ToolCallAccuracyEvaluatorContext)} was not found in the {nameof(additionalContext)} collection.")); + + return result; + } + + if (context.ToolDefinitions.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(ToolCallAccuracyEvaluatorContext)} did not contain any {nameof(ToolCallAccuracyEvaluatorContext.ToolDefinitions)}.")); + + return result; + } + + IEnumerable toolCalls = modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + + if (!toolCalls.Any()) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation did not contain any tool calls (i.e., {nameof(FunctionCallContent)}s.")); + + return result; + } + + var toolDefinitionNames = new HashSet(context.ToolDefinitions.Select(td => td.Name)); + + if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not 
included in the supplied {nameof(ToolCallAccuracyEvaluatorContext)}.")); + + return result; + } + + List evaluationInstructions = GetEvaluationInstructions(messages, modelResponse, context); + + (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = + await TimingHelper.ExecuteWithTimingAsync(() => + chatConfiguration.ChatClient.GetResponseAsync( + evaluationInstructions, + _chatOptions, + cancellationToken)).ConfigureAwait(false); + + _ = metric.TryParseEvaluationResponseWithTags(evaluationResponse, evaluationDuration); + metric.AddOrUpdateContext(context); + metric.Interpretation = metric.InterpretScore(); + return result; + } + + private static List GetEvaluationInstructions( + IEnumerable messages, + ChatResponse modelResponse, + ToolCallAccuracyEvaluatorContext context) + { +#pragma warning disable S103 // Lines should not be too long + const string SystemPrompt = + """ + # Instruction + ## Goal + ### You are an expert in evaluating the accuracy of a tool call considering relevance and potential usefulness including syntactic and semantic correctness of a proposed tool call from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. + - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. + - **Data**: Your input data include CONVERSATION , TOOL CALL and TOOL DEFINITION. + - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways. 
+ """; +#pragma warning restore S103 + + List evaluationInstructions = [new ChatMessage(ChatRole.System, SystemPrompt)]; + + string renderedConversation = messages.RenderText(); + string renderedToolCallsAndResults = modelResponse.RenderToolCallsAndResultsAsJson(); + string renderedToolDefinitions = context.ToolDefinitions.RenderAsJson(); + +#pragma warning disable S103 // Lines should not be too long + string evaluationPrompt = + $$""" + # Definition + **Tool Call Accuracy** refers to the relevance and potential usefulness of a TOOL CALL in the context of an ongoing CONVERSATION and EXTRACTION of RIGHT PARAMETER VALUES from the CONVERSATION.It assesses how likely the TOOL CALL is to contribute meaningfully to the CONVERSATION and help address the user's needs. Focus on evaluating the potential value of the TOOL CALL within the specific context of the given CONVERSATION, without making assumptions beyond the provided information. + Consider the following factors in your evaluation: + + 1. Relevance: How well does the proposed tool call align with the current topic and flow of the conversation? + 2. Parameter Appropriateness: Do the parameters used in the TOOL CALL match the TOOL DEFINITION and are the parameters relevant to the latest user's query? + 3. Parameter Value Correctness: Are the parameters values used in the TOOL CALL present or inferred by CONVERSATION and relevant to the latest user's query? + 4. Potential Value: Is the information this tool call might provide likely to be useful in advancing the conversation or addressing the user expressed or implied needs? + 5. Context Appropriateness: Does the tool call make sense at this point in the conversation, given what has been discussed so far? + + + # Ratings + ## [Tool Call Accuracy: 0] (Irrelevant) + **Definition:** + 1. The TOOL CALL is not relevant and will not help resolve the user's need. + 2. TOOL CALL include parameters values that are not present or inferred from CONVERSATION. + 3. 
TOOL CALL has parameters that is not present in TOOL DEFINITION. + + ## [Tool Call Accuracy: 1] (Relevant) + **Definition:** + 1. The TOOL CALL is directly relevant and very likely to help resolve the user's need. + 2. TOOL CALL include parameters values that are present or inferred from CONVERSATION. + 3. TOOL CALL has parameters that is present in TOOL DEFINITION. + + # Data + CONVERSATION : {{renderedConversation}} + TOOL CALL: {{renderedToolCallsAndResults}} + TOOL DEFINITION: {{renderedToolDefinitions}} + + + # Tasks + ## Please provide your assessment Score for the previous CONVERSATION , TOOL CALL and TOOL DEFINITION based on the Definitions above. Your output should include the following information: + - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". + - **Explanation**: a very short explanation of why you think the input Data should get that Score. + - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "0", "1") based on the levels of the definitions. + + + ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. + # Output + """; +#pragma warning restore S103 + + evaluationInstructions.Add(new ChatMessage(ChatRole.User, evaluationPrompt)); + + return evaluationInstructions; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs new file mode 100644 index 00000000000..c3d81c0a05f --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs @@ -0,0 +1,92 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// Contextual information that the uses to evaluate an AI system's +/// effectiveness at using the tools supplied to it. +/// +/// +/// +/// measures how accurately an AI system uses tools by examining tool calls +/// (i.e., s) present in the supplied response to assess the relevance of these tool +/// calls to the conversation, the parameter correctness for these tool calls with regard to the tool definitions +/// supplied via , and the accuracy of the parameter value extraction from the supplied +/// conversation history. +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +[Experimental("AIEVAL001")] +public sealed class ToolCallAccuracyEvaluatorContext : EvaluationContext +{ + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public ToolCallAccuracyEvaluatorContext(IEnumerable toolDefinitions) + : base(name: ToolCallAccuracyContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) + { + ToolDefinitions = [.. toolDefinitions]; + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. 
+ /// + /// + public ToolCallAccuracyEvaluatorContext(params AITool[] toolDefinitions) + : this(toolDefinitions as IEnumerable) + { + } + + /// + /// Gets the unique that is used for + /// . + /// + public static string ToolCallAccuracyContextName => "Tool Definitions (Tool Call Accuracy)"; + + /// + /// Gets set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// + /// measures how accurately an AI system uses tools by examining tool calls + /// (i.e., s) present in the supplied response to assess the relevance of these + /// tool calls to the conversation, the parameter correctness for these tool calls with regard to the tool + /// definitions supplied via , and the accuracy of the parameter value extraction from + /// the supplied conversation history. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions that are supplied via + /// will be ignored. 
+ /// + /// + public IReadOnlyList ToolDefinitions { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs index 8a0ffcbd31b..f6ab393750b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs @@ -73,7 +73,7 @@ internal static EvaluationMetricInterpretation InterpretContentSafetyScore( : new EvaluationMetricInterpretation( rating, failed: true, - reason: $"{metric.Name} is {passValue}."); + reason: $"{metric.Name} is not {passValue}."); } internal static void LogJsonData(this EvaluationMetric metric, string data) From f554de02122b5a5e8df5a5fa1993b4c03924483c Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 10 Jun 2025 17:02:44 -0700 Subject: [PATCH 05/15] Add support for displaying tool calls and other json based content in conversation details --- .../TypeScript/components/App.tsx | 7 ++++- .../components/ConversationDetails.tsx | 27 ++++++++++++++++--- .../TypeScript/components/ReportContext.tsx | 13 +++++++-- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx index b38c691bbb3..6d73c8220e1 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/App.tsx @@ -52,7 +52,7 @@ export const App = () => { const classes = useStyles(); const { dataset, scoreSummary, selectedTags, clearFilters, searchValue, setSearchValue } = useReportContext(); const [isSettingsOpen, setIsSettingsOpen] = useState(false); - const { renderMarkdown, setRenderMarkdown } = 
useReportContext(); + const { renderMarkdown, setRenderMarkdown, prettifyJson, setPrettifyJson } = useReportContext(); const { globalTags, filterableTags } = categorizeAndSortTags(dataset, scoreSummary.primaryResult.executionName); const toggleSettings = () => setIsSettingsOpen(!isSettingsOpen); @@ -127,6 +127,11 @@ export const App = () => { onChange={(_ev, data) => setRenderMarkdown(data.checked)} label={Render markdown for conversations} /> + setPrettifyJson(data.checked)} + label={Pretty print JSON content} + /> diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx index 6acf38673de..9cf40a7a574 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx @@ -18,7 +18,7 @@ export const ConversationDetails = ({ messages, model, usage, selectedMetric }: }) => { const classes = useStyles(); const [isExpanded, setIsExpanded] = useState(true); - const { renderMarkdown } = useReportContext(); + const { renderMarkdown, prettifyJson } = useReportContext(); const isUserSide = (role: string) => role.toLowerCase() === 'user' || role.toLowerCase() === 'system'; @@ -29,14 +29,33 @@ export const ConversationDetails = ({ messages, model, usage, selectedMetric }: usage?.totalTokenCount && `Total Tokens: ${usage.totalTokenCount}`, ].filter(Boolean).join(' • '); + const isValidJson = (text: string): { isValid: boolean; parsedJson?: any } => { + try { + const parsedJson = JSON.parse(text.trim()); + return { isValid: true, parsedJson }; + } catch { + return { isValid: false }; + } + }; + const renderContent = (content: AIContent) => { if (isTextContent(content)) { - return renderMarkdown ? - {content.text} : -
{content.text}
; + const { isValid, parsedJson } = isValidJson(content.text); + if (isValid) { + const jsonContent = JSON.stringify(parsedJson, null, prettifyJson ? 2 : 0); + return
{jsonContent}
; + } else { + return renderMarkdown ? + {content.text} : +
{content.text}
; + } } else if (isImageContent(content)) { const imageUrl = (content as UriContent).uri || (content as DataContent).uri; return Content; + } else { + // For any other content type, display the serialized JSON + const jsonContent = JSON.stringify(content, null, prettifyJson ? 2 : 0); + return
{jsonContent}
; } }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx index 64a1e4a3c20..74a645c70b7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ReportContext.tsx @@ -11,6 +11,8 @@ export type ReportContextType = { selectScenarioLevel: (key: string) => void, renderMarkdown: boolean, setRenderMarkdown: (renderMarkdown: boolean) => void, + prettifyJson: boolean, + setPrettifyJson: (prettifyJson: boolean) => void, searchValue: string, setSearchValue: (searchValue: string) => void, selectedTags: string[], @@ -38,6 +40,10 @@ const defaultReportContext = createContext({ setRenderMarkdown: (_renderMarkdown: boolean) => { throw new Error("setRenderMarkdown function not implemented"); }, + prettifyJson: true, + setPrettifyJson: (_prettifyJson: boolean) => { + throw new Error("setPrettifyJson function not implemented"); + }, searchValue: '', setSearchValue: (_searchValue: string | undefined) => { throw new Error("setSearchValue function not implemented"); }, selectedTags: [], @@ -65,6 +71,7 @@ export const useReportContext = () => { const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): ReportContextType => { const [selectedScenarioLevel, setSelectedScenarioLevel] = useState(undefined); const [renderMarkdown, setRenderMarkdown] = useState(true); + const [prettifyJson, setPrettifyJson] = useState(true); const [selectedTags, setSelectedTags] = useState([]); const [searchValue, setSearchValue] = useState(""); @@ -114,7 +121,7 @@ const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): } return null; - }; + }; return srch(node); } @@ -126,6 +133,8 @@ const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): 
selectScenarioLevel, renderMarkdown, setRenderMarkdown, + prettifyJson, + setPrettifyJson, searchValue, setSearchValue, selectedTags, @@ -133,4 +142,4 @@ const useProvideReportContext = (dataset: Dataset, scoreSummary: ScoreSummary): clearFilters, filterTree, }; -}; \ No newline at end of file +}; From 043df3e799ee1aae556e89587c401c20390d0094 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 10 Jun 2025 17:41:02 -0700 Subject: [PATCH 06/15] Add tests --- eng/packages/TestOnly.props | 2 + .../AgentQualityEvaluatorTests.cs | 269 ++++++++++++++++++ .../ChatMessageContentExtensions.cs | 78 +++++ ...ons.AI.Evaluation.Integration.Tests.csproj | 2 + .../QualityEvaluatorTests.cs | 2 + .../SafetyEvaluatorTests.cs | 2 + .../Setup.cs | 28 +- 7 files changed, 380 insertions(+), 3 deletions(-) create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs diff --git a/eng/packages/TestOnly.props b/eng/packages/TestOnly.props index 3b511fa037f..5d4177e326c 100644 --- a/eng/packages/TestOnly.props +++ b/eng/packages/TestOnly.props @@ -20,6 +20,8 @@ + + diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs new file mode 100644 index 00000000000..70207bf55fb --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs @@ -0,0 +1,269 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.ComponentModel; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Reporting; +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; +using Microsoft.Extensions.AI.Evaluation.Tests; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.ChatCompletion; +using Microsoft.TestUtilities; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; + +[Experimental("AIEVAL001")] +public class AgentQualityEvaluatorTests +{ + private static readonly ChatOptions? _chatOptions; + private static readonly ChatOptions? _chatOptionsWithTools; + private static readonly ReportingConfiguration? _agentQualityReportingConfiguration; + private static readonly ReportingConfiguration? _needsContextReportingConfiguration; + + static AgentQualityEvaluatorTests() + { + if (Settings.Current.Configured) + { + _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + ResponseFormat = ChatResponseFormat.Text + }; + + _chatOptionsWithTools = + new ChatOptions + { + Temperature = 0.0f, + ResponseFormat = ChatResponseFormat.Text, + Tools = + [ + AIFunctionFactory.Create( + GetOrders, name: $"{nameof(AgentQualityEvaluatorTests)}_{nameof(GetOrders)}"), + AIFunctionFactory.Create( + GetOrderStatus, name: $"{nameof(AgentQualityEvaluatorTests)}_{nameof(GetOrderStatus)}") + ] + + // Note: We prefix the tool names with the test class name so that the tool names used are + // consistent between Microsoft.Extensions.AI and SemanticKernel. + }; + + ChatConfiguration chatConfiguration = Setup.CreateChatConfiguration(); + ChatClientMetadata? 
clientMetadata = chatConfiguration.ChatClient.GetService(); + + IChatClient chatClient = chatConfiguration.ChatClient; + IChatClient chatClientWithToolCalling = chatClient.AsBuilder().UseFunctionInvocation().Build(); + ChatConfiguration chatConfigurationWithToolCalling = new ChatConfiguration(chatClientWithToolCalling); + + string version = $"Product Version: {Constants.Version}"; + string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}"; + string projectName = $"Project: Integration Tests"; + string testClass = $"Test Class: {nameof(AgentQualityEvaluatorTests)}"; + string provider = $"Model Provider: {clientMetadata?.ProviderName ?? "Unknown"}"; + string model = $"Model: {clientMetadata?.DefaultModelId ?? "Unknown"}"; + string temperature = $"Temperature: {_chatOptionsWithTools.Temperature}"; + string usesContext = $"Feature: Context"; + + IEvaluator toolCallAccuracyEvaluator = new ToolCallAccuracyEvaluator(); + + _agentQualityReportingConfiguration = + DiskBasedReportingConfiguration.Create( + storageRootPath: Settings.Current.StorageRootPath, + evaluators: [], + chatConfiguration: chatConfigurationWithToolCalling, + executionName: Constants.Version, + tags: [version, date, projectName, testClass, provider, model, temperature]); + + _needsContextReportingConfiguration = + DiskBasedReportingConfiguration.Create( + storageRootPath: Settings.Current.StorageRootPath, + evaluators: [toolCallAccuracyEvaluator], + chatConfiguration: chatConfigurationWithToolCalling, + executionName: Constants.Version, + tags: [version, date, projectName, testClass, provider, model, temperature, usesContext]); + } + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNeededButNotPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _needsContextReportingConfiguration.CreateScenarioRunAsync( + scenarioName: 
$"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNeededButNotPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationAsync(scenarioRun.ChatConfiguration!.ChatClient); + + EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response); + + Assert.True( + result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNeededAndPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _needsContextReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNeededAndPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationAsync(scenarioRun.ChatConfiguration!.ChatClient); + + var toolDefinitionsForToolCallAccuracyEvaluator = + new ToolCallAccuracyEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + EvaluationResult result = + await scenarioRun.EvaluateAsync( + messages, + response, + additionalContext: [toolDefinitionsForToolCallAccuracyEvaluator]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? 
_)); + } + + [ConditionalFact] + public async Task EvaluateToolCallsPerformedUsingSemanticKernel() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _needsContextReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(EvaluateToolCallsPerformedUsingSemanticKernel)}"); + + (IEnumerable messages, ChatResponse response, IEnumerable toolDefinitions) = + await GetConversationUsingSemanticKernelAsync(scenarioRun.ChatConfiguration!.ChatClient); + + var toolDefinitionsForToolCallAccuracyEvaluator = new ToolCallAccuracyEvaluatorContext(toolDefinitions); + + EvaluationResult result = + await scenarioRun.EvaluateAsync( + messages, + response, + additionalContext: [toolDefinitionsForToolCallAccuracyEvaluator]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); + } + + private static async Task<(IEnumerable messages, ChatResponse response)> + GetConversationAsync(IChatClient chatClient) + { + List messages = + [ + "You are a friendly and helpful customer service agent.".ToSystemMessage(), + "Hi, I need help with the last 2 orders on my account #888. 
Could you please update me on their status?".ToUserMessage() + ]; + + ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptionsWithTools); + return (messages, response); + } + + private static async Task<(IEnumerable messages, ChatResponse response, IEnumerable toolDefinitions)> + GetConversationUsingSemanticKernelAsync(IChatClient chatClient) + { + IChatCompletionService chatCompletionService = chatClient.AsChatCompletionService(); + IKernelBuilder builder = Kernel.CreateBuilder(); + builder.Services.AddSingleton(chatCompletionService); + Kernel kernel = builder.Build(); + + kernel.ImportPluginFromType(); + var settings = + new PromptExecutionSettings + { + FunctionChoiceBehavior = FunctionChoiceBehavior.Auto() + }; + + ChatHistory skHistory = new ChatHistory(); + skHistory.AddMessage(AuthorRole.System, "You are a friendly and helpful customer service agent."); + skHistory.AddMessage(AuthorRole.User, "Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?"); + List messages = [.. skHistory.Select(m => m.ToChatMessage())]; + + ChatMessageContent skResponse = + await chatCompletionService.GetChatMessageContentAsync(skHistory, settings, kernel); + + skHistory.RemoveRange(0, 2); // Trim to only include the tool call and tool call result messages. + IEnumerable toolMessages = skHistory.Select(m => m.ToChatMessage()); + ChatMessage finalResponseMessage = skResponse.ToChatMessage(); + ChatResponse response = new ChatResponse([.. 
toolMessages, finalResponseMessage]); + + return (messages, response, kernel.Plugins.SelectMany(p => p.AsAIFunctions())); + } + + [KernelFunction] + [Description("Gets the orders for a customer")] + private static IReadOnlyList GetOrders(int accountNumber) + { + if (accountNumber != 888) + { + throw new InvalidOperationException($"Account number {accountNumber} is not valid."); + } + + return [new Order(123), new Order(124)]; + } + + [KernelFunction] + [Description("Gets the delivery status of an order")] + private static OrderStatus GetOrderStatus(int orderId) + { + if (orderId == 123) + { + return new OrderStatus(orderId, "shipped", DateTime.Now.AddDays(1)); + } + else if (orderId == 124) + { + return new OrderStatus(orderId, "delayed", DateTime.Now.AddDays(10)); + } + else + { + throw new InvalidOperationException($"Order with ID {orderId} not found."); + } + } + + private record Order(int OrderId) + { + } + + private record OrderStatus(int OrderId, string Status, DateTime ExpectedDelivery) + { + } + + [MemberNotNull(nameof(_chatOptionsWithTools))] + [MemberNotNull(nameof(_agentQualityReportingConfiguration))] + [MemberNotNull(nameof(_needsContextReportingConfiguration))] + private static void SkipIfNotConfigured() + { + if (!Settings.Current.Configured) + { + throw new SkipTestException("Test is not configured"); + } + + Assert.NotNull(_chatOptionsWithTools); + Assert.NotNull(_agentQualityReportingConfiguration); + Assert.NotNull(_needsContextReportingConfiguration); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs new file mode 100644 index 00000000000..adbc878f564 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs @@ -0,0 +1,78 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics.CodeAnalysis; +using Microsoft.SemanticKernel; + +namespace Microsoft.Extensions.AI; + +internal static class ChatMessageContentExtensions +{ + [Experimental("AIEVAL001")] + internal static ChatMessage ToChatMessage(this ChatMessageContent content) + { + ChatMessage message = new() + { + AdditionalProperties = content.Metadata is not null ? new(content.Metadata) : null, + AuthorName = content.AuthorName, + RawRepresentation = content.InnerContent, + Role = content.Role.Label is string label ? new ChatRole(label) : ChatRole.User, + }; + + foreach (var item in content.Items) + { + AIContent? aiContent = null; + switch (item) + { + case SemanticKernel.TextContent tc: + aiContent = new TextContent(tc.Text); + break; + + case ImageContent ic: +#pragma warning disable S3358 // Ternary operators should not be nested + aiContent = + ic.DataUri is not null ? new Microsoft.Extensions.AI.DataContent(ic.DataUri, ic.MimeType) : + ic.Uri is not null ? new Microsoft.Extensions.AI.UriContent(ic.Uri, ic.MimeType ?? "image/*") : + null; +#pragma warning restore S3358 + break; + + case AudioContent ac: +#pragma warning disable S3358 // Ternary operators should not be nested + aiContent = + ac.DataUri is not null ? new Microsoft.Extensions.AI.DataContent(ac.DataUri, ac.MimeType) : + ac.Uri is not null ? new Microsoft.Extensions.AI.UriContent(ac.Uri, ac.MimeType ?? "audio/*") : + null; +#pragma warning restore S3358 + break; + + case BinaryContent bc: +#pragma warning disable S3358 // Ternary operators should not be nested + aiContent = + bc.DataUri is not null ? new Microsoft.Extensions.AI.DataContent(bc.DataUri, bc.MimeType) : + bc.Uri is not null ? new Microsoft.Extensions.AI.UriContent(bc.Uri, bc.MimeType ?? "application/octet-stream") : + null; +#pragma warning restore S3358 + break; + + case SemanticKernel.FunctionCallContent fcc: + aiContent = new FunctionCallContent(fcc.Id ?? 
string.Empty, fcc.FunctionName, fcc.Arguments); + break; + + case SemanticKernel.FunctionResultContent frc: + aiContent = new FunctionResultContent(frc.CallId ?? string.Empty, frc.Result); + break; + } + + if (aiContent is not null) + { + aiContent.RawRepresentation = item.InnerContent; + aiContent.AdditionalProperties = item.Metadata is not null ? new(item.Metadata) : null; + + message.Contents.Add(aiContent); + } + } + + return message; + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj index c08667ff421..9e61c42f356 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj @@ -23,6 +23,8 @@ + + diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs index b56a2673b60..ecec3ad51e5 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/QualityEvaluatorTests.cs @@ -278,6 +278,7 @@ await scenarioRun.EvaluateAsync( ReferenceEquals(context4, retrievedContextChunksForRetrievalEvaluator)); } + [MemberNotNull(nameof(_chatOptions))] [MemberNotNull(nameof(_qualityReportingConfiguration))] [MemberNotNull(nameof(_needsContextReportingConfiguration))] private static void SkipIfNotConfigured() @@ -287,6 +288,7 @@ private static void SkipIfNotConfigured() throw new SkipTestException("Test is not configured"); } + Assert.NotNull(_chatOptions); 
Assert.NotNull(_qualityReportingConfiguration); Assert.NotNull(_needsContextReportingConfiguration); } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs index 609646c8061..630adbffd8e 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/SafetyEvaluatorTests.cs @@ -548,6 +548,7 @@ await _mixedQualityAndSafetyReportingConfiguration.CreateScenarioRunAsync( Assert.True(result.TryGet(ViolenceEvaluator.ViolenceMetricName, out NumericMetric? _)); } + [MemberNotNull(nameof(_chatOptions))] [MemberNotNull(nameof(_contentSafetyReportingConfiguration))] [MemberNotNull(nameof(_imageContentSafetyReportingConfiguration))] [MemberNotNull(nameof(_codeVulnerabilityReportingConfiguration))] @@ -559,6 +560,7 @@ private static void SkipIfNotConfigured() throw new SkipTestException("Test is not configured"); } + Assert.NotNull(_chatOptions); Assert.NotNull(_contentSafetyReportingConfiguration); Assert.NotNull(_codeVulnerabilityReportingConfiguration); Assert.NotNull(_imageContentSafetyReportingConfiguration); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs index 30cb541e700..d8da55af77f 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs @@ -5,6 +5,7 @@ using System.ClientModel; using Azure.AI.OpenAI; using Azure.Identity; +using Microsoft.SemanticKernel; namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; @@ -14,6 +15,29 @@ internal static class Setup Environment.GetEnvironmentVariable("AITESTING_OFFLINE") == "1"; internal static ChatConfiguration 
CreateChatConfiguration() + { + AzureOpenAIClient azureOpenAIClient = GetAzureOpenAIClient(); + IChatClient chatClient = azureOpenAIClient.GetChatClient(Settings.Current.DeploymentName).AsIChatClient(); + return new ChatConfiguration(chatClient); + } + + internal static Kernel CreateKernel() + { + AzureOpenAIClient azureOpenAIClient = GetAzureOpenAIClient(); + + Kernel kernel = + Kernel + .CreateBuilder() + .AddAzureOpenAIChatClient( + deploymentName: Settings.Current.DeploymentName, + azureOpenAIClient, + modelId: Settings.Current.ModelName) + .Build(); + + return kernel; + } + + private static AzureOpenAIClient GetAzureOpenAIClient() { var endpoint = new Uri(Settings.Current.Endpoint); AzureOpenAIClientOptions options = new(); @@ -22,8 +46,6 @@ internal static ChatConfiguration CreateChatConfiguration() OfflineOnly ? new AzureOpenAIClient(endpoint, new ApiKeyCredential("Bogus"), options) : new AzureOpenAIClient(endpoint, credential, options); - - IChatClient chatClient = azureClient.GetChatClient(Settings.Current.DeploymentName).AsIChatClient(); - return new ChatConfiguration(chatClient); + return azureClient; } } From 82dd7825cd969cba13d8bbd9a5d161c1a20c8ea1 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 10 Jun 2025 17:41:16 -0700 Subject: [PATCH 07/15] Remove SemanticKernel tests --- eng/packages/TestOnly.props | 2 - .../AgentQualityEvaluatorTests.cs | 75 +----------------- .../ChatMessageContentExtensions.cs | 78 ------------------- ...ons.AI.Evaluation.Integration.Tests.csproj | 2 - .../Setup.cs | 23 +----- 5 files changed, 5 insertions(+), 175 deletions(-) delete mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs diff --git a/eng/packages/TestOnly.props b/eng/packages/TestOnly.props index 5d4177e326c..3b511fa037f 100644 --- a/eng/packages/TestOnly.props +++ b/eng/packages/TestOnly.props @@ -20,8 +20,6 @@ - - diff --git 
a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs index 70207bf55fb..27c928f05c7 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs @@ -11,9 +11,6 @@ using Microsoft.Extensions.AI.Evaluation.Reporting; using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; using Microsoft.Extensions.AI.Evaluation.Tests; -using Microsoft.Extensions.DependencyInjection; -using Microsoft.SemanticKernel; -using Microsoft.SemanticKernel.ChatCompletion; using Microsoft.TestUtilities; using Xunit; @@ -43,16 +40,7 @@ static AgentQualityEvaluatorTests() { Temperature = 0.0f, ResponseFormat = ChatResponseFormat.Text, - Tools = - [ - AIFunctionFactory.Create( - GetOrders, name: $"{nameof(AgentQualityEvaluatorTests)}_{nameof(GetOrders)}"), - AIFunctionFactory.Create( - GetOrderStatus, name: $"{nameof(AgentQualityEvaluatorTests)}_{nameof(GetOrderStatus)}") - ] - - // Note: We prefix the tool names with the test class name so that the tool names used are - // consistent between Microsoft.Extensions.AI and SemanticKernel. + Tools = [AIFunctionFactory.Create(GetOrders), AIFunctionFactory.Create(GetOrderStatus)] }; ChatConfiguration chatConfiguration = Setup.CreateChatConfiguration(); @@ -142,34 +130,6 @@ await scenarioRun.EvaluateAsync( Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? 
_)); } - [ConditionalFact] - public async Task EvaluateToolCallsPerformedUsingSemanticKernel() - { - SkipIfNotConfigured(); - - await using ScenarioRun scenarioRun = - await _needsContextReportingConfiguration.CreateScenarioRunAsync( - scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(EvaluateToolCallsPerformedUsingSemanticKernel)}"); - - (IEnumerable messages, ChatResponse response, IEnumerable toolDefinitions) = - await GetConversationUsingSemanticKernelAsync(scenarioRun.ChatConfiguration!.ChatClient); - - var toolDefinitionsForToolCallAccuracyEvaluator = new ToolCallAccuracyEvaluatorContext(toolDefinitions); - - EvaluationResult result = - await scenarioRun.EvaluateAsync( - messages, - response, - additionalContext: [toolDefinitionsForToolCallAccuracyEvaluator]); - - Assert.False( - result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), - string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); - - Assert.Single(result.Metrics); - Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? 
_)); - } - private static async Task<(IEnumerable messages, ChatResponse response)> GetConversationAsync(IChatClient chatClient) { @@ -183,38 +143,6 @@ await scenarioRun.EvaluateAsync( return (messages, response); } - private static async Task<(IEnumerable messages, ChatResponse response, IEnumerable toolDefinitions)> - GetConversationUsingSemanticKernelAsync(IChatClient chatClient) - { - IChatCompletionService chatCompletionService = chatClient.AsChatCompletionService(); - IKernelBuilder builder = Kernel.CreateBuilder(); - builder.Services.AddSingleton(chatCompletionService); - Kernel kernel = builder.Build(); - - kernel.ImportPluginFromType(); - var settings = - new PromptExecutionSettings - { - FunctionChoiceBehavior = FunctionChoiceBehavior.Auto() - }; - - ChatHistory skHistory = new ChatHistory(); - skHistory.AddMessage(AuthorRole.System, "You are a friendly and helpful customer service agent."); - skHistory.AddMessage(AuthorRole.User, "Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?"); - List messages = [.. skHistory.Select(m => m.ToChatMessage())]; - - ChatMessageContent skResponse = - await chatCompletionService.GetChatMessageContentAsync(skHistory, settings, kernel); - - skHistory.RemoveRange(0, 2); // Trim to only include the tool call and tool call result messages. - IEnumerable toolMessages = skHistory.Select(m => m.ToChatMessage()); - ChatMessage finalResponseMessage = skResponse.ToChatMessage(); - ChatResponse response = new ChatResponse([.. 
toolMessages, finalResponseMessage]); - - return (messages, response, kernel.Plugins.SelectMany(p => p.AsAIFunctions())); - } - - [KernelFunction] [Description("Gets the orders for a customer")] private static IReadOnlyList GetOrders(int accountNumber) { @@ -226,7 +154,6 @@ private static IReadOnlyList GetOrders(int accountNumber) return [new Order(123), new Order(124)]; } - [KernelFunction] [Description("Gets the delivery status of an order")] private static OrderStatus GetOrderStatus(int orderId) { diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs deleted file mode 100644 index adbc878f564..00000000000 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ChatMessageContentExtensions.cs +++ /dev/null @@ -1,78 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Diagnostics.CodeAnalysis; -using Microsoft.SemanticKernel; - -namespace Microsoft.Extensions.AI; - -internal static class ChatMessageContentExtensions -{ - [Experimental("AIEVAL001")] - internal static ChatMessage ToChatMessage(this ChatMessageContent content) - { - ChatMessage message = new() - { - AdditionalProperties = content.Metadata is not null ? new(content.Metadata) : null, - AuthorName = content.AuthorName, - RawRepresentation = content.InnerContent, - Role = content.Role.Label is string label ? new ChatRole(label) : ChatRole.User, - }; - - foreach (var item in content.Items) - { - AIContent? aiContent = null; - switch (item) - { - case SemanticKernel.TextContent tc: - aiContent = new TextContent(tc.Text); - break; - - case ImageContent ic: -#pragma warning disable S3358 // Ternary operators should not be nested - aiContent = - ic.DataUri is not null ? 
new Microsoft.Extensions.AI.DataContent(ic.DataUri, ic.MimeType) : - ic.Uri is not null ? new Microsoft.Extensions.AI.UriContent(ic.Uri, ic.MimeType ?? "image/*") : - null; -#pragma warning restore S3358 - break; - - case AudioContent ac: -#pragma warning disable S3358 // Ternary operators should not be nested - aiContent = - ac.DataUri is not null ? new Microsoft.Extensions.AI.DataContent(ac.DataUri, ac.MimeType) : - ac.Uri is not null ? new Microsoft.Extensions.AI.UriContent(ac.Uri, ac.MimeType ?? "audio/*") : - null; -#pragma warning restore S3358 - break; - - case BinaryContent bc: -#pragma warning disable S3358 // Ternary operators should not be nested - aiContent = - bc.DataUri is not null ? new Microsoft.Extensions.AI.DataContent(bc.DataUri, bc.MimeType) : - bc.Uri is not null ? new Microsoft.Extensions.AI.UriContent(bc.Uri, bc.MimeType ?? "application/octet-stream") : - null; -#pragma warning restore S3358 - break; - - case SemanticKernel.FunctionCallContent fcc: - aiContent = new FunctionCallContent(fcc.Id ?? string.Empty, fcc.FunctionName, fcc.Arguments); - break; - - case SemanticKernel.FunctionResultContent frc: - aiContent = new FunctionResultContent(frc.CallId ?? string.Empty, frc.Result); - break; - } - - if (aiContent is not null) - { - aiContent.RawRepresentation = item.InnerContent; - aiContent.AdditionalProperties = item.Metadata is not null ? 
new(item.Metadata) : null; - - message.Contents.Add(aiContent); - } - } - - return message; - } -} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj index 9e61c42f356..c08667ff421 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj @@ -23,8 +23,6 @@ - - diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs index d8da55af77f..388ba1f1415 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Setup.cs @@ -5,7 +5,6 @@ using System.ClientModel; using Azure.AI.OpenAI; using Azure.Identity; -using Microsoft.SemanticKernel; namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; @@ -21,31 +20,17 @@ internal static ChatConfiguration CreateChatConfiguration() return new ChatConfiguration(chatClient); } - internal static Kernel CreateKernel() - { - AzureOpenAIClient azureOpenAIClient = GetAzureOpenAIClient(); - - Kernel kernel = - Kernel - .CreateBuilder() - .AddAzureOpenAIChatClient( - deploymentName: Settings.Current.DeploymentName, - azureOpenAIClient, - modelId: Settings.Current.ModelName) - .Build(); - - return kernel; - } - private static AzureOpenAIClient GetAzureOpenAIClient() { var endpoint = new Uri(Settings.Current.Endpoint); AzureOpenAIClientOptions options = new(); var credential = new ChainedTokenCredential(new AzureCliCredential(), new DefaultAzureCredential()); - AzureOpenAIClient azureClient = + 
+ AzureOpenAIClient azureOpenAIClient = OfflineOnly ? new AzureOpenAIClient(endpoint, new ApiKeyCredential("Bogus"), options) : new AzureOpenAIClient(endpoint, credential, options); - return azureClient; + + return azureOpenAIClient; } } From 1c4f5c88eec94d0b24f5d23651b82939d3e98ee5 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 10 Jun 2025 21:26:43 -0700 Subject: [PATCH 08/15] Introduce TaskAdherenceEvaluator --- .../RelevanceEvaluator.cs | 3 +- .../RetrievalEvaluator.cs | 3 +- .../TaskAdherenceEvaluator.cs | 267 ++++++++++++++++++ .../TaskAdherenceEvaluatorContext.cs | 90 ++++++ .../ToolCallAccuracyEvaluator.cs | 6 +- .../ToolCallAccuracyEvaluatorContext.cs | 2 +- 6 files changed, 363 insertions(+), 8 deletions(-) create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs index cf436961307..3946853a2a4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceEvaluator.cs @@ -75,8 +75,7 @@ public async ValueTask EvaluateAsync( var metric = new NumericMetric(RelevanceMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || - string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.TryGetUserRequest(out ChatMessage? 
userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) { metric.AddDiagnostics( EvaluationDiagnostic.Error( diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs index c6f59cceee8..cd2f94456e6 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RetrievalEvaluator.cs @@ -80,8 +80,7 @@ public async ValueTask EvaluateAsync( var metric = new NumericMetric(RetrievalMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || - string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) { metric.AddDiagnostics( EvaluationDiagnostic.Error( diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs new file mode 100644 index 00000000000..a51360ed68c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs @@ -0,0 +1,267 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// An that evaluates an AI system's effectiveness at adhering to the task assigned to it. 
+/// +/// +/// +/// measures how accurately an AI system adheres to the task assigned to it by +/// examining the alignment of the supplied response with instructions and definitions present in the conversation +/// history, the accuracy and clarity of the response, and the proper use of tool definitions supplied via +/// . +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +/// returns a that contains a score for 'Task +/// Adherence'. The score is a number between 1 and 5, with 1 indicating a poor score, and 5 indicating an excellent +/// score. +/// +/// +/// Note: is an AI-based evaluator that uses an AI model to perform its +/// evaluation. While the prompt that this evaluator uses to perform its evaluation is designed to be model-agnostic, +/// the performance of this prompt (and the resulting evaluation) can vary depending on the model used, and can be +/// especially poor when a smaller / local model is used. +/// +/// +/// The prompt that uses has been tested against (and tuned to work well with) +/// the following models. So, using this evaluator with a model from the following list is likely to produce the best +/// results. (The model to be used can be configured via .) +/// +/// +/// GPT-4o +/// +/// +[Experimental("AIEVAL001")] +public sealed class TaskAdherenceEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . 
+ /// + public static string TaskAdherenceMetricName => "Task Adherence"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [TaskAdherenceMetricName]; + + private static readonly ChatOptions _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + MaxOutputTokens = 800, + TopP = 1.0f, + PresencePenalty = 0.0f, + FrequencyPenalty = 0.0f, + ResponseFormat = ChatResponseFormat.Text + }; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + _ = Throw.IfNull(chatConfiguration); + + var metric = new NumericMetric(TaskAdherenceMetricName); + var result = new EvaluationResult(metric); + + if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); + + return result; + } + + if (string.IsNullOrWhiteSpace(modelResponse.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + + return result; + } + + TaskAdherenceEvaluatorContext? context = + additionalContext?.OfType().FirstOrDefault(); + + if (context is not null && context.ToolDefinitions.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(TaskAdherenceEvaluatorContext)} did not contain any {nameof(TaskAdherenceEvaluatorContext.ToolDefinitions)}.")); + + return result; + } + + var toolDefinitionNames = new HashSet(context?.ToolDefinitions.Select(td => td.Name) ?? 
[]); + IEnumerable toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + + if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) + { + if (context is null) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not supplied via {nameof(TaskAdherenceEvaluatorContext)}.")); + } + else + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not included in the supplied {nameof(TaskAdherenceEvaluatorContext)}.")); + } + + return result; + } + + List evaluationInstructions = GetEvaluationInstructions(messages, modelResponse, context); + + (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = + await TimingHelper.ExecuteWithTimingAsync(() => + chatConfiguration.ChatClient.GetResponseAsync( + evaluationInstructions, + _chatOptions, + cancellationToken)).ConfigureAwait(false); + + _ = metric.TryParseEvaluationResponseWithTags(evaluationResponse, evaluationDuration); + + if (context is not null) + { + metric.AddOrUpdateContext(context); + } + + metric.Interpretation = metric.InterpretScore(); + return result; + } + + private static List GetEvaluationInstructions( + IEnumerable messages, + ChatResponse modelResponse, + TaskAdherenceEvaluatorContext? context) + { + string renderedConversation = messages.RenderAsJson(); + string renderedModelResponse = modelResponse.RenderAsJson(); + string? renderedToolDefinitions = context?.ToolDefinitions.RenderAsJson(); + +#pragma warning disable S103 // Lines should not be too long + string systemPrompt = + $$""" + # Instruction + ## Context + ### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided. 
+ - **Definition**: Based on the provided query, response, and tool definitions, evaluate the agent's adherence to the assigned task. + - **Data**: Your input data includes query, response, and tool definitions. + - **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways. + + # Definition + + **Level 1: Fully Inadherent** + + **Definition:** + Response completely ignores instructions or deviates significantly + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Paris is a lovely city with a rich history. + + Explanation: This response completely misses the task by not providing any itinerary details. It offers a generic statement about Paris rather than a structured travel plan. + + + **Level 2: Barely Adherent** + + **Definition:** + Response partially aligns with instructions but has critical gaps. + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Spend your weekend visiting famous places in Paris. + + Explanation: While the response hints at visiting well-known sites, it is extremely vague and lacks specific details, such as which sites to visit or any order of activities, leaving major gaps in the instructions. + + + **Level 3: Moderately Adherent** + + **Definition:** + Response meets the core requirements but lacks precision or clarity. + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday. + + Explanation: This answer meets the basic requirement by naming a few key attractions and assigning them to specific days. However, it lacks additional context, such as timings, additional activities, or details to make the itinerary practical and clear. + + + **Level 4: Mostly Adherent** + + **Definition:** + Response is clear, accurate, and aligns with instructions with minor issues. 
+ + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** For a weekend in Paris, start Saturday with a morning visit to the Eiffel Tower, then head to the Louvre in the early afternoon. In the evening, enjoy a leisurely walk along the Seine. On Sunday, begin with a visit to Notre-Dame Cathedral, followed by exploring the art and cafés in Montmartre. This plan offers a mix of cultural visits and relaxing experiences. + + Explanation: This response is clear, structured, and provides a concrete itinerary with specific attractions and a suggested order of activities. It is accurate and useful, though it might benefit from a few more details like exact timings or restaurant suggestions to be perfect. + + + **Level 5: Fully Adherent** + + **Definition:** + Response is flawless, accurate, and follows instructions to the letter. + + **Example:** + **Query:** What is a recommended weekend itinerary in Paris? + **Response:** Here is a detailed weekend itinerary in Paris: + Saturday: + Morning: Begin your day with a visit to the Eiffel Tower to admire the views from the top. + Early Afternoon: Head to the Louvre for a guided tour of its most famous exhibits. + Late Afternoon: Take a relaxing walk along the Seine, stopping at local boutiques. + Evening: Enjoy dinner at a classic Parisian bistro near the river. + Sunday: + Morning: Visit the Notre-Dame Cathedral to explore its architecture and history. + Midday: Wander the charming streets of Montmartre, stopping by art galleries and cafés. + Afternoon: Finish your trip with a scenic boat tour on the Seine. + This itinerary balances cultural immersion, leisure, and local dining experiences, ensuring a well-rounded visit. + + Explanation: This response is comprehensive and meticulously follows the instructions. It provides detailed steps, timings, and a variety of activities that fully address the query, leaving no critical gaps. 
+ + # Data + Query: {{renderedConversation}} + Response: {{renderedModelResponse}} + Tool Definitions: {{renderedToolDefinitions}} + + # Tasks + ## Please provide your assessment Score for the previous answer. Your output should include the following information: + - **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:". + - **Explanation**: a very short explanation of why you think the input data should get that Score. + - **Score**: based on your previous analysis, provide your Score. The answer you give MUST be an integer score ("1", "2", ...) based on the categories of the definitions. + + ## Please provide your answers between the tags: your chain of thoughts, your explanation, your score. + # Output + """; +#pragma warning restore S103 + + List evaluationInstructions = [new ChatMessage(ChatRole.System, systemPrompt)]; + return evaluationInstructions; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs new file mode 100644 index 00000000000..3d54ed74dab --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluatorContext.cs @@ -0,0 +1,90 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// Contextual information that the uses to evaluate an AI system's +/// effectiveness at adhering to the task assigned to it. 
+/// +/// +/// +/// measures how accurately an AI system adheres to the task assigned to it by +/// examining the alignment of the supplied response with instructions and definitions present in the conversation +/// history, the accuracy and clarity of the response, and the proper use of tool definitions supplied via +/// . +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +[Experimental("AIEVAL001")] +public sealed class TaskAdherenceEvaluatorContext : EvaluationContext +{ + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public TaskAdherenceEvaluatorContext(IEnumerable toolDefinitions) + : base(name: TaskAdherenceContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) + { + ToolDefinitions = [.. toolDefinitions]; + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public TaskAdherenceEvaluatorContext(params AITool[] toolDefinitions) + : this(toolDefinitions as IEnumerable) + { + } + + /// + /// Gets the unique that is used for + /// . + /// + public static string TaskAdherenceContextName => "Tool Definitions (Task Adherence)"; + + /// + /// Gets the set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. 
+ /// + /// + /// + /// measures how accurately an AI system adheres to the task assigned to it by + /// examining the alignment of the supplied response with instructions and definitions present in the conversation + /// history, the accuracy and clarity of the response, and the proper use of tool definitions supplied via + /// . + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that are + /// defined as s. Any other definitions that are supplied via + /// will be ignored. + /// + /// + public IReadOnlyList ToolDefinitions { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs index 83781fb1f7b..b38ffa546e7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs @@ -86,8 +86,7 @@ public async ValueTask EvaluateAsync( var metric = new BooleanMetric(ToolCallAccuracyMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || - string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.TryGetUserRequest(out ChatMessage? 
userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) { metric.AddDiagnostics( EvaluationDiagnostic.Error( @@ -123,7 +122,8 @@ public async ValueTask EvaluateAsync( return result; } - IEnumerable toolCalls = modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + IEnumerable toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType(); if (!toolCalls.Any()) { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs index c3d81c0a05f..d25e586163a 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluatorContext.cs @@ -66,7 +66,7 @@ public ToolCallAccuracyEvaluatorContext(params AITool[] toolDefinitions) /// /// Gets the unique that is used for - /// . + /// . /// public static string ToolCallAccuracyContextName => "Tool Definitions (Tool Call Accuracy)"; From 3f37c9efd16a4d7c4ca38d0861f387d8ee4f87df Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Tue, 10 Jun 2025 21:28:12 -0700 Subject: [PATCH 09/15] Update tests --- .../AgentQualityEvaluatorTests.cs | 88 +++++++++++++++++-- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs index 27c928f05c7..61d12f07d8b 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs @@ -60,11 +60,12 @@ static AgentQualityEvaluatorTests() string usesContext = $"Feature: Context"; IEvaluator toolCallAccuracyEvaluator = new ToolCallAccuracyEvaluator(); + IEvaluator 
taskAdherenceEvaluator = new TaskAdherenceEvaluator(); _agentQualityReportingConfiguration = DiskBasedReportingConfiguration.Create( storageRootPath: Settings.Current.StorageRootPath, - evaluators: [], + evaluators: [taskAdherenceEvaluator], chatConfiguration: chatConfigurationWithToolCalling, executionName: Constants.Version, tags: [version, date, projectName, testClass, provider, model, temperature]); @@ -72,13 +73,64 @@ static AgentQualityEvaluatorTests() _needsContextReportingConfiguration = DiskBasedReportingConfiguration.Create( storageRootPath: Settings.Current.StorageRootPath, - evaluators: [toolCallAccuracyEvaluator], + evaluators: [toolCallAccuracyEvaluator, taskAdherenceEvaluator], chatConfiguration: chatConfigurationWithToolCalling, executionName: Constants.Version, tags: [version, date, projectName, testClass, provider, model, temperature, usesContext]); } } + [ConditionalFact] + public async Task ToolDefinitionsAreNotNeededAndNotPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _agentQualityReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNotNeededAndNotPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationWithoutToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); + + EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? 
_)); + } + + [ConditionalFact] + public async Task ToolDefinitionsAreNotNeededButPassed() + { + SkipIfNotConfigured(); + + await using ScenarioRun scenarioRun = + await _agentQualityReportingConfiguration.CreateScenarioRunAsync( + scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNotNeededButPassed)}"); + + (IEnumerable messages, ChatResponse response) = + await GetConversationWithoutToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); + + var toolDefinitionsForTaskAdherenceEvaluator = + new TaskAdherenceEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + + EvaluationResult result = + await scenarioRun.EvaluateAsync( + messages, + response, + additionalContext: [toolDefinitionsForTaskAdherenceEvaluator]); + + Assert.False( + result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), + string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); + + Assert.Single(result.Metrics); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? 
_)); + } + [ConditionalFact] public async Task ToolDefinitionsAreNeededButNotPassed() { @@ -89,7 +141,7 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNeededButNotPassed)}"); (IEnumerable messages, ChatResponse response) = - await GetConversationAsync(scenarioRun.ChatConfiguration!.ChatClient); + await GetConversationWithToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); EvaluationResult result = await scenarioRun.EvaluateAsync(messages, response); @@ -97,8 +149,9 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); - Assert.Single(result.Metrics); + Assert.Equal(2, result.Metrics.Count); Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? 
_)); } [ConditionalFact] @@ -111,27 +164,46 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(AgentQualityEvaluatorTests)}.{nameof(ToolDefinitionsAreNeededAndPassed)}"); (IEnumerable messages, ChatResponse response) = - await GetConversationAsync(scenarioRun.ChatConfiguration!.ChatClient); + await GetConversationWithToolsAsync(scenarioRun.ChatConfiguration!.ChatClient); var toolDefinitionsForToolCallAccuracyEvaluator = new ToolCallAccuracyEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + var toolDefinitionsForTaskAdherenceEvaluator = + new TaskAdherenceEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + EvaluationResult result = await scenarioRun.EvaluateAsync( messages, response, - additionalContext: [toolDefinitionsForToolCallAccuracyEvaluator]); + additionalContext: [ + toolDefinitionsForToolCallAccuracyEvaluator, + toolDefinitionsForTaskAdherenceEvaluator]); Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); - Assert.Single(result.Metrics); + Assert.Equal(2, result.Metrics.Count); Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); + Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? 
_)); + } + + private static async Task<(IEnumerable messages, ChatResponse response)> + GetConversationWithoutToolsAsync(IChatClient chatClient) + { + List messages = + [ + "You are a friendly and helpful assistant that can answer questions.".ToSystemMessage(), + "Hi, could you help me figure out the correct pronunciation for the word rendezvous?".ToUserMessage() + ]; + + ChatResponse response = await chatClient.GetResponseAsync(messages, _chatOptions); + return (messages, response); } private static async Task<(IEnumerable messages, ChatResponse response)> - GetConversationAsync(IChatClient chatClient) + GetConversationWithToolsAsync(IChatClient chatClient) { List messages = [ From 0d832ff13761ddc47150477c983546429ba31fbb Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 11 Jun 2025 15:04:44 -0700 Subject: [PATCH 10/15] Make tool definitions more complete --- .../AIToolExtensions.cs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs index ec6a35d59d3..3dbc8211416 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/AIToolExtensions.cs @@ -21,11 +21,21 @@ internal static string RenderAsJson( foreach (AIFunction function in toolDefinitions.OfType()) { - JsonNode? 
functionJsonNode = JsonNode.Parse(function.JsonSchema.GetRawText()); - if (functionJsonNode is not null) + JsonNode functionJsonNode = + new JsonObject + { + ["name"] = function.Name, + ["description"] = function.Description, + ["functionSchema"] = JsonNode.Parse(function.JsonSchema.GetRawText()), + }; + + if (function.ReturnJsonSchema is not null) { - toolDefinitionsJsonArray.Add(functionJsonNode); + functionJsonNode["functionReturnValueSchema"] = + JsonNode.Parse(function.ReturnJsonSchema.Value.GetRawText()); } + + toolDefinitionsJsonArray.Add(functionJsonNode); } string renderedToolDefinitions = toolDefinitionsJsonArray.ToJsonString(options); From 093d2e1487aab62f8e8a0b1b5081aa9264f6f5f9 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 11 Jun 2025 15:05:33 -0700 Subject: [PATCH 11/15] Simplify serialization to JsonNode --- .../ChatMessageExtensions.cs | 5 ++--- .../ChatResponseExtensions.cs | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs index 2ad8e113f71..cfad90f85f4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatMessageExtensions.cs @@ -18,12 +18,11 @@ internal static string RenderAsJson(this IEnumerable messages, Json foreach (ChatMessage message in messages) { - string messageJsonString = - JsonSerializer.Serialize( + JsonNode? messageJsonNode = + JsonSerializer.SerializeToNode( message, AIJsonUtilities.DefaultOptions.GetTypeInfo(typeof(ChatMessage))); - JsonNode? 
messageJsonNode = JsonNode.Parse(messageJsonString); if (messageJsonNode is not null) { messagesJsonArray.Add(messageJsonNode); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs index fa4f30fd45e..c579caa7cb1 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatResponseExtensions.cs @@ -33,12 +33,11 @@ internal static string RenderToolCallsAndResultsAsJson( Type contentType = content is FunctionCallContent ? typeof(FunctionCallContent) : typeof(FunctionResultContent); - string toolCallOrResultJsonString = - JsonSerializer.Serialize( + JsonNode? toolCallOrResultJsonNode = + JsonSerializer.SerializeToNode( content, AIJsonUtilities.DefaultOptions.GetTypeInfo(contentType)); - JsonNode? toolCallOrResultJsonNode = JsonNode.Parse(toolCallOrResultJsonString); if (toolCallOrResultJsonNode is not null) { toolCallsAndResultsJsonArray.Add(toolCallOrResultJsonNode); From eb33b31a8d6e0670f9aaadc124157cc279abd040 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Thu, 12 Jun 2025 00:35:09 -0700 Subject: [PATCH 12/15] Introduce IntentResolutionEvaluator --- .../IntentResolutionEvaluator.cs | 406 ++++++++++++++++++ .../IntentResolutionEvaluatorContext.cs | 88 ++++ .../IntentResolutionRating.cs | 68 +++ .../JsonSerialization/SerializerContext.cs | 14 + ...nceTruthAndCompletenessEvaluator.Rating.cs | 72 ---- ...CompletenessEvaluator.SerializerContext.cs | 16 - .../RelevanceTruthAndCompletenessEvaluator.cs | 38 +- .../RelevanceTruthAndCompletenessRating.cs | 65 +++ .../AgentQualityEvaluatorTests.cs | 23 +- ...levanceTruthAndCompletenessRatingTests.cs} | 29 +- 10 files changed, 684 insertions(+), 135 deletions(-) create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs create mode 100644 
src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs delete mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs delete mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs rename test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/{RelevanceTruthAndCompletenessEvaluatorRatingTests.cs => RelevanceTruthAndCompletenessRatingTests.cs} (80%) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs new file mode 100644 index 00000000000..43ddb5e8334 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs @@ -0,0 +1,406 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; +using Microsoft.Extensions.AI.Evaluation.Utilities; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// An that evaluates an AI system's effectiveness at identifying and resolving user intent. 
+/// +/// +/// +/// evaluates an AI system's effectiveness at identifying and resolving user +/// intent based on the supplied conversation history and the tool definitions supplied via +/// . +/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +/// returns a that contains a score for 'Intent +/// Resolution'. The score is a number between 1 and 5, with 1 indicating a poor score, and 5 indicating an excellent +/// score. +/// +/// +/// Note: is an AI-based evaluator that uses an AI model to perform its +/// evaluation. While the prompt that this evaluator uses to perform its evaluation is designed to be model-agnostic, +/// the performance of this prompt (and the resulting evaluation) can vary depending on the model used, and can be +/// especially poor when a smaller / local model is used. +/// +/// +/// The prompt that uses has been tested against (and tuned to work well with) +/// the following models. So, using this evaluator with a model from the following list is likely to produce the best +/// results. (The model to be used can be configured via .) +/// +/// +/// GPT-4o +/// +/// +[Experimental("AIEVAL001")] +public sealed class IntentResolutionEvaluator : IEvaluator +{ + /// + /// Gets the of the returned by + /// . + /// + public static string IntentResolutionMetricName => "Intent Resolution"; + + /// + public IReadOnlyCollection EvaluationMetricNames { get; } = [IntentResolutionMetricName]; + + private static readonly ChatOptions _chatOptions = + new ChatOptions + { + Temperature = 0.0f, + MaxOutputTokens = 800, + TopP = 1.0f, + PresencePenalty = 0.0f, + FrequencyPenalty = 0.0f, + ResponseFormat = ChatResponseFormat.Json + }; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? 
additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse); + _ = Throw.IfNull(chatConfiguration); + + var metric = new NumericMetric(IntentResolutionMetricName); + var result = new EvaluationResult(metric); + + if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); + + return result; + } + + if (string.IsNullOrWhiteSpace(modelResponse.Text)) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + + return result; + } + + IntentResolutionEvaluatorContext? context = + additionalContext?.OfType().FirstOrDefault(); + + if (context is not null && context.ToolDefinitions.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied {nameof(IntentResolutionEvaluatorContext)} did not contain any {nameof(IntentResolutionEvaluatorContext.ToolDefinitions)}.")); + + return result; + } + + var toolDefinitionNames = new HashSet(context?.ToolDefinitions.Select(td => td.Name) ?? 
[]); + IEnumerable toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType(); + + if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) + { + if (context is null) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not supplied via {nameof(IntentResolutionEvaluatorContext)}.")); + } + else + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation contained calls to tools that were not included in the supplied {nameof(IntentResolutionEvaluatorContext)}.")); + } + + return result; + } + + List evaluationInstructions = GetEvaluationInstructions(messages, modelResponse, context); + + (ChatResponse evaluationResponse, TimeSpan evaluationDuration) = + await TimingHelper.ExecuteWithTimingAsync(() => + chatConfiguration.ChatClient.GetResponseAsync( + evaluationInstructions, + _chatOptions, + cancellationToken)).ConfigureAwait(false); + + if (context is not null) + { + metric.AddOrUpdateContext(context); + } + + await ParseEvaluationResponseAsync( + metric, + evaluationResponse, + evaluationDuration, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + + return result; + } + + private static List GetEvaluationInstructions( + IEnumerable messages, + ChatResponse modelResponse, + IntentResolutionEvaluatorContext? context) + { + const string SystemPrompt = + "You are an expert in evaluating the quality of a RESPONSE from an intelligent assistant based on provided definition and Data."; + + List evaluationInstructions = [new ChatMessage(ChatRole.System, SystemPrompt)]; + + string renderedConversation = messages.RenderAsJson(); + string renderedModelResponse = modelResponse.RenderAsJson(); + string? 
renderedToolDefinitions = context?.ToolDefinitions.RenderAsJson(); + +#pragma warning disable S103 // Lines should not be too long + string evaluationPrompt = + $$""" + # Goal + Your goal is to assess the quality of the RESPONSE of an assistant in relation to a QUERY from a user, specifically focusing on + the assistant's ability to understand and resolve the user intent expressed in the QUERY. There is also a field for tool definitions + describing the functions, if any, that are accessible to the agent and that the agent may invoke in the RESPONSE if necessary. + + There are two components to intent resolution: + - Intent Understanding: The extent to which the agent accurately discerns the user's underlying need or inquiry. + - Response Resolution: The degree to which the agent's response is comprehensive, relevant, and adequately addresses the user's request. + + Note that the QUERY can either be a string with a user request or an entire conversation history including previous requests and responses from the assistant. + In this case, the assistant's response should be evaluated in the context of the entire conversation but the focus should be on the last intent. + + # Data + QUERY: {{renderedConversation}} + RESPONSE: {{renderedModelResponse}} + TOOL_DEFINITIONS: {{renderedToolDefinitions}} + + + # Ratings + ## [Score: 1] (Response completely unrelated to user intent) + **Definition:** The agent's response does not address the query at all. + + **Example:** + **Query:** How do I bake a chocolate cake? + **Response:** The latest smartphone models have incredible features and performance. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The agent's response is entirely off-topic, discussing smartphones instead of providing any information about baking a chocolate cake." 
+ "conversation_has_intent": true, + "agent_perceived_intent": "discussion about smartphone features", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": false, + "intent_resolved": false, + "resolution_score": 1, + } + + + ## [Score: 2] (Response minimally relates to user intent) + **Definition:** The response shows a token attempt to address the query by mentioning a relevant keyword or concept, but it provides almost no useful or actionable information. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Chocolate cake involves some ingredients. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "While the response mentions 'ingredients' related to a chocolate cake, it barely addresses the process or any detailed steps, leaving the query unresolved." + "conversation_has_intent": true, + "agent_perceived_intent": "mention of ingredients", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": false, + "intent_resolved": false, + "resolution_score": 2, + } + + + ## [Score: 3] (Response partially addresses the user intent but lacks complete details) + **Definition:** The response provides a basic idea related to the query by mentioning a few relevant elements, but it omits several key details and specifics needed for fully resolving the user's query. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Preheat your oven and mix the ingredients before baking the cake. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The response outlines a minimal process (preheating and mixing) but omits critical details like ingredient measurements, baking time, and temperature specifics, resulting in only a partial resolution of the query." 
+ "conversation_has_intent": true, + "agent_perceived_intent": "basic baking process", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": false, + "resolution_score": 3, + } + + + ## [Score: 4] (Response addresses the user intent with moderate accuracy but has minor inaccuracies or omissions) + **Definition:** The response offers a moderately detailed answer that includes several specific elements relevant to the query, yet it still lacks some finer details or complete information. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Preheat your oven to 350°F. In a bowl, combine flour, sugar, cocoa, eggs, and milk, mix well, and bake for about 30 minutes. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The response includes specific steps and ingredients, indicating a clear intent to provide a recipe. However, it lacks detailed measurements and finer instructions, which makes the resolution only moderately complete." + "conversation_has_intent": true, + "agent_perceived_intent": "provide a basic chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": false, + "resolution_score": 4, + } + + + ## [Score: 5] (Response directly addresses the user intent and fully resolves it) + **Definition:** The response provides a complete, detailed, and accurate answer that fully resolves the user's query with all necessary information and precision. + + **Example input:** + **Query:** How do I bake a chocolate cake? + **Response:** Preheat your oven to 350°F (175°C) and grease a 9-inch round cake pan. In a large bowl, sift together 1 ¾ cups all-purpose flour, 1 cup sugar, ¾ cup unsweetened cocoa powder, 1 ½ tsp baking powder, and 1 tsp salt. In another bowl, beat 2 large eggs with 1 cup milk, ½ cup vegetable oil, and 2 tsp vanilla extract. 
Combine the wet ingredients with the dry ingredients, then gradually mix in 1 cup boiling water until smooth. Pour the batter into the prepared pan and bake for 30-35 minutes or until a toothpick inserted into the center comes out clean. Allow the cake to cool before serving. + **Tool Definitions:** [] + + **Expected output** + { + "explanation": "The response delivers a complete and precise recipe with detailed instructions and measurements, fully addressing the user's query about baking a chocolate cake." + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + + + # Task + + Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above. + Your output should consist only of a JSON object, as provided in the examples, that has the following keys: + - explanation: a string that explains why you think the input Data should get this resolution_score. 
+ - conversation_has_intent: true or false + - agent_perceived_intent: a string that describes the intent the agent perceived from the user query + - actual_user_intent: a string that describes the actual user intent + - correct_intent_detected: true or false + - intent_resolved: true or false + - resolution_score: an integer between 1 and 5 that represents the resolution score + + + # Output + """; +#pragma warning restore S103 + + evaluationInstructions.Add(new ChatMessage(ChatRole.User, evaluationPrompt)); + + return evaluationInstructions; + } + + private static async ValueTask ParseEvaluationResponseAsync( + NumericMetric metric, + ChatResponse evaluationResponse, + TimeSpan evaluationDuration, + ChatConfiguration chatConfiguration, + CancellationToken cancellationToken) + { + IntentResolutionRating rating; + + string evaluationResponseText = evaluationResponse.Text.Trim(); + if (string.IsNullOrEmpty(evaluationResponseText)) + { + rating = IntentResolutionRating.Inconclusive; + metric.AddDiagnostics( + EvaluationDiagnostic.Error("The model failed to produce a valid evaluation response.")); + } + else + { + try + { + rating = IntentResolutionRating.FromJson(evaluationResponseText); + } + catch (JsonException) + { + try + { + string repairedJson = + await JsonOutputFixer.RepairJsonAsync( + evaluationResponseText, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + + if (string.IsNullOrWhiteSpace(repairedJson)) + { + rating = IntentResolutionRating.Inconclusive; + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $""" + Failed to repair the following response from the model and parse the score for '{IntentResolutionMetricName}': + {evaluationResponseText} + """)); + } + else + { + rating = IntentResolutionRating.FromJson(repairedJson); + } + } + catch (JsonException ex) + { + rating = IntentResolutionRating.Inconclusive; + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $""" + Failed to repair the following response from the model and 
parse the score for '{IntentResolutionMetricName}': + {evaluationResponseText} + {ex} + """)); + } + } + } + + UpdateMetric(); + + void UpdateMetric() + { + metric.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); + metric.Value = rating.ResolutionScore; + metric.Interpretation = metric.InterpretScore(); + metric.Reason = rating.Explanation; + + if (!string.IsNullOrWhiteSpace(rating.AgentPerceivedIntent)) + { + metric.AddOrUpdateMetadata("agent_perceived_intent", rating.AgentPerceivedIntent!); + } + + if (!string.IsNullOrWhiteSpace(rating.ActualUserIntent)) + { + metric.AddOrUpdateMetadata("actual_user_intent", rating.ActualUserIntent!); + } + + metric.AddOrUpdateMetadata("conversation_has_intent", rating.ConversationHasIntent.ToString()); + metric.AddOrUpdateMetadata("correct_intent_detected", rating.CorrectIntentDetected.ToString()); + metric.AddOrUpdateMetadata("intent_resolved", rating.IntentResolved.ToString()); + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs new file mode 100644 index 00000000000..c8dcbc996b7 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluatorContext.cs @@ -0,0 +1,88 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +/// +/// Contextual information that the uses to evaluate an AI system's +/// effectiveness at identifying and resolving user intent. +/// +/// +/// +/// evaluates an AI system's effectiveness at identifying and resolving user +/// intent based on the supplied conversation history and the tool definitions supplied via +/// . 
+/// +/// +/// Note that at the moment, only supports evaluating calls to tools that are +/// defined as s. Any other definitions that are supplied via +/// will be ignored. +/// +/// +[Experimental("AIEVAL001")] +public sealed class IntentResolutionEvaluatorContext : EvaluationContext +{ + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public IntentResolutionEvaluatorContext(IEnumerable toolDefinitions) + : base(name: IntentResolutionContextName, contents: [new TextContent(toolDefinitions.RenderAsJson())]) + { + ToolDefinitions = [.. toolDefinitions]; + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// The set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. Any other definitions will be ignored. + /// + /// + public IntentResolutionEvaluatorContext(params AITool[] toolDefinitions) + : this(toolDefinitions as IEnumerable) + { + } + + /// + /// Gets the unique that is used for + /// . + /// + public static string IntentResolutionContextName => "Tool Definitions (Intent Resolution)"; + + /// + /// Gets the set of tool definitions (see ) that were used when generating the model + /// response that is being evaluated. + /// + /// + /// + /// evaluates an AI system's effectiveness at identifying and resolving user + /// intent based on the supplied conversation history and the tool definitions supplied via + /// . + /// + /// + /// Note that at the moment, only supports evaluating calls to tools that + /// are defined as s. 
Any other definitions that are supplied via + /// will be ignored. + /// + /// + public IReadOnlyList ToolDefinitions { get; } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs new file mode 100644 index 00000000000..ab28f9e6483 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs @@ -0,0 +1,68 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal sealed class IntentResolutionRating +{ + public static IntentResolutionRating Inconclusive { get; } = new IntentResolutionRating(); + + [JsonPropertyName("resolution_score")] + public int ResolutionScore { get; } + + [JsonPropertyName("explanation")] + public string? Explanation { get; } + + [JsonPropertyName("agent_perceived_intent")] + public string? AgentPerceivedIntent { get; } + + [JsonPropertyName("actual_user_intent")] + public string? ActualUserIntent { get; } + + [JsonPropertyName("conversation_has_intent")] + public bool ConversationHasIntent { get; } + + [JsonPropertyName("correct_intent_detected")] + public bool CorrectIntentDetected { get; } + + [JsonPropertyName("intent_resolved")] + public bool IntentResolved { get; } + + private const int MinValue = 1; + private const int MaxValue = 5; + + public bool IsInconclusive => ResolutionScore < MinValue || ResolutionScore > MaxValue; + + [JsonConstructor] +#pragma warning disable S107 // Methods should not have too many parameters + public IntentResolutionRating( + int resolutionScore = -1, + string? 
explanation = null, + string? agentPerceivedIntent = null, + string? actualUserIntent = null, + bool conversationHasIntent = false, + bool correctIntentDetected = false, + bool intentResolved = false) +#pragma warning restore S107 + { + ResolutionScore = resolutionScore; + Explanation = explanation; + AgentPerceivedIntent = agentPerceivedIntent; + ActualUserIntent = actualUserIntent; + ConversationHasIntent = conversationHasIntent; + CorrectIntentDetected = correctIntentDetected; + IntentResolved = intentResolved; + } + + public static IntentResolutionRating FromJson(string jsonResponse) + { + ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); + return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.IntentResolutionRating)!; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs new file mode 100644 index 00000000000..588b3d23a7e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/JsonSerialization/SerializerContext.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; + +[JsonSourceGenerationOptions( + WriteIndented = true, + AllowTrailingCommas = true, + PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)] +[JsonSerializable(typeof(RelevanceTruthAndCompletenessRating))] +[JsonSerializable(typeof(IntentResolutionRating))] +internal sealed partial class SerializerContext : JsonSerializerContext; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs deleted file mode 100644 index 8ff913fefe7..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System; -using System.Text.Json; -using System.Text.Json.Serialization; -using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; - -namespace Microsoft.Extensions.AI.Evaluation.Quality; - -public partial class RelevanceTruthAndCompletenessEvaluator -{ - internal sealed class Rating - { - public static Rating Inconclusive { get; } = new Rating(relevance: -1, truth: -1, completeness: -1); - - public int Relevance { get; } - public string? RelevanceReasoning { get; } - public string[] RelevanceReasons { get; } = []; - - public int Truth { get; } - public string? TruthReasoning { get; } - public string[] TruthReasons { get; } = []; - - public int Completeness { get; } - public string? CompletenessReasoning { get; } - public string[] CompletenessReasons { get; } = []; - - public string? Error { get; } - - private const int MinValue = 1; - private const int MaxValue = 5; - -#pragma warning disable S1067 // Expressions should not be too complex. 
- public bool IsInconclusive => - Error is not null || - Relevance < MinValue || Relevance > MaxValue || - Truth < MinValue || Truth > MaxValue || - Completeness < MinValue || Completeness > MaxValue; -#pragma warning restore S1067 - - public Rating(int relevance, int truth, int completeness, string? error = null) - { - (Relevance, Truth, Completeness, Error) = (relevance, truth, completeness, error); - } - - [JsonConstructor] -#pragma warning disable S107 // Methods should not have too many parameters. - public Rating( - int relevance, string? relevanceReasoning, string[] relevanceReasons, - int truth, string? truthReasoning, string[] truthReasons, - int completeness, string? completenessReasoning, string[] completenessReasons, - string? error = null) -#pragma warning restore S107 - { - (Relevance, RelevanceReasoning, RelevanceReasons, - Truth, TruthReasoning, TruthReasons, - Completeness, CompletenessReasoning, CompletenessReasons, - Error) = - (relevance, relevanceReasoning, relevanceReasons ?? [], - truth, truthReasoning, truthReasons ?? [], - completeness, completenessReasoning, completenessReasons ?? [], - error); - } - - public static Rating FromJson(string jsonResponse) - { - ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); - return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.Rating)!; - } - } -} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs deleted file mode 100644 index 211213d4873..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Text.Json.Serialization; - -namespace Microsoft.Extensions.AI.Evaluation.Quality; - -public partial class RelevanceTruthAndCompletenessEvaluator -{ - [JsonSourceGenerationOptions( - WriteIndented = true, - AllowTrailingCommas = true, - PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)] - [JsonSerializable(typeof(Rating))] - internal sealed partial class SerializerContext : JsonSerializerContext; -} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index 95cc64fd3dc..4eb41b15361 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -43,7 +43,7 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; /// Tutorial: Evaluate a model's response with response caching and reporting. 
/// [Experimental("AIEVAL001")] -public sealed partial class RelevanceTruthAndCompletenessEvaluator : IEvaluator +public sealed class RelevanceTruthAndCompletenessEvaluator : IEvaluator { /// /// Gets the of the returned by @@ -271,12 +271,12 @@ private static async ValueTask ParseEvaluationResponseAsync( ChatConfiguration chatConfiguration, CancellationToken cancellationToken) { - Rating rating; + RelevanceTruthAndCompletenessRating rating; string evaluationResponseText = evaluationResponse.Text.Trim(); if (string.IsNullOrEmpty(evaluationResponseText)) { - rating = Rating.Inconclusive; + rating = RelevanceTruthAndCompletenessRating.Inconclusive; result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error("The model failed to produce a valid evaluation response.")); } @@ -284,7 +284,7 @@ private static async ValueTask ParseEvaluationResponseAsync( { try { - rating = Rating.FromJson(evaluationResponseText); + rating = RelevanceTruthAndCompletenessRating.FromJson(evaluationResponseText); } catch (JsonException) { @@ -298,26 +298,26 @@ await JsonOutputFixer.RepairJsonAsync( if (string.IsNullOrWhiteSpace(repairedJson)) { - rating = Rating.Inconclusive; + rating = RelevanceTruthAndCompletenessRating.Inconclusive; result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( $""" - Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.: + Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}': {evaluationResponseText} """)); } else { - rating = Rating.FromJson(repairedJson); + rating = RelevanceTruthAndCompletenessRating.FromJson(repairedJson); } } catch (JsonException ex) { - rating = Rating.Inconclusive; + rating = RelevanceTruthAndCompletenessRating.Inconclusive; result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( $""" - Failed to repair the following response 
from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.: + Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}': {evaluationResponseText} {ex} """)); @@ -336,10 +336,7 @@ void UpdateResult() relevance.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); relevance.Value = rating.Relevance; relevance.Interpretation = relevance.InterpretScore(); - if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning)) - { - relevance.Reason = rating.RelevanceReasoning!; - } + relevance.Reason = rating.RelevanceReasoning; if (rating.RelevanceReasons.Any()) { @@ -351,10 +348,7 @@ void UpdateResult() truth.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); truth.Value = rating.Truth; truth.Interpretation = truth.InterpretScore(); - if (!string.IsNullOrWhiteSpace(rating.TruthReasoning)) - { - truth.Reason = rating.TruthReasoning!; - } + truth.Reason = rating.TruthReasoning; if (rating.TruthReasons.Any()) { @@ -366,21 +360,13 @@ void UpdateResult() completeness.AddOrUpdateChatMetadata(evaluationResponse, evaluationDuration); completeness.Value = rating.Completeness; completeness.Interpretation = completeness.InterpretScore(); - if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning)) - { - completeness.Reason = rating.CompletenessReasoning!; - } + completeness.Reason = rating.CompletenessReasoning; if (rating.CompletenessReasons.Any()) { string value = string.Join(Separator, rating.CompletenessReasons); completeness.AddOrUpdateMetadata(name: Rationales, value); } - - if (!string.IsNullOrWhiteSpace(rating.Error)) - { - result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!)); - } } } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs new file mode 100644 index 00000000000..f8a8a23e892 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs @@ -0,0 +1,65 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +internal sealed class RelevanceTruthAndCompletenessRating +{ + public static RelevanceTruthAndCompletenessRating Inconclusive { get; } + = new RelevanceTruthAndCompletenessRating(relevance: -1, truth: -1, completeness: -1); + + public int Relevance { get; } + public string? RelevanceReasoning { get; } + public string[] RelevanceReasons { get; } = []; + + public int Truth { get; } + public string? TruthReasoning { get; } + public string[] TruthReasons { get; } = []; + + public int Completeness { get; } + public string? CompletenessReasoning { get; } + public string[] CompletenessReasons { get; } = []; + + private const int MinValue = 1; + private const int MaxValue = 5; + +#pragma warning disable S1067 // Expressions should not be too complex. + public bool IsInconclusive => + Relevance < MinValue || Relevance > MaxValue || + Truth < MinValue || Truth > MaxValue || + Completeness < MinValue || Completeness > MaxValue; +#pragma warning restore S1067 + + public RelevanceTruthAndCompletenessRating(int relevance, int truth, int completeness) + { + (Relevance, Truth, Completeness) = (relevance, truth, completeness); + } + + [JsonConstructor] +#pragma warning disable S107 // Methods should not have too many parameters. + public RelevanceTruthAndCompletenessRating( + int relevance, string? 
relevanceReasoning, string[] relevanceReasons, + int truth, string? truthReasoning, string[] truthReasons, + int completeness, string? completenessReasoning, string[] completenessReasons) +#pragma warning restore S107 + { + (Relevance, RelevanceReasoning, RelevanceReasons, + Truth, TruthReasoning, TruthReasons, + Completeness, CompletenessReasoning, CompletenessReasons) = + (relevance, relevanceReasoning, relevanceReasons ?? [], + truth, truthReasoning, truthReasons ?? [], + completeness, completenessReasoning, completenessReasons ?? []); + } + + public static RelevanceTruthAndCompletenessRating FromJson(string jsonResponse) + { + ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); + return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.RelevanceTruthAndCompletenessRating)!; + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs index 61d12f07d8b..b1bd36e5d15 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs @@ -61,11 +61,12 @@ static AgentQualityEvaluatorTests() IEvaluator toolCallAccuracyEvaluator = new ToolCallAccuracyEvaluator(); IEvaluator taskAdherenceEvaluator = new TaskAdherenceEvaluator(); + IEvaluator intentResolutionEvaluator = new IntentResolutionEvaluator(); _agentQualityReportingConfiguration = DiskBasedReportingConfiguration.Create( storageRootPath: Settings.Current.StorageRootPath, - evaluators: [taskAdherenceEvaluator], + evaluators: [taskAdherenceEvaluator, intentResolutionEvaluator], chatConfiguration: chatConfigurationWithToolCalling, executionName: Constants.Version, tags: [version, date, projectName, testClass, provider, model, temperature]); @@ -73,7 +74,7 @@ static 
AgentQualityEvaluatorTests() _needsContextReportingConfiguration = DiskBasedReportingConfiguration.Create( storageRootPath: Settings.Current.StorageRootPath, - evaluators: [toolCallAccuracyEvaluator, taskAdherenceEvaluator], + evaluators: [toolCallAccuracyEvaluator, taskAdherenceEvaluator, intentResolutionEvaluator], chatConfiguration: chatConfigurationWithToolCalling, executionName: Constants.Version, tags: [version, date, projectName, testClass, provider, model, temperature, usesContext]); @@ -98,8 +99,9 @@ await _agentQualityReportingConfiguration.CreateScenarioRunAsync( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); - Assert.Single(result.Metrics); + Assert.Equal(2, result.Metrics.Count); Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? _)); } [ConditionalFact] @@ -127,8 +129,9 @@ await scenarioRun.EvaluateAsync( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); - Assert.Single(result.Metrics); + Assert.Equal(2, result.Metrics.Count); Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? _)); } [ConditionalFact] @@ -149,9 +152,10 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? 
[]).Select(d => d.ToString()))); - Assert.Equal(2, result.Metrics.Count); + Assert.Equal(3, result.Metrics.Count); Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? _)); } [ConditionalFact] @@ -172,21 +176,26 @@ await _needsContextReportingConfiguration.CreateScenarioRunAsync( var toolDefinitionsForTaskAdherenceEvaluator = new TaskAdherenceEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + var toolDefinitionsForIntentResolutionEvaluator = + new IntentResolutionEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + EvaluationResult result = await scenarioRun.EvaluateAsync( messages, response, additionalContext: [ toolDefinitionsForToolCallAccuracyEvaluator, - toolDefinitionsForTaskAdherenceEvaluator]); + toolDefinitionsForTaskAdherenceEvaluator, + toolDefinitionsForIntentResolutionEvaluator]); Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning), string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()))); - Assert.Equal(2, result.Metrics.Count); + Assert.Equal(3, result.Metrics.Count); Assert.True(result.TryGet(ToolCallAccuracyEvaluator.ToolCallAccuracyMetricName, out BooleanMetric? _)); Assert.True(result.TryGet(TaskAdherenceEvaluator.TaskAdherenceMetricName, out NumericMetric? _)); + Assert.True(result.TryGet(IntentResolutionEvaluator.IntentResolutionMetricName, out NumericMetric? 
_)); } private static async Task<(IEnumerable messages, ChatResponse response)> diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessEvaluatorRatingTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs similarity index 80% rename from test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessEvaluatorRatingTests.cs rename to test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs index db7cc6e3a26..ed668695740 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessEvaluatorRatingTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs @@ -5,12 +5,13 @@ using System.Linq; using System.Text.Json; using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; using Xunit; namespace Microsoft.Extensions.AI.Evaluation.Tests; [Experimental("AIEVAL001")] -public class RelevanceTruthAndCompletenessEvaluatorRatingTests +public class RelevanceTruthAndCompletenessRatingTests { [Fact] public void JsonIsValid() @@ -19,7 +20,7 @@ public void JsonIsValid() {"relevance": 1, "truth": 5, "completeness": 4} """; - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + var rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(1, rating.Relevance); Assert.Equal(5, rating.Truth); @@ -44,7 +45,7 @@ public void JsonIsSurroundedWithMarkdownSyntax() """; - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + var rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(1, rating.Relevance); Assert.Equal(5, rating.Truth); @@ -69,7 +70,7 @@ public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() """; - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + var 
rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(1, rating.Relevance); Assert.Equal(5, rating.Truth); @@ -86,7 +87,7 @@ public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() [Fact] public void JsonCanBeRoundTripped() { - var rating = new RelevanceTruthAndCompletenessEvaluator.Rating( + var rating = new RelevanceTruthAndCompletenessRating( relevance: 1, relevanceReasoning: "The response is not relevant to the request.", relevanceReasons: ["Reason 1", "Reason 2"], @@ -97,8 +98,8 @@ public void JsonCanBeRoundTripped() completenessReasoning: "The response is mostly complete.", completenessReasons: ["Reason 1", "Reason 2"]); - string json = JsonSerializer.Serialize(rating, RelevanceTruthAndCompletenessEvaluator.SerializerContext.Default.Rating); - var deserialized = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.RelevanceTruthAndCompletenessRating); + var deserialized = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(rating.Relevance, deserialized.Relevance); Assert.Equal(rating.RelevanceReasoning, deserialized.RelevanceReasoning); Assert.True(rating.RelevanceReasons.SequenceEqual(deserialized.RelevanceReasons)); @@ -115,27 +116,27 @@ public void JsonCanBeRoundTripped() public void JsonContainsInconclusiveMetrics() { string json = """{"relevance": -1, "truth": 4, "completeness": 7}"""; - var rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + var rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); json = """{"relevance": 0, "truth": -1, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); json = """{"relevance": 0, "truth": 4, "completeness": -5}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + 
rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); json = """{"relevance": 10, "truth": 4, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); json = """{"relevance": 0, "truth": 5, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); json = """{"relevance": 1, "truth": 4, "completeness": 6}"""; - rating = RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json); + rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); } @@ -143,6 +144,6 @@ public void JsonContainsInconclusiveMetrics() public void JsonContainsErrors() { string json = """{"relevance": 0, "truth": 2 ;"completeness": 3}"""; - Assert.Throws(() => RelevanceTruthAndCompletenessEvaluator.Rating.FromJson(json)); + Assert.Throws(() => RelevanceTruthAndCompletenessRating.FromJson(json)); } } From fd908edfc391bb49f304a0121cacbf53b122607f Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Thu, 12 Jun 2025 15:14:53 -0700 Subject: [PATCH 13/15] Add more tests --- .../IntentResolutionRating.cs | 45 ++- .../RelevanceTruthAndCompletenessRating.cs | 57 ++- .../IntentResolutionRatingTests.cs | 324 ++++++++++++++++ ...elevanceTruthAndCompletenessRatingTests.cs | 367 ++++++++++++++---- 4 files changed, 692 insertions(+), 101 deletions(-) create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs index ab28f9e6483..a1d9b0ef90e 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs 
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionRating.cs @@ -11,28 +11,43 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; internal sealed class IntentResolutionRating { - public static IntentResolutionRating Inconclusive { get; } = new IntentResolutionRating(); + public static IntentResolutionRating Inconclusive { get; } = + new IntentResolutionRating( + resolutionScore: 0, + explanation: string.Empty, + agentPerceivedIntent: string.Empty, + actualUserIntent: string.Empty, + conversationHasIntent: false, + correctIntentDetected: false, + intentResolved: false); + [JsonRequired] [JsonPropertyName("resolution_score")] - public int ResolutionScore { get; } + public int ResolutionScore { get; set; } + [JsonRequired] [JsonPropertyName("explanation")] - public string? Explanation { get; } + public string Explanation { get; set; } + [JsonRequired] [JsonPropertyName("agent_perceived_intent")] - public string? AgentPerceivedIntent { get; } + public string AgentPerceivedIntent { get; set; } + [JsonRequired] [JsonPropertyName("actual_user_intent")] - public string? ActualUserIntent { get; } + public string ActualUserIntent { get; set; } + [JsonRequired] [JsonPropertyName("conversation_has_intent")] - public bool ConversationHasIntent { get; } + public bool ConversationHasIntent { get; set; } + [JsonRequired] [JsonPropertyName("correct_intent_detected")] - public bool CorrectIntentDetected { get; } + public bool CorrectIntentDetected { get; set; } + [JsonRequired] [JsonPropertyName("intent_resolved")] - public bool IntentResolved { get; } + public bool IntentResolved { get; set; } private const int MinValue = 1; private const int MaxValue = 5; @@ -42,13 +57,13 @@ internal sealed class IntentResolutionRating [JsonConstructor] #pragma warning disable S107 // Methods should not have too many parameters public IntentResolutionRating( - int resolutionScore = -1, - string? explanation = null, - string? agentPerceivedIntent = null, - string? 
actualUserIntent = null, - bool conversationHasIntent = false, - bool correctIntentDetected = false, - bool intentResolved = false) + int resolutionScore, + string explanation, + string agentPerceivedIntent, + string actualUserIntent, + bool conversationHasIntent, + bool correctIntentDetected, + bool intentResolved) #pragma warning restore S107 { ResolutionScore = resolutionScore; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs index f8a8a23e892..83c76a1825e 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessRating.cs @@ -11,20 +11,44 @@ namespace Microsoft.Extensions.AI.Evaluation.Quality; internal sealed class RelevanceTruthAndCompletenessRating { - public static RelevanceTruthAndCompletenessRating Inconclusive { get; } - = new RelevanceTruthAndCompletenessRating(relevance: -1, truth: -1, completeness: -1); + public static RelevanceTruthAndCompletenessRating Inconclusive { get; } = + new RelevanceTruthAndCompletenessRating( + relevance: 0, + relevanceReasoning: string.Empty, + relevanceReasons: [], + truth: 0, + truthReasoning: string.Empty, + truthReasons: [], + completeness: 0, + completenessReasoning: string.Empty, + completenessReasons: []); - public int Relevance { get; } - public string? RelevanceReasoning { get; } - public string[] RelevanceReasons { get; } = []; + [JsonRequired] + public int Relevance { get; set; } - public int Truth { get; } - public string? TruthReasoning { get; } - public string[] TruthReasons { get; } = []; + [JsonRequired] + public string RelevanceReasoning { get; set; } - public int Completeness { get; } - public string? 
CompletenessReasoning { get; } - public string[] CompletenessReasons { get; } = []; + [JsonRequired] + public string[] RelevanceReasons { get; set; } + + [JsonRequired] + public int Truth { get; set; } + + [JsonRequired] + public string TruthReasoning { get; set; } + + [JsonRequired] + public string[] TruthReasons { get; set; } + + [JsonRequired] + public int Completeness { get; set; } + + [JsonRequired] + public string CompletenessReasoning { get; set; } + + [JsonRequired] + public string[] CompletenessReasons { get; set; } private const int MinValue = 1; private const int MaxValue = 5; @@ -36,17 +60,12 @@ internal sealed class RelevanceTruthAndCompletenessRating Completeness < MinValue || Completeness > MaxValue; #pragma warning restore S1067 - public RelevanceTruthAndCompletenessRating(int relevance, int truth, int completeness) - { - (Relevance, Truth, Completeness) = (relevance, truth, completeness); - } - [JsonConstructor] #pragma warning disable S107 // Methods should not have too many parameters. public RelevanceTruthAndCompletenessRating( - int relevance, string? relevanceReasoning, string[] relevanceReasons, - int truth, string? truthReasoning, string[] truthReasons, - int completeness, string? completenessReasoning, string[] completenessReasons) + int relevance, string relevanceReasoning, string[] relevanceReasons, + int truth, string truthReasoning, string[] truthReasons, + int completeness, string completenessReasoning, string[] completenessReasons) #pragma warning restore S107 { (Relevance, RelevanceReasoning, RelevanceReasons, diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs new file mode 100644 index 00000000000..da839387e20 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/IntentResolutionRatingTests.cs @@ -0,0 +1,324 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +using System.Text.Json; +using Microsoft.Extensions.AI.Evaluation.Quality; +using Microsoft.Extensions.AI.Evaluation.Quality.JsonSerialization; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.Tests; + +public class IntentResolutionRatingTests +{ + [Fact] + public void JsonIsValid() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonIsSurroundedWithMarkdownSyntax() + { + string json = + """ + + ``` + { + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + ``` + + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + 
Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() + { + string json = + """ + + ```json + { + "resolution_score": 5, + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true + } + ``` + + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonCanBeRoundTripped() + { + IntentResolutionRating rating = + new IntentResolutionRating( + resolutionScore: 1, + explanation: "explanation", + agentPerceivedIntent: "perceived intent", + actualUserIntent: "actual intent", + conversationHasIntent: false, + correctIntentDetected: true, + intentResolved: true); + 
+ string json = JsonSerializer.Serialize(rating, SerializerContext.Default.IntentResolutionRating); + IntentResolutionRating deserialized = IntentResolutionRating.FromJson(json); + + Assert.Equal(rating.ResolutionScore, deserialized.ResolutionScore); + Assert.Equal(rating.Explanation, deserialized.Explanation); + Assert.Equal(rating.AgentPerceivedIntent, deserialized.AgentPerceivedIntent); + Assert.Equal(rating.ActualUserIntent, deserialized.ActualUserIntent); + Assert.Equal(rating.ConversationHasIntent, deserialized.ConversationHasIntent); + Assert.Equal(rating.CorrectIntentDetected, deserialized.CorrectIntentDetected); + Assert.Equal(rating.IntentResolved, deserialized.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void InconclusiveJsonCanBeRoundTripped() + { + IntentResolutionRating rating = IntentResolutionRating.Inconclusive; + + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.IntentResolutionRating); + IntentResolutionRating deserialized = IntentResolutionRating.FromJson(json); + + Assert.Equal(rating.ResolutionScore, deserialized.ResolutionScore); + Assert.Equal(rating.Explanation, deserialized.Explanation); + Assert.Equal(rating.AgentPerceivedIntent, deserialized.AgentPerceivedIntent); + Assert.Equal(rating.ActualUserIntent, deserialized.ActualUserIntent); + Assert.Equal(rating.ConversationHasIntent, deserialized.ConversationHasIntent); + Assert.Equal(rating.CorrectIntentDetected, deserialized.CorrectIntentDetected); + Assert.Equal(rating.IntentResolved, deserialized.IntentResolved); + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithNegativeScoreIsInconclusive() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a 
chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": -1 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithZeroScoreIsInconclusive() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 0 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithExcessivelyHighScoreIsInconclusive() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 200 + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithAdditionalHallucinatedPropertyIsProcessedCorrectly() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "hallucinated_property": "Some hallucinated text.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + 
"resolution_score": 5, + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithDuplicatePropertyUsesLastValue() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "explanation": "Duplicate explanation.", + "conversation_has_intent": true, + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + """; + + IntentResolutionRating rating = IntentResolutionRating.FromJson(json); + + Assert.Equal(5, rating.ResolutionScore); + Assert.Equal("Duplicate explanation.", rating.Explanation); + Assert.Equal("provide a comprehensive chocolate cake recipe", rating.AgentPerceivedIntent); + Assert.Equal("bake a chocolate cake", rating.ActualUserIntent); + Assert.True(rating.ConversationHasIntent); + Assert.True(rating.CorrectIntentDetected); + Assert.True(rating.IntentResolved); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithSemicolonsInsteadOfCommasThrowsException() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake."; + "conversation_has_intent": true; + "agent_perceived_intent": 
"provide a comprehensive chocolate cake recipe"; + "actual_user_intent": "bake a chocolate cake"; + "correct_intent_detected": true; + "intent_resolved": true; + "resolution_score": 5 + } + """; + + Assert.Throws(() => IntentResolutionRating.FromJson(json)); + } + + [Fact] + public void JsonWithMissingPropertiesThrowsException() + { + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "intent_resolved": true, + "resolution_score": 5 + } + """; + + Assert.Throws(() => IntentResolutionRating.FromJson(json)); + } + + [Fact] + public void JsonWithIncorrectPropertyValueTypeThrowsException() + { + // Incorrect property value (string instead of boolean for conversation_has_intent). + string json = + """ + { + "explanation": "The response delivers a complete and precise recipe, fully addressing the user's query about baking a chocolate cake.", + "conversation_has_intent": "A string value", + "agent_perceived_intent": "provide a comprehensive chocolate cake recipe", + "actual_user_intent": "bake a chocolate cake", + "correct_intent_detected": true, + "intent_resolved": true, + "resolution_score": 5, + } + """; + + Assert.Throws(() => IntentResolutionRating.FromJson(json)); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs index ed668695740..6be1a8ba142 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Tests/RelevanceTruthAndCompletenessRatingTests.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
-using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Text.Json; using Microsoft.Extensions.AI.Evaluation.Quality; @@ -10,49 +9,76 @@ namespace Microsoft.Extensions.AI.Evaluation.Tests; -[Experimental("AIEVAL001")] public class RelevanceTruthAndCompletenessRatingTests { [Fact] public void JsonIsValid() { - string json = """ - {"relevance": 1, "truth": 5, "completeness": 4} - """; + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; - var rating = RelevanceTruthAndCompletenessRating.FromJson(json); + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(1, rating.Relevance); - Assert.Equal(5, rating.Truth); - Assert.Equal(4, rating.Completeness); - Assert.Null(rating.RelevanceReasoning); - Assert.Null(rating.TruthReasoning); - Assert.Null(rating.CompletenessReasoning); - Assert.Empty(rating.RelevanceReasons); - Assert.Empty(rating.TruthReasons); - Assert.Empty(rating.CompletenessReasons); + Assert.Equal(1, rating.Truth); + Assert.Equal(1, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Equal(3, rating.TruthReasons.Length); + 
Assert.Contains("truth_reason_incorrect_information", rating.TruthReasons); + Assert.Contains("truth_reason_outdated_information", rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Equal(2, rating.CompletenessReasons.Length); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.Contains("completeness_reason_genericsolution_missingcode", rating.CompletenessReasons); Assert.False(rating.IsInconclusive); } [Fact] public void JsonIsSurroundedWithMarkdownSyntax() { - string json = """ + string json = + """ - ``` - {"relevance": 1, "truth": 5, "completeness": 4} - ``` + ``` + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": [], + "truth": 4, + "truthReasoning": "The reason for the truth score", + "truthReasons": [], + "completeness": 5, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": [] + } + ``` - """; + """; - var rating = RelevanceTruthAndCompletenessRating.FromJson(json); + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(1, rating.Relevance); - Assert.Equal(5, rating.Truth); - Assert.Equal(4, rating.Completeness); - Assert.Null(rating.RelevanceReasoning); - Assert.Null(rating.TruthReasoning); - Assert.Null(rating.CompletenessReasoning); + Assert.Equal(4, rating.Truth); + Assert.Equal(5, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); Assert.Empty(rating.RelevanceReasons); Assert.Empty(rating.TruthReasons); Assert.Empty(rating.CompletenessReasons); @@ -62,44 +88,60 @@ public void JsonIsSurroundedWithMarkdownSyntax() [Fact] public void JsonIsSurroundedWithMarkdownSyntaxWithJsonPrefix() { - string 
json = """ + string json = + """ - ```json - {"relevance": 1, "truth": 5, "completeness": 4} - ``` + ```json + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 3, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_misleading_incorrectforintent"], + "completeness": 2, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution"], + } + ``` - """; + """; - var rating = RelevanceTruthAndCompletenessRating.FromJson(json); + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.Equal(1, rating.Relevance); - Assert.Equal(5, rating.Truth); - Assert.Equal(4, rating.Completeness); - Assert.Null(rating.RelevanceReasoning); - Assert.Null(rating.TruthReasoning); - Assert.Null(rating.CompletenessReasoning); - Assert.Empty(rating.RelevanceReasons); - Assert.Empty(rating.TruthReasons); - Assert.Empty(rating.CompletenessReasons); + Assert.Equal(3, rating.Truth); + Assert.Equal(2, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Single(rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Single(rating.CompletenessReasons); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); Assert.False(rating.IsInconclusive); } [Fact] public void JsonCanBeRoundTripped() { - var rating = new RelevanceTruthAndCompletenessRating( - relevance: 1, - relevanceReasoning: "The response is not relevant to the request.", - 
relevanceReasons: ["Reason 1", "Reason 2"], - truth: 5, - truthReasoning: "The response is mostly true.", - truthReasons: ["Reason 1", "Reason 2"], - completeness: 4, - completenessReasoning: "The response is mostly complete.", - completenessReasons: ["Reason 1", "Reason 2"]); + RelevanceTruthAndCompletenessRating rating = + new RelevanceTruthAndCompletenessRating( + relevance: 1, + relevanceReasoning: "The response is not relevant to the request.", + relevanceReasons: ["Reason 1", "Reason 2"], + truth: 5, + truthReasoning: "The response is mostly true.", + truthReasons: ["Reason 1", "Reason 2"], + completeness: 4, + completenessReasoning: "The response is mostly complete.", + completenessReasons: ["Reason 1", "Reason 2"]); string json = JsonSerializer.Serialize(rating, SerializerContext.Default.RelevanceTruthAndCompletenessRating); - var deserialized = RelevanceTruthAndCompletenessRating.FromJson(json); + RelevanceTruthAndCompletenessRating deserialized = RelevanceTruthAndCompletenessRating.FromJson(json); + Assert.Equal(rating.Relevance, deserialized.Relevance); Assert.Equal(rating.RelevanceReasoning, deserialized.RelevanceReasoning); Assert.True(rating.RelevanceReasons.SequenceEqual(deserialized.RelevanceReasons)); @@ -113,37 +155,228 @@ public void JsonCanBeRoundTripped() } [Fact] - public void JsonContainsInconclusiveMetrics() + public void InconclusiveJsonCanBeRoundTripped() { - string json = """{"relevance": -1, "truth": 4, "completeness": 7}"""; - var rating = RelevanceTruthAndCompletenessRating.FromJson(json); - Assert.True(rating.IsInconclusive); + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.Inconclusive; - json = """{"relevance": 0, "truth": -1, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessRating.FromJson(json); - Assert.True(rating.IsInconclusive); + string json = JsonSerializer.Serialize(rating, SerializerContext.Default.RelevanceTruthAndCompletenessRating); + RelevanceTruthAndCompletenessRating 
deserialized = RelevanceTruthAndCompletenessRating.FromJson(json); - json = """{"relevance": 0, "truth": 4, "completeness": -5}"""; - rating = RelevanceTruthAndCompletenessRating.FromJson(json); + Assert.Equal(rating.Relevance, deserialized.Relevance); + Assert.Equal(rating.RelevanceReasoning, deserialized.RelevanceReasoning); + Assert.True(rating.RelevanceReasons.SequenceEqual(deserialized.RelevanceReasons)); + Assert.Equal(rating.Truth, deserialized.Truth); + Assert.Equal(rating.TruthReasoning, deserialized.TruthReasoning); + Assert.True(rating.TruthReasons.SequenceEqual(deserialized.TruthReasons)); + Assert.Equal(rating.Completeness, deserialized.Completeness); + Assert.Equal(rating.CompletenessReasoning, deserialized.CompletenessReasoning); + Assert.True(rating.CompletenessReasons.SequenceEqual(deserialized.CompletenessReasons)); Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithNegativeScoreIsInconclusive() + { + string json = + """ + { + "relevance": -1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); - json = """{"relevance": 10, "truth": 4, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithZeroScoreIsInconclusive() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": 
["relevance_reason_distant_topic"], + "truth": 0, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); - json = """{"relevance": 0, "truth": 5, "completeness": 3}"""; - rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); + } + + [Fact] + public void JsonWithExcessivelyHighScoreIsInconclusive() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 100, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); - json = """{"relevance": 1, "truth": 4, "completeness": 6}"""; - rating = RelevanceTruthAndCompletenessRating.FromJson(json); Assert.True(rating.IsInconclusive); } [Fact] - public void JsonContainsErrors() + public void JsonWithAdditionalHallucinatedPropertyIsProcessedCorrectly() + { + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "hallucinatedProperty": "Some hallucinated text", + "relevanceReasons": ["relevance_reason_distant_topic"], + 
"truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(1, rating.Truth); + Assert.Equal(1, rating.Completeness); + Assert.Equal("The reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Equal(3, rating.TruthReasons.Length); + Assert.Contains("truth_reason_incorrect_information", rating.TruthReasons); + Assert.Contains("truth_reason_outdated_information", rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Equal(2, rating.CompletenessReasons.Length); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.Contains("completeness_reason_genericsolution_missingcode", rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithDuplicatePropertyUsesLastValue() { - string json = """{"relevance": 0, "truth": 2 ;"completeness": 3}"""; + string json = + """ + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasoning": "Duplicate reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + 
"truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + + RelevanceTruthAndCompletenessRating rating = RelevanceTruthAndCompletenessRating.FromJson(json); + + Assert.Equal(1, rating.Relevance); + Assert.Equal(1, rating.Truth); + Assert.Equal(1, rating.Completeness); + Assert.Equal("Duplicate reason for the relevance score", rating.RelevanceReasoning); + Assert.Equal("The reason for the truth score", rating.TruthReasoning); + Assert.Equal("The reason for the completeness score", rating.CompletenessReasoning); + Assert.Single(rating.RelevanceReasons); + Assert.Equal("relevance_reason_distant_topic", rating.RelevanceReasons[0]); + Assert.Equal(3, rating.TruthReasons.Length); + Assert.Contains("truth_reason_incorrect_information", rating.TruthReasons); + Assert.Contains("truth_reason_outdated_information", rating.TruthReasons); + Assert.Contains("truth_reason_misleading_incorrectforintent", rating.TruthReasons); + Assert.Equal(2, rating.CompletenessReasons.Length); + Assert.Contains("completeness_reason_no_solution", rating.CompletenessReasons); + Assert.Contains("completeness_reason_genericsolution_missingcode", rating.CompletenessReasons); + Assert.False(rating.IsInconclusive); + } + + [Fact] + public void JsonWithSemicolonsInsteadOfCommasThrowsException() + { + string json = + """ + { + "relevance": 1; + "relevanceReasoning": "The reason for the relevance score"; + "relevanceReasons": ["relevance_reason_distant_topic"]; + "truth": 1; + "truthReasoning": "The reason for the truth score"; + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"]; + "completeness": 1; + "completenessReasoning": 
"The reason for the completeness score"; + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"]; + } + """; + + Assert.Throws(() => RelevanceTruthAndCompletenessRating.FromJson(json)); + } + + [Fact] + public void JsonWithMissingPropertiesThrowsException() + { + string json = + """ + { + "relevance": 1, + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + } + """; + + Assert.Throws(() => RelevanceTruthAndCompletenessRating.FromJson(json)); + } + + [Fact] + public void JsonWithIncorrectPropertyValueTypeThrowsException() + { + // Incorrect property value (integer instead of string for relevanceReasoning). + string json = + """ + { + "relevance": 1, + "relevanceReasoning": 6, + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"], + } + """; + Assert.Throws(() => RelevanceTruthAndCompletenessRating.FromJson(json)); } } From ac33a8703203300cd147b61c300d0c9a73498fff Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Fri, 13 Jun 2025 00:05:00 -0700 Subject: [PATCH 14/15] Improve validation for inputs --- .../IntentResolutionEvaluator.cs | 9 ++++---- .../TaskAdherenceEvaluator.cs | 9 ++++---- .../ToolCallAccuracyEvaluator.cs | 22 ++++++------------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs index 43ddb5e8334..4f19d308f10 100644 --- 
a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/IntentResolutionEvaluator.cs @@ -85,19 +85,20 @@ public async ValueTask<EvaluationResult> EvaluateAsync( var metric = new NumericMetric(IntentResolutionMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage? userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.Any()) { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); + "The conversation history supplied for evaluation did not include any messages.")); return result; } - if (string.IsNullOrWhiteSpace(modelResponse.Text)) + if (!modelResponse.Messages.Any()) { metric.AddDiagnostics( - EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation did not include any messages.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs index a51360ed68c..cf4ba4073ee 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/TaskAdherenceEvaluator.cs @@ -84,19 +84,20 @@ public async ValueTask<EvaluationResult> EvaluateAsync( var metric = new NumericMetric(TaskAdherenceMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage?
userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.Any()) { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); + "The conversation history supplied for evaluation did not include any messages.")); return result; } - if (string.IsNullOrWhiteSpace(modelResponse.Text)) + if (!modelResponse.Messages.Any()) { metric.AddDiagnostics( - EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + EvaluationDiagnostic.Error( + $"The {nameof(modelResponse)} supplied for evaluation did not include any messages.")); return result; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs index b38ffa546e7..5b3631bf598 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ToolCallAccuracyEvaluator.cs @@ -86,19 +86,22 @@ public async ValueTask<EvaluationResult> EvaluateAsync( var metric = new BooleanMetric(ToolCallAccuracyMetricName); var result = new EvaluationResult(metric); - if (!messages.TryGetUserRequest(out ChatMessage?
userRequest) || string.IsNullOrWhiteSpace(userRequest.Text)) + if (!messages.Any()) { metric.AddDiagnostics( EvaluationDiagnostic.Error( - $"The {nameof(messages)} supplied for evaluation did not contain a user request as the last message.")); + "The conversation history supplied for evaluation did not include any messages.")); return result; } - if (string.IsNullOrWhiteSpace(modelResponse.Text)) + IEnumerable<FunctionCallContent> toolCalls = + modelResponse.Messages.SelectMany(m => m.Contents).OfType<FunctionCallContent>(); + + if (!toolCalls.Any()) { metric.AddDiagnostics( - EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty.")); + EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation did not contain any tool calls (i.e., {nameof(FunctionCallContent)}s).")); return result; } @@ -122,17 +125,6 @@ public async ValueTask<EvaluationResult> EvaluateAsync( return result; } - IEnumerable<FunctionCallContent> toolCalls = - modelResponse.Messages.SelectMany(m => m.Contents).OfType<FunctionCallContent>(); - - if (!toolCalls.Any()) - { - metric.AddDiagnostics( - EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation did not contain any tool calls (i.e., {nameof(FunctionCallContent)}s.")); - - return result; - } - var toolDefinitionNames = new HashSet<string>(context.ToolDefinitions.Select(td => td.Name)); if (toolCalls.Any(t => !toolDefinitionNames.Contains(t.Name))) From cf1717fcf3ffc9ddaf1655554400efee5970c565 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Fri, 13 Jun 2025 00:21:16 -0700 Subject: [PATCH 15/15] Add missing context --- .../AgentQualityEvaluatorTests.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs index b1bd36e5d15..134d6a50f32 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/AgentQualityEvaluatorTests.cs @@ -119,11 +119,14 @@ await _agentQualityReportingConfiguration.CreateScenarioRunAsync( var toolDefinitionsForTaskAdherenceEvaluator = new TaskAdherenceEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + var toolDefinitionsForIntentResolution = + new IntentResolutionEvaluatorContext(toolDefinitions: _chatOptionsWithTools.Tools!); + EvaluationResult result = await scenarioRun.EvaluateAsync( messages, response, - additionalContext: [toolDefinitionsForTaskAdherenceEvaluator]); + additionalContext: [toolDefinitionsForTaskAdherenceEvaluator, toolDefinitionsForIntentResolution]); Assert.False( result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),