dotnet · shyamnamboodiripad · Apr 17, 2025 · Apr 17, 2025 · Apr 16, 2025 · Apr 17, 2025
@@ -90,7 +90,7 @@ private static async Task<int> Main(string[] args)
 
         var formatOpt =
             new Option<ReportCommand.Format>(
-                "--format",
+                ["-f", "--format"],
                 () => ReportCommand.Format.html,
                 "Specify the format for the generated report.");
 

@@ -1,7 +1,6 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
@@ -35,7 +34,7 @@ public abstract class ChatConversationEvaluator : IEvaluator
     protected virtual string? SystemPrompt => null;
 
     /// <inheritdoc/>
-    public async ValueTask<EvaluationResult> EvaluateAsync(
+    public virtual async ValueTask<EvaluationResult> EvaluateAsync(
         IEnumerable<ChatMessage> messages,
         ChatResponse modelResponse,
         ChatConfiguration? chatConfiguration = null,
@@ -49,7 +48,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(
 
         if (string.IsNullOrWhiteSpace(modelResponse.Text))
         {
-            result.AddDiagnosticToAllMetrics(
+            result.AddDiagnosticsToAllMetrics(
                 EvaluationDiagnostic.Error(
                     "Evaluation failed because the model response supplied for evaluation was null or empty."));
 
@@ -73,7 +72,7 @@ void OnTokenBudgetExceeded()
                     EvaluationDiagnostic.Error(
                         $"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded.");
 
-                result.AddDiagnosticToAllMetrics(tokenBudgetExceeded);
+                result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded);
             }
 
             if (!string.IsNullOrWhiteSpace(SystemPrompt))
@@ -176,7 +175,7 @@ await PerformEvaluationAsync(
         if (inputTokenLimit > 0 && ignoredMessagesCount > 0)
         {
 #pragma warning disable S103 // Lines should not be too long
-            result.AddDiagnosticToAllMetrics(
+            result.AddDiagnosticsToAllMetrics(
                 EvaluationDiagnostic.Warning(
                     $"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens."));
 #pragma warning restore S103

@@ -49,6 +49,28 @@ public sealed class EquivalenceEvaluator : SingleNumericMetricEvaluator
     /// <inheritdoc/>
     protected override bool IgnoresHistory => true;
 
+    /// <inheritdoc/>
+    /// <remarks>
+    /// Once the base evaluation has completed, the value returned by <c>GetContents()</c> on the supplied
+    /// <see cref="EquivalenceEvaluatorContext"/> is passed to <c>AddOrUpdateContextInAllMetrics</c> under the
+    /// key "Ground Truth".
+    /// </remarks>
+    public override async ValueTask<EvaluationResult> EvaluateAsync(
+        IEnumerable<ChatMessage> messages,
+        ChatResponse modelResponse,
+        ChatConfiguration? chatConfiguration = null,
+        IEnumerable<EvaluationContext>? additionalContext = null,
+        CancellationToken cancellationToken = default)
+    {
+        EvaluationResult evaluationResult =
+            await base
+                .EvaluateAsync(messages, modelResponse, chatConfiguration, additionalContext, cancellationToken)
+                .ConfigureAwait(false);
+
+        // The lookup deliberately happens after the base call; it throws when no matching context was supplied.
+        EquivalenceEvaluatorContext groundTruthContext = GetRelevantContext(additionalContext);
+        evaluationResult.AddOrUpdateContextInAllMetrics("Ground Truth", groundTruthContext.GetContents());
+
+        return evaluationResult;
+    }
+
     /// <inheritdoc/>
     protected override async ValueTask<string> RenderEvaluationPromptAsync(
         ChatMessage? userRequest,
@@ -66,18 +88,8 @@ userRequest is not null
                 ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
                 : string.Empty;
 
-        string groundTruth;
-
-        if (additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault()
-                is EquivalenceEvaluatorContext context)
-        {
-            groundTruth = context.GroundTruth;
-        }
-        else
-        {
-            throw new InvalidOperationException(
-                $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
-        }
+        EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext);
+        string groundTruth = context.GroundTruth;
 
         string prompt =
             $$"""
@@ -149,4 +161,16 @@ alleviating stress and augmenting general mood.
 
         return prompt;
     }
+
+    /// <summary>
+    /// Finds the first <see cref="EquivalenceEvaluatorContext"/> in <paramref name="additionalContext"/>.
+    /// </summary>
+    /// <param name="additionalContext">The context objects supplied for the evaluation; may be <see langword="null"/>.</param>
+    /// <returns>The first <see cref="EquivalenceEvaluatorContext"/> found.</returns>
+    /// <exception cref="InvalidOperationException">
+    /// <paramref name="additionalContext"/> is <see langword="null"/> or contains no
+    /// <see cref="EquivalenceEvaluatorContext"/>.
+    /// </exception>
+    private static EquivalenceEvaluatorContext GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
+    {
+        EquivalenceEvaluatorContext? match =
+            additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault();
+
+        return match
+            ?? throw new InvalidOperationException(
+                $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
+    }
 }
@@ -49,6 +49,30 @@ public sealed class GroundednessEvaluator : SingleNumericMetricEvaluator
     /// <inheritdoc/>
     protected override bool IgnoresHistory => false;
 
+    /// <inheritdoc/>
+    /// <remarks>
+    /// Once the base evaluation has completed, and only when a <see cref="GroundednessEvaluatorContext"/> was
+    /// supplied, the value returned by its <c>GetContents()</c> is passed to
+    /// <c>AddOrUpdateContextInAllMetrics</c> under the key "Grounding Context".
+    /// </remarks>
+    public override async ValueTask<EvaluationResult> EvaluateAsync(
+        IEnumerable<ChatMessage> messages,
+        ChatResponse modelResponse,
+        ChatConfiguration? chatConfiguration = null,
+        IEnumerable<EvaluationContext>? additionalContext = null,
+        CancellationToken cancellationToken = default)
+    {
+        EvaluationResult evaluationResult =
+            await base
+                .EvaluateAsync(messages, modelResponse, chatConfiguration, additionalContext, cancellationToken)
+                .ConfigureAwait(false);
+
+        // The grounding context is optional here: when it is absent the result is returned as-is.
+        GroundednessEvaluatorContext? groundingContext = GetRelevantContext(additionalContext);
+        if (groundingContext is not null)
+        {
+            evaluationResult.AddOrUpdateContextInAllMetrics("Grounding Context", groundingContext.GetContents());
+        }
+
+        return evaluationResult;
+    }
+
     /// <inheritdoc/>
     protected override async ValueTask<string> RenderEvaluationPromptAsync(
         ChatMessage? userRequest,
@@ -68,8 +92,7 @@ userRequest is not null
 
         var builder = new StringBuilder();
 
-        if (additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault()
-                is GroundednessEvaluatorContext context)
+        if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context)
         {
             _ = builder.Append(context.GroundingContext);
             _ = builder.AppendLine();
@@ -162,4 +185,15 @@ is not French.
 
         return prompt;
     }
+
+    /// <summary>
+    /// Finds the first <see cref="GroundednessEvaluatorContext"/> in <paramref name="additionalContext"/>, if any.
+    /// </summary>
+    /// <param name="additionalContext">The context objects supplied for the evaluation; may be <see langword="null"/>.</param>
+    /// <returns>
+    /// The first <see cref="GroundednessEvaluatorContext"/> found, or <see langword="null"/> when
+    /// <paramref name="additionalContext"/> is <see langword="null"/> or contains none.
+    /// </returns>
+    private static GroundednessEvaluatorContext? GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
+        => additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault();
 }
@@ -145,7 +145,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
             if (string.IsNullOrEmpty(evaluationResponseText))
             {
                 rating = Rating.Inconclusive;
-                result.AddDiagnosticToAllMetrics(
+                result.AddDiagnosticsToAllMetrics(
                     EvaluationDiagnostic.Error(
                         "Evaluation failed because the model failed to produce a valid evaluation response."));
             }
@@ -168,7 +168,7 @@ await JsonOutputFixer.RepairJsonAsync(
                         if (string.IsNullOrEmpty(repairedJson))
                         {
                             rating = Rating.Inconclusive;
-                            result.AddDiagnosticToAllMetrics(
+                            result.AddDiagnosticsToAllMetrics(
                                 EvaluationDiagnostic.Error(
                                     $"""
                                     Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
@@ -183,7 +183,7 @@ await JsonOutputFixer.RepairJsonAsync(
                     catch (JsonException ex)
                     {
                         rating = Rating.Inconclusive;
-                        result.AddDiagnosticToAllMetrics(
+                        result.AddDiagnosticsToAllMetrics(
                             EvaluationDiagnostic.Error(
                                 $"""
                                 Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
@@ -281,7 +281,7 @@ void UpdateResult()
 
             if (!string.IsNullOrWhiteSpace(rating.Error))
             {
-                result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
+                result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
             }
         }
     }

@@ -105,7 +105,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
 
             if (string.IsNullOrEmpty(evaluationResponseText))
             {
-                metric.AddDiagnostic(
+                metric.AddDiagnostics(
                     EvaluationDiagnostic.Error(
                         "Evaluation failed because the model failed to produce a valid evaluation response."));
             }
@@ -115,7 +115,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
             }
             else
             {
-                metric.AddDiagnostic(
+                metric.AddDiagnostics(
                     EvaluationDiagnostic.Error(
                         $"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));
             }

@@ -8,11 +8,13 @@ import ReactMarkdown from "react-markdown";
 import { useReportContext } from "./ReportContext";
 import { useStyles } from "./Styles";
 import { ChatMessageDisplay, isTextContent, isImageContent } from "./Summary";
+import type { MetricType } from "./MetricCard";
 
-export const ConversationDetails = ({ messages, model, usage }: {
+export const ConversationDetails = ({ messages, model, usage, selectedMetric }: {
     messages: ChatMessageDisplay[];
     model?: string;
     usage?: UsageDetails;
+    selectedMetric?: MetricType | null;
 }) => {
     const classes = useStyles();
     const [isExpanded, setIsExpanded] = useState(true);
@@ -59,7 +61,27 @@ export const ConversationDetails = ({ messages, model, usage }: {
         return result;
     };
 
+    // Collects the non-empty context entries of the selected metric as { key, contents } pairs,
+    // with each key lower-cased. Returns an empty list when no metric is selected or it has no context.
+    const getContextGroups = (): { key: string, contents: AIContent[] }[] => {
+        if (!selectedMetric || !selectedMetric.context) {
+            return [];
+        }
+
+        return Object.entries(selectedMetric.context)
+            .filter(([, contents]) => contents && contents.length > 0)
+            .map(([name, contents]) => ({
+                key: name.toLowerCase(),
+                contents: contents
+            }));
+    };
+
     const messageGroups = groupMessages();
+    const contextGroups = getContextGroups();
 
     return (
         <div className={classes.section}>
@@ -79,7 +101,7 @@ export const ConversationDetails = ({ messages, model, usage }: {
                         );
 
                         return (
-                            <div key={index} className={messageRowClass}>
+                            <div key={`msg-${index}`} className={messageRowClass}>
                                 <div className={classes.messageParticipantName}>{group.participantName}</div>
                                 <div className={classes.messageBubble}>
                                     {group.contents.map((content, contentIndex) => (
@@ -91,6 +113,19 @@ export const ConversationDetails = ({ messages, model, usage }: {
                             </div>
                         );
                     })}
+
+                    {contextGroups.map((group, index) => (
+                        <div key={`context-${index}`} className={mergeClasses(classes.messageRow, classes.userMessageRow)}>
+                            <div className={classes.messageParticipantName}>{`supplied evaluation context (${group.key})`}</div>
+                            <div className={classes.contextBubble}>
+                                {group.contents.map((content, contentIndex) => (
+                                    <div key={contentIndex}>
+                                        {renderContent(content)}
+                                    </div>
+                                ))}
+                            </div>
+                        </div>
+                    ))}
                 </div>
             )}
         </div>

@@ -94,6 +94,9 @@ type BaseEvaluationMetric = {
     $type: string;
     name: string;
     interpretation?: EvaluationMetricInterpretation;
+    context?: {
+        [K: string]: AIContent[]
+    };
     diagnostics?: EvaluationDiagnostic[];
     metadata: { 
         [K: string]: string 

@@ -32,7 +32,7 @@ export const ScoreDetail = ({ scenario, scoreSummary }: { scenario: ScenarioRunR
             onMetricSelect={setSelectedMetric}
             selectedMetric={selectedMetric} />
         {selectedMetric && <MetricDetailsSection metric={selectedMetric} />}
-        <ConversationDetails messages={messages} model={model} usage={usage} />
+        <ConversationDetails messages={messages} model={model} usage={usage} selectedMetric={selectedMetric} />
         {scenario.chatDetails && scenario.chatDetails.turnDetails.length > 0 && <ChatDetailsSection chatDetails={scenario.chatDetails} />}
     </div>);
 };
@@ -127,6 +127,14 @@ export const useStyles = makeStyles({
         backgroundColor: tokens.colorNeutralBackground3,
         border: '1px solid ' + tokens.colorNeutralStroke2,
     },
+    contextBubble: {
+        padding: '0.75rem 1rem',
+        borderRadius: '12px',
+        overflow: 'hidden',
+        wordBreak: 'break-word',
+        backgroundColor: tokens.colorBrandBackground2,
+        border: '1px solid ' + tokens.colorNeutralStroke2,
+    },
     cacheHitIcon: {
         color: tokens.colorPaletteGreenForeground1,
     },

@@ -97,6 +97,6 @@ internal static void LogJsonData(this EvaluationMetric metric, string data)
     internal static void LogJsonData(this EvaluationMetric metric, JsonNode data)
     {
         string serializedData = data.ToJsonString(new JsonSerializerOptions { WriteIndented = true });
-        metric.AddDiagnostic(EvaluationDiagnostic.Informational(serializedData));
+        metric.AddDiagnostics(EvaluationDiagnostic.Informational(serializedData));
     }
 }
@@ -63,22 +63,30 @@ await EvaluateContentSafetyAsync(
                 contentSafetyServicePayloadFormat: ContentSafetyServicePayloadFormat.QuestionAnswer.ToString(),
                 cancellationToken: cancellationToken).ConfigureAwait(false);
 
+        GroundednessProEvaluatorContext context = GetRelevantContext(additionalContext);
+        result.AddOrUpdateContextInAllMetrics("Grounding Context", context.GetContents());
+
         return result;
     }
 
     /// <inheritdoc/>
     protected override IReadOnlyList<EvaluationContext>? FilterAdditionalContext(
         IEnumerable<EvaluationContext>? additionalContext)
+    {
+        GroundednessProEvaluatorContext context = GetRelevantContext(additionalContext);
+        return [context];
+    }
+
+    private static GroundednessProEvaluatorContext GetRelevantContext(
+        IEnumerable<EvaluationContext>? additionalContext)
     {
         if (additionalContext?.OfType<GroundednessProEvaluatorContext>().FirstOrDefault()
                 is GroundednessProEvaluatorContext context)
         {
-            return [context];
-        }
-        else
-        {
-            throw new InvalidOperationException(
-                $"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
+            return context;
         }
+
+        throw new InvalidOperationException(
+            $"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
     }
 }