Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ private static async Task<int> Main(string[] args)

var formatOpt =
new Option<ReportCommand.Format>(
"--format",
["-f", "--format"],
() => ReportCommand.Format.html,
"Specify the format for the generated report.");

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
Expand Down Expand Up @@ -35,7 +34,7 @@ public abstract class ChatConversationEvaluator : IEvaluator
protected virtual string? SystemPrompt => null;

/// <inheritdoc/>
public async ValueTask<EvaluationResult> EvaluateAsync(
public virtual async ValueTask<EvaluationResult> EvaluateAsync(
IEnumerable<ChatMessage> messages,
ChatResponse modelResponse,
ChatConfiguration? chatConfiguration = null,
Expand All @@ -49,7 +48,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(

if (string.IsNullOrWhiteSpace(modelResponse.Text))
{
result.AddDiagnosticToAllMetrics(
result.AddDiagnosticsToAllMetrics(
EvaluationDiagnostic.Error(
"Evaluation failed because the model response supplied for evaluation was null or empty."));

Expand All @@ -73,7 +72,7 @@ void OnTokenBudgetExceeded()
EvaluationDiagnostic.Error(
$"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded.");

result.AddDiagnosticToAllMetrics(tokenBudgetExceeded);
result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded);
}

if (!string.IsNullOrWhiteSpace(SystemPrompt))
Expand Down Expand Up @@ -176,7 +175,7 @@ await PerformEvaluationAsync(
if (inputTokenLimit > 0 && ignoredMessagesCount > 0)
{
#pragma warning disable S103 // Lines should not be too long
result.AddDiagnosticToAllMetrics(
result.AddDiagnosticsToAllMetrics(
EvaluationDiagnostic.Warning(
$"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens."));
#pragma warning restore S103
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,28 @@ public sealed class EquivalenceEvaluator : SingleNumericMetricEvaluator
/// <inheritdoc/>
protected override bool IgnoresHistory => true;

/// <inheritdoc/>
/// <remarks>
/// Once the base evaluation has completed, the contents of the supplied
/// <see cref="EquivalenceEvaluatorContext"/> are passed to
/// <c>AddOrUpdateContextInAllMetrics</c> on the result under the "Ground Truth" key.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// <paramref name="additionalContext"/> does not contain an <see cref="EquivalenceEvaluatorContext"/>.
/// </exception>
public override async ValueTask<EvaluationResult> EvaluateAsync(
    IEnumerable<ChatMessage> messages,
    ChatResponse modelResponse,
    ChatConfiguration? chatConfiguration = null,
    IEnumerable<EvaluationContext>? additionalContext = null,
    CancellationToken cancellationToken = default)
{
    EvaluationResult evaluation = await base
        .EvaluateAsync(messages, modelResponse, chatConfiguration, additionalContext, cancellationToken)
        .ConfigureAwait(false);

    // The context lookup deliberately stays after the base call so that the order in which
    // failures surface is unchanged.
    EquivalenceEvaluatorContext groundTruthContext = GetRelevantContext(additionalContext);
    evaluation.AddOrUpdateContextInAllMetrics("Ground Truth", groundTruthContext.GetContents());

    return evaluation;
}

/// <inheritdoc/>
protected override async ValueTask<string> RenderEvaluationPromptAsync(
ChatMessage? userRequest,
Expand All @@ -66,18 +88,8 @@ userRequest is not null
? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
: string.Empty;

string groundTruth;

if (additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault()
is EquivalenceEvaluatorContext context)
{
groundTruth = context.GroundTruth;
}
else
{
throw new InvalidOperationException(
$"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
}
EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext);
string groundTruth = context.GroundTruth;

string prompt =
$$"""
Expand Down Expand Up @@ -149,4 +161,16 @@ alleviating stress and augmenting general mood.

return prompt;
}

/// <summary>
/// Returns the first <see cref="EquivalenceEvaluatorContext"/> found in
/// <paramref name="additionalContext"/>.
/// </summary>
/// <param name="additionalContext">The contexts supplied to the evaluator; may be <see langword="null"/>.</param>
/// <returns>The first matching context.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="additionalContext"/> is <see langword="null"/> or holds no
/// <see cref="EquivalenceEvaluatorContext"/>.
/// </exception>
private static EquivalenceEvaluatorContext GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
{
    EquivalenceEvaluatorContext? match =
        additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault();

    return match
        ?? throw new InvalidOperationException(
            $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,30 @@ public sealed class GroundednessEvaluator : SingleNumericMetricEvaluator
/// <inheritdoc/>
protected override bool IgnoresHistory => false;

/// <inheritdoc/>
/// <remarks>
/// Once the base evaluation has completed, and only when a
/// <see cref="GroundednessEvaluatorContext"/> was supplied, its contents are passed to
/// <c>AddOrUpdateContextInAllMetrics</c> on the result under the "Grounding Context" key.
/// </remarks>
public override async ValueTask<EvaluationResult> EvaluateAsync(
    IEnumerable<ChatMessage> messages,
    ChatResponse modelResponse,
    ChatConfiguration? chatConfiguration = null,
    IEnumerable<EvaluationContext>? additionalContext = null,
    CancellationToken cancellationToken = default)
{
    EvaluationResult evaluation = await base
        .EvaluateAsync(messages, modelResponse, chatConfiguration, additionalContext, cancellationToken)
        .ConfigureAwait(false);

    // The grounding context is optional here: when none was supplied the result is returned as-is.
    GroundednessEvaluatorContext? groundingContext = GetRelevantContext(additionalContext);
    if (groundingContext is not null)
    {
        evaluation.AddOrUpdateContextInAllMetrics("Grounding Context", groundingContext.GetContents());
    }

    return evaluation;
}

/// <inheritdoc/>
protected override async ValueTask<string> RenderEvaluationPromptAsync(
ChatMessage? userRequest,
Expand All @@ -68,8 +92,7 @@ userRequest is not null

var builder = new StringBuilder();

if (additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault()
is GroundednessEvaluatorContext context)
if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context)
{
_ = builder.Append(context.GroundingContext);
_ = builder.AppendLine();
Expand Down Expand Up @@ -162,4 +185,15 @@ is not French.

return prompt;
}

/// <summary>
/// Returns the first <see cref="GroundednessEvaluatorContext"/> found in
/// <paramref name="additionalContext"/>, or <see langword="null"/> when the collection is
/// <see langword="null"/> or contains no such context.
/// </summary>
/// <param name="additionalContext">The contexts supplied to the evaluator; may be <see langword="null"/>.</param>
/// <returns>The first matching context, or <see langword="null"/> if there is none.</returns>
private static GroundednessEvaluatorContext? GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
    => additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault();
}
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
if (string.IsNullOrEmpty(evaluationResponseText))
{
rating = Rating.Inconclusive;
result.AddDiagnosticToAllMetrics(
result.AddDiagnosticsToAllMetrics(
EvaluationDiagnostic.Error(
"Evaluation failed because the model failed to produce a valid evaluation response."));
}
Expand All @@ -168,7 +168,7 @@ await JsonOutputFixer.RepairJsonAsync(
if (string.IsNullOrEmpty(repairedJson))
{
rating = Rating.Inconclusive;
result.AddDiagnosticToAllMetrics(
result.AddDiagnosticsToAllMetrics(
EvaluationDiagnostic.Error(
$"""
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
Expand All @@ -183,7 +183,7 @@ await JsonOutputFixer.RepairJsonAsync(
catch (JsonException ex)
{
rating = Rating.Inconclusive;
result.AddDiagnosticToAllMetrics(
result.AddDiagnosticsToAllMetrics(
EvaluationDiagnostic.Error(
$"""
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
Expand Down Expand Up @@ -281,7 +281,7 @@ void UpdateResult()

if (!string.IsNullOrWhiteSpace(rating.Error))
{
result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(

if (string.IsNullOrEmpty(evaluationResponseText))
{
metric.AddDiagnostic(
metric.AddDiagnostics(
EvaluationDiagnostic.Error(
"Evaluation failed because the model failed to produce a valid evaluation response."));
}
Expand All @@ -115,7 +115,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
}
else
{
metric.AddDiagnostic(
metric.AddDiagnostics(
EvaluationDiagnostic.Error(
$"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ import ReactMarkdown from "react-markdown";
import { useReportContext } from "./ReportContext";
import { useStyles } from "./Styles";
import { ChatMessageDisplay, isTextContent, isImageContent } from "./Summary";
import type { MetricType } from "./MetricCard";

export const ConversationDetails = ({ messages, model, usage }: {
export const ConversationDetails = ({ messages, model, usage, selectedMetric }: {
messages: ChatMessageDisplay[];
model?: string;
usage?: UsageDetails;
selectedMetric?: MetricType | null;
}) => {
const classes = useStyles();
const [isExpanded, setIsExpanded] = useState(true);
Expand Down Expand Up @@ -59,7 +61,27 @@ export const ConversationDetails = ({ messages, model, usage }: {
return result;
};

// Builds one display group per non-empty entry of the selected metric's `context` map.
// Returns an empty list when no metric is selected or the metric carries no context.
// Each group's key is the lower-cased entry name; entries with no contents are dropped.
const getContextGroups = (): { key: string, contents: AIContent[] }[] => {
    const context = selectedMetric?.context;
    if (!context) {
        return [];
    }

    return Object.entries(context)
        .filter(([, contents]) => contents && contents.length > 0)
        .map(([key, contents]) => ({ key: key.toLowerCase(), contents: contents }));
};

const messageGroups = groupMessages();
const contextGroups = getContextGroups();

return (
<div className={classes.section}>
Expand All @@ -79,7 +101,7 @@ export const ConversationDetails = ({ messages, model, usage }: {
);

return (
<div key={index} className={messageRowClass}>
<div key={`msg-${index}`} className={messageRowClass}>
<div className={classes.messageParticipantName}>{group.participantName}</div>
<div className={classes.messageBubble}>
{group.contents.map((content, contentIndex) => (
Expand All @@ -91,6 +113,19 @@ export const ConversationDetails = ({ messages, model, usage }: {
</div>
);
})}

{contextGroups.map((group, index) => (
<div key={`context-${index}`} className={mergeClasses(classes.messageRow, classes.userMessageRow)}>
<div className={classes.messageParticipantName}>{`supplied evaluation context (${group.key})`}</div>
<div className={classes.contextBubble}>
{group.contents.map((content, contentIndex) => (
<div key={contentIndex}>
{renderContent(content)}
</div>
))}
</div>
</div>
))}
</div>
)}
</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ type BaseEvaluationMetric = {
$type: string;
name: string;
interpretation?: EvaluationMetricInterpretation;
context?: {
[K: string]: AIContent[]
};
diagnostics?: EvaluationDiagnostic[];
metadata: {
[K: string]: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export const ScoreDetail = ({ scenario, scoreSummary }: { scenario: ScenarioRunR
onMetricSelect={setSelectedMetric}
selectedMetric={selectedMetric} />
{selectedMetric && <MetricDetailsSection metric={selectedMetric} />}
<ConversationDetails messages={messages} model={model} usage={usage} />
<ConversationDetails messages={messages} model={model} usage={usage} selectedMetric={selectedMetric} />
{scenario.chatDetails && scenario.chatDetails.turnDetails.length > 0 && <ChatDetailsSection chatDetails={scenario.chatDetails} />}
</div>);
};
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ export const useStyles = makeStyles({
backgroundColor: tokens.colorNeutralBackground3,
border: '1px solid ' + tokens.colorNeutralStroke2,
},
contextBubble: {
padding: '0.75rem 1rem',
borderRadius: '12px',
overflow: 'hidden',
wordBreak: 'break-word',
backgroundColor: tokens.colorBrandBackground2,
border: '1px solid ' + tokens.colorNeutralStroke2,
},
cacheHitIcon: {
color: tokens.colorPaletteGreenForeground1,
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,6 @@ internal static void LogJsonData(this EvaluationMetric metric, string data)
internal static void LogJsonData(this EvaluationMetric metric, JsonNode data)
{
string serializedData = data.ToJsonString(new JsonSerializerOptions { WriteIndented = true });
metric.AddDiagnostic(EvaluationDiagnostic.Informational(serializedData));
metric.AddDiagnostics(EvaluationDiagnostic.Informational(serializedData));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,30 @@ await EvaluateContentSafetyAsync(
contentSafetyServicePayloadFormat: ContentSafetyServicePayloadFormat.QuestionAnswer.ToString(),
cancellationToken: cancellationToken).ConfigureAwait(false);

GroundednessProEvaluatorContext context = GetRelevantContext(additionalContext);
result.AddOrUpdateContextInAllMetrics("Grounding Context", context.GetContents());

return result;
}

/// <inheritdoc/>
/// <remarks>
/// Returns a single-element list holding the <see cref="GroundednessProEvaluatorContext"/> that
/// <c>GetRelevantContext</c> finds in <paramref name="additionalContext"/>.
/// </remarks>
protected override IReadOnlyList<EvaluationContext>? FilterAdditionalContext(
    IEnumerable<EvaluationContext>? additionalContext)
    => [GetRelevantContext(additionalContext)];

private static GroundednessProEvaluatorContext GetRelevantContext(
IEnumerable<EvaluationContext>? additionalContext)
{
if (additionalContext?.OfType<GroundednessProEvaluatorContext>().FirstOrDefault()
is GroundednessProEvaluatorContext context)
{
return [context];
}
else
{
throw new InvalidOperationException(
$"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
return context;
}

throw new InvalidOperationException(
$"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
}
}
Loading
Loading