2 changes: 2 additions & 0 deletions src/ragas/metrics/collections/__init__.py
@@ -13,6 +13,7 @@
)
from ragas.metrics.collections._bleu_score import BleuScore
from ragas.metrics.collections._context_entity_recall import ContextEntityRecall
from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity
from ragas.metrics.collections._rouge_score import RougeScore
from ragas.metrics.collections._semantic_similarity import SemanticSimilarity
from ragas.metrics.collections._simple_criteria import SimpleCriteria
@@ -35,6 +36,7 @@
"ContextEntityRecall",
"DistanceMeasure",
"ExactMatch",
"NoiseSensitivity",
"NonLLMStringSimilarity",
"RougeScore",
"SemanticSimilarity",
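With this export in place, the metric can be imported directly from the collections namespace:

from ragas.metrics.collections import NoiseSensitivity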
244 changes: 244 additions & 0 deletions src/ragas/metrics/collections/_noise_sensitivity.py
@@ -0,0 +1,244 @@
"""Noise Sensitivity metric v2 - Modern implementation with function-based prompts."""

import typing as t
from typing import Dict, List, Literal

import numpy as np
from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.answer_correctness import statement_generator_prompt
from ragas.prompt.metrics.noise_sensitivity import nli_statement_prompt

if t.TYPE_CHECKING:
from ragas.llms.base import InstructorBaseRagasLLM


class StatementGeneratorOutput(BaseModel):
"""Structured output for statement generation."""

statements: List[str]


class StatementFaithfulnessAnswer(BaseModel):
"""Individual statement with reason and verdict for NLI evaluation."""

statement: str
reason: str
verdict: int


class NLIStatementOutput(BaseModel):
"""Structured output for NLI statement evaluation."""

statements: List[StatementFaithfulnessAnswer]


class NoiseSensitivity(BaseMetric):
"""
Modern v2 implementation of noise sensitivity evaluation.

    Measures how often a system introduces incorrect claims into its response
    when relying on either relevant or irrelevant retrieved documents.

The metric works by:
1. Decomposing reference and response into atomic statements
2. Using NLI to evaluate statement faithfulness against each retrieved context
3. Computing noise sensitivity based on incorrect claims from relevant/irrelevant contexts

This implementation uses modern instructor LLMs with structured output.
Only supports modern components - legacy wrappers are rejected with clear error messages.

Usage:
>>> import instructor
>>> from openai import AsyncOpenAI
>>> from ragas.llms.base import instructor_llm_factory
>>> from ragas.metrics.collections import NoiseSensitivity
>>>
>>> # Setup dependencies
>>> client = AsyncOpenAI()
>>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
>>>
>>> # Create metric instance
>>> metric = NoiseSensitivity(llm=llm)
>>>
>>> # Single evaluation
>>> result = await metric.ascore(
... user_input="What is LIC known for?",
... response="LIC is the largest insurance company in India...",
... reference="LIC is known for managing investments...",
... retrieved_contexts=["LIC was established in 1956...", ...]
... )
>>> print(f"Noise Sensitivity: {result.value}")
>>>
>>> # Test irrelevant context sensitivity
>>> irrelevant_metric = NoiseSensitivity(llm=llm, mode="irrelevant")

Attributes:
llm: Modern instructor-based LLM for statement generation and NLI evaluation
name: The metric name
mode: Either "relevant" or "irrelevant" context sensitivity
allowed_values: Score range (0.0 to 1.0, lower is better)
"""

# Type hints for linter (attributes are set in __init__)
llm: "InstructorBaseRagasLLM"

def __init__(
self,
llm: "InstructorBaseRagasLLM",
name: str = "noise_sensitivity",
mode: Literal["relevant", "irrelevant"] = "relevant",
**kwargs,
):
"""
Initialize NoiseSensitivity metric with required components.

Args:
llm: Modern instructor-based LLM for statement generation and NLI evaluation
name: The metric name
mode: Either "relevant" or "irrelevant" context sensitivity mode
"""
        # Validate mode before storing attributes
        if mode not in {"relevant", "irrelevant"}:
            raise ValueError(
                f"Invalid argument passed for 'mode': {mode}. Must be 'relevant' or 'irrelevant'."
            )

        # Set attributes explicitly before calling super()
        self.llm = llm
        self.mode = mode

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

async def ascore(
self,
user_input: str,
response: str,
reference: str,
retrieved_contexts: List[str],
) -> MetricResult:
"""
Calculate noise sensitivity score.

Args:
user_input: The original question
response: The answer to evaluate
reference: The ground truth reference
retrieved_contexts: The retrieved contexts used to generate the response

Returns:
MetricResult with noise sensitivity score (0.0-1.0, lower is better)
"""
# Input validation
if not reference:
raise ValueError(
"reference is missing. Please add reference to the test sample."
)
if not user_input:
raise ValueError(
"user_input is missing. Please add user_input to the test sample."
)
if not response:
raise ValueError(
"response is missing. Please add response to the test sample."
)
if not retrieved_contexts:
raise ValueError(
"retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
)

# Step 1: Decompose reference and response into statements
gt_statements = await self._decompose_answer_into_statements(
reference, user_input
)
ans_statements = await self._decompose_answer_into_statements(
response, user_input
)

# Step 2: Evaluate statement faithfulness against each retrieved context
gt_verdictslist = []
ans_verdictslist = []

for ctx in retrieved_contexts:
# Evaluate ground truth statements against this context
gt_verdicts = await self._evaluate_statement_faithfulness(
gt_statements, ctx
)
gt_verdictslist.append(np.array(gt_verdicts))

# Evaluate answer statements against this context
ans_verdicts = await self._evaluate_statement_faithfulness(
ans_statements, ctx
)
ans_verdictslist.append(np.array(ans_verdicts))

# Step 3: Build matrices for computation (exact legacy shape handling)
answers = {}
answers["retrieved2ground_truth"] = np.array(gt_verdictslist).T
answers["retrieved2answer"] = np.array(ans_verdictslist).T

        # Evaluate answer statements against reference (ground truth);
        # wrap in an outer array to match legacy shape handling
        gt_to_ans_verdicts = await self._evaluate_statement_faithfulness(
            ans_statements, reference
        )
        answers["ground_truth2answer"] = np.array([gt_to_ans_verdicts])

# Convert all to boolean arrays
answers = {k: v.astype(bool) for k, v in answers.items()}

# Step 4: Compute noise sensitivity score
score = self._compute_score(answers)

return MetricResult(value=float(score))

async def _decompose_answer_into_statements(
self, text: str, question: str
) -> List[str]:
"""Decompose answer text into atomic statements."""
prompt = statement_generator_prompt(question, text)
result = await self.llm.agenerate(prompt, StatementGeneratorOutput)
return result.statements

async def _evaluate_statement_faithfulness(
self, statements: List[str], context: str
) -> List[int]:
"""Evaluate faithfulness of statements against context using NLI."""
prompt = nli_statement_prompt(context, statements)
result = await self.llm.agenerate(prompt, NLIStatementOutput)

verdict_list = [
1 if statement.verdict else 0 for statement in result.statements
]
return verdict_list

def _compute_score(self, answers: Dict) -> float:
"""Compute noise sensitivity score from faithfulness matrices."""
incorrect = ~answers["ground_truth2answer"]

# Compute relevant retrievals (needed for both modes)
relevant_retrieved = np.max(
answers["retrieved2ground_truth"], axis=0, keepdims=True
)
relevant_faithful = np.max(
relevant_retrieved & answers["retrieved2answer"], axis=1
)

if self.mode == "irrelevant":
# Compute irrelevant retrievals
irrelevant_retrieved = ~relevant_retrieved
irrelevant_faithful = np.max(
irrelevant_retrieved & answers["retrieved2answer"], axis=1
)

# Keep them exclusive (irrelevant should not include relevant)
irrelevant_faithful &= ~relevant_faithful

return float(np.mean(irrelevant_faithful & incorrect))

else: # mode == "relevant"
return float(np.mean(relevant_faithful & incorrect))
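For intuition, here is a minimal sketch of what `_compute_score` does on hand-built verdict matrices. The shapes follow the matrices assembled in `ascore`; the values are hypothetical, chosen only to illustrate the broadcasting and the relevant/irrelevant split:

import numpy as np

# Hypothetical verdicts: 2 retrieved contexts, 2 reference statements,
# 3 answer statements. True means "this statement is supported".
answers = {
    # (num_reference_statements, num_contexts): context supports reference statement?
    "retrieved2ground_truth": np.array([[True, False],
                                        [False, False]]),
    # (num_answer_statements, num_contexts): context supports answer statement?
    "retrieved2answer": np.array([[True, False],
                                  [False, True],
                                  [False, False]]),
    # (1, num_answer_statements): reference supports answer statement?
    "ground_truth2answer": np.array([[True, False, False]]),
}

incorrect = ~answers["ground_truth2answer"]                    # [[False, True, True]]
relevant_retrieved = answers["retrieved2ground_truth"].max(axis=0, keepdims=True)
relevant_faithful = (relevant_retrieved & answers["retrieved2answer"]).max(axis=1)
print(np.mean(relevant_faithful & incorrect))                  # relevant mode: 0.0

irrelevant_retrieved = ~relevant_retrieved                     # [[False, True]]
irrelevant_faithful = (irrelevant_retrieved & answers["retrieved2answer"]).max(axis=1)
irrelevant_faithful &= ~relevant_faithful                      # keep modes exclusive
print(np.mean(irrelevant_faithful & incorrect))                # irrelevant mode: ~0.33

With these inputs only the second answer statement counts toward the irrelevant-mode score: it is unsupported by the reference and is backed solely by the context that supports no reference statement.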
85 changes: 85 additions & 0 deletions src/ragas/prompt/metrics/noise_sensitivity.py
@@ -0,0 +1,85 @@
"""Noise Sensitivity prompts - V1-identical using exact PydanticPrompt.to_string() output."""

import json
import typing as t


def nli_statement_prompt(context: str, statements: t.List[str]) -> str:
"""
V1-identical NLI statement evaluation - matches PydanticPrompt.to_string() exactly.

Args:
context: The context to evaluate statements against
statements: The statements to judge for faithfulness

Returns:
V1-identical prompt string for the LLM
"""
# Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
safe_context = json.dumps(context)
safe_statements = json.dumps(statements, indent=4).replace("\n", "\n ")

return f"""Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"$defs": {{"StatementFaithfulnessAnswer": {{"properties": {{"statement": {{"description": "the original statement, word-by-word", "title": "Statement", "type": "string"}}, "reason": {{"description": "the reason of the verdict", "title": "Reason", "type": "string"}}, "verdict": {{"description": "the verdict(0/1) of the faithfulness.", "title": "Verdict", "type": "integer"}}}}, "required": ["statement", "reason", "verdict"], "title": "StatementFaithfulnessAnswer", "type": "object"}}}}, "properties": {{"statements": {{"items": {{"$ref": "#/$defs/StatementFaithfulnessAnswer"}}, "title": "Statements", "type": "array"}}}}, "required": ["statements"], "title": "NLIStatementOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
"context": "John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.",
"statements": [
"John is majoring in Biology.",
"John is taking a course on Artificial Intelligence.",
"John is a dedicated student.",
"John has a part-time job."
]
}}
Output: {{
"statements": [
{{
"statement": "John is majoring in Biology.",
"reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
"verdict": 0
}},
{{
"statement": "John is taking a course on Artificial Intelligence.",
"reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
"verdict": 0
}},
{{
"statement": "John is a dedicated student.",
"reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
"verdict": 1
}},
{{
"statement": "John has a part-time job.",
"reason": "There is no information given in the context about John having a part-time job.",
"verdict": 0
}}
]
}}

Example 2
Input: {{
"context": "Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.",
"statements": [
"Albert Einstein was a genius."
]
}}
Output: {{
"statements": [
{{
"statement": "Albert Einstein was a genius.",
"reason": "The context and statement are unrelated",
"verdict": 0
}}
]
}}
-----------------------------

Now perform the same with the following input
input: {{
"context": {safe_context},
"statements": {safe_statements}
}}
Output: """
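As a quick sanity check, a minimal usage sketch of this builder (the context and statements below are illustrative, not from the test suite):

if __name__ == "__main__":
    prompt = nli_statement_prompt(
        context="LIC was established in 1956 in India.",
        statements=["LIC was founded in 1956.", "LIC is headquartered in Mumbai."],
    )
    # The tail of the prompt contains the JSON-encoded inputs substituted into the template
    print(prompt[-300:])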