
Commit c75f103

Migrate SummaryScore (#2376)
1 parent 3f5b2de commit c75f103

File tree: 4 files changed, +544 / -0 lines


src/ragas/metrics/collections/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
     NonLLMStringSimilarity,
     StringPresence,
 )
+from ragas.metrics.collections._summary_score import SummaryScore
 from ragas.metrics.collections.base import BaseMetric
 
 __all__ = [
@@ -39,6 +40,7 @@
     "SemanticSimilarity",
     "SimpleCriteria",
     "StringPresence",
+    "SummaryScore",
     # AspectCritic helper functions
     "coherence",
     "conciseness",
src/ragas/metrics/collections/_summary_score.py

Lines changed: 203 additions & 0 deletions
@@ -0,0 +1,203 @@
"""Summary Score metric v2 - Modern implementation with function-based prompts."""

import logging
import typing as t
from typing import List

from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.summary_score import (
    extract_keyphrases_prompt,
    generate_answers_prompt,
    generate_questions_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ExtractedKeyphrases(BaseModel):
    """Structured output for keyphrase extraction."""

    keyphrases: List[str]


class QuestionsGenerated(BaseModel):
    """Structured output for question generation."""

    questions: List[str]


class AnswersGenerated(BaseModel):
    """Structured output for answer generation."""

    answers: List[str]


class SummaryScore(BaseMetric):
    """
    Modern v2 implementation of summarization score evaluation.

    Measures how well a summary captures important information from contexts by:
    1. Extracting keyphrases from the original contexts
    2. Generating yes/no questions from those keyphrases
    3. Checking if the summary can answer those questions
    4. Optionally penalizing overly long summaries for conciseness

    This implementation uses modern instructor LLMs with structured output.
    Only supports modern components - legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import SummaryScore
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
        >>>
        >>> # Create metric instance
        >>> metric = SummaryScore(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     reference_contexts=["Apple Inc. is a technology company..."],
        ...     response="Apple is a tech company founded by Steve Jobs."
        ... )
        >>> print(f"Summary Score: {result.value}")
        >>>
        >>> # Custom configuration (more conciseness focus)
        >>> concise_metric = SummaryScore(
        ...     llm=llm,
        ...     length_penalty=True,
        ...     coeff=0.8  # More weight on conciseness
        ... )

    Attributes:
        llm: Modern instructor-based LLM for keyphrase, question, and answer generation
        name: The metric name
        length_penalty: Whether to apply conciseness penalty for long summaries
        coeff: Weight for conciseness score (0.0=only QA, 1.0=only conciseness)
        allowed_values: Score range (0.0 to 1.0)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "summary_score",
        length_penalty: bool = True,
        coeff: float = 0.5,
        **kwargs,
    ):
        """
        Initialize SummaryScore metric with required components.

        Args:
            llm: Modern instructor-based LLM for keyphrase, question, and answer generation
            name: The metric name
            length_penalty: Whether to apply conciseness penalty for long summaries
            coeff: Weight for conciseness score (0.0=only QA, 1.0=only conciseness)
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.length_penalty = length_penalty
        self.coeff = coeff

        # Validate coefficient
        if not (0.0 <= coeff <= 1.0):
            raise ValueError(f"Coefficient must be between 0.0 and 1.0, got {coeff}")

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, reference_contexts: List[str], response: str
    ) -> MetricResult:
        """
        Calculate summary score.

        Args:
            reference_contexts: The original contexts that were summarized
            response: The summary to evaluate

        Returns:
            MetricResult with summary score (0.0-1.0)

        Raises:
            ValueError: If reference_contexts is empty or response is empty/whitespace only
        """
        # Input validation
        if not reference_contexts or not any(ctx.strip() for ctx in reference_contexts):
            raise ValueError(
                "reference_contexts cannot be empty or contain only whitespace"
            )

        if not response or not response.strip():
            raise ValueError("response cannot be empty or whitespace only")

        # Step 1: Combine contexts and extract keyphrases
        text = "\n".join(reference_contexts)
        keyphrases = await self._extract_keyphrases(text)

        if not keyphrases:
            # Match legacy behavior: log error and continue with empty list
            logging.error("No keyphrases generated, unable to calculate the score.")
            keyphrases = []

        # Step 2: Generate questions from keyphrases
        questions = await self._generate_questions(text, keyphrases)

        if not questions:
            # Match legacy behavior: log error and continue with empty list
            logging.error("No questions generated, unable to calculate the score.")
            questions = []

        # Step 3: Check if summary can answer the questions
        answers = await self._generate_answers(response, questions)

        # Step 4: Calculate QA score
        qa_score = self._compute_qa_score(answers)

        # Step 5: Calculate final score (with optional conciseness penalty)
        if self.length_penalty:
            conciseness_score = self._compute_conciseness_score(text, response)
            final_score = qa_score * (1 - self.coeff) + conciseness_score * self.coeff
        else:
            final_score = qa_score

        return MetricResult(value=float(final_score))

    async def _extract_keyphrases(self, text: str) -> List[str]:
        """Extract keyphrases from text using the keyphrase extraction prompt."""
        prompt = extract_keyphrases_prompt(text)
        result = await self.llm.agenerate(prompt, ExtractedKeyphrases)
        return result.keyphrases

    async def _generate_questions(self, text: str, keyphrases: List[str]) -> List[str]:
        """Generate questions from text and keyphrases."""
        prompt = generate_questions_prompt(text, keyphrases)
        result = await self.llm.agenerate(prompt, QuestionsGenerated)
        return result.questions

    async def _generate_answers(self, summary: str, questions: List[str]) -> List[str]:
        """Generate answers by checking if summary can answer questions."""
        prompt = generate_answers_prompt(summary, questions)
        result = await self.llm.agenerate(prompt, AnswersGenerated)
        return result.answers

    def _compute_qa_score(self, answers: List[str]) -> float:
        """Compute QA score as ratio of correct answers. Matches legacy behavior exactly."""
        correct = sum([1 for a in answers if a.lower() == "1"])
        return correct / len(
            answers
        )  # Will raise ZeroDivisionError if answers is empty (legacy behavior)

    def _compute_conciseness_score(self, text: str, summary: str) -> float:
        """Compute conciseness score based on length ratio."""
        return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)
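
For a concrete sense of the weighting in ascore: suppose the summary answers 4 of the 6 generated questions and is 200 characters long against 1,000 characters of combined context. With the default coeff=0.5, the QA and conciseness signals contribute equally, giving roughly 0.667 * 0.5 + 0.8 * 0.5 = 0.73. A standalone sketch of just that arithmetic (hypothetical numbers, no LLM calls; mirrors _compute_qa_score, _compute_conciseness_score, and the blending step above):

    answers = ["1", "1", "1", "1", "0", "0"]        # summary answers 4 of 6 questions
    qa_score = sum(a == "1" for a in answers) / len(answers)            # ~0.667

    text_len, summary_len = 1000, 200               # character lengths
    conciseness = 1 - min(summary_len, text_len) / (text_len + 1e-10)   # 0.8

    coeff = 0.5                                     # default weighting
    final_score = qa_score * (1 - coeff) + conciseness * coeff
    print(round(final_score, 3))                    # 0.733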
src/ragas/prompt/metrics/summary_score.py

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
"""Summary Score prompts - V1-identical using exact PydanticPrompt.to_string() output."""

import json
import typing as t


def extract_keyphrases_prompt(text: str) -> str:
    """
    V1-identical keyphrase extraction - matches PydanticPrompt.to_string() exactly.

    Args:
        text: The text to extract keyphrases from

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format input exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_text = json.dumps(text)

    return f"""Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"keyphrases": {{"items": {{"type": "string"}}, "title": "Keyphrases", "type": "array"}}}}, "required": ["keyphrases"], "title": "ExtractedKeyphrases", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "text": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023."
}}
Output: {{
    "keyphrases": [
        "Apple Inc.",
        "Cupertino, California",
        "Steve Jobs",
        "1976",
        "$3 trillion",
        "2023"
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "text": {safe_text}
}}
Output: """


def generate_questions_prompt(text: str, keyphrases: t.List[str]) -> str:
    """
    V1-identical question generation - matches PydanticPrompt.to_string() exactly.

    Args:
        text: The text to generate questions about
        keyphrases: The keyphrases extracted from the text

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_text = json.dumps(text)
    safe_keyphrases = json.dumps(keyphrases, indent=4).replace("\n", "\n    ")

    return f"""Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"questions": {{"items": {{"type": "string"}}, "title": "Questions", "type": "array"}}}}, "required": ["questions"], "title": "QuestionsGenerated", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "text": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023.",
    "keyphrases": [
        "Apple Inc.",
        "Cupertino, California",
        "Steve Jobs",
        "1976",
        "$3 trillion",
        "2023"
    ]
}}
Output: {{
    "questions": [
        "Is Apple Inc. a technology company?",
        "Is Apple Inc. based in Cupertino, California?",
        "Was Apple Inc. founded by Steve Jobs?",
        "Was Apple Inc. founded in 1976?",
        "Did Apple Inc. reach a market capitalization of $3 trillion?",
        "Did Apple Inc. reach a market capitalization of $3 trillion in 2023?"
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "text": {safe_text},
    "keyphrases": {safe_keyphrases}
}}
Output: """


def generate_answers_prompt(summary: str, questions: t.List[str]) -> str:
    """
    V1-identical answer generation - matches PydanticPrompt.to_string() exactly.

    Args:
        summary: The summary to evaluate
        questions: The questions to check against the summary

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_summary = json.dumps(summary)
    safe_questions = json.dumps(questions, indent=4).replace("\n", "\n    ")

    return f"""Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"answers": {{"items": {{"type": "string"}}, "title": "Answers", "type": "array"}}}}, "required": ["answers"], "title": "AnswersGenerated", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "summary": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023.",
    "questions": [
        "Is Apple Inc. a technology company?",
        "Is Apple Inc. based in Cupertino, California?",
        "Was Apple Inc. founded by Steve Jobs?",
        "Was Apple Inc. founded in 1976?",
        "Did Apple Inc. reach a market capitalization of $3 trillion?",
        "Did Apple Inc. reach a market capitalization of $3 trillion in 2023?",
        "Is Apple Inc. a major software company?",
        "Is Apple Inc. known for the iPhone?",
        "Was Steve Jobs the co-founder of Apple Inc.?"
    ]
}}
Output: {{
    "answers": [
        "1",
        "1",
        "1",
        "1",
        "1",
        "1",
        "0",
        "0",
        "1"
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "summary": {safe_summary},
    "questions": {safe_questions}
}}
Output: """
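
The json.dumps formatting is what keeps these prompts safe for arbitrary input while staying close to the V1 output: string arguments collapse to a single escaped JSON literal, and list arguments are dumped with indent=4 and then padded on each newline so the array nests under the surrounding input object. A small standalone illustration (hypothetical inputs, not part of the commit; the 4-space padding follows the V1-style nesting assumed above):

    import json

    # Strings become one escaped literal, so quotes and newlines cannot break the prompt.
    text = 'Says "hello"\nover two lines'
    print(json.dumps(text))   # "Says \"hello\"\nover two lines"

    # Lists are indented and every newline padded, so the array lines up
    # under a 4-space-indented "keyphrases"/"questions" key.
    keyphrases = ["Apple Inc.", "1976"]
    print(json.dumps(keyphrases, indent=4).replace("\n", "\n    "))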
