diff --git a/src/ragas/metrics/collections/__init__.py b/src/ragas/metrics/collections/__init__.py
index 29a9ae64d..e82fca60b 100644
--- a/src/ragas/metrics/collections/__init__.py
+++ b/src/ragas/metrics/collections/__init__.py
@@ -22,6 +22,7 @@
     NonLLMStringSimilarity,
     StringPresence,
 )
+from ragas.metrics.collections._summary_score import SummaryScore
 from ragas.metrics.collections.base import BaseMetric
 
 __all__ = [
@@ -39,6 +40,7 @@
     "SemanticSimilarity",
     "SimpleCriteria",
     "StringPresence",
+    "SummaryScore",
     # AspectCritic helper functions
     "coherence",
     "conciseness",
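The export above makes the new metric importable straight from ragas.metrics.collections. As a quick orientation before the implementation below, here is a minimal usage sketch; it follows the factory call shown in the class docstring (instructor_llm_factory("openai", client=client, model="gpt-4o-mini")), so the factory name and signature should be checked against your installed ragas version, and the asyncio wrapper is only there to make the async API runnable as a script.

# Minimal usage sketch (assumption: instructor_llm_factory exists with the
# signature shown in the SummaryScore docstring; adjust to your ragas version).
import asyncio

from openai import AsyncOpenAI

from ragas.llms.base import instructor_llm_factory
from ragas.metrics.collections import SummaryScore


async def main() -> None:
    client = AsyncOpenAI()
    llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")

    metric = SummaryScore(llm=llm)  # defaults: length_penalty=True, coeff=0.5
    result = await metric.ascore(
        reference_contexts=["Apple Inc. is a technology company based in Cupertino."],
        response="Apple is a technology company.",
    )
    print(f"Summary Score: {result.value:.3f}")  # float in [0.0, 1.0]


if __name__ == "__main__":
    asyncio.run(main())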
diff --git a/src/ragas/metrics/collections/_summary_score.py b/src/ragas/metrics/collections/_summary_score.py
new file mode 100644
index 000000000..ac9e93061
--- /dev/null
+++ b/src/ragas/metrics/collections/_summary_score.py
@@ -0,0 +1,203 @@
+"""Summary Score metric v2 - Modern implementation with function-based prompts."""
+
+import logging
+import typing as t
+from typing import List
+
+from pydantic import BaseModel
+
+from ragas.metrics.collections.base import BaseMetric
+from ragas.metrics.result import MetricResult
+from ragas.prompt.metrics.summary_score import (
+    extract_keyphrases_prompt,
+    generate_answers_prompt,
+    generate_questions_prompt,
+)
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import InstructorBaseRagasLLM
+
+
+class ExtractedKeyphrases(BaseModel):
+    """Structured output for keyphrase extraction."""
+
+    keyphrases: List[str]
+
+
+class QuestionsGenerated(BaseModel):
+    """Structured output for question generation."""
+
+    questions: List[str]
+
+
+class AnswersGenerated(BaseModel):
+    """Structured output for answer generation."""
+
+    answers: List[str]
+
+
+class SummaryScore(BaseMetric):
+    """
+    Modern v2 implementation of summarization score evaluation.
+
+    Measures how well a summary captures important information from contexts by:
+    1. Extracting keyphrases from the original contexts
+    2. Generating yes/no questions from those keyphrases
+    3. Checking if the summary can answer those questions
+    4. Optionally penalizing overly long summaries for conciseness
+
+    This implementation uses modern instructor LLMs with structured output.
+    Only supports modern components - legacy wrappers are rejected with clear error messages.
+
+    Usage:
+        >>> import instructor
+        >>> from openai import AsyncOpenAI
+        >>> from ragas.llms.base import instructor_llm_factory
+        >>> from ragas.metrics.collections import SummaryScore
+        >>>
+        >>> # Setup dependencies
+        >>> client = AsyncOpenAI()
+        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
+        >>>
+        >>> # Create metric instance
+        >>> metric = SummaryScore(llm=llm)
+        >>>
+        >>> # Single evaluation
+        >>> result = await metric.ascore(
+        ...     reference_contexts=["Apple Inc. is a technology company..."],
+        ...     response="Apple is a tech company founded by Steve Jobs."
+        ... )
+        >>> print(f"Summary Score: {result.value}")
+        >>>
+        >>> # Custom configuration (more conciseness focus)
+        >>> concise_metric = SummaryScore(
+        ...     llm=llm,
+        ...     length_penalty=True,
+        ...     coeff=0.8  # More weight on conciseness
+        ... )
+
+    Attributes:
+        llm: Modern instructor-based LLM for keyphrase, question, and answer generation
+        name: The metric name
+        length_penalty: Whether to apply conciseness penalty for long summaries
+        coeff: Weight for conciseness score (0.0=only QA, 1.0=only conciseness)
+        allowed_values: Score range (0.0 to 1.0)
+    """
+
+    # Type hints for linter (attributes are set in __init__)
+    llm: "InstructorBaseRagasLLM"
+
+    def __init__(
+        self,
+        llm: "InstructorBaseRagasLLM",
+        name: str = "summary_score",
+        length_penalty: bool = True,
+        coeff: float = 0.5,
+        **kwargs,
+    ):
+        """
+        Initialize SummaryScore metric with required components.
+
+        Args:
+            llm: Modern instructor-based LLM for keyphrase, question, and answer generation
+            name: The metric name
+            length_penalty: Whether to apply conciseness penalty for long summaries
+            coeff: Weight for conciseness score (0.0=only QA, 1.0=only conciseness)
+        """
+        # Set attributes explicitly before calling super()
+        self.llm = llm
+        self.length_penalty = length_penalty
+        self.coeff = coeff
+
+        # Validate coefficient
+        if not (0.0 <= coeff <= 1.0):
+            raise ValueError(f"Coefficient must be between 0.0 and 1.0, got {coeff}")
+
+        # Call super() for validation (without passing llm in kwargs)
+        super().__init__(name=name, **kwargs)
+
+    async def ascore(
+        self, reference_contexts: List[str], response: str
+    ) -> MetricResult:
+        """
+        Calculate summary score.
+
+        Args:
+            reference_contexts: The original contexts that were summarized
+            response: The summary to evaluate
+
+        Returns:
+            MetricResult with summary score (0.0-1.0)
+
+        Raises:
+            ValueError: If reference_contexts is empty or response is empty/whitespace only
+        """
+        # Input validation
+        if not reference_contexts or not any(ctx.strip() for ctx in reference_contexts):
+            raise ValueError(
+                "reference_contexts cannot be empty or contain only whitespace"
+            )
+
+        if not response or not response.strip():
+            raise ValueError("response cannot be empty or whitespace only")
+
+        # Step 1: Combine contexts and extract keyphrases
+        text = "\n".join(reference_contexts)
+        keyphrases = await self._extract_keyphrases(text)
+
+        if not keyphrases:
+            # Match legacy behavior: log error and continue with empty list
+            logging.error("No keyphrases generated, unable to calculate the score.")
+            keyphrases = []
+
+        # Step 2: Generate questions from keyphrases
+        questions = await self._generate_questions(text, keyphrases)
+
+        if not questions:
+            # Match legacy behavior: log error and continue with empty list
+            logging.error("No questions generated, unable to calculate the score.")
+            questions = []
+
+        # Step 3: Check if summary can answer the questions
+        answers = await self._generate_answers(response, questions)
+
+        # Step 4: Calculate QA score
+        qa_score = self._compute_qa_score(answers)
+
+        # Step 5: Calculate final score (with optional conciseness penalty)
+        if self.length_penalty:
+            conciseness_score = self._compute_conciseness_score(text, response)
+            final_score = qa_score * (1 - self.coeff) + conciseness_score * self.coeff
+        else:
+            final_score = qa_score
+
+        return MetricResult(value=float(final_score))
+
+    async def _extract_keyphrases(self, text: str) -> List[str]:
+        """Extract keyphrases from text using the keyphrase extraction prompt."""
+        prompt = extract_keyphrases_prompt(text)
+        result = await self.llm.agenerate(prompt, ExtractedKeyphrases)
+        return result.keyphrases
+
+    async def _generate_questions(self, text: str, keyphrases: List[str]) -> List[str]:
+        """Generate questions from text and keyphrases."""
+        prompt = generate_questions_prompt(text, keyphrases)
+        result = await self.llm.agenerate(prompt, QuestionsGenerated)
+        return result.questions
+
+    async def _generate_answers(self, summary: str, questions: List[str]) -> List[str]:
+        """Generate answers by checking if summary can answer questions."""
+        prompt = generate_answers_prompt(summary, questions)
+        result = await self.llm.agenerate(prompt, AnswersGenerated)
+        return result.answers
+
+    def _compute_qa_score(self, answers: List[str]) -> float:
+        """Compute QA score as ratio of correct answers. Matches legacy behavior exactly."""
+        correct = sum([1 for a in answers if a.lower() == "1"])
+        return correct / len(
+            answers
+        )  # Will raise ZeroDivisionError if answers is empty (legacy behavior)
+
+    def _compute_conciseness_score(self, text: str, summary: str) -> float:
+        """Compute conciseness score based on length ratio."""
+        return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)
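The only arithmetic in the class lives in _compute_qa_score and _compute_conciseness_score, combined in ascore as qa_score * (1 - coeff) + conciseness_score * coeff. A small worked example of that combination, with illustrative numbers only:

# Illustrative numbers only; this mirrors _compute_qa_score,
# _compute_conciseness_score and the blending step in ascore (no LLM calls).
answers = ["1", "1", "0", "1"]      # summary answered 3 of the 4 generated questions
qa_score = sum(a == "1" for a in answers) / len(answers)            # 0.75

text_len, summary_len = 400, 120    # character lengths of joined contexts vs. summary
conciseness = 1 - min(summary_len, text_len) / (text_len + 1e-10)   # 0.70

coeff = 0.5                         # default: equal weight for QA and conciseness
final_score = qa_score * (1 - coeff) + conciseness * coeff          # 0.725
print(final_score)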
diff --git a/src/ragas/prompt/metrics/summary_score.py b/src/ragas/prompt/metrics/summary_score.py
new file mode 100644
index 000000000..a0459a20e
--- /dev/null
+++ b/src/ragas/prompt/metrics/summary_score.py
@@ -0,0 +1,155 @@
+"""Summary Score prompts - V1-identical using exact PydanticPrompt.to_string() output."""
+
+import json
+import typing as t
+
+
+def extract_keyphrases_prompt(text: str) -> str:
+    """
+    V1-identical keyphrase extraction - matches PydanticPrompt.to_string() exactly.
+
+    Args:
+        text: The text to extract keyphrases from
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    # Format input exactly like V1's model_dump_json(indent=4, exclude_none=True)
+    safe_text = json.dumps(text)
+
+    return f"""Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages.
+Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
+{{"properties": {{"keyphrases": {{"items": {{"type": "string"}}, "title": "Keyphrases", "type": "array"}}}}, "required": ["keyphrases"], "title": "ExtractedKeyphrases", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.
+
+--------EXAMPLES-----------
+Example 1
+Input: {{
+    "text": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023."
+}}
+Output: {{
+    "keyphrases": [
+        "Apple Inc.",
+        "Cupertino, California",
+        "Steve Jobs",
+        "1976",
+        "$3 trillion",
+        "2023"
+    ]
+}}
+-----------------------------
+
+Now perform the same with the following input
+input: {{
+    "text": {safe_text}
+}}
+Output: """
+
+
+def generate_questions_prompt(text: str, keyphrases: t.List[str]) -> str:
+    """
+    V1-identical question generation - matches PydanticPrompt.to_string() exactly.
+
+    Args:
+        text: The text to generate questions about
+        keyphrases: The keyphrases extracted from the text
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
+    safe_text = json.dumps(text)
+    safe_keyphrases = json.dumps(keyphrases, indent=4).replace("\n", "\n    ")
+
+    return f"""Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text.
+Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
+{{"properties": {{"questions": {{"items": {{"type": "string"}}, "title": "Questions", "type": "array"}}}}, "required": ["questions"], "title": "QuestionsGenerated", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.
+
+--------EXAMPLES-----------
+Example 1
+Input: {{
+    "text": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023.",
+    "keyphrases": [
+        "Apple Inc.",
+        "Cupertino, California",
+        "Steve Jobs",
+        "1976",
+        "$3 trillion",
+        "2023"
+    ]
+}}
+Output: {{
+    "questions": [
+        "Is Apple Inc. a technology company?",
+        "Is Apple Inc. based in Cupertino, California?",
+        "Was Apple Inc. founded by Steve Jobs?",
+        "Was Apple Inc. founded in 1976?",
+        "Did Apple Inc. reach a market capitalization of $3 trillion?",
+        "Did Apple Inc. reach a market capitalization of $3 trillion in 2023?"
+    ]
+}}
+-----------------------------
+
+Now perform the same with the following input
+input: {{
+    "text": {safe_text},
+    "keyphrases": {safe_keyphrases}
+}}
+Output: """
+
+
+def generate_answers_prompt(summary: str, questions: t.List[str]) -> str:
+    """
+    V1-identical answer generation - matches PydanticPrompt.to_string() exactly.
+
+    Args:
+        summary: The summary to evaluate
+        questions: The questions to check against the summary
+
+    Returns:
+        V1-identical prompt string for the LLM
+    """
+    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
+    safe_summary = json.dumps(summary)
+    safe_questions = json.dumps(questions, indent=4).replace("\n", "\n    ")
+
+    return f"""Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question.
+Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
+{{"properties": {{"answers": {{"items": {{"type": "string"}}, "title": "Answers", "type": "array"}}}}, "required": ["answers"], "title": "AnswersGenerated", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.
+
+--------EXAMPLES-----------
+Example 1
+Input: {{
+    "summary": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023.",
+    "questions": [
+        "Is Apple Inc. a technology company?",
+        "Is Apple Inc. based in Cupertino, California?",
+        "Was Apple Inc. founded by Steve Jobs?",
+        "Was Apple Inc. founded in 1976?",
+        "Did Apple Inc. reach a market capitalization of $3 trillion?",
+        "Did Apple Inc. reach a market capitalization of $3 trillion in 2023?",
+        "Is Apple Inc. a major software company?",
+        "Is Apple Inc. known for the iPhone?",
+        "Was Steve Jobs the co-founder of Apple Inc.?"
+    ]
+}}
+Output: {{
+    "answers": [
+        "1",
+        "1",
+        "1",
+        "1",
+        "1",
+        "1",
+        "0",
+        "0",
+        "1"
+    ]
+}}
+-----------------------------
+
+Now perform the same with the following input
+input: {{
+    "summary": {safe_summary},
+    "questions": {safe_questions}
+}}
+Output: """
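Because these builders reproduce the V1 PydanticPrompt.to_string() output with plain f-strings, the only moving part is json.dumps, which escapes quotes and newlines in the caller's text so the trailing input block stays valid JSON. A small sketch for inspecting the rendered prompts; the "RagasCorp" input text is made up purely for illustration:

# Sketch: render the prompts for an arbitrary input and confirm the text is
# embedded in escaped JSON form.
import json

from ragas.prompt.metrics.summary_score import (
    extract_keyphrases_prompt,
    generate_questions_prompt,
)

text = 'RagasCorp reported "record" revenue of $1.2B in Q3 2024.\nCEO Jane Doe presented the results.'

keyphrase_prompt = extract_keyphrases_prompt(text)
assert json.dumps(text) in keyphrase_prompt  # quotes/newlines escaped, JSON stays valid

question_prompt = generate_questions_prompt(text, ["RagasCorp", "$1.2B", "Q3 2024"])
print(question_prompt[-400:])  # tail: the "Now perform the same..." block with our input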
diff --git a/tests/e2e/metrics_migration/test_summary_score_migration.py b/tests/e2e/metrics_migration/test_summary_score_migration.py
new file mode 100644
index 000000000..df092d35e
--- /dev/null
+++ b/tests/e2e/metrics_migration/test_summary_score_migration.py
@@ -0,0 +1,184 @@
+"""E2E tests for Summary Score metric migration from v1 to v2."""
+
+import pytest
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics._summarization import SummarizationScore as LegacySummaryScore
+from ragas.metrics.collections import SummaryScore
+
+
+class TestSummaryScoreE2EMigration:
+    """E2E test compatibility between legacy SummaryScore and new V2 SummaryScore with modern components."""
+
+    @pytest.fixture
+    def sample_data(self):
+        """Real-world test cases for summary score evaluation."""
+        return [
+            {
+                "reference_contexts": [
+                    "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023. The company is known for innovative products like iPhone, iPad, and Mac computers. Apple has retail stores worldwide and employs over 150,000 people."
+                ],
+                "response": "Apple Inc. is a technology company founded by Steve Jobs in 1976, based in Cupertino, California. The company reached a $3 trillion market cap in 2023.",
+                "description": "Good summary with key facts",
+            },
+            {
+                "reference_contexts": [
+                    "Climate change refers to long-term shifts in global temperatures and weather patterns. Since the 1800s, human activities have been the main driver of climate change, primarily due to fossil fuel burning which releases greenhouse gases. The effects include rising sea levels, extreme weather events, and ecosystem disruption."
+                ],
+                "response": "Weather changes happen sometimes.",
+                "description": "Very brief summary missing key details",
+            },
+            {
+                "reference_contexts": [
+                    "The Great Wall of China is an ancient series of walls and fortifications built across the northern borders of China. Construction began in the 7th century BC and continued for centuries. The wall stretches over 13,000 miles and was built to protect against invasions."
+                ],
+                "response": "The Great Wall of China is an ancient series of walls and fortifications built across northern China starting in the 7th century BC. It stretches over 13,000 miles and was built for protection against invasions.",
+                "description": "Comprehensive summary with most details",
+            },
+        ]
+
+    @pytest.fixture
+    def test_llm(self):
+        """Create a LangChain LLM for legacy summary score evaluation."""
+        try:
+            from langchain_openai import ChatOpenAI
+
+            from ragas.llms import LangchainLLMWrapper
+
+            langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01)
+            return LangchainLLMWrapper(langchain_llm)
+        except ImportError as e:
+            pytest.skip(f"LangChain LLM not available: {e}")
+        except Exception as e:
+            pytest.skip(f"Could not create LangChain LLM (API key may be missing): {e}")
+
+    @pytest.fixture
+    def test_modern_llm(self):
+        """Create a modern instructor LLM for v2 implementation."""
+        try:
+            import openai
+
+            from ragas.llms.base import llm_factory
+
+            client = openai.AsyncOpenAI()
+            return llm_factory("gpt-4o", client=client)
+        except ImportError as e:
+            pytest.skip(f"LLM factory not available: {e}")
+        except Exception as e:
+            pytest.skip(f"Could not create modern LLM (API key may be missing): {e}")
+
+    @pytest.mark.asyncio
+    async def test_legacy_summary_score_vs_v2_summary_score_e2e_compatibility(
+        self, sample_data, test_llm, test_modern_llm
+    ):
+        """E2E test that legacy and v2 implementations produce similar scores."""
+
+        if test_llm is None or test_modern_llm is None:
+            pytest.skip("LLM required for E2E testing")
+
+        for i, data in enumerate(sample_data):
+            print(f"\n🧪 Testing Summary Score - Case {i + 1}: {data['description']}")
+            print(f"   Contexts: {data['reference_contexts'][0][:80]}...")
+            print(f"   Response: {data['response'][:80]}...")
+
+            # Legacy implementation
+            legacy_summary_score = LegacySummaryScore(llm=test_llm)
+            legacy_sample = SingleTurnSample(
+                reference_contexts=data["reference_contexts"],
+                response=data["response"],
+            )
+            legacy_score = await legacy_summary_score._single_turn_ascore(
+                legacy_sample, None
+            )
+
+            # V2 implementation
+            v2_summary_score = SummaryScore(llm=test_modern_llm)
+            v2_result = await v2_summary_score.ascore(
+                reference_contexts=data["reference_contexts"],
+                response=data["response"],
+            )
+
+            score_diff = abs(legacy_score - v2_result.value)
+            print(f"   Legacy: {legacy_score:.6f}")
+            print(f"   V2:     {v2_result.value:.6f}")
+            print(f"   Diff:   {score_diff:.6f}")
+
+            # Ensure implementations give reasonably similar scores for complex multi-step metric
+            assert score_diff < 0.2, (
+                f"Legacy and V2 scores should be reasonably similar: Legacy={legacy_score:.6f}, "
+                f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.2)"
+            )
+            print("   ✅ Both implementations give consistent scores")
+
+            # Validate score ranges
+            assert 0.0 <= legacy_score <= 1.0
+            assert 0.0 <= v2_result.value <= 1.0
+
+    @pytest.mark.asyncio
+    async def test_summary_score_weight_configuration(self, test_modern_llm):
+        """Test that v2 implementation respects weight configuration."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for weight testing")
+
+        # Test data
+        contexts = [
+            "Apple Inc. is a technology company founded by Steve Jobs in 1976. The company is based in Cupertino, California."
+        ]
+        summary = "Apple is a tech company."
+
+        # Test different coefficient values
+        coefficients = [0.0, 0.5, 1.0]  # 0=only QA, 0.5=balanced, 1.0=only conciseness
+
+        results = []
+        for coeff in coefficients:
+            metric = SummaryScore(llm=test_modern_llm, coeff=coeff, length_penalty=True)
+            result = await metric.ascore(reference_contexts=contexts, response=summary)
+            results.append(result.value)
+
+            # Validate score range
+            assert 0.0 <= result.value <= 1.0
+
+        print(
+            f"Coefficient results: coeff=0.0: {results[0]:.3f}, coeff=0.5: {results[1]:.3f}, coeff=1.0: {results[2]:.3f}"
+        )
+
+        # Different coefficients should produce different scores
+        assert results[0] != results[2], (
+            "Different coefficients should produce different scores"
+        )
+
+    @pytest.mark.asyncio
+    async def test_summary_score_parameter_validation(self, test_modern_llm):
+        """Test that v2 implementation validates parameters correctly."""
+
+        if test_modern_llm is None:
+            pytest.skip("Modern LLM required for parameter testing")
+
+        # Test invalid coefficient (too high)
+        with pytest.raises(ValueError, match="Coefficient must be between 0.0 and 1.0"):
+            SummaryScore(llm=test_modern_llm, coeff=1.5)
+
+        # Test invalid coefficient (negative)
+        with pytest.raises(ValueError, match="Coefficient must be between 0.0 and 1.0"):
+            SummaryScore(llm=test_modern_llm, coeff=-0.1)
+
+        # Test valid configurations
+        metric1 = SummaryScore(llm=test_modern_llm, length_penalty=True, coeff=0.0)
+        metric2 = SummaryScore(llm=test_modern_llm, length_penalty=False, coeff=1.0)
+
+        assert metric1.length_penalty is True
+        assert metric1.coeff == 0.0
+        assert metric2.length_penalty is False
+        assert metric2.coeff == 1.0
+
+    def test_summary_score_migration_requirements_documented(self):
+        """Test that migration requirements are properly documented."""
+
+        # V2 implementation should not accept legacy components
+        with pytest.raises((TypeError, ValueError, AttributeError)):
+            SummaryScore(llm="invalid_llm_type")  # Should reject string
+
+        # V2 should only accept InstructorBaseRagasLLM
+        with pytest.raises((TypeError, ValueError, AttributeError)):
+            SummaryScore(llm=None)  # Should reject None
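For ad-hoc debugging outside pytest, the same legacy-vs-v2 comparison the e2e test performs can be run as a standalone script. This is a sketch under the same assumptions as the fixtures above (OpenAI key available, gpt-4o for both paths); the sample texts are shortened placeholders rather than the full fixture data.

# Standalone sketch of the legacy-vs-v2 comparison above, runnable outside pytest.
# Assumes OPENAI_API_KEY is set; model names match the fixtures, sample texts are
# shortened placeholders.
import asyncio

import openai
from langchain_openai import ChatOpenAI

from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.llms.base import llm_factory
from ragas.metrics._summarization import SummarizationScore as LegacySummaryScore
from ragas.metrics.collections import SummaryScore

CONTEXTS = ["Apple Inc. is a technology company based in Cupertino, California."]
SUMMARY = "Apple is a technology company in Cupertino."


async def compare() -> None:
    legacy = LegacySummaryScore(
        llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o", temperature=0.01))
    )
    modern = SummaryScore(llm=llm_factory("gpt-4o", client=openai.AsyncOpenAI()))

    legacy_score = await legacy._single_turn_ascore(
        SingleTurnSample(reference_contexts=CONTEXTS, response=SUMMARY), None
    )
    modern_result = await modern.ascore(reference_contexts=CONTEXTS, response=SUMMARY)

    diff = abs(legacy_score - modern_result.value)
    print(f"legacy={legacy_score:.3f}  v2={modern_result.value:.3f}  diff={diff:.3f}")


if __name__ == "__main__":
    asyncio.run(compare())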