"""Summary Score metric v2 - Modern implementation with function-based prompts."""

import logging
import typing as t
from typing import List

from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.summary_score import (
    extract_keyphrases_prompt,
    generate_answers_prompt,
    generate_questions_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ExtractedKeyphrases(BaseModel):
    """Structured output for keyphrase extraction."""

    keyphrases: List[str]


class QuestionsGenerated(BaseModel):
    """Structured output for question generation."""

    questions: List[str]


class AnswersGenerated(BaseModel):
    """Structured output for answer generation."""

    answers: List[str]


class SummaryScore(BaseMetric):
    """
    Modern v2 implementation of summarization score evaluation.

    Measures how well a summary captures important information from contexts by:
    1. Extracting keyphrases from the original contexts
    2. Generating yes/no questions from those keyphrases
    3. Checking whether the summary can answer those questions
    4. Optionally penalizing overly long summaries for conciseness

    This implementation uses modern instructor-based LLMs with structured output.
    Only modern components are supported; legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import SummaryScore
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
        >>>
        >>> # Create metric instance
        >>> metric = SummaryScore(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     reference_contexts=["Apple Inc. is a technology company..."],
        ...     response="Apple is a tech company founded by Steve Jobs."
        ... )
        >>> print(f"Summary Score: {result.value}")
        >>>
        >>> # Custom configuration (more conciseness focus)
        >>> concise_metric = SummaryScore(
        ...     llm=llm,
        ...     length_penalty=True,
        ...     coeff=0.8  # More weight on conciseness
        ... )

    Attributes:
        llm: Modern instructor-based LLM for keyphrase, question, and answer generation
        name: The metric name
        length_penalty: Whether to apply a conciseness penalty for long summaries
        coeff: Weight of the conciseness score (0.0 = only QA, 1.0 = only conciseness)
        allowed_values: Score range (0.0 to 1.0)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "summary_score",
        length_penalty: bool = True,
        coeff: float = 0.5,
        **kwargs,
    ):
        """
        Initialize SummaryScore metric with required components.

        Args:
            llm: Modern instructor-based LLM for keyphrase, question, and answer generation
            name: The metric name
            length_penalty: Whether to apply a conciseness penalty for long summaries
            coeff: Weight of the conciseness score (0.0 = only QA, 1.0 = only conciseness)
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.length_penalty = length_penalty
        self.coeff = coeff

        # Validate coefficient
        if not (0.0 <= coeff <= 1.0):
            raise ValueError(f"Coefficient must be between 0.0 and 1.0, got {coeff}")

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, reference_contexts: List[str], response: str
    ) -> MetricResult:
        """
        Calculate summary score.

        Args:
            reference_contexts: The original contexts that were summarized
            response: The summary to evaluate

        Returns:
            MetricResult with summary score (0.0-1.0)

        Raises:
            ValueError: If reference_contexts is empty or response is empty/whitespace only
        """
        # Input validation
        if not reference_contexts or not any(ctx.strip() for ctx in reference_contexts):
            raise ValueError(
                "reference_contexts cannot be empty or contain only whitespace"
            )

        if not response or not response.strip():
            raise ValueError("response cannot be empty or whitespace only")

        # Step 1: Combine contexts and extract keyphrases
        text = "\n".join(reference_contexts)
        keyphrases = await self._extract_keyphrases(text)

        if not keyphrases:
            # Match legacy behavior: log error and continue with empty list
            logging.error("No keyphrases generated, unable to calculate the score.")
            keyphrases = []

        # Step 2: Generate questions from keyphrases
        questions = await self._generate_questions(text, keyphrases)

        if not questions:
            # Match legacy behavior: log error and continue with empty list
            logging.error("No questions generated, unable to calculate the score.")
            questions = []

        # Step 3: Check if summary can answer the questions
        answers = await self._generate_answers(response, questions)

        # Step 4: Calculate QA score
        qa_score = self._compute_qa_score(answers)

        # Step 5: Calculate final score (with optional conciseness penalty)
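        # Weighted combination: final = (1 - coeff) * qa_score + coeff * conciseness_score.
        # With the default coeff=0.5 both terms contribute equally; when length_penalty is
        # False the conciseness term is skipped and the raw QA score is returned.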
        if self.length_penalty:
            conciseness_score = self._compute_conciseness_score(text, response)
            final_score = qa_score * (1 - self.coeff) + conciseness_score * self.coeff
        else:
            final_score = qa_score

        return MetricResult(value=float(final_score))

    async def _extract_keyphrases(self, text: str) -> List[str]:
        """Extract keyphrases from text using the keyphrase extraction prompt."""
        prompt = extract_keyphrases_prompt(text)
        result = await self.llm.agenerate(prompt, ExtractedKeyphrases)
        return result.keyphrases

    async def _generate_questions(self, text: str, keyphrases: List[str]) -> List[str]:
        """Generate questions from text and keyphrases."""
        prompt = generate_questions_prompt(text, keyphrases)
        result = await self.llm.agenerate(prompt, QuestionsGenerated)
        return result.questions

    async def _generate_answers(self, summary: str, questions: List[str]) -> List[str]:
        """Generate answers by checking if summary can answer questions."""
        prompt = generate_answers_prompt(summary, questions)
        result = await self.llm.agenerate(prompt, AnswersGenerated)
        return result.answers

    def _compute_qa_score(self, answers: List[str]) -> float:
        """Compute QA score as ratio of correct answers. Matches legacy behavior exactly."""
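        # Each answer is expected to be a "1"/"0" string from the answer-generation
        # prompt ("1" meaning the summary can answer the question), so the score is
        # the fraction of questions the summary answered.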
        correct = sum([1 for a in answers if a.lower() == "1"])
        # Will raise ZeroDivisionError if answers is empty (legacy behavior)
        return correct / len(answers)

    def _compute_conciseness_score(self, text: str, summary: str) -> float:
        """Compute conciseness score based on length ratio."""
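        # Roughly 1 - len(summary) / len(text): e.g. a 200-char summary of a
        # 1000-char text scores about 0.8. The min() keeps a summary longer than
        # the text at ~0.0 instead of going negative, and the 1e-10 guards
        # against division by zero for empty text.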
        return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)