Migrate SummaryScore #2376
Merged: anistark merged 5 commits into explodinggradients:main from rhlbhatnagar:rbhatnagar/migrate_metrics_4 on Oct 28, 2025.
Changes from all commits (5 commits):
- eb5fdba  Migrate CER (rhlbhatnagar)
- c462a61  Migrate Summary Score (rhlbhatnagar)
- 9c19bec  Merge branch 'main' into rbhatnagar/migrate_metrics_4 (rhlbhatnagar)
- 4197eb8  Merge branch 'main' into rbhatnagar/migrate_metrics_4 (rhlbhatnagar)
- a439732  PR comments (rhlbhatnagar)
New file (@@ -0,0 +1,203 @@): the v2 SummaryScore metric.

"""Summary Score metric v2 - Modern implementation with function-based prompts."""

import logging
import typing as t
from typing import List

from pydantic import BaseModel

from ragas.metrics.collections.base import BaseMetric
from ragas.metrics.result import MetricResult
from ragas.prompt.metrics.summary_score import (
    extract_keyphrases_prompt,
    generate_answers_prompt,
    generate_questions_prompt,
)

if t.TYPE_CHECKING:
    from ragas.llms.base import InstructorBaseRagasLLM


class ExtractedKeyphrases(BaseModel):
    """Structured output for keyphrase extraction."""

    keyphrases: List[str]


class QuestionsGenerated(BaseModel):
    """Structured output for question generation."""

    questions: List[str]


class AnswersGenerated(BaseModel):
    """Structured output for answer generation."""

    answers: List[str]


class SummaryScore(BaseMetric):
    """
    Modern v2 implementation of summarization score evaluation.

    Measures how well a summary captures important information from contexts by:
    1. Extracting keyphrases from the original contexts
    2. Generating yes/no questions from those keyphrases
    3. Checking if the summary can answer those questions
    4. Optionally penalizing overly long summaries for conciseness

    This implementation uses modern instructor LLMs with structured output.
    Only supports modern components - legacy wrappers are rejected with clear error messages.

    Usage:
        >>> import instructor
        >>> from openai import AsyncOpenAI
        >>> from ragas.llms.base import instructor_llm_factory
        >>> from ragas.metrics.collections import SummaryScore
        >>>
        >>> # Setup dependencies
        >>> client = AsyncOpenAI()
        >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
        >>>
        >>> # Create metric instance
        >>> metric = SummaryScore(llm=llm)
        >>>
        >>> # Single evaluation
        >>> result = await metric.ascore(
        ...     reference_contexts=["Apple Inc. is a technology company..."],
        ...     response="Apple is a tech company founded by Steve Jobs."
        ... )
        >>> print(f"Summary Score: {result.value}")
        >>>
        >>> # Custom configuration (more conciseness focus)
        >>> concise_metric = SummaryScore(
        ...     llm=llm,
        ...     length_penalty=True,
        ...     coeff=0.8  # More weight on conciseness
        ... )

    Attributes:
        llm: Modern instructor-based LLM for keyphrase, question, and answer generation
        name: The metric name
        length_penalty: Whether to apply conciseness penalty for long summaries
        coeff: Weight for conciseness score (0.0=only QA, 1.0=only conciseness)
        allowed_values: Score range (0.0 to 1.0)
    """

    # Type hints for linter (attributes are set in __init__)
    llm: "InstructorBaseRagasLLM"

    def __init__(
        self,
        llm: "InstructorBaseRagasLLM",
        name: str = "summary_score",
        length_penalty: bool = True,
        coeff: float = 0.5,
        **kwargs,
    ):
        """
        Initialize SummaryScore metric with required components.

        Args:
            llm: Modern instructor-based LLM for keyphrase, question, and answer generation
            name: The metric name
            length_penalty: Whether to apply conciseness penalty for long summaries
            coeff: Weight for conciseness score (0.0=only QA, 1.0=only conciseness)
        """
        # Set attributes explicitly before calling super()
        self.llm = llm
        self.length_penalty = length_penalty
        self.coeff = coeff

        # Validate coefficient
        if not (0.0 <= coeff <= 1.0):
            raise ValueError(f"Coefficient must be between 0.0 and 1.0, got {coeff}")

        # Call super() for validation (without passing llm in kwargs)
        super().__init__(name=name, **kwargs)

    async def ascore(
        self, reference_contexts: List[str], response: str
    ) -> MetricResult:
        """
        Calculate summary score.

        Args:
            reference_contexts: The original contexts that were summarized
            response: The summary to evaluate

        Returns:
            MetricResult with summary score (0.0-1.0)

        Raises:
            ValueError: If reference_contexts is empty or response is empty/whitespace only
        """
        # Input validation
        if not reference_contexts or not any(ctx.strip() for ctx in reference_contexts):
            raise ValueError(
                "reference_contexts cannot be empty or contain only whitespace"
            )

        if not response or not response.strip():
            raise ValueError("response cannot be empty or whitespace only")

        # Step 1: Combine contexts and extract keyphrases
        text = "\n".join(reference_contexts)
        keyphrases = await self._extract_keyphrases(text)

        if not keyphrases:
            # Match legacy behavior: log error and continue with empty list
            logging.error("No keyphrases generated, unable to calculate the score.")
            keyphrases = []

        # Step 2: Generate questions from keyphrases
        questions = await self._generate_questions(text, keyphrases)

        if not questions:
            # Match legacy behavior: log error and continue with empty list
            logging.error("No questions generated, unable to calculate the score.")
            questions = []

        # Step 3: Check if summary can answer the questions
        answers = await self._generate_answers(response, questions)

        # Step 4: Calculate QA score
        qa_score = self._compute_qa_score(answers)

        # Step 5: Calculate final score (with optional conciseness penalty)
        if self.length_penalty:
            conciseness_score = self._compute_conciseness_score(text, response)
            final_score = qa_score * (1 - self.coeff) + conciseness_score * self.coeff
        else:
            final_score = qa_score

        return MetricResult(value=float(final_score))

    async def _extract_keyphrases(self, text: str) -> List[str]:
        """Extract keyphrases from text using the keyphrase extraction prompt."""
        prompt = extract_keyphrases_prompt(text)
        result = await self.llm.agenerate(prompt, ExtractedKeyphrases)
        return result.keyphrases

    async def _generate_questions(self, text: str, keyphrases: List[str]) -> List[str]:
        """Generate questions from text and keyphrases."""
        prompt = generate_questions_prompt(text, keyphrases)
        result = await self.llm.agenerate(prompt, QuestionsGenerated)
        return result.questions

    async def _generate_answers(self, summary: str, questions: List[str]) -> List[str]:
        """Generate answers by checking if summary can answer questions."""
        prompt = generate_answers_prompt(summary, questions)
        result = await self.llm.agenerate(prompt, AnswersGenerated)
        return result.answers

    def _compute_qa_score(self, answers: List[str]) -> float:
        """Compute QA score as ratio of correct answers. Matches legacy behavior exactly."""
        correct = sum([1 for a in answers if a.lower() == "1"])
        return correct / len(
            answers
        )  # Will raise ZeroDivisionError if answers is empty (legacy behavior)

    def _compute_conciseness_score(self, text: str, summary: str) -> float:
        """Compute conciseness score based on length ratio."""
        return 1 - min(len(summary), len(text)) / (len(text) + 1e-10)
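For orientation, the final score in `ascore` blends the QA ratio with the length-based conciseness term. A minimal sketch of that arithmetic, using made-up answer strings and text lengths rather than real LLM output:

```python
# Worked example of the score combination used in SummaryScore.ascore.
# The answers and lengths below are hypothetical, purely to show the math.
answers = ["1", "1", "0", "1"]      # summary answers 3 of 4 generated questions
qa_score = sum(1 for a in answers if a.lower() == "1") / len(answers)   # 0.75

text_len, summary_len = 1000, 200   # character lengths of joined contexts vs. summary
conciseness = 1 - min(summary_len, text_len) / (text_len + 1e-10)       # ~0.8

coeff = 0.5                         # default weight from __init__
final_score = qa_score * (1 - coeff) + conciseness * coeff              # ~0.775
print(round(final_score, 3))
```

With `length_penalty=False` the conciseness term is skipped and the result is just the QA ratio.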
  
    
New file (@@ -0,0 +1,155 @@): the Summary Score prompt builders.

"""Summary Score prompts - V1-identical using exact PydanticPrompt.to_string() output."""

import json
import typing as t


def extract_keyphrases_prompt(text: str) -> str:
    """
    V1-identical keyphrase extraction - matches PydanticPrompt.to_string() exactly.

    Args:
        text: The text to extract keyphrases from

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format input exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_text = json.dumps(text)

    return f"""Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"keyphrases": {{"items": {{"type": "string"}}, "title": "Keyphrases", "type": "array"}}}}, "required": ["keyphrases"], "title": "ExtractedKeyphrases", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "text": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023."
}}
Output: {{
    "keyphrases": [
        "Apple Inc.",
        "Cupertino, California",
        "Steve Jobs",
        "1976",
        "$3 trillion",
        "2023"
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "text": {safe_text}
}}
Output: """


def generate_questions_prompt(text: str, keyphrases: t.List[str]) -> str:
    """
    V1-identical question generation - matches PydanticPrompt.to_string() exactly.

    Args:
        text: The text to generate questions about
        keyphrases: The keyphrases extracted from the text

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_text = json.dumps(text)
    safe_keyphrases = json.dumps(keyphrases, indent=4).replace("\n", "\n    ")

    return f"""Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"questions": {{"items": {{"type": "string"}}, "title": "Questions", "type": "array"}}}}, "required": ["questions"], "title": "QuestionsGenerated", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "text": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023.",
    "keyphrases": [
        "Apple Inc.",
        "Cupertino, California",
        "Steve Jobs",
        "1976",
        "$3 trillion",
        "2023"
    ]
}}
Output: {{
    "questions": [
        "Is Apple Inc. a technology company?",
        "Is Apple Inc. based in Cupertino, California?",
        "Was Apple Inc. founded by Steve Jobs?",
        "Was Apple Inc. founded in 1976?",
        "Did Apple Inc. reach a market capitalization of $3 trillion?",
        "Did Apple Inc. reach a market capitalization of $3 trillion in 2023?"
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "text": {safe_text},
    "keyphrases": {safe_keyphrases}
}}
Output: """


def generate_answers_prompt(summary: str, questions: t.List[str]) -> str:
    """
    V1-identical answer generation - matches PydanticPrompt.to_string() exactly.

    Args:
        summary: The summary to evaluate
        questions: The questions to check against the summary

    Returns:
        V1-identical prompt string for the LLM
    """
    # Format inputs exactly like V1's model_dump_json(indent=4, exclude_none=True)
    safe_summary = json.dumps(summary)
    safe_questions = json.dumps(questions, indent=4).replace("\n", "\n    ")

    return f"""Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{{"properties": {{"answers": {{"items": {{"type": "string"}}, "title": "Answers", "type": "array"}}}}, "required": ["answers"], "title": "AnswersGenerated", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {{
    "summary": "Apple Inc. is a technology company based in Cupertino, California. Founded by Steve Jobs in 1976, it reached a market capitalization of $3 trillion in 2023.",
    "questions": [
        "Is Apple Inc. a technology company?",
        "Is Apple Inc. based in Cupertino, California?",
        "Was Apple Inc. founded by Steve Jobs?",
        "Was Apple Inc. founded in 1976?",
        "Did Apple Inc. reach a market capitalization of $3 trillion?",
        "Did Apple Inc. reach a market capitalization of $3 trillion in 2023?",
        "Is Apple Inc. a major software company?",
        "Is Apple Inc. known for the iPhone?",
        "Was Steve Jobs the co-founder of Apple Inc.?"
    ]
}}
Output: {{
    "answers": [
        "1",
        "1",
        "1",
        "1",
        "1",
        "1",
        "0",
        "0",
        "1"
    ]
}}
-----------------------------

Now perform the same with the following input
input: {{
    "summary": {safe_summary},
    "questions": {safe_questions}
}}
Output: """
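Because these prompt builders are plain functions that return strings, the rendered prompts can be inspected directly without calling an LLM. A small sketch, using a made-up sample text and keyphrase list:

```python
# Render the V1-identical prompts for a toy input and print them for inspection.
# The sample text and keyphrases are illustrative placeholders.
sample_text = "OpenAI released GPT-4 in March 2023."

kp_prompt = extract_keyphrases_prompt(sample_text)
print(kp_prompt)    # full keyphrase-extraction prompt, ending with "Output: "

q_prompt = generate_questions_prompt(sample_text, ["OpenAI", "GPT-4", "March 2023"])
print(q_prompt)     # keyphrases appear as indented JSON inside the input block
```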
      