
Commit 010b5fa

finish extract function
1 parent a2d9245 commit 010b5fa

File tree

3 files changed: +564 -54 lines changed


docs/howtos/customizations/metrics/_cost.md

Lines changed: 16 additions & 30 deletions
@@ -69,15 +69,12 @@ from ragas.cost import get_token_usage_for_openai
 get_token_usage_for_openai(llm_result)
 ```

-/opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
-  from .autonotebook import tqdm as notebook_tqdm
-
-
-
-
-
-TokenUsage(input_tokens=9, output_tokens=9, model='')
+```py
+/opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
+  from .autonotebook import tqdm as notebook_tqdm

+TokenUsage(input_tokens=9, output_tokens=9, model='')
+```


 You can define your own or import parsers if they are defined. If you would like to suggest parser for LLM providers or contribute your own ones please check out this [issue](https://github.com/explodinggradients/ragas/issues/1151) 🙂.
@@ -101,9 +98,9 @@ metric = AspectCriticWithReference(
 )
 ```

-Repo card metadata block was not found. Setting CardData to empty.
-
-
+```py
+Repo card metadata block was not found. Setting CardData to empty.
+```

 ```python
 from ragas import evaluate
@@ -117,38 +114,27 @@ results = evaluate(
 )
 ```

-Evaluating: 100%|██████████| 5/5 [00:01<00:00, 2.81it/s]
-
-
+```py
+Evaluating: 100%|██████████| 5/5 [00:01<00:00, 2.81it/s]
+```

 ```python
 results.total_tokens()
 ```

-
-
-
-TokenUsage(input_tokens=5463, output_tokens=355, model='')
-
+```py
+TokenUsage(input_tokens=5463, output_tokens=355, model='')
+```


 You can compute the cost for each run by passing in the cost per token to `Result.total_cost()` function.

 In this case GPT-4o costs $5 for 1M input tokens and $15 for 1M output tokens.

-
 ```python
 results.total_cost(cost_per_input_token=5 / 1e6, cost_per_output_token=15 / 1e6)
 ```

-
-
-
-0.03264
-
-
-
-
-
-```python
-
+```py
+0.03264
 ```
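
As a quick sanity check on the numbers shown in this diff (not part of the commit), the `0.03264` figure follows directly from the reported token usage and the GPT-4o prices quoted in the doc:

```python
# Token usage reported above: TokenUsage(input_tokens=5463, output_tokens=355)
input_tokens, output_tokens = 5463, 355

# Pricing quoted in the doc: $5 per 1M input tokens, $15 per 1M output tokens
cost = input_tokens * (5 / 1e6) + output_tokens * (15 / 1e6)
print(round(cost, 5))  # 0.03264 = 0.027315 + 0.005325
```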

src/ragas/batch_evaluation.py

Lines changed: 153 additions & 11 deletions
@@ -134,9 +134,7 @@ def evaluate(
             responses=responses,
         )

-        # Process responses to extract scores if possible
-        # This is a simplified implementation - actual score extraction
-        # would depend on the specific metric and response format
+        # Process responses to extract scores
         try:
            result.scores = self._extract_scores(metric, responses)
        except Exception as e:
@@ -243,15 +241,159 @@ def _extract_scores(
         """
         Extract scores from batch responses.

-        This is a placeholder implementation. In practice, this would need
-        to parse the specific response format for each metric and extract
-        the numerical scores.
+        This method parses the batch responses and attempts to extract numerical scores
+        based on the metric's output format. It handles common patterns like JSON
+        responses with verdict fields or direct numerical outputs.
         """
-        logger.warning(
-            f"Score extraction not implemented for metric: {metric.name}. "
-            "Batch responses contain raw LLM outputs that need manual processing."
-        )
-        return [None] * len(responses)
+        scores = []
+
+        for response in responses:
+            score = None
+
+            if response.error is not None:
+                logger.error(
+                    f"Error in batch response {response.custom_id}: {response.error}"
+                )
+                scores.append(None)
+                continue
+
+            if response.response is None:
+                logger.warning(f"No response content for {response.custom_id}")
+                scores.append(None)
+                continue
+
+            try:
+                # Extract content from OpenAI response format
+                content = self._extract_content_from_response(response.response)
+                if content is None:
+                    scores.append(None)
+                    continue
+
+                # Parse as structured output first (JSON)
+                score = self._parse_structured_score(content, metric.name)
+
+                # Parse raw text for score patterns
+                if score is None:
+                    score = self._parse_text_score(content)
+
+                scores.append(score)
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to extract score from response {response.custom_id}: {e}"
+                )
+                scores.append(None)
+
+        return scores
+
+    def _extract_content_from_response(
+        self, response: t.Dict[str, t.Any]
+    ) -> t.Optional[str]:
+        """Extract text content from OpenAI API response format."""
+        try:
+            # Standard OpenAI chat completion response format
+            choices = response.get("choices", [])
+            if choices and len(choices) > 0:
+                message = choices[0].get("message", {})
+                return message.get("content", "")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to extract content from response: {e}")
+            return None
+
+    def _parse_structured_score(
+        self, content: str, metric_name: str
+    ) -> t.Optional[float]:
+        """Parse structured JSON output to extract score."""
+        try:
+            import json
+            import re
+
+            # Clean the content to extract JSON
+            content = content.strip()
+
+            # Look for JSON blocks
+            json_match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL)
+            if json_match:
+                content = json_match.group(1)
+            elif content.startswith("{") and content.endswith("}"):
+                pass  # Already clean JSON
+            else:
+                # Look for JSON object in text
+                json_match = re.search(r"\{[^{}]*\}", content)
+                if json_match:
+                    content = json_match.group(0)
+                else:
+                    return None
+
+            parsed = json.loads(content)
+
+            # Common patterns for different metrics
+            score_patterns = [
+                "score",
+                "verdict",
+                "faithfulness_score",
+                "relevance_score",
+                "correctness_score",
+                "precision",
+                "recall",
+                "f1_score",
+            ]
+
+            for pattern in score_patterns:
+                if pattern in parsed:
+                    value = parsed[pattern]
+                    if isinstance(value, (int, float)):
+                        return float(value)
+                    elif isinstance(value, str) and value.replace(".", "").isdigit():
+                        return float(value)
+
+            # For faithfulness-like metrics, calculate score from statements
+            if "statements" in parsed and isinstance(parsed["statements"], list):
+                statements = parsed["statements"]
+                if statements:
+                    verdicts = []
+                    for stmt in statements:
+                        if isinstance(stmt, dict) and "verdict" in stmt:
+                            verdict = stmt["verdict"]
+                            if isinstance(verdict, (int, float)):
+                                verdicts.append(verdict)
+
+                    if verdicts:
+                        return sum(verdicts) / len(verdicts)
+
+            return None
+
+        except json.JSONDecodeError:
+            return None
+        except Exception as e:
+            logger.debug(f"Error parsing structured score: {e}")
+            return None
+
+    def _parse_text_score(self, content: str) -> t.Optional[float]:
+        """Parse raw text content to find numerical scores."""
+        import re
+
+        # Look for common score patterns
+        patterns = [
+            r"score[:\s]*([0-9]*\.?[0-9]+)",
+            r"verdict[:\s]*([0-9]*\.?[0-9]+)",
+            r"rating[:\s]*([0-9]*\.?[0-9]+)",
+            r"([0-9]*\.?[0-9]+)(?:\s*/\s*[0-9]+)?",  # Simple number or fraction
+        ]
+
+        for pattern in patterns:
+            matches = re.findall(pattern, content.lower())
+            if matches:
+                try:
+                    score = float(matches[0])
+                    # Validate score is in reasonable range (0-1 or 0-10)
+                    if 0 <= score <= 1 or 0 <= score <= 10:
+                        return score
+                except (ValueError, IndexError):
+                    continue
+
+        return None


 def create_batch_evaluator(
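
To make the new extraction path concrete, here is a minimal standalone sketch (not part of the commit) of the two parsing strategies the diff adds: structured JSON output is tried first, and plain-text score patterns are the fallback. The function names and sample contents below are illustrative only, not the library's API.

```python
import json
import re
from typing import Optional


def parse_structured_score(content: str) -> Optional[float]:
    # Pull a flat JSON object out of the LLM output and read a score-like field.
    match = re.search(r"\{[^{}]*\}", content)
    if not match:
        return None
    try:
        parsed = json.loads(match.group(0))
    except json.JSONDecodeError:
        return None
    value = parsed.get("score", parsed.get("verdict"))
    return float(value) if isinstance(value, (int, float)) else None


def parse_text_score(content: str) -> Optional[float]:
    # Fallback: look for a "score: <number>" pattern in raw text.
    match = re.search(r"score[:\s]*([0-9]*\.?[0-9]+)", content.lower())
    return float(match.group(1)) if match else None


# Structured output wins; the text pattern only applies when no JSON score is found.
print(parse_structured_score('{"score": 0.8, "reason": "mostly faithful"}'))  # 0.8
print(parse_text_score("Overall score: 7 out of 10"))                         # 7.0
```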
