 import os
 import logging
 import time
-from typing import Dict, Tuple, Optional
-import boto3
+from src.llm import get_llm
 from datasets import Dataset
 from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_aws import ChatBedrock
-from langchain_community.chat_models import ChatOllama
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_fireworks import ChatFireworks
-from langchain_google_vertexai import (
-    ChatVertexAI,
-    HarmBlockThreshold,
-    HarmCategory,
-)
-from langchain_groq import ChatGroq
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_utilization, faithfulness
+from ragas.metrics import answer_relevancy, faithfulness
 from src.shared.common_fn import load_embedding_model
-
 load_dotenv()
 
-RAGAS_MODEL_VERSIONS = {
-    "openai_gpt_3.5": "gpt-3.5-turbo-16k",
-    "openai_gpt_4": "gpt-4-turbo-2024-04-09",
-    "openai_gpt_4o_mini": "gpt-4o-mini-2024-07-18",
-    "openai_gpt_4o": "gpt-4o-mini-2024-07-18",
-    "groq_llama3_70b": "groq_llama3_70b",
-}
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
+EMBEDDING_MODEL = os.getenv("RAGAS_EMBEDDING_MODEL")
 EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
 
-
-def get_ragas_llm(model: str) -> Tuple[object, str]:
-    """Retrieves the specified language model. Improved error handling and structure."""
-    env_key = f"LLM_MODEL_CONFIG_{model}"
-    env_value = os.environ.get(env_key)
-    logging.info(f"Loading model configuration: {env_key}")
-    try:
-        if "openai" in model:
-            model_name = RAGAS_MODEL_VERSIONS[model]
-            llm = ChatOpenAI(
-                api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
-            )
-        elif "groq" in model:
-            model_name, base_url, api_key = env_value.split(",")
-            llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
-        else:
-            raise ValueError(f"Unsupported model for evaluation: {model}")
-
-        logging.info(f"Model loaded - Model Version: {model}")
-        return llm, model_name
-    except (ValueError, KeyError) as e:
-        logging.error(f"Error loading LLM: {e}")
-        raise
-
-
-def get_ragas_metrics(
-    question: str, context: str, answer: str, model: str
-) -> Optional[Dict[str, float]]:
+def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
     try:
         start_time = time.time()
         dataset = Dataset.from_dict(
-            {"question": [question], "answer": [answer], "contexts": [[context]]}
+            {"question": [question] * len(answer), "answer": answer, "contexts": [[ctx] for ctx in context]}
         )
-        logging.info("Dataset created successfully.")
-
-        llm, model_name = get_ragas_llm(model=model)
+        logging.info("Evaluation dataset created successfully.")
+        if ("diffbot" in model) or ("ollama" in model):
+            raise ValueError(f"Unsupported model for evaluation: {model}")
+        else:
+            llm, model_name = get_llm(model=model)
+
         logging.info(f"Evaluating with model: {model_name}")
-
+
         score = evaluate(
             dataset=dataset,
-            metrics=[faithfulness, answer_relevancy, context_utilization],
+            metrics=[faithfulness, answer_relevancy],
             llm=llm,
             embeddings=EMBEDDING_FUNCTION,
         )
-
+
         score_dict = (
-            score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
+            score.to_pandas()[["faithfulness", "answer_relevancy"]]
+            .fillna(0)
             .round(4)
-            .to_dict(orient="records")[0]
+            .to_dict(orient="list")
         )
         end_time = time.time()
         logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
         return score_dict
     except ValueError as e:
         if "Unsupported model for evaluation" in str(e):
             logging.error(f"Unsupported model error: {e}")
-            return {"error": str(e)}  # Return the specific error message as a dictionary
+            return {"error": str(e)}
         logging.exception(f"ValueError during metrics evaluation: {e}")
         return {"error": str(e)}
     except Exception as e:
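With this change, get_ragas_metrics scores several candidate answers against one question in a single call instead of one answer at a time. Below is a minimal sketch of how the reworked function might be invoked, assuming the file lives at src/ragas_eval.py and that a model key such as "openai_gpt_4o" is configured for get_llm; the question, answers, and contexts are made-up illustrative values, not part of this commit.

# Sketch only: the module path, model key, and all sample strings are assumptions.
from src.ragas_eval import get_ragas_metrics  # assumed location of this file

question = "Who founded Neo4j?"
# One generated answer per chat mode being compared, plus the context each one retrieved.
answers = [
    "Neo4j was founded by Emil Eifrem.",
    "Emil Eifrem is the founder of Neo4j.",
]
contexts = [
    "Emil Eifrem founded Neo4j in 2007.",
    "Neo4j is a graph database company started by Emil Eifrem.",
]

scores = get_ragas_metrics(question, contexts, answers, model="openai_gpt_4o")
# to_dict(orient="list") returns one score per answer, e.g.
# {"faithfulness": [1.0, 0.9], "answer_relevancy": [0.98, 0.97]}
print(scores)

Since the question is repeated len(answer) times and each context is wrapped one-per-row, answer and context appear to be expected as parallel lists of equal length; Dataset.from_dict would reject columns of different lengths.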