diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index a07754b69d56..8f3c6515c505 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -93,10 +93,10 @@ def __call__( # pylint: disable=docstring-missing-param or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of turns, the evaluator will aggregate the results of each turn. + :keyword query: The query to be evaluated. + :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: Optional[str] - :keyword context: The context to be evaluated. - :paramtype context: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 1b3be9c31087..12dbac53101c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -27,8 +27,6 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project name. :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False. - :type eval_last_turn: bool :param kwargs: Additional arguments to pass to the evaluator. :type kwargs: Any :return: A function that evaluates content-safety metrics for "question-answering" scenario. @@ -69,8 +67,8 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): """ # TODO address 3579092 to re-enable parallel evals. 
- def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs): - super().__init__(eval_last_turn=eval_last_turn) + def __init__(self, credential, azure_ai_project, **kwargs): + super().__init__() self._parallel = kwargs.pop("_parallel", False) self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ ViolenceEvaluator(credential, azure_ai_project), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index afb645d45768..f30b717cf351 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.HATE_FAIRNESS, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 66cc70280737..0a2435860ceb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.SELF_HARM, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index dbf7a2a0ae12..5fd1239024b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.SEXUAL, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index f43c08726dcd..f1742dae645f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.VIOLENCE, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index c89df72fb13a..5579e821875f 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -55,13 +55,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=_InternalEvaluationMetrics.ECI, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 66c162a03993..ebd22e8d5a44 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -55,7 +55,7 @@ def __call__( *, response: str, ) -> Dict[str, Union[str, float]]: - """Evaluate fluency in given query/response + """Evaluate fluency in given response :keyword response: The response to be evaluated. :paramtype response: str diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index fb7dc8aefcb3..60384eb8f0ee 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -56,13 +56,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.PROTECTED_MATERIAL, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index b23cf62b10be..92e610e86814 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -180,14 +180,14 @@ def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluates retrieval for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. :keyword conversation: The conversation to be evaluated. :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The scores for Chat scenario. - :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param @@ -202,7 +202,7 @@ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param :keyword conversation: The conversation to be evaluated. :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The scores for Chat scenario. 
- :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]] """ query = kwargs.pop("query", None) context = kwargs.pop("context", None) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index 4175823e9598..73601f684a90 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import List, Optional, Union, Dict +from typing import List, Union, Dict from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental @@ -105,18 +105,18 @@ def __init__( def __call__( self, *, - query: Optional[str] = None, - response: Optional[str] = None, - context: Optional[str] = None, + response: str, + context: str, + query: str, ) -> Dict[str, Union[str, bool]]: """Evaluate groundedness for a given query/response/context - :keyword query: The query to be evaluated. - :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: Optional[str] + :paramtype response: str :keyword context: The context to be evaluated. - :paramtype context: Optional[str] + :paramtype context: str + :keyword query: The query to be evaluated. + :paramtype query: str :return: The groundedness score. :rtype: Dict[str, Union[str, bool]] """ diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 9d591f8d75b7..23146bd5300b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -27,9 +27,6 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project name. :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, - focusing on the latest user inquiry and the assistant's corresponding response. 
Defaults to False - :type eval_last_turn: bool **Usage** @@ -57,13 +54,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.XPIA, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py index af96e637b389..294dfce473a9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py @@ -15,6 +15,7 @@ from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._http_utils import get_async_http_client +from azure.ai.evaluation._model_configurations import AzureAIProject from azure.ai.evaluation.simulator import AdversarialScenario from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario from azure.core.credentials import TokenCredential @@ -48,7 +49,7 @@ class AdversarialSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: dict, credential): + def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential): """Constructor.""" try: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py index faa9f989dfdf..7278ff3be2de 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py @@ -11,6 +11,7 @@ from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation.simulator import AdversarialScenario +from azure.ai.evaluation._model_configurations import AzureAIProject from azure.core.credentials import TokenCredential from ._adversarial_simulator import AdversarialSimulator @@ -32,7 +33,7 @@ class DirectAttackSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: dict, credential): + def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential): """Constructor.""" try: @@ -125,7 +126,7 @@ async def __call__( - '**$schema**': A string indicating the schema URL for the conversation format. The 'content' for 'assistant' role messages may include the messages that your callback returned. 
- :rtype: Dict[str, [List[Dict[str, Any]]]] with two elements + :rtype: Dict[str, List[Dict[str, Any]]] **Output format** @@ -179,7 +180,7 @@ async def __call__( randomization_seed = randint(0, 1000000) regular_sim = AdversarialSimulator( - azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential + azure_ai_project=self.azure_ai_project, credential=self.credential ) regular_sim_results = await regular_sim( scenario=scenario, @@ -193,7 +194,7 @@ async def __call__( randomize_order=False, randomization_seed=randomization_seed, ) - jb_sim = AdversarialSimulator(azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential) + jb_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential) jb_sim_results = await jb_sim( scenario=scenario, target=target, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py index 32b0f2fa99c1..24bfe5bc8268 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py @@ -13,6 +13,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak, SupportedLanguages +from azure.ai.evaluation._model_configurations import AzureAIProject from azure.core.credentials import TokenCredential from ._adversarial_simulator import AdversarialSimulator, JsonLineList @@ -34,7 +35,7 @@ class IndirectAttackSimulator(AdversarialSimulator): :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: dict, credential): + def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential): """Constructor.""" try: diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 5bd942be4000..843963493ce9 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -285,12 +285,13 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, simple_conversation): assert score2["evaluation_per_turn"]["self_harm_score"][1] > 0 assert all(score2["evaluation_per_turn"]["self_harm_reason"]), "self_harm_reason must not be None or empty." - # Test eval_last_turn behavior - eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) - score3 = eval_fn_last(conversation=simple_conversation) - assert score3["self_harm"] in (level.value for level in HarmSeverityLevel) - assert score3["self_harm_score"] >= 1 - assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." + # Removed eval_last_turn from evaluators since its behavior was incorrect. We can re-enable + # this test when it's added back to the evaluators. + # eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) + # score3 = eval_fn_last(conversation=simple_conversation) + # assert score3["self_harm"] in (level.value for level in HarmSeverityLevel) + # assert score3["self_harm_score"] >= 1 + # assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." 
def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation): eval_fn = HateUnfairnessEvaluator(azure_cred, project_scope)
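For reviewers, a minimal usage sketch (not part of the diff) of the caller-facing surface after these changes: evaluators no longer accept eval_last_turn, simulator constructors are typed with AzureAIProject and TokenCredential, and multi-turn conversations are always aggregated across all turns. The subscription, resource group, and project values are placeholders, and azure-identity is assumed for the credential.

# Minimal sketch of the post-change API; placeholder project values, azure-identity assumed.
from azure.ai.evaluation import AzureAIProject, ViolenceEvaluator
from azure.ai.evaluation.simulator import AdversarialSimulator
from azure.identity import DefaultAzureCredential

azure_ai_project: AzureAIProject = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
credential = DefaultAzureCredential()

# eval_last_turn is gone: evaluators are constructed without it, and a multi-turn
# conversation is always aggregated across all turns, with per-turn detail
# reported under "evaluation_per_turn".
violence_eval = ViolenceEvaluator(credential, azure_ai_project)
result = violence_eval(
    conversation={
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris."},
        ]
    }
)

# Simulator constructors now take a typed AzureAIProject and TokenCredential,
# which is why the cast(dict, ...) workarounds in _direct_attack_simulator.py
# could be dropped.
simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=credential)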