From aee710e2d2c626f4993d5f196033c2ed2117fbdf Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Thu, 31 Oct 2024 13:23:10 -0400 Subject: [PATCH 01/10] Fix __call__ overload issues --- .../evaluation/_evaluators/_coherence/_coherence.py | 4 ++-- .../ai/evaluation/_evaluators/_fluency/_fluency.py | 2 +- .../evaluation/_evaluators/_retrieval/_retrieval.py | 4 ++-- .../_service_groundedness/_service_groundedness.py | 12 ++++++------ 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py index a07754b69d56..8f3c6515c505 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/_coherence.py @@ -93,10 +93,10 @@ def __call__( # pylint: disable=docstring-missing-param or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of turns, the evaluator will aggregate the results of each turn. + :keyword query: The query to be evaluated. + :paramtype query: str :keyword response: The response to be evaluated. :paramtype response: Optional[str] - :keyword context: The context to be evaluated. - :paramtype context: Optional[str] :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role". diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py index 66c162a03993..ebd22e8d5a44 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_fluency/_fluency.py @@ -55,7 +55,7 @@ def __call__( *, response: str, ) -> Dict[str, Union[str, float]]: - """Evaluate fluency in given query/response + """Evaluate fluency in given response :keyword response: The response to be evaluated. :paramtype response: str diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index b23cf62b10be..aa3ca25032b4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -180,7 +180,7 @@ def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[str, float]]]]: """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. @@ -202,7 +202,7 @@ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param :keyword conversation: The conversation to be evaluated. :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The scores for Chat scenario. - :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + :rtype: :rtype: Dict[str, Union[float, Dict[str, List[str, float]]]] """ query = kwargs.pop("query", None) context = kwargs.pop("context", None) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index be0d249c99b3..b7e311a1da9e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -105,18 +105,18 @@ def __init__( def __call__( self, *, + response: str, + context: str, query: Optional[str] = None, - response: Optional[str] = None, - context: Optional[str] = None, ) -> Dict[str, Union[str, bool]]: """Evaluate groundedness for a given query/response/context - :keyword query: The query to be evaluated. - :paramtype query: Optional[str] :keyword response: The response to be evaluated. - :paramtype response: Optional[str] + :paramtype response: str :keyword context: The context to be evaluated. - :paramtype context: Optional[str] + :paramtype context: str + :keyword query: The query to be evaluated. + :paramtype query: Optional[str] :return: The relevance score. :rtype: Dict[str, Union[str, bool]] """ From 854a50a70588e7846b8dec8d7c71f734de2bf2a1 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Thu, 31 Oct 2024 14:21:21 -0400 Subject: [PATCH 02/10] fix typing issue --- .../azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index aa3ca25032b4..5bfc0fdd3617 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -180,7 +180,7 @@ def __call__( self, *, conversation: Conversation, - ) -> Dict[str, Union[float, Dict[str, List[str, float]]]]: + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn, the evaluator will aggregate the results of each turn. From c04786bdccb4986e59007b0289822bcd42089276 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 14:14:11 -0400 Subject: [PATCH 03/10] make query required for groundednesspro --- .../_evaluators/_service_groundedness/_service_groundedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index b7e311a1da9e..ad24c3486c19 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -107,7 +107,7 @@ def __call__( *, response: str, context: str, - query: Optional[str] = None, + query: str, ) -> Dict[str, Union[str, bool]]: """Evaluate groundedness for a given query/response/context From 8b6ee675082d1113c81e9a8d05754d4c57c846d0 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 14:42:09 -0400 Subject: [PATCH 04/10] fix a malformatted docstring --- .../azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py index 5bfc0fdd3617..92e610e86814 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py @@ -187,7 +187,7 @@ def __call__( :keyword conversation: The conversation to be evaluated. :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] :return: The scores for Chat scenario. - :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]] + :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param From 61995843c496a36922ac37277d2654e30b23f2d0 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 15:28:46 -0400 Subject: [PATCH 05/10] fix some type hints and remove eval_last_turn from evaluators --- .../_evaluators/_content_safety/_content_safety.py | 6 ++---- .../_evaluators/_content_safety/_hate_unfairness.py | 2 -- .../ai/evaluation/_evaluators/_content_safety/_self_harm.py | 2 -- .../ai/evaluation/_evaluators/_content_safety/_sexual.py | 2 -- .../ai/evaluation/_evaluators/_content_safety/_violence.py | 2 -- .../azure/ai/evaluation/_evaluators/_eci/_eci.py | 2 -- .../_evaluators/_protected_material/_protected_material.py | 2 -- .../azure/ai/evaluation/_evaluators/_xpia/xpia.py | 5 ----- .../azure/ai/evaluation/simulator/_adversarial_simulator.py | 3 ++- .../ai/evaluation/simulator/_direct_attack_simulator.py | 3 ++- .../ai/evaluation/simulator/_indirect_attack_simulator.py | 3 ++- 11 files changed, 8 insertions(+), 24 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py index 1b3be9c31087..12dbac53101c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py @@ -27,8 +27,6 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project name. :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False. - :type eval_last_turn: bool :param kwargs: Additional arguments to pass to the evaluator. :type kwargs: Any :return: A function that evaluates content-safety metrics for "question-answering" scenario. @@ -69,8 +67,8 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]): """ # TODO address 3579092 to re-enabled parallel evals. - def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs): - super().__init__(eval_last_turn=eval_last_turn) + def __init__(self, credential, azure_ai_project, **kwargs): + super().__init__() self._parallel = kwargs.pop("_parallel", False) self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ ViolenceEvaluator(credential, azure_ai_project), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py index afb645d45768..f30b717cf351 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.HATE_FAIRNESS, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py index 66cc70280737..0a2435860ceb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.SELF_HARM, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py index dbf7a2a0ae12..5fd1239024b3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.SEXUAL, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py index f43c08726dcd..f1742dae645f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_violence.py @@ -50,13 +50,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.VIOLENCE, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py index c89df72fb13a..5579e821875f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_eci/_eci.py @@ -55,13 +55,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=_InternalEvaluationMetrics.ECI, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index fb7dc8aefcb3..60384eb8f0ee 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -56,13 +56,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.PROTECTED_MATERIAL, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py index 9d591f8d75b7..23146bd5300b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_xpia/xpia.py @@ -27,9 +27,6 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project name. :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue, - focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False - :type eval_last_turn: bool **Usage** @@ -57,13 +54,11 @@ def __init__( self, credential, azure_ai_project, - eval_last_turn: bool = False, ): super().__init__( eval_metric=EvaluationMetrics.XPIA, azure_ai_project=azure_ai_project, credential=credential, - eval_last_turn=eval_last_turn, ) @overload diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py index a78de5a4778d..230c75b8e833 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_adversarial_simulator.py @@ -14,6 +14,7 @@ from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._http_utils import get_async_http_client +from azure.ai.evaluation._model_configurations import AzureAIProject from azure.ai.evaluation.simulator import AdversarialScenario from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario from azure.core.credentials import TokenCredential @@ -47,7 +48,7 @@ class AdversarialSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: dict, credential): + def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential): """Constructor.""" try: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py index 6f2369ed3539..07247bcc9f40 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py @@ -11,6 +11,7 @@ from azure.ai.evaluation._common.utils import validate_azure_ai_project from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation.simulator import AdversarialScenario +from azure.ai.evaluation._model_configurations import AzureAIProject from azure.core.credentials import TokenCredential from ._adversarial_simulator import AdversarialSimulator @@ -32,7 +33,7 @@ class DirectAttackSimulator: :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: dict, credential): + def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential): """Constructor.""" try: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py index 32b0f2fa99c1..24bfe5bc8268 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_indirect_attack_simulator.py @@ -13,6 +13,7 @@ from azure.ai.evaluation._common._experimental import experimental from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak, SupportedLanguages +from azure.ai.evaluation._model_configurations import AzureAIProject from azure.core.credentials import TokenCredential from ._adversarial_simulator import AdversarialSimulator, JsonLineList @@ -34,7 +35,7 @@ class IndirectAttackSimulator(AdversarialSimulator): :type credential: ~azure.core.credentials.TokenCredential """ - def __init__(self, *, azure_ai_project: dict, credential): + def __init__(self, *, azure_ai_project: AzureAIProject, credential: TokenCredential): """Constructor.""" try: From ecf3982d86fc123ae1edf2a9ec43033801c3b000 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 15:31:19 -0400 Subject: [PATCH 06/10] fix optional import --- .../_evaluators/_service_groundedness/_service_groundedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index ad24c3486c19..0a32bad481a6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -1,7 +1,7 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from typing import List, Optional, Union, Dict +from typing import List, Union, Dict from typing_extensions import overload, override from azure.ai.evaluation._common._experimental import experimental From fc60f2b72c20ae56d74ff36dfb47fb250199f7f5 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 15:33:27 -0400 Subject: [PATCH 07/10] comment out eval last turn section --- .../tests/e2etests/test_builtin_evaluators.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 7667b6d25a86..ddc07597f168 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -284,12 +284,13 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim assert score2["evaluation_per_turn"]["self_harm_score"][1] > 0 assert all(score2["evaluation_per_turn"]["self_harm_reason"]), "self_harm_reason must not be None or empty." - # Test eval_last_turn behavior - eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) - score3 = eval_fn_last(conversation=simple_conversation) - assert score3["self_harm"] == "Very low" - assert score3["self_harm_score"] >= 1 - assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." + # removed eval_last_turn from evaluators since behavior is incorrect. We can re-enable when + # it's added bakc to evaluators + # eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) + # score3 = eval_fn_last(conversation=simple_conversation) + # assert score3["self_harm"] == "Very low" + # assert score3["self_harm_score"] >= 1 + # assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cred, simple_conversation): eval_fn = HateUnfairnessEvaluator(azure_cred, project_scope) From 8f49321662ac61e80a1e9ae2e7ef04e6ee236962 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 15:44:29 -0400 Subject: [PATCH 08/10] get rid of string from type hint --- .../azure/ai/evaluation/simulator/_direct_attack_simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py index 23a1fada4d04..6c7de986cdf7 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py @@ -126,7 +126,7 @@ async def __call__( - '**$schema**': A string indicating the schema URL for the conversation format. The 'content' for 'assistant' role messages may includes the messages that your callback returned. - :rtype: Dict[str, [List[Dict[str, Any]]]] with two elements + :rtype: Dict[str, [List[Dict[str, Any]]]] **Output format** From 1a1d8c5ae997df6ae4980068a3d7378c9def3645 Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 15:45:51 -0400 Subject: [PATCH 09/10] fix typo --- .../tests/e2etests/test_builtin_evaluators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 3ab27e2a1ec4..843963493ce9 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -286,7 +286,7 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim assert all(score2["evaluation_per_turn"]["self_harm_reason"]), "self_harm_reason must not be None or empty." # removed eval_last_turn from evaluators since behavior is incorrect. We can re-enable when - # it's added bakc to evaluators + # it's added back to evaluators # eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) # score3 = eval_fn_last(conversation=simple_conversation) # assert score3["self_harm"] == "Very low" From db9c96fe4fcd468b7f3f199ca99d2e878a009b9e Mon Sep 17 00:00:00 2001 From: Neehar Duvvuri Date: Fri, 1 Nov 2024 16:13:52 -0400 Subject: [PATCH 10/10] fix mypy errors --- .../azure/ai/evaluation/simulator/_direct_attack_simulator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py index 6c7de986cdf7..7278ff3be2de 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py @@ -180,7 +180,7 @@ async def __call__( randomization_seed = randint(0, 1000000) regular_sim = AdversarialSimulator( - azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential + azure_ai_project=self.azure_ai_project, credential=self.credential ) regular_sim_results = await regular_sim( scenario=scenario, @@ -194,7 +194,7 @@ async def __call__( randomize_order=False, randomization_seed=randomization_seed, ) - jb_sim = AdversarialSimulator(azure_ai_project=cast(dict, self.azure_ai_project), credential=self.credential) + jb_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential) jb_sim_results = await jb_sim( scenario=scenario, target=target,