1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -11,6 +11,7 @@
- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative paths.
- Outputs of adversarial simulators are of type `JsonLineList`, and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns, along with `category` if it exists in the conversation.
- Fixed an issue where, during long-running simulations, the API token would expire and cause a "Forbidden" error. Users can now set the environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently, preventing expiration and ensuring continuous operation of the simulation.
+ - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the `_parallel` boolean keyword argument during class initialization.

### Other Changes
- Refined error messages for service-based evaluators and simulators.
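As a rough usage sketch of the `_parallel` switch described in the changelog entry above: the import path, `DefaultAzureCredential`, and the `azure_ai_project` dictionary shape are assumptions about the azure-ai-evaluation public surface, and the project values are placeholders.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

# Placeholder project details; replace with a real Azure AI project (assumed dict shape).
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# After this fix the sub-evaluators run in parallel by default; pass _parallel=False
# to run them sequentially instead (e.g. while debugging).
safety_eval = ContentSafetyEvaluator(
    DefaultAzureCredential(), azure_ai_project, _parallel=False
)

result = safety_eval(
    query="Tokyo is the capital of which country?",
    response="Japan",
)
print(result)
```

Passing `_parallel=False` is mainly useful for debugging or for keeping request ordering deterministic in recorded tests, as the test changes further down suggest.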
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
"Tag": "python/evaluation/azure-ai-evaluation_acededcaea"
}
@@ -69,7 +69,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
# TODO address 3579092 to re-enable parallel evals.
def __init__(self, credential, azure_ai_project, **kwargs):
super().__init__()
- self._parallel = kwargs.pop("_parallel", False)
+ self._parallel = kwargs.pop("_parallel", True)
self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
ViolenceEvaluator(credential, azure_ai_project),
SexualEvaluator(credential, azure_ai_project),
@@ -152,7 +152,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
with ThreadPoolExecutor() as executor:
# pylint: disable=no-value-for-parameter
futures = {
- executor.submit(query=query, response=response, conversation=conversation): evaluator
+ executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
for evaluator in self._evaluators
}

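For reference, a self-contained sketch of the `ThreadPoolExecutor.submit` pattern this hunk restores, using toy evaluator functions rather than the SDK's sub-evaluators: `submit()` expects the callable as its first positional argument and forwards the keyword arguments that follow to it, so calling it without `evaluator` raises a `TypeError` before any evaluation runs.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

# Toy stand-ins for the content-safety sub-evaluators (not the SDK implementations).
def violence_stub(*, query, response, conversation=None):
    return {"violence_score": 0.0}

def sexual_stub(*, query, response, conversation=None):
    return {"sexual_score": 0.0}

evaluators = [violence_stub, sexual_stub]

with ThreadPoolExecutor() as executor:
    # The callable comes first; the keyword arguments after it are passed through to it.
    # executor.submit(query=..., ...) with no callable is a TypeError.
    futures = {
        executor.submit(
            fn,
            query="What is the capital of Japan?",
            response="The capital of Japan is Tokyo.",
            conversation=None,
        ): fn
        for fn in evaluators
    }
    results = {}
    for future in as_completed(futures):
        results.update(future.result())

print(results)
```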
@@ -318,26 +318,6 @@ def test_content_safety_evaluator_hate_unfairness(self, project_scope, azure_cre
score2["evaluation_per_turn"]["hate_unfairness_reason"]
), "hate_unfairness_reason must not be None or empty."

- # TODO remove?
-
- @pytest.mark.skipif(True, reason="Content safety service IS available now!")
- def test_content_safety_service_unavailable(self, project_scope, azure_cred):
- eval_fn = ViolenceEvaluator(azure_cred, project_scope)
- # Doing this is replay mode breaks causes mismatch between scrubbed recordings
- # and the actual request made.
- if is_live():
- # Warning, live testing fails due to unstable region.
- # We need a use a new region.
- project_scope["project_name"] = "pf-evals-ws-westus2"
-
- with pytest.raises(Exception) as exc_info:
- score = eval_fn(
- query="What is the capital of Japan?",
- response="The capital of Japan is Tokyo.",
- )
-
- assert "RAI service is not available in this region" in exc_info._excinfo[1].args[0]

@pytest.mark.parametrize("parallel", [False, True])
def test_composite_evaluator_qa(self, model_config, parallel):
qa_eval = QAEvaluator(model_config, _parallel=parallel)
@@ -387,8 +367,9 @@ def test_composite_evaluator_qa_for_nans(self, model_config):
assert not math.isnan(score["fluency"])
assert not math.isnan(score["similarity"])

- def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
- safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=False)
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_composite_evaluator_content_safety(self, project_scope, azure_cred, parallel):
+ safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=parallel)
score = safety_eval(
query="Tokyo is the capital of which country?",
response="Japan",
@@ -408,8 +389,11 @@ def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
assert score["hate_unfairness_score"] < 1.0
assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

- def test_composite_evaluator_content_safety_with_conversation(self, project_scope, azure_cred, simple_conversation):
- safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_composite_evaluator_content_safety_with_conversation(
+ self, project_scope, azure_cred, simple_conversation, parallel
+ ):
+ safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, _parallel=parallel)
score = safety_eval(
conversation=simple_conversation,
)
@@ -192,15 +192,16 @@ def test_evaluate_with_relative_data_path(self, model_config):
finally:
os.chdir(original_working_dir)

- def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file):
+ @pytest.mark.parametrize("parallel", [True, False])
+ def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file, parallel):
input_data = pd.read_json(data_file, lines=True)

# CS evaluator tries to store the credential, which breaks multiprocessing at
# pickling stage. So we pass None for credential and let child evals
# generate a default credential at runtime.
# Internal parallelism is parametrized here to cover both the parallel and sequential paths.
content_safety_eval = ContentSafetyEvaluator(
credential=azure_cred, azure_ai_project=project_scope, _parallel=False
credential=azure_cred, azure_ai_project=project_scope, _parallel=parallel
)

# run the evaluation
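To round out the picture, a hedged sketch of what the parametrized test exercises when driven through the `evaluate` entry point: the `evaluate` parameters shown, the JSONL column names, and the file paths are assumptions and placeholders, not taken from this diff.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator, evaluate

# Assumed project dict shape; values are placeholders.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

content_safety_eval = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
    _parallel=True,  # exercise the re-enabled parallel path
)

result = evaluate(
    data="eval_data.jsonl",  # placeholder JSONL file with query/response columns
    evaluators={"content_safety": content_safety_eval},
    output_path="./cs_results.json",  # relative paths are supported per the changelog fix
)
print(result)
```

Running this once with `_parallel=True` and once with `_parallel=False` mirrors what the parametrized tests above cover.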