From 6c81f6ab4fa366acf78520e872ca3583a1ab1404 Mon Sep 17 00:00:00 2001 From: mohessie Date: Tue, 16 Jun 2026 19:21:25 +0300 Subject: [PATCH 01/13] feat(evaluation): unify validators with azureml-assets - add DEVELOPER role, EvaluationLevel, MessagesOrQueryResponseInputValidator + level utils - support actions/expected_actions aliases in TaskNavigationEfficiencyValidator - align check_for_unsupported_tools flags in tool_call/input/output evaluators --- .../_common/_validators/__init__.py | 16 ++ .../_validators/_conversation_validator.py | 57 +++---- .../_validators/_evaluation_level_utils.py | 65 +++++++ .../_messages_or_query_response_validator.py | 158 ++++++++++++++++++ .../_task_navigation_efficiency_validator.py | 34 +++- .../_validators/_validation_constants.py | 12 ++ .../_tool_call_accuracy.py | 2 +- .../_tool_input_accuracy.py | 2 +- .../_tool_output_utilization.py | 4 +- 9 files changed, 316 insertions(+), 34 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py index 92be4feac022..d7aefa8ccbd4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py @@ -3,16 +3,32 @@ """Validators package init.""" +from ._validation_constants import MessageRole, ContentType, EvaluationLevel from ._validator_interface import ValidatorInterface from ._conversation_validator import ConversationValidator from ._tool_definitions_validator import ToolDefinitionsValidator from ._tool_calls_validator import ToolCallsValidator from ._task_navigation_efficiency_validator import TaskNavigationEfficiencyValidator +from ._messages_or_query_response_validator import MessagesOrQueryResponseInputValidator +from ._evaluation_level_utils import ( + _resolve_evaluation_level, + _merge_query_response_messages, + _split_messages_at_latest_user, + _wrap_string_messages, +) __all__ = [ + "MessageRole", + "ContentType", + "EvaluationLevel", "ValidatorInterface", "ConversationValidator", "ToolDefinitionsValidator", "ToolCallsValidator", "TaskNavigationEfficiencyValidator", + "MessagesOrQueryResponseInputValidator", + "_resolve_evaluation_level", + "_merge_query_response_messages", + "_split_messages_at_latest_user", + "_wrap_string_messages", ] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py index 5e43f0265b0f..a1c375340bfc 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py @@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) - if not isinstance(content_item["text"], str): return EvaluationException( - message=f"The 'text' field must be a string in content items.", + message="The 'text' field must be a string in content items.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu """Validate assistant message content.""" content = message["content"] - valid_assistant_content_types = [ - ContentType.TEXT, - ContentType.OUTPUT_TEXT, - ContentType.TOOL_CALL, - ContentType.FUNCTION_CALL, - ContentType.MCP_APPROVAL_REQUEST, - ContentType.OPENAPI_CALL, - ] - valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] if isinstance(content, list): + valid_assistant_content_types = [ + ContentType.TEXT, + ContentType.OUTPUT_TEXT, + ContentType.TOOL_CALL, + ContentType.FUNCTION_CALL, + ContentType.MCP_APPROVAL_REQUEST, + ContentType.OPENAPI_CALL, + ] + valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] for content_item in content: content_type = content_item["type"] if content_type not in valid_assistant_content_types: @@ -225,19 +225,21 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu if error: return error - # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools - if self.check_for_unsupported_tools: - if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: - name = ( - "openapi_call" if content_type == ContentType.OPENAPI_CALL else content_item["name"].lower() - ) - if name in self.UNSUPPORTED_TOOLS: - return EvaluationException( - message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.NOT_APPLICABLE, - target=self.error_target, + # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools + if self.check_for_unsupported_tools: + if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: + name = ( + "openapi_call" + if content_type == ContentType.OPENAPI_CALL + else content_item["name"].lower() ) + if name in self.UNSUPPORTED_TOOLS: + return EvaluationException( + message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.NOT_APPLICABLE, + target=self.error_target, + ) return None def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[EvaluationException]: @@ -314,7 +316,7 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation ) if not content_is_string_or_list_of_dicts: return EvaluationException( - message=f"The 'content' field must be a string or a list of dictionaries messages.", + message="The 'content' field must be a string or a list of dictionaries messages.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -322,23 +324,22 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation if len(content) == 0: return EvaluationException( - message=f"The 'content' field can't be empty.", + message="The 'content' field can't be empty.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) if isinstance(content, list): - all_messages_have_type_field = all("type" in item for item in content) - if not all_messages_have_type_field: + if not all("type" in item for item in content): return EvaluationException( - message=f"Each content item in the 'content' list must contain a 'type' field.", + message="Each content item in the 'content' list must contain a 'type' field.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - if role in [MessageRole.USER, MessageRole.SYSTEM]: + if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]: error = self._validate_user_or_system_message(message, role) if error: return error diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py new file mode 100644 index 000000000000..7dcfefed147d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Utilities for resolving evaluation levels and reshaping query/response/messages inputs. +""" + +from typing import List, Optional, Tuple, Union +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from ._validation_constants import MessageRole, EvaluationLevel + + +def _resolve_evaluation_level( + evaluation_level: Optional[Union[EvaluationLevel, str]], + error_target: ErrorTarget, +) -> Optional[EvaluationLevel]: + """Validate and normalize the evaluation_level parameter. + + :param evaluation_level: The evaluation level to resolve. + :type evaluation_level: Optional[Union[EvaluationLevel, str]] + :param error_target: The error target for exceptions. + :type error_target: ErrorTarget + :return: The resolved EvaluationLevel or None for auto-detect. + :rtype: Optional[EvaluationLevel] + """ + valid = [level.value for level in EvaluationLevel] + if evaluation_level is None or evaluation_level == "": + return None + if isinstance(evaluation_level, EvaluationLevel): + return evaluation_level + if isinstance(evaluation_level, str): + try: + return EvaluationLevel(evaluation_level) + except ValueError as exc: + raise EvaluationException( + message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) from exc + raise EvaluationException( + message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) + + +def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: + """Merge query and response message lists into a single conversation.""" + return [*query, *response] + + +def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: + """Split messages into query/response slices at the latest user turn.""" + latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER) + return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] + + +def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: + """Wrap string query/response into separate message lists.""" + return ( + [{"role": "user", "content": [{"type": "text", "text": query}]}], + [{"role": "assistant", "content": [{"type": "text", "text": response}]}], + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py new file mode 100644 index 000000000000..370d3d3edd1d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py @@ -0,0 +1,158 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. +""" + +from typing import Any, Dict +from typing_extensions import override +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from ._validation_constants import MessageRole, ContentType +from ._conversation_validator import ConversationValidator +from ._tool_definitions_validator import ToolDefinitionsValidator + + +class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator): + """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. + + A single implementation serves all evaluators via two behavior flags: + - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the + messages path and the query/response path. Set False for evaluators that do not accept + tool definitions (parity with a plain ``ConversationValidator``). + - ``deep_validate_messages`` (default False): additionally run full per-message + ``_validate_message_dict`` checks in the messages path. + """ + + enforce_tool_definitions: bool = True + deep_validate_messages: bool = False + + def __init__( + self, + error_target: ErrorTarget, + requires_query: bool = True, + optional_tool_definitions: bool = True, + check_for_unsupported_tools: bool = False, + *, + enforce_tool_definitions: bool = True, + deep_validate_messages: bool = False, + ): + """Initialize MessagesOrQueryResponseInputValidator.""" + super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools) + self.enforce_tool_definitions = enforce_tool_definitions + self.deep_validate_messages = deep_validate_messages + + @override + def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: + """Validate evaluation input, supporting messages as an alternative to query/response.""" + # Multi-turn path (messages list) + messages = eval_input.get("messages") + if messages is not None: + if not isinstance(messages, list): + raise EvaluationException( + message="messages must be provided as a list of message dictionaries.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + if len(messages) == 0: + raise EvaluationException( + message="messages list must not be empty.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + + # Per-message structural checks + valid_roles = {role.value for role in MessageRole} + roles_present = set() + for index, message in enumerate(messages): + if not isinstance(message, dict): + raise EvaluationException( + message=( + f"Each item in 'messages' must be a dictionary, " + f"but item at index {index} is {type(message).__name__}." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + role = message.get("role") + if role is None: + raise EvaluationException( + message=f"Each message must contain a 'role' key, but message at index {index} is missing it.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + if role not in valid_roles: + raise EvaluationException( + message=( + f"Invalid role '{role}' at message index {index}. " + f"Must be one of: {sorted(valid_roles)}." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + roles_present.add(role) + + # Conversation-level checks + if MessageRole.USER.value not in roles_present: + raise EvaluationException( + message="messages must contain at least one message with role 'user'.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + if MessageRole.ASSISTANT.value not in roles_present: + raise EvaluationException( + message="messages must contain at least one message with role 'assistant'.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + # The final assistant message must contain text + last_content = messages[-1].get("content", "") + if isinstance(last_content, list): + has_text = any( + ( + isinstance(content_item, dict) + and content_item.get("type") + in ( + ContentType.TEXT, + ContentType.INPUT_TEXT, + ContentType.OUTPUT_TEXT, + ) + ) + or isinstance(content_item, str) + for content_item in last_content + ) + if not has_text: + raise EvaluationException( + message=( + "The last message must contain text content, " + "not only tool calls. The conversation appears to be " + "mid-execution \u2014 provide the agent's final text response." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + + if self.deep_validate_messages: + for message in messages: + error = self._validate_message_dict(message) + if error: + raise error + + if self.enforce_tool_definitions: + tool_definitions = eval_input.get("tool_definitions") + tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions) + if tool_definitions_validation_exception: + raise tool_definitions_validation_exception + return True + + if self.enforce_tool_definitions: + return super().validate_eval_input(eval_input) + return ConversationValidator.validate_eval_input(self, eval_input) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py index 132303129546..3c0d6018b2eb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py @@ -17,17 +17,38 @@ class TaskNavigationEfficiencyValidator(ValidatorInterface): """ Validate task navigation efficiency inputs (response and ground_truth). + Accepts either the SDK input names (``response``/``ground_truth``) or the + azureml-assets names (``actions``/``expected_actions``). + Validates: - - response: List of assistant messages containing tool calls - - ground_truth: Either a list of expected tool names, or a tuple of (tool names, parameters dict) + - response (alias ``actions``): List of assistant messages containing tool calls + - ground_truth (alias ``expected_actions``): Either a list of expected tool names, or a + tuple of (tool names, parameters dict) """ error_target: ErrorTarget + # Canonical input key -> accepted alternate (azureml-assets) key name. + _INPUT_ALIASES: Dict[str, str] = { + "response": "actions", + "ground_truth": "expected_actions", + } + def __init__(self, error_target: ErrorTarget): """Initialize with error target.""" self.error_target = error_target + def _normalize_input_aliases(self, eval_input: Dict[str, Any]) -> None: + """Map azureml-assets-style input keys onto the canonical keys in place. + + If a canonical key (``response``/``ground_truth``) is absent but its alias + (``actions``/``expected_actions``) is provided, copy the alias value to the canonical + key so the rest of the pipeline can rely on a single set of names. + """ + for canonical, alias in self._INPUT_ALIASES.items(): + if eval_input.get(canonical) is None and eval_input.get(alias) is not None: + eval_input[canonical] = eval_input[alias] + def _validate_response(self, response: Any) -> Optional[EvaluationException]: """Validate the response parameter.""" if response is None: @@ -221,8 +242,12 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: """ Validate task navigation evaluation input. + Accepts either the SDK input names (``response``/``ground_truth``) or the + azureml-assets names (``actions``/``expected_actions``). + Args: - eval_input: Dictionary containing 'response' and 'ground_truth'. + eval_input: Dictionary containing 'response'/'ground_truth' (or their + 'actions'/'expected_actions' aliases). Returns: True if validation passes. @@ -230,6 +255,9 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: Raises: EvaluationException: If validation fails. """ + # Normalize azureml-assets-style aliases ('actions'/'expected_actions') onto canonical keys. + self._normalize_input_aliases(eval_input) + # If response or ground_truth is a string, try to parse it as JSON for key in ("response", "ground_truth"): value = eval_input.get(key) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py index f4c242a9f02b..3c6795309672 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py @@ -15,6 +15,7 @@ class MessageRole(str, Enum): ASSISTANT = "assistant" SYSTEM = "system" TOOL = "tool" + DEVELOPER = "developer" class ContentType(str, Enum): @@ -31,3 +32,14 @@ class ContentType(str, Enum): MCP_APPROVAL_RESPONSE = "mcp_approval_response" OPENAPI_CALL = "openapi_call" OPENAPI_CALL_OUTPUT = "openapi_call_output" + + +class EvaluationLevel(str, Enum): + """Supported evaluation levels for the evaluator. + + - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path. + - ``TURN``: Force turn-level evaluation using the single-turn query/response path. + """ + + CONVERSATION = "conversation" + TURN = "turn" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index 6339fdab2bb6..6f8605c5a071 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -86,7 +86,9 @@ def __init__( # Initialize input validator self._validator = ToolDefinitionsValidator( - error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, optional_tool_definitions=False + error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, + optional_tool_definitions=False, + check_for_unsupported_tools=True, ) super().__init__( From 2808e5af81159bee3d897b0179f266db5fa33458 Mon Sep 17 00:00:00 2001 From: Mohamed Hessien Date: Tue, 16 Jun 2026 20:23:28 +0300 Subject: [PATCH 02/13] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../_common/_validators/_evaluation_level_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py index 7dcfefed147d..379e3e065902 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py @@ -53,7 +53,12 @@ def _merge_query_response_messages(query: List[dict], response: List[dict]) -> L def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: """Split messages into query/response slices at the latest user turn.""" - latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER) + latest_user_index = max( + (i for i, message in enumerate(messages) if message.get("role") == MessageRole.USER.value), + default=-1, + ) + if latest_user_index == -1: + raise ValueError("messages must contain at least one message with role 'user'.") return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] From 106ac42ef1f22b784f8dcd942b8ec2eaa284df08 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:28:15 +0000 Subject: [PATCH 03/13] Add unit tests for actions/expected_actions alias input normalization Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- ...t_task_navigation_efficiency_evaluators.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py index 2c1a5dfba237..8a1f3b512b9b 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py @@ -404,3 +404,95 @@ def test_matching_mode_validation(self): # Test invalid type for mode with pytest.raises(Exception): # EvaluationException _TaskNavigationEfficiencyEvaluator(matching_mode=123) # type: ignore + + # ==================== ALIAS INPUT NORMALIZATION TESTS ==================== + + def test_alias_actions_normalized_as_response(self): + """Test that 'actions' alias is accepted and normalized to 'response'.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze"] + + result = evaluator(actions=actions, ground_truth=ground_truth) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + + def test_alias_expected_actions_normalized_as_ground_truth(self): + """Test that 'expected_actions' alias is accepted and normalized to 'ground_truth'.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(response=response, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + + def test_both_aliases_normalized_and_evaluated(self): + """Test that both 'actions' and 'expected_actions' aliases together produce the correct result.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(actions=actions, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_result"] == "pass" + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0 + + def test_alias_inputs_mismatch(self): + """Test that alias inputs produce a failing result when actions do not match expected_actions.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + # Agent performs 'search' and 'extra_step', but expected is 'search' and 'analyze' + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(actions=actions, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is False + assert result["task_navigation_efficiency_result"] == "fail" + # precision: 1 match out of 2 agent steps = 0.5 + assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.5 + # recall: 1 match out of 2 expected steps = 0.5 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 0.5 From aadd11ca772af3e370a1ed910f934b5426d912de Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:29:13 +0000 Subject: [PATCH 04/13] Remove redundant assertions from test_both_aliases_normalized_and_evaluated Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- .../unittests/test_task_navigation_efficiency_evaluators.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py index 8a1f3b512b9b..7b3ab816ad22 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py @@ -467,10 +467,6 @@ def test_both_aliases_normalized_and_evaluated(self): result = evaluator(actions=actions, expected_actions=expected_actions) assert result["task_navigation_efficiency_passed"] is True - assert result["task_navigation_efficiency_result"] == "pass" - assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 - assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 - assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0 def test_alias_inputs_mismatch(self): """Test that alias inputs produce a failing result when actions do not match expected_actions.""" From 6b6616dd9ba5271040fdf41fe28a682642ae3034 Mon Sep 17 00:00:00 2001 From: mohessie Date: Sun, 21 Jun 2026 17:09:34 +0300 Subject: [PATCH 05/13] Move changes --- .../azure/ai/evaluation/_common/constants.py | 11 +++ .../azure/ai/evaluation/_common/utils.py | 76 ++++++++++++++++++- .../_common/_validators/__init__.py | 15 +--- .../_validators/_evaluation_level_utils.py | 70 ----------------- ...se_validator.py => _messages_validator.py} | 2 +- .../_validators/_validation_constants.py | 11 --- 6 files changed, 89 insertions(+), 96 deletions(-) delete mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py rename sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/{_messages_or_query_response_validator.py => _messages_validator.py} (99%) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py index 5640efea3ab4..32d7cfead5f4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py @@ -61,6 +61,17 @@ class EvaluatorScoringPattern(Enum): SCALE_1_5 = "scale_1_5" # 1-5 scale (quality evaluators) +class EvaluationLevel(str, Enum): + """Supported evaluation levels for the evaluator. + + - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path. + - ``TURN``: Force turn-level evaluation using the single-turn query/response path. + """ + + CONVERSATION = "conversation" + TURN = "turn" + + class Tasks: """Defines types of annotation tasks supported by RAI Service.""" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py index 32b56a018cdb..013a03a2ed12 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py @@ -21,7 +21,12 @@ ) from . import constants -from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG +from .constants import ( + EvaluatorScoringPattern, + EVALUATOR_SCORING_PATTERNS, + SCORING_PATTERN_CONFIG, + EvaluationLevel, +) _nltk_data_download_lock = threading.Lock() @@ -962,3 +967,72 @@ def upload(path: str, container_client: ContainerClient, logger=None): category=ErrorCategory.UPLOAD_ERROR, blame=ErrorBlame.SYSTEM_ERROR, ) from e + + +# region Multi-turn utilities + + +def _resolve_evaluation_level( + evaluation_level: Optional[Union[EvaluationLevel, str]], + error_target: ErrorTarget, +) -> Optional[EvaluationLevel]: + """Validate and normalize the evaluation_level parameter. + + :param evaluation_level: The evaluation level to resolve. + :type evaluation_level: Optional[Union[EvaluationLevel, str]] + :param error_target: The error target for exceptions. + :type error_target: ErrorTarget + :return: The resolved EvaluationLevel or None for auto-detect. + :rtype: Optional[EvaluationLevel] + """ + valid = [level.value for level in EvaluationLevel] + if evaluation_level is None or evaluation_level == "": + return None + if isinstance(evaluation_level, EvaluationLevel): + return evaluation_level + if isinstance(evaluation_level, str): + try: + return EvaluationLevel(evaluation_level) + except ValueError as exc: + raise EvaluationException( + message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) from exc + raise EvaluationException( + message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) + + +def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: + """Merge query and response message lists into a single conversation.""" + return [*query, *response] + + +def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: + """Split messages into query/response slices at the latest user turn.""" + # Local import to avoid a circular import with the validators subpackage. + from azure.ai.evaluation._evaluators._common._validators._validation_constants import MessageRole + + latest_user_index = max( + (i for i, message in enumerate(messages) if message.get("role") == MessageRole.USER.value), + default=-1, + ) + if latest_user_index == -1: + raise ValueError("messages must contain at least one message with role 'user'.") + return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] + + +def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: + """Wrap string query/response into separate message lists.""" + return ( + [{"role": "user", "content": [{"type": "text", "text": query}]}], + [{"role": "assistant", "content": [{"type": "text", "text": response}]}], + ) + + +# endregion diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py index d7aefa8ccbd4..78905602485e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py @@ -3,32 +3,21 @@ """Validators package init.""" -from ._validation_constants import MessageRole, ContentType, EvaluationLevel +from ._validation_constants import MessageRole, ContentType from ._validator_interface import ValidatorInterface from ._conversation_validator import ConversationValidator from ._tool_definitions_validator import ToolDefinitionsValidator from ._tool_calls_validator import ToolCallsValidator from ._task_navigation_efficiency_validator import TaskNavigationEfficiencyValidator -from ._messages_or_query_response_validator import MessagesOrQueryResponseInputValidator -from ._evaluation_level_utils import ( - _resolve_evaluation_level, - _merge_query_response_messages, - _split_messages_at_latest_user, - _wrap_string_messages, -) +from ._messages_validator import MessagesOrQueryResponseInputValidator __all__ = [ "MessageRole", "ContentType", - "EvaluationLevel", "ValidatorInterface", "ConversationValidator", "ToolDefinitionsValidator", "ToolCallsValidator", "TaskNavigationEfficiencyValidator", "MessagesOrQueryResponseInputValidator", - "_resolve_evaluation_level", - "_merge_query_response_messages", - "_split_messages_at_latest_user", - "_wrap_string_messages", ] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py deleted file mode 100644 index 379e3e065902..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -""" -Utilities for resolving evaluation levels and reshaping query/response/messages inputs. -""" - -from typing import List, Optional, Tuple, Union -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget -from ._validation_constants import MessageRole, EvaluationLevel - - -def _resolve_evaluation_level( - evaluation_level: Optional[Union[EvaluationLevel, str]], - error_target: ErrorTarget, -) -> Optional[EvaluationLevel]: - """Validate and normalize the evaluation_level parameter. - - :param evaluation_level: The evaluation level to resolve. - :type evaluation_level: Optional[Union[EvaluationLevel, str]] - :param error_target: The error target for exceptions. - :type error_target: ErrorTarget - :return: The resolved EvaluationLevel or None for auto-detect. - :rtype: Optional[EvaluationLevel] - """ - valid = [level.value for level in EvaluationLevel] - if evaluation_level is None or evaluation_level == "": - return None - if isinstance(evaluation_level, EvaluationLevel): - return evaluation_level - if isinstance(evaluation_level, str): - try: - return EvaluationLevel(evaluation_level) - except ValueError as exc: - raise EvaluationException( - message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, - target=error_target, - ) from exc - raise EvaluationException( - message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, - target=error_target, - ) - - -def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: - """Merge query and response message lists into a single conversation.""" - return [*query, *response] - - -def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: - """Split messages into query/response slices at the latest user turn.""" - latest_user_index = max( - (i for i, message in enumerate(messages) if message.get("role") == MessageRole.USER.value), - default=-1, - ) - if latest_user_index == -1: - raise ValueError("messages must contain at least one message with role 'user'.") - return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] - - -def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: - """Wrap string query/response into separate message lists.""" - return ( - [{"role": "user", "content": [{"type": "text", "text": query}]}], - [{"role": "assistant", "content": [{"type": "text", "text": response}]}], - ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py similarity index 99% rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py index 370d3d3edd1d..c5b9b6e21677 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py @@ -65,7 +65,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: # Per-message structural checks valid_roles = {role.value for role in MessageRole} - roles_present = set() + roles_present: set = set() for index, message in enumerate(messages): if not isinstance(message, dict): raise EvaluationException( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py index 3c6795309672..45ec115e978c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py @@ -32,14 +32,3 @@ class ContentType(str, Enum): MCP_APPROVAL_RESPONSE = "mcp_approval_response" OPENAPI_CALL = "openapi_call" OPENAPI_CALL_OUTPUT = "openapi_call_output" - - -class EvaluationLevel(str, Enum): - """Supported evaluation levels for the evaluator. - - - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path. - - ``TURN``: Force turn-level evaluation using the single-turn query/response path. - """ - - CONVERSATION = "conversation" - TURN = "turn" From f58f14ff3ab690aa1dc0fc62a319c16d2badc8b3 Mon Sep 17 00:00:00 2001 From: mohessie Date: Sun, 21 Jun 2026 18:53:32 +0300 Subject: [PATCH 06/13] Adding unit tests --- .../azure/ai/evaluation/_common/constants.py | 2 +- .../_validators/_conversation_validator.py | 28 +- .../tests/unittests/test_common_validators.py | 539 ++++++++++++++++++ 3 files changed, 554 insertions(+), 15 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py index 32d7cfead5f4..8f62b8ee9c27 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py @@ -62,7 +62,7 @@ class EvaluatorScoringPattern(Enum): class EvaluationLevel(str, Enum): - """Supported evaluation levels for the evaluator. + """Supported evaluation levels for multi-turn evaluators. - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path. - ``TURN``: Force turn-level evaluation using the single-turn query/response path. diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py index a1c375340bfc..23c4d1a5be6a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py @@ -225,21 +225,21 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu if error: return error - # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools - if self.check_for_unsupported_tools: - if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: - name = ( - "openapi_call" - if content_type == ContentType.OPENAPI_CALL - else content_item["name"].lower() + # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools + if self.check_for_unsupported_tools: + if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: + name = ( + "openapi_call" + if content_type == ContentType.OPENAPI_CALL + else content_item["name"].lower() + ) + if name in self.UNSUPPORTED_TOOLS: + return EvaluationException( + message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.NOT_APPLICABLE, + target=self.error_target, ) - if name in self.UNSUPPORTED_TOOLS: - return EvaluationException( - message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.NOT_APPLICABLE, - target=self.error_target, - ) return None def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[EvaluationException]: diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py new file mode 100644 index 000000000000..adc7e60b8b5d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py @@ -0,0 +1,539 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""Unit tests for the shared evaluator input validators.""" + +import pytest + +from azure.ai.evaluation._exceptions import EvaluationException, ErrorCategory, ErrorTarget +from azure.ai.evaluation._evaluators._common._validators import ( + MessageRole, + ContentType, + ConversationValidator, + ToolDefinitionsValidator, + ToolCallsValidator, + TaskNavigationEfficiencyValidator, + MessagesOrQueryResponseInputValidator, +) + + +TARGET = ErrorTarget.CONVERSATION + + +def _user_message(text="hello"): + return {"role": "user", "content": text} + + +def _assistant_message(text="hi there"): + return {"role": "assistant", "content": text} + + +def _tool_call_content_item(name="search", tool_call_id="call_1"): + return { + "type": "tool_call", + "name": name, + "arguments": {"q": "foo"}, + "tool_call_id": tool_call_id, + } + + +def _tool_definition(name="search"): + return {"name": name, "parameters": {"type": "object"}} + + +@pytest.mark.unittest +class TestValidationConstants: + def test_message_role_values(self): + assert MessageRole.USER == "user" + assert MessageRole.ASSISTANT == "assistant" + assert MessageRole.SYSTEM == "system" + assert MessageRole.TOOL == "tool" + assert MessageRole.DEVELOPER == "developer" + + def test_content_type_values(self): + assert ContentType.TEXT == "text" + assert ContentType.TOOL_CALL == "tool_call" + assert ContentType.TOOL_RESULT == "tool_result" + assert ContentType.FUNCTION_CALL == "function_call" + assert ContentType.MCP_APPROVAL_REQUEST == "mcp_approval_request" + + +@pytest.mark.unittest +class TestConversationValidator: + def test_valid_query_response(self): + validator = ConversationValidator(error_target=TARGET) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_valid_string_query_response(self): + validator = ConversationValidator(error_target=TARGET) + eval_input = {"query": "what is the weather?", "response": "it is sunny"} + assert validator.validate_eval_input(eval_input) is True + + def test_valid_conversation(self): + validator = ConversationValidator(error_target=TARGET) + eval_input = {"conversation": {"messages": [_user_message(), _assistant_message()]}} + assert validator.validate_eval_input(eval_input) is True + + def test_query_not_required(self): + validator = ConversationValidator(error_target=TARGET, requires_query=False) + eval_input = {"response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_missing_query_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"response": [_assistant_message()]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_empty_query_list_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"query": [], "response": [_assistant_message()]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_empty_query_string_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": "", "response": [_assistant_message()]}) + + def test_query_wrong_type_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"query": 123, "response": [_assistant_message()]}) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_message_not_dict_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": ["not a dict"], "response": [_assistant_message()]}) + + def test_message_missing_role_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [{"content": "hi"}], "response": [_assistant_message()]}) + + def test_message_missing_content_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [{"role": "user"}], "response": [_assistant_message()]}) + + def test_empty_content_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [{"role": "user", "content": ""}], "response": [_assistant_message()]}) + + def test_content_list_item_missing_type_raises(self): + validator = ConversationValidator(error_target=TARGET) + bad = {"role": "user", "content": [{"text": "hi"}]} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [bad], "response": [_assistant_message()]}) + + def test_user_message_invalid_content_type_raises(self): + validator = ConversationValidator(error_target=TARGET) + bad = {"role": "user", "content": [{"type": "tool_call", "text": "hi"}]} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [bad], "response": [_assistant_message()]}) + + def test_assistant_message_with_tool_call(self): + validator = ConversationValidator(error_target=TARGET) + assistant = {"role": "assistant", "content": [_tool_call_content_item()]} + eval_input = {"query": [_user_message()], "response": [assistant]} + assert validator.validate_eval_input(eval_input) is True + + def test_assistant_tool_call_missing_name_raises(self): + validator = ConversationValidator(error_target=TARGET) + bad_item = {"type": "tool_call", "arguments": {}, "tool_call_id": "1"} + assistant = {"role": "assistant", "content": [bad_item]} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [_user_message()], "response": [assistant]}) + + def test_unsupported_tool_raises_when_enabled(self): + validator = ConversationValidator(error_target=TARGET, check_for_unsupported_tools=True) + unsupported = _tool_call_content_item(name="bing_grounding") + assistant = {"role": "assistant", "content": [unsupported]} + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"query": [_user_message()], "response": [assistant]}) + assert exc_info.value.category == ErrorCategory.NOT_APPLICABLE + + def test_unsupported_tool_allowed_when_disabled(self): + validator = ConversationValidator(error_target=TARGET, check_for_unsupported_tools=False) + unsupported = _tool_call_content_item(name="bing_grounding") + assistant = {"role": "assistant", "content": [unsupported]} + eval_input = {"query": [_user_message()], "response": [assistant]} + assert validator.validate_eval_input(eval_input) is True + + def test_conversation_not_dict_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"conversation": ["not a dict"]}) + + def test_conversation_missing_messages_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"conversation": {}}) + + def test_tool_message_valid(self): + validator = ConversationValidator(error_target=TARGET) + tool_msg = { + "role": "tool", + "tool_call_id": "call_1", + "content": [{"type": "tool_result", "tool_result": "done"}], + } + eval_input = {"query": [_user_message(), tool_msg], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_tool_message_content_not_list_raises(self): + validator = ConversationValidator(error_target=TARGET) + tool_msg = {"role": "tool", "tool_call_id": "call_1", "content": "result"} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [tool_msg], "response": [_assistant_message()]}) + + +@pytest.mark.unittest +class TestToolDefinitionsValidator: + def test_optional_tool_definitions_absent_ok(self): + validator = ToolDefinitionsValidator(error_target=TARGET, optional_tool_definitions=True) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_required_tool_definitions_absent_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET, optional_tool_definitions=False) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_valid_tool_definitions(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [_tool_definition()], + } + assert validator.validate_eval_input(eval_input) is True + + def test_tool_definitions_not_list_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": 123, + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_tool_definition_missing_name_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"parameters": {}}], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_definition_missing_parameters_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"name": "search"}], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_openapi_tool_definition_valid(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"type": "openapi", "functions": [_tool_definition()]}], + } + assert validator.validate_eval_input(eval_input) is True + + def test_openapi_tool_definition_missing_functions_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"type": "openapi"}], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_string_tool_definitions_ok(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": "some string", + } + assert validator.validate_eval_input(eval_input) is True + + +@pytest.mark.unittest +class TestToolCallsValidator: + def test_valid_tool_calls(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [_tool_definition()], + "tool_calls": [_tool_call_content_item()], + } + assert validator.validate_eval_input(eval_input) is True + + def test_missing_tool_calls_and_response_raises(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_definitions": [_tool_definition()], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_calls_from_response_ok(self): + validator = ToolCallsValidator(error_target=TARGET) + assistant = {"role": "assistant", "content": [_tool_call_content_item()]} + eval_input = { + "query": [_user_message()], + "response": [assistant], + "tool_definitions": [_tool_definition()], + } + assert validator.validate_eval_input(eval_input) is True + + def test_tool_calls_not_list_raises(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_definitions": [_tool_definition()], + "tool_calls": 123, + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_tool_call_item_not_dict_raises(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_definitions": [_tool_definition()], + "tool_calls": ["not a dict"], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_definitions_required(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_calls": [_tool_call_content_item()], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + +@pytest.mark.unittest +class TestTaskNavigationEfficiencyValidator: + def _response(self): + return [ + {"role": "user", "content": "do the task"}, + { + "role": "assistant", + "content": [{"type": "tool_call", "name": "search"}], + }, + ] + + def test_valid_list_ground_truth(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": self._response(), "ground_truth": ["search", "summarize"]} + assert validator.validate_eval_input(eval_input) is True + + def test_valid_tuple_ground_truth(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": self._response(), + "ground_truth": (["search"], {"search": {"q": "foo"}}), + } + assert validator.validate_eval_input(eval_input) is True + + def test_alias_inputs_normalized(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"actions": self._response(), "expected_actions": ["search"]} + assert validator.validate_eval_input(eval_input) is True + + def test_json_string_inputs_parsed(self): + import json + + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": json.dumps(self._response()), + "ground_truth": json.dumps(["search"]), + } + assert validator.validate_eval_input(eval_input) is True + + def test_response_none_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"response": None, "ground_truth": ["search"]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_response_not_list_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": {"role": "user"}, "ground_truth": ["search"]}) + + def test_action_missing_role_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": [{"content": []}], "ground_truth": ["search"]} + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_assistant_action_content_not_list_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": [{"role": "assistant", "content": "not a list"}], + "ground_truth": ["search"], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_call_missing_name_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": [{"role": "assistant", "content": [{"type": "tool_call"}]}], + "ground_truth": ["search"], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_ground_truth_empty_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": []}) + + def test_ground_truth_wrong_type_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": 123}) + + def test_ground_truth_list_non_string_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": ["search", 1]}) + + def test_ground_truth_tuple_wrong_length_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": (["search"],)}) + + def test_ground_truth_tuple_params_not_dict_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": self._response(), "ground_truth": (["search"], {"search": "bad"})} + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + +@pytest.mark.unittest +class TestMessagesOrQueryResponseInputValidator: + def _messages(self): + return [_user_message(), _assistant_message()] + + def test_valid_messages(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + assert validator.validate_eval_input({"messages": self._messages()}) is True + + def test_valid_query_response_fallback(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_messages_not_list_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"messages": "not a list"}) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_messages_empty_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": []}) + + def test_message_not_dict_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": ["not a dict"]}) + + def test_message_missing_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": [{"content": "hi"}]}) + + def test_invalid_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + bad = [{"role": "bot", "content": "hi"}, _assistant_message()] + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": bad}) + + def test_missing_user_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": [_assistant_message()]}) + + def test_missing_assistant_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": [_user_message()]}) + + def test_last_message_only_tool_calls_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + messages = [ + _user_message(), + {"role": "assistant", "content": [_tool_call_content_item()]}, + ] + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"messages": messages}) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_last_message_with_text_content_ok(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + messages = [ + _user_message(), + {"role": "assistant", "content": [{"type": "output_text", "text": "done"}]}, + ] + assert validator.validate_eval_input({"messages": messages}) is True + + def test_enforce_tool_definitions_required(self): + validator = MessagesOrQueryResponseInputValidator( + error_target=TARGET, optional_tool_definitions=False + ) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": self._messages()}) + + def test_no_enforce_tool_definitions_ok(self): + validator = MessagesOrQueryResponseInputValidator( + error_target=TARGET, enforce_tool_definitions=False + ) + assert validator.validate_eval_input({"messages": self._messages()}) is True + + def test_deep_validate_messages_catches_bad_content(self): + validator = MessagesOrQueryResponseInputValidator( + error_target=TARGET, enforce_tool_definitions=False, deep_validate_messages=True + ) + messages = [ + {"role": "user", "content": [{"type": "tool_call", "text": "bad"}]}, + _assistant_message(), + ] + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": messages}) + + def test_query_response_fallback_no_enforce_tool_definitions(self): + validator = MessagesOrQueryResponseInputValidator( + error_target=TARGET, enforce_tool_definitions=False + ) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True From bc232444dd92c690400616ba0bb080372af79ebd Mon Sep 17 00:00:00 2001 From: mohessie Date: Sun, 21 Jun 2026 19:16:48 +0300 Subject: [PATCH 07/13] add unit tests Co-authored-by: Copilot --- .../tests/unittests/test_common_validators.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py index adc7e60b8b5d..7b06ca8527d6 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py @@ -365,6 +365,59 @@ def test_alias_inputs_normalized(self): eval_input = {"actions": self._response(), "expected_actions": ["search"]} assert validator.validate_eval_input(eval_input) is True + def test_actions_alias_normalized_onto_response(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"actions": self._response(), "ground_truth": ["search"]} + assert validator.validate_eval_input(eval_input) is True + # The alias value should be copied onto the canonical 'response' key in place. + assert eval_input["response"] == eval_input["actions"] + + def test_expected_actions_alias_normalized_onto_ground_truth(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": self._response(), "expected_actions": ["search"]} + assert validator.validate_eval_input(eval_input) is True + assert eval_input["ground_truth"] == eval_input["expected_actions"] + + def test_mixed_canonical_and_alias_inputs(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"actions": self._response(), "ground_truth": ["search"]} + assert validator.validate_eval_input(eval_input) is True + + def test_canonical_takes_precedence_over_alias(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + # 'response' (canonical) is valid; 'actions' (alias) is invalid and must be ignored. + eval_input = { + "response": self._response(), + "actions": "not a valid list", + "ground_truth": ["search"], + } + assert validator.validate_eval_input(eval_input) is True + # Canonical value is preserved; alias does not overwrite it. + assert eval_input["response"] == self._response() + + def test_alias_does_not_overwrite_empty_string_canonical(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + # Canonical present but falsy ("") is still not None, so alias must not overwrite it. + eval_input = {"response": "", "actions": self._response(), "ground_truth": ["search"]} + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_alias_json_string_inputs_parsed(self): + import json + + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "actions": json.dumps(self._response()), + "expected_actions": json.dumps(["search"]), + } + assert validator.validate_eval_input(eval_input) is True + + def test_missing_canonical_and_alias_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"ground_truth": ["search"]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + def test_json_string_inputs_parsed(self): import json From 4cebe09b14937d99a70d6ae8f2a37a1dd4a461d0 Mon Sep 17 00:00:00 2001 From: mohessie Date: Sun, 21 Jun 2026 19:27:38 +0300 Subject: [PATCH 08/13] Remove checking that the final message contains text as this is an intended behavior --- .../_validators/_messages_validator.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py index b090752d65f2..f23fe59b0ceb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py @@ -113,33 +113,6 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - # The final assistant message must contain text - last_content = messages[-1].get("content", "") - if isinstance(last_content, list): - has_text = any( - ( - isinstance(content_item, dict) - and content_item.get("type") - in ( - ContentType.TEXT, - ContentType.INPUT_TEXT, - ContentType.OUTPUT_TEXT, - ) - ) - or isinstance(content_item, str) - for content_item in last_content - ) - if not has_text: - raise EvaluationException( - message=( - "The last message must contain text content, " - "not only tool calls. The conversation appears to be " - "mid-execution \u2014 provide the agent's final text response." - ), - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.INVALID_VALUE, - target=self.error_target, - ) if self.deep_validate_messages: for message in messages: From bc60d7cbce3eb14eb26de458639d54eff349f950 Mon Sep 17 00:00:00 2001 From: mohessie Date: Sun, 21 Jun 2026 20:05:31 +0300 Subject: [PATCH 09/13] update default flags Co-authored-by: Copilot --- .../_validators/_messages_validator.py | 25 ++++----------- .../tests/unittests/test_common_validators.py | 32 ++----------------- 2 files changed, 9 insertions(+), 48 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py index f23fe59b0ceb..c2f24c252f97 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py @@ -9,7 +9,7 @@ from typing import Any, Dict from typing_extensions import override from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget -from ._validation_constants import MessageRole, ContentType +from ._validation_constants import MessageRole from ._conversation_validator import ConversationValidator from ._tool_definitions_validator import ToolDefinitionsValidator @@ -17,16 +17,13 @@ class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator): """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. - A single implementation serves all evaluators via two behavior flags: - - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the - messages path and the query/response path. Set False for evaluators that do not accept - tool definitions (parity with a plain ``ConversationValidator``). - - ``deep_validate_messages`` (default False): additionally run full per-message - ``_validate_message_dict`` checks in the messages path. + A single implementation serves all evaluators via a behavior flag: + - ``enforce_tool_definitions`` (default False): validate ``tool_definitions`` in both the + messages path and the query/response path. Set True for evaluators that require + tool definitions. """ - enforce_tool_definitions: bool = True - deep_validate_messages: bool = False + enforce_tool_definitions: bool = False def __init__( self, @@ -35,13 +32,11 @@ def __init__( optional_tool_definitions: bool = True, check_for_unsupported_tools: bool = False, *, - enforce_tool_definitions: bool = True, - deep_validate_messages: bool = False, + enforce_tool_definitions: bool = False, ): """Initialize MessagesOrQueryResponseInputValidator.""" super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools) self.enforce_tool_definitions = enforce_tool_definitions - self.deep_validate_messages = deep_validate_messages @override def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: @@ -114,12 +109,6 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: target=self.error_target, ) - if self.deep_validate_messages: - for message in messages: - error = self._validate_message_dict(message) - if error: - raise error - if self.enforce_tool_definitions: tool_definitions = eval_input.get("tool_definitions") tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py index 7b06ca8527d6..fa309cdde8f3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py @@ -542,27 +542,10 @@ def test_missing_assistant_role_raises(self): with pytest.raises(EvaluationException): validator.validate_eval_input({"messages": [_user_message()]}) - def test_last_message_only_tool_calls_raises(self): - validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) - messages = [ - _user_message(), - {"role": "assistant", "content": [_tool_call_content_item()]}, - ] - with pytest.raises(EvaluationException) as exc_info: - validator.validate_eval_input({"messages": messages}) - assert exc_info.value.category == ErrorCategory.INVALID_VALUE - - def test_last_message_with_text_content_ok(self): - validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) - messages = [ - _user_message(), - {"role": "assistant", "content": [{"type": "output_text", "text": "done"}]}, - ] - assert validator.validate_eval_input({"messages": messages}) is True - def test_enforce_tool_definitions_required(self): validator = MessagesOrQueryResponseInputValidator( - error_target=TARGET, optional_tool_definitions=False + error_target=TARGET, optional_tool_definitions=False, + enforce_tool_definitions=True ) with pytest.raises(EvaluationException): validator.validate_eval_input({"messages": self._messages()}) @@ -573,17 +556,6 @@ def test_no_enforce_tool_definitions_ok(self): ) assert validator.validate_eval_input({"messages": self._messages()}) is True - def test_deep_validate_messages_catches_bad_content(self): - validator = MessagesOrQueryResponseInputValidator( - error_target=TARGET, enforce_tool_definitions=False, deep_validate_messages=True - ) - messages = [ - {"role": "user", "content": [{"type": "tool_call", "text": "bad"}]}, - _assistant_message(), - ] - with pytest.raises(EvaluationException): - validator.validate_eval_input({"messages": messages}) - def test_query_response_fallback_no_enforce_tool_definitions(self): validator = MessagesOrQueryResponseInputValidator( error_target=TARGET, enforce_tool_definitions=False From 93267aebb13c34879abbe826ab27b9fadc790ee2 Mon Sep 17 00:00:00 2001 From: mohessie Date: Sun, 21 Jun 2026 21:57:45 +0300 Subject: [PATCH 10/13] format --- .../_validators/_conversation_validator.py | 4 +--- .../tests/unittests/test_common_validators.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py index 23c4d1a5be6a..6a8f9feb299b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py @@ -229,9 +229,7 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu if self.check_for_unsupported_tools: if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: name = ( - "openapi_call" - if content_type == ContentType.OPENAPI_CALL - else content_item["name"].lower() + "openapi_call" if content_type == ContentType.OPENAPI_CALL else content_item["name"].lower() ) if name in self.UNSUPPORTED_TOOLS: return EvaluationException( diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py index fa309cdde8f3..91031216e853 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py @@ -121,7 +121,9 @@ def test_message_missing_content_raises(self): def test_empty_content_raises(self): validator = ConversationValidator(error_target=TARGET) with pytest.raises(EvaluationException): - validator.validate_eval_input({"query": [{"role": "user", "content": ""}], "response": [_assistant_message()]}) + validator.validate_eval_input( + {"query": [{"role": "user", "content": ""}], "response": [_assistant_message()]} + ) def test_content_list_item_missing_type_raises(self): validator = ConversationValidator(error_target=TARGET) @@ -544,21 +546,16 @@ def test_missing_assistant_role_raises(self): def test_enforce_tool_definitions_required(self): validator = MessagesOrQueryResponseInputValidator( - error_target=TARGET, optional_tool_definitions=False, - enforce_tool_definitions=True + error_target=TARGET, optional_tool_definitions=False, enforce_tool_definitions=True ) with pytest.raises(EvaluationException): validator.validate_eval_input({"messages": self._messages()}) def test_no_enforce_tool_definitions_ok(self): - validator = MessagesOrQueryResponseInputValidator( - error_target=TARGET, enforce_tool_definitions=False - ) + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET, enforce_tool_definitions=False) assert validator.validate_eval_input({"messages": self._messages()}) is True def test_query_response_fallback_no_enforce_tool_definitions(self): - validator = MessagesOrQueryResponseInputValidator( - error_target=TARGET, enforce_tool_definitions=False - ) + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET, enforce_tool_definitions=False) eval_input = {"query": [_user_message()], "response": [_assistant_message()]} assert validator.validate_eval_input(eval_input) is True From 633bb2dd9ed945755854b8fbb6fde3603822aa87 Mon Sep 17 00:00:00 2001 From: Mohamed Hessien Date: Thu, 25 Jun 2026 22:32:39 +0300 Subject: [PATCH 11/13] fix(evaluation): add missing __init__.py to autogen package The autogen/ directory had no __init__.py, so it was not discoverable by find_packages() and could not be imported as azure.ai.evaluation.autogen. This caused `azpysdk apistub` (API.md Consistency check) to fail with ModuleNotFoundError: No module named 'azure.ai.evaluation.autogen'. --- .../azure/ai/evaluation/autogen/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py new file mode 100644 index 000000000000..d540fd20468c --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py @@ -0,0 +1,3 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- From 1df643112ecbae842a7f250e6909fe9043630380 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 22:55:08 +0300 Subject: [PATCH 12/13] Regenerate azure-ai-evaluation API files (#47674) Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- sdk/evaluation/azure-ai-evaluation/api.md | 1717 +++++++++++++++++ .../azure-ai-evaluation/api.metadata.yml | 3 + 2 files changed, 1720 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/api.md create mode 100644 sdk/evaluation/azure-ai-evaluation/api.metadata.yml diff --git a/sdk/evaluation/azure-ai-evaluation/api.md b/sdk/evaluation/azure-ai-evaluation/api.md new file mode 100644 index 000000000000..6b69ad8edf9d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/api.md @@ -0,0 +1,1717 @@ +```py +namespace azure.ai.evaluation + + def azure.ai.evaluation.evaluate( + *, + azure_ai_project: Optional[Union[str, AzureAIProject]] = ..., + data: Union[str, PathLike], + evaluation_name: Optional[str] = ..., + evaluator_config: Optional[Dict[str, EvaluatorConfig]] = ..., + evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]], + fail_on_evaluator_errors: bool = False, + output_path: Optional[Union[str, PathLike]] = ..., + tags: Optional[Dict[str, str]] = ..., + target: Optional[Callable] = ..., + user_agent: Optional[str] = ..., + **kwargs + ) -> EvaluationResult: ... + + + class azure.ai.evaluation.AzureAIProject(TypedDict): + key "project_name": str + key "resource_group_name": str + key "subscription_id": str + + + @experimental + class azure.ai.evaluation.AzureOpenAIGrader: + id = azureai://built-in/evaluators/azure-openai/custom_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + grader_config: Dict[str, Any], + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAILabelGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/label_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + input: List[Dict[str, str]], + labels: List[str], + model: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + passing_labels: List[str], + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + class azure.ai.evaluation.AzureOpenAIModelConfiguration(TypedDict): + key "azure_deployment": str + key "azure_endpoint": str + api_key: NotRequired[str] + api_version: NotRequired[str] + credential: NotRequired[Any] + type: NotRequired[Literal["azure_openai"]] + + + @experimental + class azure.ai.evaluation.AzureOpenAIPythonGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/python_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + image_tag: Optional[str] = ..., + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + pass_threshold: float, + source: str, + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAIScoreModelGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/score_model_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + input: List[Dict[str, str]], + model: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + pass_threshold: Optional[float] = ..., + range: Optional[List[float]] = ..., + sampling_params: Optional[Dict[str, Any]] = ..., + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAIStringCheckGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/string_check_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + input: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + operation: Literal["eq", "ne", "like", "ilike"], + reference: str, + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAITextSimilarityGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/text_similarity_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + evaluation_metric: Literal["fuzzy_match", "bleu", "gleu", "meteor", "rouge_1", "rouge_2", "rouge_3", "rouge_4", "rouge_5", "rouge_l", "cosine"], + input: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + pass_threshold: float, + reference: str, + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + class azure.ai.evaluation.BleuScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/bleu_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ): ... + + def __init__( + self, + *, + threshold = 0.5 + ): ... + + + @experimental + class azure.ai.evaluation.CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/code_vulnerability + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs: Any + ): ... + + + class azure.ai.evaluation.CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/coherence + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/content_safety + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + hate_unfairness_threshold: int = 3, + self_harm_threshold: int = 3, + sexual_threshold: int = 3, + violence_threshold: int = 3, + **kwargs: Any + ): ... + + + class azure.ai.evaluation.Conversation(TypedDict): + key "messages": Union[List[Message], List[Dict]] + context: NotRequired[Dict[str, Any]] + + + class azure.ai.evaluation.EvaluationResult(TypedDict): + key "metrics": Dict + key "rows": List[Dict] + oai_eval_run_ids: NotRequired[List[Dict[str, str]]] + studio_url: NotRequired[str] + + + class azure.ai.evaluation.EvaluatorConfig(TypedDict, total=False): + key "column_mapping": Dict[str, str] + + + class azure.ai.evaluation.F1ScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/f1_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, float]: ... + + def __init__( + self, + *, + threshold = 0.5 + ): ... + + + class azure.ai.evaluation.FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/fluency + + @overload + def __call__( + self, + *, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + class azure.ai.evaluation.GleuScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/gleu_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ): ... + + @override + def __init__( + self, + *, + threshold = 0.5 + ): ... + + + class azure.ai.evaluation.GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/groundedness + + @overload + def __call__( + self, + *, + context: str, + query: Optional[str] = ..., + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + query: str, + response: List[dict], + tool_definitions: Optional[List[dict]] = ... + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/groundedness_pro + + @overload + def __call__( + self, + *, + context: str, + query: str, + response: str + ) -> Dict[str, Union[str, bool]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 5, + **kwargs: Any + ): ... + + + @experimental + class azure.ai.evaluation.HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/hate_unfairness + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/indirect_attack + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, bool]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/intent_resolution + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]], + tool_definitions: Optional[Union[dict, List[dict]]] = ... + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD, + **kwargs + ): ... + + + class azure.ai.evaluation.Message(TypedDict): + key "content": Union[str, List[Dict]] + key "role": str + context: NotRequired[Dict[str, Any]] + + + class azure.ai.evaluation.MeteorScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/meteor_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, float]: ... + + @override + def __init__( + self, + alpha: float = 0.9, + beta: float = 3.0, + gamma: float = 0.5, + *, + threshold: float = 0.5 + ): ... + + + class azure.ai.evaluation.OpenAIModelConfiguration(TypedDict): + key "api_key": str + key "model": str + base_url: NotRequired[str] + extra_headers: NotRequired[Dict[str, str]] + organization: NotRequired[str] + type: NotRequired[Literal["openai"]] + + + @experimental + class azure.ai.evaluation.ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/protected_material + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, bool]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs + ): ... + + + class azure.ai.evaluation.QAEvaluator(MultiEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/qa + + @overload + def __call__( + self, + *, + context: str, + ground_truth: str, + query: str, + response: str + ): ... + + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + coherence_threshold: int = 3, + f1_score_threshold: float = 0.5, + fluency_threshold: int = 3, + groundedness_threshold: int = 3, + relevance_threshold: int = 3, + similarity_threshold: int = 3, + **kwargs: Any + ): ... + + + class azure.ai.evaluation.RelevanceEvaluator(PromptyEvaluatorBase): + id = azureai://built-in/evaluators/relevance + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/response_completeness + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, + **kwargs + ): ... + + + class azure.ai.evaluation.RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/retrieval + + @overload + def __call__( + self, + *, + context: str, + query: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold: float = 3, + **kwargs + ) -> Callable: ... + + + class azure.ai.evaluation.RougeScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/rouge_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, float]: ... + + @override + def __init__( + self, + rouge_type: RougeType, + *, + f1_score_threshold: float = 0.5, + precision_threshold: float = 0.5, + recall_threshold: float = 0.5 + ): ... + + + class azure.ai.evaluation.RougeType(str, Enum): + ROUGE_1 = "rouge1" + ROUGE_2 = "rouge2" + ROUGE_3 = "rouge3" + ROUGE_4 = "rouge4" + ROUGE_5 = "rouge5" + ROUGE_L = "rougeL" + + + @experimental + class azure.ai.evaluation.SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/self_harm + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/sexual + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + + class azure.ai.evaluation.SimilarityEvaluator(PromptyEvaluatorBase): + id = azureai://built-in/evaluators/similarity + + @overload + def __call__( + self, + *, + ground_truth: str, + query: str, + response: str + ) -> Dict[str, float]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/task_adherence + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]], + tool_definitions: Optional[Union[dict, List[dict]]] = ... + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold = _DEFAULT_TASK_ADHERENCE_SCORE, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/tool_call_accuracy + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]] = ..., + tool_calls: Union[dict, List[dict]] = ..., + tool_definitions: Union[dict, List[dict]] + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold = _DEFAULT_TOOL_CALL_ACCURACY_SCORE, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/ungrounded_attributes + + @overload + def __call__( + self, + *, + context: str, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs: Any + ): ... + + + @experimental + class azure.ai.evaluation.ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/violence + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + +namespace azure.ai.evaluation.autogen.raiclient + + class azure.ai.evaluation.autogen.raiclient.MachineLearningServicesClient: implements ContextManager + rai_svc: RAISvcOperations + + def __init__( + self, + endpoint: str, + subscription_id: str, + resource_group_name: str, + workspace_name: str, + credential: TokenCredential, + *, + api_version: str = ..., + **kwargs: Any + ) -> None: ... + + def close(self) -> None: ... + + def send_request( + self, + request: HttpRequest, + *, + stream: bool = False, + **kwargs: Any + ) -> HttpResponse: ... + + +namespace azure.ai.evaluation.autogen.raiclient.aio + + class azure.ai.evaluation.autogen.raiclient.aio.MachineLearningServicesClient: implements AsyncContextManager + rai_svc: RAISvcOperations + + def __init__( + self, + endpoint: str, + subscription_id: str, + resource_group_name: str, + workspace_name: str, + credential: AsyncTokenCredential, + *, + api_version: str = ..., + **kwargs: Any + ) -> None: ... + + async def close(self) -> None: ... + + def send_request( + self, + request: HttpRequest, + *, + stream: bool = False, + **kwargs: Any + ) -> Awaitable[AsyncHttpResponse]: ... + + +namespace azure.ai.evaluation.autogen.raiclient.aio.operations + + class azure.ai.evaluation.autogen.raiclient.aio.operations.RAISvcOperations: + + def __init__( + self, + *args, + **kwargs + ) -> None: ... + + @distributed_trace_async + async def get_annotation(self, **kwargs: Any) -> List[str]: ... + + @distributed_trace_async + async def get_attack_objectives( + self, + *, + lang: str, + risk_types: List[str], + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_jail_break_dataset(self, **kwargs: Any) -> str: ... + + @distributed_trace_async + async def get_jail_break_dataset_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_operation_result( + self, + operation_id: str, + *, + api_key: Optional[str] = ..., + model_endpoint: Optional[str] = ..., + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_template_parameters(self, **kwargs: Any) -> str: ... + + @distributed_trace_async + async def get_template_parameters_image( + self, + *, + path: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_template_parameters_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @overload + async def submit_annotation( + self, + body: AnnotationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_annotation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_annotation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_aoai_evaluation( + self, + body: GradersDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_aoai_evaluation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_aoai_evaluation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_simulation( + self, + body: SimulationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_simulation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_simulation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + +namespace azure.ai.evaluation.autogen.raiclient.models + + class azure.ai.evaluation.autogen.raiclient.models.AnnotationDTO(Model): + annotation_task: str + content_type: str + contents: List[Content] + metric_list: List[str] + prompt_version: str + user_text_list: List[str] + + @overload + def __init__( + self, + *, + annotation_task: str, + content_type: str, + contents: List[Content], + metric_list: List[str], + prompt_version: str, + user_text_list: List[str] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.Content(Model): + messages: List[Any] + + @overload + def __init__( + self, + *, + messages: List[Any] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.CustomizationParameters(Model): + application_scenario: Optional[str] + harm_categories: List[str] + + @overload + def __init__( + self, + *, + application_scenario: Optional[str] = ..., + harm_categories: List[str] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.Data(Model): + asset_id: str + + @overload + def __init__( + self, + *, + asset_id: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.Grader(Model): + config: GraderConfigBase + description: str + name: str + + @overload + def __init__( + self, + *, + config: GraderConfigBase, + description: str, + name: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.GraderConfigBase(Model): + type: str + + @overload + def __init__( + self, + *, + type: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.GradersDTO(Model): + data: Data + graders: List[Grader] + model_config: ModelConfig + sample_generators: List[SampleGenerator] + + @overload + def __init__( + self, + *, + data: Data, + graders: List[Grader], + model_config: ModelConfig, + sample_generators: List[SampleGenerator] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.LongRunningResponse(Model): + location: str + operation_result: Any + + @overload + def __init__( + self, + *, + location: str, + operation_result: Any + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.ModelConfig(Model): + azure_endpoint: str + + @overload + def __init__( + self, + *, + azure_endpoint: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.SampleGenerator(Model): + model_name: str + sampling_params: Any + trajectory_template: Any + type: str + + @overload + def __init__( + self, + *, + model_name: str, + sampling_params: Any, + trajectory_template: Any, + type: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.SimulationDTO(Model): + customization_parameters: Optional[CustomizationParameters] + headers: Optional[Dict[str, str]] + is_microsoft_tenant: Optional[bool] + json: Optional[str] + params: Optional[Dict[str, str]] + resource_group_name: Optional[str] + simulation_type: Optional[Union[str, SimulationType]] + subscription_id: Optional[str] + template_key: Optional[str] + template_parameters: Optional[Dict[str, str]] + url: Optional[str] + workspace_name: Optional[str] + + @overload + def __init__( + self, + *, + customization_parameters: Optional[CustomizationParameters] = ..., + headers: Optional[Dict[str, str]] = ..., + is_microsoft_tenant: Optional[bool] = ..., + json: Optional[str] = ..., + params: Optional[Dict[str, str]] = ..., + resource_group_name: Optional[str] = ..., + simulation_type: Optional[Union[str, SimulationType]] = ..., + subscription_id: Optional[str] = ..., + template_key: Optional[str] = ..., + template_parameters: Optional[Dict[str, str]] = ..., + url: Optional[str] = ..., + workspace_name: Optional[str] = ... + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.SimulationType(str, Enum, metaclass=CaseInsensitiveEnumMeta): + CUSTOM_PERSONA = "CustomPersona" + DEFAULT = "Default" + HARM_TURN_GENERATOR = "HarmTurnGenerator" + + +namespace azure.ai.evaluation.autogen.raiclient.operations + + class azure.ai.evaluation.autogen.raiclient.operations.RAISvcOperations: + + def __init__( + self, + *args, + **kwargs + ): ... + + @distributed_trace + def get_annotation(self, **kwargs: Any) -> List[str]: ... + + @distributed_trace + def get_attack_objectives( + self, + *, + lang: str, + risk_category: Optional[str] = ..., + risk_types: List[str], + strategy: Optional[str] = ..., + target_type: Optional[str] = ..., + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_jail_break_dataset(self, **kwargs: Any) -> str: ... + + @distributed_trace + def get_jail_break_dataset_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_operation_result( + self, + operation_id: str, + *, + api_key: Optional[str] = ..., + model_endpoint: Optional[str] = ..., + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_template_parameters(self, **kwargs: Any) -> str: ... + + @distributed_trace + def get_template_parameters_image( + self, + *, + path: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_template_parameters_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @overload + def submit_annotation( + self, + body: AnnotationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_annotation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_annotation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_aoai_evaluation( + self, + body: GradersDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_aoai_evaluation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_aoai_evaluation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_simulation( + self, + body: SimulationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_simulation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_simulation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + +namespace azure.ai.evaluation.red_team + + @experimental + class azure.ai.evaluation.red_team.AttackStrategy(Enum): + AnsiAttack = "ansi_attack" + AsciiArt = "ascii_art" + AsciiSmuggler = "ascii_smuggler" + Atbash = "atbash" + Base64 = "base64" + Baseline = "baseline" + Binary = "binary" + Caesar = "caesar" + CharSwap = "char_swap" + CharacterSpace = "character_space" + Crescendo = "crescendo" + DIFFICULT = "difficult" + Diacritic = "diacritic" + EASY = "easy" + Flip = "flip" + IndirectJailbreak = "indirect_jailbreak" + Jailbreak = "jailbreak" + Leetspeak = "leetspeak" + MODERATE = "moderate" + Morse = "morse" + MultiTurn = "multi_turn" + ROT13 = "rot13" + StringJoin = "string_join" + SuffixAppend = "suffix_append" + Tense = "tense" + UnicodeConfusable = "unicode_confusable" + UnicodeSubstitution = "unicode_substitution" + Url = "url" + + + @experimental + class azure.ai.evaluation.red_team.RedTeam: + + def __init__( + self, + azure_ai_project: Union[dict, str], + credential: TokenCredential, + *, + application_scenario: Optional[str] = ..., + attack_success_thresholds: Optional[Dict[RiskCategory, int]] = ..., + custom_attack_seed_prompts: Optional[str] = ..., + language: SupportedLanguages = SupportedLanguages.English, + num_objectives: int = 10, + output_dir = ".", + risk_categories: Optional[List[RiskCategory]] = ..., + **kwargs + ): ... + + async def scan( + self, + target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration, PromptChatTarget], + *, + application_scenario: Optional[str] = ..., + attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]] = [], + max_parallel_tasks: int = 5, + output_path: Optional[Union[str, PathLike]] = ..., + parallel_execution: bool = True, + scan_name: Optional[str] = ..., + skip_evals: bool = False, + skip_upload: bool = False, + timeout: int = 3600, + **kwargs: Any + ) -> RedTeamResult: ... + + + @experimental + class azure.ai.evaluation.red_team.RedTeamResult: + + def __init__( + self, + scan_result: Optional[ScanResult] = None, + attack_details: Optional[List[AttackDetails]] = None + ): ... + + def attack_simulation(self) -> str: ... + + def to_eval_qr_json_lines(self) -> str: ... + + def to_json(self) -> str: ... + + def to_scorecard(self) -> Optional[RedTeamingScorecard]: ... + + + @experimental + class azure.ai.evaluation.red_team.RiskCategory(str, Enum): + CodeVulnerability = "code_vulnerability" + HateUnfairness = "hate_unfairness" + ProhibitedActions = "prohibited_actions" + ProtectedMaterial = "protected_material" + SelfHarm = "self_harm" + SensitiveDataLeakage = "sensitive_data_leakage" + Sexual = "sexual" + TaskAdherence = "task_adherence" + UngroundedAttributes = "ungrounded_attributes" + Violence = "violence" + + + @experimental + class azure.ai.evaluation.red_team.SupportedLanguages(Enum): + English = "en" + French = "fr" + German = "de" + Italian = "it" + Japanese = "ja" + Korean = "ko" + Portuguese = "pt" + SimplifiedChinese = "zh-cn" + Spanish = "es" + + +namespace azure.ai.evaluation.simulator + + @experimental + class azure.ai.evaluation.simulator.AdversarialScenario(Enum): + ADVERSARIAL_CODE_VULNERABILITY = "adv_code_vuln" + ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded" + ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded" + ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material" + ADVERSARIAL_CONVERSATION = "adv_conversation" + ADVERSARIAL_QA = "adv_qa" + ADVERSARIAL_QA_DOCUMENTS = "adv_qa_documents" + ADVERSARIAL_REWRITE = "adv_rewrite" + ADVERSARIAL_SEARCH = "adv_search" + ADVERSARIAL_SUMMARIZATION = "adv_summarization" + ADVERSARIAL_UNGROUNDED_ATTRIBUTES = "adv_isa" + + + @experimental + class azure.ai.evaluation.simulator.AdversarialScenarioJailbreak(Enum): + ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia" + + + @experimental + class azure.ai.evaluation.simulator.AdversarialSimulator: + + async def __call__( + self, + *, + api_call_delay_sec: int = 0, + api_call_retry_limit: int = 3, + api_call_retry_sleep_sec: int = 1, + concurrent_async_task: int = 3, + language: SupportedLanguages = SupportedLanguages.English, + max_conversation_turns: int = 1, + max_simulation_results: int = 3, + randomization_seed: Optional[int] = ..., + randomize_order: bool = True, + scenario: AdversarialScenario, + target: Callable, + **kwargs + ) -> List[Dict[str, Any]]: ... + + def __init__( + self, + *, + azure_ai_project: Union[str, AzureAIProject], + credential: TokenCredential + ): ... + + def call_sync( + self, + *, + api_call_delay_sec: int, + api_call_retry_limit: int, + api_call_retry_sleep_sec: int, + concurrent_async_task: int, + max_conversation_turns: int, + max_simulation_results: int, + scenario: AdversarialScenario, + target: Callable + ) -> List[Dict[str, Any]]: ... + + + @experimental + class azure.ai.evaluation.simulator.DirectAttackSimulator: + + async def __call__( + self, + *, + api_call_delay_sec: int = 0, + api_call_retry_limit: int = 3, + api_call_retry_sleep_sec: int = 1, + concurrent_async_task: int = 3, + max_conversation_turns: int = 1, + max_simulation_results: int = 3, + randomization_seed: Optional[int] = ..., + scenario: AdversarialScenario, + target: Callable + ) -> Dict[str, [List[Dict[str, Any]]]]: ... + + def __init__( + self, + *, + azure_ai_project: Union[str, AzureAIProject], + credential: TokenCredential + ): ... + + + @experimental + class azure.ai.evaluation.simulator.IndirectAttackSimulator(AdversarialSimulator): + + async def __call__( + self, + *, + api_call_delay_sec: int = 0, + api_call_retry_limit: int = 3, + api_call_retry_sleep_sec: int = 1, + concurrent_async_task: int = 3, + max_simulation_results: int = 3, + randomization_seed: Optional[int] = ..., + target: Callable, + **kwargs + ) -> List[Dict[str, Any]]: ... + + def __init__( + self, + *, + azure_ai_project: Union[str, AzureAIProject], + credential: TokenCredential + ): ... + + def call_sync( + self, + *, + api_call_delay_sec: int, + api_call_retry_limit: int, + api_call_retry_sleep_sec: int, + concurrent_async_task: int, + max_conversation_turns: int, + max_simulation_results: int, + scenario: AdversarialScenario, + target: Callable + ) -> List[Dict[str, Any]]: ... + + + @experimental + class azure.ai.evaluation.simulator.Simulator: + + async def __call__( + self, + *, + api_call_delay_sec: float = 1, + concurrent_async_tasks: int = 5, + conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [], + max_conversation_turns: int = 5, + num_queries: int = 5, + query_response_generating_prompty: Optional[str] = ..., + query_response_generating_prompty_options: Dict[str, Any] = {}, + randomization_seed: Optional[int] = ..., + target: Callable, + tasks: List[str] = [], + text: str = "", + user_simulator_prompty: Optional[str] = ..., + user_simulator_prompty_options: Dict[str, Any] = {}, + **kwargs + ) -> List[JsonLineChatProtocol]: ... + + def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]): ... + + + class azure.ai.evaluation.simulator.SupportedLanguages(Enum): + English = "en" + French = "fr" + German = "de" + Italian = "it" + Japanese = "ja" + Korean = "ko" + Portuguese = "pt" + SimplifiedChinese = "zh-cn" + Spanish = "es" + + +``` \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/api.metadata.yml b/sdk/evaluation/azure-ai-evaluation/api.metadata.yml new file mode 100644 index 000000000000..58b96aa4ee23 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/api.metadata.yml @@ -0,0 +1,3 @@ +apiMdSha256: 942162ce260c408be1be04a373bc86eef35d0a6e386fd5d530aa3fcb60c691dd +parserVersion: 0.3.28 +pythonVersion: 3.10.20 From 5140c468792d57665b2742617741b197841d682f Mon Sep 17 00:00:00 2001 From: mohessie Date: Thu, 25 Jun 2026 23:59:50 +0300 Subject: [PATCH 13/13] Update api.md --- sdk/evaluation/azure-ai-evaluation/api.md | 24 +++++++++---------- .../azure-ai-evaluation/api.metadata.yml | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/api.md b/sdk/evaluation/azure-ai-evaluation/api.md index 6b69ad8edf9d..43b64879c6a6 100644 --- a/sdk/evaluation/azure-ai-evaluation/api.md +++ b/sdk/evaluation/azure-ai-evaluation/api.md @@ -60,12 +60,12 @@ namespace azure.ai.evaluation class azure.ai.evaluation.AzureOpenAIModelConfiguration(TypedDict): + key "api_key": NotRequired[str] + key "api_version": NotRequired[str] key "azure_deployment": str key "azure_endpoint": str - api_key: NotRequired[str] - api_version: NotRequired[str] - credential: NotRequired[Any] - type: NotRequired[Literal["azure_openai"]] + key "credential": NotRequired[Any] + key "type": NotRequired[Literal["azure_openai"]] @experimental @@ -249,15 +249,15 @@ namespace azure.ai.evaluation class azure.ai.evaluation.Conversation(TypedDict): + key "context": NotRequired[Dict[str, Any]] key "messages": Union[List[Message], List[Dict]] - context: NotRequired[Dict[str, Any]] class azure.ai.evaluation.EvaluationResult(TypedDict): key "metrics": Dict + key "oai_eval_run_ids": NotRequired[List[Dict[str, str]]] key "rows": List[Dict] - oai_eval_run_ids: NotRequired[List[Dict[str, str]]] - studio_url: NotRequired[str] + key "studio_url": NotRequired[str] class azure.ai.evaluation.EvaluatorConfig(TypedDict, total=False): @@ -485,8 +485,8 @@ namespace azure.ai.evaluation class azure.ai.evaluation.Message(TypedDict): key "content": Union[str, List[Dict]] + key "context": NotRequired[Dict[str, Any]] key "role": str - context: NotRequired[Dict[str, Any]] class azure.ai.evaluation.MeteorScoreEvaluator(EvaluatorBase): @@ -513,11 +513,11 @@ namespace azure.ai.evaluation class azure.ai.evaluation.OpenAIModelConfiguration(TypedDict): key "api_key": str + key "base_url": NotRequired[str] + key "extra_headers": NotRequired[Dict[str, str]] key "model": str - base_url: NotRequired[str] - extra_headers: NotRequired[Dict[str, str]] - organization: NotRequired[str] - type: NotRequired[Literal["openai"]] + key "organization": NotRequired[str] + key "type": NotRequired[Literal["openai"]] @experimental diff --git a/sdk/evaluation/azure-ai-evaluation/api.metadata.yml b/sdk/evaluation/azure-ai-evaluation/api.metadata.yml index 58b96aa4ee23..9dd46c51df57 100644 --- a/sdk/evaluation/azure-ai-evaluation/api.metadata.yml +++ b/sdk/evaluation/azure-ai-evaluation/api.metadata.yml @@ -1,3 +1,3 @@ -apiMdSha256: 942162ce260c408be1be04a373bc86eef35d0a6e386fd5d530aa3fcb60c691dd +apiMdSha256: 942f3460a345d8989df4e5dd3163526fec7bb0683b5183a6c043f0a99545e482 parserVersion: 0.3.28 -pythonVersion: 3.10.20 +pythonVersion: 3.11.4