Azure · m7md7sien · Jun 25, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
@@ -0,0 +1,3 @@
+apiMdSha256: 942f3460a345d8989df4e5dd3163526fec7bb0683b5183a6c043f0a99545e482
+parserVersion: 0.3.28
+pythonVersion: 3.11.4
@@ -21,7 +21,12 @@
 )
 
 from . import constants
-from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG
+from .constants import (
+    EvaluatorScoringPattern,
+    EVALUATOR_SCORING_PATTERNS,
+    SCORING_PATTERN_CONFIG,
+    EvaluationLevel,
+)
 
 _nltk_data_download_lock = threading.Lock()
 
@@ -967,7 +972,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
 # region Multi-turn utilities
 
 
-def _merge_query_response_messages(query, response):
+def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]:
     """Merge query and response message lists into a single conversation.
 
     :param query: The query messages.
@@ -980,19 +985,24 @@ def _merge_query_response_messages(query, response):
     return [*query, *response]
 
 
-def _split_messages_at_latest_user(messages):
+def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]:
     """Split messages into query/response slices at the latest user turn.
 
     :param messages: The conversation messages.
     :type messages: List[dict]
     :return: A tuple of (query_messages, response_messages).
     :rtype: Tuple[List[dict], List[dict]]
     """
-    latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == "user")
+    latest_user_index = max(
+        (i for i, message in enumerate(messages) if message.get("role") == "user"),
+        default=-1,
+    )
+    if latest_user_index == -1:
+        raise ValueError("messages must contain at least one message with role 'user'.")
     return messages[: latest_user_index + 1], messages[latest_user_index + 1 :]
 
 
-def _wrap_string_messages(query, response):
+def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]:
     """Wrap string query/response into separate message lists.
 
     :param query: The query string.
@@ -1119,7 +1129,10 @@ def serialize_messages(messages):
     return result.rstrip("\n")
 
 
-def _resolve_evaluation_level(evaluation_level, error_target):
+def _resolve_evaluation_level(
+    evaluation_level: Optional[Union[EvaluationLevel, str]],
+    error_target: ErrorTarget,
+) -> Optional[EvaluationLevel]:
     """Validate and normalize the evaluation_level parameter.
 
     :param evaluation_level: The evaluation level to resolve.
@@ -1129,8 +1142,6 @@ def _resolve_evaluation_level(evaluation_level, error_target):
     :return: The resolved EvaluationLevel or None for auto-detect.
     :rtype: Optional[EvaluationLevel]
     """
-    from .constants import EvaluationLevel
-
     valid = [level.value for level in EvaluationLevel]
     if evaluation_level is None or evaluation_level == "":
         return None
@@ -1139,13 +1150,13 @@ def _resolve_evaluation_level(evaluation_level, error_target):
     if isinstance(evaluation_level, str):
         try:
             return EvaluationLevel(evaluation_level)
-        except ValueError:
+        except ValueError as exc:
             raise EvaluationException(
                 message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."),
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=error_target,
-            )
+            ) from exc
     raise EvaluationException(
         message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."),
         blame=ErrorBlame.USER_ERROR,

@@ -3,6 +3,7 @@
 
 """Validators package init."""
 
+from ._validation_constants import MessageRole, ContentType
 from ._validator_interface import ValidatorInterface
 from ._conversation_validator import ConversationValidator
 from ._tool_definitions_validator import ToolDefinitionsValidator
@@ -11,6 +12,8 @@
 from ._messages_validator import MessagesOrQueryResponseInputValidator
 
 __all__ = [
+    "MessageRole",
+    "ContentType",
     "ValidatorInterface",
     "ConversationValidator",
     "ToolDefinitionsValidator",

@@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) -
 
         if not isinstance(content_item["text"], str):
             return EvaluationException(
-                message=f"The 'text' field must be a string in content items.",
+                message="The 'text' field must be a string in content items.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
@@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu
         """Validate assistant message content."""
         content = message["content"]
 
-        valid_assistant_content_types = [
-            ContentType.TEXT,
-            ContentType.OUTPUT_TEXT,
-            ContentType.TOOL_CALL,
-            ContentType.FUNCTION_CALL,
-            ContentType.MCP_APPROVAL_REQUEST,
-            ContentType.OPENAPI_CALL,
-        ]
-        valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
         if isinstance(content, list):
+            valid_assistant_content_types = [
+                ContentType.TEXT,
+                ContentType.OUTPUT_TEXT,
+                ContentType.TOOL_CALL,
+                ContentType.FUNCTION_CALL,
+                ContentType.MCP_APPROVAL_REQUEST,
+                ContentType.OPENAPI_CALL,
+            ]
+            valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
             for content_item in content:
                 content_type = content_item["type"]
                 if content_type not in valid_assistant_content_types:
@@ -314,31 +314,30 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation
         )
         if not content_is_string_or_list_of_dicts:
             return EvaluationException(
-                message=f"The 'content' field must be a string or a list of dictionaries messages.",
+                message="The 'content' field must be a string or a list of dictionaries messages.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
             )
 
         if len(content) == 0:
             return EvaluationException(
-                message=f"The 'content' field can't be empty.",
+                message="The 'content' field can't be empty.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
             )
 
         if isinstance(content, list):
-            all_messages_have_type_field = all("type" in item for item in content)
-            if not all_messages_have_type_field:
+            if not all("type" in item for item in content):
                 return EvaluationException(
-                    message=f"Each content item in the 'content' list must contain a 'type' field.",
+                    message="Each content item in the 'content' list must contain a 'type' field.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-        if role in [MessageRole.USER, MessageRole.SYSTEM]:
+        if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]:
             error = self._validate_user_or_system_message(message, role)
             if error:
                 return error

@@ -8,21 +8,40 @@
 
 from typing import Any, Dict
 from typing_extensions import override
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory
-from ._tool_definitions_validator import ToolDefinitionsValidator
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ._validation_constants import MessageRole
+from ._conversation_validator import ConversationValidator
+from ._tool_definitions_validator import ToolDefinitionsValidator
 
 
 class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator):
     """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.
 
-    When ``messages`` is provided, it validates the messages list and optional tool_definitions.
-    Otherwise, it delegates to the parent ``ToolDefinitionsValidator`` for the query/response path.
+    A single implementation serves all evaluators via a behavior flag:
+      - ``enforce_tool_definitions`` (default False): when True, validate ``tool_definitions`` in
+        both the messages path and the query/response path (set it for evaluators that require
+        tool definitions); when False, query/response input goes to ``ConversationValidator``.
     """
 
+    enforce_tool_definitions: bool = False
+
+    def __init__(
+        self,
+        error_target: ErrorTarget,
+        requires_query: bool = True,
+        optional_tool_definitions: bool = True,
+        check_for_unsupported_tools: bool = False,
+        *,
+        enforce_tool_definitions: bool = False,
+    ):
+        """Initialize MessagesOrQueryResponseInputValidator."""
+        super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools)
+        self.enforce_tool_definitions = enforce_tool_definitions
+
     @override
     def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
         """Validate evaluation input, supporting messages as an alternative to query/response."""
+        # Multi-turn path (messages list)
         messages = eval_input.get("messages")
         if messages is not None:
             if not isinstance(messages, list):
@@ -41,31 +60,32 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
                 )
 
             # Per-message structural checks
-            valid_roles = {r.value for r in MessageRole}
+            valid_roles = {role.value for role in MessageRole}
             roles_present: set = set()
-            for i, msg in enumerate(messages):
-                if not isinstance(msg, dict):
+            for index, message in enumerate(messages):
+                if not isinstance(message, dict):
                     raise EvaluationException(
                         message=(
-                            f"Each item in 'messages' must be a dictionary, "
-                            f"but item at index {i} is {type(msg).__name__}."
+                            "Each item in 'messages' must be a dictionary, "
+                            f"but item at index {index} is {type(message).__name__}."
                         ),
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
                         target=self.error_target,
                     )
-                role = msg.get("role")
+                role = message.get("role")
                 if role is None:
                     raise EvaluationException(
-                        message=f"Each message must contain a 'role' key, but message at index {i} is missing it.",
+                        message=f"Each message must contain a 'role' key, but message at index {index} is missing it.",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
                         target=self.error_target,
                     )
                 if role not in valid_roles:
                     raise EvaluationException(
                         message=(
-                            f"Invalid role '{role}' at message index {i}. " f"Must be one of: {sorted(valid_roles)}."
+                            f"Invalid role '{role}' at message index {index}. "
+                            f"Must be one of: {sorted(valid_roles)}."
                         ),
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
@@ -74,24 +94,28 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
                 roles_present.add(role)
 
             # Conversation-level checks
-            if MessageRole.USER not in roles_present:
+            if MessageRole.USER.value not in roles_present:
                 raise EvaluationException(
                     message="messages must contain at least one message with role 'user'.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
-            if MessageRole.ASSISTANT not in roles_present:
+            if MessageRole.ASSISTANT.value not in roles_present:
                 raise EvaluationException(
                     message="messages must contain at least one message with role 'assistant'.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            tool_definitions = eval_input.get("tool_definitions")
-            tool_definitions_error = self._validate_tool_definitions(tool_definitions)
-            if tool_definitions_error:
-                raise tool_definitions_error
+            if self.enforce_tool_definitions:
+                tool_definitions = eval_input.get("tool_definitions")
+                tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions)
+                if tool_definitions_validation_exception:
+                    raise tool_definitions_validation_exception
             return True
-        return super().validate_eval_input(eval_input)
+
+        if self.enforce_tool_definitions:
+            return super().validate_eval_input(eval_input)
+        return ConversationValidator.validate_eval_input(self, eval_input)
@@ -17,17 +17,38 @@ class TaskNavigationEfficiencyValidator(ValidatorInterface):
     """
     Validate task navigation efficiency inputs (response and ground_truth).
 
+    Accepts either the SDK input names (``response``/``ground_truth``) or the
+    azureml-assets names (``actions``/``expected_actions``).
+
     Validates:
-    - response: List of assistant messages containing tool calls
-    - ground_truth: Either a list of expected tool names, or a tuple of (tool names, parameters dict)
+    - response (alias ``actions``): List of assistant messages containing tool calls
+    - ground_truth (alias ``expected_actions``): Either a list of expected tool names, or a
+      tuple of (tool names, parameters dict)
     """
 
     error_target: ErrorTarget
 
+    # Canonical input key -> accepted alternate (azureml-assets) key name.
+    _INPUT_ALIASES: Dict[str, str] = {
+        "response": "actions",
+        "ground_truth": "expected_actions",
+    }
+
     def __init__(self, error_target: ErrorTarget):
         """Initialize with error target."""
         self.error_target = error_target
 
+    def _normalize_input_aliases(self, eval_input: Dict[str, Any]) -> None:
+        """Map azureml-assets-style input keys onto the canonical keys in place.
+
+        If a canonical key (``response``/``ground_truth``) is missing or ``None`` and its alias
+        (``actions``/``expected_actions``) is not ``None``, copy the alias value to the canonical
+        key so the rest of the pipeline can rely on a single set of names. Alias keys are kept.
+        """
+        for canonical, alias in self._INPUT_ALIASES.items():
+            if eval_input.get(canonical) is None and eval_input.get(alias) is not None:
+                eval_input[canonical] = eval_input[alias]
+
     def _validate_response(self, response: Any) -> Optional[EvaluationException]:
         """Validate the response parameter."""
         if response is None:
@@ -221,15 +242,22 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
         """
         Validate task navigation evaluation input.
 
+        Accepts either the SDK input names (``response``/``ground_truth``) or the
+        azureml-assets names (``actions``/``expected_actions``).
+
         Args:
-            eval_input: Dictionary containing 'response' and 'ground_truth'.
+            eval_input: Dictionary containing 'response'/'ground_truth' (or their
+                'actions'/'expected_actions' aliases).
 
         Returns:
             True if validation passes.
 
         Raises:
             EvaluationException: If validation fails.
         """
+        # Normalize azureml-assets-style aliases ('actions'/'expected_actions') onto canonical keys.
+        self._normalize_input_aliases(eval_input)
+
         # If response or ground_truth is a string, try to parse it as JSON
         for key in ("response", "ground_truth"):
             value = eval_input.get(key)

@@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
         # Initialize input validator
         self._validator = ToolCallsValidator(
             error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -92,7 +92,7 @@ def __init__(
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             optional_tool_definitions=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -86,7 +86,9 @@ def __init__(
 
         # Initialize input validator
         self._validator = ToolDefinitionsValidator(
-            error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, optional_tool_definitions=False
+            error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
+            optional_tool_definitions=False,
+            check_for_unsupported_tools=True,
         )
 
         super().__init__(