Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,717 changes: 1,717 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/api.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/api.metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
apiMdSha256: 942f3460a345d8989df4e5dd3163526fec7bb0683b5183a6c043f0a99545e482
parserVersion: 0.3.28
pythonVersion: 3.11.4
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@
)

from . import constants
from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG
from .constants import (
EvaluatorScoringPattern,
EVALUATOR_SCORING_PATTERNS,
SCORING_PATTERN_CONFIG,
EvaluationLevel,
)

_nltk_data_download_lock = threading.Lock()

Expand Down Expand Up @@ -967,7 +972,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
# region Multi-turn utilities


def _merge_query_response_messages(query, response):
def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]:
"""Merge query and response message lists into a single conversation.

:param query: The query messages.
Expand All @@ -980,19 +985,24 @@ def _merge_query_response_messages(query, response):
return [*query, *response]


def _split_messages_at_latest_user(messages):
def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]:
"""Split messages into query/response slices at the latest user turn.

:param messages: The conversation messages.
:type messages: List[dict]
:return: A tuple of (query_messages, response_messages).
:rtype: Tuple[List[dict], List[dict]]
"""
latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == "user")
latest_user_index = max(
(i for i, message in enumerate(messages) if message.get("role") == "user"),
default=-1,
)
if latest_user_index == -1:
raise ValueError("messages must contain at least one message with role 'user'.")
return messages[: latest_user_index + 1], messages[latest_user_index + 1 :]


def _wrap_string_messages(query, response):
def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]:
"""Wrap string query/response into separate message lists.

:param query: The query string.
Expand Down Expand Up @@ -1119,7 +1129,10 @@ def serialize_messages(messages):
return result.rstrip("\n")


def _resolve_evaluation_level(evaluation_level, error_target):
def _resolve_evaluation_level(
evaluation_level: Optional[Union[EvaluationLevel, str]],
error_target: ErrorTarget,
) -> Optional[EvaluationLevel]:
"""Validate and normalize the evaluation_level parameter.

:param evaluation_level: The evaluation level to resolve.
Expand All @@ -1129,8 +1142,6 @@ def _resolve_evaluation_level(evaluation_level, error_target):
:return: The resolved EvaluationLevel or None for auto-detect.
:rtype: Optional[EvaluationLevel]
"""
from .constants import EvaluationLevel

valid = [level.value for level in EvaluationLevel]
if evaluation_level is None or evaluation_level == "":
return None
Expand All @@ -1139,13 +1150,13 @@ def _resolve_evaluation_level(evaluation_level, error_target):
if isinstance(evaluation_level, str):
try:
return EvaluationLevel(evaluation_level)
except ValueError:
except ValueError as exc:
raise EvaluationException(
message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."),
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=error_target,
)
) from exc
raise EvaluationException(
message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."),
blame=ErrorBlame.USER_ERROR,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Validators package init."""

from ._validation_constants import MessageRole, ContentType
from ._validator_interface import ValidatorInterface
from ._conversation_validator import ConversationValidator
from ._tool_definitions_validator import ToolDefinitionsValidator
Expand All @@ -11,6 +12,8 @@
from ._messages_validator import MessagesOrQueryResponseInputValidator

__all__ = [
"MessageRole",
"ContentType",
"ValidatorInterface",
"ConversationValidator",
"ToolDefinitionsValidator",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) -

if not isinstance(content_item["text"], str):
return EvaluationException(
message=f"The 'text' field must be a string in content items.",
message="The 'text' field must be a string in content items.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
Expand Down Expand Up @@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu
"""Validate assistant message content."""
content = message["content"]

valid_assistant_content_types = [
ContentType.TEXT,
ContentType.OUTPUT_TEXT,
ContentType.TOOL_CALL,
ContentType.FUNCTION_CALL,
ContentType.MCP_APPROVAL_REQUEST,
ContentType.OPENAPI_CALL,
]
valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
if isinstance(content, list):
valid_assistant_content_types = [
ContentType.TEXT,
ContentType.OUTPUT_TEXT,
ContentType.TOOL_CALL,
ContentType.FUNCTION_CALL,
ContentType.MCP_APPROVAL_REQUEST,
ContentType.OPENAPI_CALL,
]
valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
for content_item in content:
content_type = content_item["type"]
if content_type not in valid_assistant_content_types:
Expand Down Expand Up @@ -314,31 +314,30 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation
)
if not content_is_string_or_list_of_dicts:
return EvaluationException(
message=f"The 'content' field must be a string or a list of dictionaries messages.",
message="The 'content' field must be a string or a list of dictionaries messages.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

if len(content) == 0:
return EvaluationException(
message=f"The 'content' field can't be empty.",
message="The 'content' field can't be empty.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

if isinstance(content, list):
all_messages_have_type_field = all("type" in item for item in content)
if not all_messages_have_type_field:
if not all("type" in item for item in content):
return EvaluationException(
message=f"Each content item in the 'content' list must contain a 'type' field.",
message="Each content item in the 'content' list must contain a 'type' field.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

if role in [MessageRole.USER, MessageRole.SYSTEM]:
if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]:
error = self._validate_user_or_system_message(message, role)
if error:
return error
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,40 @@

from typing import Any, Dict
from typing_extensions import override
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory
from ._tool_definitions_validator import ToolDefinitionsValidator
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from ._validation_constants import MessageRole
from ._conversation_validator import ConversationValidator
from ._tool_definitions_validator import ToolDefinitionsValidator


class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator):
Comment thread
m7md7sien marked this conversation as resolved.
"""Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.

When ``messages`` is provided, it validates the messages list and optional tool_definitions.
Otherwise, it delegates to the parent ``ToolDefinitionsValidator`` for the query/response path.
A single implementation serves all evaluators via a behavior flag:
- ``enforce_tool_definitions`` (default False): validate ``tool_definitions`` in both the
messages path and the query/response path. Set True for evaluators that require
tool definitions.
"""

enforce_tool_definitions: bool = False

def __init__(
    self,
    error_target: ErrorTarget,
    requires_query: bool = True,
    optional_tool_definitions: bool = True,
    check_for_unsupported_tools: bool = False,
    *,
    enforce_tool_definitions: bool = False,
):
    """Initialize MessagesOrQueryResponseInputValidator.

    The four leading arguments are forwarded positionally, unchanged, to the
    parent initializer. ``enforce_tool_definitions`` is keyword-only and is
    stored on the instance.
    """
    super().__init__(
        error_target,
        requires_query,
        optional_tool_definitions,
        check_for_unsupported_tools,
    )
    self.enforce_tool_definitions = enforce_tool_definitions

@override
def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
"""Validate evaluation input, supporting messages as an alternative to query/response."""
# Multi-turn path (messages list)
messages = eval_input.get("messages")
if messages is not None:
if not isinstance(messages, list):
Expand All @@ -41,31 +60,32 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
)

# Per-message structural checks
valid_roles = {r.value for r in MessageRole}
valid_roles = {role.value for role in MessageRole}
roles_present: set = set()
for i, msg in enumerate(messages):
if not isinstance(msg, dict):
for index, message in enumerate(messages):
if not isinstance(message, dict):
raise EvaluationException(
message=(
f"Each item in 'messages' must be a dictionary, "
f"but item at index {i} is {type(msg).__name__}."
"Each item in 'messages' must be a dictionary, "
f"but item at index {index} is {type(message).__name__}."
),
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)
role = msg.get("role")
role = message.get("role")
if role is None:
raise EvaluationException(
message=f"Each message must contain a 'role' key, but message at index {i} is missing it.",
message=f"Each message must contain a 'role' key, but message at index {index} is missing it.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)
if role not in valid_roles:
raise EvaluationException(
message=(
f"Invalid role '{role}' at message index {i}. " f"Must be one of: {sorted(valid_roles)}."
f"Invalid role '{role}' at message index {index}. "
f"Must be one of: {sorted(valid_roles)}."
),
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
Expand All @@ -74,24 +94,28 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
roles_present.add(role)

# Conversation-level checks
if MessageRole.USER not in roles_present:
if MessageRole.USER.value not in roles_present:
raise EvaluationException(
message="messages must contain at least one message with role 'user'.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)
if MessageRole.ASSISTANT not in roles_present:
if MessageRole.ASSISTANT.value not in roles_present:
raise EvaluationException(
message="messages must contain at least one message with role 'assistant'.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=self.error_target,
)

tool_definitions = eval_input.get("tool_definitions")
tool_definitions_error = self._validate_tool_definitions(tool_definitions)
if tool_definitions_error:
raise tool_definitions_error
if self.enforce_tool_definitions:
tool_definitions = eval_input.get("tool_definitions")
tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions)
if tool_definitions_validation_exception:
raise tool_definitions_validation_exception
return True
return super().validate_eval_input(eval_input)

if self.enforce_tool_definitions:
return super().validate_eval_input(eval_input)
return ConversationValidator.validate_eval_input(self, eval_input)
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,38 @@ class TaskNavigationEfficiencyValidator(ValidatorInterface):
"""
Validate task navigation efficiency inputs (response and ground_truth).

Accepts either the SDK input names (``response``/``ground_truth``) or the
azureml-assets names (``actions``/``expected_actions``).

Validates:
- response: List of assistant messages containing tool calls
- ground_truth: Either a list of expected tool names, or a tuple of (tool names, parameters dict)
- response (alias ``actions``): List of assistant messages containing tool calls
- ground_truth (alias ``expected_actions``): Either a list of expected tool names, or a
tuple of (tool names, parameters dict)
"""

error_target: ErrorTarget

# Canonical input key -> accepted alternate (azureml-assets) key name.
_INPUT_ALIASES: Dict[str, str] = {
"response": "actions",
"ground_truth": "expected_actions",
}

def __init__(self, error_target: ErrorTarget):
    """Initialize with error target.

    Args:
        error_target: Stored unchanged on the instance as ``error_target``.
            Presumably used as the ``target`` of raised ``EvaluationException``
            objects, as in the sibling validators -- confirm against the rest
            of this class.
    """
    self.error_target = error_target

def _normalize_input_aliases(self, eval_input: Dict[str, Any]) -> None:
"""Map azureml-assets-style input keys onto the canonical keys in place.

If a canonical key (``response``/``ground_truth``) is absent but its alias
(``actions``/``expected_actions``) is provided, copy the alias value to the canonical
key so the rest of the pipeline can rely on a single set of names.
"""
for canonical, alias in self._INPUT_ALIASES.items():
if eval_input.get(canonical) is None and eval_input.get(alias) is not None:
eval_input[canonical] = eval_input[alias]

def _validate_response(self, response: Any) -> Optional[EvaluationException]:
"""Validate the response parameter."""
if response is None:
Expand Down Expand Up @@ -221,15 +242,22 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
"""
Validate task navigation evaluation input.

Accepts either the SDK input names (``response``/``ground_truth``) or the
azureml-assets names (``actions``/``expected_actions``).

Args:
eval_input: Dictionary containing 'response' and 'ground_truth'.
eval_input: Dictionary containing 'response'/'ground_truth' (or their
'actions'/'expected_actions' aliases).

Returns:
True if validation passes.

Raises:
EvaluationException: If validation fails.
"""
# Normalize azureml-assets-style aliases ('actions'/'expected_actions') onto canonical keys.
self._normalize_input_aliases(eval_input)

Comment thread
m7md7sien marked this conversation as resolved.
# If response or ground_truth is a string, try to parse it as JSON
for key in ("response", "ground_truth"):
value = eval_input.get(key)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
# Initialize input validator
self._validator = ToolCallsValidator(
error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
Comment thread
m7md7sien marked this conversation as resolved.
)

super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(
self._validator = ToolDefinitionsValidator(
error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
optional_tool_definitions=False,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def __init__(

# Initialize input validator
self._validator = ToolDefinitionsValidator(
error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, optional_tool_definitions=False
error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
optional_tool_definitions=False,
check_for_unsupported_tools=True,
Comment thread
m7md7sien marked this conversation as resolved.
)

super().__init__(
Expand Down
Loading
Loading