diff --git a/sdk/evaluation/azure-ai-evaluation/api.md b/sdk/evaluation/azure-ai-evaluation/api.md new file mode 100644 index 000000000000..43b64879c6a6 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/api.md @@ -0,0 +1,1717 @@ +```py +namespace azure.ai.evaluation + + def azure.ai.evaluation.evaluate( + *, + azure_ai_project: Optional[Union[str, AzureAIProject]] = ..., + data: Union[str, PathLike], + evaluation_name: Optional[str] = ..., + evaluator_config: Optional[Dict[str, EvaluatorConfig]] = ..., + evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]], + fail_on_evaluator_errors: bool = False, + output_path: Optional[Union[str, PathLike]] = ..., + tags: Optional[Dict[str, str]] = ..., + target: Optional[Callable] = ..., + user_agent: Optional[str] = ..., + **kwargs + ) -> EvaluationResult: ... + + + class azure.ai.evaluation.AzureAIProject(TypedDict): + key "project_name": str + key "resource_group_name": str + key "subscription_id": str + + + @experimental + class azure.ai.evaluation.AzureOpenAIGrader: + id = azureai://built-in/evaluators/azure-openai/custom_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + grader_config: Dict[str, Any], + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAILabelGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/label_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + input: List[Dict[str, str]], + labels: List[str], + model: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + passing_labels: List[str], + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + class azure.ai.evaluation.AzureOpenAIModelConfiguration(TypedDict): + key "api_key": NotRequired[str] + key "api_version": NotRequired[str] + key "azure_deployment": str + key "azure_endpoint": str + key "credential": NotRequired[Any] + key "type": NotRequired[Literal["azure_openai"]] + + + @experimental + class azure.ai.evaluation.AzureOpenAIPythonGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/python_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + image_tag: Optional[str] = ..., + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + pass_threshold: float, + source: str, + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAIScoreModelGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/score_model_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + input: List[Dict[str, str]], + model: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + pass_threshold: Optional[float] = ..., + range: Optional[List[float]] = ..., + sampling_params: Optional[Dict[str, Any]] = ..., + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAIStringCheckGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/string_check_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + input: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + operation: Literal["eq", "ne", "like", "ilike"], + reference: str, + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + @experimental + class azure.ai.evaluation.AzureOpenAITextSimilarityGrader(AzureOpenAIGrader): + id = azureai://built-in/evaluators/azure-openai/text_similarity_grader + + def __init__( + self, + *, + credential: Optional[TokenCredential] = ..., + evaluation_metric: Literal["fuzzy_match", "bleu", "gleu", "meteor", "rouge_1", "rouge_2", "rouge_3", "rouge_4", "rouge_5", "rouge_l", "cosine"], + input: str, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + name: str, + pass_threshold: float, + reference: str, + **kwargs: Any + ): ... + + def get_client(self) -> Any: ... + + + class azure.ai.evaluation.BleuScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/bleu_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ): ... + + def __init__( + self, + *, + threshold = 0.5 + ): ... + + + @experimental + class azure.ai.evaluation.CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/code_vulnerability + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs: Any + ): ... + + + class azure.ai.evaluation.CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/coherence + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/content_safety + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + hate_unfairness_threshold: int = 3, + self_harm_threshold: int = 3, + sexual_threshold: int = 3, + violence_threshold: int = 3, + **kwargs: Any + ): ... + + + class azure.ai.evaluation.Conversation(TypedDict): + key "context": NotRequired[Dict[str, Any]] + key "messages": Union[List[Message], List[Dict]] + + + class azure.ai.evaluation.EvaluationResult(TypedDict): + key "metrics": Dict + key "oai_eval_run_ids": NotRequired[List[Dict[str, str]]] + key "rows": List[Dict] + key "studio_url": NotRequired[str] + + + class azure.ai.evaluation.EvaluatorConfig(TypedDict, total=False): + key "column_mapping": Dict[str, str] + + + class azure.ai.evaluation.F1ScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/f1_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, float]: ... + + def __init__( + self, + *, + threshold = 0.5 + ): ... + + + class azure.ai.evaluation.FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/fluency + + @overload + def __call__( + self, + *, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + class azure.ai.evaluation.GleuScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/gleu_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ): ... + + @override + def __init__( + self, + *, + threshold = 0.5 + ): ... + + + class azure.ai.evaluation.GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/groundedness + + @overload + def __call__( + self, + *, + context: str, + query: Optional[str] = ..., + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + query: str, + response: List[dict], + tool_definitions: Optional[List[dict]] = ... + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/groundedness_pro + + @overload + def __call__( + self, + *, + context: str, + query: str, + response: str + ) -> Dict[str, Union[str, bool]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 5, + **kwargs: Any + ): ... + + + @experimental + class azure.ai.evaluation.HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/hate_unfairness + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/indirect_attack + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, bool]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/intent_resolution + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]], + tool_definitions: Optional[Union[dict, List[dict]]] = ... + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD, + **kwargs + ): ... + + + class azure.ai.evaluation.Message(TypedDict): + key "content": Union[str, List[Dict]] + key "context": NotRequired[Dict[str, Any]] + key "role": str + + + class azure.ai.evaluation.MeteorScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/meteor_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, float]: ... + + @override + def __init__( + self, + alpha: float = 0.9, + beta: float = 3.0, + gamma: float = 0.5, + *, + threshold: float = 0.5 + ): ... + + + class azure.ai.evaluation.OpenAIModelConfiguration(TypedDict): + key "api_key": str + key "base_url": NotRequired[str] + key "extra_headers": NotRequired[Dict[str, str]] + key "model": str + key "organization": NotRequired[str] + key "type": NotRequired[Literal["openai"]] + + + @experimental + class azure.ai.evaluation.ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/protected_material + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, bool]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs + ): ... + + + class azure.ai.evaluation.QAEvaluator(MultiEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/qa + + @overload + def __call__( + self, + *, + context: str, + ground_truth: str, + query: str, + response: str + ): ... + + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + coherence_threshold: int = 3, + f1_score_threshold: float = 0.5, + fluency_threshold: int = 3, + groundedness_threshold: int = 3, + relevance_threshold: int = 3, + similarity_threshold: int = 3, + **kwargs: Any + ): ... + + + class azure.ai.evaluation.RelevanceEvaluator(PromptyEvaluatorBase): + id = azureai://built-in/evaluators/relevance + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/response_completeness + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, + **kwargs + ): ... + + + class azure.ai.evaluation.RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/retrieval + + @overload + def __call__( + self, + *, + context: str, + query: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold: float = 3, + **kwargs + ) -> Callable: ... + + + class azure.ai.evaluation.RougeScoreEvaluator(EvaluatorBase): + id = azureai://built-in/evaluators/rouge_score + + @overload + def __call__( + self, + *, + ground_truth: str, + response: str + ) -> Dict[str, float]: ... + + @override + def __init__( + self, + rouge_type: RougeType, + *, + f1_score_threshold: float = 0.5, + precision_threshold: float = 0.5, + recall_threshold: float = 0.5 + ): ... + + + class azure.ai.evaluation.RougeType(str, Enum): + ROUGE_1 = "rouge1" + ROUGE_2 = "rouge2" + ROUGE_3 = "rouge3" + ROUGE_4 = "rouge4" + ROUGE_5 = "rouge5" + ROUGE_L = "rougeL" + + + @experimental + class azure.ai.evaluation.SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/self_harm + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/sexual + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + + class azure.ai.evaluation.SimilarityEvaluator(PromptyEvaluatorBase): + id = azureai://built-in/evaluators/similarity + + @overload + def __call__( + self, + *, + ground_truth: str, + query: str, + response: str + ) -> Dict[str, float]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + is_reasoning_model: bool = ..., + threshold = 3, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/task_adherence + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]], + tool_definitions: Optional[Union[dict, List[dict]]] = ... + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold = _DEFAULT_TASK_ADHERENCE_SCORE, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/tool_call_accuracy + + @overload + def __call__( + self, + *, + query: Union[str, List[dict]], + response: Union[str, List[dict]] = ..., + tool_calls: Union[dict, List[dict]] = ..., + tool_definitions: Union[dict, List[dict]] + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + *, + credential = ..., + threshold = _DEFAULT_TOOL_CALL_ACCURACY_SCORE, + **kwargs + ): ... + + + @experimental + class azure.ai.evaluation.UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]): + id = azureai://built-in/evaluators/ungrounded_attributes + + @overload + def __call__( + self, + *, + context: str, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + **kwargs: Any + ): ... + + + @experimental + class azure.ai.evaluation.ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]): + id = azureai://built-in/evaluators/violence + + @overload + def __call__( + self, + *, + query: str, + response: str + ) -> Dict[str, Union[str, float]]: ... + + @overload + def __call__( + self, + *, + conversation: Conversation + ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]: ... + + @override + def __init__( + self, + credential: TokenCredential, + azure_ai_project: Union[str, AzureAIProject], + *, + threshold: int = 3, + **kwargs + ): ... + + +namespace azure.ai.evaluation.autogen.raiclient + + class azure.ai.evaluation.autogen.raiclient.MachineLearningServicesClient: implements ContextManager + rai_svc: RAISvcOperations + + def __init__( + self, + endpoint: str, + subscription_id: str, + resource_group_name: str, + workspace_name: str, + credential: TokenCredential, + *, + api_version: str = ..., + **kwargs: Any + ) -> None: ... + + def close(self) -> None: ... + + def send_request( + self, + request: HttpRequest, + *, + stream: bool = False, + **kwargs: Any + ) -> HttpResponse: ... + + +namespace azure.ai.evaluation.autogen.raiclient.aio + + class azure.ai.evaluation.autogen.raiclient.aio.MachineLearningServicesClient: implements AsyncContextManager + rai_svc: RAISvcOperations + + def __init__( + self, + endpoint: str, + subscription_id: str, + resource_group_name: str, + workspace_name: str, + credential: AsyncTokenCredential, + *, + api_version: str = ..., + **kwargs: Any + ) -> None: ... + + async def close(self) -> None: ... + + def send_request( + self, + request: HttpRequest, + *, + stream: bool = False, + **kwargs: Any + ) -> Awaitable[AsyncHttpResponse]: ... + + +namespace azure.ai.evaluation.autogen.raiclient.aio.operations + + class azure.ai.evaluation.autogen.raiclient.aio.operations.RAISvcOperations: + + def __init__( + self, + *args, + **kwargs + ) -> None: ... + + @distributed_trace_async + async def get_annotation(self, **kwargs: Any) -> List[str]: ... + + @distributed_trace_async + async def get_attack_objectives( + self, + *, + lang: str, + risk_types: List[str], + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_jail_break_dataset(self, **kwargs: Any) -> str: ... + + @distributed_trace_async + async def get_jail_break_dataset_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_operation_result( + self, + operation_id: str, + *, + api_key: Optional[str] = ..., + model_endpoint: Optional[str] = ..., + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_template_parameters(self, **kwargs: Any) -> str: ... + + @distributed_trace_async + async def get_template_parameters_image( + self, + *, + path: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace_async + async def get_template_parameters_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @overload + async def submit_annotation( + self, + body: AnnotationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_annotation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_annotation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_aoai_evaluation( + self, + body: GradersDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_aoai_evaluation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_aoai_evaluation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_simulation( + self, + body: SimulationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_simulation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + async def submit_simulation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + +namespace azure.ai.evaluation.autogen.raiclient.models + + class azure.ai.evaluation.autogen.raiclient.models.AnnotationDTO(Model): + annotation_task: str + content_type: str + contents: List[Content] + metric_list: List[str] + prompt_version: str + user_text_list: List[str] + + @overload + def __init__( + self, + *, + annotation_task: str, + content_type: str, + contents: List[Content], + metric_list: List[str], + prompt_version: str, + user_text_list: List[str] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.Content(Model): + messages: List[Any] + + @overload + def __init__( + self, + *, + messages: List[Any] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.CustomizationParameters(Model): + application_scenario: Optional[str] + harm_categories: List[str] + + @overload + def __init__( + self, + *, + application_scenario: Optional[str] = ..., + harm_categories: List[str] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.Data(Model): + asset_id: str + + @overload + def __init__( + self, + *, + asset_id: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.Grader(Model): + config: GraderConfigBase + description: str + name: str + + @overload + def __init__( + self, + *, + config: GraderConfigBase, + description: str, + name: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.GraderConfigBase(Model): + type: str + + @overload + def __init__( + self, + *, + type: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.GradersDTO(Model): + data: Data + graders: List[Grader] + model_config: ModelConfig + sample_generators: List[SampleGenerator] + + @overload + def __init__( + self, + *, + data: Data, + graders: List[Grader], + model_config: ModelConfig, + sample_generators: List[SampleGenerator] + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.LongRunningResponse(Model): + location: str + operation_result: Any + + @overload + def __init__( + self, + *, + location: str, + operation_result: Any + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.ModelConfig(Model): + azure_endpoint: str + + @overload + def __init__( + self, + *, + azure_endpoint: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.SampleGenerator(Model): + model_name: str + sampling_params: Any + trajectory_template: Any + type: str + + @overload + def __init__( + self, + *, + model_name: str, + sampling_params: Any, + trajectory_template: Any, + type: str + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.SimulationDTO(Model): + customization_parameters: Optional[CustomizationParameters] + headers: Optional[Dict[str, str]] + is_microsoft_tenant: Optional[bool] + json: Optional[str] + params: Optional[Dict[str, str]] + resource_group_name: Optional[str] + simulation_type: Optional[Union[str, SimulationType]] + subscription_id: Optional[str] + template_key: Optional[str] + template_parameters: Optional[Dict[str, str]] + url: Optional[str] + workspace_name: Optional[str] + + @overload + def __init__( + self, + *, + customization_parameters: Optional[CustomizationParameters] = ..., + headers: Optional[Dict[str, str]] = ..., + is_microsoft_tenant: Optional[bool] = ..., + json: Optional[str] = ..., + params: Optional[Dict[str, str]] = ..., + resource_group_name: Optional[str] = ..., + simulation_type: Optional[Union[str, SimulationType]] = ..., + subscription_id: Optional[str] = ..., + template_key: Optional[str] = ..., + template_parameters: Optional[Dict[str, str]] = ..., + url: Optional[str] = ..., + workspace_name: Optional[str] = ... + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: ... + + + class azure.ai.evaluation.autogen.raiclient.models.SimulationType(str, Enum, metaclass=CaseInsensitiveEnumMeta): + CUSTOM_PERSONA = "CustomPersona" + DEFAULT = "Default" + HARM_TURN_GENERATOR = "HarmTurnGenerator" + + +namespace azure.ai.evaluation.autogen.raiclient.operations + + class azure.ai.evaluation.autogen.raiclient.operations.RAISvcOperations: + + def __init__( + self, + *args, + **kwargs + ): ... + + @distributed_trace + def get_annotation(self, **kwargs: Any) -> List[str]: ... + + @distributed_trace + def get_attack_objectives( + self, + *, + lang: str, + risk_category: Optional[str] = ..., + risk_types: List[str], + strategy: Optional[str] = ..., + target_type: Optional[str] = ..., + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_jail_break_dataset(self, **kwargs: Any) -> str: ... + + @distributed_trace + def get_jail_break_dataset_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_operation_result( + self, + operation_id: str, + *, + api_key: Optional[str] = ..., + model_endpoint: Optional[str] = ..., + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_template_parameters(self, **kwargs: Any) -> str: ... + + @distributed_trace + def get_template_parameters_image( + self, + *, + path: str, + **kwargs: Any + ) -> str: ... + + @distributed_trace + def get_template_parameters_with_type( + self, + type: str, + **kwargs: Any + ) -> str: ... + + @overload + def submit_annotation( + self, + body: AnnotationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_annotation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_annotation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_aoai_evaluation( + self, + body: GradersDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_aoai_evaluation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_aoai_evaluation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_simulation( + self, + body: SimulationDTO, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_simulation( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + @overload + def submit_simulation( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> LongRunningResponse: ... + + +namespace azure.ai.evaluation.red_team + + @experimental + class azure.ai.evaluation.red_team.AttackStrategy(Enum): + AnsiAttack = "ansi_attack" + AsciiArt = "ascii_art" + AsciiSmuggler = "ascii_smuggler" + Atbash = "atbash" + Base64 = "base64" + Baseline = "baseline" + Binary = "binary" + Caesar = "caesar" + CharSwap = "char_swap" + CharacterSpace = "character_space" + Crescendo = "crescendo" + DIFFICULT = "difficult" + Diacritic = "diacritic" + EASY = "easy" + Flip = "flip" + IndirectJailbreak = "indirect_jailbreak" + Jailbreak = "jailbreak" + Leetspeak = "leetspeak" + MODERATE = "moderate" + Morse = "morse" + MultiTurn = "multi_turn" + ROT13 = "rot13" + StringJoin = "string_join" + SuffixAppend = "suffix_append" + Tense = "tense" + UnicodeConfusable = "unicode_confusable" + UnicodeSubstitution = "unicode_substitution" + Url = "url" + + + @experimental + class azure.ai.evaluation.red_team.RedTeam: + + def __init__( + self, + azure_ai_project: Union[dict, str], + credential: TokenCredential, + *, + application_scenario: Optional[str] = ..., + attack_success_thresholds: Optional[Dict[RiskCategory, int]] = ..., + custom_attack_seed_prompts: Optional[str] = ..., + language: SupportedLanguages = SupportedLanguages.English, + num_objectives: int = 10, + output_dir = ".", + risk_categories: Optional[List[RiskCategory]] = ..., + **kwargs + ): ... + + async def scan( + self, + target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration, PromptChatTarget], + *, + application_scenario: Optional[str] = ..., + attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]] = [], + max_parallel_tasks: int = 5, + output_path: Optional[Union[str, PathLike]] = ..., + parallel_execution: bool = True, + scan_name: Optional[str] = ..., + skip_evals: bool = False, + skip_upload: bool = False, + timeout: int = 3600, + **kwargs: Any + ) -> RedTeamResult: ... + + + @experimental + class azure.ai.evaluation.red_team.RedTeamResult: + + def __init__( + self, + scan_result: Optional[ScanResult] = None, + attack_details: Optional[List[AttackDetails]] = None + ): ... + + def attack_simulation(self) -> str: ... + + def to_eval_qr_json_lines(self) -> str: ... + + def to_json(self) -> str: ... + + def to_scorecard(self) -> Optional[RedTeamingScorecard]: ... + + + @experimental + class azure.ai.evaluation.red_team.RiskCategory(str, Enum): + CodeVulnerability = "code_vulnerability" + HateUnfairness = "hate_unfairness" + ProhibitedActions = "prohibited_actions" + ProtectedMaterial = "protected_material" + SelfHarm = "self_harm" + SensitiveDataLeakage = "sensitive_data_leakage" + Sexual = "sexual" + TaskAdherence = "task_adherence" + UngroundedAttributes = "ungrounded_attributes" + Violence = "violence" + + + @experimental + class azure.ai.evaluation.red_team.SupportedLanguages(Enum): + English = "en" + French = "fr" + German = "de" + Italian = "it" + Japanese = "ja" + Korean = "ko" + Portuguese = "pt" + SimplifiedChinese = "zh-cn" + Spanish = "es" + + +namespace azure.ai.evaluation.simulator + + @experimental + class azure.ai.evaluation.simulator.AdversarialScenario(Enum): + ADVERSARIAL_CODE_VULNERABILITY = "adv_code_vuln" + ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded" + ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded" + ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material" + ADVERSARIAL_CONVERSATION = "adv_conversation" + ADVERSARIAL_QA = "adv_qa" + ADVERSARIAL_QA_DOCUMENTS = "adv_qa_documents" + ADVERSARIAL_REWRITE = "adv_rewrite" + ADVERSARIAL_SEARCH = "adv_search" + ADVERSARIAL_SUMMARIZATION = "adv_summarization" + ADVERSARIAL_UNGROUNDED_ATTRIBUTES = "adv_isa" + + + @experimental + class azure.ai.evaluation.simulator.AdversarialScenarioJailbreak(Enum): + ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia" + + + @experimental + class azure.ai.evaluation.simulator.AdversarialSimulator: + + async def __call__( + self, + *, + api_call_delay_sec: int = 0, + api_call_retry_limit: int = 3, + api_call_retry_sleep_sec: int = 1, + concurrent_async_task: int = 3, + language: SupportedLanguages = SupportedLanguages.English, + max_conversation_turns: int = 1, + max_simulation_results: int = 3, + randomization_seed: Optional[int] = ..., + randomize_order: bool = True, + scenario: AdversarialScenario, + target: Callable, + **kwargs + ) -> List[Dict[str, Any]]: ... + + def __init__( + self, + *, + azure_ai_project: Union[str, AzureAIProject], + credential: TokenCredential + ): ... + + def call_sync( + self, + *, + api_call_delay_sec: int, + api_call_retry_limit: int, + api_call_retry_sleep_sec: int, + concurrent_async_task: int, + max_conversation_turns: int, + max_simulation_results: int, + scenario: AdversarialScenario, + target: Callable + ) -> List[Dict[str, Any]]: ... + + + @experimental + class azure.ai.evaluation.simulator.DirectAttackSimulator: + + async def __call__( + self, + *, + api_call_delay_sec: int = 0, + api_call_retry_limit: int = 3, + api_call_retry_sleep_sec: int = 1, + concurrent_async_task: int = 3, + max_conversation_turns: int = 1, + max_simulation_results: int = 3, + randomization_seed: Optional[int] = ..., + scenario: AdversarialScenario, + target: Callable + ) -> Dict[str, [List[Dict[str, Any]]]]: ... + + def __init__( + self, + *, + azure_ai_project: Union[str, AzureAIProject], + credential: TokenCredential + ): ... + + + @experimental + class azure.ai.evaluation.simulator.IndirectAttackSimulator(AdversarialSimulator): + + async def __call__( + self, + *, + api_call_delay_sec: int = 0, + api_call_retry_limit: int = 3, + api_call_retry_sleep_sec: int = 1, + concurrent_async_task: int = 3, + max_simulation_results: int = 3, + randomization_seed: Optional[int] = ..., + target: Callable, + **kwargs + ) -> List[Dict[str, Any]]: ... + + def __init__( + self, + *, + azure_ai_project: Union[str, AzureAIProject], + credential: TokenCredential + ): ... + + def call_sync( + self, + *, + api_call_delay_sec: int, + api_call_retry_limit: int, + api_call_retry_sleep_sec: int, + concurrent_async_task: int, + max_conversation_turns: int, + max_simulation_results: int, + scenario: AdversarialScenario, + target: Callable + ) -> List[Dict[str, Any]]: ... + + + @experimental + class azure.ai.evaluation.simulator.Simulator: + + async def __call__( + self, + *, + api_call_delay_sec: float = 1, + concurrent_async_tasks: int = 5, + conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [], + max_conversation_turns: int = 5, + num_queries: int = 5, + query_response_generating_prompty: Optional[str] = ..., + query_response_generating_prompty_options: Dict[str, Any] = {}, + randomization_seed: Optional[int] = ..., + target: Callable, + tasks: List[str] = [], + text: str = "", + user_simulator_prompty: Optional[str] = ..., + user_simulator_prompty_options: Dict[str, Any] = {}, + **kwargs + ) -> List[JsonLineChatProtocol]: ... + + def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]): ... + + + class azure.ai.evaluation.simulator.SupportedLanguages(Enum): + English = "en" + French = "fr" + German = "de" + Italian = "it" + Japanese = "ja" + Korean = "ko" + Portuguese = "pt" + SimplifiedChinese = "zh-cn" + Spanish = "es" + + +``` \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/api.metadata.yml b/sdk/evaluation/azure-ai-evaluation/api.metadata.yml new file mode 100644 index 000000000000..9dd46c51df57 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/api.metadata.yml @@ -0,0 +1,3 @@ +apiMdSha256: 942f3460a345d8989df4e5dd3163526fec7bb0683b5183a6c043f0a99545e482 +parserVersion: 0.3.28 +pythonVersion: 3.11.4 diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py index 89d2ecbf0286..d0e7f8e0f77e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py @@ -21,7 +21,12 @@ ) from . import constants -from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG +from .constants import ( + EvaluatorScoringPattern, + EVALUATOR_SCORING_PATTERNS, + SCORING_PATTERN_CONFIG, + EvaluationLevel, +) _nltk_data_download_lock = threading.Lock() @@ -967,7 +972,7 @@ def upload(path: str, container_client: ContainerClient, logger=None): # region Multi-turn utilities -def _merge_query_response_messages(query, response): +def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: """Merge query and response message lists into a single conversation. :param query: The query messages. @@ -980,7 +985,7 @@ def _merge_query_response_messages(query, response): return [*query, *response] -def _split_messages_at_latest_user(messages): +def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: """Split messages into query/response slices at the latest user turn. :param messages: The conversation messages. @@ -988,11 +993,16 @@ def _split_messages_at_latest_user(messages): :return: A tuple of (query_messages, response_messages). :rtype: Tuple[List[dict], List[dict]] """ - latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == "user") + latest_user_index = max( + (i for i, message in enumerate(messages) if message.get("role") == "user"), + default=-1, + ) + if latest_user_index == -1: + raise ValueError("messages must contain at least one message with role 'user'.") return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] -def _wrap_string_messages(query, response): +def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: """Wrap string query/response into separate message lists. :param query: The query string. @@ -1119,7 +1129,10 @@ def serialize_messages(messages): return result.rstrip("\n") -def _resolve_evaluation_level(evaluation_level, error_target): +def _resolve_evaluation_level( + evaluation_level: Optional[Union[EvaluationLevel, str]], + error_target: ErrorTarget, +) -> Optional[EvaluationLevel]: """Validate and normalize the evaluation_level parameter. :param evaluation_level: The evaluation level to resolve. @@ -1129,8 +1142,6 @@ def _resolve_evaluation_level(evaluation_level, error_target): :return: The resolved EvaluationLevel or None for auto-detect. :rtype: Optional[EvaluationLevel] """ - from .constants import EvaluationLevel - valid = [level.value for level in EvaluationLevel] if evaluation_level is None or evaluation_level == "": return None @@ -1139,13 +1150,13 @@ def _resolve_evaluation_level(evaluation_level, error_target): if isinstance(evaluation_level, str): try: return EvaluationLevel(evaluation_level) - except ValueError: + except ValueError as exc: raise EvaluationException( message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=error_target, - ) + ) from exc raise EvaluationException( message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), blame=ErrorBlame.USER_ERROR, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py index 08656278bdcd..78905602485e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py @@ -3,6 +3,7 @@ """Validators package init.""" +from ._validation_constants import MessageRole, ContentType from ._validator_interface import ValidatorInterface from ._conversation_validator import ConversationValidator from ._tool_definitions_validator import ToolDefinitionsValidator @@ -11,6 +12,8 @@ from ._messages_validator import MessagesOrQueryResponseInputValidator __all__ = [ + "MessageRole", + "ContentType", "ValidatorInterface", "ConversationValidator", "ToolDefinitionsValidator", diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py index 5e43f0265b0f..6a8f9feb299b 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py @@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) - if not isinstance(content_item["text"], str): return EvaluationException( - message=f"The 'text' field must be a string in content items.", + message="The 'text' field must be a string in content items.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu """Validate assistant message content.""" content = message["content"] - valid_assistant_content_types = [ - ContentType.TEXT, - ContentType.OUTPUT_TEXT, - ContentType.TOOL_CALL, - ContentType.FUNCTION_CALL, - ContentType.MCP_APPROVAL_REQUEST, - ContentType.OPENAPI_CALL, - ] - valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] if isinstance(content, list): + valid_assistant_content_types = [ + ContentType.TEXT, + ContentType.OUTPUT_TEXT, + ContentType.TOOL_CALL, + ContentType.FUNCTION_CALL, + ContentType.MCP_APPROVAL_REQUEST, + ContentType.OPENAPI_CALL, + ] + valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] for content_item in content: content_type = content_item["type"] if content_type not in valid_assistant_content_types: @@ -314,7 +314,7 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation ) if not content_is_string_or_list_of_dicts: return EvaluationException( - message=f"The 'content' field must be a string or a list of dictionaries messages.", + message="The 'content' field must be a string or a list of dictionaries messages.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -322,23 +322,22 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation if len(content) == 0: return EvaluationException( - message=f"The 'content' field can't be empty.", + message="The 'content' field can't be empty.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) if isinstance(content, list): - all_messages_have_type_field = all("type" in item for item in content) - if not all_messages_have_type_field: + if not all("type" in item for item in content): return EvaluationException( - message=f"Each content item in the 'content' list must contain a 'type' field.", + message="Each content item in the 'content' list must contain a 'type' field.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - if role in [MessageRole.USER, MessageRole.SYSTEM]: + if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]: error = self._validate_user_or_system_message(message, role) if error: return error diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py index 38fca6b8d159..c2f24c252f97 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_validator.py @@ -8,21 +8,40 @@ from typing import Any, Dict from typing_extensions import override -from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory -from ._tool_definitions_validator import ToolDefinitionsValidator +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget from ._validation_constants import MessageRole +from ._conversation_validator import ConversationValidator +from ._tool_definitions_validator import ToolDefinitionsValidator class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator): """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. - When ``messages`` is provided, it validates the messages list and optional tool_definitions. - Otherwise, it delegates to the parent ``ToolDefinitionsValidator`` for the query/response path. + A single implementation serves all evaluators via a behavior flag: + - ``enforce_tool_definitions`` (default False): validate ``tool_definitions`` in both the + messages path and the query/response path. Set True for evaluators that require + tool definitions. """ + enforce_tool_definitions: bool = False + + def __init__( + self, + error_target: ErrorTarget, + requires_query: bool = True, + optional_tool_definitions: bool = True, + check_for_unsupported_tools: bool = False, + *, + enforce_tool_definitions: bool = False, + ): + """Initialize MessagesOrQueryResponseInputValidator.""" + super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools) + self.enforce_tool_definitions = enforce_tool_definitions + @override def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: """Validate evaluation input, supporting messages as an alternative to query/response.""" + # Multi-turn path (messages list) messages = eval_input.get("messages") if messages is not None: if not isinstance(messages, list): @@ -41,23 +60,23 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: ) # Per-message structural checks - valid_roles = {r.value for r in MessageRole} + valid_roles = {role.value for role in MessageRole} roles_present: set = set() - for i, msg in enumerate(messages): - if not isinstance(msg, dict): + for index, message in enumerate(messages): + if not isinstance(message, dict): raise EvaluationException( message=( - f"Each item in 'messages' must be a dictionary, " - f"but item at index {i} is {type(msg).__name__}." + "Each item in 'messages' must be a dictionary, " + f"but item at index {index} is {type(message).__name__}." ), blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - role = msg.get("role") + role = message.get("role") if role is None: raise EvaluationException( - message=f"Each message must contain a 'role' key, but message at index {i} is missing it.", + message=f"Each message must contain a 'role' key, but message at index {index} is missing it.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -65,7 +84,8 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: if role not in valid_roles: raise EvaluationException( message=( - f"Invalid role '{role}' at message index {i}. " f"Must be one of: {sorted(valid_roles)}." + f"Invalid role '{role}' at message index {index}. " + f"Must be one of: {sorted(valid_roles)}." ), blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, @@ -74,14 +94,14 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: roles_present.add(role) # Conversation-level checks - if MessageRole.USER not in roles_present: + if MessageRole.USER.value not in roles_present: raise EvaluationException( message="messages must contain at least one message with role 'user'.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - if MessageRole.ASSISTANT not in roles_present: + if MessageRole.ASSISTANT.value not in roles_present: raise EvaluationException( message="messages must contain at least one message with role 'assistant'.", blame=ErrorBlame.USER_ERROR, @@ -89,9 +109,13 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: target=self.error_target, ) - tool_definitions = eval_input.get("tool_definitions") - tool_definitions_error = self._validate_tool_definitions(tool_definitions) - if tool_definitions_error: - raise tool_definitions_error + if self.enforce_tool_definitions: + tool_definitions = eval_input.get("tool_definitions") + tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions) + if tool_definitions_validation_exception: + raise tool_definitions_validation_exception return True - return super().validate_eval_input(eval_input) + + if self.enforce_tool_definitions: + return super().validate_eval_input(eval_input) + return ConversationValidator.validate_eval_input(self, eval_input) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py index 132303129546..3c0d6018b2eb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py @@ -17,17 +17,38 @@ class TaskNavigationEfficiencyValidator(ValidatorInterface): """ Validate task navigation efficiency inputs (response and ground_truth). + Accepts either the SDK input names (``response``/``ground_truth``) or the + azureml-assets names (``actions``/``expected_actions``). + Validates: - - response: List of assistant messages containing tool calls - - ground_truth: Either a list of expected tool names, or a tuple of (tool names, parameters dict) + - response (alias ``actions``): List of assistant messages containing tool calls + - ground_truth (alias ``expected_actions``): Either a list of expected tool names, or a + tuple of (tool names, parameters dict) """ error_target: ErrorTarget + # Canonical input key -> accepted alternate (azureml-assets) key name. + _INPUT_ALIASES: Dict[str, str] = { + "response": "actions", + "ground_truth": "expected_actions", + } + def __init__(self, error_target: ErrorTarget): """Initialize with error target.""" self.error_target = error_target + def _normalize_input_aliases(self, eval_input: Dict[str, Any]) -> None: + """Map azureml-assets-style input keys onto the canonical keys in place. + + If a canonical key (``response``/``ground_truth``) is absent but its alias + (``actions``/``expected_actions``) is provided, copy the alias value to the canonical + key so the rest of the pipeline can rely on a single set of names. + """ + for canonical, alias in self._INPUT_ALIASES.items(): + if eval_input.get(canonical) is None and eval_input.get(alias) is not None: + eval_input[canonical] = eval_input[alias] + def _validate_response(self, response: Any) -> Optional[EvaluationException]: """Validate the response parameter.""" if response is None: @@ -221,8 +242,12 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: """ Validate task navigation evaluation input. + Accepts either the SDK input names (``response``/``ground_truth``) or the + azureml-assets names (``actions``/``expected_actions``). + Args: - eval_input: Dictionary containing 'response' and 'ground_truth'. + eval_input: Dictionary containing 'response'/'ground_truth' (or their + 'actions'/'expected_actions' aliases). Returns: True if validation passes. @@ -230,6 +255,9 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: Raises: EvaluationException: If validation fails. """ + # Normalize azureml-assets-style aliases ('actions'/'expected_actions') onto canonical keys. + self._normalize_input_aliases(eval_input) + # If response or ground_truth is a string, try to parse it as JSON for key in ("response", "ground_truth"): value = eval_input.get(key) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index 6339fdab2bb6..6f8605c5a071 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -86,7 +86,9 @@ def __init__( # Initialize input validator self._validator = ToolDefinitionsValidator( - error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, optional_tool_definitions=False + error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, + optional_tool_definitions=False, + check_for_unsupported_tools=True, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py new file mode 100644 index 000000000000..d540fd20468c --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/__init__.py @@ -0,0 +1,3 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py new file mode 100644 index 000000000000..91031216e853 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_common_validators.py @@ -0,0 +1,561 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""Unit tests for the shared evaluator input validators.""" + +import pytest + +from azure.ai.evaluation._exceptions import EvaluationException, ErrorCategory, ErrorTarget +from azure.ai.evaluation._evaluators._common._validators import ( + MessageRole, + ContentType, + ConversationValidator, + ToolDefinitionsValidator, + ToolCallsValidator, + TaskNavigationEfficiencyValidator, + MessagesOrQueryResponseInputValidator, +) + + +TARGET = ErrorTarget.CONVERSATION + + +def _user_message(text="hello"): + return {"role": "user", "content": text} + + +def _assistant_message(text="hi there"): + return {"role": "assistant", "content": text} + + +def _tool_call_content_item(name="search", tool_call_id="call_1"): + return { + "type": "tool_call", + "name": name, + "arguments": {"q": "foo"}, + "tool_call_id": tool_call_id, + } + + +def _tool_definition(name="search"): + return {"name": name, "parameters": {"type": "object"}} + + +@pytest.mark.unittest +class TestValidationConstants: + def test_message_role_values(self): + assert MessageRole.USER == "user" + assert MessageRole.ASSISTANT == "assistant" + assert MessageRole.SYSTEM == "system" + assert MessageRole.TOOL == "tool" + assert MessageRole.DEVELOPER == "developer" + + def test_content_type_values(self): + assert ContentType.TEXT == "text" + assert ContentType.TOOL_CALL == "tool_call" + assert ContentType.TOOL_RESULT == "tool_result" + assert ContentType.FUNCTION_CALL == "function_call" + assert ContentType.MCP_APPROVAL_REQUEST == "mcp_approval_request" + + +@pytest.mark.unittest +class TestConversationValidator: + def test_valid_query_response(self): + validator = ConversationValidator(error_target=TARGET) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_valid_string_query_response(self): + validator = ConversationValidator(error_target=TARGET) + eval_input = {"query": "what is the weather?", "response": "it is sunny"} + assert validator.validate_eval_input(eval_input) is True + + def test_valid_conversation(self): + validator = ConversationValidator(error_target=TARGET) + eval_input = {"conversation": {"messages": [_user_message(), _assistant_message()]}} + assert validator.validate_eval_input(eval_input) is True + + def test_query_not_required(self): + validator = ConversationValidator(error_target=TARGET, requires_query=False) + eval_input = {"response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_missing_query_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"response": [_assistant_message()]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_empty_query_list_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"query": [], "response": [_assistant_message()]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_empty_query_string_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": "", "response": [_assistant_message()]}) + + def test_query_wrong_type_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"query": 123, "response": [_assistant_message()]}) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_message_not_dict_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": ["not a dict"], "response": [_assistant_message()]}) + + def test_message_missing_role_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [{"content": "hi"}], "response": [_assistant_message()]}) + + def test_message_missing_content_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [{"role": "user"}], "response": [_assistant_message()]}) + + def test_empty_content_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input( + {"query": [{"role": "user", "content": ""}], "response": [_assistant_message()]} + ) + + def test_content_list_item_missing_type_raises(self): + validator = ConversationValidator(error_target=TARGET) + bad = {"role": "user", "content": [{"text": "hi"}]} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [bad], "response": [_assistant_message()]}) + + def test_user_message_invalid_content_type_raises(self): + validator = ConversationValidator(error_target=TARGET) + bad = {"role": "user", "content": [{"type": "tool_call", "text": "hi"}]} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [bad], "response": [_assistant_message()]}) + + def test_assistant_message_with_tool_call(self): + validator = ConversationValidator(error_target=TARGET) + assistant = {"role": "assistant", "content": [_tool_call_content_item()]} + eval_input = {"query": [_user_message()], "response": [assistant]} + assert validator.validate_eval_input(eval_input) is True + + def test_assistant_tool_call_missing_name_raises(self): + validator = ConversationValidator(error_target=TARGET) + bad_item = {"type": "tool_call", "arguments": {}, "tool_call_id": "1"} + assistant = {"role": "assistant", "content": [bad_item]} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [_user_message()], "response": [assistant]}) + + def test_unsupported_tool_raises_when_enabled(self): + validator = ConversationValidator(error_target=TARGET, check_for_unsupported_tools=True) + unsupported = _tool_call_content_item(name="bing_grounding") + assistant = {"role": "assistant", "content": [unsupported]} + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"query": [_user_message()], "response": [assistant]}) + assert exc_info.value.category == ErrorCategory.NOT_APPLICABLE + + def test_unsupported_tool_allowed_when_disabled(self): + validator = ConversationValidator(error_target=TARGET, check_for_unsupported_tools=False) + unsupported = _tool_call_content_item(name="bing_grounding") + assistant = {"role": "assistant", "content": [unsupported]} + eval_input = {"query": [_user_message()], "response": [assistant]} + assert validator.validate_eval_input(eval_input) is True + + def test_conversation_not_dict_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"conversation": ["not a dict"]}) + + def test_conversation_missing_messages_raises(self): + validator = ConversationValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"conversation": {}}) + + def test_tool_message_valid(self): + validator = ConversationValidator(error_target=TARGET) + tool_msg = { + "role": "tool", + "tool_call_id": "call_1", + "content": [{"type": "tool_result", "tool_result": "done"}], + } + eval_input = {"query": [_user_message(), tool_msg], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_tool_message_content_not_list_raises(self): + validator = ConversationValidator(error_target=TARGET) + tool_msg = {"role": "tool", "tool_call_id": "call_1", "content": "result"} + with pytest.raises(EvaluationException): + validator.validate_eval_input({"query": [tool_msg], "response": [_assistant_message()]}) + + +@pytest.mark.unittest +class TestToolDefinitionsValidator: + def test_optional_tool_definitions_absent_ok(self): + validator = ToolDefinitionsValidator(error_target=TARGET, optional_tool_definitions=True) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_required_tool_definitions_absent_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET, optional_tool_definitions=False) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_valid_tool_definitions(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [_tool_definition()], + } + assert validator.validate_eval_input(eval_input) is True + + def test_tool_definitions_not_list_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": 123, + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_tool_definition_missing_name_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"parameters": {}}], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_definition_missing_parameters_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"name": "search"}], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_openapi_tool_definition_valid(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"type": "openapi", "functions": [_tool_definition()]}], + } + assert validator.validate_eval_input(eval_input) is True + + def test_openapi_tool_definition_missing_functions_raises(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [{"type": "openapi"}], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_string_tool_definitions_ok(self): + validator = ToolDefinitionsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": "some string", + } + assert validator.validate_eval_input(eval_input) is True + + +@pytest.mark.unittest +class TestToolCallsValidator: + def test_valid_tool_calls(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "response": [_assistant_message()], + "tool_definitions": [_tool_definition()], + "tool_calls": [_tool_call_content_item()], + } + assert validator.validate_eval_input(eval_input) is True + + def test_missing_tool_calls_and_response_raises(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_definitions": [_tool_definition()], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_calls_from_response_ok(self): + validator = ToolCallsValidator(error_target=TARGET) + assistant = {"role": "assistant", "content": [_tool_call_content_item()]} + eval_input = { + "query": [_user_message()], + "response": [assistant], + "tool_definitions": [_tool_definition()], + } + assert validator.validate_eval_input(eval_input) is True + + def test_tool_calls_not_list_raises(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_definitions": [_tool_definition()], + "tool_calls": 123, + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_tool_call_item_not_dict_raises(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_definitions": [_tool_definition()], + "tool_calls": ["not a dict"], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_definitions_required(self): + validator = ToolCallsValidator(error_target=TARGET) + eval_input = { + "query": [_user_message()], + "tool_calls": [_tool_call_content_item()], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + +@pytest.mark.unittest +class TestTaskNavigationEfficiencyValidator: + def _response(self): + return [ + {"role": "user", "content": "do the task"}, + { + "role": "assistant", + "content": [{"type": "tool_call", "name": "search"}], + }, + ] + + def test_valid_list_ground_truth(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": self._response(), "ground_truth": ["search", "summarize"]} + assert validator.validate_eval_input(eval_input) is True + + def test_valid_tuple_ground_truth(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": self._response(), + "ground_truth": (["search"], {"search": {"q": "foo"}}), + } + assert validator.validate_eval_input(eval_input) is True + + def test_alias_inputs_normalized(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"actions": self._response(), "expected_actions": ["search"]} + assert validator.validate_eval_input(eval_input) is True + + def test_actions_alias_normalized_onto_response(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"actions": self._response(), "ground_truth": ["search"]} + assert validator.validate_eval_input(eval_input) is True + # The alias value should be copied onto the canonical 'response' key in place. + assert eval_input["response"] == eval_input["actions"] + + def test_expected_actions_alias_normalized_onto_ground_truth(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": self._response(), "expected_actions": ["search"]} + assert validator.validate_eval_input(eval_input) is True + assert eval_input["ground_truth"] == eval_input["expected_actions"] + + def test_mixed_canonical_and_alias_inputs(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"actions": self._response(), "ground_truth": ["search"]} + assert validator.validate_eval_input(eval_input) is True + + def test_canonical_takes_precedence_over_alias(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + # 'response' (canonical) is valid; 'actions' (alias) is invalid and must be ignored. + eval_input = { + "response": self._response(), + "actions": "not a valid list", + "ground_truth": ["search"], + } + assert validator.validate_eval_input(eval_input) is True + # Canonical value is preserved; alias does not overwrite it. + assert eval_input["response"] == self._response() + + def test_alias_does_not_overwrite_empty_string_canonical(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + # Canonical present but falsy ("") is still not None, so alias must not overwrite it. + eval_input = {"response": "", "actions": self._response(), "ground_truth": ["search"]} + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_alias_json_string_inputs_parsed(self): + import json + + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "actions": json.dumps(self._response()), + "expected_actions": json.dumps(["search"]), + } + assert validator.validate_eval_input(eval_input) is True + + def test_missing_canonical_and_alias_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"ground_truth": ["search"]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_json_string_inputs_parsed(self): + import json + + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": json.dumps(self._response()), + "ground_truth": json.dumps(["search"]), + } + assert validator.validate_eval_input(eval_input) is True + + def test_response_none_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"response": None, "ground_truth": ["search"]}) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_response_not_list_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": {"role": "user"}, "ground_truth": ["search"]}) + + def test_action_missing_role_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": [{"content": []}], "ground_truth": ["search"]} + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_assistant_action_content_not_list_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": [{"role": "assistant", "content": "not a list"}], + "ground_truth": ["search"], + } + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + def test_tool_call_missing_name_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = { + "response": [{"role": "assistant", "content": [{"type": "tool_call"}]}], + "ground_truth": ["search"], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert exc_info.value.category == ErrorCategory.MISSING_FIELD + + def test_ground_truth_empty_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": []}) + + def test_ground_truth_wrong_type_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": 123}) + + def test_ground_truth_list_non_string_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": ["search", 1]}) + + def test_ground_truth_tuple_wrong_length_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"response": self._response(), "ground_truth": (["search"],)}) + + def test_ground_truth_tuple_params_not_dict_raises(self): + validator = TaskNavigationEfficiencyValidator(error_target=TARGET) + eval_input = {"response": self._response(), "ground_truth": (["search"], {"search": "bad"})} + with pytest.raises(EvaluationException): + validator.validate_eval_input(eval_input) + + +@pytest.mark.unittest +class TestMessagesOrQueryResponseInputValidator: + def _messages(self): + return [_user_message(), _assistant_message()] + + def test_valid_messages(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + assert validator.validate_eval_input({"messages": self._messages()}) is True + + def test_valid_query_response_fallback(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True + + def test_messages_not_list_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input({"messages": "not a list"}) + assert exc_info.value.category == ErrorCategory.INVALID_VALUE + + def test_messages_empty_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": []}) + + def test_message_not_dict_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": ["not a dict"]}) + + def test_message_missing_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": [{"content": "hi"}]}) + + def test_invalid_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + bad = [{"role": "bot", "content": "hi"}, _assistant_message()] + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": bad}) + + def test_missing_user_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": [_assistant_message()]}) + + def test_missing_assistant_role_raises(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": [_user_message()]}) + + def test_enforce_tool_definitions_required(self): + validator = MessagesOrQueryResponseInputValidator( + error_target=TARGET, optional_tool_definitions=False, enforce_tool_definitions=True + ) + with pytest.raises(EvaluationException): + validator.validate_eval_input({"messages": self._messages()}) + + def test_no_enforce_tool_definitions_ok(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET, enforce_tool_definitions=False) + assert validator.validate_eval_input({"messages": self._messages()}) is True + + def test_query_response_fallback_no_enforce_tool_definitions(self): + validator = MessagesOrQueryResponseInputValidator(error_target=TARGET, enforce_tool_definitions=False) + eval_input = {"query": [_user_message()], "response": [_assistant_message()]} + assert validator.validate_eval_input(eval_input) is True diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py index 2c1a5dfba237..7b3ab816ad22 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py @@ -404,3 +404,91 @@ def test_matching_mode_validation(self): # Test invalid type for mode with pytest.raises(Exception): # EvaluationException _TaskNavigationEfficiencyEvaluator(matching_mode=123) # type: ignore + + # ==================== ALIAS INPUT NORMALIZATION TESTS ==================== + + def test_alias_actions_normalized_as_response(self): + """Test that 'actions' alias is accepted and normalized to 'response'.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze"] + + result = evaluator(actions=actions, ground_truth=ground_truth) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + + def test_alias_expected_actions_normalized_as_ground_truth(self): + """Test that 'expected_actions' alias is accepted and normalized to 'ground_truth'.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(response=response, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + + def test_both_aliases_normalized_and_evaluated(self): + """Test that both 'actions' and 'expected_actions' aliases together produce the correct result.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(actions=actions, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is True + + def test_alias_inputs_mismatch(self): + """Test that alias inputs produce a failing result when actions do not match expected_actions.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + # Agent performs 'search' and 'extra_step', but expected is 'search' and 'analyze' + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(actions=actions, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is False + assert result["task_navigation_efficiency_result"] == "fail" + # precision: 1 match out of 2 agent steps = 0.5 + assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.5 + # recall: 1 match out of 2 expected steps = 0.5 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 0.5