diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index da8e776..c5dca77 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -84,13 +84,18 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type != "text_similarity":
-            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
-        metric = v.get("evaluation_metric")
-        if not metric:
-            raise ValueError("'evaluation_metric' is required for text_similarity grader")
-        if metric not in _VALID_SIMILARITY_METRICS:
-            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        if grader_type == "text_similarity":
+            metric = v.get("evaluation_metric")
+            if not metric:
+                raise ValueError("'evaluation_metric' is required for text_similarity grader")
+            if metric not in _VALID_SIMILARITY_METRICS:
+                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        elif grader_type == "label_model":
+            for field in ("model", "input", "labels", "passing_labels"):
+                if not v.get(field):
+                    raise ValueError(f"'{field}' is required for label_model grader")
+        else:
+            raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
         return v
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
index a6e9c00..196d11e 100644
--- a/src/agentevals/openai_eval_backend.py
+++ b/src/agentevals/openai_eval_backend.py
@@ -31,6 +31,12 @@
     "required": ["actual_response", "expected_response"],
 }
 
+_ACTUAL_ONLY_SCHEMA = {
+    "type": "object",
+    "properties": {"actual_response": {"type": "string"}},
+    "required": ["actual_response"],
+}
+
 
 def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
     """Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,31 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
         "pass_threshold": evaluator_def.threshold,
     }
 
+    if grader_type == "label_model":
+        return {
+            "type": "label_model",
+            "name": evaluator_def.name,
+            "model": grader["model"],
+            "input": grader["input"],
+            "labels": grader["labels"],
+            "passing_labels": grader["passing_labels"],
+        }
+
     raise ValueError(f"Unsupported grader type: {grader_type}")
 
 
 def _build_jsonl_items(
     actual_invocations: list[Invocation],
     expected_invocations: list[Invocation],
+    include_expected: bool = True,
 ) -> list[dict[str, Any]]:
     items = []
     for i, actual_inv in enumerate(actual_invocations):
-        actual_text = _content_to_text(actual_inv.final_response)
-        if i < len(expected_invocations):
-            expected_text = _content_to_text(expected_invocations[i].final_response)
-        else:
-            expected_text = ""
-        items.append(
-            {
-                "item": {
-                    "actual_response": actual_text,
-                    "expected_response": expected_text,
-                }
-            }
-        )
+        entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
+        if include_expected:
+            expected_text = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
+            entry["expected_response"] = expected_text
+        items.append({"item": entry})
     return items
@@ -111,13 +120,15 @@ async def evaluate_openai_eval(
             error="OPENAI_API_KEY environment variable is not set.",
         )
 
-    if expected_invocations is None:
+    grader_type = evaluator_def.grader["type"]
+
+    if grader_type == "text_similarity" and expected_invocations is None:
         return MetricResult(
             metric_name=evaluator_def.name,
-            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
+            error="text_similarity grader requires expected invocations (golden eval set).",
         )
 
-    items = _build_jsonl_items(actual_invocations, expected_invocations)
+    items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model"))
     if not items:
         return MetricResult(
             metric_name=evaluator_def.name,
@@ -130,12 +141,13 @@
 
     try:
         client = await asyncio.to_thread(_get_openai_client)
+        item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
         eval_obj = await asyncio.to_thread(
             client.evals.create,
             name=f"agentevals-{evaluator_def.name}",
             data_source_config={
                 "type": "custom",
-                "item_schema": _TEXT_PAIR_SCHEMA,
+                "item_schema": item_schema,
                 "include_sample_schema": False,
             },
             testing_criteria=[testing_criteria],
@@ -225,12 +237,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
     total = result_counts.total if result_counts else 0
     eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
 
+    grader = evaluator_def.grader
     details: dict[str, Any] = {
         "openai_eval_id": eval_id,
         "openai_run_id": run_id,
-        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
         "result_counts": {"passed": passed, "failed": failed, "total": total},
     }
+    if grader["type"] == "text_similarity":
+        details["evaluation_metric"] = grader.get("evaluation_metric")
+    elif grader["type"] == "label_model":
+        details["model"] = grader.get("model")
+        details["passing_labels"] = grader.get("passing_labels")
     per_criteria = getattr(run, "per_testing_criteria_results", None)
     if per_criteria:
         details["per_testing_criteria"] = [
diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py
new file mode 100644
index 0000000..38c6e64
--- /dev/null
+++ b/tests/test_openai_eval_backend.py
@@ -0,0 +1,137 @@
+"""Tests for OpenAI Evals backend config and criteria building."""
+
+from __future__ import annotations
+
+import pytest
+from unittest.mock import MagicMock
+
+from agentevals.config import OpenAIEvalDef
+from agentevals.openai_eval_backend import (
+    _ACTUAL_ONLY_SCHEMA,
+    _TEXT_PAIR_SCHEMA,
+    _build_jsonl_items,
+    _build_testing_criteria,
+    evaluate_openai_eval,
+)
+
+
+def _label_grader(**overrides):
+    base = {
+        "type": "label_model",
+        "model": "gpt-4o-mini",
+        "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
+        "labels": ["good", "bad"],
+        "passing_labels": ["good"],
+    }
+    base.update(overrides)
+    return base
+
+
+def _invocation(text: str):
+    inv = MagicMock()
+    inv.final_response.parts = [MagicMock(text=text)]
+    return inv
+
+
+class TestOpenAIEvalDefValidation:
+    def test_text_similarity_valid(self):
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        assert d.grader["type"] == "text_similarity"
+
+    def test_text_similarity_missing_metric(self):
+        with pytest.raises(Exception, match="evaluation_metric"):
+            OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
+
+    def test_text_similarity_bad_metric(self):
+        with pytest.raises(Exception, match="Unknown evaluation_metric"):
+            OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"})
+
+    def test_label_model_valid(self):
+        d = OpenAIEvalDef(name="lm", grader=_label_grader())
+        assert d.grader["type"] == "label_model"
+
+    @pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
+    def test_label_model_missing_required_field(self, field):
+        with pytest.raises(Exception, match=field):
+            OpenAIEvalDef(name="lm", grader=_label_grader(**{field: None}))
+
+    def test_unsupported_grader_type(self):
+        with pytest.raises(Exception, match="Unsupported grader type"):
+            OpenAIEvalDef(name="x", grader={"type": "unknown"})
+
+
+class TestBuildTestingCriteria:
+    def test_text_similarity_shape(self):
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
+        c = _build_testing_criteria(d)
+        assert c["type"] == "text_similarity"
+        assert c["evaluation_metric"] == "bleu"
+        assert c["pass_threshold"] == 0.7
+        assert "{{ item.actual_response }}" in c["input"]
+        assert "{{ item.expected_response }}" in c["reference"]
+
+    def test_label_model_shape(self):
+        grader = _label_grader()
+        d = OpenAIEvalDef(name="quality", grader=grader)
+        c = _build_testing_criteria(d)
+        assert c["type"] == "label_model"
+        assert c["model"] == "gpt-4o-mini"
+        assert c["labels"] == ["good", "bad"]
+        assert c["passing_labels"] == ["good"]
+        assert c["input"] == grader["input"]
+
+
+class TestBuildJsonlItems:
+    def test_text_similarity_includes_expected(self):
+        actual = [_invocation("hello")]
+        expected = [_invocation("world")]
+        items = _build_jsonl_items(actual, expected, include_expected=True)
+        assert len(items) == 1
+        assert "expected_response" in items[0]["item"]
+
+    def test_label_model_excludes_expected(self):
+        actual = [_invocation("hello")]
+        items = _build_jsonl_items(actual, [], include_expected=False)
+        assert len(items) == 1
+        assert "expected_response" not in items[0]["item"]
+        assert items[0]["item"]["actual_response"] is not None
+
+    def test_empty_expected_falls_back_to_empty_string(self):
+        actual = [_invocation("hello")]
+        items = _build_jsonl_items(actual, [], include_expected=True)
+        assert items[0]["item"]["expected_response"] == ""
+
+
+class TestSchemas:
+    def test_actual_only_has_no_expected(self):
+        assert "expected_response" not in _ACTUAL_ONLY_SCHEMA["properties"]
+
+    def test_text_pair_has_both(self):
+        assert "actual_response" in _TEXT_PAIR_SCHEMA["properties"]
+        assert "expected_response" in _TEXT_PAIR_SCHEMA["properties"]
+
+
+class TestEvaluateOpenAIEval:
+    @pytest.mark.asyncio
+    async def test_no_api_key_returns_error(self, monkeypatch):
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        result = await evaluate_openai_eval(d, [], [])
+        assert result.error is not None
+        assert "OPENAI_API_KEY" in result.error
+
+    @pytest.mark.asyncio
+    async def test_text_similarity_requires_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert result.error is not None
+        assert "expected invocations" in result.error
+
+    @pytest.mark.asyncio
+    async def test_label_model_does_not_require_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
+        d = OpenAIEvalDef(name="lm", grader=_label_grader())
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert "expected invocations" not in (result.error or "")