19 changes: 12 additions & 7 deletions src/agentevals/config.py
@@ -84,13 +84,18 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type != "text_similarity":
-            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
-        metric = v.get("evaluation_metric")
-        if not metric:
-            raise ValueError("'evaluation_metric' is required for text_similarity grader")
-        if metric not in _VALID_SIMILARITY_METRICS:
-            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        if grader_type == "text_similarity":
+            metric = v.get("evaluation_metric")
+            if not metric:
+                raise ValueError("'evaluation_metric' is required for text_similarity grader")
+            if metric not in _VALID_SIMILARITY_METRICS:
+                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        elif grader_type == "label_model":
+            for field in ("model", "input", "labels", "passing_labels"):
+                if not v.get(field):
+                    raise ValueError(f"'{field}' is required for label_model grader")
+        else:
+            raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
         return v


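For reference, a minimal sketch of an evaluator definition that the relaxed validator now accepts, mirroring the label_model fixture in the new tests; the model name, prompt, and labels are illustrative placeholders, not a prescribed configuration:

from agentevals.config import OpenAIEvalDef

# Hypothetical label_model grader; the field names follow the validator above,
# the concrete model and label values are placeholders.
quality_check = OpenAIEvalDef(
    name="response-quality",
    grader={
        "type": "label_model",
        "model": "gpt-4o-mini",
        "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
        "labels": ["good", "bad"],
        "passing_labels": ["good"],
    },
)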
53 changes: 35 additions & 18 deletions src/agentevals/openai_eval_backend.py
@@ -31,6 +31,12 @@
"required": ["actual_response", "expected_response"],
}

_ACTUAL_ONLY_SCHEMA = {
"type": "object",
"properties": {"actual_response": {"type": "string"}},
"required": ["actual_response"],
}


def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"""Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,31 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"pass_threshold": evaluator_def.threshold,
}

if grader_type == "label_model":
return {
"type": "label_model",
"name": evaluator_def.name,
"model": grader["model"],
"input": grader["input"],
"labels": grader["labels"],
"passing_labels": grader["passing_labels"],
}

raise ValueError(f"Unsupported grader type: {grader_type}")


def _build_jsonl_items(
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
include_expected: bool = True,
) -> list[dict[str, Any]]:
items = []
for i, actual_inv in enumerate(actual_invocations):
actual_text = _content_to_text(actual_inv.final_response)
if i < len(expected_invocations):
expected_text = _content_to_text(expected_invocations[i].final_response)
else:
expected_text = ""
items.append(
{
"item": {
"actual_response": actual_text,
"expected_response": expected_text,
}
}
)
entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
if include_expected:
expected_text = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
entry["expected_response"] = expected_text
items.append({"item": entry})
return items
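Concretely, the reworked helper emits items shaped like the following sketch; the response strings are placeholders:

# Sketch: shapes mirror the helper above, strings are placeholders.
items_with_expected = [{"item": {"actual_response": "Paris", "expected_response": "Paris, France"}}]  # include_expected=True
items_actual_only = [{"item": {"actual_response": "Paris"}}]  # include_expected=False (label_model)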


@@ -111,13 +120,15 @@ async def evaluate_openai_eval(
error="OPENAI_API_KEY environment variable is not set.",
)

if expected_invocations is None:
grader_type = evaluator_def.grader["type"]

if grader_type == "text_similarity" and expected_invocations is None:
return MetricResult(
metric_name=evaluator_def.name,
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
error="text_similarity grader requires expected invocations (golden eval set).",
)

items = _build_jsonl_items(actual_invocations, expected_invocations)
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model"))
if not items:
return MetricResult(
metric_name=evaluator_def.name,
@@ -130,12 +141,13 @@
try:
client = await asyncio.to_thread(_get_openai_client)

item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
eval_obj = await asyncio.to_thread(
client.evals.create,
name=f"agentevals-{evaluator_def.name}",
data_source_config={
"type": "custom",
"item_schema": _TEXT_PAIR_SCHEMA,
"item_schema": item_schema,
"include_sample_schema": False,
},
testing_criteria=[testing_criteria],
@@ -225,12 +237,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
total = result_counts.total if result_counts else 0
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"

grader = evaluator_def.grader
details: dict[str, Any] = {
"openai_eval_id": eval_id,
"openai_run_id": run_id,
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
"result_counts": {"passed": passed, "failed": failed, "total": total},
}
if grader["type"] == "text_similarity":
details["evaluation_metric"] = grader.get("evaluation_metric")
elif grader["type"] == "label_model":
details["model"] = grader.get("model")
details["passing_labels"] = grader.get("passing_labels")
per_criteria = getattr(run, "per_testing_criteria_results", None)
if per_criteria:
details["per_testing_criteria"] = [
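Taken together, the details payload assembled above for a label_model run would carry roughly these keys; the IDs and counts below are invented for illustration:

# Sketch: keys mirror the code above, IDs and counts are placeholders.
details = {
    "openai_eval_id": "eval_...",
    "openai_run_id": "evalrun_...",
    "result_counts": {"passed": 9, "failed": 1, "total": 10},
    "model": "gpt-4o-mini",
    "passing_labels": ["good"],
}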
137 changes: 137 additions & 0 deletions tests/test_openai_eval_backend.py
@@ -0,0 +1,137 @@
"""Tests for OpenAI Evals backend config and criteria building."""

from __future__ import annotations

import pytest
from unittest.mock import MagicMock

from agentevals.config import OpenAIEvalDef
from agentevals.openai_eval_backend import (
_ACTUAL_ONLY_SCHEMA,
_TEXT_PAIR_SCHEMA,
_build_jsonl_items,
_build_testing_criteria,
evaluate_openai_eval,
)


def _label_grader(**overrides):
base = {
"type": "label_model",
"model": "gpt-4o-mini",
"input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
"labels": ["good", "bad"],
"passing_labels": ["good"],
}
base.update(overrides)
return base


def _invocation(text: str):
inv = MagicMock()
inv.final_response.parts = [MagicMock(text=text)]
return inv


class TestOpenAIEvalDefValidation:
def test_text_similarity_valid(self):
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
assert d.grader["type"] == "text_similarity"

def test_text_similarity_missing_metric(self):
with pytest.raises(Exception, match="evaluation_metric"):
OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})

def test_text_similarity_bad_metric(self):
with pytest.raises(Exception, match="Unknown evaluation_metric"):
OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"})

def test_label_model_valid(self):
d = OpenAIEvalDef(name="lm", grader=_label_grader())
assert d.grader["type"] == "label_model"

@pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
def test_label_model_missing_required_field(self, field):
with pytest.raises(Exception, match=field):
OpenAIEvalDef(name="lm", grader=_label_grader(**{field: None}))

def test_unsupported_grader_type(self):
with pytest.raises(Exception, match="Unsupported grader type"):
OpenAIEvalDef(name="x", grader={"type": "unknown"})


class TestBuildTestingCriteria:
def test_text_similarity_shape(self):
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
c = _build_testing_criteria(d)
assert c["type"] == "text_similarity"
assert c["evaluation_metric"] == "bleu"
assert c["pass_threshold"] == 0.7
assert "{{ item.actual_response }}" in c["input"]
assert "{{ item.expected_response }}" in c["reference"]

def test_label_model_shape(self):
grader = _label_grader()
d = OpenAIEvalDef(name="quality", grader=grader)
c = _build_testing_criteria(d)
assert c["type"] == "label_model"
assert c["model"] == "gpt-4o-mini"
assert c["labels"] == ["good", "bad"]
assert c["passing_labels"] == ["good"]
assert c["input"] == grader["input"]


class TestBuildJsonlItems:
def test_text_similarity_includes_expected(self):
actual = [_invocation("hello")]
expected = [_invocation("world")]
items = _build_jsonl_items(actual, expected, include_expected=True)
assert len(items) == 1
assert "expected_response" in items[0]["item"]

def test_label_model_excludes_expected(self):
actual = [_invocation("hello")]
items = _build_jsonl_items(actual, [], include_expected=False)
assert len(items) == 1
assert "expected_response" not in items[0]["item"]
assert items[0]["item"]["actual_response"] is not None

def test_empty_expected_falls_back_to_empty_string(self):
actual = [_invocation("hello")]
items = _build_jsonl_items(actual, [], include_expected=True)
assert items[0]["item"]["expected_response"] == ""


class TestSchemas:
def test_actual_only_has_no_expected(self):
assert "expected_response" not in _ACTUAL_ONLY_SCHEMA["properties"]

def test_text_pair_has_both(self):
assert "actual_response" in _TEXT_PAIR_SCHEMA["properties"]
assert "expected_response" in _TEXT_PAIR_SCHEMA["properties"]


class TestEvaluateOpenAIEval:
@pytest.mark.asyncio
async def test_no_api_key_returns_error(self, monkeypatch):
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
result = await evaluate_openai_eval(d, [], [])
assert result.error is not None
assert "OPENAI_API_KEY" in result.error

@pytest.mark.asyncio
async def test_text_similarity_requires_expected(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
assert result.error is not None
assert "expected invocations" in result.error

@pytest.mark.asyncio
async def test_label_model_does_not_require_expected(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
d = OpenAIEvalDef(name="lm", grader=_label_grader())
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
assert "expected invocations" not in (result.error or "")