Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ py: ${VENV_PYTHON_PACKAGES}
VENV_INITIALIZED := venv/.initialized

${VENV_INITIALIZED}:
rm -rf venv && python -m venv venv
rm -rf venv && python3 -m venv venv
@touch ${VENV_INITIALIZED}

VENV_PYTHON_PACKAGES := venv/.python_packages
Expand Down
10 changes: 5 additions & 5 deletions SCORERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Evaluates whether the output is factually consistent with the expected answer.
- `input` (string): The input question or prompt
- `output` (string, required): The generated answer to evaluate
- `expected` (string, required): The ground truth answer
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-4o")
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-5-mini")
- `client` (Client, optional): Custom OpenAI client

**Score Range:** 0-1
Expand Down Expand Up @@ -209,7 +209,7 @@ Evaluates how relevant the retrieved context is to the input question.
- `input` (string, required): The question
- `output` (string, required): The generated answer
- `context` (string[] | string, required): Retrieved context passages
- `model` (string, optional): Model to use (default: "gpt-4o-mini")
- `model` (string, optional): Model to use (default: "gpt-5-nano")

**Score Range:** 0-1

Expand Down Expand Up @@ -600,7 +600,7 @@ Note: Interpretation varies by scorer type. Binary scorers (ExactMatch, ValidJSO

Many scorers share these common parameters:

- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-4o")
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-5-mini")
- `client` (Client): Custom OpenAI-compatible client
- `use_cot` (boolean): Enable chain-of-thought reasoning for LLM scorers (default: true)
- `temperature` (number): LLM temperature setting
Expand All @@ -616,13 +616,13 @@ import OpenAI from "openai";

init({
client: new OpenAI({ apiKey: "..." }),
defaultModel: "gpt-4o",
defaultModel: "gpt-5-mini",
});
```

```python
from autoevals import init
from openai import OpenAI

init(OpenAI(api_key="..."), default_model="gpt-4o")
init(OpenAI(api_key="..."), default_model="gpt-5-mini")
```
4 changes: 2 additions & 2 deletions js/init-models.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ describe("init with defaultModel parameter", () => {

expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large");
// Completion model should remain at default since we didn't update it
expect(getDefaultModel()).toBe("gpt-4o");
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("object form can set both models", () => {
Expand Down Expand Up @@ -76,7 +76,7 @@ describe("init with defaultModel parameter", () => {
test("falls back to defaults when not set", () => {
init();

expect(getDefaultModel()).toBe("gpt-4o");
expect(getDefaultModel()).toBe("gpt-5-mini");
expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002");
});

Expand Down
20 changes: 10 additions & 10 deletions js/llm.fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0",
object: "chat.completion",
created: 1741135832,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should send up a monolith repo PR; you'll likely need to update various expect tests there.

choices: [
{
index: 0,
Expand Down Expand Up @@ -98,7 +98,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ",
object: "chat.completion",
created: 1741140268,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -141,7 +141,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII",
object: "chat.completion",
created: 1741140309,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -180,7 +180,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l",
object: "chat.completion",
created: 1741140336,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -222,7 +222,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT",
object: "chat.completion",
created: 1741140446,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -265,7 +265,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB",
object: "chat.completion",
created: 1741140511,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -308,7 +308,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n",
object: "chat.completion",
created: 1741140550,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -351,7 +351,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6",
object: "chat.completion",
created: 1741140577,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -390,7 +390,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP",
object: "chat.completion",
created: 1741140603,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down Expand Up @@ -432,7 +432,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl",
object: "chat.completion",
created: 1741140618,
model: "gpt-4o-2024-08-06",
model: "gpt-5-mini-2025-08-07",
choices: [
{
index: 0,
Expand Down
129 changes: 63 additions & 66 deletions js/llm.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,27 @@ beforeAll(() => {
},
});

// Add default handler for Responses API (GPT-5 models)
server.use(
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
const body = (await request.json()) as any;

// Reply with a fixed stub shaped like a Responses API payload, echoing the requested model
return HttpResponse.json({
id: "resp-test",
object: "response",
created: Math.floor(Date.now() / 1000),
model: body.model,
output: [
{
type: "output_text",
content: "Test response",
},
],
});
}),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
Expand Down Expand Up @@ -147,6 +168,7 @@ Issue Description: {{page_content}}
2: {{expected}}`,
choiceScores: { "1": 1, "2": 0 },
useCoT,
model: "gpt-4o-mini",
});

let response = await classifier({
Expand Down Expand Up @@ -197,6 +219,7 @@ Issue Description: {{page_content}}
output: "600",
expected: "6",
client,
model: "gpt-4o-mini",
});

expect(response.error).toBeUndefined();
Expand All @@ -207,12 +230,14 @@ Issue Description: {{page_content}}
output: "6",
expected: "600",
client,
model: "gpt-4o-mini",
});

expect(response.error).toBeUndefined();

response = await Battle({
useCoT,
model: "gpt-4o-mini",
instructions: "Add the following numbers: 1, 2, 3",
output: "6",
expected: "6",
Expand All @@ -227,38 +252,24 @@ Issue Description: {{page_content}}
let capturedRequestBody: any;

server.use(
http.post(
"https://api.openai.com/v1/chat/completions",
async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
},
],
},
finish_reason: "tool_calls",
},
],
});
},
),
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "resp-test",
object: "response",
created: 1234567890,
model: "gpt-5-mini",
output: [
{
type: "function_call",
call_id: "call_test",
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
],
});
}),
);

init({
Expand All @@ -285,38 +296,24 @@ Issue Description: {{page_content}}
let capturedRequestBody: any;

server.use(
http.post(
"https://api.openai.com/v1/chat/completions",
async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
},
],
},
finish_reason: "tool_calls",
},
],
});
},
),
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "resp-test",
object: "response",
created: 1234567890,
model: "gpt-5-mini",
output: [
{
type: "function_call",
call_id: "call_test",
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
],
});
}),
);

init({
Expand All @@ -336,9 +333,9 @@ Issue Description: {{page_content}}

await classifier({ output: "test output", expected: "test expected" });

// Verify that max_tokens and temperature ARE in the request with correct values
expect(capturedRequestBody.max_tokens).toBe(256);
// Verify that temperature is in the request (max_tokens not supported by Responses API)
expect(capturedRequestBody.temperature).toBe(0.5);
expect(capturedRequestBody.max_tokens).toBeUndefined();
});

test("LLMClassifierFromTemplate uses configured default model", async () => {
Expand Down
2 changes: 1 addition & 1 deletion js/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ export type LLMArgs = {
* The default model to use for LLM-based evaluations.
* @deprecated Use `init({ defaultModel: "..." })` to configure the default model instead.
*/
export const DEFAULT_MODEL = "gpt-4o";
export const DEFAULT_MODEL = "gpt-5-mini";

const PLAIN_RESPONSE_SCHEMA = {
properties: {
Expand Down
6 changes: 3 additions & 3 deletions js/oai.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,8 @@ describe("OAI", () => {
expect(Object.is(builtClient, otherClient)).toBe(true);
});

test("getDefaultModel returns gpt-4o by default", () => {
expect(getDefaultModel()).toBe("gpt-4o");
test("getDefaultModel returns gpt-5-mini by default", () => {
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("init sets default model", () => {
Expand All @@ -314,7 +314,7 @@ describe("OAI", () => {
expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022");

init({ defaultModel: undefined });
expect(getDefaultModel()).toBe("gpt-4o");
expect(getDefaultModel()).toBe("gpt-5-mini");
});

test("init can set both client and default model", () => {
Expand Down
Loading