Skip to content

Commit c52da64

Browse files
Qard and claude authored
Bump to gpt5 models (#169)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 71e61dd commit c52da64

20 files changed

Lines changed: 562 additions & 322 deletions

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ py: ${VENV_PYTHON_PACKAGES}
1212
VENV_INITIALIZED := venv/.initialized
1313

1414
${VENV_INITIALIZED}:
15-
rm -rf venv && python -m venv venv
15+
rm -rf venv && python3 -m venv venv
1616
@touch ${VENV_INITIALIZED}
1717

1818
VENV_PYTHON_PACKAGES := venv/.python_packages

SCORERS.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Evaluates whether the output is factually consistent with the expected answer.
2525
- `input` (string): The input question or prompt
2626
- `output` (string, required): The generated answer to evaluate
2727
- `expected` (string, required): The ground truth answer
28-
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-4o")
28+
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-5-mini")
2929
- `client` (Client, optional): Custom OpenAI client
3030

3131
**Score Range:** 0-1
@@ -209,7 +209,7 @@ Evaluates how relevant the retrieved context is to the input question.
209209
- `input` (string, required): The question
210210
- `output` (string, required): The generated answer
211211
- `context` (string[] | string, required): Retrieved context passages
212-
- `model` (string, optional): Model to use (default: "gpt-4o-mini")
212+
- `model` (string, optional): Model to use (default: "gpt-5-nano")
213213

214214
**Score Range:** 0-1
215215

@@ -600,7 +600,7 @@ Note: Interpretation varies by scorer type. Binary scorers (ExactMatch, ValidJSO
600600

601601
Many scorers share these common parameters:
602602

603-
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-4o")
603+
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-5-mini")
604604
- `client` (Client): Custom OpenAI-compatible client
605605
- `use_cot` (boolean): Enable chain-of-thought reasoning for LLM scorers (default: true)
606606
- `temperature` (number): LLM temperature setting
@@ -616,13 +616,13 @@ import OpenAI from "openai";
616616

617617
init({
618618
client: new OpenAI({ apiKey: "..." }),
619-
defaultModel: "gpt-4o",
619+
defaultModel: "gpt-5-mini",
620620
});
621621
```
622622

623623
```python
624624
from autoevals import init
625625
from openai import OpenAI
626626

627-
init(OpenAI(api_key="..."), default_model="gpt-4o")
627+
init(OpenAI(api_key="..."), default_model="gpt-5-mini")
628628
```

js/init-models.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ describe("init with defaultModel parameter", () => {
3636

3737
expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large");
3838
// Completion model should remain at default since we didn't update it
39-
expect(getDefaultModel()).toBe("gpt-4o");
39+
expect(getDefaultModel()).toBe("gpt-5-mini");
4040
});
4141

4242
test("object form can set both models", () => {
@@ -76,7 +76,7 @@ describe("init with defaultModel parameter", () => {
7676
test("falls back to defaults when not set", () => {
7777
init();
7878

79-
expect(getDefaultModel()).toBe("gpt-4o");
79+
expect(getDefaultModel()).toBe("gpt-5-mini");
8080
expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002");
8181
});
8282

js/llm.fixtures.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
5252
id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0",
5353
object: "chat.completion",
5454
created: 1741135832,
55-
model: "gpt-4o-2024-08-06",
55+
model: "gpt-5-mini-2025-08-07",
5656
choices: [
5757
{
5858
index: 0,
@@ -98,7 +98,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
9898
id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ",
9999
object: "chat.completion",
100100
created: 1741140268,
101-
model: "gpt-4o-2024-08-06",
101+
model: "gpt-5-mini-2025-08-07",
102102
choices: [
103103
{
104104
index: 0,
@@ -141,7 +141,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
141141
id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII",
142142
object: "chat.completion",
143143
created: 1741140309,
144-
model: "gpt-4o-2024-08-06",
144+
model: "gpt-5-mini-2025-08-07",
145145
choices: [
146146
{
147147
index: 0,
@@ -180,7 +180,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
180180
id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l",
181181
object: "chat.completion",
182182
created: 1741140336,
183-
model: "gpt-4o-2024-08-06",
183+
model: "gpt-5-mini-2025-08-07",
184184
choices: [
185185
{
186186
index: 0,
@@ -222,7 +222,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
222222
id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT",
223223
object: "chat.completion",
224224
created: 1741140446,
225-
model: "gpt-4o-2024-08-06",
225+
model: "gpt-5-mini-2025-08-07",
226226
choices: [
227227
{
228228
index: 0,
@@ -265,7 +265,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
265265
id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB",
266266
object: "chat.completion",
267267
created: 1741140511,
268-
model: "gpt-4o-2024-08-06",
268+
model: "gpt-5-mini-2025-08-07",
269269
choices: [
270270
{
271271
index: 0,
@@ -308,7 +308,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
308308
id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n",
309309
object: "chat.completion",
310310
created: 1741140550,
311-
model: "gpt-4o-2024-08-06",
311+
model: "gpt-5-mini-2025-08-07",
312312
choices: [
313313
{
314314
index: 0,
@@ -351,7 +351,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
351351
id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6",
352352
object: "chat.completion",
353353
created: 1741140577,
354-
model: "gpt-4o-2024-08-06",
354+
model: "gpt-5-mini-2025-08-07",
355355
choices: [
356356
{
357357
index: 0,
@@ -390,7 +390,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
390390
id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP",
391391
object: "chat.completion",
392392
created: 1741140603,
393-
model: "gpt-4o-2024-08-06",
393+
model: "gpt-5-mini-2025-08-07",
394394
choices: [
395395
{
396396
index: 0,
@@ -432,7 +432,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
432432
id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl",
433433
object: "chat.completion",
434434
created: 1741140618,
435-
model: "gpt-4o-2024-08-06",
435+
model: "gpt-5-mini-2025-08-07",
436436
choices: [
437437
{
438438
index: 0,

js/llm.test.ts

Lines changed: 63 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,27 @@ beforeAll(() => {
2525
},
2626
});
2727

28+
// Add default handler for Responses API (GPT-5 models)
29+
server.use(
30+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
31+
const body = (await request.json()) as any;
32+
33+
// Convert to Responses API format
34+
return HttpResponse.json({
35+
id: "resp-test",
36+
object: "response",
37+
created: Math.floor(Date.now() / 1000),
38+
model: body.model,
39+
output: [
40+
{
41+
type: "output_text",
42+
content: "Test response",
43+
},
44+
],
45+
});
46+
}),
47+
);
48+
2849
init({
2950
client: new OpenAI({
3051
apiKey: "test-api-key",
@@ -147,6 +168,7 @@ Issue Description: {{page_content}}
147168
2: {{expected}}`,
148169
choiceScores: { "1": 1, "2": 0 },
149170
useCoT,
171+
model: "gpt-4o-mini",
150172
});
151173

152174
let response = await classifier({
@@ -197,6 +219,7 @@ Issue Description: {{page_content}}
197219
output: "600",
198220
expected: "6",
199221
client,
222+
model: "gpt-4o-mini",
200223
});
201224

202225
expect(response.error).toBeUndefined();
@@ -207,12 +230,14 @@ Issue Description: {{page_content}}
207230
output: "6",
208231
expected: "600",
209232
client,
233+
model: "gpt-4o-mini",
210234
});
211235

212236
expect(response.error).toBeUndefined();
213237

214238
response = await Battle({
215239
useCoT,
240+
model: "gpt-4o-mini",
216241
instructions: "Add the following numbers: 1, 2, 3",
217242
output: "6",
218243
expected: "6",
@@ -227,38 +252,24 @@ Issue Description: {{page_content}}
227252
let capturedRequestBody: any;
228253

229254
server.use(
230-
http.post(
231-
"https://api.openai.com/v1/chat/completions",
232-
async ({ request }) => {
233-
capturedRequestBody = await request.json();
234-
235-
return HttpResponse.json({
236-
id: "chatcmpl-test",
237-
object: "chat.completion",
238-
created: 1234567890,
239-
model: "gpt-4o",
240-
choices: [
241-
{
242-
index: 0,
243-
message: {
244-
role: "assistant",
245-
tool_calls: [
246-
{
247-
id: "call_test",
248-
type: "function",
249-
function: {
250-
name: "select_choice",
251-
arguments: JSON.stringify({ choice: "1" }),
252-
},
253-
},
254-
],
255-
},
256-
finish_reason: "tool_calls",
257-
},
258-
],
259-
});
260-
},
261-
),
255+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
256+
capturedRequestBody = await request.json();
257+
258+
return HttpResponse.json({
259+
id: "resp-test",
260+
object: "response",
261+
created: 1234567890,
262+
model: "gpt-5-mini",
263+
output: [
264+
{
265+
type: "function_call",
266+
call_id: "call_test",
267+
name: "select_choice",
268+
arguments: JSON.stringify({ choice: "1" }),
269+
},
270+
],
271+
});
272+
}),
262273
);
263274

264275
init({
@@ -285,38 +296,24 @@ Issue Description: {{page_content}}
285296
let capturedRequestBody: any;
286297

287298
server.use(
288-
http.post(
289-
"https://api.openai.com/v1/chat/completions",
290-
async ({ request }) => {
291-
capturedRequestBody = await request.json();
292-
293-
return HttpResponse.json({
294-
id: "chatcmpl-test",
295-
object: "chat.completion",
296-
created: 1234567890,
297-
model: "gpt-4o",
298-
choices: [
299-
{
300-
index: 0,
301-
message: {
302-
role: "assistant",
303-
tool_calls: [
304-
{
305-
id: "call_test",
306-
type: "function",
307-
function: {
308-
name: "select_choice",
309-
arguments: JSON.stringify({ choice: "1" }),
310-
},
311-
},
312-
],
313-
},
314-
finish_reason: "tool_calls",
315-
},
316-
],
317-
});
318-
},
319-
),
299+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
300+
capturedRequestBody = await request.json();
301+
302+
return HttpResponse.json({
303+
id: "resp-test",
304+
object: "response",
305+
created: 1234567890,
306+
model: "gpt-5-mini",
307+
output: [
308+
{
309+
type: "function_call",
310+
call_id: "call_test",
311+
name: "select_choice",
312+
arguments: JSON.stringify({ choice: "1" }),
313+
},
314+
],
315+
});
316+
}),
320317
);
321318

322319
init({
@@ -336,9 +333,9 @@ Issue Description: {{page_content}}
336333

337334
await classifier({ output: "test output", expected: "test expected" });
338335

339-
// Verify that max_tokens and temperature ARE in the request with correct values
340-
expect(capturedRequestBody.max_tokens).toBe(256);
336+
// Verify that temperature is in the request (max_tokens not supported by Responses API)
341337
expect(capturedRequestBody.temperature).toBe(0.5);
338+
expect(capturedRequestBody.max_tokens).toBeUndefined();
342339
});
343340

344341
test("LLMClassifierFromTemplate uses configured default model", async () => {

js/llm.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ export type LLMArgs = {
7979
* The default model to use for LLM-based evaluations.
8080
* @deprecated Use `init({ defaultModel: "..." })` to configure the default model instead.
8181
*/
82-
export const DEFAULT_MODEL = "gpt-4o";
82+
export const DEFAULT_MODEL = "gpt-5-mini";
8383

8484
const PLAIN_RESPONSE_SCHEMA = {
8585
properties: {

js/oai.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ describe("OAI", () => {
300300
expect(Object.is(builtClient, otherClient)).toBe(true);
301301
});
302302

303-
test("getDefaultModel returns gpt-4o by default", () => {
304-
expect(getDefaultModel()).toBe("gpt-4o");
303+
test("getDefaultModel returns gpt-5-mini by default", () => {
304+
expect(getDefaultModel()).toBe("gpt-5-mini");
305305
});
306306

307307
test("init sets default model", () => {
@@ -314,7 +314,7 @@ describe("OAI", () => {
314314
expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022");
315315

316316
init({ defaultModel: undefined });
317-
expect(getDefaultModel()).toBe("gpt-4o");
317+
expect(getDefaultModel()).toBe("gpt-5-mini");
318318
});
319319

320320
test("init can set both client and default model", () => {

0 commit comments

Comments
 (0)