Skip to content

Commit c52da64

Browse files
Qard and claude authored
Bump to gpt5 models (#169)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 71e61dd commit c52da64

20 files changed

Lines changed: 562 additions & 322 deletions

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ py: ${VENV_PYTHON_PACKAGES}
1212
VENV_INITIALIZED := venv/.initialized
1313

1414
${VENV_INITIALIZED}:
15-
rm -rf venv && python -m venv venv
15+
rm -rf venv && python3 -m venv venv
1616
@touch ${VENV_INITIALIZED}
1717

1818
VENV_PYTHON_PACKAGES := venv/.python_packages

SCORERS.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Evaluates whether the output is factually consistent with the expected answer.
2525
- `input` (string): The input question or prompt
2626
- `output` (string, required): The generated answer to evaluate
2727
- `expected` (string, required): The ground truth answer
28-
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-4o")
28+
- `model` (string, optional): Model to use (default: configured via `init()` or "gpt-5-mini")
2929
- `client` (Client, optional): Custom OpenAI client
3030

3131
**Score Range:** 0-1
@@ -209,7 +209,7 @@ Evaluates how relevant the retrieved context is to the input question.
209209
- `input` (string, required): The question
210210
- `output` (string, required): The generated answer
211211
- `context` (string[] | string, required): Retrieved context passages
212-
- `model` (string, optional): Model to use (default: "gpt-4o-mini")
212+
- `model` (string, optional): Model to use (default: "gpt-5-nano")
213213

214214
**Score Range:** 0-1
215215

@@ -600,7 +600,7 @@ Note: Interpretation varies by scorer type. Binary scorers (ExactMatch, ValidJSO
600600

601601
Many scorers share these common parameters:
602602

603-
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-4o")
603+
- `model` (string): LLM model to use for evaluation (default: configured via `init()` or "gpt-5-mini")
604604
- `client` (Client): Custom OpenAI-compatible client
605605
- `use_cot` (boolean): Enable chain-of-thought reasoning for LLM scorers (default: true)
606606
- `temperature` (number): LLM temperature setting
@@ -616,13 +616,13 @@ import OpenAI from "openai";
616616

617617
init({
618618
client: new OpenAI({ apiKey: "..." }),
619-
defaultModel: "gpt-4o",
619+
defaultModel: "gpt-5-mini",
620620
});
621621
```
622622

623623
```python
624624
from autoevals import init
625625
from openai import OpenAI
626626

627-
init(OpenAI(api_key="..."), default_model="gpt-4o")
627+
init(OpenAI(api_key="..."), default_model="gpt-5-mini")
628628
```

js/init-models.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ describe("init with defaultModel parameter", () => {
3636

3737
expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large");
3838
// Completion model should remain at default since we didn't update it
39-
expect(getDefaultModel()).toBe("gpt-4o");
39+
expect(getDefaultModel()).toBe("gpt-5-mini");
4040
});
4141

4242
test("object form can set both models", () => {
@@ -76,7 +76,7 @@ describe("init with defaultModel parameter", () => {
7676
test("falls back to defaults when not set", () => {
7777
init();
7878

79-
expect(getDefaultModel()).toBe("gpt-4o");
79+
expect(getDefaultModel()).toBe("gpt-5-mini");
8080
expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002");
8181
});
8282

js/llm.fixtures.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
5252
id: "chatcmpl-B7XFw0OCpCbMVwLizRts3Cl72Obg0",
5353
object: "chat.completion",
5454
created: 1741135832,
55-
model: "gpt-4o-2024-08-06",
55+
model: "gpt-5-mini-2025-08-07",
5656
choices: [
5757
{
5858
index: 0,
@@ -98,7 +98,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
9898
id: "chatcmpl-B7YPU81s7cb2uzlwJ8w9aS5qhfhtJ",
9999
object: "chat.completion",
100100
created: 1741140268,
101-
model: "gpt-4o-2024-08-06",
101+
model: "gpt-5-mini-2025-08-07",
102102
choices: [
103103
{
104104
index: 0,
@@ -141,7 +141,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
141141
id: "chatcmpl-B7YQ9ILZ9DJR2AjY2s4qU15Rc6qII",
142142
object: "chat.completion",
143143
created: 1741140309,
144-
model: "gpt-4o-2024-08-06",
144+
model: "gpt-5-mini-2025-08-07",
145145
choices: [
146146
{
147147
index: 0,
@@ -180,7 +180,7 @@ export const openaiClassifierShouldEvaluateTitlesWithCoT = [
180180
id: "chatcmpl-B7YQa80DGu61zUWpdPtXRaJdRQz6l",
181181
object: "chat.completion",
182182
created: 1741140336,
183-
model: "gpt-4o-2024-08-06",
183+
model: "gpt-5-mini-2025-08-07",
184184
choices: [
185185
{
186186
index: 0,
@@ -222,7 +222,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
222222
id: "chatcmpl-B7YSMVJ7qaQTJ9OtR6zPUEdHxrNbT",
223223
object: "chat.completion",
224224
created: 1741140446,
225-
model: "gpt-4o-2024-08-06",
225+
model: "gpt-5-mini-2025-08-07",
226226
choices: [
227227
{
228228
index: 0,
@@ -265,7 +265,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
265265
id: "chatcmpl-B7YTPWIPOFpRcVOjEnU6s0kZXgPdB",
266266
object: "chat.completion",
267267
created: 1741140511,
268-
model: "gpt-4o-2024-08-06",
268+
model: "gpt-5-mini-2025-08-07",
269269
choices: [
270270
{
271271
index: 0,
@@ -308,7 +308,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
308308
id: "chatcmpl-B7YU2qluNL0SenvL1zBiSzrka236n",
309309
object: "chat.completion",
310310
created: 1741140550,
311-
model: "gpt-4o-2024-08-06",
311+
model: "gpt-5-mini-2025-08-07",
312312
choices: [
313313
{
314314
index: 0,
@@ -351,7 +351,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
351351
id: "chatcmpl-B7YUTk3771FhLlXQNZPaobEC0d8R6",
352352
object: "chat.completion",
353353
created: 1741140577,
354-
model: "gpt-4o-2024-08-06",
354+
model: "gpt-5-mini-2025-08-07",
355355
choices: [
356356
{
357357
index: 0,
@@ -390,7 +390,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
390390
id: "chatcmpl-B7YUtrpit4RvQCeqfOcZme9L6pMAP",
391391
object: "chat.completion",
392392
created: 1741140603,
393-
model: "gpt-4o-2024-08-06",
393+
model: "gpt-5-mini-2025-08-07",
394394
choices: [
395395
{
396396
index: 0,
@@ -432,7 +432,7 @@ export const openaiClassifierShouldEvaluateArithmeticExpressions = [
432432
id: "chatcmpl-B7YV8HHTm4hZU58Zp9gcjwp3MigEl",
433433
object: "chat.completion",
434434
created: 1741140618,
435-
model: "gpt-4o-2024-08-06",
435+
model: "gpt-5-mini-2025-08-07",
436436
choices: [
437437
{
438438
index: 0,

js/llm.test.ts

Lines changed: 63 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,27 @@ beforeAll(() => {
2525
},
2626
});
2727

28+
// Add default handler for Responses API (GPT-5 models)
29+
server.use(
30+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
31+
const body = (await request.json()) as any;
32+
33+
// Convert to Responses API format
34+
return HttpResponse.json({
35+
id: "resp-test",
36+
object: "response",
37+
created: Math.floor(Date.now() / 1000),
38+
model: body.model,
39+
output: [
40+
{
41+
type: "output_text",
42+
content: "Test response",
43+
},
44+
],
45+
});
46+
}),
47+
);
48+
2849
init({
2950
client: new OpenAI({
3051
apiKey: "test-api-key",
@@ -147,6 +168,7 @@ Issue Description: {{page_content}}
147168
2: {{expected}}`,
148169
choiceScores: { "1": 1, "2": 0 },
149170
useCoT,
171+
model: "gpt-4o-mini",
150172
});
151173

152174
let response = await classifier({
@@ -197,6 +219,7 @@ Issue Description: {{page_content}}
197219
output: "600",
198220
expected: "6",
199221
client,
222+
model: "gpt-4o-mini",
200223
});
201224

202225
expect(response.error).toBeUndefined();
@@ -207,12 +230,14 @@ Issue Description: {{page_content}}
207230
output: "6",
208231
expected: "600",
209232
client,
233+
model: "gpt-4o-mini",
210234
});
211235

212236
expect(response.error).toBeUndefined();
213237

214238
response = await Battle({
215239
useCoT,
240+
model: "gpt-4o-mini",
216241
instructions: "Add the following numbers: 1, 2, 3",
217242
output: "6",
218243
expected: "6",
@@ -227,38 +252,24 @@ Issue Description: {{page_content}}
227252
let capturedRequestBody: any;
228253

229254
server.use(
230-
http.post(
231-
"https://api.openai.com/v1/chat/completions",
232-
async ({ request }) => {
233-
capturedRequestBody = await request.json();
234-
235-
return HttpResponse.json({
236-
id: "chatcmpl-test",
237-
object: "chat.completion",
238-
created: 1234567890,
239-
model: "gpt-4o",
240-
choices: [
241-
{
242-
index: 0,
243-
message: {
244-
role: "assistant",
245-
tool_calls: [
246-
{
247-
id: "call_test",
248-
type: "function",
249-
function: {
250-
name: "select_choice",
251-
arguments: JSON.stringify({ choice: "1" }),
252-
},
253-
},
254-
],
255-
},
256-
finish_reason: "tool_calls",
257-
},
258-
],
259-
});
260-
},
261-
),
255+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
256+
capturedRequestBody = await request.json();
257+
258+
return HttpResponse.json({
259+
id: "resp-test",
260+
object: "response",
261+
created: 1234567890,
262+
model: "gpt-5-mini",
263+
output: [
264+
{
265+
type: "function_call",
266+
call_id: "call_test",
267+
name: "select_choice",
268+
arguments: JSON.stringify({ choice: "1" }),
269+
},
270+
],
271+
});
272+
}),
262273
);
263274

264275
init({
@@ -285,38 +296,24 @@ Issue Description: {{page_content}}
285296
let capturedRequestBody: any;
286297

287298
server.use(
288-
http.post(
289-
"https://api.openai.com/v1/chat/completions",
290-
async ({ request }) => {
291-
capturedRequestBody = await request.json();
292-
293-
return HttpResponse.json({
294-
id: "chatcmpl-test",
295-
object: "chat.completion",
296-
created: 1234567890,
297-
model: "gpt-4o",
298-
choices: [
299-
{
300-
index: 0,
301-
message: {
302-
role: "assistant",
303-
tool_calls: [
304-
{
305-
id: "call_test",
306-
type: "function",
307-
function: {
308-
name: "select_choice",
309-
arguments: JSON.stringify({ choice: "1" }),
310-
},
311-
},
312-
],
313-
},
314-
finish_reason: "tool_calls",
315-
},
316-
],
317-
});
318-
},
319-
),
299+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
300+
capturedRequestBody = await request.json();
301+
302+
return HttpResponse.json({
303+
id: "resp-test",
304+
object: "response",
305+
created: 1234567890,
306+
model: "gpt-5-mini",
307+
output: [
308+
{
309+
type: "function_call",
310+
call_id: "call_test",
311+
name: "select_choice",
312+
arguments: JSON.stringify({ choice: "1" }),
313+
},
314+
],
315+
});
316+
}),
320317
);
321318

322319
init({
@@ -336,9 +333,9 @@ Issue Description: {{page_content}}
336333

337334
await classifier({ output: "test output", expected: "test expected" });
338335

339-
// Verify that max_tokens and temperature ARE in the request with correct values
340-
expect(capturedRequestBody.max_tokens).toBe(256);
336+
// Verify that temperature is in the request (max_tokens not supported by Responses API)
341337
expect(capturedRequestBody.temperature).toBe(0.5);
338+
expect(capturedRequestBody.max_tokens).toBeUndefined();
342339
});
343340

344341
test("LLMClassifierFromTemplate uses configured default model", async () => {

js/llm.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ export type LLMArgs = {
7979
* The default model to use for LLM-based evaluations.
8080
* @deprecated Use `init({ defaultModel: "..." })` to configure the default model instead.
8181
*/
82-
export const DEFAULT_MODEL = "gpt-4o";
82+
export const DEFAULT_MODEL = "gpt-5-mini";
8383

8484
const PLAIN_RESPONSE_SCHEMA = {
8585
properties: {

js/oai.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ describe("OAI", () => {
300300
expect(Object.is(builtClient, otherClient)).toBe(true);
301301
});
302302

303-
test("getDefaultModel returns gpt-4o by default", () => {
304-
expect(getDefaultModel()).toBe("gpt-4o");
303+
test("getDefaultModel returns gpt-5-mini by default", () => {
304+
expect(getDefaultModel()).toBe("gpt-5-mini");
305305
});
306306

307307
test("init sets default model", () => {
@@ -314,7 +314,7 @@ describe("OAI", () => {
314314
expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022");
315315

316316
init({ defaultModel: undefined });
317-
expect(getDefaultModel()).toBe("gpt-4o");
317+
expect(getDefaultModel()).toBe("gpt-5-mini");
318318
});
319319

320320
test("init can set both client and default model", () => {

0 commit comments

Comments
 (0)