Skip to content

Commit d95c0ea

Browse files
committed
Better align with braintrust-spec
1 parent d485de8 commit d95c0ea

8 files changed

Lines changed: 139 additions & 74 deletions

File tree

js/dev/server.ts

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import {
88
EvalScorer,
99
EvaluatorDef,
1010
OneOrMoreScores,
11+
classifierName,
1112
scorerName,
1213
} from "../src/framework";
1314
import { errorHandler } from "./errorHandler";
@@ -120,6 +121,9 @@ export function runDevServer(
120121
scores: (evaluator.scores ?? []).map((score, idx) => ({
121122
name: scorerName(score, idx),
122123
})),
124+
classifiers: (evaluator.classifiers ?? []).map((classifier, idx) => ({
125+
name: classifierName(classifier, idx),
126+
})),
123127
};
124128
}
125129

js/dev/types.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -111,6 +111,7 @@ export type SerializedParametersContainer = z.infer<
111111
export const evaluatorDefinitionSchema = z.object({
112112
parameters: serializedParametersContainerSchema.optional(),
113113
scores: z.array(z.object({ name: z.string() })).optional(),
114+
classifiers: z.array(z.object({ name: z.string() })).optional(),
114115
});
115116
export type EvaluatorDefinition = z.infer<typeof evaluatorDefinitionSchema>;
116117

js/src/cli/functions/infer-source.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,9 @@ export async function findCodeDefinition({
8686
fn =
8787
location.position.type === "task"
8888
? evaluator.task
89-
: (evaluator.scores ?? [])[location.position.index];
89+
: location.position.type === "scorer"
90+
? (evaluator.scores ?? [])[location.position.index]
91+
: (evaluator.classifiers ?? [])[location.position.index];
9092
}
9193
} else if (location.type === "function") {
9294
fn = outFileModule.functions[location.index].handler;

js/src/cli/functions/upload.test.ts

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,4 +84,37 @@ describe("buildBundledFunctionEntry", () => {
8484

8585
expect(entry.tags).toBeUndefined();
8686
});
87+
88+
test("preserves classifier experiment locations", async () => {
89+
const entry = await buildBundledFunctionEntry({
90+
spec: {
91+
project_id: "proj-123",
92+
name: "test-classifier",
93+
slug: "test-classifier",
94+
description: "Test classifier",
95+
location: {
96+
type: "experiment" as const,
97+
eval_name: "eval-1",
98+
position: {
99+
type: "classifier" as const,
100+
index: 0,
101+
},
102+
},
103+
function_type: "classifier" as const,
104+
},
105+
runtime_context: { runtime: "node", version: "22.0.0" },
106+
bundleId: "bundle-123",
107+
sourceMapContext: undefined,
108+
});
109+
110+
expect(entry.function_type).toBe("classifier");
111+
expect(entry.function_data.data.location).toEqual({
112+
type: "experiment",
113+
eval_name: "eval-1",
114+
position: {
115+
type: "classifier",
116+
index: 0,
117+
},
118+
});
119+
});
87120
});

js/src/cli/functions/upload.ts

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ import {
44
type IfExistsType as IfExists,
55
} from "../../generated_types";
66
import type { BuildSuccess, EvaluatorState, FileHandle } from "../types";
7-
import { scorerName, warning } from "../../framework";
7+
import { classifierName, scorerName, warning } from "../../framework";
88
import {
99
_internalGetGlobalState,
1010
Experiment,
@@ -200,6 +200,23 @@ export async function uploadHandleBundles({
200200
};
201201
},
202202
),
203+
...(evaluator.evaluator.classifiers ?? []).map(
204+
(classifier, i): BundledFunctionSpec => {
205+
const name = classifierName(classifier, i);
206+
return {
207+
...baseInfo,
208+
...formatNameAndSlug(["eval", namePrefix, "classifier", name]),
209+
description: `Classifier ${name} for eval ${namePrefix}`,
210+
location: {
211+
type: "experiment",
212+
eval_name: evaluator.evaluator.evalName,
213+
position: { type: "classifier", index: i },
214+
},
215+
function_type: "classifier",
216+
origin,
217+
};
218+
},
219+
),
203220
];
204221

205222
bundleSpecs.push(...fileSpecs);
@@ -225,6 +242,11 @@ export async function uploadHandleBundles({
225242
scores: (evaluator.evaluator.scores ?? []).map((score, i) => ({
226243
name: scorerName(score, i),
227244
})),
245+
classifiers: (evaluator.evaluator.classifiers ?? []).map(
246+
(classifier, i) => ({
247+
name: classifierName(classifier, i),
248+
}),
249+
),
228250
};
229251

230252
bundleSpecs.push({

js/src/framework.test.ts

Lines changed: 22 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -186,9 +186,9 @@ describe("runEvaluator", () => {
186186
undefined,
187187
);
188188

189-
expect(out.results.every((r) => Object.keys(r.scores).length === 0)).toBe(
190-
true,
191-
);
189+
expect(
190+
out.results.every((r) => Object.keys(r.scores ?? {}).length === 0),
191+
).toBe(true);
192192
});
193193

194194
describe("errorScoreHandler", () => {
@@ -216,8 +216,8 @@ describe("runEvaluator", () => {
216216
expect(
217217
out.results.every(
218218
(r) =>
219-
Object.keys(r.scores).length === 3 &&
220-
Object.values(r.scores).every((v) => v === 0),
219+
Object.keys(r.scores ?? {}).length === 3 &&
220+
Object.values(r.scores ?? {}).every((v) => v === 0),
221221
),
222222
).toBe(true);
223223
});
@@ -245,10 +245,10 @@ describe("runEvaluator", () => {
245245
expect(
246246
out.results.every(
247247
(r) =>
248-
Object.keys(r.scores).length === 3 &&
249-
r.scores.scorer_0 === 0 &&
250-
r.scores.scorer_1 === 1 &&
251-
r.scores.scorer_2 === 1,
248+
Object.keys(r.scores ?? {}).length === 3 &&
249+
r.scores?.scorer_0 === 0 &&
250+
r.scores?.scorer_1 === 1 &&
251+
r.scores?.scorer_2 === 1,
252252
),
253253
).toBe(true);
254254
});
@@ -276,7 +276,7 @@ describe("runEvaluator", () => {
276276
);
277277

278278
expect(
279-
out.results.every((r) => Object.keys(r.scores).length === 0),
279+
out.results.every((r) => Object.keys(r.scores ?? {}).length === 0),
280280
).toBe(true);
281281
});
282282

@@ -303,8 +303,8 @@ describe("runEvaluator", () => {
303303
expect(
304304
out.results.every(
305305
(r) =>
306-
Object.keys(r.scores).length === 1 &&
307-
r.scores.error_score === 1,
306+
Object.keys(r.scores ?? {}).length === 1 &&
307+
r.scores?.error_score === 1,
308308
),
309309
).toBe(true);
310310
});
@@ -579,13 +579,13 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn
579579
expect(result.results).toHaveLength(2);
580580
expect(result.results[0].input).toBe("hello");
581581
expect(result.results[0].output).toBe("hello world");
582-
expect(result.results[0].scores.exact_match).toBe(1);
583-
expect(result.results[0].scores.simple_scorer).toBe(0.8);
582+
expect(result.results[0].scores?.exact_match).toBe(1);
583+
expect(result.results[0].scores?.simple_scorer).toBe(0.8);
584584

585585
expect(result.results[1].input).toBe("test");
586586
expect(result.results[1].output).toBe("test world");
587-
expect(result.results[1].scores.exact_match).toBe(1);
588-
expect(result.results[1].scores.simple_scorer).toBe(0.8);
587+
expect(result.results[1].scores?.exact_match).toBe(1);
588+
expect(result.results[1].scores?.simple_scorer).toBe(0.8);
589589

590590
// Verify it builds a local summary (no experimentUrl means local run)
591591
expect(result.summary.projectName).toBe("test-no-logs");
@@ -660,10 +660,10 @@ test("Eval with returnResults: true collects all results", async () => {
660660
expect(result.results).toHaveLength(2);
661661
expect(result.results[0].input).toBe("hello");
662662
expect(result.results[0].output).toBe("hello world");
663-
expect(result.results[0].scores.exact_match).toBe(1);
663+
expect(result.results[0].scores?.exact_match).toBe(1);
664664
expect(result.results[1].input).toBe("test");
665665
expect(result.results[1].output).toBe("test world");
666-
expect(result.results[1].scores.exact_match).toBe(1);
666+
expect(result.results[1].scores?.exact_match).toBe(1);
667667

668668
// Summary should also be correct
669669
expect(result.summary.scores.exact_match.score).toBe(1);
@@ -862,7 +862,7 @@ test("scorer spans have purpose='scorer' attribute", async () => {
862862
);
863863

864864
expect(result.results).toHaveLength(1);
865-
expect(result.results[0].scores.simple_scorer).toBe(1);
865+
expect(result.results[0].scores?.simple_scorer).toBe(1);
866866

867867
await memoryLogger.flush();
868868
const logs = await memoryLogger.drain();
@@ -1635,6 +1635,7 @@ test("classifier-only evaluator populates classifications field", async () => {
16351635

16361636
expect(result.results).toHaveLength(1);
16371637
const r = result.results[0];
1638+
expect(r.scores).toBeUndefined();
16381639
expect(r.classifications?.category).toEqual([
16391640
{
16401641
id: "greeting",
@@ -1661,7 +1662,7 @@ test("scorer-only evaluator populates scores field", async () => {
16611662
);
16621663

16631664
expect(result.results).toHaveLength(1);
1664-
expect(result.results[0].scores.exact_match).toBe(1);
1665+
expect(result.results[0].scores?.exact_match).toBe(1);
16651666
expect(result.results[0].classifications).toBeUndefined();
16661667
});
16671668

@@ -1713,27 +1714,8 @@ test("mixed evaluator populates both scores and classifications", async () => {
17131714
);
17141715

17151716
expect(result.results).toHaveLength(1);
1716-
expect(result.results[0].scores.exact_match).toBe(1);
1717+
expect(result.results[0].scores?.exact_match).toBe(1);
17171718
expect(result.results[0].classifications?.category).toEqual([
17181719
{ id: "greeting", label: "Greeting" },
17191720
]);
17201721
});
1721-
1722-
test("malformed classifier output fails clearly", async () => {
1723-
const result = await Eval(
1724-
"test-invalid-classifier-output",
1725-
{
1726-
data: [{ input: "hello" }],
1727-
task: (input) => input,
1728-
classifiers: [() => ({}) as never],
1729-
},
1730-
{ noSendLogs: true, returnResults: true },
1731-
);
1732-
1733-
expect(result.results).toHaveLength(1);
1734-
expect((result.results[0] as any).metadata?.classifier_errors).toMatchObject({
1735-
classifier_0: expect.stringMatching(
1736-
/must return classifications with a non-empty string name/,
1737-
),
1738-
});
1739-
});

0 commit comments

Comments
 (0)