Skip to content

Commit c39a9dc

Browse files
committed
switch to bt
1 parent c0dd75b commit c39a9dc

7 files changed

Lines changed: 326 additions & 164 deletions

File tree

action.yml

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,35 @@ inputs:
2121
required: false
2222
default: "."
2323
runtime:
24-
description: "The runtime to use for evals. Valid values: node, python."
25-
required: true
24+
description:
25+
"Deprecated. Use 'runner' instead. The runtime to use for evals. Valid
26+
values: node, python."
27+
required: false
28+
default: ""
2629
package_manager:
2730
description:
28-
"The package manager to use for evals. Valid values: npm, pnpm, yarn, pip,
29-
or uv depending on the runtime."
31+
"Deprecated. Use 'runner' instead. The package manager to use for evals."
32+
required: false
33+
default: ""
34+
runner:
35+
description:
36+
"The eval runner binary to use (e.g. tsx, vite-node, ts-node, bun,
37+
python3). When omitted, bt auto-detects from file extensions. Overrides
38+
the deprecated 'runtime' and 'package_manager' inputs."
39+
required: false
40+
default: ""
41+
filter:
42+
description:
43+
"Comma-separated filter expression(s) used to select which evaluators to
44+
run."
45+
required: false
46+
default: ""
47+
bt_version:
48+
description:
49+
"Version of the bt CLI to install. Leave empty for the latest stable
50+
release. Pass a semver (e.g. '0.2.0') to pin a stable version. Pass a
51+
full release tag (e.g. 'canary-add-glob-support') to install a canary
52+
build. Note: bt eval is currently Unix (Linux/macOS) only."
3053
required: false
3154
default: ""
3255
use_proxy:

eval/dist/index.js

Lines changed: 38 additions & 37 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

eval/dist/index.js.map

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

eval/package.json

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,6 @@
6969
"dependencies": {
7070
"@actions/core": "^1.10.1",
7171
"@actions/github": "^6.0.0",
72-
"@braintrust/core": "^0.0.34",
73-
"braintrust": "^0.0.124",
7472
"zod": "^3.23.4"
7573
},
7674
"devDependencies": {

eval/src/braintrust.test.ts

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import { spawn } from "child_process";
2+
import * as core from "@actions/core";
3+
4+
// We'll test the actual spawning behavior with real commands
5+
/**
 * Spawns real commands and checks that stdout/stderr `Buffer` chunks are
 * decoded to text correctly.
 *
 * Each test collects the raw chunks and decodes them once the child closes, so
 * the assertions do not depend on how the OS happens to split the output into
 * `data` events. Every child also gets an `error` listener so a spawn failure
 * rejects immediately instead of hanging until the Jest timeout.
 */
describe("braintrust Buffer handling", () => {
  beforeEach(() => {
    // Silence the core.info and core.error logs during tests
    jest.spyOn(core, "info").mockImplementation();
    jest.spyOn(core, "error").mockImplementation();
  });

  afterEach(() => {
    jest.restoreAllMocks();
  });

  it("should handle Buffer output from echo command", async () => {
    const output = await new Promise<string>((resolve, reject) => {
      const chunks: Buffer[] = [];
      // Named `child` so the Node global `process` is not shadowed.
      const child = spawn("echo", ["test output"], { shell: true });

      child.stdout?.on("data", (data: Buffer) => {
        chunks.push(data);
      });

      child.on("error", reject);

      child.on("close", code => {
        if (code === 0) {
          // Convert the collected Buffers to a string in one go.
          resolve(Buffer.concat(chunks).toString());
        } else {
          reject(new Error(`Process failed with code ${code}`));
        }
      });
    });

    expect(output).toContain("test output");
  });

  it("should handle multiline Buffer output", async () => {
    const testJson = { test_field: "value", score: 0.95 };
    const jsonString = JSON.stringify(testJson);

    const result = await new Promise<unknown[]>((resolve, reject) => {
      const chunks: Buffer[] = [];
      // Use printf to output exactly what we want without extra newline
      const child = spawn("printf", [`'${jsonString}\\n'`], { shell: true });

      child.stdout?.on("data", (data: Buffer) => {
        chunks.push(data);
      });

      child.on("error", reject);

      child.on("close", code => {
        if (code !== 0) {
          reject(new Error(`Process failed with code ${code}`));
          return;
        }

        // Parse only after all output has arrived, so a JSON line that was
        // split across two chunks is still seen as one line.
        const parsed: unknown[] = [];
        for (const line of Buffer.concat(chunks).toString().split("\n")) {
          const trimmed = line.trim();
          if (trimmed.length === 0) {
            continue;
          }
          try {
            parsed.push(JSON.parse(trimmed));
          } catch {
            // Skip non-JSON lines
          }
        }
        resolve(parsed);
      });
    });

    expect(result).toHaveLength(1);
    expect(result[0]).toEqual(testJson);
  });

  it("should handle stderr as Buffer", async () => {
    const stderrOutput = await new Promise<string>((resolve, reject) => {
      const chunks: Buffer[] = [];
      // Command that writes to stderr
      const child = spawn("sh", ["-c", "echo 'error message' >&2"], {
        shell: false,
      });

      child.stderr?.on("data", (data: Buffer) => {
        chunks.push(data);
      });

      child.on("error", reject);

      // The exit code is deliberately ignored: only the stderr text matters.
      child.on("close", () => {
        resolve(Buffer.concat(chunks).toString());
      });
    });

    expect(stderrOutput).toContain("error message");
  });
});

eval/src/braintrust.ts

Lines changed: 131 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,38 @@
1+
import os from "os";
12
import path from "path";
23
import * as core from "@actions/core";
34
import { spawn } from "child_process";
45

56
import { Params } from "./main";
6-
import { ExperimentSummary } from "braintrust";
7+
8+
/**
 * Summary of one named score inside an {@link ExperimentSummary}.
 *
 * NOTE(review): the per-field descriptions are inferred from the field names;
 * confirm them against the JSONL that `bt eval --jsonl` actually emits.
 */
export interface ScoreSummary {
  /** Name of the score. */
  name: string;
  /** Score value for this experiment. */
  score: number;
  /** Presumably the change versus the comparison experiment; absent when there is none — TODO confirm. */
  diff?: number;
  /** Presumably the count of cases that improved — TODO confirm. */
  improvements: number;
  /** Presumably the count of cases that regressed — TODO confirm. */
  regressions: number;
}
15+
16+
/**
 * Summary of one named metric inside an {@link ExperimentSummary}.
 *
 * NOTE(review): the per-field descriptions are inferred from the field names;
 * confirm them against the JSONL that `bt eval --jsonl` actually emits.
 */
export interface MetricSummary {
  /** Name of the metric. */
  name: string;
  /** Metric value for this experiment. */
  metric: number;
  /** Unit the metric value is expressed in. */
  unit: string;
  /** Presumably the change versus the comparison experiment; absent when there is none — TODO confirm. */
  diff?: number;
  /** Presumably the count of cases that improved — TODO confirm. */
  improvements: number;
  /** Presumably the count of cases that regressed — TODO confirm. */
  regressions: number;
}
24+
25+
/**
 * Shape assumed for each JSON line the eval command prints on stdout.
 *
 * `runCommand` casts the result of `JSON.parse` to this type without any
 * runtime validation, so the fields are only as reliable as the CLI output.
 * The type is declared locally rather than imported from the `braintrust`
 * package.
 *
 * NOTE(review): which optional fields are actually populated is not visible
 * here; confirm against the JSONL that `bt eval --jsonl` emits.
 */
export interface ExperimentSummary {
  /** Name of the project the experiment belongs to. */
  projectName: string;
  /** Name of the experiment. */
  experimentName: string;
  /** Identifier of the project, when reported. */
  projectId?: string;
  /** Identifier of the experiment, when reported. */
  experimentId?: string;
  /** Link to the project, when reported. */
  projectUrl?: string;
  /** Link to the experiment, when reported. */
  experimentUrl?: string;
  /** Name of the experiment this one was compared against, when reported. */
  comparisonExperimentName?: string;
  /** Score summaries keyed by string (presumably the score name — TODO confirm). */
  scores: Record<string, ScoreSummary>;
  /** Metric summaries keyed by string (presumably the metric name — TODO confirm). */
  metrics?: Record<string, MetricSummary>;
}
736

837
export interface ExperimentFailure {
938
evaluatorName: string;
@@ -12,50 +41,79 @@ export interface ExperimentFailure {
1241

1342
type OnSummaryFn = (summary: (ExperimentSummary | ExperimentFailure)[]) => void;
1443

15-
function snakeToCamelCase(str: string) {
16-
return str.replace(/([-_][a-z])/g, group => group.charAt(1).toUpperCase());
44+
// Installs the bt CLI and adds its bin directory to PATH for the current
// process. version may be:
//   ""                                         → latest stable via https://bt.dev/cli/install.sh
//   semver like "0.2.0"                        → pinned stable via the same script with --version
//   release tag like "canary-add-glob-support" → canary installer from GH release
//
// Surrounding whitespace in version is ignored. The value comes from a
// workflow input and ends up inside a shell pipeline, so it is always
// single-quoted before being spliced into the command.
//
// Rejects with the error from runCommand when the installer pipeline exits
// with a non-zero code.
async function installBt(version: string): Promise<void> {
  const requested = version.trim();

  // POSIX single-quote escaping: close the quote, emit an escaped quote,
  // reopen. Nothing inside single quotes is interpreted by sh.
  const shellQuote = (value: string): string =>
    `'${value.replace(/'/g, "'\\''")}'`;

  // Anything that does not start with MAJOR.MINOR.PATCH is treated as a
  // release tag and installed from the matching GitHub release.
  const isCanary = requested !== "" && !/^\d+\.\d+\.\d+/.test(requested);

  let installCmd: string;
  if (isCanary) {
    const installerUrl = `https://github.com/braintrustdata/bt/releases/download/${requested}/bt-installer.sh`;
    installCmd = `curl -fsSL ${shellQuote(installerUrl)} | sh`;
  } else if (requested !== "") {
    installCmd = `curl -fsSL https://bt.dev/cli/install.sh | sh -s -- --version ${shellQuote(requested)}`;
  } else {
    installCmd = `curl -fsSL https://bt.dev/cli/install.sh | sh`;
  }

  core.info(`Installing bt CLI: ${installCmd}`);
  // The installer emits no summaries, so the callback is a no-op.
  await runCommand(installCmd, () => {});

  // The installer puts the binary in ~/.local/bin (or $XDG_BIN_HOME).
  // Make sure the spawned child processes can find it.
  const localBin = path.join(os.homedir(), ".local", "bin");
  const xdgBin = process.env.XDG_BIN_HOME ?? "";

  // Compare whole PATH entries rather than substrings, so a directory such as
  // "~/.local/bin-old" does not mask a missing "~/.local/bin". An unset or
  // empty PATH starts from an empty list instead of the string "undefined".
  const currentPath = process.env.PATH;
  const entries = currentPath ? currentPath.split(path.delimiter) : [];
  for (const dir of [xdgBin, localBin]) {
    if (dir && !entries.includes(dir)) {
      entries.unshift(dir);
    }
  }
  process.env.PATH = entries.join(path.delimiter);
}
1874

19-
async function runCommand(command: string, onSummary: OnSummaryFn) {
75+
async function runCommand(
76+
command: string,
77+
onSummary: OnSummaryFn,
78+
): Promise<string> {
2079
core.info(`> $ ${command}`);
2180
return new Promise((resolve, reject) => {
22-
const process = spawn(command, { shell: true });
23-
24-
process.stdout?.on("data", (data: Buffer) => {
25-
onSummary(
26-
data
27-
.toString()
28-
.split("\n")
29-
.map(line => line.trim())
30-
.filter(line => line.length > 0)
31-
.flatMap(line => {
32-
try {
33-
const parsedLine = JSON.parse(line);
34-
const camelCaseLine = Object.fromEntries(
35-
Object.entries(parsedLine).map(([key, value]) => [
36-
snakeToCamelCase(key),
37-
value,
38-
]),
39-
);
40-
// TODO: This is hacky and we should be parsing what comes off the wire
41-
return [camelCaseLine as unknown as ExperimentSummary];
42-
} catch (e) {
43-
core.error(`Failed to parse jsonl data: ${e}`);
44-
return [];
45-
}
46-
}),
47-
);
81+
const stderrChunks: string[] = [];
82+
83+
const child = spawn(command, { shell: true });
84+
85+
child.stdout?.on("data", (data: Buffer) => {
86+
data
87+
.toString()
88+
.split("\n")
89+
.map(line => line.trim())
90+
.filter(line => line.length > 0)
91+
.forEach(line => {
92+
try {
93+
const parsed = JSON.parse(line) as ExperimentSummary;
94+
onSummary([parsed]);
95+
} catch (e) {
96+
core.error(`Failed to parse jsonl data: ${e}`);
97+
}
98+
});
4899
});
49100

50-
process.stderr?.on("data", (data: Buffer) => {
51-
core.info(data.toString()); // Outputs the stderr of the command
101+
child.stderr?.on("data", (data: Buffer) => {
102+
const text = data.toString();
103+
stderrChunks.push(text);
104+
core.info(text);
52105
});
53106

54-
process.on("close", code => {
107+
child.on("close", code => {
55108
if (code === 0) {
56-
resolve(null);
109+
resolve(stderrChunks.join(""));
57110
} else {
58-
reject(new Error(`Command failed with exit code ${code}`));
111+
reject(
112+
Object.assign(
113+
new Error(`Command failed with exit code ${code}`),
114+
{ stderr: stderrChunks.join("") },
115+
),
116+
);
59117
}
60118
});
61119
});
@@ -64,7 +122,6 @@ async function runCommand(command: string, onSummary: OnSummaryFn) {
64122
export async function runEval(args: Params, onSummary: OnSummaryFn) {
65123
const { api_key, root, paths, terminate_on_failure } = args;
66124

67-
// Add the API key to the environment
68125
core.exportVariable("BRAINTRUST_API_KEY", api_key);
69126

70127
if (!process.env.OPENAI_API_KEY) {
@@ -75,43 +132,46 @@ export async function runEval(args: Params, onSummary: OnSummaryFn) {
75132
core.exportVariable("OPENAI_BASE_URL", "https://braintrustproxy.com/v1");
76133
}
77134

78-
// Change working directory
135+
await installBt(args.bt_version);
136+
79137
process.chdir(path.resolve(root));
80138

81-
const terminateFlag = terminate_on_failure ? "--terminate-on-failure" : "";
82-
83-
const baseCommand = (() => {
84-
switch (args.runtime.toLowerCase().trim()) {
85-
case "node":
86-
switch (args.package_manager) {
87-
case "":
88-
case "npm":
89-
return "npx braintrust";
90-
case "pnpm":
91-
return "pnpm dlx braintrust";
92-
default:
93-
throw new Error(
94-
`Unsupported package manager: ${args.package_manager}`,
95-
);
96-
}
97-
case "python":
98-
switch ((args.package_manager || "").toLowerCase().trim()) {
99-
case "":
100-
case "pip":
101-
return `braintrust`;
102-
case "uv":
103-
return `uv run braintrust`;
104-
default:
105-
throw new Error(
106-
`Unsupported package manager: ${args.package_manager}`,
107-
);
108-
}
109-
default:
110-
throw new Error(`Unsupported runtime: ${args.runtime}`);
111-
}
112-
})();
139+
// Build bt eval flags
140+
const flags: string[] = ["--jsonl"];
141+
142+
if (terminate_on_failure) {
143+
flags.push("--terminate-on-failure");
144+
}
113145

114-
const command = `${baseCommand} eval --jsonl ${terminateFlag} ${paths}`;
146+
// --runner: explicit input takes precedence; fall back to deriving --language
147+
// from the deprecated runtime input so existing configs keep working.
148+
if (args.runner) {
149+
flags.push(`--runner ${args.runner}`);
150+
} else if (args.runtime === "python") {
151+
flags.push("--language python");
152+
} else if (args.runtime === "node") {
153+
flags.push("--language js");
154+
}
155+
156+
if (args.filter) {
157+
flags.push(`--filter ${args.filter}`);
158+
}
115159

116-
await runCommand(command, onSummary);
160+
const command = `bt eval ${flags.join(" ")} ${paths}`;
161+
162+
try {
163+
await runCommand(command, onSummary);
164+
} catch (err: any) {
165+
// Surface stderr as a structured failure so the PR comment can show details.
166+
const stderr: string = err?.stderr ?? "";
167+
if (stderr) {
168+
onSummary([
169+
{
170+
evaluatorName: "eval",
171+
errors: stderr.split("\n").filter((l: string) => l.trim()),
172+
},
173+
]);
174+
}
175+
throw err;
176+
}
117177
}

0 commit comments

Comments
 (0)