1+ import os from "os" ;
12import path from "path" ;
23import * as core from "@actions/core" ;
34import { spawn } from "child_process" ;
45
56import { Params } from "./main" ;
6- import { ExperimentSummary } from "braintrust" ;
7+
/**
 * Summary of a single score within an experiment.
 */
export interface ScoreSummary {
  /** Name of the score. */
  name: string;
  /** Value of the score. */
  score: number;
  /**
   * Change in the score; optional.
   * NOTE(review): presumably relative to the comparison experiment — confirm.
   */
  diff?: number;
  /** Number of improvements recorded for this score. */
  improvements: number;
  /** Number of regressions recorded for this score. */
  regressions: number;
}
15+
/**
 * Summary of a single metric within an experiment.
 */
export interface MetricSummary {
  /** Name of the metric. */
  name: string;
  /** Value of the metric, expressed in `unit`. */
  metric: number;
  /** Unit label for `metric`. */
  unit: string;
  /**
   * Change in the metric; optional.
   * NOTE(review): presumably relative to the comparison experiment — confirm.
   */
  diff?: number;
  /** Number of improvements recorded for this metric. */
  improvements: number;
  /** Number of regressions recorded for this metric. */
  regressions: number;
}
24+
/**
 * One experiment's summary, as parsed from a JSONL line of eval output.
 *
 * Declared locally rather than imported, so the field names must match the
 * keys that appear in the JSON exactly.
 */
export interface ExperimentSummary {
  /** Name of the project the experiment belongs to. */
  projectName: string;
  /** Name of the experiment. */
  experimentName: string;
  /** Project identifier, when provided. */
  projectId?: string;
  /** Experiment identifier, when provided. */
  experimentId?: string;
  /** Link to the project, when provided. */
  projectUrl?: string;
  /** Link to the experiment, when provided. */
  experimentUrl?: string;
  /** Name of the experiment this one is compared against, when provided. */
  comparisonExperimentName?: string;
  /** Score summaries keyed by string (presumably the score name — confirm). */
  scores: Record<string, ScoreSummary>;
  /** Metric summaries keyed by string (presumably the metric name — confirm). */
  metrics?: Record<string, MetricSummary>;
}
736
837export interface ExperimentFailure {
938 evaluatorName : string ;
@@ -12,50 +41,79 @@ export interface ExperimentFailure {
1241
/**
 * Callback that receives a batch of eval results; each entry is either a
 * successful experiment summary or a failure record.
 */
type OnSummaryFn = (
  summary: Array<ExperimentSummary | ExperimentFailure>,
) => void;
1443
/**
 * Installs the bt CLI and prepends its bin directory to PATH for the current
 * process, so child processes spawned afterwards can find the binary.
 *
 * `version` may be:
 *   - ""                                       → latest stable via https://bt.dev/cli/install.sh
 *   - semver like "0.2.0"                      → pinned stable via the same script with --version
 *   - release tag like "canary-add-glob-support" → canary installer from the GitHub release
 *
 * @param version Requested bt version or release tag; empty string for latest.
 * @returns Resolves once the installer has exited successfully and PATH is updated.
 * @throws Rejects with the error from `runCommand` when the install command
 *   exits with a non-zero code.
 */
async function installBt(version: string): Promise<void> {
  // Single-quote a value for POSIX sh so that spaces or shell metacharacters
  // in the user-supplied version cannot alter the command being run.
  const shellQuote = (value: string): string =>
    `'${value.replace(/'/g, "'\\''")}'`;

  const stableInstallerUrl = "https://bt.dev/cli/install.sh";

  // Anything non-empty that does not start with MAJOR.MINOR.PATCH is treated
  // as a release tag (canary build).
  const isCanary = version !== "" && !/^\d+\.\d+\.\d+/.test(version);

  // The installer's stdout is redirected to stderr (`1>&2`): runCommand parses
  // stdout as JSONL and would otherwise report every installer message as a
  // parse error. stderr is simply logged.
  // NOTE(review): with `curl | sh` the pipeline's exit status is that of `sh`,
  // so a failed download may not fail this step — confirm whether that matters.
  let installCmd: string;
  if (isCanary) {
    const canaryUrl = `https://github.com/braintrustdata/bt/releases/download/${version}/bt-installer.sh`;
    installCmd = `curl -fsSL ${shellQuote(canaryUrl)} | sh 1>&2`;
  } else if (version !== "") {
    installCmd = `curl -fsSL ${stableInstallerUrl} | sh -s -- --version ${shellQuote(version)} 1>&2`;
  } else {
    installCmd = `curl -fsSL ${stableInstallerUrl} | sh 1>&2`;
  }

  core.info(`Installing bt CLI: ${installCmd}`);
  await runCommand(installCmd, () => {});

  // The installer puts the binary in ~/.local/bin (or $XDG_BIN_HOME).
  // Make sure the spawned child processes can find it.
  const localBin = path.join(os.homedir(), ".local", "bin");
  const xdgBin = process.env.XDG_BIN_HOME ?? "";
  for (const dir of [xdgBin, localBin]) {
    if (!dir) {
      continue;
    }
    const currentPath = process.env.PATH ?? "";
    // Compare whole PATH entries; a substring test would wrongly treat
    // e.g. "~/.local/bin-old" as if "~/.local/bin" were already present.
    const alreadyPresent = currentPath.split(path.delimiter).includes(dir);
    if (!alreadyPresent) {
      process.env.PATH =
        currentPath === "" ? dir : `${dir}${path.delimiter}${currentPath}`;
    }
  }
}
1874
/**
 * Runs `command` through the shell, streaming its output.
 *
 * stdout is treated as JSONL: every non-empty line is parsed as JSON and
 * handed to `onSummary` as a one-element array. Lines that fail to parse are
 * reported with `core.error` and skipped. stderr is logged with `core.info`
 * and also accumulated.
 *
 * @param command Shell command line to execute.
 * @param onSummary Callback invoked once per successfully parsed stdout line.
 * @returns Resolves with the accumulated stderr text when the command exits
 *   with code 0.
 * @throws Rejects with an `Error` carrying a `stderr` string property when the
 *   command exits with a non-zero code or the process cannot be spawned.
 */
async function runCommand(
  command: string,
  onSummary: OnSummaryFn,
): Promise<string> {
  core.info(`> $ ${command}`);
  return new Promise((resolve, reject) => {
    const stderrChunks: string[] = [];
    // Holds the trailing partial line of stdout until its newline arrives.
    let pendingStdout = "";

    // Parses one stdout line and forwards it; blank lines are ignored.
    const handleLine = (rawLine: string): void => {
      const line = rawLine.trim();
      if (line.length === 0) {
        return;
      }
      try {
        // The shape is trusted as-is; no runtime validation is performed.
        const parsed = JSON.parse(line) as ExperimentSummary;
        onSummary([parsed]);
      } catch (e) {
        core.error(`Failed to parse jsonl data: ${e}`);
      }
    };

    const child = spawn(command, { shell: true });

    // Decode at the stream level so multi-byte UTF-8 characters that straddle
    // two chunks are not corrupted.
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (chunk: string) => {
      // Chunk boundaries are arbitrary: a JSON line may be split across two
      // "data" events, so only complete lines are parsed here.
      pendingStdout += chunk;
      const lines = pendingStdout.split("\n");
      pendingStdout = lines.pop() ?? "";
      lines.forEach(handleLine);
    });

    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (text: string) => {
      stderrChunks.push(text);
      core.info(text);
    });

    // Without this listener a spawn failure would leave the promise pending
    // and surface as an unhandled "error" event.
    child.on("error", err => {
      reject(Object.assign(err, { stderr: stderrChunks.join("") }));
    });

    child.on("close", code => {
      // Flush a final line that was not terminated by a newline.
      handleLine(pendingStdout);
      pendingStdout = "";

      if (code === 0) {
        resolve(stderrChunks.join(""));
      } else {
        reject(
          Object.assign(
            new Error(`Command failed with exit code ${code}`),
            { stderr: stderrChunks.join("") },
          ),
        );
      }
    });
  });
}
@@ -64,7 +122,6 @@ async function runCommand(command: string, onSummary: OnSummaryFn) {
64122export async function runEval ( args : Params , onSummary : OnSummaryFn ) {
65123 const { api_key, root, paths, terminate_on_failure } = args ;
66124
67- // Add the API key to the environment
68125 core . exportVariable ( "BRAINTRUST_API_KEY" , api_key ) ;
69126
70127 if ( ! process . env . OPENAI_API_KEY ) {
@@ -75,43 +132,46 @@ export async function runEval(args: Params, onSummary: OnSummaryFn) {
75132 core . exportVariable ( "OPENAI_BASE_URL" , "https://braintrustproxy.com/v1" ) ;
76133 }
77134
78- // Change working directory
135+ await installBt ( args . bt_version ) ;
136+
79137 process . chdir ( path . resolve ( root ) ) ;
80138
81- const terminateFlag = terminate_on_failure ? "--terminate-on-failure" : "" ;
82-
83- const baseCommand = ( ( ) => {
84- switch ( args . runtime . toLowerCase ( ) . trim ( ) ) {
85- case "node" :
86- switch ( args . package_manager ) {
87- case "" :
88- case "npm" :
89- return "npx braintrust" ;
90- case "pnpm" :
91- return "pnpm dlx braintrust" ;
92- default :
93- throw new Error (
94- `Unsupported package manager: ${ args . package_manager } ` ,
95- ) ;
96- }
97- case "python" :
98- switch ( ( args . package_manager || "" ) . toLowerCase ( ) . trim ( ) ) {
99- case "" :
100- case "pip" :
101- return `braintrust` ;
102- case "uv" :
103- return `uv run braintrust` ;
104- default :
105- throw new Error (
106- `Unsupported package manager: ${ args . package_manager } ` ,
107- ) ;
108- }
109- default :
110- throw new Error ( `Unsupported runtime: ${ args . runtime } ` ) ;
111- }
112- } ) ( ) ;
139+ // Build bt eval flags
140+ const flags : string [ ] = [ "--jsonl" ] ;
141+
142+ if ( terminate_on_failure ) {
143+ flags . push ( "--terminate-on-failure" ) ;
144+ }
113145
114- const command = `${ baseCommand } eval --jsonl ${ terminateFlag } ${ paths } ` ;
146+ // --runner: explicit input takes precedence; fall back to deriving --language
147+ // from the deprecated runtime input so existing configs keep working.
148+ if ( args . runner ) {
149+ flags . push ( `--runner ${ args . runner } ` ) ;
150+ } else if ( args . runtime === "python" ) {
151+ flags . push ( "--language python" ) ;
152+ } else if ( args . runtime === "node" ) {
153+ flags . push ( "--language js" ) ;
154+ }
155+
156+ if ( args . filter ) {
157+ flags . push ( `--filter ${ args . filter } ` ) ;
158+ }
115159
116- await runCommand ( command , onSummary ) ;
160+ const command = `bt eval ${ flags . join ( " " ) } ${ paths } ` ;
161+
162+ try {
163+ await runCommand ( command , onSummary ) ;
164+ } catch ( err : any ) {
165+ // Surface stderr as a structured failure so the PR comment can show details.
166+ const stderr : string = err ?. stderr ?? "" ;
167+ if ( stderr ) {
168+ onSummary ( [
169+ {
170+ evaluatorName : "eval" ,
171+ errors : stderr . split ( "\n" ) . filter ( ( l : string ) => l . trim ( ) ) ,
172+ } ,
173+ ] ) ;
174+ }
175+ throw err ;
176+ }
117177}
0 commit comments