Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 62 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,11 @@ If you prefer explicit control, you can add tracing middleware manually to your

## Evaluations

Run [evals](https://www.braintrust.dev/docs/guides/evals) with custom test cases and scoring functions:
Run [evals](https://www.braintrust.dev/docs/guides/evals) with custom test cases and scoring functions.

### Define and run

Define an eval once with its task and scorers, then run it against any dataset:

```go
package main
Expand Down Expand Up @@ -135,16 +139,9 @@ func main() {
log.Fatal(err)
}

// Create an evaluator with your task's input and output types
evaluator := braintrust.NewEvaluator[string, string](client)

// Run an evaluation
_, err = evaluator.Run(ctx, eval.Opts[string, string]{
Experiment: "greeting-experiment",
Dataset: eval.NewDataset([]eval.Case[string, string]{
{Input: "World", Expected: "Hello World"},
{Input: "Alice", Expected: "Hello Alice"},
}),
// Create an eval
e := braintrust.NewEval(client, &eval.Eval[string, string]{
Name: "greeting-experiment",
Task: eval.T(func(ctx context.Context, input string) (string, error) {
return "Hello " + input, nil
}),
Expand All @@ -158,12 +155,64 @@ func main() {
}),
},
})

// Run against a dataset
_, err = e.Run(ctx, eval.RunOpts[string, string]{
Dataset: eval.NewDataset([]eval.Case[string, string]{
{Input: "World", Expected: "Hello World"},
{Input: "Alice", Expected: "Hello Alice"},
}),
})
if err != nil {
log.Fatal(err)
}
}
```

### Remote Eval Server

The same eval definition can be registered with a [remote eval server](https://www.braintrust.dev/docs/evaluate/remote-evals), letting you run evals from the Braintrust playground against code on your own infrastructure:

```go
package main

import (
"context"
"log"
"strings"

"github.com/braintrustdata/braintrust-sdk-go/eval"
"github.com/braintrustdata/braintrust-sdk-go/server"
)

func main() {
// Define the eval once
classify := &eval.Eval[string, string]{
Name: "classify",
Task: eval.T(func(ctx context.Context, input string) (string, error) {
return strings.ToUpper(input), nil
}),
Scorers: []eval.Scorer[string, string]{
eval.NewScorer("exact_match", func(ctx context.Context, r eval.TaskResult[string, string]) (eval.Scores, error) {
			if r.Output == r.Expected {
				return eval.S(1.0), nil
			}
			return eval.S(0.0), nil
}),
},
}

// Register with server for remote execution
srv := server.New(
server.WithAddress("localhost:8300"),
server.WithNoAuth(), // Remove for production
)
server.RegisterEval(srv, classify, server.RegisterEvalOpts{})

log.Fatal(srv.Start())
}
```

Then configure `http://localhost:8300` in your Braintrust project settings under **Remote evals**.

## API Client

Manage Braintrust resources programmatically:
Expand Down Expand Up @@ -244,10 +293,12 @@ Complete working examples are available in [`examples/`](./examples/):
- **[langchaingo](./examples/langchaingo/main.go)** - LangChainGo integration
- **[datasets](./examples/datasets/main.go)** - Using Braintrust datasets
- **[adk-go](./examples/adk/main.go)** - ADK integration
- **[eval-server](./examples/internal/eval-server/main.go)** - Remote eval server

## Features

- **Evaluations** - Systematic testing with custom scoring functions
- **Remote Eval Server** - Run evals from the Braintrust UI against your own code
- **Tracing** - Automatic instrumentation for major LLM providers
- **Datasets** - Manage and version evaluation datasets
- **Experiments** - Track versions and configurations
Expand Down
32 changes: 13 additions & 19 deletions client.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,30 +191,24 @@ func (c *Client) Tracer(name string, opts ...oteltrace.TracerOption) oteltrace.T
return c.tracerProvider.Tracer(name, opts...)
}

// NewEvaluator creates a new evaluator for running multiple evaluations with the same
// input and output types.
// NewEval creates a runnable [eval.Eval] by combining a client with an eval definition.
//
// Example:
//
// client, _ := braintrust.New(tp)
//
// // Create an evaluator for string → string evaluations
// evaluator := braintrust.NewEvaluator[string, string](client)
//
// // Run multiple evaluations
// result1, _ := evaluator.Run(ctx, eval.Opts[string, string]{
// Experiment: "test-1",
// Dataset: dataset1,
// Task: task1,
// Scorers: scorers,
// })
//
// result2, _ := evaluator.Run(ctx, eval.Opts[string, string]{
// Experiment: "test-2",
// Dataset: dataset2,
// Task: task2,
// Scorers: scorers,
// e := braintrust.NewEval(client, &eval.Eval[string, string]{
// Name: "classify",
// Task: task,
// Scorers: scorers,
// })
// result, _ := e.Run(ctx, eval.RunOpts[string, string]{Dataset: dataset})
func NewEval[I, R any](client *Client, e *eval.Eval[I, R]) *eval.Eval[I, R] {
	// Delegate to NewEvaluator so the evaluator wiring (session, tracer
	// provider, API client, default project) lives in exactly one place.
	return eval.NewEval(NewEvaluator[I, R](client), e)
}

// NewEvaluator creates a new evaluator for running evaluations with the same
// input and output types. The evaluator is wired to the client's session,
// tracer provider, API client, and default project name.
//
// Most callers can use [NewEval] instead, which combines an evaluator with an
// eval definition in one step.
func NewEvaluator[I, R any](client *Client) *eval.Evaluator[I, R] {
	return eval.NewEvaluator[I, R](client.session, client.tracerProvider, client.API(), client.config.DefaultProjectName)
}
Expand Down
Loading
Loading