From 53cf0ce41fa4bc3722892bd8b5cfec3af80c01ad Mon Sep 17 00:00:00 2001 From: YaroShkvorets Date: Wed, 27 May 2026 16:29:32 -0400 Subject: [PATCH 1/2] fix: surface per-block execution timeouts instead of silently swallowing them When a WASM host function (e.g. eth_call) panics in wasmtime because the per-block execution context's deadline has fired, the panic propagates up to Pipeline.execute's deferred recover, which re-panics with "unknown error: %s" (chain broken) up to executeModules's deferred recover, which calls recoverExecutionPanic. recoverExecutionPanic had two bugs that combined to silently drop the offending block: 1. The DeadlineExceeded handler was dead code: the catch-all `if ctx.Err() != nil || errors.Is(recoveredErr, context.Canceled)` above it matched DeadlineExceeded too, so the deadline-specific branch below was never reached. The function would return `executionError` (typically nil), making executeModules return nil and the pipeline move on to the next block as if nothing happened. 2. Pipeline.execute's re-panic used `%s` instead of `%w` when wrapping the recovered error, severing the wrap chain so callers couldn't `errors.Is(err, context.DeadlineExceeded)` even when the swallow condition was avoided. Fix: - Check `errors.Is(ctx.Err(), context.DeadlineExceeded)` first so the intended `CodeDeadlineExceeded` connect error is returned. The catch-all for Canceled remains below. - Wrap the recovered error with `%w` on re-panic so the deadline information survives all the way to the gRPC error mapper. Bug introduced in commit 5b3f59b2 (2025-10-21, "Improved panic handling and make clearer the intent when throwing errors in the WASM handlers"), present in v1.17.0 onward. Affects wasmtime (default runtime for Rust-built spkgs); wazero is unaffected because it converts host panics to error returns at f.Call(), bypassing the recover path. Co-Authored-By: Claude Opus 4.7 --- docs/release-notes/change-log.md | 1 + pipeline/process_block.go | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/release-notes/change-log.md b/docs/release-notes/change-log.md index aa6854c32..7546dcdf9 100644 --- a/docs/release-notes/change-log.md +++ b/docs/release-notes/change-log.md @@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Fixed +- Server: per-block execution timeouts (`--substreams-block-execution-timeout`) are no longer silently swallowed when a WASM host-function panic (e.g. wasmtime) coincides with the deadline. Previously, `recoverExecutionPanic` would return `nil` instead of `CodeDeadlineExceeded`, causing the offending block to be skipped and the stream to complete successfully. - CI: Docker image login, build and push are now skipped for fork PRs; image is still built (without push) to validate the Dockerfile. ## v1.18.5 diff --git a/pipeline/process_block.go b/pipeline/process_block.go index 3ae782dd2..1f458910d 100644 --- a/pipeline/process_block.go +++ b/pipeline/process_block.go @@ -663,16 +663,18 @@ func recoverExecutionPanic(ctx context.Context, executionError error, recovered recoveredErr = fmt.Errorf("%v", recovered) } + // If the context deadline was exceeded, return a deadline exceeded error RPC error. + // This must be checked before context.Canceled, because the catch-all `ctx.Err() != nil` + // below would otherwise swallow deadline-exceeded panics and silently drop the block. + if errors.Is(ctx.Err(), context.DeadlineExceeded) { + return connect.NewError(connect.CodeDeadlineExceeded, fmt.Errorf("execution timed out at block %s: %w", blockRef, recoveredErr)) + } + // If recovering from context cancel, simply return the execution error as-is if ctx.Err() != nil || errors.Is(recoveredErr, context.Canceled) { return executionError } - // If the context deadline was exceeded, return a deadline exceeded error RPC error - if ctx.Err() == context.DeadlineExceeded { - return connect.NewError(connect.CodeDeadlineExceeded, fmt.Errorf("execution timed out at block %s: %w", blockRef, recoveredErr)) - } - // Otherwise, log the panic and return a generic error if PrintStack { debug.PrintStack() @@ -721,8 +723,10 @@ func (p *Pipeline) execute(ctx context.Context, executor exec.ModuleExecutor, ex return } - // Move the panic up so it's handled at a higher level by those who call execute() - panic(fmt.Errorf("unknown error: %s", r)) + // Move the panic up so it's handled at a higher level by those who call execute(). + // Wrap with %w (not %s) to preserve the error chain so that callers can + // errors.Is(err, context.DeadlineExceeded) on the result. + panic(fmt.Errorf("unknown error: %w", recoveredErr)) } }() From e4d38d93faf18120def142305b4df6976e4da355 Mon Sep 17 00:00:00 2001 From: YaroShkvorets Date: Wed, 27 May 2026 16:36:07 -0400 Subject: [PATCH 2/2] test: add regression coverage for recoverExecutionPanic Five focused unit tests covering each branch of recoverExecutionPanic. The deadline-exceeded test asserts: - result is non-nil (catches the silent-swallow regression directly) - result is a connect.Error with CodeDeadlineExceeded - the original context.DeadlineExceeded survives the wrap chain so callers can errors.Is() on it - the block ref is in the message Verified: this test fails on the pre-fix code with the exact assertion "deadline-exceeded panic must not be silently swallowed", and passes with the fix. Co-Authored-By: Claude Opus 4.7 --- pipeline/process_block_recover_test.go | 99 ++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 pipeline/process_block_recover_test.go diff --git a/pipeline/process_block_recover_test.go b/pipeline/process_block_recover_test.go new file mode 100644 index 000000000..9c1f874fc --- /dev/null +++ b/pipeline/process_block_recover_test.go @@ -0,0 +1,99 @@ +package pipeline + +import ( + "context" + "errors" + "fmt" + "testing" + "time" + + "connectrpc.com/connect" + "github.com/streamingfast/bstream" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestRecoverExecutionPanic_DeadlineExceeded is the regression test for the +// silent-swallow bug: when a WASM host-function panic coincides with the +// per-block execution deadline, recoverExecutionPanic must return a +// CodeDeadlineExceeded connect error — not nil — so the stream terminates +// instead of skipping the block. +func TestRecoverExecutionPanic_DeadlineExceeded(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) + defer cancel() + // Wait for the deadline to fire. + <-ctx.Done() + require.ErrorIs(t, ctx.Err(), context.DeadlineExceeded) + + blockRef := bstream.NewBlockRef("blk-id", 42) + recovered := fmt.Errorf("running wasm extension %q: %w", "eth::eth_call", + fmt.Errorf("timeout while doing eth_call: %w", context.DeadlineExceeded)) + + got := recoverExecutionPanic(ctx, nil, recovered, blockRef) + + require.NotNil(t, got, "deadline-exceeded panic must not be silently swallowed") + + var ce *connect.Error + require.ErrorAs(t, got, &ce, "result must be a connect.Error") + assert.Equal(t, connect.CodeDeadlineExceeded, ce.Code()) + + // The original error chain must be preserved so callers can introspect. + assert.ErrorIs(t, got, context.DeadlineExceeded) + assert.Contains(t, got.Error(), "blk-id") +} + +// TestRecoverExecutionPanic_ContextCanceled covers the non-deadline cancel +// path: the recover must return the caller's executionError as-is (we don't +// invent a new error when the request was cancelled). +func TestRecoverExecutionPanic_ContextCanceled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + require.ErrorIs(t, ctx.Err(), context.Canceled) + + blockRef := bstream.NewBlockRef("blk-id", 42) + executionErr := errors.New("some prior error") + + got := recoverExecutionPanic(ctx, executionErr, errors.New("panicked"), blockRef) + assert.Same(t, executionErr, got, "canceled ctx should return executionError unchanged") +} + +// TestRecoverExecutionPanic_RecoveredIsCanceled covers the case where ctx +// itself is alive but the panic value contains context.Canceled (e.g. +// propagated through wrapping). +func TestRecoverExecutionPanic_RecoveredIsCanceled(t *testing.T) { + ctx := context.Background() + blockRef := bstream.NewBlockRef("blk-id", 42) + executionErr := errors.New("some prior error") + + recovered := fmt.Errorf("wrapped: %w", context.Canceled) + got := recoverExecutionPanic(ctx, executionErr, recovered, blockRef) + assert.Same(t, executionErr, got) +} + +// TestRecoverExecutionPanic_GenericPanic verifies the fallback path: a real +// runtime panic with a live ctx should yield a "panic at block ..." error +// that wraps the recovered value. +func TestRecoverExecutionPanic_GenericPanic(t *testing.T) { + ctx := context.Background() + blockRef := bstream.NewBlockRef("blk-id", 42) + + recovered := errors.New("nil pointer dereference") + got := recoverExecutionPanic(ctx, nil, recovered, blockRef) + + require.NotNil(t, got) + assert.ErrorIs(t, got, recovered) + assert.Contains(t, got.Error(), "blk-id") +} + +// TestRecoverExecutionPanic_NonErrorRecovered handles the case where the +// panic value isn't an `error` (e.g. a plain string panic from somewhere). +func TestRecoverExecutionPanic_NonErrorRecovered(t *testing.T) { + ctx := context.Background() + blockRef := bstream.NewBlockRef("blk-id", 42) + + got := recoverExecutionPanic(ctx, nil, "something broke", blockRef) + + require.NotNil(t, got) + assert.Contains(t, got.Error(), "something broke") + assert.Contains(t, got.Error(), "blk-id") +}