diff --git a/internal/agent/agent.go b/internal/agent/agent.go index 3bb67968..4be6504d 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "runtime/debug" "strings" "sync" "sync/atomic" @@ -347,6 +348,21 @@ func (a *Agent) dispatchSubtasks(ctx context.Context) ([]model.LlmComment, error go func(d model.Diff) { defer wg.Done() defer func() { <-sem }() // release + // A panic while reviewing one file must be isolated exactly like an + // error return: counted in subtaskFailed and recorded as a + // subtask_error warning, so other files still complete and the + // all-failed rollup below stays correct. Registered before the + // timeout-cancel defer, so cancel() still runs first on unwind and + // fileCtx is already cancelled here — use the parent ctx for telemetry. + defer func() { + if r := recover(); r != nil { + atomic.AddInt64(&a.subtaskFailed, 1) + fmt.Fprintf(stdout.Writer(), "[ocr] Subtask panic for %s: %v\n%s\n", d.NewPath, r, debug.Stack()) + telemetry.ErrorEvent(ctx, "subtask.panic", fmt.Errorf("panic: %v", r), + telemetry.AnyToAttr("file.path", d.NewPath)) + a.recordWarning("subtask_error", d.NewPath, fmt.Sprintf("panic: %v", r)) + } + }() var fileCtx context.Context var cancel context.CancelFunc diff --git a/internal/llmloop/pool.go b/internal/llmloop/pool.go index 3df2ea58..a3d3f6f6 100644 --- a/internal/llmloop/pool.go +++ b/internal/llmloop/pool.go @@ -9,6 +9,7 @@ package llmloop import ( "fmt" + "runtime/debug" "sync" "github.com/open-code-review/open-code-review/internal/model" @@ -55,6 +56,14 @@ func (p *CommentWorkerPool) Submit(f func() ([]model.LlmComment, error)) { p.wg.Go(func() { p.semaphore <- struct{}{} defer func() { <-p.semaphore }() + // Contain a panic in the submitted work so one bad unit of work cannot + // crash the whole process. The work that panics contributes no comments; + // the semaphore is still released via the defer above. + defer func() { + if r := recover(); r != nil { + fmt.Fprintf(stdout.Writer(), "[ocr] CommentWorkerPool panic: %v\n%s\n", r, debug.Stack()) + } + }() comments, err := f() if err != nil { @@ -68,6 +77,16 @@ func (p *CommentWorkerPool) Submit(f func() ([]model.LlmComment, error)) { // Await blocks until all submitted work has completed and returns // aggregated results from every Submit call so far. +// +// A panic in submitted work is recovered and logged inside Submit (see the +// recover defer there) but is not surfaced here as an error or reflected in +// the returned count — a unit that panics contributes no comments and is +// indistinguishable from one that produced zero. +// +// Concurrency contract: Await must not run concurrently with Submit. Submit +// calls wg.Go (which does wg.Add(1) synchronously), so a Submit racing Await +// would risk sync.WaitGroup's "Add called concurrently with Wait" panic. +// Callers must ensure every Submit has returned before calling Await. func (p *CommentWorkerPool) Await() []model.LlmComment { p.wg.Wait() return p.results diff --git a/internal/llmloop/pool_test.go b/internal/llmloop/pool_test.go index 1db6bbe2..26380952 100644 --- a/internal/llmloop/pool_test.go +++ b/internal/llmloop/pool_test.go @@ -97,3 +97,24 @@ func TestCommentWorkerPool_AwaitEmpty(t *testing.T) { t.Errorf("expected nil for no submissions, got %v", results) } } + +func TestCommentWorkerPool_PanicIsIsolated(t *testing.T) { + p := NewCommentWorkerPool(2) + + p.Submit(func() ([]model.LlmComment, error) { + panic("boom in submitted work") + }) + p.Submit(func() ([]model.LlmComment, error) { + return []model.LlmComment{{Path: "healthy.go", Content: "fine"}}, nil + }) + + // Await must not crash: the recovered panic contributes no comments, and the + // healthy task's result is still collected. + results := p.Await() + if len(results) != 1 { + t.Fatalf("expected 1 result after a panicking task, got %d", len(results)) + } + if results[0].Path != "healthy.go" { + t.Errorf("Path = %q, want healthy.go", results[0].Path) + } +}