From 66270cd57f9f412f6725b6f2a36703eb152751f7 Mon Sep 17 00:00:00 2001 From: Jason Allum <21417+jallum@users.noreply.github.com> Date: Thu, 21 May 2026 20:43:36 -0400 Subject: [PATCH] Bump retries, add backoff/jitter, fully reopen TreeFS on retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three attempts is not enough under sustained contention — the race can win the CAS check between Refresh and Commit on every attempt even when Refresh correctly observes the new ref. Bump retries to 12 and add exponential backoff with full jitter so colliding writers desynchronize. Also: rather than just Refresh (which reuses the same go-git Repository handle), fully reopen the TreeFS between attempts. Adds Repo.Reopen and Store.ReopenFS via a new Reopener interface. Defense in depth against any cached go-git state (packed-refs cache etc.) that Refresh might miss. --- cmd/bw/commit_retry.go | 44 ++++++++++--- cmd/bw/commit_retry_test.go | 127 ++++++++++++++++++++++++++++++++++++ internal/issue/issue.go | 24 +++++++ internal/repo/repo.go | 19 ++++++ 4 files changed, 206 insertions(+), 8 deletions(-) diff --git a/cmd/bw/commit_retry.go b/cmd/bw/commit_retry.go index 1a2922f0..ecd3130f 100644 --- a/cmd/bw/commit_retry.go +++ b/cmd/bw/commit_retry.go @@ -3,30 +3,45 @@ package main import ( "errors" "fmt" + "math/rand" + "time" "github.com/jallum/beadwork/internal/issue" "github.com/jallum/beadwork/internal/treefs" ) // commitMaxRetries bounds how many times mutating commands retry a commit -// after losing the CAS race against a concurrent writer. -const commitMaxRetries = 3 +// after losing the CAS race against a concurrent writer. Set generously +// because in real-world use (multiple agents, background sync) the race +// can be sustained over many attempts; 3 turned out to be too few. +const commitMaxRetries = 12 + +// commitBackoffBase is the base delay between retry attempts. The actual +// delay is roughly base * 2^(attempt-1) with jitter, capped at +// commitBackoffMax. Backoff desynchronizes colliding writers so the +// retry storm dissipates. +const commitBackoffBase = 5 * time.Millisecond + +// commitBackoffMax caps the per-attempt delay so a stuck retry loop +// finishes in bounded time. +const commitBackoffMax = 250 * time.Millisecond // commitWithRetry runs apply, then store.Commit(intent), retrying up to // maxRetries times when the commit fails with treefs.ErrRefMoved. Between -// attempts the store cache is cleared and the underlying TreeFS is -// refreshed so apply observes the latest ref state. +// attempts the store fully reopens its TreeFS (fresh go-git handle, no +// cached state) so apply observes the latest ref state, then waits a +// jittered backoff before re-running. // // apply must re-derive all state from the store on every invocation — -// values captured from a previous attempt are stale after Refresh and +// values captured from a previous attempt are stale after reopen and // must not be reused. func commitWithRetry(store *issue.Store, maxRetries int, apply func() (intent string, err error)) error { var commitErr error for attempt := 0; attempt < maxRetries; attempt++ { if attempt > 0 { - store.ClearCache() - if err := store.Refresh(); err != nil { - return fmt.Errorf("refresh after conflict: %w", err) + sleepBackoff(attempt) + if err := store.ReopenFS(); err != nil { + return fmt.Errorf("reopen after conflict: %w", err) } } intent, aerr := apply() @@ -43,3 +58,16 @@ func commitWithRetry(store *issue.Store, maxRetries int, apply func() (intent st } return fmt.Errorf("commit failed after %d attempts: %w", maxRetries, commitErr) } + +// sleepBackoff sleeps for an exponentially-growing, jittered duration. +// attempt is 1-based. +func sleepBackoff(attempt int) { + d := commitBackoffBase << (attempt - 1) + if d > commitBackoffMax || d <= 0 { + d = commitBackoffMax + } + // Full jitter: pick a duration in [0, d). Avoids thundering-herd + // retries when several writers collide at the same instant. + jittered := time.Duration(rand.Int63n(int64(d) + 1)) + time.Sleep(jittered) +} diff --git a/cmd/bw/commit_retry_test.go b/cmd/bw/commit_retry_test.go index 38eb4bec..a93395fa 100644 --- a/cmd/bw/commit_retry_test.go +++ b/cmd/bw/commit_retry_test.go @@ -3,6 +3,8 @@ package main import ( "errors" "fmt" + "os/exec" + "strings" "testing" "github.com/jallum/beadwork/internal/issue" @@ -10,6 +12,26 @@ import ( "github.com/jallum/beadwork/internal/treefs" ) +func run(t *testing.T, dir, name string, args ...string) { + t.Helper() + cmd := exec.Command(name, args...) + cmd.Dir = dir + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("%s %v: %s: %v", name, args, out, err) + } +} + +func runOut(t *testing.T, dir, name string, args ...string) string { + t.Helper() + cmd := exec.Command(name, args...) + cmd.Dir = dir + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("%s %v: %s: %v", name, args, out, err) + } + return string(out) +} + // TestCommitWithRetryRecoversFromRefMove proves the retry helper refreshes // and replays when another writer moves the ref between mutation and commit. func TestCommitWithRetryRecoversFromRefMove(t *testing.T) { @@ -96,6 +118,111 @@ func TestCommitWithRetryGivesUpAfterMax(t *testing.T) { } } +// TestCommitWithRetryRecoversFromRawGitRefMove proves the helper picks up +// state when the ref is moved by a raw git process (not go-git) — defending +// against any in-process caching the existing TreeFS might be holding. +// This is the closest analogue to the user's reported failure mode: a +// concurrent `bw` invocation in a different process advances the ref via +// the on-disk loose ref file. +func TestCommitWithRetryRecoversFromRawGitRefMove(t *testing.T) { + env := testutil.NewEnv(t) + defer env.Cleanup() + + iss, _ := env.Store.Create("To start", issue.CreateOpts{}) + if err := env.Repo.Commit("create " + iss.ID); err != nil { + t.Fatalf("seed commit: %v", err) + } + + // Build a fully-independent commit object that we can advance the ref + // to via raw git plumbing (no go-git involvement). We pre-create a + // dangling commit referencing the current ref's tree so the ref move + // is valid history. + cur := strings.TrimSpace(runOut(t, env.Dir, "git", "rev-parse", "refs/heads/beadwork")) + tree := strings.TrimSpace(runOut(t, env.Dir, "git", "rev-parse", "refs/heads/beadwork^{tree}")) + newHash := strings.TrimSpace(runOut(t, env.Dir, + "sh", "-c", + "GIT_AUTHOR_NAME=racer GIT_AUTHOR_EMAIL=r@r GIT_COMMITTER_NAME=racer GIT_COMMITTER_EMAIL=r@r "+ + "git commit-tree "+tree+" -p "+cur+" -m racer")) + if newHash == "" { + t.Fatal("failed to build racer commit") + } + + calls := 0 + err := commitWithRetry(env.Store, 3, func() (string, error) { + calls++ + if _, serr := env.Store.Start(iss.ID, "alice"); serr != nil { + return "", serr + } + if calls == 1 { + // Advance the ref via raw git on the filesystem. The + // in-process TreeFS's go-git Repository handle is now + // looking at a stale view of refs/heads/beadwork. + run(t, env.Dir, "git", "update-ref", "refs/heads/beadwork", newHash) + } + return "start " + iss.ID, nil + }) + if err != nil { + t.Fatalf("commitWithRetry: %v", err) + } + if calls < 2 { + t.Errorf("apply called %d times; expected at least 2 (one retry)", calls) + } + got, _ := env.Store.Get(iss.ID) + if got.Status != "in_progress" { + t.Errorf("final status = %q, want in_progress", got.Status) + } +} + +// TestCommitWithRetrySurvivesSustainedContention proves the helper can +// recover when the ref is being moved on every retry until eventually +// the racer stops. This is the closest analogue to the user-reported +// failure: "commit failed after 3 attempts" because the race kept +// winning. With more retries (and ideally backoff), we should succeed. +func TestCommitWithRetrySurvivesSustainedContention(t *testing.T) { + env := testutil.NewEnv(t) + defer env.Cleanup() + + iss, _ := env.Store.Create("To start", issue.CreateOpts{}) + if err := env.Repo.Commit("create " + iss.ID); err != nil { + t.Fatalf("seed commit: %v", err) + } + + racer, err := treefs.Open(env.Dir, "refs/heads/beadwork") + if err != nil { + t.Fatalf("open racer: %v", err) + } + + // Move the ref on the first 5 attempts; let the 6th succeed. + calls := 0 + err = commitWithRetry(env.Store, 10, func() (string, error) { + calls++ + if _, serr := env.Store.Start(iss.ID, "alice"); serr != nil { + return "", serr + } + if calls <= 5 { + racer.WriteFile(fmt.Sprintf("r%d.txt", calls), []byte("x")) + if cerr := racer.Commit(fmt.Sprintf("racer %d", calls)); cerr != nil { + return "", cerr + } + } + return "start " + iss.ID, nil + }) + if err != nil { + t.Fatalf("commitWithRetry: %v", err) + } + if calls < 6 { + t.Errorf("apply called %d times; expected at least 6", calls) + } +} + +// TestCommitMaxRetriesIsGenerous documents the chosen retry budget. +// Three was too few in practice; bump it. +func TestCommitMaxRetriesIsGenerous(t *testing.T) { + if commitMaxRetries < 10 { + t.Errorf("commitMaxRetries = %d, want >= 10 (got user-reported failure at 3)", commitMaxRetries) + } +} + // TestCommitWithRetryPropagatesNonRetryableErrors proves we don't loop on // errors unrelated to CAS. func TestCommitWithRetryPropagatesNonRetryableErrors(t *testing.T) { diff --git a/internal/issue/issue.go b/internal/issue/issue.go index 6ce1f00a..8d940201 100644 --- a/internal/issue/issue.go +++ b/internal/issue/issue.go @@ -91,6 +91,30 @@ func (s *Store) Refresh() error { return s.FS.Refresh() } +// Reopener is implemented by Committers that can fully reopen their +// underlying TreeFS (fresh git.PlainOpen, fresh storer). Used between +// retry attempts to discard any cached state. +type Reopener interface { + Reopen() (*treefs.TreeFS, error) +} + +// ReopenFS fully replaces the underlying TreeFS via the Committer's +// Reopener, then clears caches. Stronger than Refresh: discards any +// cached go-git state (packed-refs cache, etc.). Falls back to Refresh +// if the Committer does not implement Reopener. +func (s *Store) ReopenFS() error { + s.ClearCache() + if r, ok := s.Committer.(Reopener); ok { + tfs, err := r.Reopen() + if err != nil { + return err + } + s.FS = tfs + return nil + } + return s.FS.Refresh() +} + // Now returns the current time in UTC. If the BW_CLOCK environment variable // is set to an RFC3339 value, that fixed time is used instead of the real // clock. This enables deterministic timestamps for testing and migration. diff --git a/internal/repo/repo.go b/internal/repo/repo.go index 51f01a2a..ca80b20f 100644 --- a/internal/repo/repo.go +++ b/internal/repo/repo.go @@ -103,6 +103,25 @@ func (r *Repo) TreeFS() *treefs.TreeFS { return r.tfs } +// Reopen replaces the in-memory go-git repository handle and TreeFS with +// freshly-opened copies. Use this between retry attempts in concurrent +// scenarios so any cached state (packed-refs cache, etc.) is discarded +// and the next operation reads directly from disk. The new TreeFS is +// returned so callers (e.g. issue.Store) can swap their own pointer. +func (r *Repo) Reopen() (*treefs.TreeFS, error) { + repoDir := filepath.Dir(r.GitDir) + goRepo, err := openGitRepo(repoDir) + if err != nil { + return nil, fmt.Errorf("reopen repo: %w", err) + } + tfs, err := treefs.OpenFromRepo(goRepo, refLocal) + if err != nil { + return nil, fmt.Errorf("reopen treefs: %w", err) + } + r.tfs = tfs + return tfs, nil +} + // UserName reads user.name from the git config (local, then global, then // system) using go-git's ConfigScoped. Returns "unknown" if unset. func (r *Repo) UserName() string {