From 762bcb5de55f5a5491745024d5fc7e0f70c97b42 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 10:10:13 -0700 Subject: [PATCH 01/47] Parallelize versioning3 functional suite --- tests/versioning_3_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 37d4d7ab2a..a275821dc2 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -79,8 +79,8 @@ type Versioning3Suite struct { } func TestVersioning3FunctionalSuite(t *testing.T) { - testcore.UseSuiteScopedCluster(t) //nolint:staticcheck // SA1019: suite still requires legacy sequential execution - parallelsuite.RunLegacySequential(t, &Versioning3Suite{}) //nolint:staticcheck // SA1019: suite still requires legacy sequential execution + testcore.UseSuiteScopedCluster(t) //nolint:staticcheck // SA1019: suite still requires legacy sequential execution + parallelsuite.Run(t, &Versioning3Suite{}) } func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { From 4cd2a0624281e81fc51f1d3b9995a1f14934ff95 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 10:10:21 -0700 Subject: [PATCH 02/47] Enable worker service for versioning3 parallel suite --- tests/versioning_3_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index a275821dc2..d02a2696d3 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -85,6 +85,7 @@ func TestVersioning3FunctionalSuite(t *testing.T) { func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { opts = append([]testcore.TestOption{ + testcore.WithWorkerService("worker deployment manager workflows"), testcore.WithDynamicConfig(dynamicconfig.MatchingDeploymentWorkflowVersion, int(versioning3DeploymentWorkflowVersion)), // Make sure we don't hit the rate limiter in tests From 12de8f0580a7dbeec46e2f81205f40a3e8ae125f Mon Sep 17 
00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 10:40:02 -0700 Subject: [PATCH 03/47] Wait for versioning3 registration in failing paths --- tests/versioning_3_test.go | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index d02a2696d3..84bb7457b1 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -379,13 +379,8 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, tv := env.Tv() // create version v1 and make it current - idlePollerDone := make(chan struct{}) - go func() { - s.idlePollWorkflow(env, s.Context(), tv, true, ver3MinPollTime, "should not have gotten any tasks since there are none") - close(idlePollerDone) - }() + s.pollUntilRegistered(env, tv) s.setCurrentDeployment(env, tv) - env.WaitForChannel(idlePollerDone) wftCompleted := make(chan struct{}) s.pollWftAndHandle(env, tv, false, wftCompleted, @@ -399,14 +394,9 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, s.verifyWorkflowVersioning(env, tv, vbPinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) // create version v2 and make it current which shall make v1 go from current -> draining/drained - idlePollerDone = make(chan struct{}) tv2 := tv.WithBuildIDNumber(2) - go func() { - s.idlePollWorkflow(env, s.Context(), tv2, true, ver3MinPollTime, "should not have gotten any tasks since there are none") - close(idlePollerDone) - }() + s.pollUntilRegistered(env, tv2) s.setCurrentDeployment(env, tv2) - env.WaitForChannel(idlePollerDone) // wait for v1 to become drained s.Await(func(s *Versioning3Suite) { @@ -2567,7 +2557,7 @@ func (s *Versioning3Suite) TestPinnedCaN_UseRampingVersionOnCaN_SubsequentWFTGoe func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return respondEmptyWft(tv1, false, vbPinned), nil }) - s.idlePollWorkflow(env, s.Context(), 
tv2, true, ver3MinPollTime, "should not get any tasks yet") + s.pollUntilRegistered(env, tv2) s.setRampingDeployment(env, tv2, 0, false) s.waitForDeploymentDataPropagation(env, tv2, versionStatusRamping, false, tqTypeWf) s.triggerNormalWFT(env, tv1, execution) From 898a7673efb914885b238b3b9d38f603f2a87fb8 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 11:03:50 -0700 Subject: [PATCH 04/47] Wait for pinned CaN ramping registration --- tests/versioning_3_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 84bb7457b1..2abd33dd45 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -2471,7 +2471,7 @@ func (s *Versioning3Suite) testPinnedCaNUseRampingVersionOnCaN(pinnedOverride, n if !noRampingVersion { // Register v2 poller before setting it as ramping - s.idlePollWorkflow(env, s.Context(), tv2, true, ver3MinPollTime, "should not get any tasks yet") + s.pollUntilRegistered(env, tv2) // Set v2 as ramping at 0%: no workflows move via hash, only via UseRampingVersion CaN s.setRampingDeployment(env, tv2, 0, false) s.waitForDeploymentDataPropagation(env, tv2, versionStatusRamping, false, tqTypeWf) From 8be59532c94afe2b431987f5899598b6e3234049 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 11:24:17 -0700 Subject: [PATCH 05/47] Wait for retry ramping registration --- tests/versioning_3_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 2abd33dd45..4cb54ac569 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -2654,7 +2654,7 @@ func (s *Versioning3Suite) TestPinnedCaN_UseRampingVersionOnCaN_RetryInheritsIni func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return respondEmptyWft(tv1, false, vbPinned), nil }) - s.idlePollWorkflow(env, s.Context(), tv2, 
true, ver3MinPollTime, "should not get any tasks yet") + s.pollUntilRegistered(env, tv2) s.setRampingDeployment(env, tv2, 0, false) s.waitForDeploymentDataPropagation(env, tv2, versionStatusRamping, false, tqTypeWf) s.triggerNormalWFT(env, tv1, execution) From e73f11ce69f74b0bd3963208909c5faf63e18c47 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 11:43:56 -0700 Subject: [PATCH 06/47] Extend versioning3 propagation wait --- tests/versioning_3_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 4cb54ac569..89f506599c 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -4414,7 +4414,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } s.Empty(remaining) - }, 30*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) validateBacklogCount( From 007c5c939d5ec3d4a6d14e14b86fe8fb9e01ad22 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 12:07:03 -0700 Subject: [PATCH 07/47] Wait for child ramping registration --- tests/versioning_3_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 89f506599c..78dae09b19 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -2731,7 +2731,7 @@ func (s *Versioning3Suite) TestPinnedCaN_UseRampingVersionOnCaN_ChildDoesNotInhe func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { return respondEmptyWft(tv1, false, vbPinned), nil }) - s.idlePollWorkflow(env, s.Context(), tv2, true, ver3MinPollTime, "should not get any tasks yet") + s.pollUntilRegistered(env, tv2) s.setRampingDeployment(env, tv2, 0, false) s.waitForDeploymentDataPropagation(env, tv2, versionStatusRamping, false, tqTypeWf) s.triggerNormalWFT(env, tv1, execution) From 
67494dd541d940687eb428cca04b9167187d3d80 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 12:30:21 -0700 Subject: [PATCH 08/47] Extend pinned version membership wait --- tests/versioning_3_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 78dae09b19..87c61c0edc 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -5398,7 +5398,7 @@ func (s *Versioning3Suite) validatePinnedVersionExistsInTaskQueue(env *testcore. }) s.NoError(err) s.True(resp.GetIsMember()) - }, 10*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } From df55d6b294b65bad7ead808df026524d9d20a587 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 12:57:40 -0700 Subject: [PATCH 09/47] Use registration helper for pinned no-poller test --- tests/versioning_3_test.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 87c61c0edc..fe0327e76f 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -128,15 +128,8 @@ func (s *Versioning3Suite) TestPinnedTask_NoProperPoller() { tv2 := tv.WithBuildIDNumber(2) go s.idlePollWorkflow(env, s.Context(), tv2, true, ver3MinPollTime, "second deployment should not receive pinned task") - // Start a versioned poller for the first version so that it registers the version in the task queue. - pollerCtx, cancelPoller := context.WithCancel(s.Context()) - go s.idlePollWorkflow(env, pollerCtx, tv, true, ver3MinPollTime, "first deployment should not receive any task. It is just creating a version in the task queue.") - - // Wait for the version to be present in the task queue - s.validatePinnedVersionExistsInTaskQueue(env, tv) - - // Cancel the poller after condition is met - cancelPoller() + // Register the first version before starting a workflow pinned to it. 
+ s.pollUntilRegistered(env, tv) s.startWorkflow(env, tv, tv.VersioningOverridePinned()) s.idlePollWorkflow(env, s.Context(), tv, false, ver3MinPollTime, "unversioned worker should not receive pinned task") From b9ce1f781dc0640ca227523aef87ff92a208a84e Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 13:21:20 -0700 Subject: [PATCH 10/47] Validate pinned membership after registration --- tests/versioning_3_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index fe0327e76f..05da3f937e 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -130,6 +130,7 @@ func (s *Versioning3Suite) TestPinnedTask_NoProperPoller() { // Register the first version before starting a workflow pinned to it. s.pollUntilRegistered(env, tv) + s.validatePinnedVersionExistsInTaskQueue(env, tv) s.startWorkflow(env, tv, tv.VersioningOverridePinned()) s.idlePollWorkflow(env, s.Context(), tv, false, ver3MinPollTime, "unversioned worker should not receive pinned task") From a6c53b88f9d18745edcad47414b586d6dc32d288 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 13:48:25 -0700 Subject: [PATCH 11/47] Register child version before inheritance test --- tests/versioning_3_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 05da3f937e..cdcfd022f5 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -1948,6 +1948,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo // v1 is current for both parent and child s.setCurrentDeployment(env, tv1) + s.pollUntilRegistered(env, tv1Child) startOpts := sdkclient.StartWorkflowOptions{ ID: tv1.WorkflowID(), From 77e4cba5dbcf3767c4a45289b128d04f3ce8e9fa Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 14:11:16 -0700 Subject: [PATCH 12/47] Extend versioning3 backlog count await --- 
tests/versioning_3_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index cdcfd022f5..4711640ca4 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -4418,13 +4418,13 @@ func (s *Versioning3Suite) validateBacklogCount( tqType enumspb.TaskQueueType, expectedCount int64, ) { - ctx, cancel := context.WithTimeout(s.Context(), 10*time.Second) - defer cancel() - var resp *workflowservice.DescribeTaskQueueResponse var err error s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), 10*time.Second) + defer cancel() + resp, err = env.FrontendClient().DescribeTaskQueue(ctx, &workflowservice.DescribeTaskQueueRequest{ Namespace: env.Namespace().String(), TaskQueue: tv.TaskQueue(), @@ -4436,7 +4436,7 @@ func (s *Versioning3Suite) validateBacklogCount( priorityStats, ok := resp.GetStatsByPriorityKey()[3] s.True(ok) s.Equal(expectedCount, priorityStats.GetApproximateBacklogCount()) - }, 6*time.Second, 500*time.Millisecond) + }, 30*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) verifyVersioningSAs( From 879a94fb11816933f9e7c67483630d966662ae1b Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 14:54:09 -0700 Subject: [PATCH 13/47] Strengthen versioning3 registration polling --- tests/versioning_3_test.go | 42 ++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 4711640ca4..71389954e6 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "strings" + "sync" "sync/atomic" "testing" "time" @@ -61,6 +62,7 @@ const ( vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 + ver3PollTimeout = 2 * time.Minute versionStatusNil = 
versionStatus(0) versionStatusInactive = versionStatus(1) @@ -1393,7 +1395,7 @@ func (s *Versioning3Suite) testDoubleTransition(unversionedSrc bool, signal bool Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, }}, []string{}, tqTypeWf, tqTypeAct) // poll activity from v2 worker, this should start a transition but should not immediately start the activity. - go s.idlePollActivity(env, tv2, true, time.Minute, "v2 worker should not receive the activity") + go s.idlePollActivity(s.Context(), env, tv2, true, time.Minute, "v2 worker should not receive the activity") s.Await(func(s *Versioning3Suite) { dwf, err := env.FrontendClient().DescribeWorkflowExecution( @@ -1699,7 +1701,7 @@ func (s *Versioning3Suite) testTransitionFromActivity(sticky bool) { time.Sleep(time.Millisecond * 200) //nolint:forbidigo // Pollers of d1 are there, but should not get any task - go s.idlePollActivity(env, tv1, true, ver3MinPollTime, "activities should not go to the old deployment") + go s.idlePollActivity(s.Context(), env, tv1, true, ver3MinPollTime, "activities should not go to the old deployment") act2To4Err := make(chan error, 1) go func() { @@ -3445,23 +3447,37 @@ func (s *Versioning3Suite) pollUntilRegistered(env *testcore.TestEnv, tv *testva tqTypes = []enumspb.TaskQueueType{tqTypeWf} } pollCtx, cancel := context.WithCancel(s.Context()) + var wg sync.WaitGroup for _, tqType := range tqTypes { tqType := tqType - go func() { + wg.Go(func() { for pollCtx.Err() == nil { switch tqType { case tqTypeWf: s.idlePollWorkflow(env, pollCtx, tv, true, ver3MinPollTime, "should not get any tasks yet") case tqTypeAct: - s.idlePollActivity(env, tv, true, ver3MinPollTime, "should not get any tasks yet") + s.idlePollActivity(pollCtx, env, tv, true, ver3MinPollTime, "should not get any tasks yet") case tqTypeNexus: s.idlePollNexus(env, pollCtx, tv, true, ver3MinPollTime, "should not get any tasks yet") default: panic("invalid task queue type") } } - }() + }) } + done := make(chan struct{}) + go 
func() { + wg.Wait() + close(done) + }() + defer func() { + cancel() + select { + case <-done: + case <-s.Context().Done(): + s.FailNow("context timeout while stopping registration pollers") + } + }() // Wait until the version is visible and all requested task queue types are registered. s.Await(func(s *Versioning3Suite) { @@ -3486,8 +3502,7 @@ func (s *Versioning3Suite) pollUntilRegistered(env *testcore.TestEnv, tv *testva } s.True(found) } - }, 30*time.Second, 500*time.Millisecond) - cancel() + }, 90*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { @@ -4055,7 +4070,7 @@ func (s *Versioning3Suite) doPollWftAndHandle( DeploymentOptions: tv.WorkerDeploymentOptions(versioned), TaskQueue: tq, }, - ).HandleTask(tv, handler, taskpoller.WithTimeout(time.Minute)) + ).HandleTask(tv, handler, taskpoller.WithTimeout(ver3PollTimeout)) } if async == nil { resp, err := f() @@ -4120,7 +4135,7 @@ func (s *Versioning3Suite) pollNexusTaskAndHandle( DeploymentOptions: tv.WorkerDeploymentOptions(true), TaskQueue: tq, }, - ).HandleTask(tv, handler, taskpoller.WithTimeout(10*time.Second)) + ).HandleTask(tv, handler, taskpoller.WithTimeout(ver3PollTimeout)) } if async == nil { resp, err := f() @@ -4190,7 +4205,7 @@ func (s *Versioning3Suite) doPollActivityAndHandleErr( _, err := poller.PollActivityTask( &workflowservice.PollActivityTaskQueueRequest{ DeploymentOptions: tv.WorkerDeploymentOptions(versioned), - }).HandleTask(tv, handler, taskpoller.WithTimeout(time.Minute)) + }).HandleTask(tv, handler, taskpoller.WithTimeout(ver3PollTimeout)) return err } @@ -4238,6 +4253,7 @@ func (s *Versioning3Suite) idlePollUnversionedActivity( } func (s *Versioning3Suite) idlePollActivity( + ctx context.Context, env *testcore.TestEnv, tv *testvars.TestVars, versioned bool, @@ -4258,7 +4274,7 @@ func (s *Versioning3Suite) idlePollActivity( return nil, nil }, taskpoller.WithTimeout(timeout), - 
taskpoller.WithContext(s.Context()), + taskpoller.WithContext(ctx), ) } @@ -4721,7 +4737,7 @@ func (s *Versioning3Suite) TestActivityTQLags_DependentActivityCompletesOnTheNew // Start an idle activity poller on v0. This poller should not receive any activity tasks //nolint:testifylint - go s.idlePollActivity(env, tv0, true, ver3MinPollTime, "activity should not go to the old deployment") + go s.idlePollActivity(s.Context(), env, tv0, true, ver3MinPollTime, "activity should not go to the old deployment") // Start a poller on v1 activityTaskCh := make(chan struct{}, 1) @@ -5732,7 +5748,7 @@ func (s *Versioning3Suite) testTransitionDuringTransientTask(env *testcore.TestE } // Poll the second activity to cause transition to v1. - s.idlePollActivity(env, tv1, true, ver3MinPollTime, "should not get the activity because it started a transition") + s.idlePollActivity(s.Context(), env, tv1, true, ver3MinPollTime, "should not get the activity because it started a transition") s.verifyWorkflowVersioning(env, tv1, vbUnspecified, nil, nil, tv1.DeploymentVersionTransition()) // Print workflow describe and history From 31596a72055cb150fa2a40b8783b2d39098a87d3 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 15:34:34 -0700 Subject: [PATCH 14/47] Harden versioning3 deployment API waits --- tests/versioning_3_test.go | 42 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 71389954e6..c8533e44ed 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -3417,23 +3417,25 @@ func (s *Versioning3Suite) TestSyncDeploymentUserDataWithRoutingConfig_Update() } func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { - ctx, cancel := context.WithTimeout(s.Context(), 60*time.Second) - defer cancel() - failedPrecondition := 
serviceerror.NewFailedPreconditionf(workerdeployment.ErrCurrentVersionDoesNotHaveAllTaskQueues, tv.DeploymentVersionStringV32()).Error() + buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) + deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ Namespace: env.Namespace().String(), DeploymentName: tv.DeploymentSeries(), } req.BuildId = tv.BuildID() _, err := env.FrontendClient().SetWorkerDeploymentCurrentVersion(ctx, req) - if s.shouldRetryWorkerDeploymentRPC(env, err, failedPrecondition) { + if s.shouldRetryWorkerDeploymentRPC(env, err, failedPrecondition, buildIDNotFound, deploymentNotFound) { s.NoError(err) return } s.NoError(err) - }, 60*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3506,21 +3508,22 @@ func (s *Versioning3Suite) pollUntilRegistered(env *testcore.TestEnv, tv *testva } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { - ctx, cancel := context.WithTimeout(s.Context(), 60*time.Second) - defer cancel() - + deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ Namespace: env.Namespace().String(), DeploymentName: tv.DeploymentSeries(), } _, err := env.FrontendClient().SetWorkerDeploymentCurrentVersion(ctx, req) - if s.shouldRetryWorkerDeploymentRPC(env, err) { + if 
s.shouldRetryWorkerDeploymentRPC(env, err, deploymentNotFound) { s.NoError(err) return } s.NoError(err) - }, 60*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3533,16 +3536,18 @@ func (s *Versioning3Suite) setRampingDeployment( percentage float32, rampUnversioned bool, ) { - ctx, cancel := context.WithTimeout(s.Context(), 60*time.Second) - defer cancel() - bid := tv.BuildID() if rampUnversioned { bid = "" } failedPrecondition := serviceerror.NewFailedPreconditionf(workerdeployment.ErrRampingVersionDoesNotHaveAllTaskQueues, tv.DeploymentVersionStringV32()).Error() + buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) + deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + req := &workflowservice.SetWorkerDeploymentRampingVersionRequest{ Namespace: env.Namespace().String(), DeploymentName: tv.DeploymentSeries(), @@ -3550,12 +3555,12 @@ func (s *Versioning3Suite) setRampingDeployment( } req.BuildId = bid _, err := env.FrontendClient().SetWorkerDeploymentRampingVersion(ctx, req) - if s.shouldRetryWorkerDeploymentRPC(env, err, failedPrecondition) { + if s.shouldRetryWorkerDeploymentRPC(env, err, failedPrecondition, buildIDNotFound, deploymentNotFound) { s.NoError(err) return } s.NoError(err) - }, 60*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3564,7 +3569,10 @@ func (s *Versioning3Suite) setRampingDeployment( func (s *Versioning3Suite) 
waitForDeploymentDataPropagationQueryWorkerDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { if versioning3DeploymentWorkflowVersion == workerdeployment.AsyncSetCurrentAndRamping { s.Await(func(s *Versioning3Suite) { - resp, err := env.FrontendClient().DescribeWorkerDeployment(s.Context(), &workflowservice.DescribeWorkerDeploymentRequest{ + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + + resp, err := env.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ Namespace: env.Namespace().String(), DeploymentName: tv.DeploymentSeries(), }) @@ -3574,7 +3582,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment } s.NoError(err) s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState()) - }, 10*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } } From 1ba500bdb20a6f863ffa87410f061cd42d5c2d44 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 15:52:33 -0700 Subject: [PATCH 15/47] Harden versioning3 registration waits --- tests/versioning_3_test.go | 178 +++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 95 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index c8533e44ed..204d8bb828 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -457,17 +457,7 @@ func (s *Versioning3Suite) testQueryWithPinnedOverride(env *testcore.TestEnv, st return respondEmptyWft(tv, sticky, vbUnpinned), nil }) - // Wait for the version to be present in the task queue. Version existence is required before it can be set as an override. 
- s.Await(func(s *Versioning3Suite) { - resp, err := env.GetTestCluster().MatchingClient().CheckTaskQueueVersionMembership(s.Context(), &matchingservice.CheckTaskQueueVersionMembershipRequest{ - NamespaceId: env.NamespaceID().String(), - TaskQueue: tv.TaskQueue().GetName(), - TaskQueueType: tqTypeWf, - Version: worker_versioning.DeploymentVersionFromDeployment(tv.Deployment()), - }) - s.NoError(err) - s.True(resp.GetIsMember()) - }, 10*time.Second, 500*time.Millisecond) + s.validatePinnedVersionExistsInTaskQueue(env, tv) runID := s.startWorkflow(env, tv, tv.VersioningOverridePinned()) @@ -1198,7 +1188,7 @@ func (s *Versioning3Suite) testUnpinnedWorkflowWithRamp(env *testcore.TestEnv, t // wait until all task queue partitions know that tv2 is ramping s.waitForDeploymentDataPropagation(env, tv2, versionStatusRamping, toUnversioned, tqTypeWf, tqTypeAct) - numTests := 50 + numTests := 20 counter := make(map[string]int) runs := make([]sdkclient.WorkflowRun, numTests) for i := range numTests { @@ -1408,7 +1398,7 @@ func (s *Versioning3Suite) testDoubleTransition(unversionedSrc bool, signal bool ) s.NoError(err) s.Equal(tv2.DeploymentVersionTransition(), dwf.WorkflowExecutionInfo.GetVersioningInfo().GetVersionTransition()) - }, 10*time.Second, 500*time.Millisecond) + }, 30*time.Second, 500*time.Millisecond) // Back to sourceV if unversionedSrc { @@ -3445,6 +3435,13 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv // tqTypes controls which task queue types to poll; it defaults to workflow only. // Pollers run continuously until all TQ types are registered. func (s *Versioning3Suite) pollUntilRegistered(env *testcore.TestEnv, tv *testvars.TestVars, tqTypes ...enumspb.TaskQueueType) { + stopPollers := s.startRegistrationPollers(env, tv, tqTypes...) + defer stopPollers() + + s.waitForDeploymentVersionRegistration(env, tv, tqTypes...) 
+} + +func (s *Versioning3Suite) startRegistrationPollers(env *testcore.TestEnv, tv *testvars.TestVars, tqTypes ...enumspb.TaskQueueType) func() { if len(tqTypes) == 0 { tqTypes = []enumspb.TaskQueueType{tqTypeWf} } @@ -3467,42 +3464,40 @@ func (s *Versioning3Suite) pollUntilRegistered(env *testcore.TestEnv, tv *testva } }) } + done := make(chan struct{}) go func() { wg.Wait() close(done) }() - defer func() { + + return func() { cancel() select { case <-done: case <-s.Context().Done(): s.FailNow("context timeout while stopping registration pollers") } - }() + } +} - // Wait until the version is visible and all requested task queue types are registered. +func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.TestEnv, tv *testvars.TestVars, tqTypes ...enumspb.TaskQueueType) { + if len(tqTypes) == 0 { + tqTypes = []enumspb.TaskQueueType{tqTypeWf} + } s.Await(func(s *Versioning3Suite) { - resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(s.Context(), &workflowservice.DescribeWorkerDeploymentVersionRequest{ - Namespace: env.Namespace().String(), - Version: tv.DeploymentVersionString(), - }) - var notFound *serviceerror.NotFound - if errors.As(err, &notFound) { - s.NoError(err) - return - } - s.NoError(err) - tqName := tv.TaskQueue().GetName() + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + for _, tqType := range tqTypes { - found := false - for _, tq := range resp.GetVersionTaskQueues() { - if tq.GetName() == tqName && tq.GetType() == tqType { - found = true - break - } - } - s.True(found) + resp, err := env.GetTestCluster().MatchingClient().CheckTaskQueueVersionMembership(ctx, &matchingservice.CheckTaskQueueVersionMembershipRequest{ + NamespaceId: env.NamespaceID().String(), + TaskQueue: tv.TaskQueue().GetName(), + TaskQueueType: tqType, + Version: worker_versioning.DeploymentVersionFromDeployment(tv.Deployment()), + }) + s.NoError(err) + s.True(resp.GetIsMember()) } }, 90*time.Second, 
500*time.Millisecond) } @@ -3735,7 +3730,7 @@ func (s *Versioning3Suite) rollbackTaskQueueToVersion( current, currentRevisionNumber, _, _, _, _, _, _ := worker_versioning.CalculateTaskQueueVersioningInfo(ms.GetUserData().GetData().GetPerType()[int32(tqTypeWf)].GetDeploymentData()) s.Equal(tv.DeploymentVersion().GetBuildId(), current.GetBuildId()) s.Equal(int64(0), currentRevisionNumber) - }, 10*time.Second, 500*time.Millisecond) + }, 30*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) syncTaskQueueDeploymentData( @@ -3840,50 +3835,55 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, override *workflowpb.VersioningOverride, transition *workflowpb.DeploymentVersionTransition, ) { - dwf, err := env.FrontendClient().DescribeWorkflowExecution( - s.Context(), &workflowservice.DescribeWorkflowExecutionRequest{ - Namespace: env.Namespace().String(), - Execution: &commonpb.WorkflowExecution{ - WorkflowId: tv.WorkflowID(), - }, - }) - s.NoError(err) + s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() - versioningInfo := dwf.WorkflowExecutionInfo.GetVersioningInfo() - s.Equal(behavior.String(), versioningInfo.GetBehavior().String()) - var v *deploymentspb.WorkerDeploymentVersion - if versioningInfo.GetVersion() != "" { //nolint:staticcheck // SA1019: worker versioning v0.31 - //nolint:staticcheck // SA1019: worker versioning v0.31 - v, err = worker_versioning.WorkerDeploymentVersionFromStringV31(versioningInfo.GetVersion()) + dwf, err := env.FrontendClient().DescribeWorkflowExecution( + ctx, &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + Execution: &commonpb.WorkflowExecution{ + WorkflowId: tv.WorkflowID(), + }, + }) s.NoError(err) - s.NotNil(versioningInfo.GetDeploymentVersion()) // make sure we are always populating this whenever Version string is populated - } - if dv := versioningInfo.GetDeploymentVersion(); dv != nil 
{ - v = worker_versioning.DeploymentVersionFromDeployment(worker_versioning.DeploymentFromExternalDeploymentVersion(dv)) - } - actualDeployment := worker_versioning.DeploymentFromDeploymentVersion(v) - if !deployment.Equal(actualDeployment) { - s.Fail(fmt.Sprintf("deployment version mismatch. expected: {%s}, actual: {%s}", - deployment, - actualDeployment, - )) - } - // v0.32 override - s.Equal(override.GetAutoUpgrade(), versioningInfo.GetVersioningOverride().GetAutoUpgrade()) - s.Equal(override.GetPinned().GetVersion().GetBuildId(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetBuildId()) - s.Equal(override.GetPinned().GetVersion().GetDeploymentName(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetDeploymentName()) - s.Equal(override.GetPinned().GetBehavior(), versioningInfo.GetVersioningOverride().GetPinned().GetBehavior()) - if worker_versioning.OverrideIsPinned(override) { - s.Equal(override.GetPinned().GetVersion().GetDeploymentName(), dwf.WorkflowExecutionInfo.GetWorkerDeploymentName()) - } + versioningInfo := dwf.WorkflowExecutionInfo.GetVersioningInfo() + s.Equal(behavior.String(), versioningInfo.GetBehavior().String()) + var v *deploymentspb.WorkerDeploymentVersion + if versioningInfo.GetVersion() != "" { //nolint:staticcheck // SA1019: worker versioning v0.31 + //nolint:staticcheck // SA1019: worker versioning v0.31 + v, err = worker_versioning.WorkerDeploymentVersionFromStringV31(versioningInfo.GetVersion()) + s.NoError(err) + s.NotNil(versioningInfo.GetDeploymentVersion()) // make sure we are always populating this whenever Version string is populated + } + if dv := versioningInfo.GetDeploymentVersion(); dv != nil { + v = worker_versioning.DeploymentVersionFromDeployment(worker_versioning.DeploymentFromExternalDeploymentVersion(dv)) + } + actualDeployment := worker_versioning.DeploymentFromDeploymentVersion(v) + if !deployment.Equal(actualDeployment) { + s.Fail(fmt.Sprintf("deployment version mismatch. 
expected: {%s}, actual: {%s}", + deployment, + actualDeployment, + )) + } - if !versioningInfo.GetVersionTransition().Equal(transition) { - s.Fail(fmt.Sprintf("version transition mismatch. expected: {%s}, actual: {%s}", - transition, - versioningInfo.GetVersionTransition(), - )) - } + // v0.32 override + s.Equal(override.GetAutoUpgrade(), versioningInfo.GetVersioningOverride().GetAutoUpgrade()) + s.Equal(override.GetPinned().GetVersion().GetBuildId(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetBuildId()) + s.Equal(override.GetPinned().GetVersion().GetDeploymentName(), versioningInfo.GetVersioningOverride().GetPinned().GetVersion().GetDeploymentName()) + s.Equal(override.GetPinned().GetBehavior(), versioningInfo.GetVersioningOverride().GetPinned().GetBehavior()) + if worker_versioning.OverrideIsPinned(override) { + s.Equal(override.GetPinned().GetVersion().GetDeploymentName(), dwf.WorkflowExecutionInfo.GetWorkerDeploymentName()) + } + + if !versioningInfo.GetVersionTransition().Equal(transition) { + s.Fail(fmt.Sprintf("version transition mismatch. 
expected: {%s}, actual: {%s}", + transition, + versioningInfo.GetVersionTransition(), + )) + } + }, 90*time.Second, 500*time.Millisecond) } func respondActivity() *workflowservice.RespondActivityTaskCompletedRequest { @@ -4557,9 +4557,7 @@ func (s *Versioning3Suite) TestAutoUpgradeWorkflows_NoBouncingBetweenVersions() s.NoError(err) // Verify that the workflow is running on v1 - s.Await(func(s *Versioning3Suite) { - s.verifyWorkflowVersioning(env, tv1, vbUnpinned, tv1.Deployment(), nil, nil) - }, 10*time.Second, 500*time.Millisecond) + s.verifyWorkflowVersioning(env, tv1, vbUnpinned, tv1.Deployment(), nil, nil) // Start v0 workers to ensure they never receive a task idlePollerCtx, idlePollerCancel := context.WithTimeout(s.Context(), 10*time.Second) @@ -5380,7 +5378,7 @@ func (s *Versioning3Suite) TestCheckTaskQueueVersionMembership() { }) s.NoError(err) s.False(resp.GetIsMember()) // the check should pass if no version is present - }, 10*time.Second, 500*time.Millisecond) + }, 30*time.Second, 500*time.Millisecond) // Start v1 worker which shall register the version in the task queue w1 := worker.New(env.SdkClient(), tv1.TaskQueue().GetName(), worker.Options{ @@ -5402,23 +5400,13 @@ func (s *Versioning3Suite) TestCheckTaskQueueVersionMembership() { }) s.NoError(err) s.True(resp.GetIsMember()) - }, 10*time.Second, 500*time.Millisecond) + }, 30*time.Second, 500*time.Millisecond) } // validatePinnedVersionExistsInTaskQueue validates that the version, to be pinned, exists in the task queue. // TODO (future improvement): This can be further extended to validate the presence of any version instead of using the GetTaskQueueUserData RPC. 
func (s *Versioning3Suite) validatePinnedVersionExistsInTaskQueue(env *testcore.TestEnv, tv *testvars.TestVars) { - s.Await(func(s *Versioning3Suite) { - resp, err := env.GetTestCluster().MatchingClient().CheckTaskQueueVersionMembership(s.Context(), &matchingservice.CheckTaskQueueVersionMembershipRequest{ - NamespaceId: env.NamespaceID().String(), - TaskQueue: tv.TaskQueue().GetName(), - TaskQueueType: tqTypeWf, - Version: worker_versioning.DeploymentVersionFromDeployment(tv.Deployment()), - }) - s.NoError(err) - s.True(resp.GetIsMember()) - }, 90*time.Second, 500*time.Millisecond) - + s.waitForDeploymentVersionRegistration(env, tv, tqTypeWf) } // TestMaxVersionsInTaskQueue tests that polling from a task queue with too many @@ -6312,11 +6300,11 @@ func (s *Versioning3Suite) TestStalePartition_RevisionSuppressesTrampolining() { s.verifyWorkflowVersioning(env, tv1, vbPinned, tv1.Deployment(), nil, nil) // Register v2, set v2 as current (revision increments) - s.idlePollWorkflow(env, s.Context(), tv2, true, ver3MinPollTime, "v2 poller registration") + s.pollUntilRegistered(env, tv2) s.setCurrentDeployment(env, tv2) // Register v3, set v3 as current (revision increments again) - s.idlePollWorkflow(env, s.Context(), tv3, true, ver3MinPollTime, "v3 poller registration") + s.pollUntilRegistered(env, tv3) s.setCurrentDeployment(env, tv3) // Trigger WFT — target should be v3 with a high revision @@ -6389,7 +6377,7 @@ func (s *Versioning3Suite) TestStalePartition_RevisionSuppressesTrampolining() { // Set a new v4 as current — this produces a revision strictly higher than // the declined revision, simulating an up-to-date partition with fresh data. 
tv4 := tv1.WithBuildIDNumber(4) - s.idlePollWorkflow(env, s.Context(), tv4, true, ver3MinPollTime, "v4 poller registration") + s.pollUntilRegistered(env, tv4) s.setCurrentDeployment(env, tv4) s.waitForDeploymentDataPropagation(env, tv4, versionStatusCurrent, false, tqTypeWf) From f0871224ab170bab66c47d24b5ca6ffb818ecc09 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 16:17:24 -0700 Subject: [PATCH 16/47] Re-register inherited child workflow versions --- tests/versioning_3_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 204d8bb828..be7a8a5c15 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -1967,6 +1967,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, }}, []string{}, tqTypeWf) + s.pollUntilRegistered(env, tv1) if crossTq { s.updateTaskQueueDeploymentDataWithRoutingConfig(env, tv2Child, &deploymentpb.RoutingConfig{ CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2Child.DeploymentVersionString()), @@ -1975,6 +1976,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2Child.DeploymentVersion().GetBuildId(): { Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, }}, []string{}, tqTypeWf) + s.pollUntilRegistered(env, tv1Child) } currentChanged <- struct{}{} From da8f603c3e9512d231b6e7a974cd52ae098a4397 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 16:45:26 -0700 Subject: [PATCH 17/47] Validate speculative task history from poll response --- tests/versioning_3_test.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 
be7a8a5c15..516f10c97f 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -2289,7 +2289,7 @@ func (s *Versioning3Suite) testPinnedCaNUpgradeOnCaN(normalTask, speculativeTask // Mode-specific validations historyEvents := task.History.GetEvents() if speculativeTask { - s.verifySpeculativeTask(env, execution) + s.verifySpeculativeTask(task) } else if transientTask { s.verifyTransientTask(task) // Get events from server-side history, this includes transient events. @@ -2833,9 +2833,8 @@ func (s *Versioning3Suite) triggerTransientWFT(env *testcore.TestEnv, tv *testva } // Verify this is a speculative task - events not yet in persisted history -func (s *Versioning3Suite) verifySpeculativeTask(env *testcore.TestEnv, execution *commonpb.WorkflowExecution) { - events := env.GetHistory(env.Namespace().String(), execution) - s.EqualHistoryEvents(` +func (s *Versioning3Suite) verifySpeculativeTask(task *workflowservice.PollWorkflowTaskQueueResponse) { + s.EqualHistory(` 1 WorkflowExecutionStarted 2 WorkflowTaskScheduled 3 WorkflowTaskStarted @@ -2846,7 +2845,7 @@ func (s *Versioning3Suite) verifySpeculativeTask(env *testcore.TestEnv, executio 8 WorkflowTaskCompleted 9 WorkflowTaskScheduled 10 WorkflowTaskStarted - `, events) + `, task.History) } func (s *Versioning3Suite) verifyTransientTask(task *workflowservice.PollWorkflowTaskQueueResponse) { From 285afef53dfe8c6373833f09662cb73b02d96591 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 17:28:34 -0700 Subject: [PATCH 18/47] Preserve inherited child workflow version --- tests/versioning_3_test.go | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 516f10c97f..7a572baf71 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -1964,18 +1964,28 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo CurrentDeploymentVersion: 
worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2.DeploymentVersionString()), CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) + }, map[string]*deploymentspb.WorkerDeploymentVersionData{ + tv2.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, + tv1.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }, + }, []string{}, tqTypeWf) s.pollUntilRegistered(env, tv1) if crossTq { s.updateTaskQueueDeploymentDataWithRoutingConfig(env, tv2Child, &deploymentpb.RoutingConfig{ CurrentDeploymentVersion: worker_versioning.ExternalWorkerDeploymentVersionFromStringV31(tv2Child.DeploymentVersionString()), CurrentVersionChangedTime: timestamp.TimePtr(time.Now()), RevisionNumber: 2, - }, map[string]*deploymentspb.WorkerDeploymentVersionData{tv2Child.DeploymentVersion().GetBuildId(): { - Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, - }}, []string{}, tqTypeWf) + }, map[string]*deploymentspb.WorkerDeploymentVersionData{ + tv2Child.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT, + }, + tv1Child.DeploymentVersion().GetBuildId(): { + Status: enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING, + }, + }, []string{}, tqTypeWf) s.pollUntilRegistered(env, tv1Child) } currentChanged <- struct{}{} From 633446d914528c28f1a28386c83e888b1c2071e3 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 18:05:51 -0700 Subject: [PATCH 19/47] Move child inheritance assertions outside workflows --- tests/versioning_3_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 7a572baf71..f66afdceb3 100644 --- 
a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -1889,7 +1889,6 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo currentChanged := make(chan struct{}, 1) childv1 := func(ctx workflow.Context) (string, error) { - s.verifyWorkflowVersioning(env, tv1Child, vbPinned, tv1Child.Deployment(), override, nil) return "v1", nil } wf1 := func(ctx workflow.Context) (string, error) { @@ -1905,7 +1904,6 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo var val1 string s.NoError(fut1.Get(ctx, &val1)) - s.verifyWorkflowVersioning(env, tv1, parentRegistrationBehavior, tv1.Deployment(), override, nil) return val1, nil } @@ -1993,6 +1991,8 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo var out string s.NoError(run.Get(s.Context(), &out)) s.Equal("v1", out) + s.verifyWorkflowVersioning(env, tv1, parentRegistrationBehavior, tv1.Deployment(), override, nil) + s.verifyWorkflowVersioning(env, tv1Child, vbPinned, tv1Child.Deployment(), override, nil) } func (s *Versioning3Suite) TestChildWorkflowInheritance_UnpinnedParent() { From 7384c8802b8670e3ed09e9e6464317cf1a673147 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 18:25:08 -0700 Subject: [PATCH 20/47] Extend versioning3 test context timeout --- tests/versioning_3_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index f66afdceb3..473f35d481 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -39,6 +39,7 @@ import ( "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/testing/protoutils" "go.temporal.io/server/common/testing/taskpoller" + "go.temporal.io/server/common/testing/testcontext" "go.temporal.io/server/common/testing/testhooks" "go.temporal.io/server/common/testing/testvars" "go.temporal.io/server/common/testing/updateutils" @@ -86,6 +87,8 @@ func 
TestVersioning3FunctionalSuite(t *testing.T) { } func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { + testcontext.New(s.T(), testcontext.WithTimeout(3*time.Minute)) + opts = append([]testcore.TestOption{ testcore.WithWorkerService("worker deployment manager workflows"), testcore.WithDynamicConfig(dynamicconfig.MatchingDeploymentWorkflowVersion, int(versioning3DeploymentWorkflowVersion)), From 3f1153e445b6f05daa034b735261a71e5d696ade Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 18:48:17 -0700 Subject: [PATCH 21/47] Extend versioning3 verification waits --- tests/versioning_3_test.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 473f35d481..06d1bce718 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -64,6 +64,8 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute + ver3TestTimeout = 4 * time.Minute + ver3VerifyWait = 2 * time.Minute versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -87,7 +89,7 @@ func TestVersioning3FunctionalSuite(t *testing.T) { } func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { - testcontext.New(s.T(), testcontext.WithTimeout(3*time.Minute)) + testcontext.New(s.T(), testcontext.WithTimeout(ver3TestTimeout)) opts = append([]testcore.TestOption{ testcore.WithWorkerService("worker deployment manager workflows"), @@ -3513,7 +3515,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te s.NoError(err) s.True(resp.GetIsMember()) } - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { @@ -3897,7 +3899,7 @@ func (s *Versioning3Suite) 
verifyWorkflowVersioning(env *testcore.TestEnv, versioningInfo.GetVersionTransition(), )) } - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) } func respondActivity() *workflowservice.RespondActivityTaskCompletedRequest { From 3fae2b8c627ec895490a536b452c03a24d9f8e23 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 19:41:17 -0700 Subject: [PATCH 22/47] Normalize versioning3 verification waits --- tests/versioning_3_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 06d1bce718..27925cda64 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -3441,7 +3441,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3534,7 +3534,7 @@ func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *tes return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3571,7 +3571,7 @@ func (s *Versioning3Suite) setRampingDeployment( return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3593,7 +3593,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment } s.NoError(err) 
s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState()) - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) } } @@ -4449,7 +4449,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } s.Empty(remaining) - }, 90*time.Second, 500*time.Millisecond) + }, ver3VerifyWait, 500*time.Millisecond) } func (s *Versioning3Suite) validateBacklogCount( From e44b3534a672165501fd0a1fc471a35659bd8b38 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 19:53:58 -0700 Subject: [PATCH 23/47] Revert versioning3 timeout expansion --- tests/versioning_3_test.go | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 27925cda64..f66afdceb3 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -39,7 +39,6 @@ import ( "go.temporal.io/server/common/testing/parallelsuite" "go.temporal.io/server/common/testing/protoutils" "go.temporal.io/server/common/testing/taskpoller" - "go.temporal.io/server/common/testing/testcontext" "go.temporal.io/server/common/testing/testhooks" "go.temporal.io/server/common/testing/testvars" "go.temporal.io/server/common/testing/updateutils" @@ -64,8 +63,6 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute - ver3TestTimeout = 4 * time.Minute - ver3VerifyWait = 2 * time.Minute versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -89,8 +86,6 @@ func TestVersioning3FunctionalSuite(t *testing.T) { } func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { - testcontext.New(s.T(), testcontext.WithTimeout(ver3TestTimeout)) - opts = append([]testcore.TestOption{ testcore.WithWorkerService("worker deployment manager workflows"), 
testcore.WithDynamicConfig(dynamicconfig.MatchingDeploymentWorkflowVersion, int(versioning3DeploymentWorkflowVersion)), @@ -3441,7 +3436,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv return } s.NoError(err) - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3515,7 +3510,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te s.NoError(err) s.True(resp.GetIsMember()) } - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { @@ -3534,7 +3529,7 @@ func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *tes return } s.NoError(err) - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3571,7 +3566,7 @@ func (s *Versioning3Suite) setRampingDeployment( return } s.NoError(err) - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3593,7 +3588,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment } s.NoError(err) s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState()) - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } } @@ -3899,7 +3894,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env 
*testcore.TestEnv, versioningInfo.GetVersionTransition(), )) } - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } func respondActivity() *workflowservice.RespondActivityTaskCompletedRequest { @@ -4449,7 +4444,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } s.Empty(remaining) - }, ver3VerifyWait, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) validateBacklogCount( From af2b7437e2ce6d4abb54e5a89f30c59ed4e5e58b Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 19:56:02 -0700 Subject: [PATCH 24/47] Run versioning3 matching behavior cases sequentially --- common/testing/parallelsuite/suite.go | 10 ++++++++++ common/testing/parallelsuite/suite_test.go | 12 ++++++++++++ tests/versioning_3_test.go | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/common/testing/parallelsuite/suite.go b/common/testing/parallelsuite/suite.go index ca32645449..5ebadc360e 100644 --- a/common/testing/parallelsuite/suite.go +++ b/common/testing/parallelsuite/suite.go @@ -104,6 +104,16 @@ func (s *Suite[T]) Run(name string, fn func(T)) bool { }) } +// RunSequential creates a sequential subtest. The callback receives a fresh copy +// of the concrete suite type, initialized for the subtest's *testing.T. +func (s *Suite[T]) RunSequential(name string, fn func(T)) bool { + pt := s.guardT.T // grab T before sealing + s.guardT.markHasSubtests() + return pt.Run(name, func(t *testing.T) { + fn(s.copySuite(t, false, nil, nil).(T)) + }) +} + // Await calls fn repeatedly until all assertions pass or timeout is reached. 
func (s *Suite[T]) Await(fn func(T), timeout, interval time.Duration) { s.Awaitf(fn, timeout, interval, "") diff --git a/common/testing/parallelsuite/suite_test.go b/common/testing/parallelsuite/suite_test.go index 60b5cd6914..98f438f684 100644 --- a/common/testing/parallelsuite/suite_test.go +++ b/common/testing/parallelsuite/suite_test.go @@ -100,6 +100,15 @@ func (s *contextSuite) TestAwaitUsesSuiteContext() { }, 100*time.Millisecond, time.Millisecond) } +type sequentialSubtestSuite struct{ Suite[*sequentialSubtestSuite] } + +func (s *sequentialSubtestSuite) TestRunSequential() { + s.True(s.runParallel) + s.RunSequential("subtest", func(s *sequentialSubtestSuite) { + s.False(s.runParallel) + }) +} + type sealAfterRunSuite struct{ Suite[*sealAfterRunSuite] } func (s *sealAfterRunSuite) TestAssertionAfterRun() { @@ -133,6 +142,9 @@ func TestRun_AcceptsSuite(t *testing.T) { t.Run("context", func(t *testing.T) { require.NotPanics(t, func() { Run(t, &contextSuite{}) }) }) + t.Run("sequential subtest", func(t *testing.T) { + require.NotPanics(t, func() { Run(t, &sequentialSubtestSuite{}) }) + }) } func TestRun_RejectsSuite(t *testing.T) { diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index f66afdceb3..00b4c54875 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -109,7 +109,7 @@ func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestE func (s *Versioning3Suite) runTestWithMatchingBehavior(testFn func(*testcore.TestEnv, *Versioning3Suite), opts ...testcore.TestOption) { for _, behavior := range testcore.AllMatchingBehaviors() { behavior := behavior - s.Run(behavior.Name(), func(s *Versioning3Suite) { + s.RunSequential(behavior.Name(), func(s *Versioning3Suite) { envOpts := append([]testcore.TestOption{}, opts...) envOpts = append(envOpts, behavior.Options()...) env := s.setupEnv(envOpts...) 
From bbf6bcffd8ae223a0dd02421a6842154ccdd6810 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 20:41:41 -0700 Subject: [PATCH 25/47] Limit concurrent versioning3 test environments --- tests/versioning_3_test.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 00b4c54875..c31ff96e2b 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,6 +63,7 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute + ver3EnvSlots = 4 versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -74,7 +75,10 @@ const ( versioning3DeploymentWorkflowVersion = workerdeployment.VersionDataRevisionNumber ) -var _ = testhooks.MatchingIgnoreRoutingConfigRevisionCheck +var ( + _ = testhooks.MatchingIgnoreRoutingConfigRevisionCheck + versioning3EnvSlot = make(chan struct{}, ver3EnvSlots) +) type Versioning3Suite struct { parallelsuite.Suite[*Versioning3Suite] @@ -86,6 +90,11 @@ func TestVersioning3FunctionalSuite(t *testing.T) { } func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { + versioning3EnvSlot <- struct{}{} + s.T().Cleanup(func() { + <-versioning3EnvSlot + }) + opts = append([]testcore.TestOption{ testcore.WithWorkerService("worker deployment manager workflows"), testcore.WithDynamicConfig(dynamicconfig.MatchingDeploymentWorkflowVersion, int(versioning3DeploymentWorkflowVersion)), From cbcf9ee43280544f4f5f7f4eccde9cc8d61bf114 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 21:00:28 -0700 Subject: [PATCH 26/47] Revert "Limit concurrent versioning3 test environments" This reverts commit db9bd963bacfdf286eaa3a508df1195d3460d723. 
--- tests/versioning_3_test.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index c31ff96e2b..00b4c54875 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,7 +63,6 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute - ver3EnvSlots = 4 versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -75,10 +74,7 @@ const ( versioning3DeploymentWorkflowVersion = workerdeployment.VersionDataRevisionNumber ) -var ( - _ = testhooks.MatchingIgnoreRoutingConfigRevisionCheck - versioning3EnvSlot = make(chan struct{}, ver3EnvSlots) -) +var _ = testhooks.MatchingIgnoreRoutingConfigRevisionCheck type Versioning3Suite struct { parallelsuite.Suite[*Versioning3Suite] @@ -90,11 +86,6 @@ func TestVersioning3FunctionalSuite(t *testing.T) { } func (s *Versioning3Suite) setupEnv(opts ...testcore.TestOption) *testcore.TestEnv { - versioning3EnvSlot <- struct{}{} - s.T().Cleanup(func() { - <-versioning3EnvSlot - }) - opts = append([]testcore.TestOption{ testcore.WithWorkerService("worker deployment manager workflows"), testcore.WithDynamicConfig(dynamicconfig.MatchingDeploymentWorkflowVersion, int(versioning3DeploymentWorkflowVersion)), From 870e9d1f6af745496ebf5247ff77c53151134a06 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Mon, 25 May 2026 21:45:19 -0700 Subject: [PATCH 27/47] Shorten versioning3 await RPC attempts --- tests/versioning_3_test.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 00b4c54875..333cd441b8 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,6 +63,7 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 
ver3PollTimeout = 2 * time.Minute + ver3RPCTimeout = 5 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -3422,7 +3423,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3497,7 +3498,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te tqTypes = []enumspb.TaskQueueType{tqTypeWf} } s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() for _, tqType := range tqTypes { @@ -3516,7 +3517,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3551,7 +3552,7 @@ func (s *Versioning3Suite) setRampingDeployment( deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := 
&workflowservice.SetWorkerDeploymentRampingVersionRequest{ @@ -3575,7 +3576,7 @@ func (s *Versioning3Suite) setRampingDeployment( func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { if versioning3DeploymentWorkflowVersion == workerdeployment.AsyncSetCurrentAndRamping { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ @@ -3847,7 +3848,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, transition *workflowpb.DeploymentVersionTransition, ) { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() dwf, err := env.FrontendClient().DescribeWorkflowExecution( @@ -4380,13 +4381,16 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } f, err := tqid.NewTaskQueueFamily(env.NamespaceID().String(), tv.TaskQueue().GetName()) s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + defer cancel() + for pt := range remaining { s.NoError(err) partition := f.TaskQueue(pt.tp).NormalPartition(pt.part) // Use lower-level GetTaskQueueUserData instead of GetWorkerBuildIdCompatibility // here so that we can target activity queues. 
res, err := env.GetTestCluster().MatchingClient().GetTaskQueueUserData( - s.Context(), + ctx, &matchingservice.GetTaskQueueUserDataRequest{ NamespaceId: env.NamespaceID().String(), TaskQueue: partition.RpcName(), From f0182846802c7aec051b955d18a7d9d2a78b51b0 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 08:00:52 -0700 Subject: [PATCH 28/47] Reduce versioning3 verification poll pressure --- tests/versioning_3_test.go | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 333cd441b8..593c6a6d8b 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -55,15 +55,16 @@ import ( type versionStatus int const ( - tqTypeWf = enumspb.TASK_QUEUE_TYPE_WORKFLOW - tqTypeAct = enumspb.TASK_QUEUE_TYPE_ACTIVITY - tqTypeNexus = enumspb.TASK_QUEUE_TYPE_NEXUS - vbUnspecified = enumspb.VERSIONING_BEHAVIOR_UNSPECIFIED - vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED - vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE - ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 - ver3PollTimeout = 2 * time.Minute - ver3RPCTimeout = 5 * time.Second + tqTypeWf = enumspb.TASK_QUEUE_TYPE_WORKFLOW + tqTypeAct = enumspb.TASK_QUEUE_TYPE_ACTIVITY + tqTypeNexus = enumspb.TASK_QUEUE_TYPE_NEXUS + vbUnspecified = enumspb.VERSIONING_BEHAVIOR_UNSPECIFIED + vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED + vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE + ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 + ver3PollTimeout = 2 * time.Minute + ver3RPCTimeout = 5 * time.Second + ver3PollInterval = 2 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -3437,7 +3438,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, ver3PollInterval) // Wait for propagation to 
complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3511,7 +3512,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te s.NoError(err) s.True(resp.GetIsMember()) } - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, ver3PollInterval) } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { @@ -3530,7 +3531,7 @@ func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *tes return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, ver3PollInterval) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3567,7 +3568,7 @@ func (s *Versioning3Suite) setRampingDeployment( return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, ver3PollInterval) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3589,7 +3590,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment } s.NoError(err) s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState()) - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, ver3PollInterval) } } @@ -3895,7 +3896,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, versioningInfo.GetVersionTransition(), )) } - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, ver3PollInterval) } func respondActivity() *workflowservice.RespondActivityTaskCompletedRequest { @@ -4448,7 +4449,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } s.Empty(remaining) - }, 90*time.Second, 500*time.Millisecond) + }, 
90*time.Second, ver3PollInterval) } func (s *Versioning3Suite) validateBacklogCount( From 2c9d2aaf8ba4bc725eb1a6bfb76627fe000ea22b Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 08:46:42 -0700 Subject: [PATCH 29/47] Reduce versioning3 propagation partition checks --- tests/versioning_3_test.go | 47 +++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 593c6a6d8b..236ee45129 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -55,16 +55,15 @@ import ( type versionStatus int const ( - tqTypeWf = enumspb.TASK_QUEUE_TYPE_WORKFLOW - tqTypeAct = enumspb.TASK_QUEUE_TYPE_ACTIVITY - tqTypeNexus = enumspb.TASK_QUEUE_TYPE_NEXUS - vbUnspecified = enumspb.VERSIONING_BEHAVIOR_UNSPECIFIED - vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED - vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE - ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 - ver3PollTimeout = 2 * time.Minute - ver3RPCTimeout = 5 * time.Second - ver3PollInterval = 2 * time.Second + tqTypeWf = enumspb.TASK_QUEUE_TYPE_WORKFLOW + tqTypeAct = enumspb.TASK_QUEUE_TYPE_ACTIVITY + tqTypeNexus = enumspb.TASK_QUEUE_TYPE_NEXUS + vbUnspecified = enumspb.VERSIONING_BEHAVIOR_UNSPECIFIED + vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED + vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE + ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 + ver3PollTimeout = 2 * time.Minute + ver3RPCTimeout = 5 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -3438,7 +3437,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv return } s.NoError(err) - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version 
s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3512,7 +3511,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te s.NoError(err) s.True(resp.GetIsMember()) } - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { @@ -3531,7 +3530,7 @@ func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *tes return } s.NoError(err) - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3568,7 +3567,7 @@ func (s *Versioning3Suite) setRampingDeployment( return } s.NoError(err) - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3590,7 +3589,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment } s.NoError(err) s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState()) - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) } } @@ -3896,7 +3895,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, versioningInfo.GetVersionTransition(), )) } - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) } func respondActivity() *workflowservice.RespondActivityTaskCompletedRequest { @@ -4375,7 +4374,7 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( tp enumspb.TaskQueueType } remaining := make(map[partAndType]struct{}) - for i := range partitionCount { + for _, i := range 
versioning3PropagationPartitions(partitionCount) { for _, tqt := range tqTypes { remaining[partAndType{i, tqt}] = struct{}{} } @@ -4449,7 +4448,19 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } s.Empty(remaining) - }, 90*time.Second, ver3PollInterval) + }, 90*time.Second, 500*time.Millisecond) +} + +func versioning3PropagationPartitions(partitionCount int) []int { + if partitionCount >= 12 { + return []int{0, 5, 11} + } + + partitions := make([]int, 0, partitionCount) + for i := range partitionCount { + partitions = append(partitions, i) + } + return partitions } func (s *Versioning3Suite) validateBacklogCount( From d62f2a41acb87f012a8c5aab95ec49f60e3c0618 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 09:16:03 -0700 Subject: [PATCH 30/47] Use suite-scoped cluster pools --- tests/testcore/functional_test_base.go | 3 +- tests/testcore/test_cluster_pool.go | 176 +++++++++++++++++-------- tests/testcore/test_env.go | 11 +- tests/testcore/test_env_test.go | 8 ++ tests/versioning_3_test.go | 2 +- 5 files changed, 137 insertions(+), 63 deletions(-) diff --git a/tests/testcore/functional_test_base.go b/tests/testcore/functional_test_base.go index 01feb7e670..278bb02772 100644 --- a/tests/testcore/functional_test_base.go +++ b/tests/testcore/functional_test_base.go @@ -276,7 +276,8 @@ func (s *FunctionalTestBase) TearDownSuite() { func (s *FunctionalTestBase) SetupSuiteWithCluster(options ...TestClusterOption) { // Acquire a slot from the dedicated test cluster pool. - testClusterPool.dedicated.acquireSlot(s.T()) + params := ApplyTestClusterOptions(options) + testClusterPool.acquireDedicatedSlot(s.T(), params.EnableWorkerService) s.setupCluster(options...) 
} diff --git a/tests/testcore/test_cluster_pool.go b/tests/testcore/test_cluster_pool.go index 7234ed79a0..afcacfffd0 100644 --- a/tests/testcore/test_cluster_pool.go +++ b/tests/testcore/test_cluster_pool.go @@ -40,15 +40,8 @@ func init() { maxUsage = 50 } - sharedPool := newPool(sharedSize, false) - sharedPool.maxUsage = maxUsage - - dedicatedPool := newPool(dedicatedSize, true) - dedicatedPool.maxUsage = maxUsage - testClusterPool = &clusterPool{ - shared: sharedPool, - dedicated: dedicatedPool, + pools: newClusterPools(sharedSize, dedicatedSize, maxUsage), } } @@ -82,6 +75,33 @@ func newPool(size int, exclusive bool) *pool { return p } +func newPoolWithMaxUsage(size int, exclusive bool, maxUsage int) *pool { + p := newPool(size, exclusive) + p.maxUsage = maxUsage + return p +} + +func newClusterPools(sharedSize, dedicatedSize, maxUsage int) map[clusterPoolKey]*pool { + pools := make(map[clusterPoolKey]*pool, 4) + for _, key := range []clusterPoolKey{ + {kind: poolKindShared}, + {kind: poolKindShared, workerService: true}, + {kind: poolKindDedicated}, + {kind: poolKindDedicated, workerService: true}, + } { + size := sharedSize + if key.kind == poolKindDedicated { + size = dedicatedSize + } + pools[key] = newPoolWithMaxUsage(size, key.kind == poolKindDedicated, maxUsage) + } + return pools +} + +func DefaultSuiteClusterPoolSize() int { + return max(1, runtime.GOMAXPROCS(0)/2) +} + // get returns a cluster from the pool, creating it lazily if needed. // For exclusive pools, blocks until a slot is available and registers cleanup. // For shared pools, uses round-robin. @@ -133,79 +153,87 @@ func (p *pool) acquireSlot(t *testing.T) { } type clusterPool struct { - shared *pool - dedicated *pool + pools map[clusterPoolKey]*pool suiteScoped sync.Map } type suiteScopedCluster struct { - once sync.Once - cluster *FunctionalTestBase + pools map[clusterPoolKey]*pool } -// UseSuiteScopedCluster makes NewEnv use one cluster for all tests under `t`. 
-// The cluster is created on first use and torn down when `t` completes. -// -// Deprecated: this only exists for backwards-compatibility with legacy sequential -// suite execution. -func UseSuiteScopedCluster(t *testing.T) { +type poolKind int + +const ( + poolKindShared poolKind = iota + poolKindDedicated +) + +type clusterPoolKey struct { + kind poolKind + workerService bool +} + +// UseSuiteScopedClusters makes NewEnv use suite-local cluster pools for all +// tests under `t`. Clusters are created on first use and torn down when `t` +// completes. +func UseSuiteScopedClusters(t *testing.T, size int) { t.Helper() + if size <= 0 { + t.Fatalf("suite-scoped cluster pool size must be positive, got %d", size) + } rootName, _, _ := strings.Cut(t.Name(), "/") if t.Name() != rootName { - t.Fatalf("UseSuiteScopedCluster must be called from a top-level test, got %q", t.Name()) + t.Fatalf("UseSuiteScopedClusters must be called from a top-level test, got %q", t.Name()) + } + suiteCluster := &suiteScopedCluster{ + pools: map[clusterPoolKey]*pool{ + {kind: poolKindShared}: newPool(size, false), + {kind: poolKindShared, workerService: true}: newPool(size, false), + }, + } + actual, loaded := testClusterPool.suiteScoped.LoadOrStore(rootName, suiteCluster) + if loaded { + suiteCluster = actual.(*suiteScopedCluster) } - testClusterPool.suiteScoped.LoadOrStore(rootName, &suiteScopedCluster{}) t.Cleanup(func() { - suiteClusterAny, ok := testClusterPool.suiteScoped.Load(rootName) - if ok { - suiteCluster := suiteClusterAny.(*suiteScopedCluster) - if suiteCluster.cluster != nil { - if err := suiteCluster.cluster.testCluster.TearDownCluster(); err != nil { - t.Logf("Failed to tear down suite-scoped cluster: %v", err) - } - } - } + suiteCluster.tearDown(t) testClusterPool.suiteScoped.Delete(rootName) }) } -func (p *clusterPool) get(t *testing.T, dedicated bool, dynamicConfig map[dynamicconfig.Key]any, clusterOpts []TestClusterOption) *FunctionalTestBase { +func (p *clusterPool) get(t 
*testing.T, dedicated bool, workerService bool, dynamicConfig map[dynamicconfig.Key]any, clusterOpts []TestClusterOption) *FunctionalTestBase { if dedicated || len(dynamicConfig) > 0 || len(clusterOpts) > 0 { - return p.getDedicated(t, dynamicConfig, clusterOpts) + return p.getDedicated(t, workerService, dynamicConfig, clusterOpts) } - if cluster := p.getSuiteScoped(t); cluster != nil { + if cluster := p.getSuiteScoped(t, workerService); cluster != nil { return cluster } - return p.getShared(t) + return p.getPooled(t, clusterPoolKey{ + kind: poolKindShared, + workerService: workerService, + }, nil, true, nil) } -func (p *clusterPool) getShared(t *testing.T) *FunctionalTestBase { - return p.shared.get(t, func() *FunctionalTestBase { - return p.createCluster(t, nil, true, nil) - }) -} - -func (p *clusterPool) getSuiteScoped(t *testing.T) *FunctionalTestBase { +func (p *clusterPool) getSuiteScoped(t *testing.T, workerService bool) *FunctionalTestBase { rootName, _, _ := strings.Cut(t.Name(), "/") - if _, ok := p.suiteScoped.Load(rootName); !ok { + suiteClusterAny, ok := p.suiteScoped.Load(rootName) + if !ok { return nil } - - suiteClusterAny, _ := p.suiteScoped.LoadOrStore(rootName, &suiteScopedCluster{}) suiteCluster := suiteClusterAny.(*suiteScopedCluster) - suiteCluster.once.Do(func() { - suiteCluster.cluster = p.createCluster(t, nil, true, nil) - }) - suiteCluster.cluster.SetT(t) - return suiteCluster.cluster + return suiteCluster.get(t, p, workerService) } -func (p *clusterPool) getDedicated(t *testing.T, dynamicConfig map[dynamicconfig.Key]any, clusterOpts []TestClusterOption) *FunctionalTestBase { +func (p *clusterPool) getDedicated(t *testing.T, workerService bool, dynamicConfig map[dynamicconfig.Key]any, clusterOpts []TestClusterOption) *FunctionalTestBase { + key := clusterPoolKey{ + kind: poolKindDedicated, + workerService: workerService, + } if len(dynamicConfig) > 0 || len(clusterOpts) > 0 { // Custom config or fx options require a fresh cluster (can't 
reuse). - p.dedicated.acquireSlot(t) - cluster := p.createCluster(t, dynamicConfig, false, clusterOpts) + p.pools[key].acquireSlot(t) + cluster := p.createCluster(t, dynamicConfig, false, workerService, clusterOpts) // Register cleanup to tear down the cluster when the test completes. t.Cleanup(func() { @@ -218,17 +246,55 @@ func (p *clusterPool) getDedicated(t *testing.T, dynamicConfig map[dynamicconfig } // If no custom config is provided, reuse an existing cluster. - return p.dedicated.get(t, func() *FunctionalTestBase { - return p.createCluster(t, nil, false, nil) + return p.getPooled(t, key, nil, false, nil) +} + +func (p *clusterPool) acquireDedicatedSlot(t *testing.T, workerService bool) { + p.pools[clusterPoolKey{ + kind: poolKindDedicated, + workerService: workerService, + }].acquireSlot(t) +} + +func (p *clusterPool) getPooled(t *testing.T, key clusterPoolKey, dynamicConfig map[dynamicconfig.Key]any, shared bool, clusterOpts []TestClusterOption) *FunctionalTestBase { + return p.pools[key].get(t, func() *FunctionalTestBase { + return p.createCluster(t, dynamicConfig, shared, key.workerService, clusterOpts) + }) +} + +func (s *suiteScopedCluster) get(t *testing.T, clusterPool *clusterPool, workerService bool) *FunctionalTestBase { + key := clusterPoolKey{ + kind: poolKindShared, + workerService: workerService, + } + return s.pools[key].get(t, func() *FunctionalTestBase { + return clusterPool.createCluster(t, nil, true, workerService, nil) }) } -func (p *clusterPool) createCluster(t *testing.T, dynamicConfig map[dynamicconfig.Key]any, shared bool, clusterOpts []TestClusterOption) *FunctionalTestBase { +func (s *suiteScopedCluster) tearDown(t *testing.T) { + for _, pool := range s.pools { + pool.tearDown(t) + } +} + +func (p *pool) tearDown(t *testing.T) { + for idx, cluster := range p.clusters { + if cluster == nil { + continue + } + if err := cluster.testCluster.TearDownCluster(); err != nil { + t.Logf("Failed to tear down suite-scoped cluster %d: %v", idx, 
err) + } + } +} + +func (p *clusterPool) createCluster(t *testing.T, dynamicConfig map[dynamicconfig.Key]any, shared bool, workerService bool, clusterOpts []TestClusterOption) *FunctionalTestBase { tbase := &FunctionalTestBase{} tbase.SetT(t) // Keep the worker service off unless explicitly enabled via WithWorkerService. - opts := []TestClusterOption{withWorkerService(false)} + opts := []TestClusterOption{withWorkerService(workerService)} if shared { opts = append(opts, WithSharedCluster()) } diff --git a/tests/testcore/test_env.go b/tests/testcore/test_env.go index 0da369e129..8eec924b6d 100644 --- a/tests/testcore/test_env.go +++ b/tests/testcore/test_env.go @@ -91,6 +91,7 @@ type TestOption func(*testOptions) type testOptions struct { dedicatedCluster bool dedicatedReason string + workerService bool dynamicConfigSettings []dynamicConfigOverride clusterOptions []TestClusterOption } @@ -126,12 +127,10 @@ func WithFxOptions(serviceName primitives.ServiceName, opts ...fx.Option) TestOp } // WithWorkerService enables the system worker service. The service is off by -// default to avoid the worker overhead. This implies a dedicated cluster. -func WithWorkerService(reason string) TestOption { +// default to avoid the worker overhead. +func WithWorkerService(_ string) TestOption { return func(o *testOptions) { - o.dedicatedCluster = true - o.clusterOptions = append(o.clusterOptions, withWorkerService(true)) - o.dedicatedReason = "worker service required: " + reason + o.workerService = true } } @@ -188,7 +187,7 @@ func NewEnv(t *testing.T, opts ...TestOption) *TestEnv { } // Obtain the test cluster from the pool. - base := testClusterPool.get(t, options.dedicatedCluster, startupConfig, options.clusterOptions) + base := testClusterPool.get(t, options.dedicatedCluster, options.workerService, startupConfig, options.clusterOptions) cluster := base.GetTestCluster() // Create a dedicated namespace for the test to help with test isolation. 
diff --git a/tests/testcore/test_env_test.go b/tests/testcore/test_env_test.go index f8ad7234dd..1429665be5 100644 --- a/tests/testcore/test_env_test.go +++ b/tests/testcore/test_env_test.go @@ -28,6 +28,14 @@ func (s *TestEnvSuite) TestDedicatedClusterGuard_FailsWhenUnused() { `testcore.WithDedicatedCluster() was requested but no dedicated-cluster-only feature was used`) } +func (s *TestEnvSuite) TestWithWorkerServiceDoesNotRequireDedicatedCluster() { + var opts testOptions + WithWorkerService("test")(&opts) + + s.False(opts.dedicatedCluster) + s.True(opts.workerService) +} + func (s *TestEnvSuite) TestDedicatedClusterGuard_NoErrorAfterUse() { guard := newDedicatedClusterGuard(true) guard.record("global hook") diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 236ee45129..3570636e66 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -82,7 +82,7 @@ type Versioning3Suite struct { } func TestVersioning3FunctionalSuite(t *testing.T) { - testcore.UseSuiteScopedCluster(t) //nolint:staticcheck // SA1019: suite still requires legacy sequential execution + testcore.UseSuiteScopedClusters(t, testcore.DefaultSuiteClusterPoolSize()) parallelsuite.Run(t, &Versioning3Suite{}) } From ad45057c830b6c18143a0a8a0d8efa69a24f0459 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 09:16:08 -0700 Subject: [PATCH 31/47] Revert ineffective versioning3 wait tuning --- tests/versioning_3_test.go | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 3570636e66..c81892a9fb 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,7 +63,6 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute - ver3RPCTimeout = 5 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = 
versionStatus(1) @@ -3423,7 +3422,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3498,7 +3497,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te tqTypes = []enumspb.TaskQueueType{tqTypeWf} } s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() for _, tqType := range tqTypes { @@ -3517,7 +3516,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3552,7 +3551,7 @@ func (s *Versioning3Suite) setRampingDeployment( deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() req := &workflowservice.SetWorkerDeploymentRampingVersionRequest{ @@ -3576,7 +3575,7 @@ func (s *Versioning3Suite) setRampingDeployment( func (s 
*Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { if versioning3DeploymentWorkflowVersion == workerdeployment.AsyncSetCurrentAndRamping { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ @@ -3848,7 +3847,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, transition *workflowpb.DeploymentVersionTransition, ) { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() dwf, err := env.FrontendClient().DescribeWorkflowExecution( @@ -4374,23 +4373,20 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( tp enumspb.TaskQueueType } remaining := make(map[partAndType]struct{}) - for _, i := range versioning3PropagationPartitions(partitionCount) { + for i := range partitionCount { for _, tqt := range tqTypes { remaining[partAndType{i, tqt}] = struct{}{} } } f, err := tqid.NewTaskQueueFamily(env.NamespaceID().String(), tv.TaskQueue().GetName()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) - defer cancel() - for pt := range remaining { s.NoError(err) partition := f.TaskQueue(pt.tp).NormalPartition(pt.part) // Use lower-level GetTaskQueueUserData instead of GetWorkerBuildIdCompatibility // here so that we can target activity queues. 
res, err := env.GetTestCluster().MatchingClient().GetTaskQueueUserData( - ctx, + s.Context(), &matchingservice.GetTaskQueueUserDataRequest{ NamespaceId: env.NamespaceID().String(), TaskQueue: partition.RpcName(), @@ -4451,18 +4447,6 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( }, 90*time.Second, 500*time.Millisecond) } -func versioning3PropagationPartitions(partitionCount int) []int { - if partitionCount >= 12 { - return []int{0, 5, 11} - } - - partitions := make([]int, 0, partitionCount) - for i := range partitionCount { - partitions = append(partitions, i) - } - return partitions -} - func (s *Versioning3Suite) validateBacklogCount( env *testcore.TestEnv, tv *testvars.TestVars, From 5e23afb14cd4893a668a077c054531ed5608073f Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 10:27:17 -0700 Subject: [PATCH 32/47] Let child inheritance workflows pause via signal --- tests/versioning_3_test.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index c81892a9fb..72ea5ec01a 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -1886,15 +1886,13 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo } wfStarted := make(chan struct{}, 1) - currentChanged := make(chan struct{}, 1) childv1 := func(ctx workflow.Context) (string, error) { return "v1", nil } wf1 := func(ctx workflow.Context) (string, error) { wfStarted <- struct{}{} - // wait for current version to change - <-currentChanged + workflow.GetSignalChannel(ctx, "currentVersionChanged").Receive(ctx, nil) // run two child workflows fut1 := workflow.ExecuteChildWorkflow(workflow.WithChildOptions(ctx, workflow.ChildWorkflowOptions{ @@ -1956,6 +1954,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo // wait for it to start on v1 env.WaitForChannel(wfStarted) close(wfStarted) // force panic if replayed + 
s.verifyWorkflowVersioning(env, tv1, parentRegistrationBehavior, tv1.Deployment(), override, nil) // make v2 current for both parent and child and unblock the wf to start the child s.updateTaskQueueDeploymentDataWithRoutingConfig(env, tv2, &deploymentpb.RoutingConfig{ @@ -1986,7 +1985,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo }, []string{}, tqTypeWf) s.pollUntilRegistered(env, tv1Child) } - currentChanged <- struct{}{} + s.NoError(env.SdkClient().SignalWorkflow(s.Context(), run.GetID(), run.GetRunID(), "currentVersionChanged", nil)) var out string s.NoError(run.Get(s.Context(), &out)) @@ -2026,7 +2025,6 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b } wfStarted := make(chan struct{}, 10) - currentChanged := make(chan struct{}, 10) childv1 := func(ctx workflow.Context) (string, error) { panic("child should not run on v1") @@ -2036,8 +2034,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b } wf1 := func(ctx workflow.Context) (string, error) { wfStarted <- struct{}{} - // wait for current version to change - <-currentChanged + workflow.GetSignalChannel(ctx, "currentVersionChanged").Receive(ctx, nil) fut1 := workflow.ExecuteChildWorkflow(workflow.WithChildOptions(ctx, workflow.ChildWorkflowOptions{ TaskQueue: tv2Child.TaskQueue().GetName(), @@ -2047,7 +2044,6 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b var val1 string s.NoError(fut1.Get(ctx, &val1)) - s.verifyWorkflowVersioning(env, tv1, parentBehavior, tv1.Deployment(), nil, nil) return val1, nil } @@ -2127,11 +2123,12 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b // wait for it to start on v1 env.WaitForChannel(wfStarted) close(wfStarted) + s.verifyWorkflowVersioning(env, tv1, parentBehavior, tv1.Deployment(), nil, nil) // make v2 current for both parent and child and unblock the wf to start the child 
s.setCurrentDeployment(env, tv2) - currentChanged <- struct{}{} + s.NoError(sdkClient.SignalWorkflow(s.Context(), run.GetID(), run.GetRunID(), "currentVersionChanged", nil)) var out string s.NoError(run.Get(s.Context(), &out)) From 378a95754f6333e810d8359376e9b05de10dd64e Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 11:06:46 -0700 Subject: [PATCH 33/47] Limit child inheritance signal pause to inherit cases --- tests/versioning_3_test.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 72ea5ec01a..a2da41a798 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -2025,6 +2025,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b } wfStarted := make(chan struct{}, 10) + currentChanged := make(chan struct{}, 10) childv1 := func(ctx workflow.Context) (string, error) { panic("child should not run on v1") @@ -2034,7 +2035,8 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b } wf1 := func(ctx workflow.Context) (string, error) { wfStarted <- struct{}{} - workflow.GetSignalChannel(ctx, "currentVersionChanged").Receive(ctx, nil) + // wait for current version to change + <-currentChanged fut1 := workflow.ExecuteChildWorkflow(workflow.WithChildOptions(ctx, workflow.ChildWorkflowOptions{ TaskQueue: tv2Child.TaskQueue().GetName(), @@ -2044,6 +2046,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b var val1 string s.NoError(fut1.Get(ctx, &val1)) + s.verifyWorkflowVersioning(env, tv1, parentBehavior, tv1.Deployment(), nil, nil) return val1, nil } @@ -2123,12 +2126,11 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b // wait for it to start on v1 env.WaitForChannel(wfStarted) close(wfStarted) - s.verifyWorkflowVersioning(env, tv1, parentBehavior, tv1.Deployment(), nil, nil) // make v2 current for both parent and child 
and unblock the wf to start the child s.setCurrentDeployment(env, tv2) - s.NoError(sdkClient.SignalWorkflow(s.Context(), run.GetID(), run.GetRunID(), "currentVersionChanged", nil)) + currentChanged <- struct{}{} var out string s.NoError(run.Get(s.Context(), &out)) From d032ec8448ca416cf02d5d33f9532c3a0ff0c4b3 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 11:31:55 -0700 Subject: [PATCH 34/47] Start sticky completion poll before activity finish --- tests/versioning_3_test.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index a2da41a798..da99fa0f32 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -320,14 +320,15 @@ func (s *Versioning3Suite) testWorkflowWithPinnedOverride(env *testcore.TestEnv, s.verifyWorkflowStickyQueue(env, tv.WithRunID(runID)) } - env.WaitForChannel(actCompleted) - s.verifyWorkflowVersioning(env, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) - - s.pollWftAndHandle(env, tv, sticky, nil, + finalWFTCompleted := make(chan struct{}) + s.pollWftAndHandle(env, tv, sticky, finalWFTCompleted, func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { s.NotNil(task) return respondCompleteWorkflow(tv, vbUnpinned), nil }) + + env.WaitForChannel(actCompleted) + env.WaitForChannel(finalWFTCompleted) s.verifyWorkflowVersioning(env, tv, vbUnpinned, tv.Deployment(), tv.VersioningOverridePinned(), nil) } @@ -636,14 +637,15 @@ func (s *Versioning3Suite) testUnpinnedWorkflow(env *testcore.TestEnv, sticky bo s.verifyWorkflowStickyQueue(env, tv.WithRunID(runID)) } - env.WaitForChannel(actCompleted) - s.verifyWorkflowVersioning(env, tv, vbUnpinned, tv.Deployment(), nil, nil) - - s.pollWftAndHandle(env, tv, sticky, nil, + finalWFTCompleted := make(chan struct{}) + s.pollWftAndHandle(env, tv, sticky, finalWFTCompleted, func(task 
*workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { s.NotNil(task) return respondCompleteWorkflow(tv, vbUnpinned), nil }) + + env.WaitForChannel(actCompleted) + env.WaitForChannel(finalWFTCompleted) s.verifyWorkflowVersioning(env, tv, vbUnpinned, tv.Deployment(), nil, nil) } From a03a957cf42d38c17061207fd304ca18e91abfa5 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 12:50:38 -0700 Subject: [PATCH 35/47] Harden remaining versioning3 parallel waits --- tests/versioning_3_test.go | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index da99fa0f32..434887bb0c 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -397,13 +397,16 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become drained s.Await(func(s *Versioning3Suite) { - resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(s.Context(), &workflowservice.DescribeWorkerDeploymentVersionRequest{ + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + + resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ Namespace: env.Namespace().String(), Version: tv.DeploymentVersionString(), }) s.NoError(err) s.Equal(enumspb.VERSION_DRAINAGE_STATUS_DRAINED, resp.GetWorkerDeploymentVersionInfo().GetDrainageInfo().GetStatus()) - }, time.Second*10, time.Millisecond*1000) + }, 90*time.Second, 500*time.Millisecond) if !pollersPresent { // simulate the pollers going away, which should make the query fail as now the version is drained + has no pollers polling it @@ -427,13 +430,16 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become ramping s.Await(func(s *Versioning3Suite) { - resp, err := 
env.FrontendClient().DescribeWorkerDeploymentVersion(s.Context(), &workflowservice.DescribeWorkerDeploymentVersionRequest{ + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + defer cancel() + + resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ Namespace: env.Namespace().String(), Version: tv.DeploymentVersionString(), }) s.NoError(err) s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING, resp.GetWorkerDeploymentVersionInfo().GetStatus()) - }, time.Second*10, time.Millisecond*1000) + }, 90*time.Second, 500*time.Millisecond) // the ramping status is propagated to the task queues s.waitForDeploymentDataPropagation(env, tv, versionStatusRamping, false, tqTypeWf) @@ -1944,7 +1950,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectInherit(crossTq boo ID: tv1.WorkflowID(), TaskQueue: tv1.TaskQueue().GetName(), VersioningOverride: nil, - WorkflowTaskTimeout: 10 * time.Second, + WorkflowTaskTimeout: 30 * time.Second, } if withOverride { startOpts.VersioningOverride = &sdkclient.PinnedVersioningOverride{ @@ -3011,7 +3017,6 @@ func (s *Versioning3Suite) testCan(crossTq bool, behavior enumspb.VersioningBeha } wfStarted := make(chan struct{}, 10) - currentChanged := make(chan struct{}, 10) wf1 := func(ctx workflow.Context, attempt int) (string, error) { switch attempt { @@ -3030,8 +3035,7 @@ func (s *Versioning3Suite) testCan(crossTq bool, behavior enumspb.VersioningBeha s.verifyWorkflowVersioning(env, tv1, vbUnspecified, nil, override, tv1.DeploymentVersionTransition()) } wfStarted <- struct{}{} - // wait for current version to change. 
- <-currentChanged + workflow.GetSignalChannel(ctx, "currentVersionChanged").Receive(ctx, nil) return "", workflow.NewContinueAsNewError(newCtx, "wf", attempt+1) case 1: s.verifyWorkflowVersioning(env, tv1, vbPinned, tv1.Deployment(), override, nil) @@ -3138,7 +3142,7 @@ func (s *Versioning3Suite) testCan(crossTq bool, behavior enumspb.VersioningBeha // make v2 current s.setCurrentDeployment(env, tv2) - currentChanged <- struct{}{} + s.NoError(sdkClient.SignalWorkflow(s.Context(), run.GetID(), run.GetRunID(), "currentVersionChanged", nil)) var out string s.NoError(run.Get(s.Context(), &out)) @@ -4123,7 +4127,7 @@ func (s *Versioning3Suite) pollWftAndHandleQueries( DeploymentOptions: tv.WorkerDeploymentOptions(true), TaskQueue: tq, }, - ).HandleLegacyQuery(tv, handler) + ).HandleLegacyQuery(tv, handler, taskpoller.WithTimeout(ver3PollTimeout)) } if async == nil { resp, err := f() From fc002772f7e371f23443406884c9407d22ba8d3c Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 15:13:08 -0700 Subject: [PATCH 36/47] Use shorter RPC deadlines in versioning3 awaits --- tests/versioning_3_test.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 434887bb0c..f4a68fabde 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,6 +63,7 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute + ver3RPCTimeout = 10 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -397,7 +398,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become drained s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() resp, err := 
env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -430,7 +431,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become ramping s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -3427,7 +3428,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3502,7 +3503,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te tqTypes = []enumspb.TaskQueueType{tqTypeWf} } s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() for _, tqType := range tqTypes { @@ -3521,7 +3522,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := 
&workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3556,7 +3557,7 @@ func (s *Versioning3Suite) setRampingDeployment( deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := &workflowservice.SetWorkerDeploymentRampingVersionRequest{ @@ -3580,7 +3581,7 @@ func (s *Versioning3Suite) setRampingDeployment( func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { if versioning3DeploymentWorkflowVersion == workerdeployment.AsyncSetCurrentAndRamping { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ @@ -3852,7 +3853,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, transition *workflowpb.DeploymentVersionTransition, ) { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() dwf, err := env.FrontendClient().DescribeWorkflowExecution( From 522615217e8f250fca279dce576e50166fb894a7 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 16:31:19 -0700 Subject: [PATCH 37/47] Revert "Use shorter RPC deadlines in versioning3 awaits" This reverts commit 2fa0c6abb22d821c3ba09833f64dd3d94ae86ecf. 
--- tests/versioning_3_test.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index f4a68fabde..434887bb0c 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,7 +63,6 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute - ver3RPCTimeout = 10 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -398,7 +397,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become drained s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -431,7 +430,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become ramping s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -3428,7 +3427,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() req := 
&workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3503,7 +3502,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te tqTypes = []enumspb.TaskQueueType{tqTypeWf} } s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() for _, tqType := range tqTypes { @@ -3522,7 +3521,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() req := &workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3557,7 +3556,7 @@ func (s *Versioning3Suite) setRampingDeployment( deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() req := &workflowservice.SetWorkerDeploymentRampingVersionRequest{ @@ -3581,7 +3580,7 @@ func (s *Versioning3Suite) setRampingDeployment( func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { if versioning3DeploymentWorkflowVersion == workerdeployment.AsyncSetCurrentAndRamping { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeployment(ctx, &workflowservice.DescribeWorkerDeploymentRequest{ @@ -3853,7 
+3852,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, transition *workflowpb.DeploymentVersionTransition, ) { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() dwf, err := env.FrontendClient().DescribeWorkflowExecution( From f3d9b54bac3f611f22677cba3039f834d762bf7e Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 16:34:09 -0700 Subject: [PATCH 38/47] Retry late activity poller handoff --- tests/versioning_3_test.go | 42 +++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 434887bb0c..24a79e64f2 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -55,14 +55,15 @@ import ( type versionStatus int const ( - tqTypeWf = enumspb.TASK_QUEUE_TYPE_WORKFLOW - tqTypeAct = enumspb.TASK_QUEUE_TYPE_ACTIVITY - tqTypeNexus = enumspb.TASK_QUEUE_TYPE_NEXUS - vbUnspecified = enumspb.VERSIONING_BEHAVIOR_UNSPECIFIED - vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED - vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE - ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 - ver3PollTimeout = 2 * time.Minute + tqTypeWf = enumspb.TASK_QUEUE_TYPE_WORKFLOW + tqTypeAct = enumspb.TASK_QUEUE_TYPE_ACTIVITY + tqTypeNexus = enumspb.TASK_QUEUE_TYPE_NEXUS + vbUnspecified = enumspb.VERSIONING_BEHAVIOR_UNSPECIFIED + vbPinned = enumspb.VERSIONING_BEHAVIOR_PINNED + vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE + ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 + ver3PollTimeout = 2 * time.Minute + ver3RetryPollTimeout = 21 * time.Second versionStatusNil = versionStatus(0) versionStatusInactive = versionStatus(1) @@ -582,7 +583,7 @@ func (s *Versioning3Suite) testPinnedWorkflowWithLateActivityPoller(env *testcor // When the first activity poller 
arrives from this deployment, it registers the TQ in the // deployment and that will trigger reevaluation of backlog queue. - s.pollActivityAndHandle(env, tv, nil, + s.pollActivityAndHandleEventually(env, tv, func(task *workflowservice.PollActivityTaskQueueResponse) (*workflowservice.RespondActivityTaskCompletedRequest, error) { s.NotNil(task) return respondActivity(), nil @@ -4191,6 +4192,17 @@ func (s *Versioning3Suite) pollActivityAndHandle( s.doPollActivityAndHandle(env, tv, true, async, handler) } +func (s *Versioning3Suite) pollActivityAndHandleEventually( + env *testcore.TestEnv, + tv *testvars.TestVars, + handler func(task *workflowservice.PollActivityTaskQueueResponse) (*workflowservice.RespondActivityTaskCompletedRequest, error), +) { + s.Await(func(s *Versioning3Suite) { + err := s.doPollActivityAndHandleErrWithTimeout(env, tv, true, ver3RetryPollTimeout, handler) + s.NoError(err) + }, 90*time.Second, 500*time.Millisecond) +} + func (s *Versioning3Suite) pollActivityAndHandleErr( env *testcore.TestEnv, tv *testvars.TestVars, @@ -4224,12 +4236,22 @@ func (s *Versioning3Suite) doPollActivityAndHandleErr( tv *testvars.TestVars, versioned bool, handler func(task *workflowservice.PollActivityTaskQueueResponse) (*workflowservice.RespondActivityTaskCompletedRequest, error), +) error { + return s.doPollActivityAndHandleErrWithTimeout(env, tv, versioned, ver3PollTimeout, handler) +} + +func (s *Versioning3Suite) doPollActivityAndHandleErrWithTimeout( + env *testcore.TestEnv, + tv *testvars.TestVars, + versioned bool, + timeout time.Duration, + handler func(task *workflowservice.PollActivityTaskQueueResponse) (*workflowservice.RespondActivityTaskCompletedRequest, error), ) error { poller := taskpoller.New(s.T(), env.FrontendClient(), env.Namespace().String()) _, err := poller.PollActivityTask( &workflowservice.PollActivityTaskQueueRequest{ DeploymentOptions: tv.WorkerDeploymentOptions(versioned), - }).HandleTask(tv, handler, 
taskpoller.WithTimeout(ver3PollTimeout)) + }).HandleTask(tv, handler, taskpoller.WithTimeout(timeout)) return err } From ba02c69b61646bbbb390d85104fa23089b153220 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 16:51:24 -0700 Subject: [PATCH 39/47] Close standalone versioning3 SDK clients --- tests/versioning_3_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index 24a79e64f2..e9a72eb6dd 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -2076,6 +2076,7 @@ func (s *Versioning3Suite) testChildWorkflowInheritanceExpectNoInherit(crossTq b Namespace: env.Namespace().String(), }) s.NoError(err) + defer sdkClient.Close() w1 := worker.New(sdkClient, tv1.TaskQueue().GetName(), worker.Options{ DeploymentOptions: worker.DeploymentOptions{ @@ -3064,6 +3065,7 @@ func (s *Versioning3Suite) testCan(crossTq bool, behavior enumspb.VersioningBeha Namespace: env.Namespace().String(), }) s.NoError(err) + defer sdkClient.Close() if crossTq && expectPinnedInherit { w1xtq := worker.New(sdkClient, canxTq, worker.Options{ From 0f79ccaeaf71cc091b610903d3d0bb7ec3de3ff1 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 17:49:49 -0700 Subject: [PATCH 40/47] Reduce versioning3 retry summary noise --- tests/versioning_3_test.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index e9a72eb6dd..cc76a408d7 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -63,6 +63,7 @@ const ( vbUnpinned = enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE ver3MinPollTime = common.MinLongPollTimeout + time.Millisecond*200 ver3PollTimeout = 2 * time.Minute + ver3RPCTimeout = 10 * time.Second ver3RetryPollTimeout = 21 * time.Second versionStatusNil = versionStatus(0) @@ -398,7 +399,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // 
wait for v1 to become drained s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -417,9 +418,10 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, versionStr = worker_versioning.ExternalWorkerDeploymentVersionToString(worker_versioning.ExternalWorkerDeploymentVersionFromDeployment(tv.Deployment())) } - _, err := s.queryWorkflow(env, tv) - s.Error(err) - s.ErrorContains(err, fmt.Sprintf(matching.ErrBlackholedQuery, versionStr, versionStr)) + s.Await(func(s *Versioning3Suite) { + _, err := s.queryWorkflow(env, tv) + s.ErrorContains(err, fmt.Sprintf(matching.ErrBlackholedQuery, versionStr, versionStr)) + }, 30*time.Second, 500*time.Millisecond) } else { // since the version still has pollers, the query should succeed s.pollAndQueryWorkflow(env, tv, false) @@ -431,7 +433,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, // wait for v1 to become ramping s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() resp, err := env.FrontendClient().DescribeWorkerDeploymentVersion(ctx, &workflowservice.DescribeWorkerDeploymentVersionRequest{ @@ -3430,7 +3432,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() req := 
&workflowservice.SetWorkerDeploymentCurrentVersionRequest{ @@ -3855,7 +3857,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, transition *workflowpb.DeploymentVersionTransition, ) { s.Await(func(s *Versioning3Suite) { - ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() dwf, err := env.FrontendClient().DescribeWorkflowExecution( @@ -4510,10 +4512,10 @@ func (s *Versioning3Suite) verifyVersioningSAs( executionStatus enumspb.WorkflowExecutionStatus, usedBuilds ...*testvars.TestVars, ) { - ctx, cancel := context.WithTimeout(s.Context(), 10*time.Second) - defer cancel() - s.Await(func(s *Versioning3Suite) { + ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) + defer cancel() + var query string if behavior != vbUnspecified { query = fmt.Sprintf("WorkflowId = '%s' AND TemporalWorkerDeployment = '%s' AND TemporalWorkerDeploymentVersion= '%s' AND TemporalWorkflowVersioningBehavior = '%s' AND ExecutionStatus = '%s'", @@ -4561,7 +4563,7 @@ func (s *Versioning3Suite) verifyVersioningSAs( fmt.Println(resp.GetExecutions()[0]) } - }, 5*time.Second, 50*time.Millisecond) + }, 30*time.Second, 500*time.Millisecond) } func (s *Versioning3Suite) TestAutoUpgradeWorkflows_NoBouncingBetweenVersions() { From 4ede76d4f1d2326f8d9ac4155b66a5f79f3b7174 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 18:16:24 -0700 Subject: [PATCH 41/47] Back off await polling with jitter --- common/testing/await/require_ctx.go | 51 +++++++++++++++++++++++- common/testing/await/require_ctx_test.go | 33 +++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/common/testing/await/require_ctx.go b/common/testing/await/require_ctx.go index a830d551d2..296bc48f64 100644 --- a/common/testing/await/require_ctx.go +++ b/common/testing/await/require_ctx.go @@ -4,12 +4,21 @@ import ( "context" "fmt" "os" + "sync/atomic" "testing" "time" ) 
const requireMisuseHint = "use the *await.T passed to the callback, not s.T() or suite assertion methods" +const ( + maxPollInterval = 2 * time.Second + pollBackoffMultiplier = 2 + pollJitterDivisor = 5 +) + +var pollJitterCounter atomic.Uint64 + // softDeadlockTimeoutEnvVar overrides the default soft-deadlock timeout. // Parsed as a Go duration, e.g. "10s". const softDeadlockTimeoutEnvVar = "TEMPORAL_AWAIT_SOFT_DEADLOCK_TIMEOUT" @@ -110,6 +119,7 @@ func run( var failures []attemptFailure polls := 0 + pollBackoff := newPollBackoff(pollInterval) for { // Parent context was canceled while we were sleeping (not our deadline). @@ -171,8 +181,8 @@ func run( return } - // Wait for pollInterval, or context is canceled or deadline is reached. - sleep(awaitCtx, deadline, pollInterval) + // Wait using backoff, or until context is canceled or deadline is reached. + sleep(awaitCtx, deadline, pollBackoff.next()) } } @@ -262,6 +272,43 @@ func runAttempt( } } +type pollBackoff struct { + current time.Duration + max time.Duration +} + +func newPollBackoff(initial time.Duration) pollBackoff { + maxInterval := maxPollInterval + if initial > maxInterval { + maxInterval = initial + } + return pollBackoff{ + current: initial, + max: maxInterval, + } +} + +func (b *pollBackoff) next() time.Duration { + delay := addJitter(b.current, b.max) + if b.current < b.max { + b.current = min(b.current*pollBackoffMultiplier, b.max) + } + return delay +} + +func addJitter(base, maxDelay time.Duration) time.Duration { + if base <= 0 { + return base + } + jitterRange := base / pollJitterDivisor + if jitterRange <= 0 { + return base + } + seed := uint64(time.Now().UnixNano()) ^ pollJitterCounter.Add(0x9e3779b97f4a7c15) + delay := base + time.Duration(seed%uint64(jitterRange+1)) + return min(delay, maxDelay) +} + func sleep(ctx context.Context, deadline time.Time, pollInterval time.Duration) { remaining := time.Until(deadline) if remaining < pollInterval { diff --git 
a/common/testing/await/require_ctx_test.go b/common/testing/await/require_ctx_test.go index 5810f46eb5..b5b293c94f 100644 --- a/common/testing/await/require_ctx_test.go +++ b/common/testing/await/require_ctx_test.go @@ -151,6 +151,39 @@ func TestRequire_PollIntervalStartsAfterAttemptFinishes(t *testing.T) { } } +func TestRequire_PollIntervalBacksOff(t *testing.T) { + t.Parallel() + + var attempts atomic.Int32 + var attemptStarts []time.Time + var attemptEnds []time.Time + attemptDuration := 10 * time.Millisecond + pollInterval := 20 * time.Millisecond + + await.Require(t.Context(), t, func(t *await.T) { + attemptStarts = append(attemptStarts, time.Now()) + defer func() { attemptEnds = append(attemptEnds, time.Now()) }() + + time.Sleep(attemptDuration) //nolint:forbidigo // simulate attempt work to measure gap between attempts + + if attempts.Add(1) < 4 { + t.Error("not ready") + } + }, time.Second, pollInterval) + + require.Equal(t, int32(4), attempts.Load()) + require.Len(t, attemptStarts, 4) + require.Len(t, attemptEnds, 4) + + var gaps []time.Duration + for i := 1; i < len(attemptStarts); i++ { + gaps = append(gaps, attemptStarts[i].Sub(attemptEnds[i-1])) + } + require.GreaterOrEqual(t, gaps[0], pollInterval) + require.GreaterOrEqual(t, gaps[1], 2*pollInterval) + require.GreaterOrEqual(t, gaps[2], 4*pollInterval) +} + func TestRequire_FailureScenarios(t *testing.T) { t.Parallel() From 982f47a348703e7ebc241344f73451c45868aec5 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 19:50:15 -0700 Subject: [PATCH 42/47] Tune await backoff for flaky suites --- common/testing/await/require_ctx.go | 15 ++++++++++----- common/testing/await/require_ctx_test.go | 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/common/testing/await/require_ctx.go b/common/testing/await/require_ctx.go index 296bc48f64..f941efaeda 100644 --- a/common/testing/await/require_ctx.go +++ b/common/testing/await/require_ctx.go @@ -12,9 +12,10 @@ import ( 
const requireMisuseHint = "use the *await.T passed to the callback, not s.T() or suite assertion methods" const ( - maxPollInterval = 2 * time.Second - pollBackoffMultiplier = 2 - pollJitterDivisor = 5 + maxPollInterval = time.Second + pollBackoffMultiplierNum = 3 + pollBackoffMultiplierDen = 2 + pollJitterDivisor = 10 ) var pollJitterCounter atomic.Uint64 @@ -291,7 +292,7 @@ func newPollBackoff(initial time.Duration) pollBackoff { func (b *pollBackoff) next() time.Duration { delay := addJitter(b.current, b.max) if b.current < b.max { - b.current = min(b.current*pollBackoffMultiplier, b.max) + b.current = min(b.current*pollBackoffMultiplierNum/pollBackoffMultiplierDen, b.max) } return delay } @@ -305,7 +306,11 @@ func addJitter(base, maxDelay time.Duration) time.Duration { return base } seed := uint64(time.Now().UnixNano()) ^ pollJitterCounter.Add(0x9e3779b97f4a7c15) - delay := base + time.Duration(seed%uint64(jitterRange+1)) + jitter := time.Duration(seed%uint64(2*jitterRange+1)) - jitterRange + delay := base + jitter + if delay <= 0 { + return base + } return min(delay, maxDelay) } diff --git a/common/testing/await/require_ctx_test.go b/common/testing/await/require_ctx_test.go index b5b293c94f..d9513d85d6 100644 --- a/common/testing/await/require_ctx_test.go +++ b/common/testing/await/require_ctx_test.go @@ -146,7 +146,7 @@ func TestRequire_PollIntervalStartsAfterAttemptFinishes(t *testing.T) { require.Len(t, attemptEnds, 3) for i := 1; i < len(attemptStarts); i++ { gap := attemptStarts[i].Sub(attemptEnds[i-1]) - require.GreaterOrEqual(t, gap, pollInterval, + require.GreaterOrEqual(t, gap, pollInterval-pollInterval/10, "poll interval should run after attempt finishes (gap=%v < %v)", gap, pollInterval) } } @@ -179,9 +179,9 @@ func TestRequire_PollIntervalBacksOff(t *testing.T) { for i := 1; i < len(attemptStarts); i++ { gaps = append(gaps, attemptStarts[i].Sub(attemptEnds[i-1])) } - require.GreaterOrEqual(t, gaps[0], pollInterval) - require.GreaterOrEqual(t, 
gaps[1], 2*pollInterval) - require.GreaterOrEqual(t, gaps[2], 4*pollInterval) + require.GreaterOrEqual(t, gaps[0], pollInterval-pollInterval/10) + require.GreaterOrEqual(t, gaps[1], 3*pollInterval/2-(3*pollInterval/2)/10) + require.GreaterOrEqual(t, gaps[2], 9*pollInterval/4-(9*pollInterval/4)/10) } func TestRequire_FailureScenarios(t *testing.T) { From fa3ff5b8d233aff106ddb27c683f091a948d7066 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 20:02:57 -0700 Subject: [PATCH 43/47] Add versioning3 await diagnostics --- tests/versioning_3_test.go | 183 +++++++++++++++++++++++++------------ 1 file changed, 126 insertions(+), 57 deletions(-) diff --git a/tests/versioning_3_test.go b/tests/versioning_3_test.go index cc76a408d7..a9edb5f143 100644 --- a/tests/versioning_3_test.go +++ b/tests/versioning_3_test.go @@ -398,7 +398,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, s.setCurrentDeployment(env, tv2) // wait for v1 to become drained - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() @@ -406,9 +406,15 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, Namespace: env.Namespace().String(), Version: tv.DeploymentVersionString(), }) - s.NoError(err) - s.Equal(enumspb.VERSION_DRAINAGE_STATUS_DRAINED, resp.GetWorkerDeploymentVersionInfo().GetDrainageInfo().GetStatus()) - }, 90*time.Second, 500*time.Millisecond) + s.NoError(err, "DescribeWorkerDeploymentVersion failed: version=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentVersionString(), ctx.Err(), s.Context().Err()) + actual := resp.GetWorkerDeploymentVersionInfo().GetDrainageInfo().GetStatus() + s.Equal(enumspb.VERSION_DRAINAGE_STATUS_DRAINED, actual, + "worker deployment version drainage status mismatch: version=%s info=%v", + tv.DeploymentVersionString(), resp.GetWorkerDeploymentVersionInfo()) + }, 90*time.Second, 
500*time.Millisecond, + "wait for worker deployment version to drain: namespace=%s version=%s workflow_id=%s", + env.Namespace(), tv.DeploymentVersionString(), tv.WorkflowID()) if !pollersPresent { // simulate the pollers going away, which should make the query fail as now the version is drained + has no pollers polling it @@ -418,10 +424,14 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, versionStr = worker_versioning.ExternalWorkerDeploymentVersionToString(worker_versioning.ExternalWorkerDeploymentVersionFromDeployment(tv.Deployment())) } - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { _, err := s.queryWorkflow(env, tv) - s.ErrorContains(err, fmt.Sprintf(matching.ErrBlackholedQuery, versionStr, versionStr)) - }, 30*time.Second, 500*time.Millisecond) + s.ErrorContains(err, fmt.Sprintf(matching.ErrBlackholedQuery, versionStr, versionStr), + "query did not return drained-version blackhole error: version=%s actual_err=%v await_ctx_err=%v", + versionStr, err, s.Context().Err()) + }, 30*time.Second, 500*time.Millisecond, + "wait for pinned query to report drained version: namespace=%s workflow_id=%s version=%s", + env.Namespace(), tv.WorkflowID(), versionStr) } else { // since the version still has pollers, the query should succeed s.pollAndQueryWorkflow(env, tv, false) @@ -432,7 +442,7 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, s.setRampingDeployment(env, tv, 50, false) // wait for v1 to become ramping - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() @@ -440,9 +450,15 @@ func (s *Versioning3Suite) testPinnedQueryDrainedVersion(env *testcore.TestEnv, Namespace: env.Namespace().String(), Version: tv.DeploymentVersionString(), }) - s.NoError(err) - s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING, resp.GetWorkerDeploymentVersionInfo().GetStatus()) - }, 
90*time.Second, 500*time.Millisecond) + s.NoError(err, "DescribeWorkerDeploymentVersion failed: version=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentVersionString(), ctx.Err(), s.Context().Err()) + actual := resp.GetWorkerDeploymentVersionInfo().GetStatus() + s.Equal(enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING, actual, + "worker deployment version status mismatch: version=%s info=%v", + tv.DeploymentVersionString(), resp.GetWorkerDeploymentVersionInfo()) + }, 90*time.Second, 500*time.Millisecond, + "wait for worker deployment version to ramp: namespace=%s version=%s workflow_id=%s", + env.Namespace(), tv.DeploymentVersionString(), tv.WorkflowID()) // the ramping status is propagated to the task queues s.waitForDeploymentDataPropagation(env, tv, versionStatusRamping, false, tqTypeWf) @@ -3431,7 +3447,7 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv failedPrecondition := serviceerror.NewFailedPreconditionf(workerdeployment.ErrCurrentVersionDoesNotHaveAllTaskQueues, tv.DeploymentVersionStringV32()).Error() buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() @@ -3442,11 +3458,15 @@ func (s *Versioning3Suite) setCurrentDeployment(env *testcore.TestEnv, tv *testv req.BuildId = tv.BuildID() _, err := env.FrontendClient().SetWorkerDeploymentCurrentVersion(ctx, req) if s.shouldRetryWorkerDeploymentRPC(env, err, failedPrecondition, buildIDNotFound, deploymentNotFound) { - s.NoError(err) + s.NoError(err, "retryable SetWorkerDeploymentCurrentVersion failure: deployment=%s build_id=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentSeries(), tv.BuildID(), ctx.Err(), s.Context().Err()) return } - s.NoError(err) - }, 90*time.Second, 
500*time.Millisecond) + s.NoError(err, "SetWorkerDeploymentCurrentVersion failed: deployment=%s build_id=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentSeries(), tv.BuildID(), ctx.Err(), s.Context().Err()) + }, 90*time.Second, 500*time.Millisecond, + "set current worker deployment: namespace=%s deployment=%s build_id=%s version=%s", + env.Namespace(), tv.DeploymentSeries(), tv.BuildID(), tv.DeploymentVersionString()) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3506,7 +3526,7 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te if len(tqTypes) == 0 { tqTypes = []enumspb.TaskQueueType{tqTypeWf} } - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() @@ -3517,15 +3537,20 @@ func (s *Versioning3Suite) waitForDeploymentVersionRegistration(env *testcore.Te TaskQueueType: tqType, Version: worker_versioning.DeploymentVersionFromDeployment(tv.Deployment()), }) - s.NoError(err) - s.True(resp.GetIsMember()) + s.NoError(err, "CheckTaskQueueVersionMembership failed: task_queue=%s type=%s version=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.TaskQueue().GetName(), tqType, tv.DeploymentVersionString(), ctx.Err(), s.Context().Err()) + s.True(resp.GetIsMember(), + "task queue version membership not observed: task_queue=%s type=%s version=%s response=%v", + tv.TaskQueue().GetName(), tqType, tv.DeploymentVersionString(), resp) } - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond, + "wait for deployment version registration: namespace=%s task_queue=%s version=%s tq_types=%v", + env.Namespace(), tv.TaskQueue().GetName(), tv.DeploymentVersionString(), tqTypes) } func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { deploymentNotFound := 
fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() @@ -3539,7 +3564,9 @@ func (s *Versioning3Suite) unsetCurrentDeployment(env *testcore.TestEnv, tv *tes return } s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond, + "unset current worker deployment: namespace=%s deployment=%s version=%s", + env.Namespace(), tv.DeploymentSeries(), tv.DeploymentVersionString()) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3560,7 +3587,7 @@ func (s *Versioning3Suite) setRampingDeployment( buildIDNotFound := fmt.Sprintf("build ID '%s' not found in Worker Deployment", tv.BuildID()) deploymentNotFound := fmt.Sprintf("no Worker Deployment found with name '%s'", tv.DeploymentSeries()) - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() @@ -3572,11 +3599,15 @@ func (s *Versioning3Suite) setRampingDeployment( req.BuildId = bid _, err := env.FrontendClient().SetWorkerDeploymentRampingVersion(ctx, req) if s.shouldRetryWorkerDeploymentRPC(env, err, failedPrecondition, buildIDNotFound, deploymentNotFound) { - s.NoError(err) + s.NoError(err, "retryable SetWorkerDeploymentRampingVersion failure: deployment=%s build_id=%s percentage=%v rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentSeries(), bid, percentage, ctx.Err(), s.Context().Err()) return } - s.NoError(err) - }, 90*time.Second, 500*time.Millisecond) + s.NoError(err, "SetWorkerDeploymentRampingVersion failed: deployment=%s build_id=%s percentage=%v rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentSeries(), bid, percentage, ctx.Err(), s.Context().Err()) + }, 90*time.Second, 
500*time.Millisecond, + "set ramping worker deployment: namespace=%s deployment=%s build_id=%s version=%s percentage=%v ramp_unversioned=%v", + env.Namespace(), tv.DeploymentSeries(), bid, tv.DeploymentVersionString(), percentage, rampUnversioned) // Wait for propagation to complete since we have tests using async entity workflows to set the current version s.waitForDeploymentDataPropagationQueryWorkerDeployment(env, tv) @@ -3584,7 +3615,7 @@ func (s *Versioning3Suite) setRampingDeployment( func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment(env *testcore.TestEnv, tv *testvars.TestVars) { if versioning3DeploymentWorkflowVersion == workerdeployment.AsyncSetCurrentAndRamping { - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), 30*time.Second) defer cancel() @@ -3593,12 +3624,19 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagationQueryWorkerDeployment DeploymentName: tv.DeploymentSeries(), }) if s.shouldRetryWorkerDeploymentRPC(env, err) { - s.NoError(err) + s.NoError(err, "retryable DescribeWorkerDeployment failure: deployment=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentSeries(), ctx.Err(), s.Context().Err()) return } - s.NoError(err) - s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState()) - }, 90*time.Second, 500*time.Millisecond) + s.NoError(err, "DescribeWorkerDeployment failed: deployment=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.DeploymentSeries(), ctx.Err(), s.Context().Err()) + actual := resp.GetWorkerDeploymentInfo().GetRoutingConfigUpdateState() + s.Equal(enumspb.ROUTING_CONFIG_UPDATE_STATE_COMPLETED, actual, + "worker deployment routing config update not complete: deployment=%s info=%v", + tv.DeploymentSeries(), resp.GetWorkerDeploymentInfo()) + }, 90*time.Second, 500*time.Millisecond, + "wait for worker deployment routing config propagation: namespace=%s deployment=%s 
version=%s", + env.Namespace(), tv.DeploymentSeries(), tv.DeploymentVersionString()) } } @@ -3856,7 +3894,7 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, override *workflowpb.VersioningOverride, transition *workflowpb.DeploymentVersionTransition, ) { - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() @@ -3867,10 +3905,13 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, WorkflowId: tv.WorkflowID(), }, }) - s.NoError(err) + s.NoError(err, "DescribeWorkflowExecution failed: workflow_id=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.WorkflowID(), ctx.Err(), s.Context().Err()) versioningInfo := dwf.WorkflowExecutionInfo.GetVersioningInfo() - s.Equal(behavior.String(), versioningInfo.GetBehavior().String()) + s.Equal(behavior.String(), versioningInfo.GetBehavior().String(), + "workflow versioning behavior mismatch: workflow_id=%s versioning_info=%v execution_info=%v", + tv.WorkflowID(), versioningInfo, dwf.WorkflowExecutionInfo) var v *deploymentspb.WorkerDeploymentVersion if versioningInfo.GetVersion() != "" { //nolint:staticcheck // SA1019: worker versioning v0.31 //nolint:staticcheck // SA1019: worker versioning v0.31 @@ -3886,7 +3927,8 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, s.Fail(fmt.Sprintf("deployment version mismatch. expected: {%s}, actual: {%s}", deployment, actualDeployment, - )) + ), "workflow_id=%s versioning_info=%v execution_info=%v", + tv.WorkflowID(), versioningInfo, dwf.WorkflowExecutionInfo) } // v0.32 override @@ -3902,9 +3944,12 @@ func (s *Versioning3Suite) verifyWorkflowVersioning(env *testcore.TestEnv, s.Fail(fmt.Sprintf("version transition mismatch. 
expected: {%s}, actual: {%s}", transition, versioningInfo.GetVersionTransition(), - )) + ), "workflow_id=%s versioning_info=%v execution_info=%v", + tv.WorkflowID(), versioningInfo, dwf.WorkflowExecutionInfo) } - }, 90*time.Second, 500*time.Millisecond) + }, 90*time.Second, 500*time.Millisecond, + "verify workflow versioning: namespace=%s workflow_id=%s expected_behavior=%s expected_deployment=%v expected_override=%v expected_transition=%v", + env.Namespace(), tv.WorkflowID(), behavior, deployment, override, transition) } func respondActivity() *workflowservice.RespondActivityTaskCompletedRequest { @@ -4410,9 +4455,11 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } f, err := tqid.NewTaskQueueFamily(env.NamespaceID().String(), tv.TaskQueue().GetName()) - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { + observed := make(map[partAndType]string, len(remaining)) for pt := range remaining { - s.NoError(err) + s.NoError(err, "NewTaskQueueFamily failed: namespace_id=%s task_queue=%s", + env.NamespaceID(), tv.TaskQueue().GetName()) partition := f.TaskQueue(pt.tp).NormalPartition(pt.part) // Use lower-level GetTaskQueueUserData instead of GetWorkerBuildIdCompatibility // here so that we can target activity queues. 
@@ -4423,11 +4470,14 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( TaskQueue: partition.RpcName(), TaskQueueType: partition.TaskType(), }) - s.NoError(err) + s.NoError(err, "GetTaskQueueUserData failed: task_queue=%s partition=%d type=%s rpc_name=%s await_ctx_err=%v", + tv.TaskQueue().GetName(), pt.part, pt.tp, partition.RpcName(), s.Context().Err()) perTypes := res.GetUserData().GetData().GetPerType() if perTypes != nil { deploymentsData := perTypes[int32(pt.tp)].GetDeploymentData().GetDeploymentsData() workerDeploymentData := deploymentsData[tv.DeploymentVersion().GetDeploymentName()] + observed[pt] = fmt.Sprintf("has_per_type=true worker_data=%v deployment_data=%v", + workerDeploymentData, perTypes[int32(pt.tp)].GetDeploymentData()) if unversionedRamp { if perTypes[int32(pt.tp)].GetDeploymentData().GetUnversionedRampData() != nil { @@ -4472,10 +4522,16 @@ func (s *Versioning3Suite) waitForDeploymentDataPropagation( } } } + } else { + observed[pt] = "missing per-type deployment data" } } - s.Empty(remaining) - }, 90*time.Second, 500*time.Millisecond) + s.Empty(remaining, + "deployment data did not propagate: namespace=%s task_queue=%s version=%s expected_status=%v unversioned_ramp=%v remaining=%v observed=%v", + env.Namespace(), tv.TaskQueue().GetName(), tv.DeploymentVersionString(), status, unversionedRamp, remaining, observed) + }, 90*time.Second, 500*time.Millisecond, + "wait for task queue deployment data propagation: namespace=%s task_queue=%s version=%s expected_status=%v unversioned_ramp=%v tq_types=%v", + env.Namespace(), tv.TaskQueue().GetName(), tv.DeploymentVersionString(), status, unversionedRamp, tqTypes) } func (s *Versioning3Suite) validateBacklogCount( @@ -4487,7 +4543,7 @@ func (s *Versioning3Suite) validateBacklogCount( var resp *workflowservice.DescribeTaskQueueResponse var err error - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), 10*time.Second) 
defer cancel() @@ -4497,12 +4553,18 @@ func (s *Versioning3Suite) validateBacklogCount( TaskQueueType: tqType, ReportStats: true, }) - s.NoError(err) - s.NotNil(resp) + s.NoError(err, "DescribeTaskQueue failed: task_queue=%s type=%s rpc_ctx_err=%v await_ctx_err=%v", + tv.TaskQueue().GetName(), tqType, ctx.Err(), s.Context().Err()) + s.NotNil(resp, "DescribeTaskQueue returned nil response: task_queue=%s type=%s", tv.TaskQueue().GetName(), tqType) priorityStats, ok := resp.GetStatsByPriorityKey()[3] - s.True(ok) - s.Equal(expectedCount, priorityStats.GetApproximateBacklogCount()) - }, 30*time.Second, 500*time.Millisecond) + s.True(ok, "DescribeTaskQueue response missing priority 3 stats: task_queue=%s type=%s stats=%v", + tv.TaskQueue().GetName(), tqType, resp.GetStatsByPriorityKey()) + s.Equal(expectedCount, priorityStats.GetApproximateBacklogCount(), + "backlog count mismatch: task_queue=%s type=%s expected=%d stats=%v response=%v", + tv.TaskQueue().GetName(), tqType, expectedCount, priorityStats, resp) + }, 30*time.Second, 500*time.Millisecond, + "validate backlog count: namespace=%s task_queue=%s type=%s expected_count=%d", + env.Namespace(), tv.TaskQueue().GetName(), tqType, expectedCount) } func (s *Versioning3Suite) verifyVersioningSAs( @@ -4512,7 +4574,7 @@ func (s *Versioning3Suite) verifyVersioningSAs( executionStatus enumspb.WorkflowExecutionStatus, usedBuilds ...*testvars.TestVars, ) { - s.Await(func(s *Versioning3Suite) { + s.Awaitf(func(s *Versioning3Suite) { ctx, cancel := context.WithTimeout(s.Context(), ver3RPCTimeout) defer cancel() @@ -4528,42 +4590,49 @@ func (s *Versioning3Suite) verifyVersioningSAs( Namespace: env.Namespace().String(), Query: query, }) - s.NoError(err) - s.NotEmpty(resp.GetExecutions()) + s.NoError(err, "ListWorkflowExecutions failed: query=%q rpc_ctx_err=%v await_ctx_err=%v", + query, ctx.Err(), s.Context().Err()) + s.NotEmpty(resp.GetExecutions(), "visibility query returned no executions: query=%q response=%v", query, resp) if 
len(resp.GetExecutions()) > 0 { w := resp.GetExecutions()[0] if behavior == vbPinned { payload, ok := w.GetSearchAttributes().GetIndexedFields()["BuildIds"] - s.True(ok) + s.True(ok, "BuildIds search attribute missing: query=%q execution=%v", query, w) searchAttrAny, err := sadefs.DecodeValue(payload, enumspb.INDEXED_VALUE_TYPE_KEYWORD_LIST, false) - s.NoError(err) + s.NoError(err, "failed to decode BuildIds search attribute: query=%q execution=%v", query, w) var searchAttr []string if searchAttrAny != nil { searchAttr = searchAttrAny.([]string) } if behavior == enumspb.VERSIONING_BEHAVIOR_PINNED { - s.Contains(searchAttr, worker_versioning.PinnedBuildIdSearchAttribute(tv.DeploymentVersionStringV32())) + s.Contains(searchAttr, worker_versioning.PinnedBuildIdSearchAttribute(tv.DeploymentVersionStringV32()), + "BuildIds search attribute mismatch: query=%q execution=%v search_attr=%v", + query, w, searchAttr) } } if len(usedBuilds) > 0 { // Validate TemporalUsedWorkerDeploymentVersions search attribute versionPayload, ok := w.GetSearchAttributes().GetIndexedFields()["TemporalUsedWorkerDeploymentVersions"] - s.True(ok) + s.True(ok, "TemporalUsedWorkerDeploymentVersions search attribute missing: query=%q execution=%v", query, w) versionAttrAny, err := sadefs.DecodeValue(versionPayload, enumspb.INDEXED_VALUE_TYPE_KEYWORD_LIST, false) - s.NoError(err) + s.NoError(err, "failed to decode TemporalUsedWorkerDeploymentVersions search attribute: query=%q execution=%v", query, w) var versionAttr []string if versionAttrAny != nil { versionAttr = versionAttrAny.([]string) } for _, b := range usedBuilds { - s.Contains(versionAttr, b.DeploymentVersionStringV32()) + s.Contains(versionAttr, b.DeploymentVersionStringV32(), + "TemporalUsedWorkerDeploymentVersions mismatch: query=%q execution=%v version_attr=%v expected_used_build=%s", + query, w, versionAttr, b.DeploymentVersionStringV32()) } } fmt.Println(resp.GetExecutions()[0]) } - }, 30*time.Second, 500*time.Millisecond) + }, 
30*time.Second, 500*time.Millisecond, + "verify versioning search attributes: namespace=%s workflow_id=%s behavior=%s execution_status=%s used_builds=%v", + env.Namespace(), tv.WorkflowID(), behavior, executionStatus, usedBuilds) } func (s *Versioning3Suite) TestAutoUpgradeWorkflows_NoBouncingBetweenVersions() { From dd60ff8d4752f950ec5c847a0e417be52a3add7e Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 20:17:21 -0700 Subject: [PATCH 44/47] Surface final await timeout attempts --- common/testing/await/report.go | 53 ++++++++++++++++++++++++ common/testing/await/require_ctx_test.go | 28 ++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/common/testing/await/report.go b/common/testing/await/report.go index 49a56a0937..e9ec4ff3b6 100644 --- a/common/testing/await/report.go +++ b/common/testing/await/report.go @@ -22,6 +22,7 @@ type attemptFailure struct { // reportTimeout reports the timeout failure plus collected attempt errors. func reportTimeout(tb testing.TB, failures []attemptFailure, funcName, timeoutMsg string, effectiveTimeout time.Duration, polls int) { + reportFinalAttemptContext(tb, failures) reportAttemptErrors(tb, failures) if timeoutMsg != "" { tb.Fatalf("%s: %s (not satisfied after %v, %d polls)", funcName, timeoutMsg, effectiveTimeout, polls) @@ -30,6 +31,26 @@ func reportTimeout(tb testing.TB, failures []attemptFailure, funcName, timeoutMs } } +func reportFinalAttemptContext(tb testing.TB, failures []attemptFailure) { + if len(failures) == 0 { + return + } + + var b strings.Builder + last := failures[len(failures)-1] + b.WriteString("last failed attempt before timeout:") + writeAttemptFailure(&b, last) + + if isDeadlineOnlyFailure(last) { + if previous, ok := previousDistinctFailure(failures, last); ok { + b.WriteString("\n\nprevious distinct failed attempt:") + writeAttemptFailure(&b, previous) + } + } + + tb.Errorf("%s", b.String()) +} + func reportAttemptErrors(tb testing.TB, failures []attemptFailure) 
{ if len(failures) == 0 { return @@ -54,6 +75,38 @@ func reportAttemptErrors(tb testing.TB, failures []attemptFailure) { tb.Errorf("%s", b.String()) } +func previousDistinctFailure(failures []attemptFailure, last attemptFailure) (attemptFailure, bool) { + lastText := attemptFailureText(last) + for i := len(failures) - 2; i >= 0; i-- { + if attemptFailureText(failures[i]) != lastText { + return failures[i], true + } + } + return attemptFailure{}, false +} + +func isDeadlineOnlyFailure(f attemptFailure) bool { + if len(f.errors) == 0 { + return false + } + text := strings.ToLower(attemptFailureText(f)) + hasDeadline := strings.Contains(text, "context deadline exceeded") || + strings.Contains(text, "context canceled") + if !hasDeadline { + return false + } + withoutDeadline := strings.ReplaceAll(text, "context deadline exceeded", "") + withoutDeadline = strings.ReplaceAll(withoutDeadline, "context canceled", "") + withoutDeadline = strings.TrimSpace(withoutDeadline) + return withoutDeadline == "" || + strings.Contains(withoutDeadline, "error trace:") || + strings.Contains(withoutDeadline, "error:") +} + +func attemptFailureText(f attemptFailure) string { + return strings.Join(f.errors, "\n") +} + func writeAttemptFailure(b *strings.Builder, f attemptFailure) { fmt.Fprintf(b, "\n\n --- attempt %d ---", f.attempt) if len(f.errors) == 0 { diff --git a/common/testing/await/require_ctx_test.go b/common/testing/await/require_ctx_test.go index d9513d85d6..e02d9d6e85 100644 --- a/common/testing/await/require_ctx_test.go +++ b/common/testing/await/require_ctx_test.go @@ -301,7 +301,32 @@ func TestRequire_FailureScenarios(t *testing.T) { }) require.True(t, tb.Failed()) require.Contains(t, tb.fatals(), "not satisfied after") - require.Equal(t, "attempt errors:\n\n --- attempt 1 ---\n first attempt error\n\n --- attempt 2 ---\n last attempt error", tb.errors()) + require.Contains(t, tb.errors(), "last failed attempt before timeout:\n\n --- attempt 2 ---\n last attempt error") + 
require.Contains(t, tb.errors(), "attempt errors:\n\n --- attempt 1 ---\n first attempt error\n\n --- attempt 2 ---\n last attempt error") + require.Equal(t, int32(2), attempts.Load()) + }) + + t.Run("reports previous distinct attempt when last attempt is only deadline", func(t *testing.T) { + t.Parallel() + + ctx := testcontext.New(t) + var attempts atomic.Int32 + tb := newRecordingTB() + tb.run(func() { + await.Require(ctx, tb, func(t *await.T) { + if attempts.Add(1) == 1 { + t.Error("observed state: version is still draining") + return + } + <-t.Context().Done() + t.Error("context deadline exceeded") + }, time.Second, 100*time.Millisecond) + }) + require.True(t, tb.Failed()) + require.Contains(t, tb.fatals(), "not satisfied after") + errs := tb.errors() + require.Contains(t, errs, "last failed attempt before timeout:\n\n --- attempt 2 ---\n context deadline exceeded") + require.Contains(t, errs, "previous distinct failed attempt:\n\n --- attempt 1 ---\n observed state: version is still draining") require.Equal(t, int32(2), attempts.Load()) }) @@ -324,6 +349,7 @@ func TestRequire_FailureScenarios(t *testing.T) { require.Greater(t, n, int32(4), "need >4 attempts to exercise truncation") errs := tb.errors() + require.Contains(t, errs, fmt.Sprintf("last failed attempt before timeout:\n\n --- attempt %d ---\n attempt %d failed", n, n)) require.Contains(t, errs, "attempt errors:\n\n --- attempt 1 ---\n attempt 1 failed\n") require.Contains(t, errs, fmt.Sprintf("... %d attempts omitted ...", n-4)) // Last three attempts present in order. 
From 20ca338546d6a38b3b168522d9cda61494908c84 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 20:26:27 -0700 Subject: [PATCH 45/47] Report await timeout timing stats --- common/testing/await/report.go | 102 ++++++++++++++++++++++- common/testing/await/require_ctx.go | 12 ++- common/testing/await/require_ctx_test.go | 10 ++- 3 files changed, 118 insertions(+), 6 deletions(-) diff --git a/common/testing/await/report.go b/common/testing/await/report.go index e9ec4ff3b6..44f61223dd 100644 --- a/common/testing/await/report.go +++ b/common/testing/await/report.go @@ -1,7 +1,9 @@ package await import ( + "cmp" "fmt" + "slices" "strings" "testing" "time" @@ -20,8 +22,53 @@ type attemptFailure struct { errors []string } +type awaitStats struct { + attempts []attemptTiming + sleeps []time.Duration + failedAttempts int + stoppedAttempts int + deadlockAttempts int +} + +type attemptTiming struct { + attempt int + duration time.Duration +} + +func (s *awaitStats) recordAttempt(attempt int, duration time.Duration, failed, stopped, deadlocked bool) { + s.attempts = append(s.attempts, attemptTiming{ + attempt: attempt, + duration: duration, + }) + if failed { + s.failedAttempts++ + } + if stopped { + s.stoppedAttempts++ + } + if deadlocked { + s.deadlockAttempts++ + } +} + +func (s *awaitStats) recordSleep(duration time.Duration) { + s.sleeps = append(s.sleeps, duration) +} + // reportTimeout reports the timeout failure plus collected attempt errors. 
-func reportTimeout(tb testing.TB, failures []attemptFailure, funcName, timeoutMsg string, effectiveTimeout time.Duration, polls int) { +func reportTimeout( + tb testing.TB, + failures []attemptFailure, + stats awaitStats, + parentErr error, + awaitErr error, + deadlineRemaining time.Duration, + funcName string, + timeoutMsg string, + effectiveTimeout time.Duration, + polls int, +) { + reportAwaitStats(tb, stats, parentErr, awaitErr, deadlineRemaining, polls) reportFinalAttemptContext(tb, failures) reportAttemptErrors(tb, failures) if timeoutMsg != "" { @@ -31,6 +78,18 @@ func reportTimeout(tb testing.TB, failures []attemptFailure, funcName, timeoutMs } } +func reportAwaitStats(tb testing.TB, stats awaitStats, parentErr error, awaitErr error, deadlineRemaining time.Duration, polls int) { + var b strings.Builder + fmt.Fprintf(&b, "await stats: polls=%d failed_attempts=%d stopped_attempts=%d deadlock_attempts=%d", + polls, stats.failedAttempts, stats.stoppedAttempts, stats.deadlockAttempts) + writeDurationSummary(&b, "attempt_duration", attemptDurations(stats.attempts)) + writeDurationSummary(&b, "sleep_duration", stats.sleeps) + writeSlowestAttempts(&b, stats.attempts) + fmt.Fprintf(&b, "\ncontext at timeout: parent_err=%v await_err=%v deadline_remaining=%v", + parentErr, awaitErr, deadlineRemaining) + tb.Errorf("%s", b.String()) +} + func reportFinalAttemptContext(tb testing.TB, failures []attemptFailure) { if len(failures) == 0 { return @@ -107,6 +166,47 @@ func attemptFailureText(f attemptFailure) string { return strings.Join(f.errors, "\n") } +func writeDurationSummary(b *strings.Builder, label string, durations []time.Duration) { + if len(durations) == 0 { + fmt.Fprintf(b, " %s=(none)", label) + return + } + minDuration, maxDuration, totalDuration := durations[0], durations[0], time.Duration(0) + for _, duration := range durations { + minDuration = min(minDuration, duration) + maxDuration = max(maxDuration, duration) + totalDuration += duration + } + 
fmt.Fprintf(b, " %s min=%v avg=%v max=%v last=%v", + label, minDuration, totalDuration/time.Duration(len(durations)), maxDuration, durations[len(durations)-1]) +} + +func writeSlowestAttempts(b *strings.Builder, timings []attemptTiming) { + if len(timings) == 0 { + b.WriteString("\nslowest attempts: (none)") + return + } + slowest := slices.Clone(timings) + slices.SortFunc(slowest, func(a, b attemptTiming) int { + return cmp.Compare(b.duration, a.duration) + }) + if len(slowest) > 3 { + slowest = slowest[:3] + } + b.WriteString("\nslowest attempts:") + for _, timing := range slowest { + fmt.Fprintf(b, " #%d=%v", timing.attempt, timing.duration) + } +} + +func attemptDurations(timings []attemptTiming) []time.Duration { + durations := make([]time.Duration, 0, len(timings)) + for _, timing := range timings { + durations = append(durations, timing.duration) + } + return durations +} + func writeAttemptFailure(b *strings.Builder, f attemptFailure) { fmt.Fprintf(b, "\n\n --- attempt %d ---", f.attempt) if len(f.errors) == 0 { diff --git a/common/testing/await/require_ctx.go b/common/testing/await/require_ctx.go index f941efaeda..4c29b15fef 100644 --- a/common/testing/await/require_ctx.go +++ b/common/testing/await/require_ctx.go @@ -121,6 +121,7 @@ func run( var failures []attemptFailure polls := 0 pollBackoff := newPollBackoff(pollInterval) + stats := awaitStats{} for { // Parent context was canceled while we were sleeping (not our deadline). @@ -138,8 +139,11 @@ func run( t := &T{tb: tb, ctx: attemptCtx} // Run attempt. + attemptStart := time.Now() res := runAttempt(t, condition, attemptCancel, funcName, cancellable) + attemptDuration := time.Since(attemptStart) attemptCancel() + stats.recordAttempt(polls, attemptDuration, len(t.errors) > 0, res.stopped, res.deadlocked) if res.panicVal != nil { panic(res.panicVal) // propagate to caller } @@ -173,7 +177,7 @@ func run( // Our deadline expired. 
if deadlineReached(deadline) { - reportTimeout(tb, failures, funcName, timeoutMsg, effectiveTimeout, polls) + reportTimeout(tb, failures, stats, parentCtx.Err(), awaitCtx.Err(), time.Until(deadline), funcName, timeoutMsg, effectiveTimeout, polls) return } @@ -183,7 +187,7 @@ func run( } // Wait using backoff, or until context is canceled or deadline is reached. - sleep(awaitCtx, deadline, pollBackoff.next()) + stats.recordSleep(sleep(awaitCtx, deadline, pollBackoff.next())) } } @@ -314,12 +318,13 @@ func addJitter(base, maxDelay time.Duration) time.Duration { return min(delay, maxDelay) } -func sleep(ctx context.Context, deadline time.Time, pollInterval time.Duration) { +func sleep(ctx context.Context, deadline time.Time, pollInterval time.Duration) time.Duration { remaining := time.Until(deadline) if remaining < pollInterval { pollInterval = remaining } + start := time.Now() timer := time.NewTimer(pollInterval) defer timer.Stop() @@ -327,6 +332,7 @@ func sleep(ctx context.Context, deadline time.Time, pollInterval time.Duration) case <-ctx.Done(): case <-timer.C: } + return time.Since(start) } func deadlineReached(deadline time.Time) bool { diff --git a/common/testing/await/require_ctx_test.go b/common/testing/await/require_ctx_test.go index e02d9d6e85..b33f756a3f 100644 --- a/common/testing/await/require_ctx_test.go +++ b/common/testing/await/require_ctx_test.go @@ -301,8 +301,14 @@ func TestRequire_FailureScenarios(t *testing.T) { }) require.True(t, tb.Failed()) require.Contains(t, tb.fatals(), "not satisfied after") - require.Contains(t, tb.errors(), "last failed attempt before timeout:\n\n --- attempt 2 ---\n last attempt error") - require.Contains(t, tb.errors(), "attempt errors:\n\n --- attempt 1 ---\n first attempt error\n\n --- attempt 2 ---\n last attempt error") + errs := tb.errors() + require.Contains(t, errs, "await stats: polls=2 failed_attempts=2") + require.Contains(t, errs, "attempt_duration min=") + require.Contains(t, errs, "sleep_duration min=") 
+ require.Contains(t, errs, "slowest attempts:") + require.Contains(t, errs, "context at timeout:") + require.Contains(t, errs, "last failed attempt before timeout:\n\n --- attempt 2 ---\n last attempt error") + require.Contains(t, errs, "attempt errors:\n\n --- attempt 1 ---\n first attempt error\n\n --- attempt 2 ---\n last attempt error") require.Equal(t, int32(2), attempts.Load()) }) From 1a7ea89aecd0d52e9c306b82a627a95657bbbf00 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 20:45:35 -0700 Subject: [PATCH 46/47] Fix testcore global override test setup --- tests/testcore/test_cluster_pool_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/testcore/test_cluster_pool_test.go b/tests/testcore/test_cluster_pool_test.go index b7dd66ea09..5718e4065b 100644 --- a/tests/testcore/test_cluster_pool_test.go +++ b/tests/testcore/test_cluster_pool_test.go @@ -6,6 +6,8 @@ import ( "github.com/stretchr/testify/require" "go.temporal.io/server/common/cluster" "go.temporal.io/server/common/dynamicconfig" + "go.temporal.io/server/common/membership/static" + "go.temporal.io/server/common/primitives" ) func TestGlobalOverridesSurviveTestCleanup(t *testing.T) { @@ -14,6 +16,13 @@ func TestGlobalOverridesSurviveTestCleanup(t *testing.T) { t.Run("create", func(t *testing.T) { impl := newTemporal(t, &TemporalParams{ ClusterMetadataConfig: &cluster.Config{}, + HostsByProtocolByService: map[transferProtocol]map[primitives.ServiceName]static.Hosts{ + httpProtocol: { + primitives.FrontendService: { + All: []string{"127.0.0.1:0"}, + }, + }, + }, }) dcClient = impl.dcClient }) From fd657af8b0905ee85cbf19c4cc3dde28abe4c6c3 Mon Sep 17 00:00:00 2001 From: Stephan Behnke Date: Tue, 26 May 2026 21:38:40 -0700 Subject: [PATCH 47/47] Preserve await diagnostics in test summaries --- common/testing/await/report.go | 63 ++++++++++++++++++------ common/testing/await/require_ctx_test.go | 36 +++++++------- tools/testrunner/log.go | 24 +++++++++ 
tools/testrunner/log_test.go | 47 ++++++++++++++++++ 4 files changed, 136 insertions(+), 34 deletions(-) diff --git a/common/testing/await/report.go b/common/testing/await/report.go index 44f61223dd..45ee5953aa 100644 --- a/common/testing/await/report.go +++ b/common/testing/await/report.go @@ -68,17 +68,23 @@ func reportTimeout( effectiveTimeout time.Duration, polls int, ) { - reportAwaitStats(tb, stats, parentErr, awaitErr, deadlineRemaining, polls) - reportFinalAttemptContext(tb, failures) - reportAttemptErrors(tb, failures) + var sections []string + sections = append(sections, formatAwaitStats(stats, parentErr, awaitErr, deadlineRemaining, polls)) + if s := formatFinalAttemptContext(failures); s != "" { + sections = append(sections, s) + } + if s := formatAttemptErrors(failures); s != "" { + sections = append(sections, s) + } + diagnostics := strings.Join(sections, "\n\n") if timeoutMsg != "" { - tb.Fatalf("%s: %s (not satisfied after %v, %d polls)", funcName, timeoutMsg, effectiveTimeout, polls) + tb.Fatalf("%s\n\n%s: %s (not satisfied after %v, %d polls)", diagnostics, funcName, timeoutMsg, effectiveTimeout, polls) } else { - tb.Fatalf("%s: condition not satisfied after %v (%d polls)", funcName, effectiveTimeout, polls) + tb.Fatalf("%s\n\n%s: condition not satisfied after %v (%d polls)", diagnostics, funcName, effectiveTimeout, polls) } } -func reportAwaitStats(tb testing.TB, stats awaitStats, parentErr error, awaitErr error, deadlineRemaining time.Duration, polls int) { +func formatAwaitStats(stats awaitStats, parentErr error, awaitErr error, deadlineRemaining time.Duration, polls int) string { var b strings.Builder fmt.Fprintf(&b, "await stats: polls=%d failed_attempts=%d stopped_attempts=%d deadlock_attempts=%d", polls, stats.failedAttempts, stats.stoppedAttempts, stats.deadlockAttempts) @@ -87,12 +93,12 @@ func reportAwaitStats(tb testing.TB, stats awaitStats, parentErr error, awaitErr writeSlowestAttempts(&b, stats.attempts) fmt.Fprintf(&b, "\ncontext at 
timeout: parent_err=%v await_err=%v deadline_remaining=%v", parentErr, awaitErr, deadlineRemaining) - tb.Errorf("%s", b.String()) + return b.String() } -func reportFinalAttemptContext(tb testing.TB, failures []attemptFailure) { +func formatFinalAttemptContext(failures []attemptFailure) string { if len(failures) == 0 { - return + return "" } var b strings.Builder @@ -100,19 +106,28 @@ func reportFinalAttemptContext(tb testing.TB, failures []attemptFailure) { b.WriteString("last failed attempt before timeout:") writeAttemptFailure(&b, last) - if isDeadlineOnlyFailure(last) { + if previous, ok := previousNonDeadlineFailure(failures); ok && previous.attempt != last.attempt { + b.WriteString("\n\nlast non-deadline failed attempt:") + writeAttemptFailure(&b, previous) + } else if isDeadlineOnlyFailure(last) { if previous, ok := previousDistinctFailure(failures, last); ok { b.WriteString("\n\nprevious distinct failed attempt:") writeAttemptFailure(&b, previous) } } - tb.Errorf("%s", b.String()) + return b.String() } func reportAttemptErrors(tb testing.TB, failures []attemptFailure) { + if s := formatAttemptErrors(failures); s != "" { + tb.Errorf("%s", s) + } +} + +func formatAttemptErrors(failures []attemptFailure) string { if len(failures) == 0 { - return + return "" } var b strings.Builder @@ -131,7 +146,7 @@ func reportAttemptErrors(tb testing.TB, failures []attemptFailure) { writeAttemptFailure(&b, f) } } - tb.Errorf("%s", b.String()) + return b.String() } func previousDistinctFailure(failures []attemptFailure, last attemptFailure) (attemptFailure, bool) { @@ -144,14 +159,21 @@ func previousDistinctFailure(failures []attemptFailure, last attemptFailure) (at return attemptFailure{}, false } +func previousNonDeadlineFailure(failures []attemptFailure) (attemptFailure, bool) { + for i := len(failures) - 1; i >= 0; i-- { + if !hasContextDeadlineFailure(failures[i]) { + return failures[i], true + } + } + return attemptFailure{}, false +} + func isDeadlineOnlyFailure(f 
attemptFailure) bool { if len(f.errors) == 0 { return false } text := strings.ToLower(attemptFailureText(f)) - hasDeadline := strings.Contains(text, "context deadline exceeded") || - strings.Contains(text, "context canceled") - if !hasDeadline { + if !hasContextDeadlineText(text) { return false } withoutDeadline := strings.ReplaceAll(text, "context deadline exceeded", "") @@ -162,6 +184,15 @@ func isDeadlineOnlyFailure(f attemptFailure) bool { strings.Contains(withoutDeadline, "error:") } +func hasContextDeadlineFailure(f attemptFailure) bool { + return hasContextDeadlineText(strings.ToLower(attemptFailureText(f))) +} + +func hasContextDeadlineText(text string) bool { + return strings.Contains(text, "context deadline exceeded") || + strings.Contains(text, "context canceled") +} + func attemptFailureText(f attemptFailure) string { return strings.Join(f.errors, "\n") } diff --git a/common/testing/await/require_ctx_test.go b/common/testing/await/require_ctx_test.go index b33f756a3f..c44ff5dd12 100644 --- a/common/testing/await/require_ctx_test.go +++ b/common/testing/await/require_ctx_test.go @@ -300,15 +300,15 @@ func TestRequire_FailureScenarios(t *testing.T) { }, time.Second, 100*time.Millisecond) }) require.True(t, tb.Failed()) - require.Contains(t, tb.fatals(), "not satisfied after") - errs := tb.errors() - require.Contains(t, errs, "await stats: polls=2 failed_attempts=2") - require.Contains(t, errs, "attempt_duration min=") - require.Contains(t, errs, "sleep_duration min=") - require.Contains(t, errs, "slowest attempts:") - require.Contains(t, errs, "context at timeout:") - require.Contains(t, errs, "last failed attempt before timeout:\n\n --- attempt 2 ---\n last attempt error") - require.Contains(t, errs, "attempt errors:\n\n --- attempt 1 ---\n first attempt error\n\n --- attempt 2 ---\n last attempt error") + fatals := tb.fatals() + require.Contains(t, fatals, "not satisfied after") + require.Contains(t, fatals, "await stats: polls=2 failed_attempts=2") + 
require.Contains(t, fatals, "attempt_duration min=") + require.Contains(t, fatals, "sleep_duration min=") + require.Contains(t, fatals, "slowest attempts:") + require.Contains(t, fatals, "context at timeout:") + require.Contains(t, fatals, "last failed attempt before timeout:\n\n --- attempt 2 ---\n last attempt error") + require.Contains(t, fatals, "attempt errors:\n\n --- attempt 1 ---\n first attempt error\n\n --- attempt 2 ---\n last attempt error") require.Equal(t, int32(2), attempts.Load()) }) @@ -329,10 +329,10 @@ func TestRequire_FailureScenarios(t *testing.T) { }, time.Second, 100*time.Millisecond) }) require.True(t, tb.Failed()) - require.Contains(t, tb.fatals(), "not satisfied after") - errs := tb.errors() - require.Contains(t, errs, "last failed attempt before timeout:\n\n --- attempt 2 ---\n context deadline exceeded") - require.Contains(t, errs, "previous distinct failed attempt:\n\n --- attempt 1 ---\n observed state: version is still draining") + fatals := tb.fatals() + require.Contains(t, fatals, "not satisfied after") + require.Contains(t, fatals, "last failed attempt before timeout:\n\n --- attempt 2 ---\n context deadline exceeded") + require.Contains(t, fatals, "last non-deadline failed attempt:\n\n --- attempt 1 ---\n observed state: version is still draining") require.Equal(t, int32(2), attempts.Load()) }) @@ -354,13 +354,13 @@ func TestRequire_FailureScenarios(t *testing.T) { n := attempts.Load() require.Greater(t, n, int32(4), "need >4 attempts to exercise truncation") - errs := tb.errors() - require.Contains(t, errs, fmt.Sprintf("last failed attempt before timeout:\n\n --- attempt %d ---\n attempt %d failed", n, n)) - require.Contains(t, errs, "attempt errors:\n\n --- attempt 1 ---\n attempt 1 failed\n") - require.Contains(t, errs, fmt.Sprintf("... 
%d attempts omitted ...", n-4)) + fatals := tb.fatals() + require.Contains(t, fatals, fmt.Sprintf("last failed attempt before timeout:\n\n --- attempt %d ---\n attempt %d failed", n, n)) + require.Contains(t, fatals, "attempt errors:\n\n --- attempt 1 ---\n attempt 1 failed\n") + require.Contains(t, fatals, fmt.Sprintf("... %d attempts omitted ...", n-4)) // Last three attempts present in order. for i := n - 2; i <= n; i++ { - require.Contains(t, errs, fmt.Sprintf("--- attempt %d ---\n attempt %d failed", i, i)) + require.Contains(t, fatals, fmt.Sprintf("--- attempt %d ---\n attempt %d failed", i, i)) } }) diff --git a/tools/testrunner/log.go b/tools/testrunner/log.go index 75fef7ddf1..77b9b85345 100644 --- a/tools/testrunner/log.go +++ b/tools/testrunner/log.go @@ -346,6 +346,9 @@ func parseFailedTestsFromOutput(stdout string) []string { func parseFailureDetails(data string) string { lines := normalizedFailureLines(data) + if block, ok := findLastAwaitFailureBlock(lines); ok { + return block + } // Prefer assertion blocks because they contain the useful testify failure // detail and can be selected from the end while ignoring trailing logs. 
if block, ok := findLastAssertionFailureBlock(lines); ok { @@ -371,6 +374,27 @@ func normalizedFailureLines(data string) []string { return lines } +func findLastAwaitFailureBlock(lines []string) (string, bool) { + for start := len(lines) - 1; start >= 0; start-- { + if !strings.Contains(lines[start], "await stats:") { + continue + } + end := len(lines) + for i := start + 1; i < len(lines); i++ { + if strings.HasPrefix(strings.TrimSpace(lines[i]), goTestFailLinePrefix) { + end = i + 1 + break + } + if strings.TrimSpace(lines[i]) == "FAIL" { + end = i + break + } + } + return strings.Join(lines[start:end], "\n"), true + } + return "", false +} + func findLastAssertionFailureBlock(lines []string) (string, bool) { var failLine string for i := len(lines) - 1; i >= 0; i-- { diff --git a/tools/testrunner/log_test.go b/tools/testrunner/log_test.go index a3ca35f4de..1b0e6d856e 100644 --- a/tools/testrunner/log_test.go +++ b/tools/testrunner/log_test.go @@ -198,6 +198,53 @@ FAIL`, }, notContains: []string{"attempts omitted", "--- attempt 1 ---", "first failure", "--- attempt 2 ---", "penultimate failure", "logger.go", "connection refused"}, }, + { + name: "keeps full await timeout diagnostics", + data: ` require_ctx.go:243: + Error Trace: require_ctx.go:243 + Error: await stats: polls=3 failed_attempts=3 stopped_attempts=3 deadlock_attempts=0 attempt_duration min=1ms avg=2ms max=3ms last=3ms sleep_duration min=100ms avg=100ms max=100ms last=100ms + slowest attempts: #3=3ms #2=2ms #1=1ms + context at timeout: parent_err= await_err=context deadline exceeded deadline_remaining=-1ms + + last failed attempt before timeout: + + --- attempt 3 --- + Error Trace: versioning_3_test.go:10 + Error: context deadline exceeded + + last non-deadline failed attempt: + + --- attempt 1 --- + Error Trace: versioning_3_test.go:10 + Error: Worker Deployment Version not found + + attempt errors: + + --- attempt 1 --- + Error Trace: versioning_3_test.go:10 + Error: Worker Deployment Version not 
found + + --- attempt 2 --- + Error Trace: versioning_3_test.go:10 + Error: context deadline exceeded + + --- attempt 3 --- + Error Trace: versioning_3_test.go:10 + Error: context deadline exceeded + + Requiref: deployment not ready (not satisfied after 90s, 3 polls) +--- FAIL: TestSuite/TestCase (90.00s) +FAIL`, + contains: []string{ + "await stats: polls=3", + "last failed attempt before timeout:", + "last non-deadline failed attempt:", + "Worker Deployment Version not found", + "attempt errors:", + "Requiref: deployment not ready", + "--- FAIL: TestSuite/TestCase (90.00s)", + }, + }, } for _, tt := range tests {