-
Notifications
You must be signed in to change notification settings - Fork 55
perf(#2354): bound enrollment wait with timeout and backoff #2359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,17 @@ const ( | |
|
|
||
| // repoMaintenanceWorkflow is the workflow file that handles enrollment. | ||
| repoMaintenanceWorkflow = "repo-maintenance.yml" | ||
|
|
||
| // enrollmentWaitTimeout is the maximum time to wait for the | ||
| // repo-maintenance workflow run to appear and complete. | ||
| enrollmentWaitTimeout = 3 * time.Minute | ||
|
|
||
| // enrollmentPollInitial is the initial polling interval for | ||
| // workflow run status checks. | ||
| enrollmentPollInitial = 2 * time.Second | ||
|
|
||
| // enrollmentPollMax is the maximum polling interval (backoff cap). | ||
| enrollmentPollMax = 15 * time.Second | ||
| ) | ||
|
|
||
| // EnrollmentLayer monitors workflow-driven enrollment of target repos. | ||
|
|
@@ -82,11 +93,11 @@ func (l *EnrollmentLayer) Install(ctx context.Context) error { | |
| } | ||
| l.ui.StepDone("dispatched repo-maintenance workflow") | ||
|
|
||
| // Wait for the workflow run to complete. | ||
| // Wait for the workflow run to complete (bounded by enrollmentWaitTimeout). | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [low] error-handling-gap The diff removes the StepInfo guidance from Install error handler. For context cancellation errors, the user no longer receives guidance about where to check results, unlike the Uninstall path which retains this message. |
||
| l.ui.StepStart("waiting for enrollment workflow to complete") | ||
| run, err := l.awaitWorkflowRun(ctx, dispatchTime) | ||
| if err != nil { | ||
| l.ui.StepWarn(fmt.Sprintf("could not confirm enrollment: %v", err)) | ||
| l.ui.StepInfo("check the repo-maintenance workflow in .fullsend for results") | ||
| return nil // non-fatal — enrollment may still succeed | ||
| } | ||
|
|
||
|
|
@@ -105,18 +116,35 @@ func (l *EnrollmentLayer) Install(ctx context.Context) error { | |
| } | ||
|
|
||
| // awaitWorkflowRun polls for a repo-maintenance workflow run created after | ||
| // dispatchTime and waits for it to complete. | ||
| // dispatchTime and waits for it to complete. It uses exponential backoff | ||
| // and a bounded timeout to avoid long silent waits. | ||
| func (l *EnrollmentLayer) awaitWorkflowRun(ctx context.Context, dispatchTime time.Time) (*forge.WorkflowRun, error) { | ||
| for attempt := range 36 { // 3 minutes max | ||
| deadline := time.Now().Add(enrollmentWaitTimeout) | ||
| interval := enrollmentPollInitial | ||
| start := time.Now() | ||
|
|
||
| for { | ||
| if time.Now().After(deadline) { | ||
| elapsed := time.Since(start).Round(time.Second) | ||
| return nil, fmt.Errorf( | ||
| "timed out after %s waiting for repo-maintenance workflow; "+ | ||
| "check the workflow in .fullsend and re-run install if needed", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [medium] logic-error The timeout error message from awaitWorkflowRun contains caller-specific guidance (re-run install if needed), but this method is also called by Uninstall (line 248). When the timeout fires during uninstall, the user sees misleading guidance to re-run install. Suggested fix: Make the error message generic (remove and re-run install if needed) and let the caller append context-appropriate guidance, or pass an operation label into awaitWorkflowRun. |
||
| elapsed, | ||
| ) | ||
| } | ||
|
|
||
| select { | ||
| case <-ctx.Done(): | ||
| return nil, ctx.Err() | ||
| case <-time.After(5 * time.Second): | ||
| case <-time.After(interval): | ||
| } | ||
|
|
||
| elapsed := time.Since(start).Round(time.Second) | ||
|
|
||
| runs, err := l.client.ListWorkflowRuns(ctx, l.org, forge.ConfigRepoName, repoMaintenanceWorkflow) | ||
| if err != nil { | ||
| l.ui.StepInfo(fmt.Sprintf("waiting for workflow run (attempt %d)...", attempt+1)) | ||
| l.ui.StepInfo(fmt.Sprintf("waiting for workflow registration (%s elapsed)...", elapsed)) | ||
| interval = nextInterval(interval) | ||
| continue | ||
| } | ||
|
|
||
|
|
@@ -133,11 +161,21 @@ func (l *EnrollmentLayer) awaitWorkflowRun(ctx context.Context, dispatchTime tim | |
| if run.Status == "completed" { | ||
| return run, nil | ||
| } | ||
| l.ui.StepInfo(fmt.Sprintf("workflow run: %s (%s)", run.HTMLURL, run.Status)) | ||
| l.ui.StepInfo(fmt.Sprintf("workflow run %s (%s, %s elapsed)", run.HTMLURL, run.Status, elapsed)) | ||
| break // found our run, keep waiting | ||
| } | ||
|
|
||
| interval = nextInterval(interval) | ||
| } | ||
| } | ||
|
|
||
| // nextInterval doubles the polling interval up to enrollmentPollMax. | ||
| func nextInterval(current time.Duration) time.Duration { | ||
| next := current * 2 | ||
| if next > enrollmentPollMax { | ||
| return enrollmentPollMax | ||
| } | ||
| return nil, fmt.Errorf("timed out waiting for repo-maintenance workflow") | ||
| return next | ||
| } | ||
|
|
||
| // showWorkflowLogs fetches and displays workflow run logs locally so the user | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[low] comment-style
The new constants use multi-line comments while existing constants in the same block use single-line comments. The difference is driven by content length.