Skip to content

Commit c383955

Browse files
committed
Handle generation mismatch in servicing error recovery
When a user clears HFC/HFS specs to abort a failed servicing operation, the BMH controller calls getHostFirmwareSettings/Components, which check that metadata.generation matches status.conditions.observedGeneration. Since the HFS/HFC sub-controllers may not have reconciled yet after the spec change, this generation mismatch returns an error that blocks the abort path, leaving the host stuck in ServicingError with exponential backoff.

Fix with two complementary changes:

1. Add a fast-path in doServiceIfNeeded: when in ServicingError state, check specs directly via lightweight r.Get() calls (which don't validate generation) before calling the generation-sensitive functions. If specs are cleared, abort immediately via prov.Service() and clear the error/operational status on success.

2. Watch HostFirmwareSettings and HostFirmwareComponents for generation changes (spec modifications), mapping events to the corresponding BMH. This ensures the BMH controller reconciles promptly when specs change, rather than waiting for error-state exponential backoff.

Assisted-By: Claude Opus 4.6

(cherry picked from commit b235782)
1 parent 0fa6833 commit c383955

1 file changed

Lines changed: 53 additions & 0 deletions

File tree

internal/controller/metal3.io/baremetalhost_controller.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
"sigs.k8s.io/controller-runtime/pkg/controller"
4747
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4848
"sigs.k8s.io/controller-runtime/pkg/event"
49+
"sigs.k8s.io/controller-runtime/pkg/handler"
4950
"sigs.k8s.io/controller-runtime/pkg/predicate"
5051
)
5152

@@ -1460,6 +1461,42 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr
14601461
liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot)
14611462
}
14621463

1464+
// Fast-path: when recovering from a servicing error, check if specs are
1465+
// cleared before calling getHostFirmwareSettings/Components. Those
1466+
// functions fail on generation mismatch (sub-controller hasn't reconciled
1467+
// yet), which would block the abort/recovery path indefinitely.
1468+
if info.host.Status.ErrorType == metal3api.ServicingError {
1469+
specsExist := false
1470+
if liveFirmwareSettingsAllowed {
1471+
hfsCheck := &metal3api.HostFirmwareSettings{}
1472+
if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 {
1473+
specsExist = true
1474+
}
1475+
}
1476+
if liveFirmwareUpdatesAllowed && !specsExist {
1477+
hfcCheck := &metal3api.HostFirmwareComponents{}
1478+
if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 {
1479+
specsExist = true
1480+
}
1481+
}
1482+
if !specsExist {
1483+
info.log.Info("specs cleared while in servicing error state, attempting abort")
1484+
provResult, _, err := prov.Service(ctx, servicingData, false, false)
1485+
if err != nil {
1486+
return actionError{fmt.Errorf("failed to abort servicing: %w", err)}
1487+
}
1488+
if provResult.ErrorMessage != "" {
1489+
return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)}
1490+
}
1491+
if provResult.Dirty {
1492+
return actionContinue{provResult.RequeueAfter}
1493+
}
1494+
info.log.Info("successfully recovered from servicing error")
1495+
clearErrorWithStatus(info.host, metal3api.OperationalStatusOK)
1496+
return actionComplete{}
1497+
}
1498+
}
1499+
14631500
if liveFirmwareSettingsAllowed {
14641501
// handling pre-HFS FirmwareSettings here
14651502
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) {
@@ -2501,6 +2538,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE
25012538
controller.Owns(&metal3api.PreprovisioningImage{})
25022539
}
25032540

2541+
// Watch HFC/HFS for spec changes (generation bumps) so that the BMH
2542+
// controller reconciles promptly when a user clears firmware specs,
2543+
// rather than waiting for error-state exponential backoff to expire.
2544+
firmwareEventHandler := handler.EnqueueRequestsFromMapFunc(
2545+
func(_ context.Context, obj client.Object) []ctrl.Request {
2546+
return []ctrl.Request{{NamespacedName: client.ObjectKey{
2547+
Name: obj.GetName(),
2548+
Namespace: obj.GetNamespace(),
2549+
}}}
2550+
},
2551+
)
2552+
controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler,
2553+
builder.WithPredicates(predicate.GenerationChangedPredicate{}))
2554+
controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler,
2555+
builder.WithPredicates(predicate.GenerationChangedPredicate{}))
2556+
25042557
return controller.Complete(r)
25052558
}
25062559

0 commit comments

Comments
 (0)