diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index b2bb51a9f4..42ef4c02a9 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -46,6 +46,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" ) @@ -1460,6 +1461,42 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot) } + // Fast-path: when recovering from a servicing error, check if specs are + // cleared before calling getHostFirmwareSettings/Components. Those + // functions fail on generation mismatch (sub-controller hasn't reconciled + // yet), which would block the abort/recovery path indefinitely. + if info.host.Status.ErrorType == metal3api.ServicingError { + specsExist := false + if liveFirmwareSettingsAllowed { + hfsCheck := &metal3api.HostFirmwareSettings{} + if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 { + specsExist = true + } + } + if liveFirmwareUpdatesAllowed && !specsExist { + hfcCheck := &metal3api.HostFirmwareComponents{} + if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 { + specsExist = true + } + } + if !specsExist { + info.log.Info("specs cleared while in servicing error state, attempting abort") + provResult, _, err := prov.Service(ctx, servicingData, false, false) + if err != nil { + return actionError{fmt.Errorf("failed to abort servicing: %w", err)} + } + if provResult.ErrorMessage != "" { + return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + return actionContinue{provResult.RequeueAfter} + } + info.log.Info("successfully recovered from servicing error") + clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) + return actionComplete{} + } + } + if liveFirmwareSettingsAllowed { // handling pre-HFS FirmwareSettings here if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { @@ -2501,6 +2538,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE controller.Owns(&metal3api.PreprovisioningImage{}) } + // Watch HFC/HFS for spec changes (generation bumps) so that the BMH + // controller reconciles promptly when a user clears firmware specs, + // rather than waiting for error-state exponential backoff to expire. + firmwareEventHandler := handler.EnqueueRequestsFromMapFunc( + func(_ context.Context, obj client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKey{ + Name: obj.GetName(), + Namespace: obj.GetNamespace(), + }}} + }, + ) + controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + return controller.Complete(r) } diff --git a/internal/controller/metal3.io/hostfirmwaresettings_controller.go b/internal/controller/metal3.io/hostfirmwaresettings_controller.go index 7cc291120f..84390893f2 100644 --- a/internal/controller/metal3.io/hostfirmwaresettings_controller.go +++ b/internal/controller/metal3.io/hostfirmwaresettings_controller.go @@ -247,7 +247,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info generation := info.hfs.GetGeneration() if specMismatch { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { dirty = true } @@ -255,7 +255,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info // Eventually this will be handled by a webhook errors := r.validateHostFirmwareSettings(info, &newStatus, schema) if len(errors) == 0 { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } } else { @@ -266,15 +266,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info } } reason = reasonConfigurationError - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { dirty = true } } } else { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { dirty = true } } @@ -425,7 +425,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque } } -func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo, +func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus, reason conditionReason, message string) bool { newCondition := metav1.Condition{ @@ -435,13 +435,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus Reason: string(reason), Message: message, } - meta.SetStatusCondition(&status.Conditions, newCondition) - currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond)) - if currCond == nil || currCond.Status != newStatus { - return true - } - return false + return meta.SetStatusCondition(&status.Conditions, newCondition) } // Generate a name based on the schema key and values which should be the same for similar hardware.