diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index b2bb51a9f4..30a1207a03 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -46,6 +46,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" ) @@ -1460,14 +1461,51 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot) } + // Fast-path: when recovering from a servicing error, check if specs are + // cleared before calling getHostFirmwareSettings/Components. Those + // functions fail on generation mismatch (sub-controller hasn't reconciled + // yet), which would block the abort/recovery path indefinitely. + if info.host.Status.ErrorType == metal3api.ServicingError { + specsExist := false + if liveFirmwareSettingsAllowed { + hfsCheck := &metal3api.HostFirmwareSettings{} + if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 { + specsExist = true + } + } + if liveFirmwareUpdatesAllowed && !specsExist { + hfcCheck := &metal3api.HostFirmwareComponents{} + if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 { + specsExist = true + } + } + if !specsExist { + info.log.Info("specs cleared while in servicing error state, attempting abort") + provResult, _, err := prov.Service(ctx, servicingData, false, false) + if err != nil { + return actionError{fmt.Errorf("failed to abort servicing: %w", err)} + } + if provResult.ErrorMessage != "" { + return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)} + } + if provResult.Dirty { + info.log.Info("abort operation in progress, checking back later") + return actionContinue{provResult.RequeueAfter} + } + info.log.Info("successfully recovered from servicing error") + info.host.Status.ErrorType = "" + info.host.Status.ErrorMessage = "" + info.host.Status.OperationalStatus = metal3api.OperationalStatusOK + return actionComplete{} + } + } + if liveFirmwareSettingsAllowed { // handling pre-HFS FirmwareSettings here if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) { servicingData.FirmwareConfig = info.host.Spec.Firmware fwDirty = true } - servicingData.HasFirmwareSpec = fwDirty && info.host.Spec.Firmware != nil - // handling HFS based FirmwareSettings here var hfs *metal3api.HostFirmwareSettings var err error @@ -1480,7 +1518,11 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr servicingData.TargetFirmwareSettings = hfs.Spec.Settings } - servicingData.HasFirmwareSpec = servicingData.HasFirmwareSpec || (hfs != nil && len(hfs.Spec.Settings) > 0) + // Track if HFS spec has actual settings - check independently since getHostFirmwareSettings + // returns nil when no changes even if object exists + hfsExists := &metal3api.HostFirmwareSettings{} + hfsExistsErr := r.Get(ctx, info.request.NamespacedName, hfsExists) + servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0) } if liveFirmwareUpdatesAllowed { @@ -1498,7 +1540,11 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr } } - servicingData.HasFirmwareSpec = servicingData.HasFirmwareSpec || (hfc != nil && len(hfc.Spec.Updates) > 0) + // Track if HFC spec has actual updates - check independently since getHostFirmwareComponents + // returns nil when no changes even if object exists + hfcExists := &metal3api.HostFirmwareComponents{} + hfcExistsErr := r.Get(ctx, info.request.NamespacedName, hfcExists) + servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0) } hasChanges := fwDirty || hfsDirty || hfcDirty @@ -2148,8 +2194,7 @@ func hostObjectHasChanges(conditions []metav1.Condition, changedCondition, valid return false, true, nil } -// Get the stored firmware settings. Returns dirty=true if there are valid pending changes. -// The hfs object is returned when available regardless of validity, so callers can inspect spec contents. +// Get the stored firmware settings if there are valid changes. func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, info *reconcileInfo) (dirty bool, hfs *metal3api.HostFirmwareSettings, err error) { hfs = &metal3api.HostFirmwareSettings{} if err = r.Get(ctx, info.request.NamespacedName, hfs); err != nil { @@ -2169,7 +2214,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, i } if !valid { info.log.Info("hostFirmwareSettings not valid", "namespacename", info.request.NamespacedName) - return false, hfs, nil + return false, nil, nil } if changed { @@ -2183,11 +2228,11 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, i } info.log.Info("hostFirmwareSettings no updates", "namespacename", info.request.NamespacedName) - return false, hfs, nil + return false, nil, nil } -// Get the stored firmware components. Returns dirty=true if there are valid pending changes. -// The hfc object is returned when available regardless of validity, so callers can inspect spec contents. +// Get the stored firmware settings if there are valid changes. + func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, info *reconcileInfo) (dirty bool, hfc *metal3api.HostFirmwareComponents, err error) { hfc = &metal3api.HostFirmwareComponents{} if err = r.Get(ctx, info.request.NamespacedName, hfc); err != nil { @@ -2207,7 +2252,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, } if !valid { info.log.Info("hostFirmwareComponents not valid", "namespacename", info.request.NamespacedName) - return false, hfc, nil + return false, nil, nil } if changed { info.log.Info("hostFirmwareComponents indicating ChangeDetected", "namespacename", info.request.NamespacedName) @@ -2215,7 +2260,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, } info.log.Info("hostFirmwareComponents no updates", "namespacename", info.request.NamespacedName) - return false, hfc, nil + return false, nil, nil } func setConditionTrue(host *metal3api.BareMetalHost, typ, reason string) { @@ -2501,6 +2546,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE controller.Owns(&metal3api.PreprovisioningImage{}) } + // Watch HFC/HFS for spec changes (generation bumps) so that the BMH + // controller reconciles promptly when a user clears firmware specs, + // rather than waiting for the error-state exponential backoff to expire. + firmwareEventHandler := handler.EnqueueRequestsFromMapFunc( + func(_ context.Context, obj client.Object) []ctrl.Request { + return []ctrl.Request{{NamespacedName: client.ObjectKey{ + Name: obj.GetName(), + Namespace: obj.GetNamespace(), + }}} + }, + ) + controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler, + builder.WithPredicates(predicate.GenerationChangedPredicate{})) + return controller.Complete(r) } diff --git a/internal/controller/metal3.io/hostfirmwaresettings_controller.go b/internal/controller/metal3.io/hostfirmwaresettings_controller.go index f3000eff53..eae3aa7709 100644 --- a/internal/controller/metal3.io/hostfirmwaresettings_controller.go +++ b/internal/controller/metal3.io/hostfirmwaresettings_controller.go @@ -245,7 +245,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info generation := info.hfs.GetGeneration() if specMismatch { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") { dirty = true } @@ -253,7 +253,7 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info // Eventually this will be handled by a webhook errors := r.validateHostFirmwareSettings(info, &newStatus, schema) if len(errors) == 0 { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } } else { @@ -264,15 +264,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info } } reason = reasonConfigurationError - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") { dirty = true } } } else { - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") { dirty = true } - if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { + if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") { dirty = true } } @@ -422,7 +422,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque } } -func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo, +func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus, reason conditionReason, message string) bool { newCondition := metav1.Condition{ @@ -432,13 +432,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus Reason: string(reason), Message: message, } - meta.SetStatusCondition(&status.Conditions, newCondition) - currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond)) - if currCond == nil || currCond.Status != newStatus { - return true - } - return false + return meta.SetStatusCondition(&status.Conditions, newCondition) } // Generate a name based on the schema key and values which should be the same for similar hardware. diff --git a/pkg/provisioner/ironic/servicing.go b/pkg/provisioner/ironic/servicing.go index 317adfb365..54bba8b964 100644 --- a/pkg/provisioner/ironic/servicing.go +++ b/pkg/provisioner/ironic/servicing.go @@ -79,7 +79,15 @@ func (p *ironicProvisioner) startServicing(ctx context.Context, bmcAccess bmc.Ac } func (p *ironicProvisioner) abortServicing(ctx context.Context, ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) { - p.log.Info("aborting servicing because firmware spec was removed") + // Clear maintenance flag first if it's set + if ironicNode.Maintenance { + p.log.Info("clearing maintenance flag before aborting servicing") + result, err = p.setMaintenanceFlag(ctx, ironicNode, false, "") + return result, started, err + } + + // Set started to let the controller know about the change + p.log.Info("aborting servicing due to removal of spec.updates/spec.settings") started, result, err = p.tryChangeNodeProvisionState( ctx, ironicNode, @@ -101,11 +109,17 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic return result, started, err } + if _, err = p.buildServiceSteps(bmcAccess, data); err != nil { + result, err = operationFailed(err.Error()) + return result, started, err + } + switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.ServiceFail: - // When servicing failed and user removed all firmware specs, - // abort the servicing operation to back out - if !data.HasFirmwareSpec { + // When servicing failed and user actually removed the specs (not just no updates calculated), + // we need to abort the servicing operation to back out + if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { + p.log.Info("aborting servicing because spec.updates/spec.settings was removed") return p.abortServicing(ctx, ironicNode) } @@ -137,18 +151,13 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic // Servicing finished p.log.Info("servicing finished on the host") result, err = operationComplete() - case nodes.Servicing: - p.log.Info("waiting for host to finish servicing", - "state", ironicNode.ProvisionState, - "serviceStep", ironicNode.ServiceStep) - result, err = operationContinuing(provisionRequeueDelay) - - case nodes.ServiceWait: - // If user removed all firmware specs while in ServiceWait, abort - if !data.HasFirmwareSpec { + case nodes.Servicing, nodes.ServiceWait: + // If user actually removed spec.updates/spec.settings while servicing is in progress, abort immediately + if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec { + p.log.Info("aborting in-progress servicing because spec.updates/spec.settings was removed") return p.abortServicing(ctx, ironicNode) } - + p.log.Info("waiting for host to become active", "state", ironicNode.ProvisionState, "serviceStep", ironicNode.ServiceStep) diff --git a/pkg/provisioner/ironic/servicing_test.go b/pkg/provisioner/ironic/servicing_test.go index bf8ff09d9a..7dc07d2183 100644 --- a/pkg/provisioner/ironic/servicing_test.go +++ b/pkg/provisioner/ironic/servicing_test.go @@ -86,17 +86,6 @@ func TestService(t *testing.T) { expectedRequestAfter: 0, expectedDirty: false, }, - { - name: "serviceFail state(abort - no firmware spec)", - ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ - ProvisionState: string(nodes.ServiceFail), - UUID: nodeUUID, - }), - skipConfig: true, - expectedStarted: true, - expectedRequestAfter: 10, - expectedDirty: true, - }, { name: "serviceFail state(retry)", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ @@ -141,17 +130,6 @@ func TestService(t *testing.T) { expectedRequestAfter: 10, expectedDirty: true, }, - { - name: "serviceWait state(abort - no firmware spec)", - ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ - ProvisionState: string(nodes.ServiceWait), - UUID: nodeUUID, - }), - skipConfig: true, - expectedStarted: true, - expectedRequestAfter: 10, - expectedDirty: true, - }, { name: "active state(servicing finished)", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ @@ -186,14 +164,13 @@ func TestService(t *testing.T) { host.Status.Provisioning.ID = nodeUUID prepData := provisioner.ServicingData{} if !tc.skipConfig { - host.Spec.BMC.Address = "bios-test://test.bmc/" + host.Spec.BMC.Address = "raid-test://test.bmc/" prepData.ActualFirmwareSettings = metal3api.SettingsMap{ "Answer": "unknown", } prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{ "Answer": intstr.FromInt(42), } - prepData.HasFirmwareSpec = true } publisher := func(reason, message string) {} diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index ab940823d1..2fb00fdc17 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -121,9 +121,9 @@ type ServicingData struct { TargetFirmwareSettings metal3api.DesiredSettingsMap ActualFirmwareSettings metal3api.SettingsMap TargetFirmwareComponents []metal3api.FirmwareUpdate - // True if any firmware spec exists (settings, components, or legacy FirmwareConfig), - // used to distinguish "no updates" from "user cleared spec". - HasFirmwareSpec bool + // Flags to track if specs exist (vs. just no updates calculated) + HasFirmwareSettingsSpec bool + HasFirmwareComponentsSpec bool } type ProvisionData struct {