Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 73 additions & 12 deletions internal/controller/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)

Expand Down Expand Up @@ -1460,14 +1461,51 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr
liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot)
}

// Fast-path: when recovering from a servicing error, check if specs are
// cleared before calling getHostFirmwareSettings/Components. Those
// functions fail on generation mismatch (sub-controller hasn't reconciled
// yet), which would block the abort/recovery path indefinitely.
if info.host.Status.ErrorType == metal3api.ServicingError {
specsExist := false
if liveFirmwareSettingsAllowed {
hfsCheck := &metal3api.HostFirmwareSettings{}
if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 {
specsExist = true
}
}
if liveFirmwareUpdatesAllowed && !specsExist {
hfcCheck := &metal3api.HostFirmwareComponents{}
if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 {
specsExist = true
}
}
if !specsExist {
info.log.Info("specs cleared while in servicing error state, attempting abort")
provResult, _, err := prov.Service(ctx, servicingData, false, false)
if err != nil {
return actionError{fmt.Errorf("failed to abort servicing: %w", err)}
}
if provResult.ErrorMessage != "" {
return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)}
}
if provResult.Dirty {
info.log.Info("abort operation in progress, checking back later")
return actionContinue{provResult.RequeueAfter}
}
info.log.Info("successfully recovered from servicing error")
info.host.Status.ErrorType = ""
info.host.Status.ErrorMessage = ""
info.host.Status.OperationalStatus = metal3api.OperationalStatusOK
return actionComplete{}
}
}

if liveFirmwareSettingsAllowed {
// handling pre-HFS FirmwareSettings here
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) {
servicingData.FirmwareConfig = info.host.Spec.Firmware
fwDirty = true
}
servicingData.HasFirmwareSpec = fwDirty && info.host.Spec.Firmware != nil

// handling HFS based FirmwareSettings here
var hfs *metal3api.HostFirmwareSettings
var err error
Expand All @@ -1480,7 +1518,11 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr
servicingData.TargetFirmwareSettings = hfs.Spec.Settings
}

servicingData.HasFirmwareSpec = servicingData.HasFirmwareSpec || (hfs != nil && len(hfs.Spec.Settings) > 0)
// Track if HFS spec has actual settings - check independently since getHostFirmwareSettings
// returns nil when no changes even if object exists
hfsExists := &metal3api.HostFirmwareSettings{}
hfsExistsErr := r.Get(ctx, info.request.NamespacedName, hfsExists)
servicingData.HasFirmwareSettingsSpec = (hfsExistsErr == nil && len(hfsExists.Spec.Settings) > 0)
Comment on lines +1521 to +1525
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Keep legacy BareMetalHost.spec.firmware represented in HasFirmwareSettingsSpec.

Lines 1465-1468 still start servicing from the legacy firmware field, but this flag is now derived only from HostFirmwareSettings. After a service starts and status is synced, fwDirty drops to false; on the next Servicing / ServiceWait reconcile, pkg/provisioner/ironic/servicing.go sees both spec flags false and aborts an in-flight legacy firmware update as if the user had removed it. Please fold the legacy firmware-config presence into this signal as well.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/controller/metal3.io/baremetalhost_controller.go` around lines 1481
- 1485, The HasFirmwareSettingsSpec flag is currently set only from
HostFirmwareSettings; also inspect the legacy BareMetalHost.spec.firmware and
fold its presence into the same signal so legacy in-flight updates aren't
aborted: after the existing r.Get for hfsExists (hfsExistsErr) also fetch the
BareMetalHost object (or use the existing info.bmh if available) and treat
servicingData.HasFirmwareSettingsSpec as true if either (hfsExistsErr == nil &&
len(hfsExists.Spec.Settings) > 0) OR the BareMetalHost.Spec.Firmware is non-nil
and contains settings (e.g., check Spec.Firmware or its fields are non-empty);
update the assignment to servicingData.HasFirmwareSettingsSpec accordingly using
those identifiers (hfsExists, hfsExistsErr, BareMetalHost.Spec.Firmware or
info.bmh).

}

if liveFirmwareUpdatesAllowed {
Expand All @@ -1498,7 +1540,11 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr
}
}

servicingData.HasFirmwareSpec = servicingData.HasFirmwareSpec || (hfc != nil && len(hfc.Spec.Updates) > 0)
// Track if HFC spec has actual updates - check independently since getHostFirmwareComponents
// returns nil when no changes even if object exists
hfcExists := &metal3api.HostFirmwareComponents{}
hfcExistsErr := r.Get(ctx, info.request.NamespacedName, hfcExists)
servicingData.HasFirmwareComponentsSpec = (hfcExistsErr == nil && len(hfcExists.Spec.Updates) > 0)
}

hasChanges := fwDirty || hfsDirty || hfcDirty
Expand Down Expand Up @@ -2148,8 +2194,7 @@ func hostObjectHasChanges(conditions []metav1.Condition, changedCondition, valid
return false, true, nil
}

// Get the stored firmware settings. Returns dirty=true if there are valid pending changes.
// The hfs object is returned when available regardless of validity, so callers can inspect spec contents.
// Get the stored firmware settings if there are valid changes.
func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, info *reconcileInfo) (dirty bool, hfs *metal3api.HostFirmwareSettings, err error) {
hfs = &metal3api.HostFirmwareSettings{}
if err = r.Get(ctx, info.request.NamespacedName, hfs); err != nil {
Expand All @@ -2169,7 +2214,7 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, i
}
if !valid {
info.log.Info("hostFirmwareSettings not valid", "namespacename", info.request.NamespacedName)
return false, hfs, nil
return false, nil, nil
}

if changed {
Expand All @@ -2183,11 +2228,11 @@ func (r *BareMetalHostReconciler) getHostFirmwareSettings(ctx context.Context, i
}

info.log.Info("hostFirmwareSettings no updates", "namespacename", info.request.NamespacedName)
return false, hfs, nil
return false, nil, nil
}

// Get the stored firmware components. Returns dirty=true if there are valid pending changes.
// The hfc object is returned when available regardless of validity, so callers can inspect spec contents.
// Get the stored firmware settings if there are valid changes.

func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context, info *reconcileInfo) (dirty bool, hfc *metal3api.HostFirmwareComponents, err error) {
hfc = &metal3api.HostFirmwareComponents{}
if err = r.Get(ctx, info.request.NamespacedName, hfc); err != nil {
Expand All @@ -2207,15 +2252,15 @@ func (r *BareMetalHostReconciler) getHostFirmwareComponents(ctx context.Context,
}
if !valid {
info.log.Info("hostFirmwareComponents not valid", "namespacename", info.request.NamespacedName)
return false, hfc, nil
return false, nil, nil
}
if changed {
info.log.Info("hostFirmwareComponents indicating ChangeDetected", "namespacename", info.request.NamespacedName)
return true, hfc, nil
}

info.log.Info("hostFirmwareComponents no updates", "namespacename", info.request.NamespacedName)
return false, hfc, nil
return false, nil, nil
}

func setConditionTrue(host *metal3api.BareMetalHost, typ, reason string) {
Expand Down Expand Up @@ -2501,6 +2546,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE
controller.Owns(&metal3api.PreprovisioningImage{})
}

// Watch HFC/HFS for spec changes (generation bumps) so that the BMH
// controller reconciles promptly when a user clears firmware specs,
// rather than waiting for the error-state exponential backoff to expire.
firmwareEventHandler := handler.EnqueueRequestsFromMapFunc(
func(_ context.Context, obj client.Object) []ctrl.Request {
return []ctrl.Request{{NamespacedName: client.ObjectKey{
Name: obj.GetName(),
Namespace: obj.GetNamespace(),
}}}
},
)
controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler,
builder.WithPredicates(predicate.GenerationChangedPredicate{}))
controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler,
builder.WithPredicates(predicate.GenerationChangedPredicate{}))

return controller.Complete(r)
}

Expand Down
19 changes: 7 additions & 12 deletions internal/controller/metal3.io/hostfirmwaresettings_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info
generation := info.hfs.GetGeneration()

if specMismatch {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") {
dirty = true
}

// Run validation on the Spec to detect invalid values entered by user, including Spec settings not in Status
// Eventually this will be handled by a webhook
errors := r.validateHostFirmwareSettings(info, &newStatus, schema)
if len(errors) == 0 {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
dirty = true
}
} else {
Expand All @@ -264,15 +264,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info
}
}
reason = reasonConfigurationError
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") {
dirty = true
}
}
} else {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
dirty = true
}
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") {
dirty = true
}
}
Expand Down Expand Up @@ -422,7 +422,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque
}
}

func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo,
func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus,
cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus,
reason conditionReason, message string) bool {
newCondition := metav1.Condition{
Expand All @@ -432,13 +432,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus
Reason: string(reason),
Message: message,
}
meta.SetStatusCondition(&status.Conditions, newCondition)

currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond))
if currCond == nil || currCond.Status != newStatus {
return true
}
return false
return meta.SetStatusCondition(&status.Conditions, newCondition)
}

// Generate a name based on the schema key and values which should be the same for similar hardware.
Expand Down
37 changes: 23 additions & 14 deletions pkg/provisioner/ironic/servicing.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,15 @@ func (p *ironicProvisioner) startServicing(ctx context.Context, bmcAccess bmc.Ac
}

func (p *ironicProvisioner) abortServicing(ctx context.Context, ironicNode *nodes.Node) (result provisioner.Result, started bool, err error) {
p.log.Info("aborting servicing because firmware spec was removed")
// Clear maintenance flag first if it's set
if ironicNode.Maintenance {
p.log.Info("clearing maintenance flag before aborting servicing")
result, err = p.setMaintenanceFlag(ctx, ironicNode, false, "")
return result, started, err
}

// Set started to let the controller know about the change
p.log.Info("aborting servicing due to removal of spec.updates/spec.settings")
started, result, err = p.tryChangeNodeProvisionState(
ctx,
ironicNode,
Expand All @@ -101,11 +109,17 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic
return result, started, err
}

if _, err = p.buildServiceSteps(bmcAccess, data); err != nil {
result, err = operationFailed(err.Error())
return result, started, err
}

switch nodes.ProvisionState(ironicNode.ProvisionState) {
case nodes.ServiceFail:
// When servicing failed and user removed all firmware specs,
// abort the servicing operation to back out
if !data.HasFirmwareSpec {
// When servicing failed and user actually removed the specs (not just no updates calculated),
// we need to abort the servicing operation to back out
if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec {
p.log.Info("aborting servicing because spec.updates/spec.settings was removed")
return p.abortServicing(ctx, ironicNode)
}

Expand Down Expand Up @@ -137,18 +151,13 @@ func (p *ironicProvisioner) Service(ctx context.Context, data provisioner.Servic
// Servicing finished
p.log.Info("servicing finished on the host")
result, err = operationComplete()
case nodes.Servicing:
p.log.Info("waiting for host to finish servicing",
"state", ironicNode.ProvisionState,
"serviceStep", ironicNode.ServiceStep)
result, err = operationContinuing(provisionRequeueDelay)

case nodes.ServiceWait:
// If user removed all firmware specs while in ServiceWait, abort
if !data.HasFirmwareSpec {
case nodes.Servicing, nodes.ServiceWait:
// If user actually removed spec.updates/spec.settings while servicing is in progress, abort immediately
if !data.HasFirmwareSettingsSpec && !data.HasFirmwareComponentsSpec {
p.log.Info("aborting in-progress servicing because spec.updates/spec.settings was removed")
return p.abortServicing(ctx, ironicNode)
}

p.log.Info("waiting for host to become active",
"state", ironicNode.ProvisionState,
"serviceStep", ironicNode.ServiceStep)
Expand Down
25 changes: 1 addition & 24 deletions pkg/provisioner/ironic/servicing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,6 @@ func TestService(t *testing.T) {
expectedRequestAfter: 0,
expectedDirty: false,
},
{
name: "serviceFail state(abort - no firmware spec)",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
ProvisionState: string(nodes.ServiceFail),
UUID: nodeUUID,
}),
skipConfig: true,
expectedStarted: true,
expectedRequestAfter: 10,
expectedDirty: true,
},
{
name: "serviceFail state(retry)",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
Expand Down Expand Up @@ -141,17 +130,6 @@ func TestService(t *testing.T) {
expectedRequestAfter: 10,
expectedDirty: true,
},
{
name: "serviceWait state(abort - no firmware spec)",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
ProvisionState: string(nodes.ServiceWait),
UUID: nodeUUID,
}),
skipConfig: true,
expectedStarted: true,
expectedRequestAfter: 10,
expectedDirty: true,
},
{
name: "active state(servicing finished)",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
Expand Down Expand Up @@ -186,14 +164,13 @@ func TestService(t *testing.T) {
host.Status.Provisioning.ID = nodeUUID
prepData := provisioner.ServicingData{}
if !tc.skipConfig {
host.Spec.BMC.Address = "bios-test://test.bmc/"
host.Spec.BMC.Address = "raid-test://test.bmc/"
prepData.ActualFirmwareSettings = metal3api.SettingsMap{
"Answer": "unknown",
}
prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{
"Answer": intstr.FromInt(42),
}
prepData.HasFirmwareSpec = true
}
Comment on lines 166 to 174
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Set the new spec-presence flag in these configured fixtures.

When !tc.skipConfig, this fixture now fills TargetFirmwareSettings / ActualFirmwareSettings but leaves both HasFirmware*Spec flags false. The remaining ServiceFail / ServiceWait cases will hit the abort path instead of the retry/wait behavior they expect.

Suggested fix
 			if !tc.skipConfig {
 				host.Spec.BMC.Address = "raid-test://test.bmc/"
 				prepData.ActualFirmwareSettings = metal3api.SettingsMap{
 					"Answer": "unknown",
 				}
 				prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{
 					"Answer": intstr.FromInt(42),
 				}
+				prepData.HasFirmwareSettingsSpec = true
 			}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if !tc.skipConfig {
host.Spec.BMC.Address = "bios-test://test.bmc/"
host.Spec.BMC.Address = "raid-test://test.bmc/"
prepData.ActualFirmwareSettings = metal3api.SettingsMap{
"Answer": "unknown",
}
prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{
"Answer": intstr.FromInt(42),
}
prepData.HasFirmwareSpec = true
}
if !tc.skipConfig {
host.Spec.BMC.Address = "raid-test://test.bmc/"
prepData.ActualFirmwareSettings = metal3api.SettingsMap{
"Answer": "unknown",
}
prepData.TargetFirmwareSettings = metal3api.DesiredSettingsMap{
"Answer": intstr.FromInt(42),
}
prepData.HasFirmwareSettingsSpec = true
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/provisioner/ironic/servicing_test.go` around lines 166 - 174, The test
fixture fills prepData.ActualFirmwareSettings and
prepData.TargetFirmwareSettings when !tc.skipConfig but does not set the
corresponding presence flags, so the code takes the abort path; update the
fixture so that inside the if !tc.skipConfig block you also set the
spec-presence flags (e.g., prepData.HasFirmwareActualSpec = true and
prepData.HasFirmwareTargetSpec = true) so the ServiceFail/ServiceWait cases
exercise the retry/wait behavior; make these assignments alongside
prepData.ActualFirmwareSettings and prepData.TargetFirmwareSettings.


publisher := func(reason, message string) {}
Expand Down
6 changes: 3 additions & 3 deletions pkg/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ type ServicingData struct {
TargetFirmwareSettings metal3api.DesiredSettingsMap
ActualFirmwareSettings metal3api.SettingsMap
TargetFirmwareComponents []metal3api.FirmwareUpdate
// True if any firmware spec exists (settings, components, or legacy FirmwareConfig),
// used to distinguish "no updates" from "user cleared spec".
HasFirmwareSpec bool
// Flags to track if specs exist (vs. just no updates calculated)
HasFirmwareSettingsSpec bool
HasFirmwareComponentsSpec bool
}

type ProvisionData struct {
Expand Down