Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions internal/controller/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)

Expand Down Expand Up @@ -1460,6 +1461,42 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr
liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot)
}

// Fast-path: when recovering from a servicing error, check if specs are
// cleared before calling getHostFirmwareSettings/Components. Those
// functions fail on generation mismatch (sub-controller hasn't reconciled
// yet), which would block the abort/recovery path indefinitely.
if info.host.Status.ErrorType == metal3api.ServicingError {
specsExist := false
if liveFirmwareSettingsAllowed {
hfsCheck := &metal3api.HostFirmwareSettings{}
if err := r.Get(ctx, info.request.NamespacedName, hfsCheck); err == nil && len(hfsCheck.Spec.Settings) > 0 {
specsExist = true
}
}
if liveFirmwareUpdatesAllowed && !specsExist {
hfcCheck := &metal3api.HostFirmwareComponents{}
if err := r.Get(ctx, info.request.NamespacedName, hfcCheck); err == nil && len(hfcCheck.Spec.Updates) > 0 {
specsExist = true
}
}
if !specsExist {
info.log.Info("specs cleared while in servicing error state, attempting abort")
provResult, _, err := prov.Service(ctx, servicingData, false, false)
if err != nil {
return actionError{fmt.Errorf("failed to abort servicing: %w", err)}
}
if provResult.ErrorMessage != "" {
return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)}
}
if provResult.Dirty {
return actionContinue{provResult.RequeueAfter}
}
info.log.Info("successfully recovered from servicing error")
clearErrorWithStatus(info.host, metal3api.OperationalStatusOK)
return actionComplete{}
Comment on lines +1495 to +1496
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Status changes may not be persisted.

clearErrorWithStatus modifies the host status but its return value is ignored. The bare actionComplete{} likely doesn't signal dirty, so the status changes (clearing error, setting OperationalStatusOK) won't be saved. Other code paths (e.g., line 1601) wrap in actionUpdate{} when the host is modified.

🐛 Proposed fix
-		info.log.Info("successfully recovered from servicing error")
-		clearErrorWithStatus(info.host, metal3api.OperationalStatusOK)
-		return actionComplete{}
+		info.log.Info("successfully recovered from servicing error")
+		clearErrorWithStatus(info.host, metal3api.OperationalStatusOK)
+		return actionUpdate{actionComplete{}}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/controller/metal3.io/baremetalhost_controller.go` around lines 1491
- 1492, clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) mutates
host status but its return value is ignored, so the controller may not persist
the change; replace the bare actionComplete{} return with returning the update
action that marks the host as modified (use the same pattern as other paths that
use actionUpdate{}, e.g., the path at line ~1601) by capturing the modified host
result from clearErrorWithStatus (or otherwise marking info.host dirty) and
returning actionUpdate{} so the status change is persisted.

}
}

if liveFirmwareSettingsAllowed {
// handling pre-HFS FirmwareSettings here
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) {
Expand Down Expand Up @@ -2501,6 +2538,22 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE
controller.Owns(&metal3api.PreprovisioningImage{})
}

// Watch HFC/HFS for spec changes (generation bumps) so that the BMH
// controller reconciles promptly when a user clears firmware specs,
// rather than waiting for error-state exponential backoff to expire.
firmwareEventHandler := handler.EnqueueRequestsFromMapFunc(
func(_ context.Context, obj client.Object) []ctrl.Request {
return []ctrl.Request{{NamespacedName: client.ObjectKey{
Name: obj.GetName(),
Namespace: obj.GetNamespace(),
}}}
},
)
controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler,
builder.WithPredicates(predicate.GenerationChangedPredicate{}))
controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler,
builder.WithPredicates(predicate.GenerationChangedPredicate{}))

return controller.Complete(r)
}

Expand Down
19 changes: 7 additions & 12 deletions internal/controller/metal3.io/hostfirmwaresettings_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,15 +247,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info
generation := info.hfs.GetGeneration()

if specMismatch {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") {
dirty = true
}

// Run validation on the Spec to detect invalid values entered by user, including Spec settings not in Status
// Eventually this will be handled by a webhook
errors := r.validateHostFirmwareSettings(info, &newStatus, schema)
if len(errors) == 0 {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
dirty = true
}
} else {
Expand All @@ -266,15 +266,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info
}
}
reason = reasonConfigurationError
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") {
dirty = true
}
}
} else {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
dirty = true
}
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") {
dirty = true
}
}
Expand Down Expand Up @@ -425,7 +425,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque
}
}

func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo,
func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus,
cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus,
reason conditionReason, message string) bool {
newCondition := metav1.Condition{
Expand All @@ -435,13 +435,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus
Reason: string(reason),
Message: message,
}
meta.SetStatusCondition(&status.Conditions, newCondition)

currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond))
if currCond == nil || currCond.Status != newStatus {
return true
}
return false
return meta.SetStatusCondition(&status.Conditions, newCondition)
}

// Generate a name based on the schema key and values which should be the same for similar hardware.
Expand Down