From 2954a7115475414be5cd529251ac502c0ba2c84b Mon Sep 17 00:00:00 2001 From: Jacob Anders Date: Thu, 19 Mar 2026 20:53:11 +1000 Subject: [PATCH] Fix Healthy condition not updating from Ironic health data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four bugs prevented the Healthy condition from working correctly on provisioned hosts: 1. computeConditions was only called inside the actResult.Dirty() block. For steady-state provisioned hosts where nothing changes during reconciliation, conditions were never recomputed. This meant the Healthy condition (populated from Ironic's health monitoring API) stayed at its initial Unknown value forever, since health status changes externally in Ironic and doesn't trigger state machine transitions. 2. ChooseMicroversion() capped at 1.95, but the Ironic health field on the Node resource requires microversion 1.109. With a lower microversion, Ironic does not include the health field in responses, so GetHealth always returned an empty string and the Healthy condition was never set. Added HasHealthAPI() feature detection and bumped ChooseMicroversion() to return 1.109 when the Ironic endpoint supports it. 3. Empty health from GetHealth (e.g. when the node is not yet registered) was treated as "unknown" via the default switch case, overwriting a previously valid Healthy=True on every idle reconcile. Now empty health means "no data, leave the existing condition unchanged" — only non-empty values from Ironic update the condition. 4. conditionsBefore used a slice reference sharing the same backing array as host.Status.Conditions. In-place modifications by conditions.Set were visible through both references, making reflect.DeepEqual always return true. Deep copy the slice before comparing so condition-only changes trigger a status save. Also suppress ErrNeedsRegistration log noise in GetHealth, which fires on every reconcile before the node is registered in Ironic. Assisted-By: Claude Opus 4.6 Signed-off-by: Jacob Anders --- .../metal3.io/baremetalhost_controller.go | 31 ++++++++++++------- .../baremetalhost_controller_test.go | 25 ++++++++++----- pkg/provisioner/ironic/clients/features.go | 11 ++++++- .../ironic/clients/features_test.go | 23 +++++++++++--- pkg/provisioner/ironic/ironic.go | 4 ++- 5 files changed, 67 insertions(+), 27 deletions(-) diff --git a/internal/controller/metal3.io/baremetalhost_controller.go b/internal/controller/metal3.io/baremetalhost_controller.go index b2bb51a9f4..68c79b6ffb 100644 --- a/internal/controller/metal3.io/baremetalhost_controller.go +++ b/internal/controller/metal3.io/baremetalhost_controller.go @@ -240,16 +240,21 @@ func (r *BareMetalHostReconciler) Reconcile(ctx context.Context, request ctrl.Re return result, err } + // Always compute conditions since some (e.g. Healthy) can change + // based on external state from Ironic regardless of state machine + // transitions. + conditionsBefore := slices.Clone(host.GetConditions()) + computeConditions(ctx, host, prov) + conditionsChanged := !reflect.DeepEqual(conditionsBefore, host.GetConditions()) + // Only save status when we're told to, otherwise we // introduce an infinite loop reconciling the same object over and // over when there is an unrecoverable error (tracked through the // error state of the host). - if actResult.Dirty() { - // Save Host + if actResult.Dirty() || conditionsChanged { info.log.Info("saving host status", "operational status", host.OperationalStatus(), "provisioning state", host.Status.Provisioning.State) - computeConditions(ctx, host, prov) err = r.saveHostStatus(ctx, host) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to save host status after %q: %w", initialState, err) @@ -2304,15 +2309,17 @@ func computeConditions(ctx context.Context, host *metal3api.BareMetalHost, prov setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason) return } - switch prov.GetHealth(ctx) { - case provisioner.HealthOK: - setConditionTrue(host, metal3api.HealthyCondition, metal3api.HealthyReason) - case provisioner.HealthWarning: - setConditionFalse(host, metal3api.HealthyCondition, metal3api.WarningHealthReason) - case provisioner.HealthCritical: - setConditionFalse(host, metal3api.HealthyCondition, metal3api.CriticalHealthReason) - default: - setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason) + if health := prov.GetHealth(ctx); health != "" { + switch health { + case provisioner.HealthOK: + setConditionTrue(host, metal3api.HealthyCondition, metal3api.HealthyReason) + case provisioner.HealthWarning: + setConditionFalse(host, metal3api.HealthyCondition, metal3api.WarningHealthReason) + case provisioner.HealthCritical: + setConditionFalse(host, metal3api.HealthyCondition, metal3api.CriticalHealthReason) + default: + setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason) + } } } diff --git a/internal/controller/metal3.io/baremetalhost_controller_test.go b/internal/controller/metal3.io/baremetalhost_controller_test.go index a2dcd0482f..4de4adf77a 100644 --- a/internal/controller/metal3.io/baremetalhost_controller_test.go +++ b/internal/controller/metal3.io/baremetalhost_controller_test.go @@ -3187,12 +3187,6 @@ func TestComputeHealthyCondition(t *testing.T) { ExpectedStatus metav1.ConditionStatus ExpectedReason string }{ - { - Scenario: "empty health reports unknown", - Health: "", - ExpectedStatus: metav1.ConditionUnknown, - ExpectedReason: metal3api.UnknownHealthReason, - }, { Scenario: "OK health", Health: provisioner.HealthOK, @@ -3232,6 +3226,17 @@ func TestComputeHealthyCondition(t *testing.T) { assert.Equal(t, tc.ExpectedReason, cond.Reason) }) } + + t.Run("empty health leaves condition unchanged", func(t *testing.T) { + host := bmhWithStatus(metal3api.OperationalStatusOK, metal3api.StateAvailable) + fix := &fixture.Fixture{Health: ""} + prov, err := fix.NewProvisioner(t.Context(), provisioner.BuildHostData(*host, bmc.Credentials{}), nil) + require.NoError(t, err) + computeConditions(t.Context(), host, prov) + + cond := conditions.Get(host, metal3api.HealthyCondition) + assert.Nil(t, cond, "empty health should not set the Healthy condition") + }) } func TestComputeConditions(t *testing.T) { @@ -3331,8 +3336,12 @@ func TestComputeConditions(t *testing.T) { assert.True(t, conditions.IsFalse(tc.BareMetalHost, metal3api.ReadyCondition)) } cond := conditions.Get(tc.BareMetalHost, metal3api.HealthyCondition) - require.NotNil(t, cond, "Healthy condition should always be set") - assert.Equal(t, metav1.ConditionUnknown, cond.Status, "Healthy should be Unknown when health is not reported") + if tc.provisioner == nil { + require.NotNil(t, cond, "Healthy condition should be set when provisioner is nil") + assert.Equal(t, metav1.ConditionUnknown, cond.Status, "Healthy should be Unknown when provisioner is nil") + } else { + assert.Nil(t, cond, "Healthy condition should not be set when provisioner reports no health data") + } }) } } diff --git a/pkg/provisioner/ironic/clients/features.go b/pkg/provisioner/ironic/clients/features.go index c7374d3788..76fc17cb31 100644 --- a/pkg/provisioner/ironic/clients/features.go +++ b/pkg/provisioner/ironic/clients/features.go @@ -45,7 +45,8 @@ func (af AvailableFeatures) Log(logger logr.Logger) { "maxVersion", fmt.Sprintf("1.%d", af.MaxVersion), "chosenVersion", af.ChooseMicroversion(), "virtualMediaGET", af.HasVirtualMediaGetAPI(), - "disablePowerOff", af.HasDisablePowerOff()) + "disablePowerOff", af.HasDisablePowerOff(), + "healthAPI", af.HasHealthAPI()) } func (af AvailableFeatures) HasVirtualMediaGetAPI() bool { @@ -56,7 +57,15 @@ func (af AvailableFeatures) HasDisablePowerOff() bool { return af.MaxVersion >= 95 //nolint:mnd } +func (af AvailableFeatures) HasHealthAPI() bool { + return af.MaxVersion >= 109 //nolint:mnd +} + func (af AvailableFeatures) ChooseMicroversion() string { + if af.HasHealthAPI() { + return "1.109" + } + if af.HasDisablePowerOff() { return "1.95" } diff --git a/pkg/provisioner/ironic/clients/features_test.go b/pkg/provisioner/ironic/clients/features_test.go index df3d594e67..05364050cf 100644 --- a/pkg/provisioner/ironic/clients/features_test.go +++ b/pkg/provisioner/ironic/clients/features_test.go @@ -6,7 +6,6 @@ import ( ) func TestAvailableFeatures_ChooseMicroversion(t *testing.T) { - microVersion := "1.95" type fields struct { MaxVersion int } @@ -23,18 +22,32 @@ func TestAvailableFeatures_ChooseMicroversion(t *testing.T) { want: baselineVersionString, }, { - name: fmt.Sprintf("MaxVersion = %d return %s", 89, microVersion), + name: "MaxVersion = 95 return 1.95", feature: fields{ MaxVersion: 95, }, - want: microVersion, + want: "1.95", }, { - name: fmt.Sprintf("MaxVersion > %d return %s", 89, microVersion), + name: "MaxVersion = 100 return 1.95", feature: fields{ MaxVersion: 100, }, - want: microVersion, + want: "1.95", + }, + { + name: "MaxVersion = 109 return 1.109", + feature: fields{ + MaxVersion: 109, + }, + want: "1.109", + }, + { + name: "MaxVersion > 109 return 1.109", + feature: fields{ + MaxVersion: 115, + }, + want: "1.109", }, } for _, tt := range tests { diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index afae65bf4d..e99cee6b78 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -2004,7 +2004,9 @@ func (p *ironicProvisioner) HasPowerFailure(ctx context.Context) bool { func (p *ironicProvisioner) GetHealth(ctx context.Context) string { node, err := p.getNode(ctx) if err != nil { - p.log.Error(err, "ignored error while checking health status") + if !errors.Is(err, provisioner.ErrNeedsRegistration) { + p.log.Error(err, "ignored error while checking health status") + } return "" } return node.Health