Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions internal/controller/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,16 +240,21 @@ func (r *BareMetalHostReconciler) Reconcile(ctx context.Context, request ctrl.Re
return result, err
}

// Always compute conditions since some (e.g. Healthy) can change
// based on external state from Ironic regardless of state machine
// transitions.
conditionsBefore := slices.Clone(host.GetConditions())
computeConditions(ctx, host, prov)
conditionsChanged := !reflect.DeepEqual(conditionsBefore, host.GetConditions())
Comment on lines +243 to +248
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

This turns condition recomputation into two extra Ironic reads per loop.

computeConditions() now runs on every reconcile, and for the ironic provisioner it calls both HasPowerFailure() and GetHealth(). In pkg/provisioner/ironic/ironic.go, both methods do their own getNode(ctx) lookup, so steady-state hosts pick up two additional node reads every cycle. Please cache the node-backed condition data per reconcile or plumb a single status snapshot into computeConditions. As per coding guidelines, "Focus on major issues impacting performance, readability, maintainability and security. Avoid nitpicks and avoid verbosity."

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/controller/metal3.io/baremetalhost_controller.go` around lines 243 -
250, computeConditions is being called every reconcile and for the ironic
provisioner results in two extra Ironic node reads because HasPowerFailure() and
GetHealth() each call getNode(ctx); fix by obtaining the node-backed status once
per reconcile and passing that snapshot into computeConditions (or by adding a
cached accessor on the ironic provisioner that does a single getNode(ctx) and
reuses the result). Concretely, call getNode(ctx) once before computeConditions,
create a node/status snapshot, and change computeConditions(host, prov) to
accept that snapshot (or add prov.GetNodeSnapshot(ctx) and have
HasPowerFailure/GetHealth use it), ensuring computeConditions uses the single
cached snapshot instead of triggering multiple getNode calls.


// Only save status when we're told to, otherwise we
// introduce an infinite loop reconciling the same object over and
// over when there is an unrecoverable error (tracked through the
// error state of the host).
if actResult.Dirty() {
// Save Host
if actResult.Dirty() || conditionsChanged {
info.log.Info("saving host status",
"operational status", host.OperationalStatus(),
"provisioning state", host.Status.Provisioning.State)
computeConditions(ctx, host, prov)
err = r.saveHostStatus(ctx, host)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to save host status after %q: %w", initialState, err)
Expand Down Expand Up @@ -2304,15 +2309,17 @@ func computeConditions(ctx context.Context, host *metal3api.BareMetalHost, prov
setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason)
return
}
switch prov.GetHealth(ctx) {
case provisioner.HealthOK:
setConditionTrue(host, metal3api.HealthyCondition, metal3api.HealthyReason)
case provisioner.HealthWarning:
setConditionFalse(host, metal3api.HealthyCondition, metal3api.WarningHealthReason)
case provisioner.HealthCritical:
setConditionFalse(host, metal3api.HealthyCondition, metal3api.CriticalHealthReason)
default:
setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason)
if health := prov.GetHealth(ctx); health != "" {
switch health {
case provisioner.HealthOK:
setConditionTrue(host, metal3api.HealthyCondition, metal3api.HealthyReason)
case provisioner.HealthWarning:
setConditionFalse(host, metal3api.HealthyCondition, metal3api.WarningHealthReason)
case provisioner.HealthCritical:
setConditionFalse(host, metal3api.HealthyCondition, metal3api.CriticalHealthReason)
default:
setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason)
}
}
}

Expand Down
25 changes: 17 additions & 8 deletions internal/controller/metal3.io/baremetalhost_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3187,12 +3187,6 @@ func TestComputeHealthyCondition(t *testing.T) {
ExpectedStatus metav1.ConditionStatus
ExpectedReason string
}{
{
Scenario: "empty health reports unknown",
Health: "",
ExpectedStatus: metav1.ConditionUnknown,
ExpectedReason: metal3api.UnknownHealthReason,
},
{
Scenario: "OK health",
Health: provisioner.HealthOK,
Expand Down Expand Up @@ -3232,6 +3226,17 @@ func TestComputeHealthyCondition(t *testing.T) {
assert.Equal(t, tc.ExpectedReason, cond.Reason)
})
}

t.Run("empty health leaves condition unchanged", func(t *testing.T) {
host := bmhWithStatus(metal3api.OperationalStatusOK, metal3api.StateAvailable)
fix := &fixture.Fixture{Health: ""}
prov, err := fix.NewProvisioner(t.Context(), provisioner.BuildHostData(*host, bmc.Credentials{}), nil)
require.NoError(t, err)
computeConditions(t.Context(), host, prov)

cond := conditions.Get(host, metal3api.HealthyCondition)
assert.Nil(t, cond, "empty health should not set the Healthy condition")
})
}

func TestComputeConditions(t *testing.T) {
Expand Down Expand Up @@ -3331,8 +3336,12 @@ func TestComputeConditions(t *testing.T) {
assert.True(t, conditions.IsFalse(tc.BareMetalHost, metal3api.ReadyCondition))
}
cond := conditions.Get(tc.BareMetalHost, metal3api.HealthyCondition)
require.NotNil(t, cond, "Healthy condition should always be set")
assert.Equal(t, metav1.ConditionUnknown, cond.Status, "Healthy should be Unknown when health is not reported")
if tc.provisioner == nil {
require.NotNil(t, cond, "Healthy condition should be set when provisioner is nil")
assert.Equal(t, metav1.ConditionUnknown, cond.Status, "Healthy should be Unknown when provisioner is nil")
} else {
assert.Nil(t, cond, "Healthy condition should not be set when provisioner reports no health data")
}
})
}
}
11 changes: 10 additions & 1 deletion pkg/provisioner/ironic/clients/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ func (af AvailableFeatures) Log(logger logr.Logger) {
"maxVersion", fmt.Sprintf("1.%d", af.MaxVersion),
"chosenVersion", af.ChooseMicroversion(),
"virtualMediaGET", af.HasVirtualMediaGetAPI(),
"disablePowerOff", af.HasDisablePowerOff())
"disablePowerOff", af.HasDisablePowerOff(),
"healthAPI", af.HasHealthAPI())
}

func (af AvailableFeatures) HasVirtualMediaGetAPI() bool {
Expand All @@ -56,7 +57,15 @@ func (af AvailableFeatures) HasDisablePowerOff() bool {
return af.MaxVersion >= 95 //nolint:mnd
}

func (af AvailableFeatures) HasHealthAPI() bool {
return af.MaxVersion >= 109 //nolint:mnd
}

func (af AvailableFeatures) ChooseMicroversion() string {
if af.HasHealthAPI() {
return "1.109"
}
Comment on lines +65 to +67
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# 1) Confirm global microversion assignment from ChooseMicroversion()
rg -n -C3 'p\.client\.Microversion\s*=\s*p\.availableFeatures\.ChooseMicroversion\(\)' pkg/provisioner/ironic

# 2) List Ironic node operations using the same shared client
rg -n -C2 'nodes\.[A-Za-z_]+\(' pkg/provisioner/ironic

# 3) Check for per-request microversion/header overrides (if any)
rg -n -C2 'X-OpenStack-Ironic-API-Version|OpenStack-API-Version|Microversion' pkg/provisioner/ironic

Repository: openshift/baremetal-operator

Length of output: 16585


🏁 Script executed:

# Find HasHealthAPI() definition and usage
rg -n -B2 -A5 'func.*HasHealthAPI' pkg/provisioner/ironic

# Find health-related operations or API calls
rg -n -i 'health' pkg/provisioner/ironic/clients/features.go

# Check what microversions are involved and their implications
rg -n '1\.10[0-9]' pkg/provisioner/ironic

Repository: openshift/baremetal-operator

Length of output: 1123


🏁 Script executed:

# Search for actual health endpoint/operation calls
rg -n -i 'health' pkg/provisioner/ironic/*.go

# Check if there's any health check method or similar
rg -n 'func.*health|checkHealth|getHealth' pkg/provisioner/ironic -i

# Look for what operations actually require 1.109
rg -n 'GetHealthStatus|health.*status' pkg/provisioner/ironic -i

Repository: openshift/baremetal-operator

Length of output: 2237


🏁 Script executed:

# Get the full implementation of GetHealth()
sed -n '2001,2020p' pkg/provisioner/ironic/ironic.go

# Check what getNode() does (referenced in GetHealth)
rg -n -A10 'func.*getNode\(' pkg/provisioner/ironic/ironic.go | head -30

# Verify if getNode uses the shared client
rg -n 'getNode.*p\.client' pkg/provisioner/ironic/ironic.go

Repository: openshift/baremetal-operator

Length of output: 945


🏁 Script executed:

# Confirm p.client is shared across all methods in the provisioner
rg -n 'p\.client\.' pkg/provisioner/ironic/ironic.go | wc -l

# Check if nodes.Get or any node operation supports per-request microversion override
rg -n 'type.*Opts' pkg/provisioner/ironic --include='*.go' | grep -i 'microversion\|version'

# Look at the client definition to understand if Microversion affects all requests
rg -n 'type.*ServiceClient' pkg/provisioner/ironic -A5

Repository: openshift/baremetal-operator

Length of output: 532


🏁 Script executed:

# Check the p.client field definition
rg -n 'p\.client' pkg/provisioner/ironic/ironic.go -B2 -A2 | head -20

# Look at the provisioner struct definition
rg -n 'type ironicProvisioner struct' pkg/provisioner/ironic/ironic.go -A10

# Check if gophercloud ServiceClient uses global Microversion
rg -n 'ServiceClient' pkg/provisioner/ironic/clients/client.go -B2 -A5

Repository: openshift/baremetal-operator

Length of output: 1640


🏁 Script executed:

# Check if there are comments or documentation about microversion 1.109 compatibility
rg -n '1\.109\|microversion.*compat' pkg/provisioner/ironic -B2 -A2

# Look for any test coverage showing issues with 1.109 on non-health operations
rg -n '1\.109' pkg/provisioner/ironic -l

# Check the baseline/conservative microversion to understand the risk delta
rg -n 'baselineVersionString\|conservative' pkg/provisioner/ironic/clients/client.go -B2 -A2

Repository: openshift/baremetal-operator

Length of output: 162


🏁 Script executed:

# Find baseline/conservative version definition
rg -n 'baselineVersionString\|1\.6\|1\.5' pkg/provisioner/ironic/clients/client.go -B2 -A2

# Get context around client initialization with versions
sed -n '85,105p' pkg/provisioner/ironic/clients/client.go

Repository: openshift/baremetal-operator

Length of output: 304


🏁 Script executed:

# Find baselineVersionString definition
rg -n 'baselineVersionString\s*=' pkg/provisioner/ironic/clients/client.go -B2 -A2

# Also check the constants file
rg -n 'const.*Version' pkg/provisioner/ironic/clients/ -B1 -A1

Repository: openshift/baremetal-operator

Length of output: 54


🏁 Script executed:

# Read the beginning of client.go to find baselineVersionString
head -30 pkg/provisioner/ironic/clients/client.go

# Also check imports and constants
rg -n 'baselineVersionString' pkg/provisioner/ironic/clients/client.go

Repository: openshift/baremetal-operator

Length of output: 1005


🏁 Script executed:

# Search all files in the package for baselineVersionString definition
rg -n 'baselineVersionString' pkg/provisioner/ironic/ -A1 -B1

# Check if it's defined in features.go or elsewhere
fd '\.go$' pkg/provisioner/ironic/clients | xargs grep -l 'baselineVersionString'

Repository: openshift/baremetal-operator

Length of output: 1227


🏁 Script executed:

# Find the baseline constant definition
rg -n 'const.*baseline\|baseline\s*=' pkg/provisioner/ironic/clients/features.go -B2 -A2

# Get broader context around baseline and baselineVersionString
sed -n '1,30p' pkg/provisioner/ironic/clients/features.go

Repository: openshift/baremetal-operator

Length of output: 1082


Global switch to 1.109 impacts all Ironic node operations on the shared client.

When HasHealthAPI() is true (MaxVersion >= 109), ChooseMicroversion() returns "1.109" instead of the conservative baseline "1.89". This global microversion is assigned to the shared client at pkg/provisioner/ironic/dependencies.go:21 via p.client.Microversion = p.availableFeatures.ChooseMicroversion(), affecting all subsequent node operations (Create, Get, Update, ChangeProvisionState, Delete, SetRAIDConfig, ListBIOSSettings, etc.), not just health reads. A 20-microversion jump introduces risk of behavior drift in non-health flows. Consider keeping baseline 1.89 globally and using 1.109 only on the health-read path (or a dedicated client).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/provisioner/ironic/clients/features.go` around lines 65 - 67,
ChooseMicroversion() currently returns "1.109" when af.HasHealthAPI() is true,
which causes p.client.Microversion (set from
availableFeatures.ChooseMicroversion()) to be bumped globally and affect all
node operations; change the behavior so ChooseMicroversion() returns the
conservative baseline "1.89" for the shared client and only use "1.109" for
health reads by creating a dedicated health client or temporarily cloning the
client and setting its Microversion in the health-read path (e.g., around the
health read call/site), ensuring other methods like Create, Get, Update,
ChangeProvisionState, Delete, SetRAIDConfig, ListBIOSSettings continue using the
baseline microversion.


if af.HasDisablePowerOff() {
return "1.95"
}
Expand Down
23 changes: 18 additions & 5 deletions pkg/provisioner/ironic/clients/features_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
)

func TestAvailableFeatures_ChooseMicroversion(t *testing.T) {
microVersion := "1.95"
type fields struct {
MaxVersion int
}
Expand All @@ -23,18 +22,32 @@ func TestAvailableFeatures_ChooseMicroversion(t *testing.T) {
want: baselineVersionString,
},
{
name: fmt.Sprintf("MaxVersion = %d return %s", 89, microVersion),
name: "MaxVersion = 95 return 1.95",
feature: fields{
MaxVersion: 95,
},
want: microVersion,
want: "1.95",
},
{
name: fmt.Sprintf("MaxVersion > %d return %s", 89, microVersion),
name: "MaxVersion = 100 return 1.95",
feature: fields{
MaxVersion: 100,
},
want: microVersion,
want: "1.95",
},
{
name: "MaxVersion = 109 return 1.109",
feature: fields{
MaxVersion: 109,
},
want: "1.109",
},
{
name: "MaxVersion > 109 return 1.109",
feature: fields{
MaxVersion: 115,
},
want: "1.109",
},
}
for _, tt := range tests {
Expand Down
4 changes: 3 additions & 1 deletion pkg/provisioner/ironic/ironic.go
Original file line number Diff line number Diff line change
Expand Up @@ -2004,7 +2004,9 @@ func (p *ironicProvisioner) HasPowerFailure(ctx context.Context) bool {
func (p *ironicProvisioner) GetHealth(ctx context.Context) string {
node, err := p.getNode(ctx)
if err != nil {
p.log.Error(err, "ignored error while checking health status")
if !errors.Is(err, provisioner.ErrNeedsRegistration) {
p.log.Error(err, "ignored error while checking health status")
}
return ""
}
return node.Health
Expand Down