From 0900fac413be4133d7a7ebd8d9ef0542100a2db4 Mon Sep 17 00:00:00 2001 From: Robert Smith Date: Mon, 28 Jul 2025 22:36:38 +0100 Subject: [PATCH] Add support for explicitly enabling XIDs in health checks This change adds support for users to explicitly set specific XIDs as fatal errors that cause a device to be marked as unhealth. When used in conjunction with the ability to ignore ALL XIDs, the set of XIDs that are fatal can be set. To do this, an environment variable DP_ENABLE_HEALTHCHECKS that mirrors the existing DP_DISABLE_HEALTHCHECKS envvar is added. Explicitly enabling an XID overrides disabling them. Signed-off-by: Robert Smith Co-Authored-by: Evan Lezar --- internal/rm/health.go | 143 +++++++++++++++++++++--------- internal/rm/health_test.go | 175 ++++++++++++++++++++++++++++++++++--- 2 files changed, 265 insertions(+), 53 deletions(-) diff --git a/internal/rm/health.go b/internal/rm/health.go index a1989cd88..5dcc2570d 100644 --- a/internal/rm/health.go +++ b/internal/rm/health.go @@ -32,16 +32,18 @@ const ( // disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that // this is in addition to the Application errors that are already ignored. envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS" - allHealthChecks = "xids" + // envEnableHealthChecks defines the environment variable that is checked to + // determine which XIDs should be explicitly enabled. XIDs specified here + // override the ones specified in the `DP_DISABLE_HEALTHCHECKS`. + // Note that this also allows individual XIDs to be selected when ALL XIDs + // are disabled. + envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS" ) // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error { - disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) - if disableHealthChecks == "all" { - disableHealthChecks = allHealthChecks - } - if strings.Contains(disableHealthChecks, "xids") { + xids := getHealthCheckXids() + if xids.IsAllDisabled() { return nil } @@ -59,26 +61,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } }() - // FIXME: formalize the full list and document it. - // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 - // Application errors: the GPU should still be healthy - ignoredXids := []uint64{ - 13, // Graphics Engine Exception - 31, // GPU memory page fault - 43, // GPU stopped processing - 45, // Preemptive cleanup, due to previous errors - 68, // Video processor exception - 109, // Context Switch Timeout Error - } - - skippedXids := make(map[uint64]bool) - for _, id := range ignoredXids { - skippedXids[id] = true - } - - for _, additionalXid := range getAdditionalXids(disableHealthChecks) { - skippedXids[additionalXid] = true - } + klog.Infof("Using XIDs for health checks: %v", xids) eventSet, ret := r.nvml.EventSetCreate() if ret != nvml.SUCCESS { @@ -152,7 +135,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic continue } - if skippedXids[e.EventData] { + if xids.IsDisabled(e.EventData) { klog.Infof("Skipping event %+v", e) continue } @@ -188,29 +171,109 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } } -// getAdditionalXids returns a list of additional Xids to skip from the specified string. -// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values -// are ignored. -func getAdditionalXids(input string) []uint64 { - if input == "" { - return nil +const allXIDs = 0 + +// disabledXIDs stores a map of explicitly disabled XIDs. +// The special XID `allXIDs` indicates that all XIDs are disabled, but does +// allow for specific XIDs to be enabled even if this is the case. +type disabledXIDs map[uint64]bool + +// Disabled returns whether XID-based health checks are disabled. +// These are considered if all XIDs have been disabled AND no other XIDs have +// been explcitly enabled. +func (h disabledXIDs) IsAllDisabled() bool { + if allDisabled, ok := h[allXIDs]; ok { + return allDisabled } + // At this point we wither have explicitly disabled XIDs or explicitly + // enabled XIDs. Since ANY XID that's not specified is assumed enabled, we + // return here. + return false +} - var additionalXids []uint64 - for _, additionalXid := range strings.Split(input, ",") { - trimmed := strings.TrimSpace(additionalXid) +// IsDisabled checks whether the specified XID has been explicitly disalbled. +// An XID is considered disabled if it has been explicitly disabled, or all XIDs +// have been disabled. +func (h disabledXIDs) IsDisabled(xid uint64) bool { + // Handle the case where enabled=all. + if explicitAll, ok := h[allXIDs]; ok && !explicitAll { + return false + } + // Handle the case where the XID has been specifically enabled (or disabled) + if disabled, ok := h[xid]; ok { + return disabled + } + return h.IsAllDisabled() +} + +// getHealthCheckXids returns the XIDs that are considered fatal. +// Here we combine the following (in order of precedence): +// * A list of explicitly disabled XIDs (including all XIDs) +// * A list of hardcoded disabled XIDs +// * A list of explicitly enabled XIDs (including all XIDs) +// +// Note that if an XID is explicitly enabled, this takes precedence over it +// having been disabled either explicitly or implicitly. +func getHealthCheckXids() disabledXIDs { + disabled := newHealthCheckXIDs( + // TODO: We should not read the envvar here directly, but instead + // "upgrade" this to a top-level config option. + strings.Split(strings.ToLower(os.Getenv(envDisableHealthChecks)), ",")..., + ) + enabled := newHealthCheckXIDs( + // TODO: We should not read the envvar here directly, but instead + // "upgrade" this to a top-level config option. + strings.Split(strings.ToLower(os.Getenv(envEnableHealthChecks)), ",")..., + ) + + // Add the list of hardcoded disabled (ignored) XIDs: + // FIXME: formalize the full list and document it. + // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 + // Application errors: the GPU should still be healthy + ignoredXids := []uint64{ + 13, // Graphics Engine Exception + 31, // GPU memory page fault + 43, // GPU stopped processing + 45, // Preemptive cleanup, due to previous errors + 68, // Video processor exception + 109, // Context Switch Timeout Error + } + for _, ignored := range ignoredXids { + disabled[ignored] = true + } + + // Explicitly ENABLE specific XIDs, + for enabled := range enabled { + disabled[enabled] = false + } + return disabled +} + +// newHealthCheckXIDs converts a list of Xids to a healthCheckXIDs map. +// Special xid values 'all' and 'xids' return a special map that matches all +// xids. +// For other xids, these are converted to a uint64 values with invalid values +// being ignored. +func newHealthCheckXIDs(xids ...string) disabledXIDs { + output := make(disabledXIDs) + for _, xid := range xids { + trimmed := strings.TrimSpace(xid) + if trimmed == "all" || trimmed == "xids" { + // TODO: We should have a different type for "all" and "all-except" + return disabledXIDs{allXIDs: true} + } if trimmed == "" { continue } - xid, err := strconv.ParseUint(trimmed, 10, 64) + id, err := strconv.ParseUint(trimmed, 10, 64) if err != nil { klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err) continue } - additionalXids = append(additionalXids, xid) - } - return additionalXids + output[id] = true + } + return output } // getDevicePlacement returns the placement of the specified device. diff --git a/internal/rm/health_test.go b/internal/rm/health_test.go index 101aadf78..f623c550f 100644 --- a/internal/rm/health_test.go +++ b/internal/rm/health_test.go @@ -18,57 +18,206 @@ package rm import ( "fmt" + "strings" "testing" "github.com/stretchr/testify/require" ) -func TestGetAdditionalXids(t *testing.T) { +func TestNewHealthCheckXIDs(t *testing.T) { testCases := []struct { input string - expected []uint64 + expected disabledXIDs }{ - {}, { - input: ",", + expected: disabledXIDs{}, }, { - input: "not-an-int", + input: ",", + expected: disabledXIDs{}, + }, + { + input: "not-an-int", + expected: disabledXIDs{}, }, { input: "68", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { - input: "-68", + input: "-68", + expected: disabledXIDs{}, }, { input: "68 ", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { input: "68,", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { input: ",68", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { input: "68,67", - expected: []uint64{68, 67}, + expected: disabledXIDs{67: true, 68: true}, }, { input: "68,not-an-int,67", - expected: []uint64{68, 67}, + expected: disabledXIDs{67: true, 68: true}, }, } for i, tc := range testCases { t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { - xids := getAdditionalXids(tc.input) + xids := newHealthCheckXIDs(strings.Split(tc.input, ",")...) require.EqualValues(t, tc.expected, xids) }) } } + +func TestGetHealthCheckXids(t *testing.T) { + testCases := []struct { + description string + enabled string + disabled string + expectedAllDisabled bool + expectedContents disabledXIDs + expectedDisabled map[uint64]bool + }{ + { + description: "empty envvars are default disabled", + expectedAllDisabled: false, + expectedContents: disabledXIDs{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + }, + { + description: "disabled is all", + disabled: "all", + expectedAllDisabled: true, + expectedContents: disabledXIDs{ + 0: true, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + 555: true, + }, + }, + { + description: "disabled is xids", + disabled: "xids", + expectedAllDisabled: true, + expectedContents: disabledXIDs{ + 0: true, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + 555: true, + }, + }, + { + description: "enabled is all", + enabled: "all", + expectedAllDisabled: false, + expectedContents: disabledXIDs{ + 0: false, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: false, + 31: false, + 43: false, + 45: false, + 68: false, + 109: false, + 555: false, + }, + }, + { + description: "enabled overrides disabled", + disabled: "11", + enabled: "11", + expectedAllDisabled: false, + expectedContents: disabledXIDs{ + 11: false, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 11: false, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + 555: false, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + t.Setenv(envDisableHealthChecks, tc.disabled) + t.Setenv(envEnableHealthChecks, tc.enabled) + + xids := getHealthCheckXids() + require.EqualValues(t, tc.expectedContents, xids) + require.Equal(t, tc.expectedAllDisabled, xids.IsAllDisabled()) + + disabled := make(map[uint64]bool) + for xid := range tc.expectedDisabled { + disabled[xid] = xids.IsDisabled(xid) + } + require.Equal(t, tc.expectedDisabled, disabled) + }) + } +}