From df3a86f10ba3bcd77e727e428e00a390305d540a Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Wed, 25 Mar 2026 19:26:54 +0000 Subject: [PATCH 1/2] Adding env var to confirm CDI is enabled Signed-off-by: Vishesh Tanksale --- .gitignore | 2 ++ cmd/compute-domain-daemon/main.go | 7 +++++++ cmd/compute-domain-kubelet-plugin/computedomain.go | 1 + internal/common/util.go | 9 +++++++++ 4 files changed, 19 insertions(+) diff --git a/.gitignore b/.gitignore index 15acf8294..f2a3fe8ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ .cache/ .bash_history /compute-domain-controller +/compute-domain-daemon /compute-domain-kubelet-plugin /gpu-kubelet-plugin +/webhook .idea [._]*.sw[a-p] coverage.out diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index 03e95c394..d85f6f9c1 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -204,6 +204,13 @@ func newApp() *cli.App { // Run invokes the IMEX daemon and manages its lifecycle. func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { + // Verify that CDI container edits were applied by the container runtime. + // If the env var is not set to "true", CDI is likely disabled and the daemon + // cannot function correctly (e.g. the /imexd mount will be missing). + if os.Getenv(common.CDIEditsAppliedEnvKey) != common.CDIEditsAppliedEnvValue { + return fmt.Errorf("CDI container edits did not apply -- is CDI enabled in your container runtime?") + } + common.StartDebugSignalHandlers() // Validate feature gate dependencies diff --git a/cmd/compute-domain-kubelet-plugin/computedomain.go b/cmd/compute-domain-kubelet-plugin/computedomain.go index ac87ff7fe..908d55f7a 100644 --- a/cmd/compute-domain-kubelet-plugin/computedomain.go +++ b/cmd/compute-domain-kubelet-plugin/computedomain.go @@ -162,6 +162,7 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsCommon(ctx context.Con edits := &cdiapi.ContainerEdits{ ContainerEdits: &cdispec.ContainerEdits{ Env: []string{ + fmt.Sprintf("%s=%s", common.CDIEditsAppliedEnvKey, common.CDIEditsAppliedEnvValue), fmt.Sprintf("CLIQUE_ID=%s", s.manager.cliqueID), fmt.Sprintf("COMPUTE_DOMAIN_UUID=%s", cd.UID), fmt.Sprintf("COMPUTE_DOMAIN_NAME=%s", cd.Name), diff --git a/internal/common/util.go b/internal/common/util.go index 65c6edfae..e1cee46fc 100644 --- a/internal/common/util.go +++ b/internal/common/util.go @@ -29,6 +29,15 @@ import ( const dumpPath = "/tmp/goroutine-stacks.dump" +// CDIEditsAppliedEnvKey and CDIEditsAppliedEnvValue is a key/value +// pair injected via CDI container edits into the CD daemon container. The CD +// daemon checks for this key/value pair at startup to verify that CDI edits were +// applied by the container runtime. If not present, CDI is likely disabled. +const ( + CDIEditsAppliedEnvKey = "NVIDIA_CDI_EDITS_APPLIED" + CDIEditsAppliedEnvValue = "true" +) + // Set up SIGUSR2 handler: if triggered, acquire stack traces for all goroutines // in this process. Dump to file, and fall back to emitting to stderr if file // output didn't work. From ac173d72789e5c7099d43a4380fd33a8586f07bd Mon Sep 17 00:00:00 2001 From: Vishesh Tanksale Date: Wed, 25 Mar 2026 19:34:54 +0000 Subject: [PATCH 2/2] Using existing env var to confirm CDI is enabled Signed-off-by: Vishesh Tanksale --- cmd/compute-domain-daemon/main.go | 9 +++++---- cmd/compute-domain-kubelet-plugin/computedomain.go | 1 - internal/common/util.go | 9 --------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/cmd/compute-domain-daemon/main.go b/cmd/compute-domain-daemon/main.go index d85f6f9c1..fdee03466 100644 --- a/cmd/compute-domain-daemon/main.go +++ b/cmd/compute-domain-daemon/main.go @@ -204,10 +204,11 @@ func newApp() *cli.App { // Run invokes the IMEX daemon and manages its lifecycle. func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error { - // Verify that CDI container edits were applied by the container runtime. - // If the env var is not set to "true", CDI is likely disabled and the daemon - // cannot function correctly (e.g. the /imexd mount will be missing). - if os.Getenv(common.CDIEditsAppliedEnvKey) != common.CDIEditsAppliedEnvValue { + // Verify that CDI container edits were applied by the container runtime by + // checking for COMPUTE_DOMAIN_UUID, which is always injected as part of the + // CDI edits. If it is missing, CDI is likely disabled and the daemon cannot + // function correctly (e.g. the /imexd mount will be missing). + if flags.computeDomainUUID == "" { return fmt.Errorf("CDI container edits did not apply -- is CDI enabled in your container runtime?") } diff --git a/cmd/compute-domain-kubelet-plugin/computedomain.go b/cmd/compute-domain-kubelet-plugin/computedomain.go index 908d55f7a..ac87ff7fe 100644 --- a/cmd/compute-domain-kubelet-plugin/computedomain.go +++ b/cmd/compute-domain-kubelet-plugin/computedomain.go @@ -162,7 +162,6 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsCommon(ctx context.Con edits := &cdiapi.ContainerEdits{ ContainerEdits: &cdispec.ContainerEdits{ Env: []string{ - fmt.Sprintf("%s=%s", common.CDIEditsAppliedEnvKey, common.CDIEditsAppliedEnvValue), fmt.Sprintf("CLIQUE_ID=%s", s.manager.cliqueID), fmt.Sprintf("COMPUTE_DOMAIN_UUID=%s", cd.UID), fmt.Sprintf("COMPUTE_DOMAIN_NAME=%s", cd.Name), diff --git a/internal/common/util.go b/internal/common/util.go index e1cee46fc..65c6edfae 100644 --- a/internal/common/util.go +++ b/internal/common/util.go @@ -29,15 +29,6 @@ import ( const dumpPath = "/tmp/goroutine-stacks.dump" -// CDIEditsAppliedEnvKey and CDIEditsAppliedEnvValue is a key/value -// pair injected via CDI container edits into the CD daemon container. The CD -// daemon checks for this key/value pair at startup to verify that CDI edits were -// applied by the container runtime. If not present, CDI is likely disabled. -const ( - CDIEditsAppliedEnvKey = "NVIDIA_CDI_EDITS_APPLIED" - CDIEditsAppliedEnvValue = "true" -) - // Set up SIGUSR2 handler: if triggered, acquire stack traces for all goroutines // in this process. Dump to file, and fall back to emitting to stderr if file // output didn't work.