Skip to content

Commit 14f5202

Browse files
Remove shell dependency from validator pods
NVIDIA's distroless-cc `-dev` tag (the gpu-operator image base) will no longer be approved as a STIG parent image. The non-`-dev` variant ships no shell, so the validator daemonsets and workload validation pods — which wrapped binaries in `sh -c` and used shell-based preStop hooks — would break on the new base. Re-adding a shell to the image would only swap one CVE source for another. Replace shell wrappers with direct binary invocation. The operator-validator and sandbox-validator init containers invoke `nvidia-validator` directly. Their pause containers use a new top-level `--sleep` flag that prints the validator-success message and blocks until SIGTERM, SIGINT, or SIGHUP arrives. Workload pod main containers run `nvidia-validator --version` as a no-op that exits 0; the per-workload success message now prints from `(c *CUDA).runWorkload` and `(p *Plugin).runWorkload` after `waitForPod` succeeds — surfacing in the operator-validator init container logs where success is actually established. For preStop cleanup, add a small static helper `rmglob` that takes glob patterns and removes matching paths. It is modeled on k8s-cc-manager's vendored static `/bin/rm` and is shipped at `/usr/bin/rmglob`. Both validator daemonsets keep their `lifecycle.preStop` blocks; they now call this binary instead of `sh -c rm`. Flip the Dockerfile base to `nvcr.io/nvidia/distroless/cc:v4.0.0`. Signed-off-by: Rajath Agasthya <ragasthya@nvidia.com>
1 parent b927469 commit 14f5202

9 files changed

Lines changed: 288 additions & 46 deletions

File tree

assets/state-operator-validation/0500_daemonset.yaml

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ spec:
2828
initContainers:
2929
- name: driver-validation
3030
image: "FILLED BY THE OPERATOR"
31-
command: ['sh', '-c']
32-
args: ["nvidia-validator"]
31+
command: ["nvidia-validator"]
3332
env:
3433
- name: WITH_WAIT
3534
value: "true"
@@ -58,8 +57,7 @@ spec:
5857
mountPath: /host-dev-char
5958
- name: toolkit-validation
6059
image: "FILLED BY THE OPERATOR"
61-
command: ['sh', '-c']
62-
args: ["nvidia-validator"]
60+
command: ["nvidia-validator"]
6361
env:
6462
- name: NVIDIA_VISIBLE_DEVICES
6563
value: "all"
@@ -75,8 +73,7 @@ spec:
7573
mountPropagation: Bidirectional
7674
- name: cuda-validation
7775
image: "FILLED BY THE OPERATOR"
78-
command: ['sh', '-c']
79-
args: ["nvidia-validator"]
76+
command: ["nvidia-validator"]
8077
env:
8178
- name: WITH_WAIT
8279
value: "false"
@@ -98,8 +95,7 @@ spec:
9895
mountPropagation: Bidirectional
9996
- name: plugin-validation
10097
image: "FILLED BY THE OPERATOR"
101-
command: ['sh', '-c']
102-
args: ["nvidia-validator"]
98+
command: ["nvidia-validator"]
10399
env:
104100
- name: COMPONENT
105101
value: plugin
@@ -126,14 +122,14 @@ spec:
126122
containers:
127123
- image: "FILLED BY THE OPERATOR"
128124
name: nvidia-operator-validator
129-
command: ['sh', '-c']
130-
args: ["echo all validations are successful; while true; do sleep 86400; done"]
125+
command: ["nvidia-validator"]
126+
args: ["--sleep"]
131127
securityContext:
132128
privileged: true
133129
lifecycle:
134130
preStop:
135131
exec:
136-
command: ["sh", "-c", "rm -f /run/nvidia/validations/*-ready"]
132+
command: ["/usr/bin/rmglob", "/run/nvidia/validations/*-ready"]
137133
volumeMounts:
138134
- name: run-nvidia-validations
139135
mountPath: "/run/nvidia/validations"

assets/state-sandbox-validation/0500_daemonset.yaml

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ spec:
2828
initContainers:
2929
- name: cc-manager-validation
3030
image: "FILLED BY THE OPERATOR"
31-
command: ['sh', '-c']
32-
args: ["nvidia-validator"]
31+
command: ["nvidia-validator"]
3332
env:
3433
- name: WITH_WAIT
3534
value: "true"
@@ -49,8 +48,7 @@ spec:
4948
mountPropagation: Bidirectional
5049
- name: vfio-pci-validation
5150
image: "FILLED BY THE OPERATOR"
52-
command: ['sh', '-c']
53-
args: ["nvidia-validator"]
51+
command: ["nvidia-validator"]
5452
env:
5553
- name: WITH_WAIT
5654
value: "true"
@@ -74,8 +72,7 @@ spec:
7472
mountPropagation: Bidirectional
7573
- name: vgpu-manager-validation
7674
image: "FILLED BY THE OPERATOR"
77-
command: ['sh', '-c']
78-
args: ["nvidia-validator"]
75+
command: ["nvidia-validator"]
7976
env:
8077
- name: WITH_WAIT
8178
value: "true"
@@ -102,8 +99,7 @@ spec:
10299
mountPropagation: Bidirectional
103100
- name: vgpu-devices-validation
104101
image: "FILLED BY THE OPERATOR"
105-
command: ['sh', '-c']
106-
args: ["nvidia-validator"]
102+
command: ["nvidia-validator"]
107103
env:
108104
- name: WITH_WAIT
109105
value: "true"
@@ -122,14 +118,14 @@ spec:
122118
containers:
123119
- image: "FILLED BY THE OPERATOR"
124120
name: nvidia-sandbox-validator
125-
command: ['sh', '-c']
126-
args: ["echo all validations are successful; while true; do sleep 86400; done"]
121+
command: ["nvidia-validator"]
122+
args: ["--sleep"]
127123
securityContext:
128124
privileged: true
129125
lifecycle:
130126
preStop:
131127
exec:
132-
command: ["sh", "-c", "rm -f /run/nvidia/validations/*"]
128+
command: ["/usr/bin/rmglob", "/run/nvidia/validations/*"]
133129
volumeMounts:
134130
- name: run-nvidia-validations
135131
mountPath: "/run/nvidia/validations"

cmd/nvidia-validator/main.go

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ var (
135135
hostRootFlag string
136136
driverInstallDirFlag string
137137
driverInstallDirCtrPathFlag string
138+
sleepFlag bool
138139
)
139140

140141
// defaultGPUWorkloadConfig is "vm-passthrough" unless
@@ -375,14 +376,17 @@ func main() {
375376
Destination: &driverInstallDirCtrPathFlag,
376377
Sources: cli.EnvVars("DRIVER_INSTALL_DIR_CTR_PATH"),
377378
},
379+
&cli.BoolFlag{
380+
Name: "sleep",
381+
Usage: "after any other action, print the validator-success message and block until SIGTERM/SIGINT/SIGHUP, then exit 0",
382+
Destination: &sleepFlag,
383+
Sources: cli.EnvVars("SLEEP"),
384+
},
378385
}
379386

380387
// Log version info
381388
log.Infof("version: %s", c.Version)
382389

383-
// Handle signals
384-
go handleSignal()
385-
386390
// invoke command
387391
err := c.Run(context.Background(), os.Args)
388392
if err != nil {
@@ -404,6 +408,10 @@ func handleSignal() {
404408

405409
func validateFlags(ctx context.Context, cli *cli.Command) (context.Context, error) {
406410
if componentFlag == "" {
411+
// Standalone --sleep mode does not require a component.
412+
if sleepFlag {
413+
return ctx, nil
414+
}
407415
return ctx, fmt.Errorf("invalid -c <component-name> flag: must not be empty string")
408416
}
409417
if !isValidComponent() {
@@ -509,24 +517,59 @@ func getWorkloadConfig(ctx context.Context) (string, error) {
509517
}
510518

511519
func start(ctx context.Context, cli *cli.Command) error {
512-
// if cleanup is requested, delete all existing status files(default)
513-
if cleanupAllFlag {
514-
// cleanup output directory and create again each time
515-
err := os.RemoveAll(outputDirFlag)
516-
if err != nil {
517-
if !os.IsNotExist(err) {
518-
return err
520+
// In sleep mode, runSleep installs its own signal handler. Otherwise
521+
// preserve legacy behavior: any signal terminates the process.
522+
if !sleepFlag {
523+
go handleSignal()
524+
}
525+
526+
if componentFlag != "" {
527+
// if cleanup is requested, delete all existing status files(default)
528+
if cleanupAllFlag {
529+
// cleanup output directory and create again each time
530+
err := os.RemoveAll(outputDirFlag)
531+
if err != nil {
532+
if !os.IsNotExist(err) {
533+
return err
534+
}
519535
}
520536
}
537+
538+
// create status directory
539+
err := os.Mkdir(outputDirFlag, 0755)
540+
if err != nil && !os.IsExist(err) {
541+
return err
542+
}
543+
544+
if err := validateComponent(ctx, componentFlag); err != nil {
545+
return err
546+
}
521547
}
522548

523-
// create status directory
524-
err := os.Mkdir(outputDirFlag, 0755)
525-
if err != nil && !os.IsExist(err) {
526-
return err
549+
if sleepFlag {
550+
return runSleep(ctx)
527551
}
552+
return nil
553+
}
528554

529-
return validateComponent(ctx, componentFlag)
555+
// runSleep prints the validator-success message and blocks until a
556+
// termination signal arrives, then exits cleanly. Per-pod cleanup of
557+
// status markers is handled separately by the rmglob binary invoked
558+
// from `lifecycle.preStop`.
//
// It wakes up when either ctx is canceled or one of SIGTERM, SIGINT or
// SIGHUP is received, and it returns nil in both cases.
559+
func runSleep(ctx context.Context) error {
560+
fmt.Println("all validations are successful")
561+
562+
// Buffered by one: the os/signal package never blocks when delivering to
// the channel, so an unbuffered channel could drop a signal that arrives
// before the select below is reached.
sigCh := make(chan os.Signal, 1)
563+
signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP)
564+
// Stop relaying signals to sigCh once this function returns.
defer signal.Stop(sigCh)
565+
566+
select {
567+
case <-ctx.Done():
568+
log.Infof("context canceled")
569+
case s := <-sigCh:
570+
log.Infof("received signal %s", s)
571+
}
572+
// Neither wake-up path is reported as an error.
return nil
530573
}
531574

532575
func validateComponent(ctx context.Context, componentFlag string) error {
@@ -1368,6 +1411,7 @@ func (p *Plugin) runWorkload() error {
13681411
if err != nil {
13691412
return err
13701413
}
1414+
fmt.Println("device-plugin workload validation is successful")
13711415
return nil
13721416
}
13731417

@@ -1621,6 +1665,7 @@ func (c *CUDA) runWorkload() error {
16211665
if err != nil {
16221666
return err
16231667
}
1668+
fmt.Println("cuda workload validation is successful")
16241669
return nil
16251670
}
16261671

cmd/nvidia-validator/main_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ package main
1919
import (
2020
"context"
2121
"os"
22+
"syscall"
2223
"testing"
24+
"time"
2325
)
2426

2527
func Test_isValidComponent(t *testing.T) {
@@ -216,3 +218,72 @@ UNKNOWN_FEATURE: true`,
216218
})
217219
}
218220
}
221+
222+
// Test_validateFlags_standaloneSleep checks how validateFlags treats the
// combination of the component and sleep flags: an empty component is only
// accepted when sleep is set, and a valid component is accepted either way.
func Test_validateFlags_standaloneSleep(t *testing.T) {
223+
tests := []struct {
224+
name string
225+
component string
226+
sleep bool
227+
wantErr bool
228+
}{
229+
{name: "no component, no sleep: error", wantErr: true},
230+
{name: "no component, sleep: ok", sleep: true},
231+
{name: "valid component, no sleep: ok", component: "driver"},
232+
{name: "valid component, sleep: ok", component: "driver", sleep: true},
233+
}
234+
for _, tt := range tests {
235+
t.Run(tt.name, func(t *testing.T) {
236+
// validateFlags reads package-level flag variables, so each case
// overwrites them and restores the previous values when it finishes.
origComponent, origSleep := componentFlag, sleepFlag
237+
componentFlag, sleepFlag = tt.component, tt.sleep
238+
defer func() {
239+
componentFlag, sleepFlag = origComponent, origSleep
240+
}()
241+
242+
// The command argument is passed as nil.
// NOTE(review): presumably validateFlags does not dereference it on
// the paths exercised here — confirm against the full function.
_, err := validateFlags(context.Background(), nil)
243+
if tt.wantErr && err == nil {
244+
t.Errorf("validateFlags() expected error, got nil")
245+
}
246+
if !tt.wantErr && err != nil {
247+
t.Errorf("validateFlags() unexpected error: %v", err)
248+
}
249+
})
250+
}
251+
}
252+
253+
// Test_runSleep_returnsOnSignal runs runSleep in a goroutine, sends SIGTERM
// to the current process, and expects runSleep to return nil within two
// seconds.
//
// NOTE(review): the fixed 50ms sleep is the only synchronization with
// runSleep's signal.Notify call. If the handler is not registered by then,
// the SIGTERM is delivered to the test process with Go's default handling,
// which would terminate the test binary rather than fail this test — consider
// a deterministic readiness signal.
func Test_runSleep_returnsOnSignal(t *testing.T) {
254+
// Buffered so the goroutine can deliver its result and exit even if the
// test has already given up waiting.
errCh := make(chan error, 1)
255+
go func() { errCh <- runSleep(context.Background()) }()
256+
257+
// Give runSleep a moment to install its signal handler before sending.
258+
time.Sleep(50 * time.Millisecond)
259+
// The signal targets this process itself (syscall.Getpid).
if err := syscall.Kill(syscall.Getpid(), syscall.SIGTERM); err != nil {
260+
t.Fatalf("kill: %v", err)
261+
}
262+
263+
select {
264+
case err := <-errCh:
265+
if err != nil {
266+
t.Errorf("runSleep returned error: %v", err)
267+
}
268+
case <-time.After(2 * time.Second):
269+
t.Fatalf("runSleep did not return within 2s of SIGTERM")
270+
}
271+
}
272+
273+
// Test_runSleep_contextCancel runs runSleep in a goroutine with a cancelable
// context, cancels it, and expects runSleep to return nil within two seconds.
func Test_runSleep_contextCancel(t *testing.T) {
274+
ctx, cancel := context.WithCancel(context.Background())
275+
// Buffered so the goroutine can deliver its result and exit even if the
// test has already given up waiting.
errCh := make(chan error, 1)
276+
go func() { errCh <- runSleep(ctx) }()
277+
278+
// Brief pause before canceling so runSleep has a chance to start blocking.
time.Sleep(50 * time.Millisecond)
279+
cancel()
280+
281+
select {
282+
case err := <-errCh:
283+
if err != nil {
284+
t.Errorf("runSleep returned error: %v", err)
285+
}
286+
case <-time.After(2 * time.Second):
287+
t.Fatalf("runSleep did not return within 2s of context cancel")
288+
}
289+
}

cmd/rmglob/main.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
// rmglob is a tiny static helper binary that expands one or more glob
18+
// patterns and removes the matching paths. It exists so that distroless
19+
// gpu-operator container images can run path cleanup from a Kubernetes
20+
// `lifecycle.preStop` hook without needing a shell on the image.
21+
//
22+
// It is the path-cleanup analog of k8s-cc-manager's vendored static `/bin/rm`.
23+
package main
24+
25+
import (
26+
"fmt"
27+
"os"
28+
"path/filepath"
29+
)
30+
31+
func main() {
32+
if len(os.Args) < 2 {
33+
fmt.Fprintln(os.Stderr, "usage: rmglob <glob>...")
34+
os.Exit(2)
35+
}
36+
37+
var failed bool
38+
for _, pattern := range os.Args[1:] {
39+
matches, err := filepath.Glob(pattern)
40+
if err != nil {
41+
//#nosec G705 -- stderr diagnostic, not a network-reachable sink
42+
fmt.Fprintf(os.Stderr, "rmglob: invalid pattern %q: %v\n", pattern, err)
43+
failed = true
44+
continue
45+
}
46+
for _, m := range matches {
47+
// Path removal is the binary's sole purpose; the patterns come from
48+
// gpu-operator-rendered manifests, not external user input.
49+
//#nosec G703 -- intentional path removal
50+
if err := os.RemoveAll(m); err != nil {
51+
//#nosec G705 -- stderr diagnostic, not a network-reachable sink
52+
fmt.Fprintf(os.Stderr, "rmglob: remove %q: %v\n", m, err)
53+
failed = true
54+
}
55+
}
56+
}
57+
if failed {
58+
os.Exit(1)
59+
}
60+
}

0 commit comments

Comments
 (0)