Skip to content

Commit be705a9

Browse files
committed
Switch gpu-operator image to distroless base image
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
1 parent b28606d commit be705a9

10 files changed

Lines changed: 38 additions & 81 deletions

File tree

.common-ci.yml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,6 @@ trigger-pipeline:
8282
- '[[ -n "${SKIP_QEMU_SETUP}" ]] || docker run --rm --privileged multiarch/qemu-user-static --reset -p yes'
8383

8484
# Define targets for the gpu-operator image
85-
.dist-ubi9:
86-
variables:
87-
DIST: ubi9
88-
CVE_UPDATES: "cyrus-sasl-lib"
89-
9085
.target-gpu-operator:
9186
variables:
9287
IMAGE_NAME: "${CI_REGISTRY_IMAGE}"
@@ -122,7 +117,7 @@ trigger-pipeline:
122117

123118
# Since OUT_IMAGE_NAME and OUT_IMAGE_VERSION are set, this will push the CI image to the
124119
# Target
125-
- make push-${DIST}
120+
- make push-image
126121

127122
.release-bundle:
128123
stage: release
@@ -174,15 +169,13 @@ trigger-pipeline:
174169
release:staging-gpu-operator:
175170
extends:
176171
- .release:staging
177-
- .dist-ubi9
178172
- .target-gpu-operator
179173
variables:
180174
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/gpu-operator"
181175

182176
release:staging-latest-gpu-operator:
183177
extends:
184178
- .release:staging
185-
- .dist-ubi9
186179
- .target-gpu-operator
187180
variables:
188181
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/gpu-operator"

.github/workflows/ci.yaml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,6 @@ jobs:
133133
build-gpu-operator-arm64:
134134
needs: [go-check, go-test, go-build]
135135
runs-on: ubuntu-24.04-arm
136-
strategy:
137-
matrix:
138-
dist: [ubi9]
139136
steps:
140137
- uses: actions/checkout@v4
141138
name: Check out code
@@ -172,13 +169,10 @@ jobs:
172169
VERSION: ${COMMIT_SHORT_SHA}-arm64
173170
run: |
174171
echo "${VERSION}"
175-
make build-${{ matrix.dist }}
172+
make build-image
176173
build-gpu-operator-amd64:
177174
needs: [go-check, go-test, go-build]
178175
runs-on: ubuntu-latest
179-
strategy:
180-
matrix:
181-
dist: [ubi9]
182176
steps:
183177
- uses: actions/checkout@v4
184178
name: Check out code
@@ -215,14 +209,11 @@ jobs:
215209
VERSION: ${COMMIT_SHORT_SHA}-amd64
216210
run: |
217211
echo "${VERSION}"
218-
make build-${{ matrix.dist }}
212+
make build-image
219213
220214
build-multi-arch-images:
221215
needs: [build-gpu-operator-arm64, build-gpu-operator-amd64]
222216
runs-on: ubuntu-latest
223-
strategy:
224-
matrix:
225-
dist: [ubi9]
226217
steps:
227218
- uses: actions/checkout@v4
228219
name: Check out code

.gitlab-ci.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,11 @@ unit-tests:
9090
- 'echo "Logging in to CI registry ${CI_REGISTRY}"'
9191
- docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
9292
script:
93-
- make build-${DIST}
93+
- make build-image
9494

9595
build:gpu-operator:
9696
extends:
9797
- .image-build
98-
- .dist-ubi9
9998
- .target-gpu-operator
10099

101100
.e2e_defaults:
@@ -108,8 +107,6 @@ build:gpu-operator:
108107
OPERATOR_VERSION: "${CI_COMMIT_SHORT_SHA}"
109108
OPERATOR_IMAGE: "${CI_REGISTRY_IMAGE}"
110109
GPU_PRODUCT_NAME: "Tesla-T4"
111-
extends:
112-
- .dist-ubi9
113110
except:
114111
variables:
115112
- $CI_COMMIT_MESSAGE =~ /skip-end-to-end-tests/

.nvidia-ci.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,11 @@ variables:
5050
regctl manifest get ${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION} --list > /dev/null && echo "${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}" || ( echo "${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION} does not exist" && sleep infinity )
5151
script:
5252
- regctl registry login "${OUT_REGISTRY}" -u "${OUT_REGISTRY_USER}" -p "${OUT_REGISTRY_TOKEN}"
53-
- make IMAGE=${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION} OUT_IMAGE=${OUT_IMAGE_NAME}:${CI_COMMIT_SHORT_SHA} push-${DIST}
53+
- make IMAGE=${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION} OUT_IMAGE=${OUT_IMAGE_NAME}:${CI_COMMIT_SHORT_SHA} push-image
5454

5555
image:gpu-operator:
5656
extends:
5757
- .image-pull
58-
- .dist-ubi9
5958
- .target-gpu-operator
6059

6160
# We skip the integration tests for the internal CI:
@@ -101,7 +100,6 @@ image:gpu-operator:
101100
.scan:gpu-operator:
102101
extends:
103102
- .scan
104-
- .dist-ubi9
105103
- .target-gpu-operator
106104
needs:
107105
- image:gpu-operator
@@ -130,7 +128,6 @@ scan:gpu-operator-arm64:
130128
release:ngc-gpu-operator:
131129
extends:
132130
- .release:ngc
133-
- .dist-ubi9
134131
- .target-gpu-operator
135132

136133
# Define the external image signing steps for NGC

Makefile

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,6 @@ else
7171
GOBIN=$(shell go env GOBIN)
7272
endif
7373

74-
all: gpu-operator
75-
7674
GOOS ?= linux
7775
VERSION_PKG = github.com/NVIDIA/gpu-operator/internal/info
7876

@@ -258,43 +256,36 @@ cov-report: coverage install-tools
258256
$(GCOV2LCOV) -infile $(COVERAGE_FILE) -outfile lcov.info
259257

260258
##### Public rules #####
261-
DISTRIBUTIONS := ubi9
262-
DEFAULT_PUSH_TARGET := ubi9
263-
264-
PUSH_TARGETS := $(patsubst %,push-%, $(DISTRIBUTIONS))
265-
BUILD_TARGETS := $(patsubst %,build-%, $(DISTRIBUTIONS))
266-
TEST_TARGETS := $(patsubst %,test-%, $(DISTRIBUTIONS))
259+
PUSH_TARGETS := push-image
260+
BUILD_TARGETS := build-image
261+
TEST_TARGETS := test
267262

268263
ifneq ($(BUILD_MULTI_ARCH_IMAGES),true)
269264
include $(CURDIR)/native-only.mk
270265
else
271266
include $(CURDIR)/multi-arch.mk
272267
endif
273268

274-
ALL_TARGETS := $(DISTRIBUTIONS) $(PUSH_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) docker-image
269+
ALL_TARGETS := $(PUSH_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) docker-image
275270
.PHONY: $(ALL_TARGETS)
276271

277272
build-%: DOCKERFILE = $(CURDIR)/docker/Dockerfile
278273

279-
$(DISTRIBUTIONS): %: build-%
280-
$(BUILD_TARGETS): build-%:
274+
build-image:
281275
DOCKER_BUILDKIT=1 \
282276
$(DOCKER) $(BUILDX) build --pull \
283277
$(DOCKER_BUILD_OPTIONS) \
284278
$(DOCKER_BUILD_PLATFORM_OPTIONS) \
285279
--tag $(IMAGE) \
286280
--build-arg VERSION="$(VERSION)" \
287281
--build-arg BUILDER_IMAGE="$(BUILDER_IMAGE)" \
288-
--build-arg CUDA_SAMPLE_IMAGE=nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda$(CUDA_SAMPLES_VERSION) \
289282
--build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \
290-
--build-arg CVE_UPDATES="$(CVE_UPDATES)" \
291283
--build-arg GIT_COMMIT="$(GIT_COMMIT)" \
292284
--file $(DOCKERFILE) $(CURDIR)
293285

294286
# Provide a utility target to build the images to allow for use in external tools.
295287
# This includes https://github.com/openshift-psap/ci-artifacts
296288
docker-image: OUT_IMAGE ?= $(IMAGE_NAME):$(IMAGE_TAG)
297-
docker-image: ${DEFAULT_PUSH_TARGET}
298289

299290
install-tools:
300291
@echo Installing tools from tools.go

cmd/nvidia-validator/main.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ const (
217217
appComponentLabelKey = "app.kubernetes.io/component"
218218
// wslNvidiaSMIPath indicates the path to the nvidia-smi binary on WSL
219219
wslNvidiaSMIPath = "/usr/lib/wsl/lib/nvidia-smi"
220+
// shell indicates what shell to use when invoking commands in a subprocess
221+
shell = "sh"
220222
)
221223

222224
func main() {
@@ -616,6 +618,7 @@ func runCommandWithWait(command string, args []string, sleepSeconds int, silent
616618
fmt.Printf("running command %s with args %v\n", command, args)
617619
err := cmd.Run()
618620
if err != nil {
621+
log.Warningf("error running command: %v", err)
619622
fmt.Printf("command failed, retrying after %d seconds\n", sleepSeconds)
620623
time.Sleep(time.Duration(sleepSeconds) * time.Second)
621624
continue
@@ -649,7 +652,7 @@ func setEnvVar(envvars []string, key, value string) []string {
649652
// For driver container installs, check existence of .driver-ctr-ready to confirm running driver
650653
// container has completed and is in Ready state.
651654
func assertDriverContainerReady(silent bool) error {
652-
command := "bash"
655+
command := shell
653656
args := []string{"-c", "stat /run/nvidia/validations/.driver-ctr-ready"}
654657

655658
if withWaitFlag {
@@ -932,7 +935,7 @@ func (n *NvidiaFs) validate() error {
932935

933936
func (n *NvidiaFs) runValidation(silent bool) error {
934937
// check for nvidia_fs module to be loaded
935-
command := "bash"
938+
command := shell
936939
args := []string{"-c", "lsmod | grep nvidia_fs"}
937940

938941
if withWaitFlag {
@@ -1067,7 +1070,7 @@ func (m *MOFED) validate() error {
10671070

10681071
func (m *MOFED) runValidation(silent bool) error {
10691072
// check for mlx5_core module to be loaded
1070-
command := "bash"
1073+
command := shell
10711074
args := []string{"-c", "lsmod | grep mlx5_core"}
10721075

10731076
// If MOFED container is running then use readiness flag set by the driver container instead
@@ -1632,7 +1635,7 @@ func (c *CCManager) setKubeClient(kubeClient kubernetes.Interface) {
16321635

16331636
// Check that the ccManager container is ready after applying required ccMode
16341637
func assertCCManagerContainerReady(silent, withWaitFlag bool) error {
1635-
command := "bash"
1638+
command := shell
16361639
args := []string{"-c", "stat /run/nvidia/validations/.cc-manager-ctr-ready"}
16371640

16381641
if withWaitFlag {

docker/Dockerfile

Lines changed: 17 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
ARG CUDA_SAMPLE_IMAGE=undefined
1615
ARG GOLANG_VERSION=x.x.x
1716

1817
# Builder stage: artifacts produced here are copied into the final image via `COPY --from=builder`.
FROM nvcr.io/nvidia/cuda:12.9.0-base-ubi9 AS builder
@@ -56,17 +55,17 @@ ARG VERSION="unknown"
5655
ARG GIT_COMMIT="unknown"
5756
RUN make cmds
5857

59-
FROM ${CUDA_SAMPLE_IMAGE} AS sample-builder
60-
61-
FROM nvcr.io/nvidia/cuda:12.9.0-base-ubi9
58+
# Install must-gather dependency: `kubectl`
59+
ARG TARGETARCH
60+
RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && curl -LO https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${OS_ARCH}/kubectl
61+
RUN chmod +x ./kubectl
62+
RUN mv ./kubectl /usr/local/bin
6263

63-
# Remove CUDA libs(compat etc) in favor of libs installed by the NVIDIA driver
64-
RUN dnf remove -y cuda-*
64+
FROM nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0 AS sample-builder
6565

66-
RUN dnf install -y \
67-
kmod \
68-
pciutils && \
69-
rm -rf /var/cache/yum/*
66+
# The C/C++ distroless image is used as a base since the CUDA vectorAdd
67+
# sample application depends on C/C++ libraries.
68+
FROM nvcr.io/nvidia/distroless/cc:v3.1.7-dev
7069

7170
ENV NVIDIA_VISIBLE_DEVICES=void
7271

@@ -84,40 +83,27 @@ LABEL vsc-ref=${GIT_COMMIT}
8483

8584
WORKDIR /
8685
COPY --from=builder /workspace/gpu-operator /usr/bin/
86+
# kubectl is moved to /usr/local/bin in the builder stage, so copy it from there.
COPY --from=builder /usr/local/bin/kubectl /usr/bin/
8787
COPY --from=builder /workspace/nvidia-validator /usr/bin/
8888
COPY --from=sample-builder /cuda-samples/vectorAdd /usr/bin/vectorAdd
89+
# TODO: Copy the compat libs from the 'sample-builder' image instead.
90+
# The current 'sample-builder' image does not contain the compat libs in the ARM variant.
91+
# Once new sample images are published that contain the compat libs, we can update the below.
92+
COPY --from=builder /usr/local/cuda/compat /usr/local/cuda/compat
8993

90-
# gpu-operator manifests
91-
RUN mkdir -p /opt/gpu-operator/manifests
9294
COPY assets /opt/gpu-operator/
9395
COPY manifests /opt/gpu-operator/manifests
96+
COPY validator/manifests /opt/validator/manifests
9497

95-
# validator manifests
96-
RUN mkdir -p /opt/validator/manifests
97-
COPY validator/manifests/plugin-workload-validation.yaml /opt/validator/manifests
98-
COPY validator/manifests/cuda-workload-validation.yaml /opt/validator/manifests
99-
100-
RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
10198
COPY hack/must-gather.sh /usr/bin/gather
10299

103-
# Install must-gather dependency: `kubectl`
104-
ARG TARGETARCH
105-
RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && curl -LO https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${OS_ARCH}/kubectl
106-
RUN chmod +x ./kubectl
107-
RUN mv ./kubectl /usr/local/bin
108-
109100
# Add CRD resource into the image for helm upgrades
110101
COPY deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml /opt/gpu-operator/nvidia.com_clusterpolicies.yaml
111102
COPY deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml
112103
COPY deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml /opt/gpu-operator/nfd-api-crds.yaml
113104

114-
# Install / upgrade packages here that are required to resolve CVEs
115-
ARG CVE_UPDATES
116-
RUN if [ -n "${CVE_UPDATES}" ]; then \
117-
dnf update -y ${CVE_UPDATES} && \
118-
rm -rf /var/cache/yum/*; \
119-
fi
120-
121105
USER 65532:65532
122106

107+
COPY LICENSE /licenses/
108+
123109
ENTRYPOINT ["/usr/bin/gpu-operator"]

multi-arch.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ DOCKER_BUILD_OPTIONS = --output=type=image,push=$(PUSH_ON_BUILD) --provenance=$(
1818
DOCKER_BUILD_PLATFORM_OPTIONS ?= --platform=linux/amd64,linux/arm64
1919

2020
REGCTL ?= regctl
21-
$(PUSH_TARGETS): push-%:
21+
push-image:
2222
$(REGCTL) \
2323
image copy \
2424
$(IMAGE) $(OUT_IMAGE)

native-only.mk

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
PUSH_ON_BUILD ?= false
1616
DOCKER_BUILD_PLATFORM_OPTIONS ?= --platform=linux/amd64
1717
DOCKER_BUILD_OPTIONS = --output=type=image,push=$(PUSH_ON_BUILD) --provenance=$(ATTACH_ATTESTATIONS) --sbom=$(ATTACH_ATTESTATIONS)
18-
$(PUSH_TARGETS): OUT_IMAGE ?= $(IMAGE_NAME):$(IMAGE_TAG)
19-
$(PUSH_TARGETS): push-%:
18+
19+
push-image: OUT_IMAGE ?= $(IMAGE_NAME):$(IMAGE_TAG)
20+
push-image:
2021
# Re-tag the image that build-image produced (it is tagged as IMAGE) under the output name.
$(DOCKER) tag "$(IMAGE)" "$(OUT_IMAGE)"
2122
$(DOCKER) push "$(OUT_IMAGE)"

versions.mk

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,4 @@ GOLANG_VERSION ?= 1.24.4
2323

2424
GOLANGCI_LINT_VERSION ?= v2.1.6
2525

26-
CUDA_SAMPLES_VERSION ?= 12.5.0
27-
2826
GIT_COMMIT ?= $(shell git describe --match="" --dirty --long --always 2> /dev/null || echo "")

0 commit comments

Comments
 (0)