Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04
ARG GOLANG_VERSION=1.24.13
FROM nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04
ARG GOLANG_VERSION=1.26.2
ARG USERNAME=developer
ARG USER_UID=1000
ARG USER_GID=1000
Expand Down Expand Up @@ -83,12 +83,21 @@ RUN set -eux; \
\
tar -C /usr/local -xzf go.tgz; \
rm go.tgz
ENV GOTOOLCHAIN=local
# GOTOOLCHAIN=auto lets Go honour `toolchain` directives in go.mod, auto-
# fetching the matching version if the baked-in compiler is older. Costs
# one toolchain download per fresh build cache, then nothing.
ENV GOTOOLCHAIN=auto
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:$PATH
RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH"
ENV PATH=$PATH:/usr/local/go/bin

ARG UV_VERSION=0.11.7
RUN curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh \
&& mv /root/.local/bin/uv /usr/local/bin/uv \
&& mv /root/.local/bin/uvx /usr/local/bin/uvx \
&& uv --version

# Required for DCGM metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
# disable all constraints on the configurations required by NVIDIA container toolkit
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
*.swp
*.swo
dcgm-exporter
.go/
.cursor/
!etc/
!deployment/
.env
Expand All @@ -9,6 +11,7 @@ dcgm-exporter
vendor/
tests.cov
test_results.json
.coverdata/
.scannerwork
dist/
.run
Expand Down
2 changes: 1 addition & 1 deletion .hadolint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Ignored rules with justification:
# - DL3008/DL3041: Package version pinning not used because:
# * We intentionally use the latest DCGM version available in NVIDIA repos
# * Version control is provided by the versioned CUDA base image (e.g., cuda:13.1.1)
# * Version control is provided by the versioned CUDA base image (e.g., cuda:13.2.1)
# * Allows automatic security patches and bug fixes within compatible versions
# * Pinning would require Dockerfile updates for every DCGM patch release
# * Build tools (wget, gcc) are ephemeral and don't affect final image
Expand Down
93 changes: 70 additions & 23 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@ include hack/VERSION

REGISTRY ?= nvidia
GO ?= go
GOBIN_DIR := $(or $(shell $(GO) env GOBIN),$(shell $(GO) env GOPATH)/bin)
MKDIR ?= mkdir
GOLANGCILINT_TIMEOUT ?= 10m
IMAGE_TAG ?= ""

export PATH := $(GOBIN_DIR):$(PATH)

DCGM_VERSION := $(NEW_DCGM_VERSION)
GOLANG_VERSION := 1.24.13
GOLANG_VERSION := 1.26.2
VERSION := $(NEW_EXPORTER_VERSION)
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/dev/null
Expand Down Expand Up @@ -63,14 +66,14 @@ ubi%: DOCKERFILE = docker/Dockerfile
ubi%: BUILD_TARGET = runtime-ubi
ubi%: --docker-build-%
@
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:13.1.1-base-ubi9
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:13.2.1-base-ubi9
ubi9: IMAGE_TAG = ubi9

ubuntu%: DOCKERFILE = docker/Dockerfile
ubuntu%: BUILD_TARGET = runtime-ubuntu
ubuntu%: --docker-build-%
@
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04
ubuntu22.04: IMAGE_TAG = ubuntu22.04

distroless: DOCKERFILE = docker/Dockerfile
Expand All @@ -80,6 +83,7 @@ distroless: --docker-build-distroless

--docker-build-%:
@echo "Building for $@ with target $(BUILD_TARGET)"
mkdir -p .go/compiler .go/pkg/mod
docker buildx inspect
DOCKER_BUILDKIT=1 \
$(DOCKERCMD) --pull \
Expand All @@ -92,6 +96,9 @@ distroless: --docker-build-distroless
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--build-arg "VERSION=$(VERSION)" \
$(if $(GOPROXY),--build-arg "GOPROXY=$(GOPROXY)") \
$(if $(GONOSUMDB),--build-arg "GONOSUMDB=$(GONOSUMDB)") \
$(if $(GOSUMDB),--build-arg "GOSUMDB=$(GOSUMDB)") \
--tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if $(IMAGE_TAG),-$(IMAGE_TAG)) \
--file $(DOCKERFILE) .

Expand All @@ -104,18 +111,27 @@ package-arm64:
package-amd64:
$(MAKE) package-build PLATFORMS=linux/amd64

ifeq ($(GOPROXY_ENABLED),true)
package-build: BUILD_TYPE = distroless
package-build: IMAGE_TAG = distroless
DIST_PREFIX = stig-
else
package-build: BUILD_TYPE = ubuntu22.04
package-build: IMAGE_TAG = ubuntu22.04
DIST_PREFIX =
endif

package-build:
ARCH=`echo $(PLATFORMS) | cut -d'/' -f2)`; \
ARCH=`echo $(PLATFORMS) | cut -d'/' -f2`; \
if [ "$$ARCH" = "amd64" ]; then \
ARCH="x86-64"; \
fi; \
if [ "$$ARCH" = "arm64" ]; then \
ARCH="sbsa"; \
fi; \
export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \
export DIST_NAME="dcgm_exporter-$(DIST_PREFIX)linux-$$ARCH-$(VERSION)"; \
export COMPONENT_NAME="dcgm_exporter"; \
$(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
$(MAKE) $(BUILD_TYPE) OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \
$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \
Expand All @@ -135,26 +151,58 @@ package-build:
# Run the Go tests under ./tests/integration/ after the `generate` target.
# The race detector is enabled, -count=1 is passed so each test runs once
# per invocation, and the run is limited to 5 minutes. Extra `go test`
# flags can be supplied through TEST_ARGS.
# Uses $(GO) rather than a literal `go` so an overridden GO applies here too.
test-integration: generate
	$(GO) test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/

.PHONY: test-coverage
test-coverage:
@echo "Preparing coverage data directories..."
@rm -rf .coverdata
@mkdir -p .coverdata/unit .coverdata/integration .coverdata/merged
@echo "Running unit tests..."
gotestsum --format testname -- \
$$(go list ./... | grep -v "/tests/e2e/") \
$$($(GO) list ./... | grep -v "/tests/e2e/") \
-count=1 -timeout 5m \
-covermode=count \
-coverprofile=unit_coverage.out \
--short
-cover -covermode=count \
--short \
-args -test.gocoverdir=$(CURDIR)/.coverdata/unit
@echo "Running integration tests..."
gotestsum --format testname -- \
./internal/pkg/integration_test/... \
-count=1 -timeout 5m \
-covermode=count \
-cover -covermode=count \
-coverpkg=./internal/pkg/... \
-coverprofile=integration_coverage.out \
--short \
-args -test.gocoverdir=$(CURDIR)/.coverdata/integration
@echo "Merging coverage data..."
$(GO) tool covdata merge \
-i=$(CURDIR)/.coverdata/unit,$(CURDIR)/.coverdata/integration \
-o=$(CURDIR)/.coverdata/merged
@echo "Coverage summary (pre-filter):"
$(GO) tool covdata percent -i=$(CURDIR)/.coverdata/merged
$(GO) tool covdata textfmt \
-i=$(CURDIR)/.coverdata/merged \
-o=combined_coverage.out.tmp
grep -v "mock_" combined_coverage.out.tmp > tests.cov
rm -rf combined_coverage.out.tmp .coverdata
$(GO) tool cover -func=tests.cov

# Unit tests only with coverage (for CI without GPU/DCGM)
# Skips integration tests that require DCGM library
# Skips nvmlprovider tests that require NVML library (GPU)
# Emits a single coverage profile directly (no merge step)
# Generates test_results.json for SonarQube integration
.PHONY: unit-test-coverage
unit-test-coverage:
@echo "Running unit tests only (skipping integration tests and nvmlprovider)..."
gotestsum --format testname --jsonfile test_results.json -- \
$$(go list ./... | grep -v -E "(tests/e2e|integration_test|nvmlprovider)") \
-count=1 -timeout 5m \
-covermode=count \
-coverprofile=tests.cov \
--short
@echo "Merging coverage profiles..."
gocovmerge unit_coverage.out integration_coverage.out > combined_coverage.out.tmp
cat combined_coverage.out.tmp | grep -v "mock_" > tests.cov
rm combined_coverage.out.tmp integration_coverage.out unit_coverage.out
@echo "Filtering out mock files from coverage..."
@if [ -f tests.cov ]; then \
grep -v "mock_" tests.cov > tests.cov.tmp && mv tests.cov.tmp tests.cov || true; \
fi
@echo "Unit test coverage completed"
go tool cover -func=tests.cov

.PHONY: lint
Expand Down Expand Up @@ -194,22 +242,21 @@ validate: validate-modules hadolint check-fmt ## Run all validation checks

.PHONY: tools
tools: ## Install required tools and utilities
curl -sSfL https://golangci-lint.run/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v2.8.0
go install golang.org/x/tools/cmd/goimports@v0.41.0
go install mvdan.cc/gofumpt@v0.9.2
go install github.com/wadey/gocovmerge@v0.0.0-20160331181800-b5bfa59ec0ad
go install gotest.tools/gotestsum@v1.13.0
curl -sSfL https://golangci-lint.run/install.sh | sh -s -- -b $(GOBIN_DIR) v2.11.4
$(GO) install golang.org/x/tools/cmd/goimports@v0.44.0
$(GO) install mvdan.cc/gofumpt@v0.9.2
$(GO) install gotest.tools/gotestsum@v1.13.0

fmt:
find . -name '*.go' | xargs gofumpt -l -w
find . -path './.go' -prune -o -name '*.go' -print | xargs gofumpt -l -w

# Rewrite the Go sources of every package under $(MODULE) in place with
# goimports, passing $(MODULE) as the -local import prefix.
# The package directories come from `$(GO) list`; the template is quoted so
# the shell passes the braces through untouched.
.PHONY: goimports
goimports:
	$(GO) list -f '{{.Dir}}' $(MODULE)/... \
		| xargs goimports -local $(MODULE) -w

check-fmt:
@echo "Checking code formatting. Any listed files don't match goimports:"
! (find . -iname "*.go" \
! (find . -path './.go' -prune -o -path './internal/mocks' -prune -o -path './third_party' -prune -o -path './examples' -prune -o -iname "*.go" -print \
| xargs goimports -l -local $(MODULE) | grep .)

.PHONY: e2e-test
Expand Down
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:

```shell
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless
curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down Expand Up @@ -92,6 +92,35 @@ dcgm-exporter --web-config-file=web-config.yaml

A sample `web-config.yaml` file can be fetched from [exporter-toolkit repository](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-config.yml). The reference of the `web-config.yaml` file can be consulted in the [docs](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).

### IPv6 Support

DCGM-Exporter supports IPv6 addresses for both the remote hostengine connection (`-r`) and the metrics listen address (`-a`). IPv6 addresses must use bracket notation when combined with a port.

#### Remote Hostengine (CLI)

```shell
dcgm-exporter -r "[::1]:5555"
```

#### Remote Hostengine (Environment Variable)

```shell
export DCGM_REMOTE_HOSTENGINE_INFO="[::1]:5555"
dcgm-exporter
```

#### Metrics Listen Address

```shell
dcgm-exporter -a "[::]:9400"
```

**Note:** The brackets in `[::1]:5555` are required by the DCGM connection protocol. On the command line, wrap the address in single or double quotes so that the shell does not interpret the brackets as a glob pattern.

#### Prerequisites

The remote `nv-hostengine` must be configured to listen on IPv6. Refer to the [DCGM documentation](https://docs.nvidia.com/datacenter/dcgm/latest/) for configuring `nv-hostengine` bind address options.

### How to include HPC jobs in metric labels

The DCGM-exporter can include High-Performance Computing (HPC) job information into its metric labels. To achieve this, HPC environment administrators must configure their HPC environment to generate files that map GPUs to HPC jobs.
Expand Down Expand Up @@ -164,6 +193,10 @@ Notes:
* Always make sure your entries have 2 commas (',')
* The complete list of counters that can be collected can be found on the DCGM API reference manual: <https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html>

### Profiling Metrics

On Ampere and earlier GPU generations, profiling metrics depend on the `datacenter-gpu-manager-4-proprietary` package. This package is already included in the container.

### What about a Grafana Dashboard?

You can find the official NVIDIA DCGM-Exporter dashboard here: <https://grafana.com/grafana/dashboards/12239>
Expand Down
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "4.8.1"
app.kubernetes.io/version: "4.8.2"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "4.8.1"
app.kubernetes.io/version: "4.8.2"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "4.8.1"
app.kubernetes.io/version: "4.8.2"
name: "dcgm-exporter"
spec:
automountServiceAccountToken: false
containers:
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless"
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -83,11 +83,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "4.8.1"
app.kubernetes.io/version: "4.8.2"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "4.8.1"
app.kubernetes.io/version: "4.8.2"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
apiVersion: v2
name: dcgm-exporter
description: A Helm chart for DCGM exporter
version: "4.8.1"
version: "4.8.2"
kubeVersion: ">= 1.19.0-0"
appVersion: "4.8.1"
appVersion: "4.8.2"
sources:
- https://github.com/nvidia/dcgm-exporter
home: https://github.com/nvidia/dcgm-exporter/
Expand Down
4 changes: 0 additions & 4 deletions deployment/templates/metrics-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ data:
# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB).

# ECC
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
Expand Down Expand Up @@ -79,9 +78,6 @@ data:
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

# Static configuration information. These appear as labels on the other metrics
DCGM_FI_DRIVER_VERSION, label, Driver Version

# DCP metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned.
Expand Down
4 changes: 2 additions & 2 deletions deployment/templates/service-monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ spec:
scrapeTimeout: "{{ .Values.serviceMonitor.scrapeTimeout }}"
honorLabels: {{ .Values.serviceMonitor.honorLabels }}
relabelings:
{{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
{{- toYaml .Values.serviceMonitor.relabelings | nindent 6 }}
metricRelabelings:
{{ toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }}
{{- toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }}
{{- end -}}
Loading
Loading