diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index a26a20c1..1d50ea1d 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,5 +1,5 @@ -FROM nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04 -ARG GOLANG_VERSION=1.24.13 +FROM nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04 +ARG GOLANG_VERSION=1.26.2 ARG USERNAME=developer ARG USER_UID=1000 ARG USER_GID=1000 @@ -83,12 +83,21 @@ RUN set -eux; \ \ tar -C /usr/local -xzf go.tgz; \ rm go.tgz -ENV GOTOOLCHAIN=local +# GOTOOLCHAIN=auto lets Go honour `toolchain` directives in go.mod, auto- +# fetching the matching version if the baked-in compiler is older. Costs +# one toolchain download per fresh build cache, then nothing. +ENV GOTOOLCHAIN=auto ENV GOPATH=/go ENV PATH=$GOPATH/bin:$PATH RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH" ENV PATH=$PATH:/usr/local/go/bin +ARG UV_VERSION=0.11.7 +RUN curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh \ + && mv /root/.local/bin/uv /usr/local/bin/uv \ + && mv /root/.local/bin/uvx /usr/local/bin/uvx \ + && uv --version + # Required for DCGM metrics ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 # disable all constraints on the configurations required by NVIDIA container toolkit diff --git a/.gitignore b/.gitignore index d6a3ac48..7950722f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ *.swp *.swo dcgm-exporter +.go/ +.cursor/ !etc/ !deployment/ .env @@ -9,6 +11,7 @@ dcgm-exporter vendor/ tests.cov test_results.json +.coverdata/ .scannerwork dist/ .run diff --git a/.hadolint.yaml b/.hadolint.yaml index 62a1da5d..56432bcf 100644 --- a/.hadolint.yaml +++ b/.hadolint.yaml @@ -4,7 +4,7 @@ # Ignored rules with justification: # - DL3008/DL3041: Package version pinning not used because: # * We intentionally use the latest DCGM version available in NVIDIA repos -# * Version control is provided by the versioned CUDA base image (e.g., cuda:13.1.1) +# * Version control is provided by the versioned CUDA base image 
(e.g., cuda:13.2.1) # * Allows automatic security patches and bug fixes within compatible versions # * Pinning would require Dockerfile updates for every DCGM patch release # * Build tools (wget, gcc) are ephemeral and don't affect final image diff --git a/Makefile b/Makefile index 1d564718..ca03ea17 100644 --- a/Makefile +++ b/Makefile @@ -16,12 +16,15 @@ include hack/VERSION REGISTRY ?= nvidia GO ?= go +GOBIN_DIR := $(or $(shell $(GO) env GOBIN),$(shell $(GO) env GOPATH)/bin) MKDIR ?= mkdir GOLANGCILINT_TIMEOUT ?= 10m IMAGE_TAG ?= "" +export PATH := $(GOBIN_DIR):$(PATH) + DCGM_VERSION := $(NEW_DCGM_VERSION) -GOLANG_VERSION := 1.24.13 +GOLANG_VERSION := 1.26.2 VERSION := $(NEW_EXPORTER_VERSION) FULL_VERSION := $(DCGM_VERSION)-$(VERSION) OUTPUT := type=oci,dest=/dev/null @@ -63,14 +66,14 @@ ubi%: DOCKERFILE = docker/Dockerfile ubi%: BUILD_TARGET = runtime-ubi ubi%: --docker-build-% @ -ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:13.1.1-base-ubi9 +ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:13.2.1-base-ubi9 ubi9: IMAGE_TAG = ubi9 ubuntu%: DOCKERFILE = docker/Dockerfile ubuntu%: BUILD_TARGET = runtime-ubuntu ubuntu%: --docker-build-% @ -ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04 +ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04 ubuntu22.04: IMAGE_TAG = ubuntu22.04 distroless: DOCKERFILE = docker/Dockerfile @@ -80,6 +83,7 @@ distroless: --docker-build-distroless --docker-build-%: @echo "Building for $@ with target $(BUILD_TARGET)" + mkdir -p .go/compiler .go/pkg/mod docker buildx inspect DOCKER_BUILDKIT=1 \ $(DOCKERCMD) --pull \ @@ -92,6 +96,9 @@ distroless: --docker-build-distroless --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ --build-arg "VERSION=$(VERSION)" \ + $(if $(GOPROXY),--build-arg "GOPROXY=$(GOPROXY)") \ + $(if $(GONOSUMDB),--build-arg "GONOSUMDB=$(GONOSUMDB)") \ + $(if $(GOSUMDB),--build-arg "GOSUMDB=$(GOSUMDB)") \ --tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if 
$(IMAGE_TAG),-$(IMAGE_TAG)) \ --file $(DOCKERFILE) . @@ -104,18 +111,27 @@ package-arm64: package-amd64: $(MAKE) package-build PLATFORMS=linux/amd64 +ifeq ($(GOPROXY_ENABLED),true) +package-build: BUILD_TYPE = distroless +package-build: IMAGE_TAG = distroless +DIST_PREFIX = stig- +else +package-build: BUILD_TYPE = ubuntu22.04 package-build: IMAGE_TAG = ubuntu22.04 +DIST_PREFIX = +endif + package-build: - ARCH=`echo $(PLATFORMS) | cut -d'/' -f2)`; \ + ARCH=`echo $(PLATFORMS) | cut -d'/' -f2`; \ if [ "$$ARCH" = "amd64" ]; then \ ARCH="x86-64"; \ fi; \ if [ "$$ARCH" = "arm64" ]; then \ ARCH="sbsa"; \ fi; \ - export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \ + export DIST_NAME="dcgm_exporter-$(DIST_PREFIX)linux-$$ARCH-$(VERSION)"; \ export COMPONENT_NAME="dcgm_exporter"; \ - $(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \ + $(MAKE) $(BUILD_TYPE) OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \ $(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \ $(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \ $(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \ @@ -135,26 +151,58 @@ package-build: test-integration: generate go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/ +.PHONY: test-coverage test-coverage: + @echo "Preparing coverage data directories..." + @rm -rf .coverdata + @mkdir -p .coverdata/unit .coverdata/integration .coverdata/merged @echo "Running unit tests..." gotestsum --format testname -- \ - $$(go list ./... | grep -v "/tests/e2e/") \ + $$($(GO) list ./... | grep -v "/tests/e2e/") \ -count=1 -timeout 5m \ - -covermode=count \ - -coverprofile=unit_coverage.out \ - --short + -cover -covermode=count \ + --short \ + -args -test.gocoverdir=$(CURDIR)/.coverdata/unit @echo "Running integration tests..." gotestsum --format testname -- \ ./internal/pkg/integration_test/... \ -count=1 -timeout 5m \ - -covermode=count \ + -cover -covermode=count \ -coverpkg=./internal/pkg/... 
\ - -coverprofile=integration_coverage.out \ + --short \ + -args -test.gocoverdir=$(CURDIR)/.coverdata/integration + @echo "Merging coverage data..." + $(GO) tool covdata merge \ + -i=$(CURDIR)/.coverdata/unit,$(CURDIR)/.coverdata/integration \ + -o=$(CURDIR)/.coverdata/merged + @echo "Coverage summary (pre-filter):" + $(GO) tool covdata percent -i=$(CURDIR)/.coverdata/merged + $(GO) tool covdata textfmt \ + -i=$(CURDIR)/.coverdata/merged \ + -o=combined_coverage.out.tmp + grep -v "mock_" combined_coverage.out.tmp > tests.cov + rm -rf combined_coverage.out.tmp .coverdata + $(GO) tool cover -func=tests.cov + +# Unit tests only with coverage (for CI without GPU/DCGM) +# Skips integration tests that require DCGM library +# Skips nvmlprovider tests that require NVML library (GPU) +# Emits a single coverage profile directly (no merge step) +# Generates test_results.json for SonarQube integration +.PHONY: unit-test-coverage +unit-test-coverage: + @echo "Running unit tests only (skipping integration tests and nvmlprovider)..." + gotestsum --format testname --jsonfile test_results.json -- \ + $$(go list ./... | grep -v -E "(tests/e2e|integration_test|nvmlprovider)") \ + -count=1 -timeout 5m \ + -covermode=count \ + -coverprofile=tests.cov \ --short - @echo "Merging coverage profiles..." - gocovmerge unit_coverage.out integration_coverage.out > combined_coverage.out.tmp - cat combined_coverage.out.tmp | grep -v "mock_" > tests.cov - rm combined_coverage.out.tmp integration_coverage.out unit_coverage.out + @echo "Filtering out mock files from coverage..." 
+ @if [ -f tests.cov ]; then \ + grep -v "mock_" tests.cov > tests.cov.tmp && mv tests.cov.tmp tests.cov || true; \ + fi + @echo "Unit test coverage completed" go tool cover -func=tests.cov .PHONY: lint @@ -194,14 +242,13 @@ validate: validate-modules hadolint check-fmt ## Run all validation checks .PHONY: tools tools: ## Install required tools and utilities - curl -sSfL https://golangci-lint.run/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v2.8.0 - go install golang.org/x/tools/cmd/goimports@v0.41.0 - go install mvdan.cc/gofumpt@v0.9.2 - go install github.com/wadey/gocovmerge@v0.0.0-20160331181800-b5bfa59ec0ad - go install gotest.tools/gotestsum@v1.13.0 + curl -sSfL https://golangci-lint.run/install.sh | sh -s -- -b $(GOBIN_DIR) v2.11.4 + $(GO) install golang.org/x/tools/cmd/goimports@v0.44.0 + $(GO) install mvdan.cc/gofumpt@v0.9.2 + $(GO) install gotest.tools/gotestsum@v1.13.0 fmt: - find . -name '*.go' | xargs gofumpt -l -w + find . -path './.go' -prune -o -name '*.go' -print | xargs gofumpt -l -w goimports: go list -f {{.Dir}} $(MODULE)/... \ @@ -209,7 +256,7 @@ goimports: check-fmt: @echo "Checking code formatting. Any listed files don't match goimports:" - ! (find . -iname "*.go" \ + ! (find . -path './.go' -prune -o -path './internal/mocks' -prune -o -path './third_party' -prune -o -path './examples' -prune -o -iname "*.go" -print \ | xargs goimports -l -local $(MODULE) | grep .) 
.PHONY: e2e-test diff --git a/README.md b/README.md index e8ad4113..3359d6cc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https To gather metrics on a GPU node, simply start the `dcgm-exporter` container: ```shell -docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless +docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge @@ -92,6 +92,35 @@ dcgm-exporter --web-config-file=web-config.yaml A sample `web-config.yaml` file can be fetched from [exporter-toolkit repository](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-config.yml). The reference of the `web-config.yaml` file can be consulted in the [docs](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md). +### IPv6 Support + +DCGM-Exporter supports IPv6 addresses for both the remote hostengine connection (`-r`) and the metrics listen address (`-a`). IPv6 addresses must use bracket notation when combined with a port. + +#### Remote Hostengine (CLI) + +```shell +dcgm-exporter -r "[::1]:5555" +``` + +#### Remote Hostengine (Environment Variable) + +```shell +export DCGM_REMOTE_HOSTENGINE_INFO="[::1]:5555" +dcgm-exporter +``` + +#### Metrics Listen Address + +```shell +dcgm-exporter -a "[::]:9400" +``` + +**Note:** The brackets in `[::1]:5555` are required by the DCGM connection protocol. When using the CLI, the shell requires quoting (double or single quotes) around the address to prevent bracket interpretation. + +#### Prerequisites + +The remote `nv-hostengine` must be configured to listen on IPv6. Refer to the [DCGM documentation](https://docs.nvidia.com/datacenter/dcgm/latest/) for configuring `nv-hostengine` bind address options. 
+ ### How to include HPC jobs in metric labels The DCGM-exporter can include High-Performance Computing (HPC) job information into its metric labels. To achieve this, HPC environment administrators must configure their HPC environment to generate files that map GPUs to HPC jobs. @@ -164,6 +193,10 @@ Notes: * Always make sure your entries have 2 commas (',') * The complete list of counters that can be collected can be found on the DCGM API reference manual: +### Profiling Metrics + +Please note that for Ampere and earlier generation GPUs, profiling metrics depend on the datacenter-gpu-manager-4-proprietary package. This package is included in the container. + ### What about a Grafana Dashboard? You can find the official NVIDIA DCGM-Exporter dashboard here: diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index 3c813372..6572d4ef 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,24 +18,24 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" name: "dcgm-exporter" spec: automountServiceAccountToken: false containers: - - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless" + - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -83,11 +83,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" ports: - name: "metrics" port: 
9400 diff --git a/deployment/Chart.yaml b/deployment/Chart.yaml index 7556f7e3..ac8ac64b 100644 --- a/deployment/Chart.yaml +++ b/deployment/Chart.yaml @@ -1,9 +1,9 @@ apiVersion: v2 name: dcgm-exporter description: A Helm chart for DCGM exporter -version: "4.8.1" +version: "4.8.2" kubeVersion: ">= 1.19.0-0" -appVersion: "4.8.1" +appVersion: "4.8.2" sources: - https://github.com/nvidia/dcgm-exporter home: https://github.com/nvidia/dcgm-exporter/ diff --git a/deployment/templates/metrics-configmap.yaml b/deployment/templates/metrics-configmap.yaml index 5b8b77b5..4b56c857 100644 --- a/deployment/templates/metrics-configmap.yaml +++ b/deployment/templates/metrics-configmap.yaml @@ -50,7 +50,6 @@ data: # Memory usage DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). - DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). # ECC # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. @@ -79,9 +78,6 @@ data: DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed - # Static configuration information. These appear as labels on the other metrics - DCGM_FI_DRIVER_VERSION, label, Driver Version - # DCP metrics DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. 
diff --git a/deployment/templates/service-monitor.yaml b/deployment/templates/service-monitor.yaml index fd4afddf..10d2929c 100644 --- a/deployment/templates/service-monitor.yaml +++ b/deployment/templates/service-monitor.yaml @@ -37,7 +37,7 @@ spec: scrapeTimeout: "{{ .Values.serviceMonitor.scrapeTimeout }}" honorLabels: {{ .Values.serviceMonitor.honorLabels }} relabelings: - {{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }} + {{- toYaml .Values.serviceMonitor.relabelings | nindent 6 }} metricRelabelings: - {{ toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }} + {{- toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }} {{- end -}} diff --git a/deployment/values.yaml b/deployment/values.yaml index b663c406..d7af1df8 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -17,7 +17,7 @@ image: pullPolicy: IfNotPresent # Image tag defaults to AppVersion, but you can use the tag key # for the image tag, e.g: - tag: 4.5.2-4.8.1-distroless + tag: 4.5.3-4.8.2-distroless # Change the following reference to "/etc/dcgm-exporter/default-counters.csv" # to stop profiling metrics from DCGM @@ -305,7 +305,6 @@ kubernetesDRA: # Memory usage # DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). # DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). - # DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). # ECC # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. @@ -346,9 +345,6 @@ kubernetesDRA: # DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. # DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
- # Static configuration information - # DCGM_FI_DRIVER_VERSION, label, Driver Version - livenessProbe: initialDelaySeconds: 45 periodSeconds: 5 diff --git a/docker/Dockerfile b/docker/Dockerfile index a3390f93..c97c2266 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BASEIMAGE=nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04 +ARG BASEIMAGE=nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04 ##### # Common builder stage - compiles dcgm-exporter with CGO for all targets ##### FROM --platform=$BUILDPLATFORM ubuntu:22.04 AS builder -ARG GOLANG_VERSION=1.24.13 +ARG GOLANG_VERSION=1.26.2 WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter @@ -41,47 +41,61 @@ RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ && apt-get autoremove -y \ && ln -sf /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 /lib/ld-linux-aarch64.so.1 -# Install Go +# Copy cached Go compiler and modules for STIG hermetic builds. +# In regular mode these directories exist but are empty (created by Makefile/CI). +COPY .go/compiler/ .go/compiler/ +COPY .go/pkg/mod/ /go/pkg/mod/ + +# Install Go - uses cached compiler for STIG hermetic builds, otherwise downloads +# with SHA256 verification for standard builds. 
RUN set -eux; \ arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \ - url=; \ - filename=; \ - case "$arch" in \ - 'amd64') \ - filename="go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ - url="https://dl.google.com/go/${filename}"; \ - ;; \ - 'arm64') \ - filename="go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ - url="https://dl.google.com/go/${filename}"; \ - ;; \ - *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ - esac; \ - wget -O go.tgz "$url" --progress=dot:giga; \ - \ - echo "Verifying SHA256 checksum..."; \ - wget -O go.sha256 "https://dl.google.com/go/${filename}.sha256"; \ - expected_sha256=$(cat go.sha256); \ - actual_sha256=$(sha256sum go.tgz | awk '{print $1}'); \ - if [ "$expected_sha256" != "$actual_sha256" ]; then \ - echo >&2 "error: SHA256 checksum verification failed"; \ - echo >&2 "expected: $expected_sha256"; \ - echo >&2 "actual: $actual_sha256"; \ - exit 1; \ - fi; \ - echo "SHA256 checksum verified successfully"; \ - rm go.sha256; \ - \ - tar -C /usr/local -xzf go.tgz; \ - rm go.tgz + if [ -f ".go/compiler/go${GOLANG_VERSION}.linux-${arch}.tar.gz" ]; then \ + echo "Using pre-cached Go compiler (hermetic build)"; \ + tar -C /usr/local -xzf ".go/compiler/go${GOLANG_VERSION}.linux-${arch}.tar.gz"; \ + else \ + echo "Downloading Go compiler from dl.google.com"; \ + filename="go${GOLANG_VERSION}.linux-${arch}.tar.gz"; \ + url="https://dl.google.com/go/${filename}"; \ + wget -O go.tgz "$url" --progress=dot:giga; \ + echo "Verifying SHA256 checksum..."; \ + wget -O go.sha256 "https://dl.google.com/go/${filename}.sha256"; \ + expected_sha256=$(cat go.sha256); \ + actual_sha256=$(sha256sum go.tgz | awk '{print $1}'); \ + if [ "$expected_sha256" != "$actual_sha256" ]; then \ + echo >&2 "error: SHA256 checksum verification failed"; \ + echo >&2 "expected: $expected_sha256"; \ + echo >&2 "actual: $actual_sha256"; \ + exit 1; \ + fi; \ + echo "SHA256 checksum verified successfully"; \ + rm go.sha256; \ + 
tar -C /usr/local -xzf go.tgz; \ + rm go.tgz; \ + fi ENV GOTOOLCHAIN=local GOPATH=/go ENV PATH=$GOPATH/bin:/usr/local/go/bin:$PATH RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 700 "$GOPATH" -# Download dependencies +# GOPROXY support for STIG hermetic builds - these ARGs are only set when +# GOPROXY_ENABLED=true in CI; in regular mode they remain empty. +ARG GOPROXY +ARG GONOSUMDB +ARG GOSUMDB +ENV GOPROXY=${GOPROXY} +ENV GONOSUMDB=${GONOSUMDB} +ENV GOSUMDB=${GOSUMDB} + +# Download dependencies - skipped when pre-cached modules exist (hermetic build) COPY go.mod go.sum ./ -RUN go mod download +RUN set -eux; \ + if [ -d "/go/pkg/mod" ] && [ "$(ls -A /go/pkg/mod 2>/dev/null)" ]; then \ + echo "Using pre-cached Go modules (hermetic build)"; \ + else \ + echo "Downloading Go modules..."; \ + go mod download; \ + fi # Copy source code COPY cmd/ cmd/ @@ -110,7 +124,7 @@ RUN apt-get update && apt-get upgrade -y --no-install-recommends && \ ##### # Ubuntu 22.04 runtime target ##### -FROM --platform=$TARGETARCH nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04 AS runtime-ubuntu +FROM --platform=$TARGETARCH nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04 AS runtime-ubuntu ARG VERSION ARG DCGM_VERSION @@ -192,7 +206,7 @@ ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] ##### # UBI9 runtime target ##### -FROM --platform=$TARGETARCH nvcr.io/nvidia/cuda:13.1.1-base-ubi9 AS runtime-ubi +FROM --platform=$TARGETARCH nvcr.io/nvidia/cuda:13.2.1-base-ubi9 AS runtime-ubi ARG VERSION ARG DCGM_VERSION @@ -271,7 +285,7 @@ ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] ##### # Distroless helper stage - builds full Ubuntu container with DCGM libraries ##### -FROM --platform=$TARGETARCH nvcr.io/nvidia/cuda:13.1.1-base-ubuntu22.04 AS runtime-distroless-helper +FROM --platform=$TARGETARCH nvcr.io/nvidia/cuda:13.2.1-base-ubuntu22.04 AS runtime-distroless-helper ARG TARGETARCH @@ -303,7 +317,7 @@ RUN set -e; \ ##### # Distroless runtime target - minimal container image ##### 
-FROM --platform=$TARGETARCH nvcr.io/nvidia/distroless/cc:v4.0.1 AS runtime-distroless +FROM --platform=$TARGETARCH nvcr.io/nvidia/distroless/cc:v4.0.4 AS runtime-distroless ARG VERSION ARG TARGETARCH @@ -328,6 +342,7 @@ COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/local/d # Copy required utilities for runtime COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/bin/sh /usr/bin/ COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/bin/sh /bin/ +COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/bin/lshw /bin/ COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/sbin/setcap /usr/bin/ COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/bin/env /usr/bin/ COPY --from=runtime-distroless-helper --chown=root:root --chmod=755 /usr/bin/bash /usr/bin/ diff --git a/docker/build-cross.sh b/docker/build-cross.sh index b40c7aff..c0eca55c 100644 --- a/docker/build-cross.sh +++ b/docker/build-cross.sh @@ -28,6 +28,14 @@ fi echo "Building dcgm-exporter for $TARGETOS/$TARGETARCH using CC=$CC" +# Hermetic builds: when modules are pre-cached and a proxy was configured, forbid all module downloads (GOPROXY=off) so only the cache is used +if [ -d "/go/pkg/mod" ] && [ "$(ls -A /go/pkg/mod)" ] && [ -n "${GOPROXY:-}" ]; then + echo "Hermetic build: Using cached modules in offline mode" + export GOPROXY=off + export GOSUMDB=off + export GONOSUMDB='*' +fi + # Execute build with all necessary environment variables GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 CC=$CC make install diff --git a/go.mod b/go.mod index cbab2f11..b13c508b 100644 --- a/go.mod +++ b/go.mod @@ -1,47 +1,48 @@ module github.com/NVIDIA/dcgm-exporter -go 1.24.0 +go 1.26.0 -toolchain go1.24.13 +toolchain go1.26.2 require ( - github.com/NVIDIA/go-dcgm v0.0.0-20260115225648-6cbb0463ce9f - github.com/NVIDIA/go-nvml v0.12.4-1 + github.com/NVIDIA/go-dcgm v0.0.0-20260422145128-ec245c09fe3e + github.com/NVIDIA/go-nvml v0.13.0-1 github.com/avast/retry-go/v4 
v4.6.0 - github.com/bits-and-blooms/bitset v1.22.0 - github.com/fsnotify/fsnotify v1.7.0 + github.com/bits-and-blooms/bitset v1.24.4 + github.com/containerd/cgroups/v3 v3.1.3 + github.com/fsnotify/fsnotify v1.9.0 github.com/google/uuid v1.6.0 github.com/gorilla/mux v1.8.1 - github.com/mittwald/go-helm-client v0.12.16 - github.com/onsi/ginkgo/v2 v2.22.0 - github.com/onsi/gomega v1.36.0 + github.com/mittwald/go-helm-client v0.12.19 + github.com/onsi/ginkgo/v2 v2.28.1 + github.com/onsi/gomega v1.39.1 github.com/pkg/errors v0.9.1 - github.com/prometheus/client_model v0.6.1 - github.com/prometheus/common v0.63.0 - github.com/prometheus/exporter-toolkit v0.14.0 - github.com/sirupsen/logrus v1.9.3 - github.com/stretchr/testify v1.10.0 - github.com/urfave/cli/v2 v2.27.6 + github.com/prometheus/client_model v0.6.2 + github.com/prometheus/common v0.67.5 + github.com/prometheus/exporter-toolkit v0.16.0 + github.com/sirupsen/logrus v1.9.4 + github.com/stretchr/testify v1.11.1 + github.com/urfave/cli/v2 v2.27.7 go.uber.org/goleak v1.3.0 - go.uber.org/mock v0.5.0 - golang.org/x/sync v0.16.0 - google.golang.org/grpc v1.71.1 - helm.sh/helm/v3 v3.18.5 - k8s.io/api v0.33.3 - k8s.io/apimachinery v0.33.3 - k8s.io/client-go v0.33.3 - k8s.io/kubelet v0.32.3 - k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e + go.uber.org/mock v0.6.0 + golang.org/x/sync v0.20.0 + google.golang.org/grpc v1.80.0 + helm.sh/helm/v3 v3.20.2 + k8s.io/api v0.36.0 + k8s.io/apimachinery v0.36.0 + k8s.io/client-go v0.36.0 + k8s.io/kubelet v0.36.0 + k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 ) require ( - dario.cat/mergo v1.0.1 // indirect + dario.cat/mergo v1.0.2 // indirect github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect - github.com/BurntSushi/toml v1.5.0 // indirect + github.com/BurntSushi/toml v1.6.0 // indirect github.com/MakeNowJust/heredoc v1.0.0 // indirect github.com/Masterminds/goutils 
v1.1.1 // indirect - github.com/Masterminds/semver/v3 v3.3.1 // indirect + github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/Masterminds/sprig/v3 v3.3.0 // indirect github.com/Masterminds/squirrel v1.5.4 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect @@ -49,118 +50,121 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.3 // indirect - github.com/containerd/cgroups/v3 v3.1.1 // indirect - github.com/containerd/containerd v1.7.27 // indirect + github.com/clipperhouse/uax29/v2 v2.7.0 // indirect + github.com/containerd/containerd v1.7.31 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/log v0.1.0 // indirect github.com/containerd/platforms v0.2.1 // indirect - github.com/coreos/go-systemd/v22 v22.5.0 // indirect - github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect - github.com/cyphar/filepath-securejoin v0.4.1 // indirect + github.com/coreos/go-systemd/v22 v22.7.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect + github.com/cyphar/filepath-securejoin v0.6.1 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch v5.9.11+incompatible // indirect github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect - github.com/fatih/color v1.18.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect + github.com/fatih/color v1.19.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.1 // indirect github.com/go-errors/errors v1.5.1 // indirect github.com/go-gorp/gorp/v3 v3.1.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect - github.com/go-openapi/jsonpointer v0.21.1 // indirect - github.com/go-openapi/jsonreference 
v0.21.0 // indirect - github.com/go-openapi/swag v0.23.1 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-openapi/jsonpointer v0.23.1 // indirect + github.com/go-openapi/jsonreference v0.21.5 // indirect + github.com/go-openapi/swag v0.26.0 // indirect + github.com/go-openapi/swag/cmdutils v0.26.0 // indirect + github.com/go-openapi/swag/conv v0.26.0 // indirect + github.com/go-openapi/swag/fileutils v0.26.0 // indirect + github.com/go-openapi/swag/jsonname v0.26.0 // indirect + github.com/go-openapi/swag/jsonutils v0.26.0 // indirect + github.com/go-openapi/swag/loading v0.26.0 // indirect + github.com/go-openapi/swag/mangling v0.26.0 // indirect + github.com/go-openapi/swag/netutils v0.26.0 // indirect + github.com/go-openapi/swag/stringutils v0.26.0 // indirect + github.com/go-openapi/swag/typeutils v0.26.0 // indirect + github.com/go-openapi/swag/yamlutils v0.26.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gobwas/glob v0.2.3 // indirect - github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang-jwt/jwt/v5 v5.3.1 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/gnostic-models v0.7.1 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect - github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect + github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/gosuri/uitable v0.0.4 // indirect - github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/huandu/xstrings v1.5.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jmoiron/sqlx v1.4.0 // indirect - 
github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.18.0 // indirect + github.com/klauspost/compress v1.18.5 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect - github.com/lib/pq v1.10.9 // indirect + github.com/lib/pq v1.12.3 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect - github.com/mailru/easyjson v0.9.0 // indirect github.com/mattn/go-colorable v0.1.14 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect - github.com/mattn/go-runewidth v0.0.16 // indirect - github.com/mdlayher/socket v0.5.1 // indirect + github.com/mattn/go-isatty v0.0.21 // indirect + github.com/mattn/go-runewidth v0.0.23 // indirect + github.com/mdlayher/socket v0.6.0 // indirect github.com/mdlayher/vsock v1.2.1 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect - github.com/moby/spdystream v0.5.0 // indirect + github.com/moby/spdystream v0.5.1 // indirect github.com/moby/sys/userns v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect - github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/peterbourgon/diskv 
v2.0.1+incompatible // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect - github.com/prometheus/procfs v0.16.0 // indirect - github.com/rivo/uniseg v0.4.7 // indirect - github.com/rubenv/sql-migrate v1.8.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/procfs v0.20.1 // indirect + github.com/rubenv/sql-migrate v1.8.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 // indirect github.com/shopspring/decimal v1.4.0 // indirect - github.com/spf13/cast v1.7.1 // indirect - github.com/spf13/cobra v1.9.1 // indirect - github.com/spf13/pflag v1.0.7 // indirect + github.com/spf13/cast v1.10.0 // indirect + github.com/spf13/cobra v1.10.2 // indirect + github.com/spf13/pflag v1.0.10 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/xlab/treeprint v1.2.0 // indirect - github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect - go.yaml.in/yaml/v3 v3.0.3 // indirect - golang.org/x/crypto v0.40.0 // indirect - golang.org/x/net v0.41.0 // indirect - golang.org/x/oauth2 v0.28.0 // indirect - golang.org/x/sys v0.34.0 // indirect - golang.org/x/term v0.33.0 // indirect - golang.org/x/text v0.27.0 // indirect - golang.org/x/time v0.11.0 // indirect - golang.org/x/tools v0.34.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 // indirect - google.golang.org/protobuf v1.36.6 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect + go.yaml.in/yaml/v2 v2.4.4 
// indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.50.0 // indirect + golang.org/x/mod v0.35.0 // indirect + golang.org/x/net v0.53.0 // indirect + golang.org/x/oauth2 v0.36.0 // indirect + golang.org/x/sys v0.43.0 // indirect + golang.org/x/term v0.42.0 // indirect + golang.org/x/text v0.36.0 // indirect + golang.org/x/time v0.15.0 // indirect + golang.org/x/tools v0.44.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529 // indirect + google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.3 // indirect - k8s.io/apiserver v0.33.3 // indirect - k8s.io/cli-runtime v0.33.3 // indirect - k8s.io/component-base v0.33.3 // indirect - k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect - k8s.io/kubectl v0.33.3 // indirect + k8s.io/apiextensions-apiserver v0.36.0 // indirect + k8s.io/apiserver v0.36.0 // indirect + k8s.io/cli-runtime v0.36.0 // indirect + k8s.io/component-base v0.36.0 // indirect + k8s.io/klog/v2 v2.140.0 // indirect + k8s.io/kube-openapi v0.0.0-20260414162039-ec9c827d403f // indirect + k8s.io/kubectl v0.36.0 // indirect + k8s.io/streaming v0.36.0 // indirect oras.land/oras-go/v2 v2.6.0 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect - sigs.k8s.io/kustomize/api v0.19.0 // indirect - sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/kustomize/api v0.21.1 // indirect + sigs.k8s.io/kustomize/kyaml v0.21.1 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.4.0 // indirect + 
sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index cd741e4b..31ef18b1 100644 --- a/go.sum +++ b/go.sum @@ -1,29 +1,29 @@ -dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= -dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= +dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= -github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= -github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= +github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk= +github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= 
-github.com/Masterminds/semver/v3 v3.3.1 h1:QtNSWtVZ3nBfk8mAOu/B6v7FMJ+NHTIgUPi7rj+4nv4= -github.com/Masterminds/semver/v3 v3.3.1/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= -github.com/NVIDIA/go-dcgm v0.0.0-20260115225648-6cbb0463ce9f h1:N0eRtecIPi9xm1FxFnQCqlxRRBMAZPnRV4m/FwgOx1s= -github.com/NVIDIA/go-dcgm v0.0.0-20260115225648-6cbb0463ce9f/go.mod h1:cA0Bv7+JtAd8sqCCZizhAQjj4+Z47x/d8KD60iYBT+g= -github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= -github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= +github.com/NVIDIA/go-dcgm v0.0.0-20260422145128-ec245c09fe3e h1:YvYb99vEgxmGR0Qjnck9LT9iMUq5kKdaS3qcmpXsWH0= +github.com/NVIDIA/go-dcgm v0.0.0-20260422145128-ec245c09fe3e/go.mod h1:cA0Bv7+JtAd8sqCCZizhAQjj4+Z47x/d8KD60iYBT+g= +github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= +github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= @@ -32,37 +32,41 @@ github.com/avast/retry-go/v4 v4.6.0 h1:K9xNA+KeB8HHc2aWFuLb25Offp+0iVRXEvFx8IinR 
github.com/avast/retry-go/v4 v4.6.0/go.mod h1:gvWlPhBVsvBbLkVGDg/KwvBv0bEkCOLRRSHKIr2PyOE= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= -github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE= +github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bshuster-repo/logrus-logstash-hook v1.0.0 h1:e+C0SB5R1pu//O4MQ3f9cFuPGoOVeF2fE4Og9otCc70= github.com/bshuster-repo/logrus-logstash-hook v1.0.0/go.mod h1:zsTqEiSzDgAa/8GZR7E1qaXrhYNDKBYy5/dWPTIflbk= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.3 h1:9liNh8t+u26xl5ddmWLmsOsdNLwkdRTg5AG+JnTiM80= github.com/chai2010/gettext-go v1.0.3/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= -github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= -github.com/containerd/cgroups/v3 v3.1.1 h1:ASZmQGfOHbRj43/1aMn5QcWIsv0R/AuHHDNCguRY0p0= -github.com/containerd/cgroups/v3 v3.1.1/go.mod h1:PKZ2AcWmSBsY/tJUVhtS/rluX0b1uq1GmPO1ElCmbOw= 
-github.com/containerd/containerd v1.7.27 h1:yFyEyojddO3MIGVER2xJLWoCIn+Up4GaHFquP7hsFII= -github.com/containerd/containerd v1.7.27/go.mod h1:xZmPnl75Vc+BLGt4MIfu6bp+fy03gdHAn9bz+FreFR0= +github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk= +github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM= +github.com/containerd/cgroups/v3 v3.1.3 h1:eUNflyMddm18+yrDmZPn3jI7C5hJ9ahABE5q6dyLYXQ= +github.com/containerd/cgroups/v3 v3.1.3/go.mod h1:PKZ2AcWmSBsY/tJUVhtS/rluX0b1uq1GmPO1ElCmbOw= +github.com/containerd/containerd v1.7.31 h1:jn3IMuTV4Bb1Uwb0MFPW2ASJAD3W1lh6QqqZHIZwDh4= +github.com/containerd/containerd v1.7.31/go.mod h1:jdwD6s/BhV4XVJGrvtziNPVA+83n66TwptVaPKprq4E= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= -github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= -github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpuguy83/go-md2man/v2 v2.0.6 h1:XJtiaUW6dEEqVuZiMTn1ldk455QWwEIsMIJlo5vtkx0= +github.com/coreos/go-systemd/v22 v22.7.0 h1:LAEzFkke61DFROc7zNLX/WA2i5J8gYqe0rSj9KI28KA= +github.com/coreos/go-systemd/v22 v22.7.0/go.mod h1:xNUYtjHu2EDXbsxz1i41wouACIwT7Ybq9o0BQhMwD0w= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= +github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod 
h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= -github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= -github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= +github.com/cyphar/filepath-securejoin v0.6.1 h1:5CeZ1jPXEiYt3+Z6zqprSAgSWiggmpVyciv8syjIpVE= +github.com/cyphar/filepath-securejoin v0.6.1/go.mod h1:A8hd4EnAeyujCJRrICiOWqjS1AX0a9kM5XL+NwKoYSc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -81,61 +85,93 @@ github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= github.com/docker/go-metrics v0.0.1 h1:AgB/0SvBxihN0X8OR4SjsblXkbMvalQ8cjmtKQ2rQV8= github.com/docker/go-metrics v0.0.1/go.mod h1:cG1hvH2utMXtqgqqYE9plW6lDxS3/5ayHzueweSI3Vw= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4= 
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f/go.mod h1:OSYXu++VVOHnXeitef/D8n/6y4QV8uLHSFXX4NeXMGc= -github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= -github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= +github.com/fatih/color v1.19.0 h1:Zp3PiM21/9Ld6FzSKyL5c/BULoe/ONr9KlbYVOfG8+w= +github.com/fatih/color v1.19.0/go.mod h1:zNk67I0ZUT1bEGsSGyCZYZNrHuTkJJB+r6Q9VuMi0LE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/foxcpp/go-mockdns v1.1.0 h1:jI0rD8M0wuYAxL7r/ynTrCQQq0BVqfB99Vgk7DlmewI= -github.com/foxcpp/go-mockdns v1.1.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= +github.com/foxcpp/go-mockdns v1.2.0 h1:omK3OrHRD1IWJz1FuFBCFquhXslXoF17OvBS6JPzZF0= +github.com/foxcpp/go-mockdns v1.2.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.1 h1:2rWm8B193Ll4VdjsJY28jxs70IdDsHRWgQYAI80+rMQ= +github.com/fxamacker/cbor/v2 v2.9.1/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo 
v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk= github.com/go-errors/errors v1.5.1/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-gorp/gorp/v3 v3.1.0 h1:ItKF/Vbuj31dmV4jxA1qblpSwkl9g1typ24xoe70IGs= github.com/go-gorp/gorp/v3 v3.1.0/go.mod h1:dLEjIyyRNiXvNZ8PSmzpt1GsWAUK8kjVhEpjH8TixEw= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= -github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= -github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= -github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= -github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= +github.com/go-openapi/jsonpointer v0.23.1 h1:1HBACs7XIwR2RcmItfdSFlALhGbe6S92p0ry4d1GWg4= +github.com/go-openapi/jsonpointer v0.23.1/go.mod 
h1:iWRmZTrGn7XwYhtPt/fvdSFj1OfNBngqRT2UG3BxSqY= +github.com/go-openapi/jsonreference v0.21.5 h1:6uCGVXU/aNF13AQNggxfysJ+5ZcU4nEAe+pJyVWRdiE= +github.com/go-openapi/jsonreference v0.21.5/go.mod h1:u25Bw85sX4E2jzFodh1FOKMTZLcfifd1Q+iKKOUxExw= +github.com/go-openapi/swag v0.26.0 h1:GVDXCmfvhfu1BxiHo8/FA+BbKmhecHnG3varjON5/RI= +github.com/go-openapi/swag v0.26.0/go.mod h1:82g3193sZJRbocs7bNCqGfIgq8pkuwVwCfhKIRlEQF0= +github.com/go-openapi/swag/cmdutils v0.26.0 h1:iowihOcvq7y4egO8cOq0dmfohz6wfeQ63U1EnuhO2TU= +github.com/go-openapi/swag/cmdutils v0.26.0/go.mod h1:Sm1MVFMkF6guJJ+pQqHnQA3N0j9qALV3NxzDSv6bETM= +github.com/go-openapi/swag/conv v0.26.0 h1:5yGGsPYI1ZCva93U0AoKi/iZrNhaJEjr324YVsiD89I= +github.com/go-openapi/swag/conv v0.26.0/go.mod h1:tpAmIL7X58VPnHHiSO4uE3jBeRamGsFsfdDeDtb5ECE= +github.com/go-openapi/swag/fileutils v0.26.0 h1:WJoPRvsA7QRiiWluowkLJa9jaYR7FCuxmDvnCgaRRxU= +github.com/go-openapi/swag/fileutils v0.26.0/go.mod h1:0WDJ7lp67eNjPMO50wAWYlKvhOb6CQ37rzR7wrgI8Tc= +github.com/go-openapi/swag/jsonname v0.26.0 h1:gV1NFX9M8avo0YSpmWogqfQISigCmpaiNci8cGECU5w= +github.com/go-openapi/swag/jsonname v0.26.0/go.mod h1:urBBR8bZNoDYGr653ynhIx+gTeIz0ARZxHkAPktJK2M= +github.com/go-openapi/swag/jsonutils v0.26.0 h1:FawFML2iAXsPqmERscuMPIHmFsoP1tOqWkxBaKNMsnA= +github.com/go-openapi/swag/jsonutils v0.26.0/go.mod h1:2VmA0CJlyFqgawOaPI9psnjFDqzyivIqLYN34t9p91E= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.26.0 h1:apqeINu/ICHouqiRZbyFvuDge5jCmmLTqGQ9V95EaOM= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.26.0/go.mod h1:AyM6QT8uz5IdKxk5akv0y6u4QvcL9GWERt0Jx/F/R8Y= +github.com/go-openapi/swag/loading v0.26.0 h1:Apg6zaKhCJurpJer0DCxq99qwmhFddBhaMX7kilDcko= +github.com/go-openapi/swag/loading v0.26.0/go.mod h1:dBxQ/6V2uBaAQdevN18VELE6xSpJWZxLX4txe12JwDg= +github.com/go-openapi/swag/mangling v0.26.0 h1:Du2YC4YLA/Y5m/YKQd7AnY5qq0wRKSFZTTt8ktFaXcQ= +github.com/go-openapi/swag/mangling v0.26.0/go.mod h1:jifS7W9vbg+pw63bT+GI53otluMQL3CeemuyCHKwVx0= 
+github.com/go-openapi/swag/netutils v0.26.0 h1:CmZp+ZT7HrmFwrC3GdGsXBq2+42T1bjKBapcqVpIs3c= +github.com/go-openapi/swag/netutils v0.26.0/go.mod h1:5iK+Ok3ZohWWex1C50BFTPexi03UaPwjW4Oj8kgrpwo= +github.com/go-openapi/swag/stringutils v0.26.0 h1:qZQngLxs5s7SLijc3N2ZO+fUq2o8LjuWAASSrJuh+xg= +github.com/go-openapi/swag/stringutils v0.26.0/go.mod h1:sWn5uY+QIIspwPhvgnqJsH8xqFT2ZbYcvbcFanRyhFE= +github.com/go-openapi/swag/typeutils v0.26.0 h1:2kdEwdiNWy+JJdOvu5MA2IIg2SylWAFuuyQIKYybfq4= +github.com/go-openapi/swag/typeutils v0.26.0/go.mod h1:oovDuIUvTrEHVMqWilQzKzV4YlSKgyZmFh7AlfABNVE= +github.com/go-openapi/swag/yamlutils v0.26.0 h1:H7O8l/8NJJQ/oiReEN+oMpnGMyt8G0hl460nRZxhLMQ= +github.com/go-openapi/swag/yamlutils v0.26.0/go.mod h1:1evKEGAtP37Pkwcc7EWMF0hedX0/x3Rkvei2wtG/TbU= +github.com/go-openapi/testify/enable/yaml/v2 v2.4.2 h1:5zRca5jw7lzVREKCZVNBpysDNBjj74rBh0N2BGQbSR0= +github.com/go-openapi/testify/enable/yaml/v2 v2.4.2/go.mod h1:XVevPw5hUXuV+5AkI1u1PeAm27EQVrhXTTCPAF85LmE= +github.com/go-openapi/testify/v2 v2.4.2 h1:tiByHpvE9uHrrKjOszax7ZvKB7QOgizBWGBLuq0ePx4= +github.com/go-openapi/testify/v2 v2.4.2/go.mod h1:SgsVHtfooshd0tublTtJ50FPKhujf47YRqauXXOUxfw= github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= -github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/goccy/go-yaml v1.18.0 
h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= +github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= +github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gnostic-models v0.7.1 h1:SisTfuFKJSKM5CPZkffwi6coztzzeYUhc3v4yxLWH8c= +github.com/google/gnostic-models v0.7.1/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= -github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= -github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= +github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg= +github.com/google/pprof v0.0.0-20260402051712-545e8a4df936/go.mod 
h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= @@ -146,11 +182,8 @@ github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5T github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= -github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= -github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= -github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 h1:X+2YciYSxvMQK0UZ7sg45ZVabVZBeBuvMkmuI2V3Fak= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7/go.mod h1:lW34nIZuQ8UDPdkon5fmfp2l3+ZkQ2me/+oecHYLOII= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -166,16 +199,14 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx 
v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -184,24 +215,27 @@ github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq github.com/lann/builder v0.0.0-20180802200727-47ae307949d0/go.mod h1:dXGbAdH5GtBTC4WfIxhKZfyBF/HBFgRZSWwZ9g/He9o= github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 h1:P6pPBnrTSX3DEVR4fDembhRWSsG5rVo6hYhAB/ADZrk= github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6FmdpVm2joNMFikkuWg0EoCKLGUMNw= -github.com/lib/pq v1.10.9 
h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= +github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= -github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= -github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= -github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= -github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= -github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mattn/go-isatty v0.0.21 h1:xYae+lCNBP7QuW4PUnNG61ffM4hVIfm+zUzDuSzYLGs= +github.com/mattn/go-isatty v0.0.21/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= +github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw= +github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/mdlayher/socket v0.5.1 
h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= -github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= +github.com/mdlayher/socket v0.6.0 h1:ScZPaAGyO1icQnbFrhPM8mnXyMu9qukC1K4ZoM2IQKU= +github.com/mdlayher/socket v0.6.0/go.mod h1:q7vozUAnxSqnjHc12Fik5yUKIzfZ8ITCfMkhOtE9z18= github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= github.com/mdlayher/vsock v1.2.1/go.mod h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/miekg/dns v1.1.57 h1:Jzi7ApEIzwEPLHWRcafCN9LZSBbqQpxjt/wpgvg7wcM= github.com/miekg/dns v1.1.57/go.mod h1:uqRjCRUuEAA6qsOiJvDd+CFo/vW+y5WR6SNmHE55hZk= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= @@ -210,10 +244,10 @@ github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQ github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= -github.com/mittwald/go-helm-client v0.12.16 h1:YTyJX6L0SI/O7HNTG0qDZI2/jyGELxJOQEjvTj4mf6k= -github.com/mittwald/go-helm-client v0.12.16/go.mod h1:PDF7Ra8bmJ2YTNzoehoMMi+gW/EJBk/4TLz7j52rehY= -github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= -github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/mittwald/go-helm-client v0.12.19 h1:GzwISuYemkgISegXfYzY3i6blRZzfNpp2G5+tBUyzp4= +github.com/mittwald/go-helm-client v0.12.19/go.mod h1:mlTMyzGOua5rXH4+kFTU/YsE9xxqvwkEW1c5ukM8Cj4= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod 
h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= @@ -221,20 +255,19 @@ github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFL github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f 
h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= -github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.0 h1:Pb12RlruUtj4XUuPUqeEWc6j5DkVVVA49Uf6YLfC95Y= -github.com/onsi/gomega v1.36.0/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= @@ -250,46 +283,43 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/poy/onpar v1.1.2 h1:QaNrNiZx0+Nar5dLgTVp5mXkyoVFIbepjyEoGSnhbAY= github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjzg= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= 
-github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= -github.com/prometheus/exporter-toolkit v0.14.0 h1:NMlswfibpcZZ+H0sZBiTjrA3/aBFHkNZqE+iCj5EmRg= -github.com/prometheus/exporter-toolkit v0.14.0/go.mod h1:Gu5LnVvt7Nr/oqTBUC23WILZepW0nffNo10XdhQcwWA= -github.com/prometheus/procfs v0.16.0 h1:xh6oHhKwnOJKMYiYBDWmkHqQPyiY40sny36Cmx2bbsM= -github.com/prometheus/procfs v0.16.0/go.mod h1:8veyXUu3nGP7oaCxhX6yeaM5u4stL2FeMXnCqhDthZg= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/exporter-toolkit v0.16.0 h1:xT/j7L2XKF+VJd6B4fpUw6xWabHrSmsUf6mYmFqyu0s= +github.com/prometheus/exporter-toolkit v0.16.0/go.mod h1:d1EL8Z9674xQe/iWhwP2wDyCEoBPbXVeqDbqAUsgJWY= +github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= +github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/redis/go-redis/extra/rediscmd/v9 v9.0.5 h1:EaDatTxkdHG+U3Bk4EUr+DZ7fOGwTfezUiUJMaIcaho= github.com/redis/go-redis/extra/rediscmd/v9 v9.0.5/go.mod h1:fyalQWdtzDBECAQFBJuQe5bzQ02jGd5Qcbgb97Flm7U= github.com/redis/go-redis/extra/redisotel/v9 v9.0.5 h1:EfpWLLCyXw8PSM2/XNJLjI3Pb27yVE+gIAfeqp8LUCc= github.com/redis/go-redis/extra/redisotel/v9 v9.0.5/go.mod h1:WZjPDy7VNzn77AAfnAfVjZNvfJTYfPetfZk5yoSTLaQ= github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM= github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA= 
-github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= -github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= -github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/rubenv/sql-migrate v1.8.0 h1:dXnYiJk9k3wetp7GfQbKJcPHjVJL6YK19tKj8t2Ns0o= -github.com/rubenv/sql-migrate v1.8.0/go.mod h1:F2bGFBwCU+pnmbtNYDeKvSuvL6lBVtXDXUUv5t+u1qw= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/rubenv/sql-migrate v1.8.1 h1:EPNwCvjAowHI3TnZ+4fQu3a915OpnQoPAjTXCGOy2U0= +github.com/rubenv/sql-migrate v1.8.1/go.mod h1:BTIKBORjzyxZDS6dzoiw6eAFYJ1iNlGAtjn4LGeVjS8= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEVZGK7IN2kJkjTuQ= github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= -github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= -github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus 
v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= -github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= +github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= +github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= +github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= +github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= @@ -297,28 +327,34 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod 
h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/urfave/cli/v2 v2.27.6 h1:VdRdS98FNhKZ8/Az8B7MTyGQmpIr36O1EHybx/LaZ4g= -github.com/urfave/cli/v2 v2.27.6/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= +github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= -github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= -github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= 
-github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= +github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/bridges/prometheus v0.57.0 h1:UW0+QyeyBVhn+COBec3nGhfnFe5lwB0ic1JBVjzhk0w= go.opentelemetry.io/contrib/bridges/prometheus v0.57.0/go.mod h1:ppciCHRLsyCio54qbzQv0E4Jyth/fLWDTJYfvWpcSVk= go.opentelemetry.io/contrib/exporters/autoexport v0.57.0 h1:jmTVJ86dP60C01K3slFQa2NQ/Aoi7zA+wy7vMOKD9H4= go.opentelemetry.io/contrib/exporters/autoexport v0.57.0/go.mod h1:EJBheUMttD/lABFyLXhce47Wr6DPWYReCzaZiXadH7g= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0 h1:7iP2uCb7sGddAr30RRS6xjKy7AZ2JtTOPA3oolgVSw8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0/go.mod h1:c7hN3ddxs/z6q9xwvfLPk+UHlWRQyaeR1LdgfL/66l0= +go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c= +go.opentelemetry.io/otel v1.41.0/go.mod 
h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0 h1:WzNab7hOOLzdDF/EoWCt4glhrbMPVMOO5JYTmpz36Ls= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0/go.mod h1:hKvJwTzJdp90Vh7p6q/9PAOd55dI6WA6sWj62a/JvSs= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.8.0 h1:S+LdBGiQXtJdowoJoQPEtI52syEP/JYBUpjO49EQhV8= @@ -327,10 +363,10 @@ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 h1:j7Z go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0/go.mod h1:WXbYJTUaZXAbYd8lbgGuvih0yuCfOFC5RJoYnoLcGz8= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0 h1:t/Qur3vKSkUCcDVaSumWF2PKHt85pc7fRvFuoVT8qFU= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0/go.mod h1:Rl61tySSdcOJWoEgYZVtmnKdA0GeKrSqkHC1t+91CH8= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 h1:QKdN8ly8zEMrByybbQgv8cWBcdAarwmIPZ6FThrWXJs= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0/go.mod h1:bTdK1nhqF76qiPoCCdyFIV+N/sRHYXYCTQc+3VCi3MI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0 h1:cMyu9O88joYEaI47CnQkxO1XZdpoTF9fEnW2duIddhw= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0/go.mod h1:6Am3rn7P9TVVeXYG+wtcGE7IE1tsQ+bP3AuWcKt/gOI= go.opentelemetry.io/otel/exporters/prometheus v0.54.0 h1:rFwzp68QMgtzu9PgP3jm9XaMICI6TsofWWPcBDKwlsU= @@ -343,88 +379,63 @@ go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.32.0 h1:cC2yDI3IQd0Udsu go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.32.0/go.mod h1:2PD5Ex6z8CFzDbTdOlwyNIUywRr1DN0ospafJM1wJ+s= go.opentelemetry.io/otel/log v0.8.0 h1:egZ8vV5atrUWUbnSsHn6vB8R21G2wrKqNiDt3iWertk= go.opentelemetry.io/otel/log v0.8.0/go.mod h1:M9qvDdUTRCopJcGRKg57+JSQ9LgLBrwwfC32epk5NX8= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= -go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= +go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ= +go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= go.opentelemetry.io/otel/sdk/log v0.8.0 h1:zg7GUYXqxk1jnGF/dTdLPrK06xJdrXgqgFLnI4Crxvs= go.opentelemetry.io/otel/sdk/log v0.8.0/go.mod h1:50iXr0UVwQrYS45KbruFrEt4LvAdCaWWgIrsN3ZQggo= -go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= -go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= 
-go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= +go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= +go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0= +go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis= +go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= +go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= -go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= -golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.25.0 
h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= -golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/oauth2 v0.28.0 h1:CrgCKl8PPAVtLnU3c+EDw6x11699EWlsDeWNWKdIOkc= -golang.org/x/oauth2 v0.28.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +go.uber.org/mock v0.6.0 h1:hyF9dfmbgIX5EfOdasqLsWD6xqpNZlXblLB/Dbnwv3Y= +go.uber.org/mock v0.6.0/go.mod h1:KiVJ4BqZJaMj4svdfmHM0AUx4NJYO8ZNpPnZn1Z+BBU= +go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= +go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= 
+go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= +golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= +golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= +golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= -golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.33.0 h1:NuFncQrRcaRvVmgRkvM3j/F00gWIAlcmlB8ACEKmGIg= -golang.org/x/term v0.33.0/go.mod h1:s18+ql9tYWp1IfpV9DmCtQDDSRBUjKaw9M1eAv5UeF0= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= -golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod 
h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.34.0 h1:qIpSLOxeCYGg9TrcJokLBG4KFA6d795g0xkBkiESGlo= -golang.org/x/tools v0.34.0/go.mod h1:pAP9OwEaY1CAW3HOmg3hLZC5Z0CCmzjAF2UQMSqNARg= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto v0.0.0-20240123012728-ef4313101c80 h1:KAeGQVN3M9nD0/bQXnr/ClcEMJ968gUXJQ9pwfSynuQ= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 h1:e0AIkUUhxyBKh6ssZNrAMeqhA7RKUj42346d1y02i2g= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 
-google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= +golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= +golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= +golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= +golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 h1:1hfbdAfFbkmpg41000wDVqr7jUpK/Yo+LPnIxxGzmkg= +google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 h1:merA0rdPeUV3YIIfHHcH4qBkiQAc1nfCKSI7lB4cV2M= +google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409/go.mod h1:fl8J1IvUjCilwZzQowmw2b7HQB2eAuYBabMXzWurF+I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529 h1:XF8+t6QQiS0o9ArVan/HW8Q7cycNPGsJf6GA2nXxYAg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260420184626-e10c466a9529/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af h1:+5/Sw3GsDNlEmu7TfklWKPdQ0Ykja5VEmq2i817+jbI= +google.golang.org/protobuf 
v1.36.12-0.20260120151049-f2248ac996af/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= @@ -432,45 +443,45 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -helm.sh/helm/v3 v3.18.5 h1:Cc3Z5vd6kDrZq9wO9KxKLNEickiTho6/H/dBNRVSos4= -helm.sh/helm/v3 v3.18.5/go.mod h1:L/dXDR2r539oPlFP1PJqKAC1CUgqHJDLkxKpDGrWnyg= -k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8= -k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE= -k8s.io/apiextensions-apiserver v0.33.3 h1:qmOcAHN6DjfD0v9kxL5udB27SRP6SG/MTopmge3MwEs= -k8s.io/apiextensions-apiserver v0.33.3/go.mod h1:oROuctgo27mUsyp9+Obahos6CWcMISSAPzQ77CAQGz8= -k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA= -k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= 
-k8s.io/apiserver v0.33.3 h1:Wv0hGc+QFdMJB4ZSiHrCgN3zL3QRatu56+rpccKC3J4= -k8s.io/apiserver v0.33.3/go.mod h1:05632ifFEe6TxwjdAIrwINHWE2hLwyADFk5mBsQa15E= -k8s.io/cli-runtime v0.33.3 h1:Dgy4vPjNIu8LMJBSvs8W0LcdV0PX/8aGG1DA1W8lklA= -k8s.io/cli-runtime v0.33.3/go.mod h1:yklhLklD4vLS8HNGgC9wGiuHWze4g7x6XQZ+8edsKEo= -k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA= -k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg= -k8s.io/component-base v0.33.3 h1:mlAuyJqyPlKZM7FyaoM/LcunZaaY353RXiOd2+B5tGA= -k8s.io/component-base v0.33.3/go.mod h1:ktBVsBzkI3imDuxYXmVxZ2zxJnYTZ4HAsVj9iF09qp4= -k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= -k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= -k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/kubectl v0.33.3 h1:r/phHvH1iU7gO/l7tTjQk2K01ER7/OAJi8uFHHyWSac= -k8s.io/kubectl v0.33.3/go.mod h1:euj2bG56L6kUGOE/ckZbCoudPwuj4Kud7BR0GzyNiT0= -k8s.io/kubelet v0.32.3 h1:B9HzW4yB67flx8tN2FYuDwZvxnmK3v5EjxxFvOYjmc8= -k8s.io/kubelet v0.32.3/go.mod h1:yyAQSCKC+tjSlaFw4HQG7Jein+vo+GeKBGdXdQGvL1U= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e h1:KqK5c/ghOm8xkHYhlodbp6i6+r+ChV2vuAuVRdFbLro= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +helm.sh/helm/v3 v3.20.2 h1:binM4rvPx5DcNsa1sIt7UZi55lRbu3pZUFmQkSoRh48= +helm.sh/helm/v3 v3.20.2/go.mod h1:Fl1kBaWCpkUrM6IYXPjQ3bdZQfFrogKArqptvueZ6Ww= +k8s.io/api v0.36.0 h1:SgqDhZzHdOtMk40xVSvCXkP9ME0H05hPM3p9AB1kL80= +k8s.io/api v0.36.0/go.mod h1:m1LVrGPNYax5NBHdO+QuAedXyuzTt4RryI/qnmNvs34= +k8s.io/apiextensions-apiserver v0.36.0 h1:Wt7E8J+VBCbj4FjiBfDTK/neXDDjyJVJc7xfuOHImZ0= +k8s.io/apiextensions-apiserver v0.36.0/go.mod h1:kGDjH0msuiIB3tgsYRV0kS9GqpMYMUsQ3GHv7TApyug= 
+k8s.io/apimachinery v0.36.0 h1:jZyPzhd5Z+3h9vJLt0z9XdzW9VzNzWAUw+P1xZ9PXtQ= +k8s.io/apimachinery v0.36.0/go.mod h1:FklypaRJt6n5wUIwWXIP6GJlIpUizTgfo1T/As+Tyxc= +k8s.io/apiserver v0.36.0 h1:Jg5OFAENUACByUCg15CmhZAYrr5ZyJ+jodyA1mHl3YE= +k8s.io/apiserver v0.36.0/go.mod h1:mHvwdHf+qKEm+1/hYm756SV+oREOKSPnsjagOpx6Vho= +k8s.io/cli-runtime v0.36.0 h1:HNxciQpQMMOKS0/GiUXcKDyA6J2FDILJj9NmP2BZrTg= +k8s.io/cli-runtime v0.36.0/go.mod h1:KObkknK9Ro5LYX+1RdiKc7C8CvGg4aX+V/Zv+E8WPHA= +k8s.io/client-go v0.36.0 h1:pOYi7C4RHChYjMiHpZSpSbIM6ZxVbRXBy7CuiIwqA3c= +k8s.io/client-go v0.36.0/go.mod h1:ZKKcpwF0aLYfkHFCjillCKaTK/yBkEDHTDXCFY6AS9Y= +k8s.io/component-base v0.36.0 h1:hFjEktssxiJhrK1zfybkH4kJOi8iZuF+mIDCqS5+jRo= +k8s.io/component-base v0.36.0/go.mod h1:JZvIfcNHk+uck+8LhJzhSBtydWXaZNQwX2OdL+Mnwsk= +k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= +k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= +k8s.io/kube-openapi v0.0.0-20260414162039-ec9c827d403f h1:4Qiq0YAoQATdgmHALJWz9rJ4fj20pB3xebpB4CFNhYM= +k8s.io/kube-openapi v0.0.0-20260414162039-ec9c827d403f/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= +k8s.io/kubectl v0.36.0 h1:hEGr8NvIm2Wjqs2Xy48Uzmvo6lpHdGKlLyMvau2gTms= +k8s.io/kubectl v0.36.0/go.mod h1:iDe8aV5BEi45W8k+5n71I2pJ/nwE0PHDu+/2cejzYoo= +k8s.io/kubelet v0.36.0 h1:zWeevZeGl80DInNU6WUo13yWmgbEajkRaBFqeKqkweA= +k8s.io/kubelet v0.36.0/go.mod h1:PLROV2RwWJkSbAkdZ8HeJWsbsjEEEMlhRIEzAwGeU9c= +k8s.io/streaming v0.36.0 h1:agnTxU+NFulUrtYzXUGKO3ndEa8jKwht1Kwn9nu9x+4= +k8s.io/streaming v0.36.0/go.mod h1:z6fV3D+NVkoeqRMtWwlUZK6U17SY/LqNzOxWL6GyR/s= +k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 h1:kBawHLSnx/mYHmRnNUf9d4CpjREbeZuxoSGOX/J+aYM= +k8s.io/utils v0.0.0-20260319190234-28399d86e0b5/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= -sigs.k8s.io/json 
v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ= -sigs.k8s.io/kustomize/api v0.19.0/go.mod h1:/BbwnivGVcBh1r+8m3tH1VNxJmHSk1PzP5fkP6lbL1o= -sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= -sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/kustomize/api v0.21.1 h1:lzqbzvz2CSvsjIUZUBNFKtIMsEw7hVLJp0JeSIVmuJs= +sigs.k8s.io/kustomize/api v0.21.1/go.mod h1:f3wkKByTrgpgltLgySCntrYoq5d3q7aaxveSagwTlwI= +sigs.k8s.io/kustomize/kyaml v0.21.1 h1:IVlbmhC076nf6foyL6Taw4BkrLuEsXUXNpsE+ScX7fI= +sigs.k8s.io/kustomize/kyaml v0.21.1/go.mod h1:hmxADesM3yUN2vbA5z1/YTBnzLJ1dajdqpQonwBL1FQ= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.4.0 h1:qmp2e3ZfFi1/jJbDGpD4mt3wyp6PE1NfKHCYLqgNQJo= +sigs.k8s.io/structured-merge-diff/v6 v6.4.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= 
+sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/hack/VERSION b/hack/VERSION index a503815a..e37d6647 100644 --- a/hack/VERSION +++ b/hack/VERSION @@ -1,4 +1,4 @@ -OLD_DCGM_VERSION=4.5.1 -OLD_EXPORTER_VERSION=4.8.0 -NEW_DCGM_VERSION=4.5.2 -NEW_EXPORTER_VERSION=4.8.1 +OLD_DCGM_VERSION=4.5.2 +OLD_EXPORTER_VERSION=4.8.1 +NEW_DCGM_VERSION=4.5.3 +NEW_EXPORTER_VERSION=4.8.2 diff --git a/internal/mocks/pkg/dcgmprovider/mock_client.go b/internal/mocks/pkg/dcgmprovider/mock_client.go index 01e362bd..f5838193 100644 --- a/internal/mocks/pkg/dcgmprovider/mock_client.go +++ b/internal/mocks/pkg/dcgmprovider/mock_client.go @@ -170,11 +170,12 @@ func (mr *MockDCGMMockRecorder) EntityGetLatestValues(arg0, arg1, arg2 any) *gom } // FieldGetByID mocks base method. -func (m *MockDCGM) FieldGetByID(arg0 dcgm.Short) dcgm.FieldMeta { +func (m *MockDCGM) FieldGetByID(arg0 dcgm.Short) (dcgm.FieldMeta, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "FieldGetByID", arg0) ret0, _ := ret[0].(dcgm.FieldMeta) - return ret0 + ret1, _ := ret[1].(error) + return ret0, ret1 } // FieldGetByID indicates an expected call of FieldGetByID. diff --git a/internal/mocks/pkg/nvmlprovider/mock_client.go b/internal/mocks/pkg/nvmlprovider/mock_client.go index 3bece5be..c071cdef 100644 --- a/internal/mocks/pkg/nvmlprovider/mock_client.go +++ b/internal/mocks/pkg/nvmlprovider/mock_client.go @@ -66,34 +66,34 @@ func (mr *MockNVMLMockRecorder) Cleanup() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockNVML)(nil).Cleanup)) } -// GetDeviceProcessMemory mocks base method. -func (m *MockNVML) GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error) { +// GetAllMIGDevicesProcessMemory mocks base method. 
+func (m *MockNVML) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetDeviceProcessMemory", gpuUUID) - ret0, _ := ret[0].(map[uint32]uint64) + ret := m.ctrl.Call(m, "GetAllMIGDevicesProcessMemory", parentGPUUUID) + ret0, _ := ret[0].(map[uint]map[uint32]uint64) ret1, _ := ret[1].(error) return ret0, ret1 } -// GetDeviceProcessMemory indicates an expected call of GetDeviceProcessMemory. -func (mr *MockNVMLMockRecorder) GetDeviceProcessMemory(gpuUUID any) *gomock.Call { +// GetAllMIGDevicesProcessMemory indicates an expected call of GetAllMIGDevicesProcessMemory. +func (mr *MockNVMLMockRecorder) GetAllMIGDevicesProcessMemory(parentGPUUUID any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceProcessMemory", reflect.TypeOf((*MockNVML)(nil).GetDeviceProcessMemory), gpuUUID) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllMIGDevicesProcessMemory", reflect.TypeOf((*MockNVML)(nil).GetAllMIGDevicesProcessMemory), parentGPUUUID) } -// GetAllMIGDevicesProcessMemory mocks base method. -func (m *MockNVML) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) { +// GetDeviceProcessMemory mocks base method. +func (m *MockNVML) GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetAllMIGDevicesProcessMemory", parentGPUUUID) - ret0, _ := ret[0].(map[uint]map[uint32]uint64) + ret := m.ctrl.Call(m, "GetDeviceProcessMemory", gpuUUID) + ret0, _ := ret[0].(map[uint32]uint64) ret1, _ := ret[1].(error) return ret0, ret1 } -// GetAllMIGDevicesProcessMemory indicates an expected call of GetAllMIGDevicesProcessMemory. -func (mr *MockNVMLMockRecorder) GetAllMIGDevicesProcessMemory(parentGPUUUID any) *gomock.Call { +// GetDeviceProcessMemory indicates an expected call of GetDeviceProcessMemory. 
+func (mr *MockNVMLMockRecorder) GetDeviceProcessMemory(gpuUUID any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllMIGDevicesProcessMemory", reflect.TypeOf((*MockNVML)(nil).GetAllMIGDevicesProcessMemory), parentGPUUUID) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceProcessMemory", reflect.TypeOf((*MockNVML)(nil).GetDeviceProcessMemory), gpuUUID) } // GetDeviceProcessUtilization mocks base method. diff --git a/internal/pkg/appconfig/types.go b/internal/pkg/appconfig/types.go index ce1e02a2..85385762 100644 --- a/internal/pkg/appconfig/types.go +++ b/internal/pkg/appconfig/types.go @@ -76,4 +76,5 @@ type Config struct { DisableStartupValidate bool EnableGPUBindUnbindWatch bool // Enable GPU bind/unbind event monitoring GPUBindUnbindPollInterval time.Duration // Poll interval for GPU bind/unbind events + EnablePprof bool // Enable /debug/pprof/ HTTP endpoints } diff --git a/internal/pkg/collector/collector_factory.go b/internal/pkg/collector/collector_factory.go index 1c011a75..ad6cd193 100644 --- a/internal/pkg/collector/collector_factory.go +++ b/internal/pkg/collector/collector_factory.go @@ -128,7 +128,6 @@ func (cf *collectorFactory) NewCollectors() []EntityCollectorTuple { if IsDCGMExpP2PStatusEnabled(cf.counterSet.ExporterCounters) { newCollector, err := cf.enableExpCollector(counters.DCGMExpP2PStatus) - if err != nil { slog.Error(fmt.Sprintf("collector '%s' cannot be initialized; err: %v", counters.DCGMExpP2PStatus, err)) os.Exit(1) diff --git a/internal/pkg/collector/gpu_collector_test.go b/internal/pkg/collector/gpu_collector_test.go index bc09f6da..6ab14092 100644 --- a/internal/pkg/collector/gpu_collector_test.go +++ b/internal/pkg/collector/gpu_collector_test.go @@ -103,13 +103,13 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { mi := devicemonitoring.Info{ DeviceInfo: dcgm.Device{ - UUID: "fake0", - Identifiers: dcgm.DeviceIdentifiers{ - Model: "NVIDIA T400 
4GB", - }, - PCI: dcgm.PCIInfo{ - BusID: "00000000:0000:0000.0", - }, + UUID: "fake0", + Identifiers: dcgm.DeviceIdentifiers{ + Model: "NVIDIA T400 4GB", + }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, }, } diff --git a/internal/pkg/collector/gpu_health_collector.go b/internal/pkg/collector/gpu_health_collector.go index 9b09a209..6383ca38 100644 --- a/internal/pkg/collector/gpu_health_collector.go +++ b/internal/pkg/collector/gpu_health_collector.go @@ -45,6 +45,59 @@ var gpuHealthChecks = []dcgm.HealthSystem{ dcgm.DCGM_HEALTH_WATCH_THERMAL, dcgm.DCGM_HEALTH_WATCH_POWER, dcgm.DCGM_HEALTH_WATCH_DRIVER, + // DCGM reports devastating GPU-wide XIDs (e.g. XID 79 fallen off bus, XID 95 uncontained ECC) + // under DCGM_HEALTH_WATCH_ALL rather than any specific subsystem. Surface them as their own + // health_watch="ALL" time series instead of dropping them or forcing them into a subsystem. + dcgm.DCGM_HEALTH_WATCH_ALL, +} + +// initGPUHealthEntityIncidentDefaults ensures byEntity[entity] is a non-nil map of PASS incidents +// for every health watch we export. Skips if that entity already has a non-nil inner map. +func initGPUHealthEntityIncidentDefaults( + byEntity map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident, + entity dcgm.GroupEntityPair, +) { + if inner, exists := byEntity[entity]; exists && inner != nil { + return + } + inner := make(map[dcgm.HealthSystem]dcgm.Incident) + for _, healthSystem := range gpuHealthChecks { + inner[healthSystem] = dcgm.Incident{ + System: healthSystem, + Health: dcgm.DCGM_HEALTH_RESULT_PASS, + Error: dcgm.DiagErrorDetail{}, + } + } + byEntity[entity] = inner +} + +// applyGPUHealthIncidents overlays HealthCheck results onto byEntity: each incident replaces the +// stored incident for that entity and health system. Unknown entities or entities with +// uninitialized incident maps are skipped. 
+func applyGPUHealthIncidents( + byEntity map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident, + incidents []dcgm.Incident, +) { + for _, incident := range incidents { + incidentMap, ok := byEntity[incident.EntityInfo] + if !ok { + logrus.WithFields(logrus.Fields{ + "entity": incident.EntityInfo, + "system": healthSystemWatchToString(incident.System), + "health": incident.Health, + }).Warn("Received health incident for entity not in monitoring group, skipping") + continue + } + if incidentMap == nil { + logrus.WithFields(logrus.Fields{ + "entity": incident.EntityInfo, + "system": healthSystemWatchToString(incident.System), + "health": incident.Health, + }).Warn("Received health incident for entity with uninitialized incident defaults, skipping") + continue + } + incidentMap[incident.System] = incident + } } type gpuHealthStatusCollector struct { @@ -95,30 +148,22 @@ func (c *gpuHealthStatusCollector) GetMetrics() (MetricsByCounter, error) { entityHealthSystemToIncident := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} for _, mi := range monitoringInfoInGroup { - entityHealthSystemToIncident[mi.Entity] = make(map[dcgm.HealthSystem]dcgm.Incident) - // Populate the table with default values - for _, healthSystem := range gpuHealthChecks { - entityHealthSystemToIncident[mi.Entity][healthSystem] = dcgm.Incident{ - System: healthSystem, - Health: dcgm.DCGM_HEALTH_RESULT_PASS, - Error: dcgm.DiagErrorDetail{}, - } - } + initGPUHealthEntityIncidentDefaults(entityHealthSystemToIncident, mi.Entity) } - // We assyme that each health check may produce only one incident per system - for _, incident := range gpuHealthStatus.Incidents { - if _, exists := entityHealthSystemToIncident[incident.EntityInfo]; !exists { - logrus.WithFields(logrus.Fields{ - "entity": incident.EntityInfo, - "system": healthSystemWatchToString(incident.System), - "health": incident.Health, - }).Warn("Received health incident for entity not in monitoring group, skipping") - continue 
+ // Seed PASS defaults for every FE_GPU DCGM reports in this health group before incidents are + // merged. This is defensive only: metrics are still emitted only for monitoringInfoInGroup, but + // pre-seeding keeps incident merging safe if DCGM reports an FE_GPU whose defaults were not + // initialized in the monitored-entity pass above. + for _, entityPair := range groupInfo.EntityList { + if entityPair.EntityGroupId == dcgm.FE_GPU { + initGPUHealthEntityIncidentDefaults(entityHealthSystemToIncident, entityPair) } - entityHealthSystemToIncident[incident.EntityInfo][incident.System] = incident } + // Each health watch may contribute at most one incident per entity for this scrape. + applyGPUHealthIncidents(entityHealthSystemToIncident, gpuHealthStatus.Incidents) + labels := map[string]string{} for _, mi := range monitoringInfoInGroup { @@ -264,12 +309,14 @@ var healthSystemWatchToStringMap = map[dcgm.HealthSystem]string{ dcgm.DCGM_HEALTH_WATCH_DRIVER: "DRIVER", dcgm.DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL: "NVSWITCH_NONFATAL", dcgm.DCGM_HEALTH_WATCH_NVSWITCH_FATAL: "NVSWITCH_FATAL", + dcgm.DCGM_HEALTH_WATCH_CONNECTX: "CONNECTX", + dcgm.DCGM_HEALTH_WATCH_ALL: "ALL", } -func healthSystemWatchToString(heathSystem dcgm.HealthSystem) string { - name, ok := healthSystemWatchToStringMap[heathSystem] +func healthSystemWatchToString(healthSystem dcgm.HealthSystem) string { + name, ok := healthSystemWatchToStringMap[healthSystem] if !ok { - return "" + return fmt.Sprintf("UNKNOWN(%d)", healthSystem) } return name } @@ -385,9 +432,33 @@ var healthCheckErrorToStringMap = map[dcgm.HealthCheckErrorCode]string{ dcgm.DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION: "DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION", dcgm.DCGM_FR_CUDA_FM_NOT_INITIALIZED: "DCGM_FR_CUDA_FM_NOT_INITIALIZED", dcgm.DCGM_FR_SXID_ERROR: "DCGM_FR_SXID_ERROR", + dcgm.DCGM_FR_GFLOPS_THRESHOLD_VIOLATION: "DCGM_FR_GFLOPS_THRESHOLD_VIOLATION", + dcgm.DCGM_FR_NAN_VALUE: "DCGM_FR_NAN_VALUE", + 
dcgm.DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR: "DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR", + dcgm.DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE: "DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE", + dcgm.DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE: "DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE", + dcgm.DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE: "DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE", + dcgm.DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE: "DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE", + dcgm.DCGM_FR_TEST_SKIPPED: "DCGM_FR_TEST_SKIPPED", + dcgm.DCGM_FR_SRAM_THRESHOLD: "DCGM_FR_SRAM_THRESHOLD", + dcgm.DCGM_FR_NVLINK_EFFECTIVE_BER_THRESHOLD: "DCGM_FR_NVLINK_EFFECTIVE_BER_THRESHOLD", + dcgm.DCGM_FR_FALLEN_OFF_BUS: "DCGM_FR_FALLEN_OFF_BUS", + dcgm.DCGM_FR_NVLINK_SYMBOL_BER_THRESHOLD: "DCGM_FR_NVLINK_SYMBOL_BER_THRESHOLD", + dcgm.DCGM_FR_IMEX_UNHEALTHY: "DCGM_FR_IMEX_UNHEALTHY", + dcgm.DCGM_FR_FABRIC_PROBE_STATE: "DCGM_FR_FABRIC_PROBE_STATE", + dcgm.DCGM_FR_BINARY_PERMISSIONS: "DCGM_FR_BINARY_PERMISSIONS", + dcgm.DCGM_FR_GPU_RECOVERY_RESET: "DCGM_FR_GPU_RECOVERY_RESET", + dcgm.DCGM_FR_GPU_RECOVERY_REBOOT: "DCGM_FR_GPU_RECOVERY_REBOOT", + dcgm.DCGM_FR_GPU_RECOVERY_DRAIN_P2P: "DCGM_FR_GPU_RECOVERY_DRAIN_P2P", + dcgm.DCGM_FR_GPU_RECOVERY_DRAIN_RESET: "DCGM_FR_GPU_RECOVERY_DRAIN_RESET", + dcgm.DCGM_FR_NCCL_ERROR: "DCGM_FR_NCCL_ERROR", + dcgm.DCGM_FR_RETEST_REQUESTED: "DCGM_FR_RETEST_REQUESTED", dcgm.DCGM_FR_ERROR_SENTINEL: "DCGM_FR_ERROR_SENTINEL", } func healthCheckErrorToString(err dcgm.HealthCheckErrorCode) string { - return healthCheckErrorToStringMap[err] + if name, ok := healthCheckErrorToStringMap[err]; ok { + return name + } + return fmt.Sprintf("DCGM_FR_UNKNOWN(%d)", err) } diff --git a/internal/pkg/collector/gpu_health_collector_test.go b/internal/pkg/collector/gpu_health_collector_test.go index 9d54934b..36ada351 100644 --- a/internal/pkg/collector/gpu_health_collector_test.go +++ b/internal/pkg/collector/gpu_health_collector_test.go @@ -106,6 +106,17 @@ func TestNewGPUHealthStatusCollector(t *testing.T) { } 
func setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProvider(t *testing.T, mockDCGMProvider *mockdcgm.MockDCGM) { + t.Helper() + setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProviderWithGroup(t, mockDCGMProvider, + []dcgm.GroupEntityPair{{EntityId: uint(0), EntityGroupId: dcgm.FE_GPU}}, nil) +} + +func setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProviderWithGroup( + t *testing.T, + mockDCGMProvider *mockdcgm.MockDCGM, + groupEntities []dcgm.GroupEntityPair, + healthResponseOverride *dcgm.HealthResponse, +) { t.Helper() mockDCGMProvider.EXPECT().GetSupportedDevices().Return([]uint{0}, nil).AnyTimes() mockDCGMProvider.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { @@ -146,12 +157,13 @@ func setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProvider(t *testin }, }, } + if healthResponseOverride != nil { + healthCheckResponse = *healthResponseOverride + } mockDCGMProvider.EXPECT().HealthCheck(gomock.Any()).Return(healthCheckResponse, nil).AnyTimes() mockDCGMProvider.EXPECT().GetGroupInfo(gomock.Any()).Return(&dcgm.GroupInfo{ - EntityList: []dcgm.GroupEntityPair{ - {EntityId: uint(0), EntityGroupId: dcgm.FE_GPU}, - }, + EntityList: groupEntities, }, nil).AnyTimes() } @@ -339,26 +351,352 @@ func TestIsDCGMExpGPUHealthStatusEnabled(t *testing.T) { func TestHealthSystemWatchToString(t *testing.T) { type testCase struct { - name string - heathSystem dcgm.HealthSystem - expected string + name string + healthSystem dcgm.HealthSystem + expected string } testCases := []testCase{ { - name: "returns POWER when dcgm.DCGM_HEALTH_WATCH_POWER", - heathSystem: dcgm.DCGM_HEALTH_WATCH_POWER, - expected: "POWER", + name: "returns POWER when dcgm.DCGM_HEALTH_WATCH_POWER", + healthSystem: dcgm.DCGM_HEALTH_WATCH_POWER, + expected: "POWER", + }, + { + name: "returns ALL when dcgm.DCGM_HEALTH_WATCH_ALL", + healthSystem: dcgm.DCGM_HEALTH_WATCH_ALL, + expected: "ALL", }, { - name: "returns empty string when dcgm.HealthSystem is unknown", - 
heathSystem: dcgm.HealthSystem(100500), - expected: "", + name: "returns CONNECTX when dcgm.DCGM_HEALTH_WATCH_CONNECTX", + healthSystem: dcgm.DCGM_HEALTH_WATCH_CONNECTX, + expected: "CONNECTX", + }, + { + name: "returns UNKNOWN(N) when dcgm.HealthSystem is unknown", + healthSystem: dcgm.HealthSystem(100500), + expected: "UNKNOWN(100500)", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual := healthSystemWatchToString(tc.healthSystem) + assert.Equal(t, tc.expected, actual) + }) + } +} + +func TestInitGPUHealthEntityIncidentDefaults_ReplacesNilInnerMap(t *testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{ + key: nil, + } + + initGPUHealthEntityIncidentDefaults(byEntity, key) + + require.NotNil(t, byEntity[key]) + for _, hs := range gpuHealthChecks { + inc := byEntity[key][hs] + assert.Equal(t, hs, inc.System) + assert.Equal(t, dcgm.DCGM_HEALTH_RESULT_PASS, inc.Health) + } +} + +func TestInitGPUHealthEntityIncidentDefaults_Idempotent(t *testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 1} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} + + initGPUHealthEntityIncidentDefaults(byEntity, key) + first := byEntity[key] + initGPUHealthEntityIncidentDefaults(byEntity, key) + require.Equal(t, first, byEntity[key]) +} + +func TestApplyGPUHealthIncidents_SkipsNilInnerMapWithoutPanic(t *testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{ + key: nil, + } + inc := dcgm.Incident{ + EntityInfo: key, + System: dcgm.DCGM_HEALTH_WATCH_THERMAL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + } + + assert.NotPanics(t, func() { + applyGPUHealthIncidents(byEntity, []dcgm.Incident{inc}) + }) + assert.Nil(t, byEntity[key]) +} + +func TestApplyGPUHealthIncidents_AppliesIncident(t 
*testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} + initGPUHealthEntityIncidentDefaults(byEntity, key) + + inc := dcgm.Incident{ + EntityInfo: key, + System: dcgm.DCGM_HEALTH_WATCH_THERMAL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{ + Message: "thermal", + Code: dcgm.DCGM_FR_THERMAL_VIOLATIONS, + }, + } + applyGPUHealthIncidents(byEntity, []dcgm.Incident{inc}) + + got := byEntity[key][dcgm.DCGM_HEALTH_WATCH_THERMAL] + assert.Equal(t, dcgm.DCGM_HEALTH_RESULT_FAIL, got.Health) + assert.Equal(t, "thermal", got.Error.Message) +} + +func TestApplyGPUHealthIncidents_SkipsUnknownEntity(t *testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} + initGPUHealthEntityIncidentDefaults(byEntity, key) + + unknown := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 999} + inc := dcgm.Incident{ + EntityInfo: unknown, + System: dcgm.DCGM_HEALTH_WATCH_THERMAL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + } + + assert.NotPanics(t, func() { + applyGPUHealthIncidents(byEntity, []dcgm.Incident{inc}) + }) + _, ok := byEntity[unknown] + assert.False(t, ok) +} + +// GetGroupInfo can list an FE_GPU that is not part of monitoringInfoInGroup. This is a defensive +// regression test for group-vs-monitoring divergence: we only assert that GetMetrics completes +// without panic because this collector still emits metrics only for monitoringInfoInGroup. 
+func TestGPUHealthStatusCollector_GetMetrics_HealthGroupContainsUnmonitoredGPU_NoPanic(t *testing.T) { + ctrl := gomock.NewController(t) + + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGMProvider) + + groupList := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU, EntityId: 0}, + {EntityGroupId: dcgm.FE_GPU, EntityId: 99}, + } + customHealth := &dcgm.HealthResponse{ + OverallHealth: dcgm.DCGM_HEALTH_RESULT_FAIL, + Incidents: []dcgm.Incident{ + { + System: dcgm.DCGM_HEALTH_WATCH_THERMAL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{ + Message: "boom!", + Code: dcgm.DCGM_FR_THERMAL_VIOLATIONS, + }, + EntityInfo: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 99}, + }, }, } + setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProviderWithGroup(t, mockDCGMProvider, groupList, customHealth) + + counterList := counters.CounterList{ + {FieldName: counters.DCGMExpGPUHealthStatus}, + } + collector, err := NewGPUHealthStatusCollector(counterList, + "", + &appconfig.Config{}, + getDefaultDeviceWatchListForGPUHealthStatusCollectorMockDCGMProvider(ctrl), + ) + require.NoError(t, err) + + assert.NotPanics(t, func() { + metrics, gErr := collector.GetMetrics() + require.NoError(t, gErr) + require.Len(t, metrics, 1) + for _, values := range metrics { + assert.Len(t, values, len(gpuHealthChecks)) + } + }) +} + +func TestHealthCheckErrorToString(t *testing.T) { + testCases := []struct { + name string + code dcgm.HealthCheckErrorCode + expected string + }{ + { + name: "returns DCGM_FR_OK for DCGM_FR_OK", + code: dcgm.DCGM_FR_OK, + expected: "DCGM_FR_OK", + }, + { + name: "returns DCGM_FR_FALLEN_OFF_BUS for DCGM_FR_FALLEN_OFF_BUS", + code: dcgm.DCGM_FR_FALLEN_OFF_BUS, + expected: "DCGM_FR_FALLEN_OFF_BUS", + }, + { + name: "returns DCGM_FR_GFLOPS_THRESHOLD_VIOLATION (110)", + code: 
dcgm.DCGM_FR_GFLOPS_THRESHOLD_VIOLATION, + expected: "DCGM_FR_GFLOPS_THRESHOLD_VIOLATION", + }, + { + name: "returns DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE (116)", + code: dcgm.DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE, + expected: "DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE", + }, + { + name: "returns DCGM_FR_UNKNOWN(N) for unmapped code", + code: dcgm.HealthCheckErrorCode(424242), + expected: "DCGM_FR_UNKNOWN(424242)", + }, + } for _, tc := range testCases { - actual := healthSystemWatchToString(tc.heathSystem) - assert.Equal(t, tc.expected, actual) + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, healthCheckErrorToString(tc.code)) + }) + } +} + +func TestApplyGPUHealthIncidents_WatchAllIncidentRoutedToAllSlot(t *testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} + initGPUHealthEntityIncidentDefaults(byEntity, key) + + inc := dcgm.Incident{ + EntityInfo: key, + System: dcgm.DCGM_HEALTH_WATCH_ALL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{ + Message: "GPU fallen off bus", + Code: dcgm.DCGM_FR_FALLEN_OFF_BUS, + }, + } + applyGPUHealthIncidents(byEntity, []dcgm.Incident{inc}) + + got := byEntity[key][dcgm.DCGM_HEALTH_WATCH_ALL] + assert.Equal(t, dcgm.DCGM_HEALTH_RESULT_FAIL, got.Health) + assert.Equal(t, dcgm.DCGM_FR_FALLEN_OFF_BUS, got.Error.Code) + + // All other subsystem slots must stay PASS -- devastating XIDs belong to the ALL slot only, + // they are not fanned out. 
+ for _, hs := range gpuHealthChecks { + if hs == dcgm.DCGM_HEALTH_WATCH_ALL { + continue + } + assert.Equalf(t, dcgm.DCGM_HEALTH_RESULT_PASS, byEntity[key][hs].Health, + "subsystem %v should not be marked FAIL by a DCGM_HEALTH_WATCH_ALL incident", + healthSystemWatchToString(hs)) + } +} + +func TestApplyGPUHealthIncidents_WatchAllAndSubsystemIndependent(t *testing.T) { + key := dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0} + byEntity := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} + initGPUHealthEntityIncidentDefaults(byEntity, key) + + incidents := []dcgm.Incident{ + { + EntityInfo: key, + System: dcgm.DCGM_HEALTH_WATCH_PCIE, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{Code: dcgm.DCGM_FR_PCI_REPLAY_RATE}, + }, + { + EntityInfo: key, + System: dcgm.DCGM_HEALTH_WATCH_ALL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{Code: dcgm.DCGM_FR_FALLEN_OFF_BUS}, + }, + } + applyGPUHealthIncidents(byEntity, incidents) + + assert.Equal(t, dcgm.DCGM_HEALTH_RESULT_FAIL, byEntity[key][dcgm.DCGM_HEALTH_WATCH_PCIE].Health) + assert.Equal(t, dcgm.DCGM_FR_PCI_REPLAY_RATE, byEntity[key][dcgm.DCGM_HEALTH_WATCH_PCIE].Error.Code) + assert.Equal(t, dcgm.DCGM_HEALTH_RESULT_FAIL, byEntity[key][dcgm.DCGM_HEALTH_WATCH_ALL].Health) + assert.Equal(t, dcgm.DCGM_FR_FALLEN_OFF_BUS, byEntity[key][dcgm.DCGM_HEALTH_WATCH_ALL].Error.Code) + + for _, hs := range gpuHealthChecks { + if hs == dcgm.DCGM_HEALTH_WATCH_PCIE || hs == dcgm.DCGM_HEALTH_WATCH_ALL { + continue + } + assert.Equalf(t, dcgm.DCGM_HEALTH_RESULT_PASS, byEntity[key][hs].Health, + "subsystem %v should remain PASS", healthSystemWatchToString(hs)) + } +} + +// TestGPUHealthStatusCollector_GetMetrics_WatchAllIncident exercises the full scrape path with an +// injected DCGM_HEALTH_WATCH_ALL incident (representing a devastating XID such as GPU fallen off +// bus). 
It asserts the emitted row shape: one row per entry in gpuHealthChecks, with the ALL row +// carrying the incident. +func TestGPUHealthStatusCollector_GetMetrics_WatchAllIncident(t *testing.T) { + ctrl := gomock.NewController(t) + + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGMProvider) + + groupList := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU, EntityId: 0}, + } + customHealth := &dcgm.HealthResponse{ + OverallHealth: dcgm.DCGM_HEALTH_RESULT_FAIL, + Incidents: []dcgm.Incident{ + { + System: dcgm.DCGM_HEALTH_WATCH_ALL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{ + Message: "GPU has fallen off the bus", + Code: dcgm.DCGM_FR_FALLEN_OFF_BUS, + }, + EntityInfo: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0}, + }, + }, + } + setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProviderWithGroup(t, mockDCGMProvider, groupList, customHealth) + + counterList := counters.CounterList{ + {FieldName: counters.DCGMExpGPUHealthStatus}, + } + + collector, err := NewGPUHealthStatusCollector(counterList, + "", + &appconfig.Config{}, + getDefaultDeviceWatchListForGPUHealthStatusCollectorMockDCGMProvider(ctrl), + ) + require.NoError(t, err) + + metrics, err := collector.GetMetrics() + require.NoError(t, err) + require.Len(t, metrics, 1) + + var values []Metric + for _, v := range metrics { + values = v + } + assert.Len(t, values, len(gpuHealthChecks), "expected one row per entry in gpuHealthChecks") + + var allRowFound bool + for _, value := range values { + healthWatch := value.Labels["health_watch"] + healthErrorCode := value.Labels["health_error_code"] + switch healthWatch { + case "ALL": + allRowFound = true + assert.Equal(t, "20", value.Value, "ALL row should carry FAIL (20) when a devastating XID is reported") + assert.Equal(t, "DCGM_FR_FALLEN_OFF_BUS", healthErrorCode) + default: + 
assert.Equalf(t, "0", value.Value, "subsystem %q should remain PASS when only a DCGM_HEALTH_WATCH_ALL incident is present", healthWatch) + assert.Equal(t, "DCGM_FR_OK", healthErrorCode) + } } + assert.True(t, allRowFound, "expected a row with health_watch=ALL") } diff --git a/internal/pkg/dcgmprovider/dcgm.go b/internal/pkg/dcgmprovider/dcgm.go index 65f9ed51..1ddfd83d 100644 --- a/internal/pkg/dcgmprovider/dcgm.go +++ b/internal/pkg/dcgmprovider/dcgm.go @@ -71,7 +71,12 @@ func newDCGMProvider(config *appconfig.Config) DCGM { cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") if err != nil { // Don't call cleanup on error - initialization failed, nothing to clean up - slog.Error(err.Error()) + slog.Error("Failed to connect to remote hostengine", + slog.String("address", config.RemoteHEInfo), + slog.String("error", err.Error()), + slog.String("hint", "Verify nv-hostengine is running and listening on the expected address. "+ + "For IPv6, use bracket notation: []: (e.g., \"[::1]:5555\")"), + ) os.Exit(1) } client.shutdown = cleanup @@ -143,7 +148,7 @@ func (d dcgmProvider) Fv2_String(fv dcgm.FieldValue_v2) string { return dcgm.Fv2_String(fv) } -func (d dcgmProvider) FieldGetByID(fieldID dcgm.Short) dcgm.FieldMeta { +func (d dcgmProvider) FieldGetByID(fieldID dcgm.Short) (dcgm.FieldMeta, error) { return dcgm.FieldGetByID(fieldID) } diff --git a/internal/pkg/dcgmprovider/smart_init.go b/internal/pkg/dcgmprovider/smart_init.go index 49bb314c..53ae4c74 100644 --- a/internal/pkg/dcgmprovider/smart_init.go +++ b/internal/pkg/dcgmprovider/smart_init.go @@ -6,8 +6,9 @@ import ( "os" "testing" - "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" ) // SmartDCGMInit tries to initialize DCGM with embedded mode first, then falls back to remote if it fails diff --git a/internal/pkg/dcgmprovider/types.go b/internal/pkg/dcgmprovider/types.go index 
c7ed7801..e6797313 100644 --- a/internal/pkg/dcgmprovider/types.go +++ b/internal/pkg/dcgmprovider/types.go @@ -35,7 +35,7 @@ type DCGM interface { EntitiesGetLatestValues([]dcgm.GroupEntityPair, []dcgm.Short, uint) ([]dcgm.FieldValue_v2, error) EntityGetLatestValues(dcgm.Field_Entity_Group, uint, []dcgm.Short) ([]dcgm.FieldValue_v1, error) Fv2_String(fv dcgm.FieldValue_v2) string - FieldGetByID(dcgm.Short) dcgm.FieldMeta + FieldGetByID(dcgm.Short) (dcgm.FieldMeta, error) FieldGroupCreate(string, []dcgm.Short) (dcgm.FieldHandle, error) FieldGroupDestroy(dcgm.FieldHandle) error GetAllDeviceCount() (uint, error) diff --git a/internal/pkg/devicewatcher/device_watcher.go b/internal/pkg/devicewatcher/device_watcher.go index 732720cd..e598eada 100644 --- a/internal/pkg/devicewatcher/device_watcher.go +++ b/internal/pkg/devicewatcher/device_watcher.go @@ -94,14 +94,31 @@ func NewDeviceWatcher() *DeviceWatcher { func (d *DeviceWatcher) GetDeviceFields(counters []counters.Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short { var deviceFields []dcgm.Short + var failedCount int for _, counter := range counters { - fieldMeta := dcgmprovider.Client().FieldGetByID(counter.FieldID) + fieldMeta, err := dcgmprovider.Client().FieldGetByID(counter.FieldID) + if err != nil { + failedCount++ + slog.Debug("FieldGetByID failed; skipping field", + slog.Any("field_id", counter.FieldID), + slog.String(ErrorKey, err.Error()), + ) + continue + } if shouldIncludeField(entityType, fieldMeta.EntityLevel) { deviceFields = append(deviceFields, counter.FieldID) } } + if failedCount > 0 { + slog.Warn("Some fields were skipped because FieldGetByID failed", + slog.Int("failed_count", failedCount), + slog.Int("total_count", len(counters)), + slog.Any("entity_type", entityType), + ) + } + return deviceFields } diff --git a/internal/pkg/devicewatcher/device_watcher_test.go b/internal/pkg/devicewatcher/device_watcher_test.go index cf0d2f6c..004bc475 100644 --- 
a/internal/pkg/devicewatcher/device_watcher_test.go +++ b/internal/pkg/devicewatcher/device_watcher_test.go @@ -1833,7 +1833,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) } }, want: func() []dcgm.Short { @@ -1848,7 +1848,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) } }, want: func() []dcgm.Short { @@ -1866,7 +1866,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) } }, want: func() []dcgm.Short { @@ -1884,7 +1884,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) } }, want: func() []dcgm.Short { @@ -1902,7 +1902,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) 
} }, want: func() []dcgm.Short { @@ -1921,7 +1921,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) } }, want: func() []dcgm.Short { @@ -1939,7 +1939,7 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }, mockDCGMFunc: func(fieldIDs []dcgm.Short) { for _, fieldID := range fieldIDs { - mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + mockDCGM.EXPECT().FieldGetByID(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID], nil) } }, want: func() []dcgm.Short { @@ -1974,3 +1974,39 @@ func TestDeviceWatcher_GetDeviceFields(t *testing.T) { }) } } + +// TestDeviceWatcher_GetDeviceFields_FieldGetByIDError asserts that when +// FieldGetByID returns an error for a field, that field is silently skipped +// and the rest of the counter list is still processed. This exercises the +// error path introduced when go-dcgm's FieldGetByID gained an error return. +func TestDeviceWatcher_GetDeviceFields_FieldGetByIDError(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGM := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGM) + + // Two counters: the first fails FieldGetByID; the second succeeds and + // should still be returned by GetDeviceFields. + failingCounter := testutils.SampleGPUTempCounter + goodCounter := testutils.SampleGPUPowerUsageCounter + + mockDCGM.EXPECT(). + FieldGetByID(failingCounter.FieldID). + Return(dcgm.FieldMeta{}, fmt.Errorf("field lookup failed")) + mockDCGM.EXPECT(). + FieldGetByID(goodCounter.FieldID). 
+ Return(testutils.SampleFieldIDToFieldMeta[goodCounter.FieldID], nil) + + d := &DeviceWatcher{} + got := d.GetDeviceFields( + []counters.Counter{failingCounter, goodCounter}, + dcgm.FE_GPU, + ) + + assert.Equal(t, []dcgm.Short{goodCounter.FieldID}, got, + "failing field should be skipped; remaining field should be returned") +} diff --git a/internal/pkg/hostname/hostname_test.go b/internal/pkg/hostname/hostname_test.go index b50a59b2..db7d57ae 100644 --- a/internal/pkg/hostname/hostname_test.go +++ b/internal/pkg/hostname/hostname_test.go @@ -140,6 +140,54 @@ func TestGetHostname(t *testing.T) { }, want: "localhost", }, + { + name: "When appconfig.UseRemoteHE is true and remote address is IPv6 loopback with port", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "[::1]:5555", + }, + want: "::1", + }, + { + name: "When appconfig.UseRemoteHE is true and remote address is full IPv6 with port", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "[2001:db8::1]:5555", + }, + want: "2001:db8::1", + }, + { + name: "When appconfig.UseRemoteHE is true and remote address is IPv6 wildcard with port", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "[::]:5555", + }, + want: "::", + }, + { + name: "When appconfig.UseRemoteHE is true and remote address is IPv6 without port", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "[::1]", + }, + want: "[::1]", + }, + { + name: "When appconfig.UseRemoteHE is true and remote address is bare IPv6 without brackets or port", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "::1", + }, + want: "::1", + }, + { + name: "When appconfig.UseRemoteHE is true and remote address is empty", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "", + }, + want: "", + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/internal/pkg/integration_test/collector_test.go b/internal/pkg/integration_test/collector_test.go index 
7a2099e8..48b8b168 100644 --- a/internal/pkg/integration_test/collector_test.go +++ b/internal/pkg/integration_test/collector_test.go @@ -28,6 +28,7 @@ import ( "github.com/NVIDIA/go-dcgm/pkg/dcgm" io_prometheus_client "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "google.golang.org/grpc" @@ -738,7 +739,7 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { require.NoError(t, err) require.NotEmpty(t, b) - var parser expfmt.TextParser + parser := expfmt.NewTextParser(model.UTF8Validation) mf, err := parser.TextToMetricFamilies(&b) require.NoError(t, err) require.NotEmpty(t, mf) diff --git a/internal/pkg/nvmlprovider/provider.go b/internal/pkg/nvmlprovider/provider.go index 493d5e94..7633e0be 100644 --- a/internal/pkg/nvmlprovider/provider.go +++ b/internal/pkg/nvmlprovider/provider.go @@ -262,6 +262,11 @@ func (n nvmlProvider) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[u continue } + if giID < 0 { + slog.Debug("Skipping MIG device with negative GPU instance ID", "gpuInstanceID", giID) + continue + } + pidToMemory := make(map[uint32]uint64, len(processes)) for _, p := range processes { pidToMemory[p.Pid] = p.UsedGpuMemory diff --git a/internal/pkg/server/server.go b/internal/pkg/server/server.go index ccb51f34..147556e0 100644 --- a/internal/pkg/server/server.go +++ b/internal/pkg/server/server.go @@ -76,19 +76,23 @@ func NewMetricsServer( serverv1.reloadInProgress.Store(false) router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("X-Content-Type-Options", "nosniff") - _, err := w.Write([]byte(` - GPU Exporter - -

GPU Exporter

-

Metrics

-

Health

+ pprofHTML := "" + if c.EnablePprof { + pprofHTML = `

Profiling (pprof)

+ ` + } + _, err := w.Write([]byte(` + GPU Exporter + +

GPU Exporter

+

Metrics

+

Health

` + pprofHTML + ` `)) if err != nil { @@ -101,21 +105,21 @@ func NewMetricsServer( router.HandleFunc("/health", serverv1.Health) router.HandleFunc("/metrics", serverv1.Metrics) - // Register pprof endpoints for profiling and debugging - // Access via: curl http://localhost:9400/debug/pprof/heap > heap.pprof - router.HandleFunc("/debug/pprof/", pprof.Index) - router.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) - router.HandleFunc("/debug/pprof/profile", pprof.Profile) - router.HandleFunc("/debug/pprof/symbol", pprof.Symbol) - router.HandleFunc("/debug/pprof/trace", pprof.Trace) - router.Handle("/debug/pprof/heap", pprof.Handler("heap")) - router.Handle("/debug/pprof/goroutine", pprof.Handler("goroutine")) - router.Handle("/debug/pprof/threadcreate", pprof.Handler("threadcreate")) - router.Handle("/debug/pprof/block", pprof.Handler("block")) - router.Handle("/debug/pprof/mutex", pprof.Handler("mutex")) - router.Handle("/debug/pprof/allocs", pprof.Handler("allocs")) - - slog.Info("Profiling endpoints enabled at /debug/pprof/") + if c.EnablePprof { + router.HandleFunc("/debug/pprof/", pprof.Index) + router.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) + router.HandleFunc("/debug/pprof/profile", pprof.Profile) + router.HandleFunc("/debug/pprof/symbol", pprof.Symbol) + router.HandleFunc("/debug/pprof/trace", pprof.Trace) + router.Handle("/debug/pprof/heap", pprof.Handler("heap")) + router.Handle("/debug/pprof/goroutine", pprof.Handler("goroutine")) + router.Handle("/debug/pprof/threadcreate", pprof.Handler("threadcreate")) + router.Handle("/debug/pprof/block", pprof.Handler("block")) + router.Handle("/debug/pprof/mutex", pprof.Handler("mutex")) + router.Handle("/debug/pprof/allocs", pprof.Handler("allocs")) + + slog.Info("Profiling endpoints enabled at /debug/pprof/") + } var podMapper *transformation.PodMapper for _, t := range serverv1.transformations { diff --git a/internal/pkg/server/server_test.go b/internal/pkg/server/server_test.go index 
07fdad61..a953f646 100644 --- a/internal/pkg/server/server_test.go +++ b/internal/pkg/server/server_test.go @@ -27,6 +27,7 @@ import ( "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" mockcollectorpkg "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/collector" @@ -272,14 +273,13 @@ func TestHealthReturnsOK(t *testing.T) { assert.Equal(t, http.StatusOK, recorder.Code) } -func TestHealthDoesNotPanicWhenWriteError(t *testing.T) { +func TestHealthReturnsOKWhenWriteReturnsError(t *testing.T) { metricServer := &MetricsServer{} // Set a registry so the code path reaches the write call metricServer.registry.Store(registry.NewRegistry()) recorder := &mockResponseWriter{} - assert.NotPanics(t, func() { - metricServer.Health(recorder, nil) - }) + metricServer.Health(recorder, nil) + assert.Equal(t, http.StatusInternalServerError, recorder.Code) } func TestHealthReturnsOKWhenRegistryIsNil(t *testing.T) { @@ -312,3 +312,41 @@ func TestHealthReturnsOKWithRegistryAvailable(t *testing.T) { assert.Equal(t, "true", recorder.Header().Get("X-Registry-Available")) assert.NotEqual(t, "true", recorder.Header().Get("X-Reload-In-Progress")) } + +func TestPprofEndpointsDisabledByDefault(t *testing.T) { + ctrl := gomock.NewController(t) + mockManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + cfg := &appconfig.Config{Address: ":0"} + srv, cleanup, err := NewMetricsServer(cfg, mockManager, registry.NewRegistry()) + require.NoError(t, err) + defer cleanup() + + router := srv.server.Handler + + rec := httptest.NewRecorder() + router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/debug/pprof/", nil)) + assert.Equal(t, http.StatusNotFound, rec.Code) + + rec = httptest.NewRecorder() + router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil)) + assert.NotContains(t, rec.Body.String(), "pprof") +} + +func TestPprofEndpointsEnabledWhenFlagSet(t *testing.T) { + ctrl := 
gomock.NewController(t) + mockManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + cfg := &appconfig.Config{Address: ":0", EnablePprof: true} + srv, cleanup, err := NewMetricsServer(cfg, mockManager, registry.NewRegistry()) + require.NoError(t, err) + defer cleanup() + + router := srv.server.Handler + + rec := httptest.NewRecorder() + router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/debug/pprof/", nil)) + assert.Equal(t, http.StatusOK, rec.Code) + + rec = httptest.NewRecorder() + router.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil)) + assert.Contains(t, rec.Body.String(), "pprof") +} diff --git a/internal/pkg/testutils/test_utils.go b/internal/pkg/testutils/test_utils.go index ed738aca..c51a0620 100644 --- a/internal/pkg/testutils/test_utils.go +++ b/internal/pkg/testutils/test_utils.go @@ -195,6 +195,7 @@ func CreateTmpDir(t *testing.T) (string, func()) { } type MockPodResourcesServer struct { + v1.UnimplementedPodResourcesListerServer resourceName string gpus []string } diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 5f9f6324..1fe44e42 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -676,7 +676,7 @@ func (p *PodMapper) toDeviceToSharingPods(devicePods *podresourcesapi.ListPodRes // Check for potential integer overflow before conversion if migDevice.GPUInstanceID >= 0 { giIdentifier := deviceinfo.GetGPUInstanceIdentifier(deviceInfo, migDevice.ParentUUID, - uint(migDevice.GPUInstanceID)) + uint(migDevice.GPUInstanceID)) //nolint:gosec // G115: bounds checked above deviceToPodsMap[giIdentifier] = append(deviceToPodsMap[giIdentifier], podInfo) } } @@ -713,7 +713,6 @@ func (p *PodMapper) toDeviceToPod( devicePods *podresourcesapi.ListPodResourcesResponse, deviceInfo deviceinfo.Provider, ) map[string]PodInfo { deviceToPodMap := make(map[string]PodInfo) - uidToPodInfo := make(map[string]PodInfo) slog.Debug("Processing pod 
resources", "totalPods", len(devicePods.GetPodResources())) @@ -755,11 +754,6 @@ func (p *PodMapper) toDeviceToPod( podInfo := p.createPodInfo(pod, container) - // Store PodInfo by UID for process-based mapping correction - if podInfo.UID != "" { - uidToPodInfo[podInfo.UID] = podInfo - } - slog.Debug("Created pod info", "podInfo", fmt.Sprintf("%+v", podInfo), "podName", pod.GetName(), @@ -813,7 +807,7 @@ func (p *PodMapper) toDeviceToPod( // Check for potential integer overflow before conversion if migDevice.GPUInstanceID >= 0 { giIdentifier := deviceinfo.GetGPUInstanceIdentifier(deviceInfo, migDevice.ParentUUID, - uint(migDevice.GPUInstanceID)) + uint(migDevice.GPUInstanceID)) //nolint:gosec // G115: bounds checked above slog.Debug("Mapped MIG device to GPU instance", "deviceID", deviceID, "giIdentifier", giIdentifier, diff --git a/internal/pkg/transformation/kubernetes_mig_test.go b/internal/pkg/transformation/kubernetes_mig_test.go index 2111c12e..7d35e1c1 100644 --- a/internal/pkg/transformation/kubernetes_mig_test.go +++ b/internal/pkg/transformation/kubernetes_mig_test.go @@ -26,13 +26,14 @@ import ( "go.uber.org/mock/gomock" "google.golang.org/grpc" + podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1" + mocknvmlprovider "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/nvmlprovider" "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" - podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1" ) const sampleMetricsJSON = `{ @@ -372,298 +373,6 @@ const sampleMetricsJSON = `{ "labels": {}, "attributes": {} } - ], - "DCGM_FI_DEV_FB_RESERVED": [ - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." 
- }, - "value": "576", - "gpu": "0", - "gpu_uuid": "GPU-be839661-c0f5-7452-284b-b875666df60c", - "gpu_device": "nvidia0", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:1B:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." - }, - "value": "576", - "gpu": "1", - "gpu_uuid": "GPU-21c6d9d7-46cd-7e91-99c3-7b6a06a3faea", - "gpu_device": "nvidia1", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:43:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." - }, - "value": "576", - "gpu": "2", - "gpu_uuid": "GPU-5d9cc71f-b438-dc00-707d-c6c12bcfede1", - "gpu_device": "nvidia2", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:52:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." - }, - "value": "576", - "gpu": "3", - "gpu_uuid": "GPU-81d888ca-dd11-328c-45fa-d6807a1afa6a", - "gpu_device": "nvidia3", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:61:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." 
- }, - "value": "576", - "gpu": "4", - "gpu_uuid": "GPU-c4c7f4f8-af86-6966-c0b2-7c1e40c18347", - "gpu_device": "nvidia4", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:9D:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." - }, - "value": "576", - "gpu": "5", - "gpu_uuid": "GPU-7845680c-0e07-1670-c2bb-9f018cd7864b", - "gpu_device": "nvidia5", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:C3:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." - }, - "value": "576", - "gpu": "6", - "gpu_uuid": "GPU-f70b214f-9fe8-5a4e-0499-0ff9572959ff", - "gpu_device": "nvidia6", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:D1:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 253, - "field_name": "DCGM_FI_DEV_FB_RESERVED", - "prom_type": "gauge", - "help": "Framebuffer memory reserved (in MiB)." 
- }, - "value": "576", - "gpu": "7", - "gpu_uuid": "GPU-eb5c9999-ebc3-9a6e-58cc-494befb69b8a", - "gpu_device": "nvidia7", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:DF:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - } - ], - "DCGM_FI_DRIVER_VERSION": [ - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "0", - "gpu_uuid": "GPU-be839661-c0f5-7452-284b-b875666df60c", - "gpu_device": "nvidia0", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:1B:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "1", - "gpu_uuid": "GPU-21c6d9d7-46cd-7e91-99c3-7b6a06a3faea", - "gpu_device": "nvidia1", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:43:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "2", - "gpu_uuid": "GPU-5d9cc71f-b438-dc00-707d-c6c12bcfede1", - "gpu_device": "nvidia2", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:52:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "3", - "gpu_uuid": "GPU-81d888ca-dd11-328c-45fa-d6807a1afa6a", - "gpu_device": "nvidia3", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:61:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - 
{ - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "4", - "gpu_uuid": "GPU-c4c7f4f8-af86-6966-c0b2-7c1e40c18347", - "gpu_device": "nvidia4", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:9D:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "5", - "gpu_uuid": "GPU-7845680c-0e07-1670-c2bb-9f018cd7864b", - "gpu_device": "nvidia5", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:C3:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "6", - "gpu_uuid": "GPU-f70b214f-9fe8-5a4e-0499-0ff9572959ff", - "gpu_device": "nvidia6", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:D1:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - }, - { - "counter": { - "field_id": 254, - "field_name": "DCGM_FI_DRIVER_VERSION", - "prom_type": "label", - "help": "Driver Version" - }, - "value": "575.51.03", - "gpu": "7", - "gpu_uuid": "GPU-eb5c9999-ebc3-9a6e-58cc-494befb69b8a", - "gpu_device": "nvidia7", - "gpu_model": "NVIDIA H100 80GB HBM3", - "pci_bus_id": "00000000:DF:00.0", - "uuid": "UUID", - "hostname": "localhost", - "labels": {}, - "attributes": {} - } ] } }` diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index 09e654a7..8e924894 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -51,6 +51,11 @@ import ( 
"github.com/NVIDIA/dcgm-exporter/internal/pkg/utils" ) +//nolint:gosec // G115: test helper for non-negative int-to-uint conversion +func toUint(v int) uint { + return uint(v) +} + func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { testutils.RequireLinux(t) logrus.SetLevel(logrus.DebugLevel) @@ -396,8 +401,8 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { } mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) - mockSystemInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() - mockSystemInfo.EXPECT().GPU(uint(0)).Return(mockGPU).AnyTimes() + mockSystemInfo.EXPECT().GPUCount().Return(toUint(1)).AnyTimes() + mockSystemInfo.EXPECT().GPU(toUint(0)).Return(mockGPU).AnyTimes() err := podMapper.Process(metrics, mockSystemInfo) require.NoError(t, err) @@ -583,9 +588,9 @@ func TestProcessPodMapper_WithLabels(t *testing.T) { } mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) - mockDeviceInfo.EXPECT().GPUCount().Return(uint(len(gpus))).AnyTimes() + mockDeviceInfo.EXPECT().GPUCount().Return(toUint(len(gpus))).AnyTimes() for i := range gpus { - mockDeviceInfo.EXPECT().GPU(uint(i)).Return(mockGPU).AnyTimes() + mockDeviceInfo.EXPECT().GPU(toUint(i)).Return(mockGPU).AnyTimes() } // Process metrics @@ -809,9 +814,9 @@ func TestProcessPodMapper_WithUID(t *testing.T) { } mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) - mockDeviceInfo.EXPECT().GPUCount().Return(uint(len(gpus))).AnyTimes() + mockDeviceInfo.EXPECT().GPUCount().Return(toUint(len(gpus))).AnyTimes() for i := range gpus { - mockDeviceInfo.EXPECT().GPU(uint(i)).Return(mockGPU).AnyTimes() + mockDeviceInfo.EXPECT().GPU(toUint(i)).Return(mockGPU).AnyTimes() } // Process metrics @@ -920,9 +925,9 @@ func TestProcessPodMapper_WithLabelsAndUID(t *testing.T) { } mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) - mockDeviceInfo.EXPECT().GPUCount().Return(uint(len(gpus))).AnyTimes() + mockDeviceInfo.EXPECT().GPUCount().Return(toUint(len(gpus))).AnyTimes() for 
i := range gpus { - mockDeviceInfo.EXPECT().GPU(uint(i)).Return(mockGPU).AnyTimes() + mockDeviceInfo.EXPECT().GPU(toUint(i)).Return(mockGPU).AnyTimes() } // Process metrics @@ -1523,12 +1528,12 @@ func TestKubernetesVirtualGPUs_UnusedGPUsPreserveMetrics(t *testing.T) { } mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) - mockSystemInfo.EXPECT().GPUCount().Return(uint(len(allGPUUUIDs))).AnyTimes() + mockSystemInfo.EXPECT().GPUCount().Return(toUint(len(allGPUUUIDs))).AnyTimes() for i, uuid := range allGPUUUIDs { - mockSystemInfo.EXPECT().GPU(uint(i)).Return(deviceinfo.GPUInfo{ + mockSystemInfo.EXPECT().GPU(toUint(i)).Return(deviceinfo.GPUInfo{ DeviceInfo: dcgm.Device{ UUID: uuid, - GPU: uint(i), + GPU: toUint(i), }, }).AnyTimes() } @@ -1656,8 +1661,8 @@ func TestKubernetesVirtualGPUs_UnusedMIGInstancesPreserveMetrics(t *testing.T) { } mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) - mockSystemInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() - mockSystemInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + mockSystemInfo.EXPECT().GPUCount().Return(toUint(1)).AnyTimes() + mockSystemInfo.EXPECT().GPU(toUint(0)).Return(deviceinfo.GPUInfo{ DeviceInfo: dcgm.Device{UUID: gpuUUID, GPU: 0}, MigEnabled: true, GPUInstances: gpuInstances, diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 680a156a..2c184dbe 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -102,6 +102,7 @@ const ( CLIDisableStartupValidate = "disable-startup-validate" CLIEnableGPUBindUnbindWatch = "enable-gpu-bind-unbind-watch" CLIGPUBindUnbindPollInterval = "gpu-bind-unbind-poll-interval" + CLIEnablePprof = "enable-pprof" ) func NewApp(buildVersion ...string) *cli.App { @@ -130,7 +131,7 @@ func NewApp(buildVersion ...string) *cli.App { Name: CLIAddress, Aliases: []string{"a"}, Value: ":9400", - Usage: "Address", + Usage: "Listen address as :. 
For IPv6, use \"[]:\" (e.g., \"[::]:9400\")", EnvVars: []string{"DCGM_EXPORTER_LISTEN"}, }, &cli.IntFlag{ @@ -172,7 +173,7 @@ func NewApp(buildVersion ...string) *cli.App { Name: CLIRemoteHEInfo, Aliases: []string{"r"}, Value: "localhost:5555", - Usage: "Connect to remote hostengine at :", + Usage: "Connect to remote hostengine at :. For IPv6, use \"[]:\" (e.g., \"[::1]:5555\")", EnvVars: []string{"DCGM_REMOTE_HOSTENGINE_INFO"}, }, &cli.BoolFlag{ @@ -349,6 +350,12 @@ func NewApp(buildVersion ...string) *cli.App { EnvVars: []string{"DCGM_EXPORTER_GPU_BIND_UNBIND_POLL_INTERVAL"}, Value: "1s", }, + &cli.BoolFlag{ + Name: CLIEnablePprof, + Value: false, + Usage: "Enable /debug/pprof/ HTTP endpoints for profiling and debugging", + EnvVars: []string{"DCGM_EXPORTER_ENABLE_PPROF"}, + }, } if runtime.GOOS == "linux" { @@ -365,9 +372,7 @@ func NewApp(buildVersion ...string) *cli.App { return nil } - c.Action = func(c *cli.Context) error { - return action(c) - } + c.Action = action return c } @@ -1001,12 +1006,13 @@ func parseDeviceOptions(devices string) (appconfig.DeviceOptions, error) { } letter := letterAndRange[0] - if letter == FlexKey { + switch letter { + case FlexKey: dOpt.Flex = true if count > 1 { return dOpt, fmt.Errorf("no range can be specified with the flex option 'f'") } - } else if letter == MajorKey || letter == MinorKey { + case MajorKey, MinorKey: var indices []int if count == 1 { // No range means all present devices of the type @@ -1016,15 +1022,14 @@ func parseDeviceOptions(devices string) (appconfig.DeviceOptions, error) { for _, numberOrRange := range numbers { rangeTokens := strings.Split(numberOrRange, "-") rangeTokenCount := len(rangeTokens) - if rangeTokenCount > 2 { - return dOpt, fmt.Errorf("range can only be '-', but found '%s'", numberOrRange) - } else if rangeTokenCount == 1 { + switch rangeTokenCount { + case 1: number, err := strconv.Atoi(rangeTokens[0]) if err != nil { return dOpt, err } indices = append(indices, number) - } else { + case 
2: start, err := strconv.Atoi(rangeTokens[0]) if err != nil { return dOpt, err @@ -1038,6 +1043,8 @@ func parseDeviceOptions(devices string) (appconfig.DeviceOptions, error) { for i := start; i <= end; i++ { indices = append(indices, i) } + default: + return dOpt, fmt.Errorf("range can only be '-', but found '%s'", numberOrRange) } } } @@ -1047,7 +1054,7 @@ func parseDeviceOptions(devices string) (appconfig.DeviceOptions, error) { } else { dOpt.MinorRange = indices } - } else { + default: return dOpt, fmt.Errorf("the only valid options preceding ':' are 'g' or 'i', but found '%s'", letter) } @@ -1116,6 +1123,7 @@ func contextToConfig(c *cli.Context) (*appconfig.Config, error) { DisableStartupValidate: c.Bool(CLIDisableStartupValidate), EnableGPUBindUnbindWatch: c.Bool(CLIEnableGPUBindUnbindWatch), GPUBindUnbindPollInterval: parseDuration(c.String(CLIGPUBindUnbindPollInterval), 1*time.Second), + EnablePprof: c.Bool(CLIEnablePprof), }, nil } diff --git a/service-monitor.yaml b/service-monitor.yaml index 8a528f32..090acb06 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "4.8.1" + app.kubernetes.io/version: "4.8.2" endpoints: - port: "metrics" path: "/metrics" diff --git a/tests/docker/Makefile b/tests/docker/Makefile index 51ef7c14..f90bffab 100644 --- a/tests/docker/Makefile +++ b/tests/docker/Makefile @@ -18,7 +18,7 @@ GO_CMD ?= go # Note: FULL_VERSION should match root Makefile's DCGM_VERSION-VERSION format # This gets updated automatically by 'make update-version' from root REGISTRY ?= nvidia -FULL_VERSION ?= 4.5.2-4.8.1 +FULL_VERSION ?= 4.5.3-4.8.2 # Override specific images (optional) # If not set, defaults to: $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)- diff --git 
a/tests/docker/README.md b/tests/docker/README.md index fcfd1bd0..835f9cf9 100644 --- a/tests/docker/README.md +++ b/tests/docker/README.md @@ -25,9 +25,9 @@ make test-images ### Default Behavior By default, tests run against locally built images with the current version: -- `nvidia/dcgm-exporter:4.5.2-4.8.1-distroless` -- `nvidia/dcgm-exporter:4.5.2-4.8.1-ubuntu22.04` -- `nvidia/dcgm-exporter:4.5.2-4.8.1-ubi9` +- `nvidia/dcgm-exporter:4.5.3-4.8.2-distroless` +- `nvidia/dcgm-exporter:4.5.3-4.8.2-ubuntu22.04` +- `nvidia/dcgm-exporter:4.5.3-4.8.2-ubi9` (Version is automatically updated by `make update-version` from the root Makefile) @@ -60,7 +60,7 @@ make docker-test-distroless REGISTRY=my-registry.io FULL_VERSION=3.0.0-3.1.0 make docker-test # Test specific version -FULL_VERSION=4.5.2-5.0.0 make docker-test-ubuntu +FULL_VERSION=4.5.3-5.0.0 make docker-test-ubuntu ``` #### Override Specific Images @@ -69,11 +69,11 @@ Set environment variables to test specific images: ```bash # Test published image -IMAGE_UBUNTU=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-ubuntu22.04 \ +IMAGE_UBUNTU=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-ubuntu22.04 \ make docker-test-ubuntu # Mix local and published images -IMAGE_UBUNTU=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-ubuntu22.04 \ +IMAGE_UBUNTU=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-ubuntu22.04 \ IMAGE_UBI="" \ IMAGE_DISTROLESS=my-registry.io/dcgm-exporter:custom-distroless \ make docker-test @@ -90,7 +90,7 @@ make docker-test | Variable | Default | Description | |----------|---------|-------------| | `REGISTRY` | `nvidia` | Container registry for default images | -| `FULL_VERSION` | `4.5.2-4.8.1` | Combined DCGM and exporter version (updated by root Makefile) | +| `FULL_VERSION` | `4.5.3-4.8.2` | Combined DCGM and exporter version (updated by root Makefile) | | `IMAGE_UBUNTU` | `${REGISTRY}/dcgm-exporter:${FULL_VERSION}-ubuntu22.04` | Full path to Ubuntu image | | `IMAGE_UBI` | 
`${REGISTRY}/dcgm-exporter:${FULL_VERSION}-ubi9` | Full path to UBI image | | `IMAGE_DISTROLESS` | `${REGISTRY}/dcgm-exporter:${FULL_VERSION}-distroless` | Full path to distroless image | @@ -101,22 +101,22 @@ make docker-test ```bash # Test only one variant from published registry -IMAGE_UBUNTU=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-ubuntu22.04 \ +IMAGE_UBUNTU=nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-ubuntu22.04 \ IMAGE_UBI="" \ IMAGE_DISTROLESS="" \ make docker-test # Test release candidate -FULL_VERSION=4.5.2-5.0.0-rc1 make docker-test +FULL_VERSION=4.5.3-5.0.0-rc1 make docker-test # Test PR build REGISTRY=ci.mycompany.com \ -FULL_VERSION=4.5.2-pr-1234 \ +FULL_VERSION=4.5.3-pr-1234 \ make docker-test-ubuntu # Compare two versions -IMAGE_UBUNTU=nvidia/dcgm-exporter:4.5.2-4.8.1-ubuntu22.04 \ -IMAGE_DISTROLESS=nvidia/dcgm-exporter:4.5.2-5.0.0-distroless \ +IMAGE_UBUNTU=nvidia/dcgm-exporter:4.5.3-4.8.2-ubuntu22.04 \ +IMAGE_DISTROLESS=nvidia/dcgm-exporter:4.5.3-5.0.0-distroless \ IMAGE_UBI="" \ make docker-test ``` @@ -165,11 +165,11 @@ When updating DCGM or exporter versions, the tests are automatically updated: # From the project root make update-version \ OLD_DCGM_VERSION=4.3.0 \ - NEW_DCGM_VERSION=4.5.2 \ - OLD_EXPORTER_VERSION=4.5.2 \ - NEW_EXPORTER_VERSION=4.8.1 + NEW_DCGM_VERSION=4.5.3 \ + OLD_EXPORTER_VERSION=4.5.3 \ + NEW_EXPORTER_VERSION=4.8.2 -# This will update FULL_VERSION in tests/docker/Makefile from 4.3.0-4.5.2 to 4.5.2-4.8.1 +# This will update FULL_VERSION in tests/docker/Makefile from 4.3.0-4.5.3 to 4.5.3-4.8.2 ``` After version update: @@ -194,10 +194,10 @@ Containers are started with the following flags for GPU access: To verify your GPU setup: ```bash # Check NVIDIA Container Toolkit -docker run --rm --gpus all nvidia/cuda:13.1.1-base nvidia-smi +docker run --rm --gpus all nvidia/cuda:13.2.1-base nvidia-smi # Verify DCGM access -docker run --rm --gpus all --cap-add SYS_ADMIN nvcr.io/nvidia/k8s/dcgm-exporter:4.5.2-4.8.1-distroless 
+docker run --rm --gpus all --cap-add SYS_ADMIN nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless ``` ## Limitations diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 2a999d08..bb6fea85 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -16,7 +16,7 @@ GO_CMD ?= go NAMESPACE ?= "dcgm-exporter" CHART ?= "./../../deployment/" IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter" -IMAGE_TAG ?= "4.5.2-4.8.1-distroless" +IMAGE_TAG ?= "4.5.3-4.8.2-distroless" KUBECONFIG ?= "~/.kube/config" RUNTIME_CLASS ?= "" NO_CLEANUP ?= "false" diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 1bfd25e6..e3484e98 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -31,6 +31,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/utils/ptr" @@ -175,7 +176,7 @@ var _ = Describe("dcgm-exporter-e2e-suite", func() { metricsResponse = shouldReadMetrics(ctx, kubeClient, dcgmExpPod, dcgmExporterPort) g.Expect(metricsResponse).ShouldNot(BeEmpty()) - var parser expfmt.TextParser + parser := expfmt.NewTextParser(model.UTF8Validation) metricFamilies, err := parser.TextToMetricFamilies(bytes.NewReader(metricsResponse)) g.Expect(err).ShouldNot(HaveOccurred()) g.Expect(len(metricFamilies)).Should(BeNumerically(">", 0)) @@ -400,7 +401,7 @@ var _ = Describe("dcgm-exporter-e2e-suite", func() { g.Expect(metricsResponse).ShouldNot(BeEmpty(), "Metrics response should not be empty") // Parse metrics - var parser expfmt.TextParser + parser := expfmt.NewTextParser(model.UTF8Validation) metricFamilies, err := parser.TextToMetricFamilies(bytes.NewReader(metricsResponse)) if err != nil { fmt.Fprintf(GinkgoWriter, "Metrics parsing failed:\n%s\n", string(metricsResponse)) diff --git a/tests/e2e/e2e_verify_default_configuration_test.go 
b/tests/e2e/e2e_verify_default_configuration_test.go index ff2fb932..1a3d4cc7 100644 --- a/tests/e2e/e2e_verify_default_configuration_test.go +++ b/tests/e2e/e2e_verify_default_configuration_test.go @@ -27,6 +27,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" "k8s.io/utils/ptr" @@ -122,7 +123,7 @@ var VerifyDefaultHelmConfiguration = func( It("should verify metrics", func(ctx context.Context) { Expect(metricsResponse).ShouldNot(BeEmpty()) - var parser expfmt.TextParser + parser := expfmt.NewTextParser(model.UTF8Validation) metricFamilies, err := parser.TextToMetricFamilies(bytes.NewReader(metricsResponse)) Expect(err).ShouldNot(HaveOccurred()) Expect(len(metricFamilies)).Should(BeNumerically(">", 0)) diff --git a/tests/e2e/e2e_verify_http_basic_auth_test.go b/tests/e2e/e2e_verify_http_basic_auth_test.go index 449c9bda..d918af91 100644 --- a/tests/e2e/e2e_verify_http_basic_auth_test.go +++ b/tests/e2e/e2e_verify_http_basic_auth_test.go @@ -26,10 +26,11 @@ import ( "net/http" "time" - "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + + "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" ) // VerifyHelmConfigurationWhenHttpBasicAuthEnabled tests helm chart when Http basic authentication is enabled diff --git a/tests/integration/reload_test.go b/tests/integration/reload_test.go index f70826ac..7c263133 100644 --- a/tests/integration/reload_test.go +++ b/tests/integration/reload_test.go @@ -26,6 +26,7 @@ import ( "time" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/urfave/cli/v2" @@ -123,7 +124,7 @@ func TestMultipleSIGHUPReloads(t *testing.T) { // Now we can programmatically trigger reloads! 
const numReloads = 5 - var parser expfmt.TextParser + parser := expfmt.NewTextParser(model.UTF8Validation) for i := 0; i < numReloads; i++ { t.Logf("Reload iteration %d/%d", i+1, numReloads) diff --git a/tests/integration/start_read_test.go b/tests/integration/start_read_test.go index bb9099c2..b5f6a8b3 100644 --- a/tests/integration/start_read_test.go +++ b/tests/integration/start_read_test.go @@ -26,6 +26,7 @@ import ( "github.com/avast/retry-go/v4" "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "github.com/NVIDIA/dcgm-exporter/pkg/cmd" @@ -82,7 +83,7 @@ func TestStartAndReadMetrics(t *testing.T) { ) require.NotEmpty(t, metricsResp) - var parser expfmt.TextParser + parser := expfmt.NewTextParser(model.UTF8Validation) mf, err := parser.TextToMetricFamilies(strings.NewReader(metricsResp)) require.NoError(t, err) require.Greater(t, len(mf), 0, "expected number of metrics more than 0")