From 58a2532ab33a6df98df7a4bb27401b9749a002f3 Mon Sep 17 00:00:00 2001 From: Allen Zhou Date: Mon, 11 May 2026 17:26:13 +0800 Subject: [PATCH] fix: correct Prometheus type for NVLink bandwidth fields from counter to gauge DCGM field 449 (DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL) is declared as `counter` but DCGM internally computes an instantaneous throughput rate (MiB/s) via ReadAndCacheNvLinkBandwidth() in DcgmCacheManager.cpp: double valueDbl = (currentKiB - prevKiB) / timeDiffSec / 1000.0; The NVML source fields (NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX/TX, field IDs 138/139) are documented as cumulative KiB in nvml.h. DCGM converts them to an instantaneous MiB/s rate, so the exported value fluctuates with GPU load and can decrease between samples. This makes the metric a gauge by definition (Prometheus docs: "A gauge is a metric that represents a single numerical value that can arbitrarily go up and down"). Marking it as counter causes dashboards that apply rate() or increase() to produce near-zero or meaningless results. Also corrects the help text to accurately describe the value as throughput rate rather than "number of counters". Fixes #417 Signed-off-by: Allen Zhou --- deployment/templates/metrics-configmap.yaml | 4 ++-- deployment/values.yaml | 4 ++-- etc/dcp-metrics-included.csv | 4 ++-- etc/default-counters.csv | 4 ++-- tests/integration/testdata/default-counters.csv | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/deployment/templates/metrics-configmap.yaml b/deployment/templates/metrics-configmap.yaml index 4b56c857..efa0af97 100644 --- a/deployment/templates/metrics-configmap.yaml +++ b/deployment/templates/metrics-configmap.yaml @@ -67,8 +67,8 @@ data: # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. - # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/deployment/values.yaml b/deployment/values.yaml index d7af1df8..ef801f9c 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -322,8 +322,8 @@ kubernetesDRA: # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. - # DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. - # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status # DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/etc/dcp-metrics-included.csv b/etc/dcp-metrics-included.csv index d8e53f84..024d4dd5 100644 --- a/etc/dcp-metrics-included.csv +++ b/etc/dcp-metrics-included.csv @@ -55,8 +55,8 @@ DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. -# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/etc/default-counters.csv b/etc/default-counters.csv index 2eed7a65..327df706 100644 --- a/etc/default-counters.csv +++ b/etc/default-counters.csv @@ -62,8 +62,8 @@ DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. -# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/tests/integration/testdata/default-counters.csv b/tests/integration/testdata/default-counters.csv index 11c5fbbd..f250aae5 100644 --- a/tests/integration/testdata/default-counters.csv +++ b/tests/integration/testdata/default-counters.csv @@ -55,7 +55,7 @@ DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined) # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status