diff --git a/deployment/templates/metrics-configmap.yaml b/deployment/templates/metrics-configmap.yaml index 4b56c857..efa0af97 100644 --- a/deployment/templates/metrics-configmap.yaml +++ b/deployment/templates/metrics-configmap.yaml @@ -67,8 +67,8 @@ data: # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. - # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/deployment/values.yaml b/deployment/values.yaml index d7af1df8..ef801f9c 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -322,8 +322,8 @@ kubernetesDRA: # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. - # DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. - # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status # DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/etc/dcp-metrics-included.csv b/etc/dcp-metrics-included.csv index d8e53f84..024d4dd5 100644 --- a/etc/dcp-metrics-included.csv +++ b/etc/dcp-metrics-included.csv @@ -55,8 +55,8 @@ DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. -# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/etc/default-counters.csv b/etc/default-counters.csv index 2eed7a65..327df706 100644 --- a/etc/default-counters.csv +++ b/etc/default-counters.csv @@ -62,8 +62,8 @@ DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. -# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined). +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, gauge, NVLink throughput for lane 0 (MiB/s, TX+RX combined). # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status diff --git a/tests/integration/testdata/default-counters.csv b/tests/integration/testdata/default-counters.csv index 11c5fbbd..f250aae5 100644 --- a/tests/integration/testdata/default-counters.csv +++ b/tests/integration/testdata/default-counters.csv @@ -55,7 +55,7 @@ DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, gauge, NVLink total throughput for all lanes (MiB/s, TX+RX combined) # VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status