From fda5321f0a6fba7ddbdc6b8d75c57ae123ca3068 Mon Sep 17 00:00:00 2001 From: Krystian Bednarczuk Date: Thu, 4 Dec 2025 18:47:36 +0100 Subject: [PATCH 1/6] Add MIG time-sharing support with per-process metrics Signed-off-by: Krystian Bednarczuk --- go.mod | 2 + go.sum | 5 + .../mocks/pkg/nvmlprovider/mock_client.go | 45 ++ internal/pkg/deviceinfo/device_info.go | 2 +- internal/pkg/nvmlprovider/provider.go | 103 +++++ internal/pkg/nvmlprovider/types.go | 11 + internal/pkg/transformation/const.go | 5 +- internal/pkg/transformation/kubernetes.go | 119 ++++- .../pkg/transformation/kubernetes_test.go | 350 ++++++++++++++ internal/pkg/transformation/pidmapper.go | 103 +++++ internal/pkg/transformation/pidmapper_stub.go | 35 ++ internal/pkg/transformation/pidmapper_test.go | 133 ++++++ .../pkg/transformation/process_metrics.go | 192 ++++++++ .../transformation/process_metrics_test.go | 434 ++++++++++++++++++ 14 files changed, 1531 insertions(+), 8 deletions(-) create mode 100644 internal/pkg/transformation/pidmapper.go create mode 100644 internal/pkg/transformation/pidmapper_stub.go create mode 100644 internal/pkg/transformation/pidmapper_test.go create mode 100644 internal/pkg/transformation/process_metrics.go create mode 100644 internal/pkg/transformation/process_metrics_test.go diff --git a/go.mod b/go.mod index 4e1505e8..cbab2f11 100644 --- a/go.mod +++ b/go.mod @@ -49,6 +49,7 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.3 // indirect + github.com/containerd/cgroups/v3 v3.1.1 // indirect github.com/containerd/containerd v1.7.27 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -103,6 +104,7 @@ require ( github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/moby/spdystream v0.5.0 // indirect + github.com/moby/sys/userns v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect diff --git a/go.sum b/go.sum index f401ee97..cd741e4b 100644 --- a/go.sum +++ b/go.sum @@ -44,6 +44,9 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.3 h1:9liNh8t+u26xl5ddmWLmsOsdNLwkdRTg5AG+JnTiM80= github.com/chai2010/gettext-go v1.0.3/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= +github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= +github.com/containerd/cgroups/v3 v3.1.1 h1:ASZmQGfOHbRj43/1aMn5QcWIsv0R/AuHHDNCguRY0p0= +github.com/containerd/cgroups/v3 v3.1.1/go.mod h1:PKZ2AcWmSBsY/tJUVhtS/rluX0b1uq1GmPO1ElCmbOw= github.com/containerd/containerd v1.7.27 h1:yFyEyojddO3MIGVER2xJLWoCIn+Up4GaHFquP7hsFII= github.com/containerd/containerd v1.7.27/go.mod h1:xZmPnl75Vc+BLGt4MIfu6bp+fy03gdHAn9bz+FreFR0= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= @@ -211,6 +214,8 @@ github.com/mittwald/go-helm-client v0.12.16 h1:YTyJX6L0SI/O7HNTG0qDZI2/jyGELxJOQ github.com/mittwald/go-helm-client v0.12.16/go.mod h1:PDF7Ra8bmJ2YTNzoehoMMi+gW/EJBk/4TLz7j52rehY= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= diff --git a/internal/mocks/pkg/nvmlprovider/mock_client.go b/internal/mocks/pkg/nvmlprovider/mock_client.go index c13b94c3..3bece5be 100644 --- a/internal/mocks/pkg/nvmlprovider/mock_client.go +++ b/internal/mocks/pkg/nvmlprovider/mock_client.go @@ -66,6 +66,51 @@ func (mr *MockNVMLMockRecorder) Cleanup() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockNVML)(nil).Cleanup)) } +// GetDeviceProcessMemory mocks base method. +func (m *MockNVML) GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetDeviceProcessMemory", gpuUUID) + ret0, _ := ret[0].(map[uint32]uint64) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetDeviceProcessMemory indicates an expected call of GetDeviceProcessMemory. +func (mr *MockNVMLMockRecorder) GetDeviceProcessMemory(gpuUUID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceProcessMemory", reflect.TypeOf((*MockNVML)(nil).GetDeviceProcessMemory), gpuUUID) +} + +// GetAllMIGDevicesProcessMemory mocks base method. +func (m *MockNVML) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetAllMIGDevicesProcessMemory", parentGPUUUID) + ret0, _ := ret[0].(map[uint]map[uint32]uint64) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetAllMIGDevicesProcessMemory indicates an expected call of GetAllMIGDevicesProcessMemory. +func (mr *MockNVMLMockRecorder) GetAllMIGDevicesProcessMemory(parentGPUUUID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllMIGDevicesProcessMemory", reflect.TypeOf((*MockNVML)(nil).GetAllMIGDevicesProcessMemory), parentGPUUUID) +} + +// GetDeviceProcessUtilization mocks base method. +func (m *MockNVML) GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetDeviceProcessUtilization", gpuUUID) + ret0, _ := ret[0].(map[uint32]uint32) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetDeviceProcessUtilization indicates an expected call of GetDeviceProcessUtilization. +func (mr *MockNVMLMockRecorder) GetDeviceProcessUtilization(gpuUUID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceProcessUtilization", reflect.TypeOf((*MockNVML)(nil).GetDeviceProcessUtilization), gpuUUID) +} + // GetMIGDeviceInfoByID mocks base method. func (m *MockNVML) GetMIGDeviceInfoByID(arg0 string) (*nvmlprovider.MIGDeviceInfo, error) { m.ctrl.T.Helper() diff --git a/internal/pkg/deviceinfo/device_info.go b/internal/pkg/deviceinfo/device_info.go index fb4e031d..af76729e 100644 --- a/internal/pkg/deviceinfo/device_info.go +++ b/internal/pkg/deviceinfo/device_info.go @@ -153,7 +153,7 @@ func (s *Info) initializeGPUInfo(gOpt appconfig.DeviceOptions, useFakeGPUs bool) links, err := dcgmprovider.Client().GetNvLinkLinkStatus() if err == nil { for i := 0; i < len(s.gpus); i++ { - // monitor only the nvlinks as per the device options input + // monitor only the nvlinks as per the device options input if gOpt.Flex || s.shouldMonitor(gOpt.MajorRange, s.gpus[i].DeviceInfo.GPU) { var matchingLinks []dcgm.NvLinkStatus var linkCount uint = 1 diff --git a/internal/pkg/nvmlprovider/provider.go b/internal/pkg/nvmlprovider/provider.go index dc4328e2..24d3602c 100644 --- a/internal/pkg/nvmlprovider/provider.go +++ b/internal/pkg/nvmlprovider/provider.go @@ -170,6 +170,109 @@ func getMIGDeviceInfoForOldDriver(uuid string) (*MIGDeviceInfo, error) { }, nil } +// GetDeviceProcessMemory returns memory usage for compute processes running on the GPU +func (n nvmlProvider) GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error) { + if err := n.preCheck(); err != nil { + return nil, fmt.Errorf("failed to get device process memory: %w", err) + } + + device, ret := nvml.DeviceGetHandleByUUID(gpuUUID) + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device handle for UUID %s: %s", gpuUUID, nvml.ErrorString(ret)) + } + + processes, ret := device.GetComputeRunningProcesses() + if ret != nvml.SUCCESS && ret != nvml.ERROR_NOT_SUPPORTED { + return nil, fmt.Errorf("failed to get compute running processes: %s", nvml.ErrorString(ret)) + } + + result := make(map[uint32]uint64, len(processes)) + for _, p := range processes { + result[p.Pid] = p.UsedGpuMemory + } + + return result, nil +} + +// GetDeviceProcessUtilization returns SM utilization for processes running on the GPU +func (n nvmlProvider) GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error) { + if err := n.preCheck(); err != nil { + return nil, fmt.Errorf("failed to get device process utilization: %w", err) + } + + device, ret := nvml.DeviceGetHandleByUUID(gpuUUID) + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device handle for UUID %s: %s", gpuUUID, nvml.ErrorString(ret)) + } + + samples, ret := device.GetProcessUtilization(0) + if ret != nvml.SUCCESS { + if ret == nvml.ERROR_NOT_SUPPORTED { + return nil, nil + } + return nil, fmt.Errorf("failed to get process utilization: %s", nvml.ErrorString(ret)) + } + + result := make(map[uint32]uint32, len(samples)) + for _, s := range samples { + result[s.Pid] = s.SmUtil + } + + return result, nil +} + +// GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU. +// Returns a map from GPU Instance ID to (PID -> memory used in bytes). +// Note: Only memory info is available for MIG devices, not SM utilization. +func (n nvmlProvider) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) { + if err := n.preCheck(); err != nil { + return nil, fmt.Errorf("failed to get MIG device process memory: %w", err) + } + + parentDevice, ret := nvml.DeviceGetHandleByUUID(parentGPUUUID) + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get parent device handle for UUID %s: %s", parentGPUUUID, nvml.ErrorString(ret)) + } + + migCount, ret := parentDevice.GetMaxMigDeviceCount() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get MIG device count for UUID %s: %s", parentGPUUUID, nvml.ErrorString(ret)) + } + + result := make(map[uint]map[uint32]uint64) + + for i := 0; i < migCount; i++ { + migDevice, ret := parentDevice.GetMigDeviceHandleByIndex(i) + if ret == nvml.ERROR_NOT_FOUND || ret == nvml.ERROR_INVALID_ARGUMENT { + continue + } + if ret != nvml.SUCCESS { + slog.Debug("Failed to get MIG device handle", "index", i, "error", nvml.ErrorString(ret)) + continue + } + + giID, ret := migDevice.GetGpuInstanceId() + if ret != nvml.SUCCESS { + slog.Debug("Failed to get GPU instance ID for MIG device", "index", i, "error", nvml.ErrorString(ret)) + continue + } + + processes, ret := migDevice.GetComputeRunningProcesses() + if ret != nvml.SUCCESS && ret != nvml.ERROR_NOT_SUPPORTED { + slog.Debug("Failed to get running processes for MIG device", "gpuInstanceID", giID, "error", nvml.ErrorString(ret)) + continue + } + + pidToMemory := make(map[uint32]uint64, len(processes)) + for _, p := range processes { + pidToMemory[p.Pid] = p.UsedGpuMemory + } + result[uint(giID)] = pidToMemory + } + + return result, nil +} + // Cleanup performs cleanup operations for the NVML provider func (n nvmlProvider) Cleanup() { if !n.initialized { diff --git a/internal/pkg/nvmlprovider/types.go b/internal/pkg/nvmlprovider/types.go index 507b7afd..6bc4f33f 100644 --- a/internal/pkg/nvmlprovider/types.go +++ b/internal/pkg/nvmlprovider/types.go @@ -18,7 +18,18 @@ package nvmlprovider +// NVML interface provides access to NVIDIA Management Library functionality type NVML interface { GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error) + // GetDeviceProcessMemory returns memory usage for processes running on the GPU. + // Returns a map from PID to memory used in bytes. + GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error) + // GetDeviceProcessUtilization returns SM utilization for processes running on the GPU. + // Returns a map from PID to SM utilization percentage. + GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error) + // GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU. + // Returns a map from GPU Instance ID to (PID -> memory used in bytes). + // Note: Only memory info is available for MIG devices, not SM utilization. + GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) Cleanup() } diff --git a/internal/pkg/transformation/const.go b/internal/pkg/transformation/const.go index a0eea389..951b1a0e 100644 --- a/internal/pkg/transformation/const.go +++ b/internal/pkg/transformation/const.go @@ -17,7 +17,6 @@ package transformation const ( - // Note standard resource attributes podAttribute = "pod" namespaceAttribute = "namespace" containerAttribute = "container" @@ -39,4 +38,8 @@ const ( draMigDeviceUUID = "dra_mig_device_uuid" DRAGPUDriverName = "gpu.nvidia.com" + + metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL" + metricFBUsed = "DCGM_FI_DEV_FB_USED" + metricGREngineActive = "DCGM_FI_PROF_GR_ENGINE_ACTIVE" ) diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 74f92870..543c7e67 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -41,6 +41,7 @@ import ( "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" "github.com/NVIDIA/dcgm-exporter/internal/pkg/utils" @@ -177,6 +178,82 @@ func (p *PodMapper) Name() string { return "podMapper" } +func (p *PodMapper) createPerProcessMetrics( + val collector.Metric, + counter counters.Counter, + originalMetric collector.Metric, + dataMap *perProcessDataMap, +) ([]collector.Metric, error) { + metricsKey := getMIGMetricsKey(val.GPUUUID, val.GPUInstanceID) + if dataMap.metrics[metricsKey] == nil { + metricsKey = val.GPUUUID + } + + devicePods := dataMap.deviceToPods[metricsKey] + if len(devicePods) == 0 { + return nil, nil + } + + data := dataMap.metrics[metricsKey] + podValues := buildPodValueMap(dataMap.pidToPod, data, counter.FieldName) + maps.Copy(podValues, buildIdlePodValues(podValues, devicePods)) + + var result []collector.Metric + for _, podInfo := range devicePods { + value, ok := podValues[podInfo.UID] + if !ok { + continue + } + + metric, err := utils.DeepCopy(originalMetric) + if err != nil { + return nil, err + } + metric.Value = value + + if !p.Config.UseOldNamespace { + metric.Attributes[podAttribute] = podInfo.Name + metric.Attributes[namespaceAttribute] = podInfo.Namespace + metric.Attributes[containerAttribute] = podInfo.Container + } else { + metric.Attributes[oldPodAttribute] = podInfo.Name + metric.Attributes[oldNamespaceAttribute] = podInfo.Namespace + metric.Attributes[oldContainerAttribute] = podInfo.Container + } + metric.Attributes[uidAttribute] = podInfo.UID + if podInfo.VGPU != "" { + metric.Attributes[vgpuAttribute] = podInfo.VGPU + } + + result = append(result, metric) + } + + return result, nil +} + +func buildPodValueMap(pidToPod map[uint32]*PodInfo, data *perProcessMetrics, fieldName string) map[string]string { + podValues := make(map[string]string) + if data == nil { + return podValues + } + for pid, podInfo := range pidToPod { + if value, ok := data.getValueForMetric(fieldName, pid); ok { + podValues[podInfo.UID] = value + } + } + return podValues +} + +func buildIdlePodValues(existingValues map[string]string, devicePods []PodInfo) map[string]string { + idleValues := make(map[string]string) + for _, podInfo := range devicePods { + if _, ok := existingValues[podInfo.UID]; !ok { + idleValues[podInfo.UID] = "0" + } + } + return idleValues +} + func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo deviceinfo.Provider) error { socketPath := p.Config.PodResourcesKubeletSocket _, err := os.Stat(socketPath) @@ -245,8 +322,12 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic if p.Config.KubernetesVirtualGPUs { deviceToPods := p.toDeviceToSharingPods(pods, deviceInfo) - - slog.Debug(fmt.Sprintf("Device to sharing pods mapping: %+v", deviceToPods)) + gpuUUIDToDeviceID := getGPUUUIDToDeviceID(deviceInfo, p.Config.KubernetesGPUIdType) + processCollector := &perProcessCollector{ + client: nvmlprovider.Client(), + pidMapper: newPIDToPodMapper(), + } + perProcessData := processCollector.Collect(gpuUUIDToDeviceID, deviceToPods, deviceInfo) // For each counter metric, init a slice to collect metrics to associate with shared virtual GPUs. for counter := range metrics { @@ -260,6 +341,17 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic } podInfos := deviceToPods[deviceID] + if isPerProcessMetric(counter.FieldName) { + perProcessMetrics, err := p.createPerProcessMetrics(val, counter, metrics[counter][j], perProcessData) + if err != nil { + return err + } + if perProcessMetrics != nil { + newmetrics = append(newmetrics, metrics[counter][j]) // original device-level metric + newmetrics = append(newmetrics, perProcessMetrics...) + continue + } + } // For all containers using the GPU, extract and annotate a metric // with the container info and the shared GPU label, if it exists. // Notably, this will increase the number of unique metrics (i.e. labelsets) @@ -439,6 +531,19 @@ func getSharedGPU(deviceID string) (string, bool) { return "", false } +// stripVGPUSuffix removes the ::N suffix from device IDs. +// AWS EKS with NVIDIA device plugin reports MIG devices as: +// +// MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55::7 +// +// This function strips the ::N suffix to get the MIG UUID for NVML lookups. +func stripVGPUSuffix(deviceID string) string { + if base, _, found := strings.Cut(deviceID, "::"); found { + return base + } + return deviceID +} + func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourcesResponse) map[string][]PodInfo { deviceToPodsMap := make(map[string][]PodInfo) labelCache := make(map[string]PodMetadata) // Cache to avoid duplicate API calls @@ -541,7 +646,8 @@ func (p *PodMapper) toDeviceToSharingPods(devicePods *podresourcesapi.ListPodRes podInfo.VGPU = vgpu } if strings.HasPrefix(deviceID, appconfig.MIG_UUID_PREFIX) { - migDevice, err := nvmlprovider.Client().GetMIGDeviceInfoByID(deviceID) + migUUID := stripVGPUSuffix(deviceID) + migDevice, err := nvmlprovider.Client().GetMIGDeviceInfoByID(migUUID) if err == nil { // Check for potential integer overflow before conversion if migDevice.GPUInstanceID >= 0 { @@ -550,7 +656,7 @@ func (p *PodMapper) toDeviceToSharingPods(devicePods *podresourcesapi.ListPodRes deviceToPodsMap[giIdentifier] = append(deviceToPodsMap[giIdentifier], podInfo) } } - gpuUUID := deviceID[len(appconfig.MIG_UUID_PREFIX):] + gpuUUID := migUUID[len(appconfig.MIG_UUID_PREFIX):] deviceToPodsMap[gpuUUID] = append(deviceToPodsMap[gpuUUID], podInfo) } else if gkeMigDeviceIDMatches := gkeMigDeviceIDRegex.FindStringSubmatch(deviceID); gkeMigDeviceIDMatches != nil { var gpuIndex string @@ -671,7 +777,8 @@ func (p *PodMapper) toDeviceToPod( "resourceName", resourceName, "deviceIds", device.GetDeviceIds(), ) - migDevice, err := nvmlprovider.Client().GetMIGDeviceInfoByID(deviceID) + migUUID := stripVGPUSuffix(deviceID) + migDevice, err := nvmlprovider.Client().GetMIGDeviceInfoByID(migUUID) if err == nil { // Check for potential integer overflow before conversion if migDevice.GPUInstanceID >= 0 { @@ -699,7 +806,7 @@ func (p *PodMapper) toDeviceToPod( "deviceIds", device.GetDeviceIds(), ) } - gpuUUID := deviceID[len(appconfig.MIG_UUID_PREFIX):] + gpuUUID := migUUID[len(appconfig.MIG_UUID_PREFIX):] slog.Debug("Mapped MIG device to GPU UUID", "deviceID", deviceID, "gpuUUID", gpuUUID, diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index 3736bd32..f943b4b3 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -944,3 +944,353 @@ func TestProcessPodMapper_WithLabelsAndUID(t *testing.T) { } } } + +func TestBuildPodValueMap(t *testing.T) { + tests := []struct { + name string + pidToPod map[uint32]*PodInfo + data *perProcessMetrics + fieldName string + expected map[string]string + }{ + { + name: "nil data returns empty map", + pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}}, + data: nil, + fieldName: metricGPUUtil, + expected: map[string]string{}, + }, + { + name: "maps PID values to pod UIDs for GPU util", + pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 1002: {UID: "uid2"}}, + data: &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{1001: 50, 1002: 75}, + }, + fieldName: metricGPUUtil, + expected: map[string]string{"uid1": "50", "uid2": "75"}, + }, + { + name: "maps PID values to pod UIDs for FB used", + pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}}, + data: &perProcessMetrics{ + pidToMemory: map[uint32]uint64{1001: 1024 * 1024 * 1024}, + }, + fieldName: metricFBUsed, + expected: map[string]string{"uid1": "1024"}, + }, + { + name: "skips PIDs without metric data", + pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 2002: {UID: "uid2"}}, + data: &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{1001: 50}, + }, + fieldName: metricGPUUtil, + expected: map[string]string{"uid1": "50"}, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := buildPodValueMap(tc.pidToPod, tc.data, tc.fieldName) + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestBuildIdlePodValues(t *testing.T) { + tests := []struct { + name string + existingValues map[string]string + devicePods []PodInfo + expected map[string]string + }{ + { + name: "adds zero values for idle pods", + existingValues: map[string]string{"uid1": "50"}, + devicePods: []PodInfo{{UID: "uid1"}, {UID: "uid2"}, {UID: "uid3"}}, + expected: map[string]string{"uid2": "0", "uid3": "0"}, + }, + { + name: "skips pods with existing values", + existingValues: map[string]string{"uid1": "50", "uid2": "75"}, + devicePods: []PodInfo{{UID: "uid1"}, {UID: "uid2"}}, + expected: map[string]string{}, + }, + { + name: "all pods idle", + existingValues: map[string]string{}, + devicePods: []PodInfo{{UID: "uid1"}, {UID: "uid2"}}, + expected: map[string]string{"uid1": "0", "uid2": "0"}, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := buildIdlePodValues(tc.existingValues, tc.devicePods) + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestPodMapper_CreatePerProcessMetrics(t *testing.T) { + gpuUUID := "GPU-00000000-0000-0000-0000-000000000000" + podUID := "a9c80282-3f6b-4d5b-84d5-a137a6668011" + + tests := []struct { + name string + useOldNS bool + dataMap *perProcessDataMap + counter counters.Counter + originalMetric collector.Metric + validate func(t *testing.T, result []collector.Metric, err error) + }{ + { + name: "no deviceToPods returns nil", + useOldNS: false, + dataMap: &perProcessDataMap{ + metrics: map[string]*perProcessMetrics{gpuUUID: {pidToSMUtil: map[uint32]uint32{1001: 50}}}, + pidToPod: map[uint32]*PodInfo{1001: {UID: podUID}}, + deviceToPods: map[string][]PodInfo{}, + }, + counter: counters.Counter{FieldName: metricGPUUtil}, + originalMetric: collector.Metric{ + GPUUUID: gpuUUID, + Attributes: map[string]string{}, + }, + validate: func(t *testing.T, result []collector.Metric, err error) { + assert.NoError(t, err) + assert.Nil(t, result) + }, + }, + { + name: "creates metrics with new namespace attributes", + useOldNS: false, + dataMap: &perProcessDataMap{ + metrics: map[string]*perProcessMetrics{ + gpuUUID: { + pidToSMUtil: map[uint32]uint32{1001: 50}, + pidToMemory: map[uint32]uint64{1001: 1024 * 1024 * 1024}, + }, + }, + pidToPod: map[uint32]*PodInfo{ + 1001: {Name: "test-pod", Namespace: "default", UID: podUID, Container: "app"}, + }, + deviceToPods: map[string][]PodInfo{ + gpuUUID: {{Name: "test-pod", Namespace: "default", UID: podUID, Container: "app"}}, + }, + }, + counter: counters.Counter{FieldName: metricGPUUtil}, + originalMetric: collector.Metric{ + GPUUUID: gpuUUID, + Value: "0", + Attributes: map[string]string{}, + }, + validate: func(t *testing.T, result []collector.Metric, err error) { + assert.NoError(t, err) + require.Len(t, result, 1) + assert.Equal(t, "50", result[0].Value) + assert.Equal(t, "test-pod", result[0].Attributes[podAttribute]) + assert.Equal(t, "default", result[0].Attributes[namespaceAttribute]) + assert.Equal(t, "app", result[0].Attributes[containerAttribute]) + assert.Equal(t, podUID, result[0].Attributes[uidAttribute]) + }, + }, + { + name: "creates metrics with old namespace attributes", + useOldNS: true, + dataMap: &perProcessDataMap{ + metrics: map[string]*perProcessMetrics{ + gpuUUID: { + pidToSMUtil: map[uint32]uint32{1001: 75}, + }, + }, + pidToPod: map[uint32]*PodInfo{ + 1001: {Name: "old-pod", Namespace: "kube-system", UID: podUID, Container: "container"}, + }, + deviceToPods: map[string][]PodInfo{ + gpuUUID: {{Name: "old-pod", Namespace: "kube-system", UID: podUID, Container: "container"}}, + }, + }, + counter: counters.Counter{FieldName: metricGPUUtil}, + originalMetric: collector.Metric{ + GPUUUID: gpuUUID, + Attributes: map[string]string{}, + }, + validate: func(t *testing.T, result []collector.Metric, err error) { + assert.NoError(t, err) + require.Len(t, result, 1) + assert.Equal(t, "75", result[0].Value) + assert.Equal(t, "old-pod", result[0].Attributes[oldPodAttribute]) + assert.Equal(t, "kube-system", result[0].Attributes[oldNamespaceAttribute]) + assert.Equal(t, "container", result[0].Attributes[oldContainerAttribute]) + }, + }, + { + name: "includes VGPU attribute when present", + useOldNS: false, + dataMap: &perProcessDataMap{ + metrics: map[string]*perProcessMetrics{ + gpuUUID: { + pidToSMUtil: map[uint32]uint32{1001: 25}, + }, + }, + pidToPod: map[uint32]*PodInfo{ + 1001: {Name: "vgpu-pod", Namespace: "default", UID: podUID, VGPU: "vgpu-0"}, + }, + deviceToPods: map[string][]PodInfo{ + gpuUUID: {{Name: "vgpu-pod", Namespace: "default", UID: podUID, VGPU: "vgpu-0"}}, + }, + }, + counter: counters.Counter{FieldName: metricGPUUtil}, + originalMetric: collector.Metric{ + GPUUUID: gpuUUID, + Attributes: map[string]string{}, + }, + validate: func(t *testing.T, result []collector.Metric, err error) { + assert.NoError(t, err) + require.Len(t, result, 1) + assert.Equal(t, "vgpu-0", result[0].Attributes[vgpuAttribute]) + }, + }, + { + name: "backfills idle pods with zero for GPU util", + useOldNS: false, + dataMap: &perProcessDataMap{ + metrics: map[string]*perProcessMetrics{ + gpuUUID: { + pidToSMUtil: map[uint32]uint32{1001: 50}, + }, + }, + pidToPod: map[uint32]*PodInfo{ + 1001: {Name: "active-pod", Namespace: "ns1", UID: "uid1"}, + }, + deviceToPods: map[string][]PodInfo{ + gpuUUID: { + {Name: "active-pod", Namespace: "ns1", UID: "uid1"}, + {Name: "idle-pod", Namespace: "ns2", UID: "uid2"}, + }, + }, + }, + counter: counters.Counter{FieldName: metricGPUUtil}, + originalMetric: collector.Metric{ + GPUUUID: gpuUUID, + Attributes: map[string]string{}, + }, + validate: func(t *testing.T, result []collector.Metric, err error) { + assert.NoError(t, err) + require.Len(t, result, 2) + values := map[string]string{} + for _, m := range result { + values[m.Attributes[podAttribute]] = m.Value + } + assert.Equal(t, "50", values["active-pod"]) + assert.Equal(t, "0", values["idle-pod"]) + }, + }, + { + name: "backfills idle pods with zero for FB used", + useOldNS: false, + dataMap: &perProcessDataMap{ + metrics: map[string]*perProcessMetrics{ + gpuUUID: { + pidToMemory: map[uint32]uint64{1001: 1024 * 1024 * 1024}, + }, + }, + pidToPod: map[uint32]*PodInfo{ + 1001: {Name: "active-pod", Namespace: "ns1", UID: "uid1"}, + }, + deviceToPods: map[string][]PodInfo{ + gpuUUID: { + {Name: "active-pod", Namespace: "ns1", UID: "uid1"}, + {Name: "idle-pod", Namespace: "ns2", UID: "uid2"}, + }, + }, + }, + counter: counters.Counter{FieldName: metricFBUsed}, + originalMetric: collector.Metric{ + GPUUUID: gpuUUID, + Attributes: map[string]string{}, + }, + validate: func(t *testing.T, result []collector.Metric, err error) { + assert.NoError(t, err) + require.Len(t, result, 2) + values := map[string]string{} + for _, m := range result { + values[m.Attributes[podAttribute]] = m.Value + } + assert.Equal(t, "1024", values["active-pod"]) + assert.Equal(t, "0", values["idle-pod"]) + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + podMapper := &PodMapper{ + Config: &appconfig.Config{ + UseOldNamespace: tc.useOldNS, + }, + } + + result, err := podMapper.createPerProcessMetrics( + tc.originalMetric, + tc.counter, + tc.originalMetric, + tc.dataMap, + ) + + tc.validate(t, result, err) + }) + } +} + +func TestStripVGPUSuffix(t *testing.T) { + tests := []struct { + name string + deviceID string + expected string + }{ + { + name: "AWS MIG device ID with vgpu suffix", + deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55::7", + expected: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55", + }, + { + name: "AWS MIG device ID with different vgpu index", + deviceID: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18::9", + expected: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18", + }, + { + name: "Plain MIG UUID without suffix", + deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55", + expected: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55", + }, + { + name: "Regular GPU UUID", + deviceID: "GPU-65759866-6a45-99ff-bc37-c534ea0ae191", + expected: "GPU-65759866-6a45-99ff-bc37-c534ea0ae191", + }, + { + name: "Non-MIG device ID with vgpu suffix", + deviceID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::2", + expected: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + name: "Empty string", + deviceID: "", + expected: "", + }, + { + name: "Device ID with empty suffix", + deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55::", + expected: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := stripVGPUSuffix(tc.deviceID) + assert.Equal(t, tc.expected, result) + }) + } +} diff --git a/internal/pkg/transformation/pidmapper.go b/internal/pkg/transformation/pidmapper.go new file mode 100644 index 00000000..faf6080e --- /dev/null +++ b/internal/pkg/transformation/pidmapper.go @@ -0,0 +1,103 @@ +//go:build linux + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import ( + "fmt" + "regexp" + "strings" + + "github.com/containerd/cgroups/v3" +) + +var podUIDRegex = regexp.MustCompile(`pod([a-f0-9_-]+)`) + +type pidToPodMapper struct { + pidToUID map[uint32]string +} + +func newPIDToPodMapper() *pidToPodMapper { + return &pidToPodMapper{pidToUID: make(map[uint32]string)} +} + +func (m *pidToPodMapper) getPodUIDForPID(pid uint32) (string, error) { + if uid, ok := m.pidToUID[pid]; ok { + return uid, nil + } + + cgroupPath := fmt.Sprintf("/proc/%d/cgroup", pid) + subsystems, unified, err := cgroups.ParseCgroupFileUnified(cgroupPath) + if err != nil { + return "", fmt.Errorf("failed to parse cgroup file for PID %d: %w", pid, err) + } + + uid := extractPodUIDFromPaths(subsystems, unified) + if uid != "" { + m.pidToUID[pid] = uid + } + return uid, nil +} + +func extractPodUIDFromPaths(subsystems map[string]string, unified string) string { + for _, path := range subsystems { + if uid := extractPodUID(path); uid != "" { + return uid + } + } + if uid := extractPodUID(unified); uid != "" { + return uid + } + return "" +} + +func extractPodUID(path string) string { + matches := podUIDRegex.FindStringSubmatch(path) + if len(matches) < 2 { + return "" + } + uid := strings.ReplaceAll(matches[1], "_", "-") + if len(uid) < 32 { + return "" + } + return uid +} + +func (m *pidToPodMapper) buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uint32]*PodInfo { + uidToPod := make(map[string]*PodInfo) + for i := range pods { + if pods[i].UID != "" { + uidToPod[pods[i].UID] = &pods[i] + } + } + result := make(map[uint32]*PodInfo) + for _, pid := range pids { + uid, err := m.getPodUIDForPID(pid) + if err != nil { + continue + } + if uid == "" { + continue + } + if pod, ok := uidToPod[uid]; ok { + result[pid] = pod + } + } + + return result +} diff --git a/internal/pkg/transformation/pidmapper_stub.go b/internal/pkg/transformation/pidmapper_stub.go new file mode 100644 index 00000000..35eaa7cd --- /dev/null +++ b/internal/pkg/transformation/pidmapper_stub.go @@ -0,0 +1,35 @@ +//go:build !linux + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import "fmt" + +type pidToPodMapper struct{} + +func newPIDToPodMapper() *pidToPodMapper { + return &pidToPodMapper{} +} + +func (m *pidToPodMapper) getPodUIDForPID(pid uint32) (string, error) { + return "", fmt.Errorf("PID to Pod mapping is only supported on Linux") +} + +func (m *pidToPodMapper) buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uint32]*PodInfo { + return make(map[uint32]*PodInfo) +} diff --git a/internal/pkg/transformation/pidmapper_test.go b/internal/pkg/transformation/pidmapper_test.go new file mode 100644 index 00000000..45f86f0a --- /dev/null +++ b/internal/pkg/transformation/pidmapper_test.go @@ -0,0 +1,133 @@ +//go:build linux + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestExtractPodUID(t *testing.T) { + tests := []struct { + name string + path string + expected string + }{ + { + name: "cgroups v1 besteffort", + path: "/kubepods/besteffort/poda9c80282-3f6b-4d5b-84d5-a137a6668011/container123", + expected: "a9c80282-3f6b-4d5b-84d5-a137a6668011", + }, + { + name: "cgroups v1 burstable", + path: "/kubepods/burstable/pod12345678-1234-1234-1234-123456789012/abc", + expected: "12345678-1234-1234-1234-123456789012", + }, + { + name: "cgroups v2 with underscores", + path: "/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-poda9c80282_3f6b_4d5b_84d5_a137a6668011.slice", + expected: "a9c80282-3f6b-4d5b-84d5-a137a6668011", + }, + { + name: "no pod UID", + path: "/system.slice/docker.service", + expected: "", + }, + { + name: "short UID (invalid)", + path: "/kubepods/pod123/container", + expected: "", + }, + { + name: "empty path", + path: "", + expected: "", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := extractPodUID(tc.path) + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestExtractPodUIDFromPaths(t *testing.T) { + tests := []struct { + name string + subsystems map[string]string + unified string + expected string + }{ + { + name: "found in subsystems", + subsystems: map[string]string{ + "memory": "/kubepods/besteffort/poda9c80282-3f6b-4d5b-84d5-a137a6668011/container", + "cpu": "/kubepods/besteffort/poda9c80282-3f6b-4d5b-84d5-a137a6668011/container", + }, + unified: "", + expected: "a9c80282-3f6b-4d5b-84d5-a137a6668011", + }, + { + name: "found in unified only", + subsystems: map[string]string{}, + unified: "/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod12345678_1234_1234_1234_123456789012.slice", + expected: "12345678-1234-1234-1234-123456789012", + }, + { + name: "not found", + subsystems: map[string]string{ + "memory": "/system.slice/docker.service", + }, + unified: "/user.slice", + expected: "", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := extractPodUIDFromPaths(tc.subsystems, tc.unified) + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestBuildPIDToPodMap(t *testing.T) { + mapper := newPIDToPodMapper() + + pods := []PodInfo{ + {Name: "pod1", Namespace: "default", UID: "uid-1"}, + {Name: "pod2", Namespace: "default", UID: "uid-2"}, + {Name: "pod3", Namespace: "kube-system", UID: ""}, + } + + mapper.pidToUID[1001] = "uid-1" + mapper.pidToUID[1002] = "uid-2" + mapper.pidToUID[1003] = "uid-unknown" + + result := mapper.buildPIDToPodMap([]uint32{1001, 1002, 1003, 1004}, pods) + + assert.Len(t, result, 2) + assert.Equal(t, "pod1", result[1001].Name) + assert.Equal(t, "pod2", result[1002].Name) + assert.Nil(t, result[1003]) + assert.Nil(t, result[1004]) +} diff --git a/internal/pkg/transformation/process_metrics.go b/internal/pkg/transformation/process_metrics.go new file mode 100644 index 00000000..b1c0f9ac --- /dev/null +++ b/internal/pkg/transformation/process_metrics.go @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import ( + "fmt" + "log/slog" + "maps" + "slices" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" +) + +func isPerProcessMetric(fieldName string) bool { + return fieldName == metricGPUUtil || fieldName == metricFBUsed || fieldName == metricGREngineActive +} + +// getGPUUUIDToDeviceID builds a mapping from GPU UUID to device ID based on the specified ID type. +func getGPUUUIDToDeviceID(devInfo deviceinfo.Provider, idType appconfig.KubernetesGPUIDType) map[string]string { + result := make(map[string]string, devInfo.GPUCount()) + for i := uint(0); i < devInfo.GPUCount(); i++ { + gpu := devInfo.GPU(i) + uuid := gpu.DeviceInfo.UUID + + var deviceID string + switch idType { + case appconfig.GPUUID: + deviceID = uuid + default: + deviceID = fmt.Sprintf("nvidia%d", gpu.DeviceInfo.GPU) + } + + result[uuid] = deviceID + } + return result +} + +type perProcessMetrics struct { + pidToSMUtil map[uint32]uint32 + pidToMemory map[uint32]uint64 +} + +func (c *perProcessCollector) processRegularGPU(gpuUUID string, podInfos []PodInfo) (*perProcessMetrics, map[uint32]*PodInfo) { + var err error + data := &perProcessMetrics{} + + data.pidToMemory, err = c.client.GetDeviceProcessMemory(gpuUUID) + if err != nil { + slog.Debug("Failed to get process memory", "gpuUUID", gpuUUID, "error", err) + } + + data.pidToSMUtil, err = c.client.GetDeviceProcessUtilization(gpuUUID) + if err != nil { + slog.Debug("Failed to get process utilization", "gpuUUID", gpuUUID, "error", err) + } + + pidToPod := c.pidMapper.buildPIDToPodMap(data.getAllPIDs(), podInfos) + return data, pidToPod +} + +func (m *perProcessMetrics) getAllPIDs() []uint32 { + pidSet := make(map[uint32]struct{}) + for pid := range m.pidToSMUtil { + pidSet[pid] = struct{}{} + } + for pid := range m.pidToMemory { + pidSet[pid] = struct{}{} + } + return slices.Collect(maps.Keys(pidSet)) +} + +func (m *perProcessMetrics) getValueForMetric(fieldName string, pid uint32) (string, bool) { + switch fieldName { + case metricGPUUtil: + if util, ok := m.pidToSMUtil[pid]; ok { + return fmt.Sprintf("%d", util), true + } + case metricFBUsed: + if mem, ok := m.pidToMemory[pid]; ok { + memMiB := mem / (1024 * 1024) + return fmt.Sprintf("%d", memMiB), true + } + } + return "", false +} + +type perProcessDataMap struct { + metrics map[string]*perProcessMetrics // keyed by GPU UUID or "/" for MIG + pidToPod map[uint32]*PodInfo + deviceToPods map[string][]PodInfo // keyed by GPU UUID or "/" for MIG +} + +type PIDMapper interface { + buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uint32]*PodInfo +} + +type perProcessCollector struct { + client nvmlprovider.NVML + pidMapper PIDMapper +} + +func getMIGMetricsKey(parentUUID string, gpuInstanceID string) string { + return parentUUID + "/" + gpuInstanceID +} + +func (c *perProcessCollector) processMIGEnabledGPU( + gpu deviceinfo.GPUInfo, + deviceToPods map[string][]PodInfo, +) (map[string]*perProcessMetrics, map[uint32]*PodInfo, map[string][]PodInfo) { + gpuUUID := gpu.DeviceInfo.UUID + gpuIndex := gpu.DeviceInfo.GPU + + allMIGProcessMemory, err := c.client.GetAllMIGDevicesProcessMemory(gpuUUID) + if err != nil { + slog.Debug("Failed to get MIG device process memory", "gpuUUID", gpuUUID, "error", err) + return nil, nil, nil + } + + metrics := make(map[string]*perProcessMetrics) + pidToPod := make(map[uint32]*PodInfo) + migKeyToPods := make(map[string][]PodInfo) + + for _, instance := range gpu.GPUInstances { + gpuInstanceID := instance.Info.NvmlInstanceId + migDeviceID := fmt.Sprintf("%d-%d", gpuIndex, gpuInstanceID) + podInfos := deviceToPods[migDeviceID] + + if len(podInfos) == 0 { + continue + } + + data := &perProcessMetrics{pidToMemory: allMIGProcessMemory[gpuInstanceID]} + migKey := getMIGMetricsKey(gpuUUID, fmt.Sprintf("%d", gpuInstanceID)) + metrics[migKey] = data + migKeyToPods[migKey] = podInfos + maps.Copy(pidToPod, c.pidMapper.buildPIDToPodMap(data.getAllPIDs(), podInfos)) + } + + return metrics, pidToPod, migKeyToPods +} + +func (c *perProcessCollector) Collect(gpuDeviceMap map[string]string, deviceToPods map[string][]PodInfo, devInfo deviceinfo.Provider) *perProcessDataMap { + result := &perProcessDataMap{ + metrics: make(map[string]*perProcessMetrics), + pidToPod: make(map[uint32]*PodInfo), + deviceToPods: make(map[string][]PodInfo), + } + + if devInfo == nil || c.client == nil { + return result + } + + for i := uint(0); i < devInfo.GPUCount(); i++ { + gpu := devInfo.GPU(i) + gpuUUID := gpu.DeviceInfo.UUID + + if len(gpu.GPUInstances) > 0 { + metrics, pidToPod, keyToPods := c.processMIGEnabledGPU(gpu, deviceToPods) + maps.Copy(result.metrics, metrics) + maps.Copy(result.pidToPod, pidToPod) + maps.Copy(result.deviceToPods, keyToPods) + } else { + deviceID := gpuDeviceMap[gpuUUID] + podInfos := deviceToPods[deviceID] + if len(podInfos) == 0 { + continue + } + data, pidToPod := c.processRegularGPU(gpuUUID, podInfos) + result.metrics[gpuUUID] = data + result.deviceToPods[gpuUUID] = podInfos + maps.Copy(result.pidToPod, pidToPod) + } + } + + return result +} diff --git a/internal/pkg/transformation/process_metrics_test.go b/internal/pkg/transformation/process_metrics_test.go new file mode 100644 index 00000000..143f602f --- /dev/null +++ b/internal/pkg/transformation/process_metrics_test.go @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import ( + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + mocknvmlprovider "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/nvmlprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" +) + +type mockPIDMapper struct { + result map[uint32]*PodInfo +} + +func (m *mockPIDMapper) buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uint32]*PodInfo { + return m.result +} + +func TestGetGPUUUIDToDeviceID(t *testing.T) { + gpu0UUID := "GPU-00000000-0000-0000-0000-000000000000" + gpu1UUID := "GPU-11111111-1111-1111-1111-111111111111" + + tests := []struct { + name string + idType appconfig.KubernetesGPUIDType + expected map[string]string + }{ + { + name: "device name type", + idType: appconfig.DeviceName, + expected: map[string]string{ + gpu0UUID: "nvidia0", + gpu1UUID: "nvidia1", + }, + }, + { + name: "GPU UUID type", + idType: appconfig.GPUUID, + expected: map[string]string{ + gpu0UUID: gpu0UUID, + gpu1UUID: gpu1UUID, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(2)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + }).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(1)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu1UUID, GPU: 1}, + }).AnyTimes() + + result := getGPUUUIDToDeviceID(mockDevInfo, tc.idType) + + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestIsPerProcessMetric(t *testing.T) { + tests := []struct { + fieldName string + expected bool + }{ + {metricGPUUtil, true}, + {metricFBUsed, true}, + {metricGREngineActive, true}, + {"DCGM_FI_DEV_POWER_USAGE", false}, + {"DCGM_FI_DEV_GPU_TEMP", false}, + {"", false}, + } + + for _, tc := range tests { + t.Run(tc.fieldName, func(t *testing.T) { + result := isPerProcessMetric(tc.fieldName) + assert.Equal(t, tc.expected, result) + }) + } +} + +func TestPerProcessMetrics_GetAllPIDs(t *testing.T) { + metrics := &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{ + 1001: 50, + 1002: 30, + }, + pidToMemory: map[uint32]uint64{ + 1002: 1024, + 1003: 2048, + }, + } + + pids := metrics.getAllPIDs() + + assert.Len(t, pids, 3) + assert.Contains(t, pids, uint32(1001)) + assert.Contains(t, pids, uint32(1002)) + assert.Contains(t, pids, uint32(1003)) +} + +func TestPerProcessMetrics_GetValueForMetric(t *testing.T) { + metrics := &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{ + 1001: 50, + 1002: 100, + }, + pidToMemory: map[uint32]uint64{ + 1001: 1024 * 1024 * 1024, + 1002: 512 * 1024 * 1024, + }, + } + + tests := []struct { + name string + fieldName string + pid uint32 + expected string + hasValue bool + }{ + { + name: "GPU util - 50%", + fieldName: metricGPUUtil, + pid: 1001, + expected: "50", + hasValue: true, + }, + { + name: "GPU util - 100%", + fieldName: metricGPUUtil, + pid: 1002, + expected: "100", + hasValue: true, + }, + { + name: "FB used - 1GB", + fieldName: metricFBUsed, + pid: 1001, + expected: "1024", + hasValue: true, + }, + { + name: "FB used - 512MB", + fieldName: metricFBUsed, + pid: 1002, + expected: "512", + hasValue: true, + }, + { + name: "unknown metric", + fieldName: "DCGM_FI_DEV_POWER_USAGE", + pid: 1001, + expected: "", + hasValue: false, + }, + { + name: "unknown PID", + fieldName: metricGPUUtil, + pid: 9999, + expected: "", + hasValue: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + value, hasValue := metrics.getValueForMetric(tc.fieldName, tc.pid) + assert.Equal(t, tc.hasValue, hasValue) + if tc.hasValue { + assert.Equal(t, tc.expected, value) + } + }) + } +} + +func TestPerProcessMetrics_EmptyMaps(t *testing.T) { + metrics := &perProcessMetrics{ + pidToSMUtil: make(map[uint32]uint32), + pidToMemory: make(map[uint32]uint64), + } + + pids := metrics.getAllPIDs() + assert.Len(t, pids, 0) + + value, hasValue := metrics.getValueForMetric(metricGPUUtil, 1001) + assert.False(t, hasValue) + assert.Equal(t, "", value) +} + +func TestPerProcessCollector_Collect(t *testing.T) { + gpu0UUID := "GPU-00000000-0000-0000-0000-000000000000" + gpu1UUID := "GPU-11111111-1111-1111-1111-111111111111" + podUID0 := "a9c80282-3f6b-4d5b-84d5-a137a6668011" + podUID1 := "b9c80282-3f6b-4d5b-84d5-b137a6668022" + + pod0 := &PodInfo{Name: "test-pod", Namespace: "default", UID: podUID0, Container: "app"} + pod1 := &PodInfo{Name: "pod1", Namespace: "ns1", UID: podUID1} + + tests := []struct { + name string + setupMocks func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) + gpuDeviceMap map[string]string + deviceToPods map[string][]PodInfo + pidToPod map[uint32]*PodInfo + validate func(t *testing.T, result *perProcessDataMap) + }{ + { + name: "nil devInfo returns empty", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + return mockNVML, nil + }, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Empty(t, result.metrics) + assert.Empty(t, result.pidToPod) + assert.Empty(t, result.deviceToPods) + }, + }, + { + name: "nil client returns empty", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + return nil, mockDevInfo + }, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Empty(t, result.metrics) + assert.Empty(t, result.pidToPod) + assert.Empty(t, result.deviceToPods) + }, + }, + { + name: "single regular GPU with processes", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + mockNVML.EXPECT().GetDeviceProcessMemory(gpu0UUID).Return(map[uint32]uint64{ + 1001: 1024 * 1024 * 1024, + 1002: 512 * 1024 * 1024, + }, nil) + mockNVML.EXPECT().GetDeviceProcessUtilization(gpu0UUID).Return(map[uint32]uint32{ + 1001: 50, + 1002: 30, + }, nil) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + GPUInstances: nil, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{gpu0UUID: "nvidia0"}, + deviceToPods: map[string][]PodInfo{ + "nvidia0": {{Name: "test-pod", Namespace: "default", UID: podUID0, Container: "app"}}, + }, + pidToPod: map[uint32]*PodInfo{1001: pod0}, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Contains(t, result.metrics, gpu0UUID) + + gpuMetrics := result.metrics[gpu0UUID] + assert.Equal(t, uint32(50), gpuMetrics.pidToSMUtil[1001]) + assert.Equal(t, uint32(30), gpuMetrics.pidToSMUtil[1002]) + assert.Equal(t, uint64(1024*1024*1024), gpuMetrics.pidToMemory[1001]) + assert.Equal(t, uint64(512*1024*1024), gpuMetrics.pidToMemory[1002]) + + assert.Len(t, result.pidToPod, 1) + assert.Equal(t, "test-pod", result.pidToPod[1001].Name) + assert.Equal(t, "default", result.pidToPod[1001].Namespace) + + assert.Contains(t, result.deviceToPods, gpu0UUID) + assert.Len(t, result.deviceToPods[gpu0UUID], 1) + assert.Equal(t, "test-pod", result.deviceToPods[gpu0UUID][0].Name) + }, + }, + { + name: "no pods using GPU skips collection", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + GPUInstances: nil, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{gpu0UUID: "nvidia0"}, + deviceToPods: map[string][]PodInfo{}, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Empty(t, result.metrics) + assert.Empty(t, result.pidToPod) + assert.Empty(t, result.deviceToPods) + }, + }, + { + name: "MIG-enabled GPU", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + mockNVML.EXPECT().GetAllMIGDevicesProcessMemory(gpu0UUID).Return(map[uint]map[uint32]uint64{ + 1: {2001: 256 * 1024 * 1024}, + }, nil) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + MigEnabled: true, + GPUInstances: []deviceinfo.GPUInstanceInfo{ + {Info: dcgm.MigEntityInfo{NvmlInstanceId: 1}, EntityId: 100}, + }, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{gpu0UUID: "nvidia0"}, + deviceToPods: map[string][]PodInfo{ + "0-1": {{Name: "mig-pod", Namespace: "default", UID: podUID0, Container: "app"}}, + }, + pidToPod: map[uint32]*PodInfo{2001: {Name: "mig-pod", Namespace: "default", UID: podUID0}}, + validate: func(t *testing.T, result *perProcessDataMap) { + migKey := getMIGMetricsKey(gpu0UUID, "1") + assert.Contains(t, result.metrics, migKey) + + migMetrics := result.metrics[migKey] + assert.Equal(t, uint64(256*1024*1024), migMetrics.pidToMemory[2001]) + assert.Nil(t, migMetrics.pidToSMUtil) + + assert.Len(t, result.pidToPod, 1) + assert.Equal(t, "mig-pod", result.pidToPod[2001].Name) + + assert.Contains(t, result.deviceToPods, migKey) + assert.Len(t, result.deviceToPods[migKey], 1) + assert.Equal(t, "mig-pod", result.deviceToPods[migKey][0].Name) + }, + }, + { + name: "multiple regular GPUs", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + mockNVML.EXPECT().GetDeviceProcessMemory(gpu0UUID).Return(map[uint32]uint64{1001: 100}, nil) + mockNVML.EXPECT().GetDeviceProcessUtilization(gpu0UUID).Return(map[uint32]uint32{1001: 10}, nil) + mockNVML.EXPECT().GetDeviceProcessMemory(gpu1UUID).Return(map[uint32]uint64{2001: 200}, nil) + mockNVML.EXPECT().GetDeviceProcessUtilization(gpu1UUID).Return(map[uint32]uint32{2001: 20}, nil) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(2)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + }).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(1)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu1UUID, GPU: 1}, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{ + gpu0UUID: "nvidia0", + gpu1UUID: "nvidia1", + }, + deviceToPods: map[string][]PodInfo{ + "nvidia0": {{Name: "pod0", Namespace: "ns0", UID: podUID0}}, + "nvidia1": {{Name: "pod1", Namespace: "ns1", UID: podUID1}}, + }, + pidToPod: map[uint32]*PodInfo{ + 1001: pod0, + 2001: pod1, + }, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Len(t, result.metrics, 2) + assert.Contains(t, result.metrics, gpu0UUID) + assert.Contains(t, result.metrics, gpu1UUID) + + assert.Equal(t, uint32(10), result.metrics[gpu0UUID].pidToSMUtil[1001]) + assert.Equal(t, uint64(100), result.metrics[gpu0UUID].pidToMemory[1001]) + assert.Equal(t, uint32(20), result.metrics[gpu1UUID].pidToSMUtil[2001]) + assert.Equal(t, uint64(200), result.metrics[gpu1UUID].pidToMemory[2001]) + + assert.Len(t, result.pidToPod, 2) + assert.Equal(t, "test-pod", result.pidToPod[1001].Name) + assert.Equal(t, "pod1", result.pidToPod[2001].Name) + + assert.Len(t, result.deviceToPods, 2) + assert.Contains(t, result.deviceToPods, gpu0UUID) + assert.Contains(t, result.deviceToPods, gpu1UUID) + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + client, devInfo := tc.setupMocks(ctrl) + + collector := &perProcessCollector{ + client: client, + pidMapper: &mockPIDMapper{result: tc.pidToPod}, + } + + result := collector.Collect(tc.gpuDeviceMap, tc.deviceToPods, devInfo) + + assert.NotNil(t, result) + tc.validate(t, result) + }) + } +} From 0269f2ab760c6fa4c96b4a41be598721fed23fd4 Mon Sep 17 00:00:00 2001 From: Krystian Bednarczuk Date: Mon, 8 Dec 2025 10:35:01 +0100 Subject: [PATCH 2/6] Cleanup: improve comments, reduce test duplication, fix copyright year Signed-off-by: Krystian Bednarczuk --- internal/pkg/nvmlprovider/provider.go | 5 ++- internal/pkg/nvmlprovider/provider_test.go | 24 +++++++++++++ internal/pkg/nvmlprovider/types.go | 6 ++-- internal/pkg/transformation/const.go | 6 ++-- .../pkg/transformation/kubernetes_test.go | 27 ++++++++++---- internal/pkg/transformation/pidmapper.go | 4 +-- internal/pkg/transformation/pidmapper_stub.go | 35 ------------------- internal/pkg/transformation/pidmapper_test.go | 9 +++-- .../pkg/transformation/process_metrics.go | 4 +-- .../transformation/process_metrics_test.go | 31 +++++++++------- 10 files changed, 80 insertions(+), 71 deletions(-) delete mode 100644 internal/pkg/transformation/pidmapper_stub.go diff --git a/internal/pkg/nvmlprovider/provider.go b/internal/pkg/nvmlprovider/provider.go index 24d3602c..493d5e94 100644 --- a/internal/pkg/nvmlprovider/provider.go +++ b/internal/pkg/nvmlprovider/provider.go @@ -221,9 +221,8 @@ func (n nvmlProvider) GetDeviceProcessUtilization(gpuUUID string) (map[uint32]ui return result, nil } -// GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU. -// Returns a map from GPU Instance ID to (PID -> memory used in bytes). -// Note: Only memory info is available for MIG devices, not SM utilization. +// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU. +// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes. func (n nvmlProvider) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) { if err := n.preCheck(); err != nil { return nil, fmt.Errorf("failed to get MIG device process memory: %w", err) diff --git a/internal/pkg/nvmlprovider/provider_test.go b/internal/pkg/nvmlprovider/provider_test.go index 45d3b85c..16b2daac 100644 --- a/internal/pkg/nvmlprovider/provider_test.go +++ b/internal/pkg/nvmlprovider/provider_test.go @@ -31,6 +31,30 @@ func TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized(t *testing.T) { assert.Error(t, err, "uuid: %v, Device Info: %+v", validMIGUUID, deviceInfo) } +func TestGetDeviceProcessMemory_When_NVML_Not_Initialized(t *testing.T) { + provider := nvmlProvider{} + result, err := provider.GetDeviceProcessMemory("GPU-test-uuid") + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), "failed to get device process memory") +} + +func TestGetDeviceProcessUtilization_When_NVML_Not_Initialized(t *testing.T) { + provider := nvmlProvider{} + result, err := provider.GetDeviceProcessUtilization("GPU-test-uuid") + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), "failed to get device process utilization") +} + +func TestGetAllMIGDevicesProcessMemory_When_NVML_Not_Initialized(t *testing.T) { + provider := nvmlProvider{} + result, err := provider.GetAllMIGDevicesProcessMemory("GPU-test-uuid") + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), "failed to get MIG device process memory") +} + func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) { _ = Initialize() assert.NotNil(t, Client(), "expected NVML Client to be not nil") diff --git a/internal/pkg/nvmlprovider/types.go b/internal/pkg/nvmlprovider/types.go index 6bc4f33f..7d2b3021 100644 --- a/internal/pkg/nvmlprovider/types.go +++ b/internal/pkg/nvmlprovider/types.go @@ -18,7 +18,6 @@ package nvmlprovider -// NVML interface provides access to NVIDIA Management Library functionality type NVML interface { GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error) // GetDeviceProcessMemory returns memory usage for processes running on the GPU. @@ -27,9 +26,8 @@ type NVML interface { // GetDeviceProcessUtilization returns SM utilization for processes running on the GPU. // Returns a map from PID to SM utilization percentage. GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error) - // GetAllMIGDevicesProcessMemory returns memory usage for all MIG devices on a parent GPU. - // Returns a map from GPU Instance ID to (PID -> memory used in bytes). - // Note: Only memory info is available for MIG devices, not SM utilization. + // GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU. + // Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes. GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) Cleanup() } diff --git a/internal/pkg/transformation/const.go b/internal/pkg/transformation/const.go index 951b1a0e..96a593f9 100644 --- a/internal/pkg/transformation/const.go +++ b/internal/pkg/transformation/const.go @@ -17,6 +17,7 @@ package transformation const ( + // Note standard resource attributes podAttribute = "pod" namespaceAttribute = "namespace" containerAttribute = "container" @@ -39,7 +40,6 @@ const ( DRAGPUDriverName = "gpu.nvidia.com" - metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL" - metricFBUsed = "DCGM_FI_DEV_FB_USED" - metricGREngineActive = "DCGM_FI_PROF_GR_ENGINE_ACTIVE" + metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL" + metricFBUsed = "DCGM_FI_DEV_FB_USED" ) diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index f943b4b3..c645f4c3 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -347,6 +347,9 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { ctrl := gomock.NewController(t) mockNVMLProvider := mocknvmlprovider.NewMockNVML(ctrl) mockNVMLProvider.EXPECT().GetMIGDeviceInfoByID(gomock.Any()).Return(migDeviceInfo, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetDeviceProcessMemory(gomock.Any()).Return(map[uint32]uint64{}, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetDeviceProcessUtilization(gomock.Any()).Return(map[uint32]uint32{}, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetAllMIGDevicesProcessMemory(gomock.Any()).Return(map[uint]map[uint32]uint64{}, nil).AnyTimes() nvmlprovider.SetClient(mockNVMLProvider) podMapper := NewPodMapper(&appconfig.Config{ @@ -946,6 +949,7 @@ func TestProcessPodMapper_WithLabelsAndUID(t *testing.T) { } func TestBuildPodValueMap(t *testing.T) { + t.Parallel() tests := []struct { name string pidToPod map[uint32]*PodInfo @@ -960,6 +964,15 @@ func TestBuildPodValueMap(t *testing.T) { fieldName: metricGPUUtil, expected: map[string]string{}, }, + { + name: "empty pidToPod returns empty map", + pidToPod: map[uint32]*PodInfo{}, + data: &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{1001: 50}, + }, + fieldName: metricGPUUtil, + expected: map[string]string{}, + }, { name: "maps PID values to pod UIDs for GPU util", pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 1002: {UID: "uid2"}}, @@ -991,6 +1004,7 @@ func TestBuildPodValueMap(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() result := buildPodValueMap(tc.pidToPod, tc.data, tc.fieldName) assert.Equal(t, tc.expected, result) }) @@ -998,6 +1012,7 @@ func TestBuildPodValueMap(t *testing.T) { } func TestBuildIdlePodValues(t *testing.T) { + t.Parallel() tests := []struct { name string existingValues map[string]string @@ -1026,6 +1041,7 @@ func TestBuildIdlePodValues(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() result := buildIdlePodValues(tc.existingValues, tc.devicePods) assert.Equal(t, tc.expected, result) }) @@ -1033,6 +1049,7 @@ func TestBuildIdlePodValues(t *testing.T) { } func TestPodMapper_CreatePerProcessMetrics(t *testing.T) { + t.Parallel() gpuUUID := "GPU-00000000-0000-0000-0000-000000000000" podUID := "a9c80282-3f6b-4d5b-84d5-a137a6668011" @@ -1226,6 +1243,7 @@ func TestPodMapper_CreatePerProcessMetrics(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() podMapper := &PodMapper{ Config: &appconfig.Config{ UseOldNamespace: tc.useOldNS, @@ -1245,21 +1263,17 @@ func TestPodMapper_CreatePerProcessMetrics(t *testing.T) { } func TestStripVGPUSuffix(t *testing.T) { + t.Parallel() tests := []struct { name string deviceID string expected string }{ { - name: "AWS MIG device ID with vgpu suffix", + name: "MIG device ID with vgpu suffix", deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55::7", expected: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55", }, - { - name: "AWS MIG device ID with different vgpu index", - deviceID: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18::9", - expected: "MIG-a8d7e63b-588b-5fd8-826d-d1eab19c6f18", - }, { name: "Plain MIG UUID without suffix", deviceID: "MIG-2ce7a541-c516-5dbc-a76e-26cc100d9b55", @@ -1289,6 +1303,7 @@ func TestStripVGPUSuffix(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() result := stripVGPUSuffix(tc.deviceID) assert.Equal(t, tc.expected, result) }) diff --git a/internal/pkg/transformation/pidmapper.go b/internal/pkg/transformation/pidmapper.go index faf6080e..a265cae0 100644 --- a/internal/pkg/transformation/pidmapper.go +++ b/internal/pkg/transformation/pidmapper.go @@ -1,7 +1,5 @@ -//go:build linux - /* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/internal/pkg/transformation/pidmapper_stub.go b/internal/pkg/transformation/pidmapper_stub.go deleted file mode 100644 index 35eaa7cd..00000000 --- a/internal/pkg/transformation/pidmapper_stub.go +++ /dev/null @@ -1,35 +0,0 @@ -//go:build !linux - -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package transformation - -import "fmt" - -type pidToPodMapper struct{} - -func newPIDToPodMapper() *pidToPodMapper { - return &pidToPodMapper{} -} - -func (m *pidToPodMapper) getPodUIDForPID(pid uint32) (string, error) { - return "", fmt.Errorf("PID to Pod mapping is only supported on Linux") -} - -func (m *pidToPodMapper) buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uint32]*PodInfo { - return make(map[uint32]*PodInfo) -} diff --git a/internal/pkg/transformation/pidmapper_test.go b/internal/pkg/transformation/pidmapper_test.go index 45f86f0a..e299421d 100644 --- a/internal/pkg/transformation/pidmapper_test.go +++ b/internal/pkg/transformation/pidmapper_test.go @@ -1,7 +1,5 @@ -//go:build linux - /* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +23,7 @@ import ( ) func TestExtractPodUID(t *testing.T) { + t.Parallel() tests := []struct { name string path string @@ -64,6 +63,7 @@ func TestExtractPodUID(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() result := extractPodUID(tc.path) assert.Equal(t, tc.expected, result) }) @@ -71,6 +71,7 @@ func TestExtractPodUID(t *testing.T) { } func TestExtractPodUIDFromPaths(t *testing.T) { + t.Parallel() tests := []struct { name string subsystems map[string]string @@ -104,6 +105,7 @@ func TestExtractPodUIDFromPaths(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() result := extractPodUIDFromPaths(tc.subsystems, tc.unified) assert.Equal(t, tc.expected, result) }) @@ -111,6 +113,7 @@ func TestExtractPodUIDFromPaths(t *testing.T) { } func TestBuildPIDToPodMap(t *testing.T) { + t.Parallel() mapper := newPIDToPodMapper() pods := []PodInfo{ diff --git a/internal/pkg/transformation/process_metrics.go b/internal/pkg/transformation/process_metrics.go index b1c0f9ac..ca554bdf 100644 --- a/internal/pkg/transformation/process_metrics.go +++ b/internal/pkg/transformation/process_metrics.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ import ( ) func isPerProcessMetric(fieldName string) bool { - return fieldName == metricGPUUtil || fieldName == metricFBUsed || fieldName == metricGREngineActive + return fieldName == metricGPUUtil || fieldName == metricFBUsed } // getGPUUUIDToDeviceID builds a mapping from GPU UUID to device ID based on the specified ID type. diff --git a/internal/pkg/transformation/process_metrics_test.go b/internal/pkg/transformation/process_metrics_test.go index 143f602f..3fa5dc88 100644 --- a/internal/pkg/transformation/process_metrics_test.go +++ b/internal/pkg/transformation/process_metrics_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,6 +39,7 @@ func (m *mockPIDMapper) buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uint } func TestGetGPUUUIDToDeviceID(t *testing.T) { + t.Parallel() gpu0UUID := "GPU-00000000-0000-0000-0000-000000000000" gpu1UUID := "GPU-11111111-1111-1111-1111-111111111111" @@ -67,6 +68,7 @@ func TestGetGPUUUIDToDeviceID(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -87,20 +89,20 @@ func TestGetGPUUUIDToDeviceID(t *testing.T) { } func TestIsPerProcessMetric(t *testing.T) { + t.Parallel() tests := []struct { fieldName string expected bool }{ {metricGPUUtil, true}, {metricFBUsed, true}, - {metricGREngineActive, true}, {"DCGM_FI_DEV_POWER_USAGE", false}, - {"DCGM_FI_DEV_GPU_TEMP", false}, {"", false}, } for _, tc := range tests { t.Run(tc.fieldName, func(t *testing.T) { + t.Parallel() result := isPerProcessMetric(tc.fieldName) assert.Equal(t, tc.expected, result) }) @@ -108,6 +110,7 @@ func TestIsPerProcessMetric(t *testing.T) { } func TestPerProcessMetrics_GetAllPIDs(t *testing.T) { + t.Parallel() metrics := &perProcessMetrics{ pidToSMUtil: map[uint32]uint32{ 1001: 50, @@ -128,14 +131,15 @@ func TestPerProcessMetrics_GetAllPIDs(t *testing.T) { } func TestPerProcessMetrics_GetValueForMetric(t *testing.T) { + t.Parallel() metrics := &perProcessMetrics{ pidToSMUtil: map[uint32]uint32{ 1001: 50, - 1002: 100, + 1002: 0, }, pidToMemory: map[uint32]uint64{ 1001: 1024 * 1024 * 1024, - 1002: 512 * 1024 * 1024, + 1002: 0, }, } @@ -147,31 +151,31 @@ func TestPerProcessMetrics_GetValueForMetric(t *testing.T) { hasValue bool }{ { - name: "GPU util - 50%", + name: "GPU util", fieldName: metricGPUUtil, pid: 1001, expected: "50", hasValue: true, }, { - name: "GPU util - 100%", + name: "GPU util zero", fieldName: metricGPUUtil, pid: 1002, - expected: "100", + expected: "0", hasValue: true, }, { - name: "FB used - 1GB", + name: "FB used", fieldName: metricFBUsed, pid: 1001, expected: "1024", hasValue: true, }, { - name: "FB used - 512MB", + name: "FB used zero", fieldName: metricFBUsed, pid: 1002, - expected: "512", + expected: "0", hasValue: true, }, { @@ -202,6 +206,7 @@ func TestPerProcessMetrics_GetValueForMetric(t *testing.T) { } func TestPerProcessMetrics_EmptyMaps(t *testing.T) { + t.Parallel() metrics := &perProcessMetrics{ pidToSMUtil: make(map[uint32]uint32), pidToMemory: make(map[uint32]uint64), @@ -216,6 +221,7 @@ func TestPerProcessMetrics_EmptyMaps(t *testing.T) { } func TestPerProcessCollector_Collect(t *testing.T) { + t.Parallel() gpu0UUID := "GPU-00000000-0000-0000-0000-000000000000" gpu1UUID := "GPU-11111111-1111-1111-1111-111111111111" podUID0 := "a9c80282-3f6b-4d5b-84d5-a137a6668011" @@ -385,7 +391,7 @@ func TestPerProcessCollector_Collect(t *testing.T) { gpu1UUID: "nvidia1", }, deviceToPods: map[string][]PodInfo{ - "nvidia0": {{Name: "pod0", Namespace: "ns0", UID: podUID0}}, + "nvidia0": {{Name: "test-pod", Namespace: "default", UID: podUID0}}, "nvidia1": {{Name: "pod1", Namespace: "ns1", UID: podUID1}}, }, pidToPod: map[uint32]*PodInfo{ @@ -415,6 +421,7 @@ func TestPerProcessCollector_Collect(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + t.Parallel() ctrl := gomock.NewController(t) defer ctrl.Finish() From 4dd89c6e410760d25fc73a7f177af29e4f6cef01 Mon Sep 17 00:00:00 2001 From: Krystian Bednarczuk Date: Wed, 18 Feb 2026 14:20:12 +0100 Subject: [PATCH 3/6] Fix issue #611 - Preserve device-level metrics for unused GPUs when KubernetesVirtualGPUs is enabled Signed-off-by: Krystian Bednarczuk --- internal/pkg/transformation/kubernetes.go | 5 + .../pkg/transformation/kubernetes_test.go | 246 ++++++++++++++++++ 2 files changed, 251 insertions(+) diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 543c7e67..252a68f1 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -376,6 +376,11 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic } newmetrics = append(newmetrics, metric) } + // Preserve the original device-level metric for GPUs not currently + // used by any pod, so they still appear in /metrics with value 0. + if len(podInfos) == 0 { + newmetrics = append(newmetrics, metrics[counter][j]) + } } // Upsert the annotated series into the final map only if we found any // pods using the devices for the metric. Otherwise, leave the original diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index c645f4c3..b36e4f4e 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -1309,3 +1309,249 @@ func TestStripVGPUSuffix(t *testing.T) { }) } } + +func TestKubernetesVirtualGPUs_UnusedGPUsPreserveMetrics(t *testing.T) { + testutils.RequireLinux(t) + + testCases := []struct { + name string + counter counters.Counter + wantPodMetrics int + wantDeviceMetrics int + }{ + { + name: "non-per-process metric", + counter: counters.Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + wantPodMetrics: 1, + wantDeviceMetrics: 2, + }, + { + name: "per-process metric", + counter: counters.Counter{ + FieldID: 203, + FieldName: metricGPUUtil, + PromType: "gauge", + }, + wantPodMetrics: 1, + wantDeviceMetrics: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + allGPUUUIDs := []string{"gpu-uuid-0", "gpu-uuid-1", "gpu-uuid-2"} + inUseGPUUUIDs := []string{"gpu-uuid-0"} + + tmpDir, cleanup := testutils.CreateTmpDir(t) + defer cleanup() + socketPath := tmpDir + "/kubelet.sock" + server := grpc.NewServer() + defer server.Stop() + + config := &appconfig.Config{ + UseRemoteHE: false, + Kubernetes: true, + EnableDCGMLog: true, + DCGMLogLevel: "DEBUG", + } + dcgmprovider.SmartDCGMInit(t, config) + defer dcgmprovider.Client().Cleanup() + + podresourcesapi.RegisterPodResourcesListerServer(server, + testutils.NewMockPodResourcesServer(appconfig.NvidiaResourceName, inUseGPUUUIDs)) + cleanupServer := testutils.StartMockServer(t, server, socketPath) + defer cleanupServer() + + ctrl := gomock.NewController(t) + mockNVMLProvider := mocknvmlprovider.NewMockNVML(ctrl) + mockNVMLProvider.EXPECT().GetMIGDeviceInfoByID(gomock.Any()).Return(&nvmlprovider.MIGDeviceInfo{}, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetDeviceProcessMemory(gomock.Any()).Return(map[uint32]uint64{}, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetDeviceProcessUtilization(gomock.Any()).Return(map[uint32]uint32{}, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetAllMIGDevicesProcessMemory(gomock.Any()).Return(map[uint]map[uint32]uint64{}, nil).AnyTimes() + nvmlprovider.SetClient(mockNVMLProvider) + + podMapper := NewPodMapper(&appconfig.Config{ + KubernetesGPUIdType: appconfig.GPUUID, + PodResourcesKubeletSocket: socketPath, + KubernetesVirtualGPUs: true, + }) + require.NotNil(t, podMapper) + + metrics := collector.MetricsByCounter{} + for i, gpuUUID := range allGPUUUIDs { + metrics[tc.counter] = append(metrics[tc.counter], collector.Metric{ + GPU: fmt.Sprint(i), + GPUUUID: gpuUUID, + Value: fmt.Sprint(42 + i), + Counter: tc.counter, + Attributes: map[string]string{}, + }) + } + + mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockSystemInfo.EXPECT().GPUCount().Return(uint(len(allGPUUUIDs))).AnyTimes() + for i, uuid := range allGPUUUIDs { + mockSystemInfo.EXPECT().GPU(uint(i)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{ + UUID: uuid, + GPU: uint(i), + }, + }).AnyTimes() + } + + err := podMapper.Process(metrics, mockSystemInfo) + require.NoError(t, err) + + var deviceMetrics, podMetrics []collector.Metric + for _, m := range metrics[tc.counter] { + if _, hasPod := m.Attributes[podAttribute]; hasPod { + podMetrics = append(podMetrics, m) + } else { + deviceMetrics = append(deviceMetrics, m) + } + } + + require.Len(t, podMetrics, tc.wantPodMetrics) + require.Equal(t, "gpu-pod-0", podMetrics[0].Attributes[podAttribute]) + require.Equal(t, "default", podMetrics[0].Attributes[namespaceAttribute]) + + require.Len(t, deviceMetrics, tc.wantDeviceMetrics) + for _, m := range deviceMetrics { + require.NotContains(t, m.Attributes, podAttribute) + } + }) + } +} + +func TestKubernetesVirtualGPUs_UnusedMIGInstancesPreserveMetrics(t *testing.T) { + testutils.RequireLinux(t) + + testCases := []struct { + name string + counter counters.Counter + wantPodMetrics int + wantDeviceMetrics int + }{ + { + name: "non-per-process metric", + counter: counters.Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + wantPodMetrics: 1, + wantDeviceMetrics: 2, + }, + { + name: "per-process metric", + counter: counters.Counter{ + FieldID: 203, + FieldName: metricGPUUtil, + PromType: "gauge", + }, + // in-use instance: 1 device-level + 1 per-process (has pod attr) + // unused instances: 2 device-level + wantPodMetrics: 1, + wantDeviceMetrics: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gpuUUID := "GPU-test-mig-uuid" + allInstances := []uint{7, 12, 13} + podDeviceID := "nvidia0/gi7/vgpu0" + + tmpDir, cleanup := testutils.CreateTmpDir(t) + defer cleanup() + socketPath := tmpDir + "/kubelet.sock" + server := grpc.NewServer() + defer server.Stop() + + config := &appconfig.Config{ + UseRemoteHE: false, + Kubernetes: true, + EnableDCGMLog: true, + DCGMLogLevel: "DEBUG", + } + dcgmprovider.SmartDCGMInit(t, config) + defer dcgmprovider.Client().Cleanup() + + podresourcesapi.RegisterPodResourcesListerServer(server, + testutils.NewMockPodResourcesServer("nvidia.com/mig-1g.5gb", []string{podDeviceID})) + cleanupServer := testutils.StartMockServer(t, server, socketPath) + defer cleanupServer() + + ctrl := gomock.NewController(t) + mockNVMLProvider := mocknvmlprovider.NewMockNVML(ctrl) + mockNVMLProvider.EXPECT().GetMIGDeviceInfoByID(gomock.Any()).Return(&nvmlprovider.MIGDeviceInfo{ + ParentUUID: gpuUUID, + GPUInstanceID: 3, + ComputeInstanceID: 0, + }, nil).AnyTimes() + mockNVMLProvider.EXPECT().GetAllMIGDevicesProcessMemory(gomock.Any()).Return(map[uint]map[uint32]uint64{}, nil).AnyTimes() + nvmlprovider.SetClient(mockNVMLProvider) + + podMapper := NewPodMapper(&appconfig.Config{ + KubernetesGPUIdType: appconfig.DeviceName, + PodResourcesKubeletSocket: socketPath, + KubernetesVirtualGPUs: true, + }) + require.NotNil(t, podMapper) + + metrics := collector.MetricsByCounter{} + for _, instID := range allInstances { + metrics[tc.counter] = append(metrics[tc.counter], collector.Metric{ + GPU: "0", + GPUUUID: gpuUUID, + GPUDevice: "0", + GPUInstanceID: fmt.Sprint(instID), + Value: "100", + MigProfile: "1g.5gb", + Counter: tc.counter, + Attributes: map[string]string{}, + }) + } + + var gpuInstances []deviceinfo.GPUInstanceInfo + for _, instID := range allInstances { + gpuInstances = append(gpuInstances, deviceinfo.GPUInstanceInfo{ + Info: dcgm.MigEntityInfo{NvmlInstanceId: instID}, + ProfileName: "1g.5gb", + }) + } + + mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockSystemInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockSystemInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpuUUID, GPU: 0}, + MigEnabled: true, + GPUInstances: gpuInstances, + }).AnyTimes() + + err := podMapper.Process(metrics, mockSystemInfo) + require.NoError(t, err) + + var deviceMetrics, podMetrics []collector.Metric + for _, m := range metrics[tc.counter] { + if _, hasPod := m.Attributes[podAttribute]; hasPod { + podMetrics = append(podMetrics, m) + } else { + deviceMetrics = append(deviceMetrics, m) + } + } + + require.Len(t, podMetrics, tc.wantPodMetrics) + require.Equal(t, "gpu-pod-0", podMetrics[0].Attributes[podAttribute]) + + require.Len(t, deviceMetrics, tc.wantDeviceMetrics) + for _, m := range deviceMetrics { + require.NotContains(t, m.Attributes, podAttribute) + } + }) + } +} From 4bcbf88743c40ec98a911cb974a47d94468fb527 Mon Sep 17 00:00:00 2001 From: Krystian Bednarczuk Date: Wed, 18 Feb 2026 14:26:17 +0100 Subject: [PATCH 4/6] Reinitialize NVML on GPU topology change for MIG time-sharing per-process metrics Signed-off-by: Krystian Bednarczuk --- pkg/cmd/app.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index cf88a949..680a156a 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -821,6 +821,19 @@ func handleGPUTopologyChange(ctx context.Context, server *server.MetricsServer, slog.Uint64("reload_id", reloadID)) dcgmprovider.Initialize(config) + // Step 3b: Reinitialize NVML + if config.Kubernetes && config.KubernetesVirtualGPUs { + slog.InfoContext(ctx, "Cleaning up NVML resources", slog.Uint64("reload_id", reloadID)) + nvmlprovider.Client().Cleanup() + + slog.InfoContext(ctx, "Reinitializing NVML", slog.Uint64("reload_id", reloadID)) + if err := nvmlprovider.Initialize(); err != nil { + slog.ErrorContext(ctx, "Failed to reinitialize NVML", + slog.Uint64("reload_id", reloadID), + slog.String("error", err.Error())) + } + } + // Step 4: Query DCP metrics (safe now - GPU is stable after topology change) queryDCPMetrics(config, reloadID) From 531b8f23f3f2238ca2e19b735485c252e4ccef6e Mon Sep 17 00:00:00 2001 From: Krystian Bednarczuk Date: Mon, 23 Mar 2026 13:32:38 +0100 Subject: [PATCH 5/6] Fix multi-PID metric accumulation and add cgroup parse failure logging Signed-off-by: Krystian Bednarczuk --- internal/pkg/transformation/kubernetes.go | 6 +++- .../pkg/transformation/kubernetes_test.go | 30 +++++++++++++++++++ internal/pkg/transformation/pidmapper.go | 9 +++++- .../pkg/transformation/process_metrics.go | 9 +++--- .../transformation/process_metrics_test.go | 16 +++++----- 5 files changed, 55 insertions(+), 15 deletions(-) diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 252a68f1..13a243df 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -236,11 +236,15 @@ func buildPodValueMap(pidToPod map[uint32]*PodInfo, data *perProcessMetrics, fie if data == nil { return podValues } + podAccum := make(map[string]uint64) for pid, podInfo := range pidToPod { if value, ok := data.getValueForMetric(fieldName, pid); ok { - podValues[podInfo.UID] = value + podAccum[podInfo.UID] += value } } + for uid, total := range podAccum { + podValues[uid] = fmt.Sprintf("%d", total) + } return podValues } diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index b36e4f4e..30a6c650 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -1000,6 +1000,36 @@ func TestBuildPodValueMap(t *testing.T) { fieldName: metricGPUUtil, expected: map[string]string{"uid1": "50"}, }, + { + name: "multiple PIDs same pod - accumulates GPU util", + pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 1002: {UID: "uid1"}}, + data: &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{1001: 30, 1002: 45}, + }, + fieldName: metricGPUUtil, + expected: map[string]string{"uid1": "75"}, + }, + { + name: "multiple PIDs same pod - accumulates FB used", + pidToPod: map[uint32]*PodInfo{1001: {UID: "uid1"}, 1002: {UID: "uid1"}}, + data: &perProcessMetrics{ + pidToMemory: map[uint32]uint64{1001: 500 * 1024 * 1024, 1002: 300 * 1024 * 1024}, + }, + fieldName: metricFBUsed, + expected: map[string]string{"uid1": "800"}, + }, + { + name: "mixed pods - some with multiple PIDs, some with single PID", + pidToPod: map[uint32]*PodInfo{ + 1001: {UID: "uid1"}, 1002: {UID: "uid1"}, + 2001: {UID: "uid2"}, + }, + data: &perProcessMetrics{ + pidToSMUtil: map[uint32]uint32{1001: 20, 1002: 30, 2001: 50}, + }, + fieldName: metricGPUUtil, + expected: map[string]string{"uid1": "50", "uid2": "50"}, + }, } for _, tc := range tests { diff --git a/internal/pkg/transformation/pidmapper.go b/internal/pkg/transformation/pidmapper.go index a265cae0..b6fbe589 100644 --- a/internal/pkg/transformation/pidmapper.go +++ b/internal/pkg/transformation/pidmapper.go @@ -18,8 +18,10 @@ package transformation import ( "fmt" + "log/slog" "regexp" "strings" + "sync" "github.com/containerd/cgroups/v3" ) @@ -27,7 +29,8 @@ import ( var podUIDRegex = regexp.MustCompile(`pod([a-f0-9_-]+)`) type pidToPodMapper struct { - pidToUID map[uint32]string + pidToUID map[uint32]string + cgroupWarnOnce sync.Once } func newPIDToPodMapper() *pidToPodMapper { @@ -87,6 +90,10 @@ func (m *pidToPodMapper) buildPIDToPodMap(pids []uint32, pods []PodInfo) map[uin for _, pid := range pids { uid, err := m.getPodUIDForPID(pid) if err != nil { + slog.Debug("Failed to map PID to pod", "pid", pid, "error", err) + m.cgroupWarnOnce.Do(func() { + slog.Warn("Failed to map PID to pod, per-process metrics may be incomplete", "pid", pid, "error", err) + }) continue } if uid == "" { diff --git a/internal/pkg/transformation/process_metrics.go b/internal/pkg/transformation/process_metrics.go index ca554bdf..c114ce97 100644 --- a/internal/pkg/transformation/process_metrics.go +++ b/internal/pkg/transformation/process_metrics.go @@ -85,19 +85,18 @@ func (m *perProcessMetrics) getAllPIDs() []uint32 { return slices.Collect(maps.Keys(pidSet)) } -func (m *perProcessMetrics) getValueForMetric(fieldName string, pid uint32) (string, bool) { +func (m *perProcessMetrics) getValueForMetric(fieldName string, pid uint32) (uint64, bool) { switch fieldName { case metricGPUUtil: if util, ok := m.pidToSMUtil[pid]; ok { - return fmt.Sprintf("%d", util), true + return uint64(util), true } case metricFBUsed: if mem, ok := m.pidToMemory[pid]; ok { - memMiB := mem / (1024 * 1024) - return fmt.Sprintf("%d", memMiB), true + return mem / (1024 * 1024), true } } - return "", false + return 0, false } type perProcessDataMap struct { diff --git a/internal/pkg/transformation/process_metrics_test.go b/internal/pkg/transformation/process_metrics_test.go index 3fa5dc88..199069cf 100644 --- a/internal/pkg/transformation/process_metrics_test.go +++ b/internal/pkg/transformation/process_metrics_test.go @@ -147,49 +147,49 @@ func TestPerProcessMetrics_GetValueForMetric(t *testing.T) { name string fieldName string pid uint32 - expected string + expected uint64 hasValue bool }{ { name: "GPU util", fieldName: metricGPUUtil, pid: 1001, - expected: "50", + expected: 50, hasValue: true, }, { name: "GPU util zero", fieldName: metricGPUUtil, pid: 1002, - expected: "0", + expected: 0, hasValue: true, }, { name: "FB used", fieldName: metricFBUsed, pid: 1001, - expected: "1024", + expected: 1024, hasValue: true, }, { name: "FB used zero", fieldName: metricFBUsed, pid: 1002, - expected: "0", + expected: 0, hasValue: true, }, { name: "unknown metric", fieldName: "DCGM_FI_DEV_POWER_USAGE", pid: 1001, - expected: "", + expected: 0, hasValue: false, }, { name: "unknown PID", fieldName: metricGPUUtil, pid: 9999, - expected: "", + expected: 0, hasValue: false, }, } @@ -217,7 +217,7 @@ func TestPerProcessMetrics_EmptyMaps(t *testing.T) { value, hasValue := metrics.getValueForMetric(metricGPUUtil, 1001) assert.False(t, hasValue) - assert.Equal(t, "", value) + assert.Equal(t, uint64(0), value) } func TestPerProcessCollector_Collect(t *testing.T) { From f7767f88b6f495f66cbb20cb898e572259a3e68d Mon Sep 17 00:00:00 2001 From: Krystian Bednarczuk Date: Tue, 24 Mar 2026 15:36:52 +0100 Subject: [PATCH 6/6] test: add error-path cases to TestPerProcessCollector_Collect Signed-off-by: Krystian Bednarczuk --- .../transformation/process_metrics_test.go | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/internal/pkg/transformation/process_metrics_test.go b/internal/pkg/transformation/process_metrics_test.go index 199069cf..a4927d4d 100644 --- a/internal/pkg/transformation/process_metrics_test.go +++ b/internal/pkg/transformation/process_metrics_test.go @@ -17,6 +17,7 @@ package transformation import ( + "fmt" "testing" "github.com/NVIDIA/go-dcgm/pkg/dcgm" @@ -417,6 +418,84 @@ func TestPerProcessCollector_Collect(t *testing.T) { assert.Contains(t, result.deviceToPods, gpu1UUID) }, }, + { + name: "GetDeviceProcessMemory error - still collects utilization", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + mockNVML.EXPECT().GetDeviceProcessMemory(gpu0UUID).Return(nil, fmt.Errorf("nvml error")) + mockNVML.EXPECT().GetDeviceProcessUtilization(gpu0UUID).Return(map[uint32]uint32{1001: 50}, nil) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{gpu0UUID: "nvidia0"}, + deviceToPods: map[string][]PodInfo{ + "nvidia0": {{Name: "test-pod", Namespace: "default", UID: podUID0, Container: "app"}}, + }, + pidToPod: map[uint32]*PodInfo{1001: pod0}, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Contains(t, result.metrics, gpu0UUID) + assert.Nil(t, result.metrics[gpu0UUID].pidToMemory) + assert.Equal(t, uint32(50), result.metrics[gpu0UUID].pidToSMUtil[1001]) + }, + }, + { + name: "GetDeviceProcessUtilization error - still collects memory", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + mockNVML.EXPECT().GetDeviceProcessMemory(gpu0UUID).Return(map[uint32]uint64{1001: 1024}, nil) + mockNVML.EXPECT().GetDeviceProcessUtilization(gpu0UUID).Return(nil, fmt.Errorf("nvml error")) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{gpu0UUID: "nvidia0"}, + deviceToPods: map[string][]PodInfo{ + "nvidia0": {{Name: "test-pod", Namespace: "default", UID: podUID0, Container: "app"}}, + }, + pidToPod: map[uint32]*PodInfo{1001: pod0}, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Contains(t, result.metrics, gpu0UUID) + assert.Equal(t, uint64(1024), result.metrics[gpu0UUID].pidToMemory[1001]) + assert.Nil(t, result.metrics[gpu0UUID].pidToSMUtil) + }, + }, + { + name: "GetAllMIGDevicesProcessMemory error - returns empty data for that GPU", + setupMocks: func(ctrl *gomock.Controller) (nvmlprovider.NVML, deviceinfo.Provider) { + mockNVML := mocknvmlprovider.NewMockNVML(ctrl) + mockNVML.EXPECT().GetAllMIGDevicesProcessMemory(gpu0UUID).Return(nil, fmt.Errorf("nvml error")) + + mockDevInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDevInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDevInfo.EXPECT().GPU(uint(0)).Return(deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{UUID: gpu0UUID, GPU: 0}, + MigEnabled: true, + GPUInstances: []deviceinfo.GPUInstanceInfo{ + {Info: dcgm.MigEntityInfo{NvmlInstanceId: 1}, EntityId: 100}, + }, + }).AnyTimes() + return mockNVML, mockDevInfo + }, + gpuDeviceMap: map[string]string{gpu0UUID: "nvidia0"}, + deviceToPods: map[string][]PodInfo{ + "0-1": {{Name: "mig-pod", Namespace: "default", UID: podUID0, Container: "app"}}, + }, + pidToPod: map[uint32]*PodInfo{}, + validate: func(t *testing.T, result *perProcessDataMap) { + assert.Empty(t, result.metrics) + assert.Empty(t, result.pidToPod) + assert.Empty(t, result.deviceToPods) + }, + }, } for _, tc := range tests {