diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go
index e93d94c8..92d8e797 100644
--- a/internal/pkg/transformation/kubernetes.go
+++ b/internal/pkg/transformation/kubernetes.go
@@ -269,6 +269,11 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic
 				}
 
 				podInfos := deviceToPods[deviceID]
+				if len(podInfos) == 0 {
+					// No pod uses this device: keep its metric as-is so the
+					// device does not disappear from the output.
+					newmetrics = append(newmetrics, metrics[counter][j])
+				}
 				for _, pi := range podInfos {
 					metric, err := utils.DeepCopy(metrics[counter][j])
 					if err != nil {
diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go
index 35a67dfa..badea645 100644
--- a/internal/pkg/transformation/kubernetes_test.go
+++ b/internal/pkg/transformation/kubernetes_test.go
@@ -1044,3 +1044,78 @@ func TestPodMapper_createPodInfo_WithInformer(t *testing.T) {
 	assert.Equal(t, "gpu-app", podInfo.Labels["app"], "Should retrieve labels from Informer")
 	assert.Equal(t, "production", podInfo.Labels["env"])
 }
+
+// TestProcessPodMapper_VirtualGPUs_PartialAllocation checks that, with virtual
+// GPUs enabled, Process keeps the metric of a GPU that no pod is using while
+// still adding pod attributes to the metric of an allocated GPU.
+func TestProcessPodMapper_VirtualGPUs_PartialAllocation(t *testing.T) {
+	testutils.RequireLinux(t)
+
+	// Set up mock gRPC server
+	tmpDir, cleanup := testutils.CreateTmpDir(t)
+	defer cleanup()
+	socketPath := tmpDir + "/kubelet.sock"
+
+	server := grpc.NewServer()
+	// Two GPUs on the node
+	gpus := []string{"GPU-0", "GPU-1"}
+	// Only GPU-0 is allocated to a pod
+	allocatedGPUs := []string{"GPU-0"}
+	podresourcesapi.RegisterPodResourcesListerServer(server,
+		testutils.NewMockPodResourcesServer(appconfig.NvidiaResourceName, allocatedGPUs))
+	cleanupServer := testutils.StartMockServer(t, server, socketPath)
+	defer cleanupServer()
+
+	// Create PodMapper with Virtual GPUs enabled
+	podMapper := NewPodMapper(&appconfig.Config{
+		KubernetesVirtualGPUs:     true,
+		KubernetesGPUIdType:       appconfig.GPUUID,
+		PodResourcesKubeletSocket: socketPath,
+	})
+
+	// Set up metrics for both GPUs
+	metrics := collector.MetricsByCounter{}
+	counter := counters.Counter{
+		FieldID:   155,
+		FieldName: "DCGM_FI_DEV_POWER_USAGE",
+		PromType:  "gauge",
+	}
+	for i, gpuUUID := range gpus {
+		metrics[counter] = append(metrics[counter], collector.Metric{
+			GPU:        fmt.Sprint(i),
+			GPUUUID:    gpuUUID,
+			Attributes: map[string]string{},
+			Counter:    counter,
+		})
+	}
+
+	// Set up mock device info
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl)
+	mockSystemInfo.EXPECT().GPUCount().Return(uint(len(gpus))).AnyTimes()
+
+	// Process metrics
+	err := podMapper.Process(metrics, mockSystemInfo)
+	require.NoError(t, err)
+
+	// Verify that both GPUs are still present
+	assert.Len(t, metrics[counter], 2, "Expected 2 metrics, one for each GPU")
+
+	// Verify that GPU-0 has pod attributes and GPU-1 does not
+	foundGPU0 := false
+	foundGPU1 := false
+	for _, metric := range metrics[counter] {
+		switch metric.GPUUUID {
+		case "GPU-0":
+			foundGPU0 = true
+			assert.Contains(t, metric.Attributes, podAttribute, "GPU-0 should have pod attribute")
+		case "GPU-1":
+			foundGPU1 = true
+			assert.NotContains(t, metric.Attributes, podAttribute, "GPU-1 should NOT have pod attribute")
+		}
+	}
+	assert.True(t, foundGPU0, "Should have found GPU-0")
+	assert.True(t, foundGPU1, "Should have found GPU-1")
+}