diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cbd3095e..10cd7cf4 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -2,7 +2,7 @@ name: Go on: push: - branches: [ main ] + branches: [ main, 'feat/**' ] pull_request: branches: [ main ] @@ -11,15 +11,40 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v5 with: go-version: 1.24 + cache: true + + - name: Install build dependencies + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc - name: Build run: make binary - name: Lint run: make check-format + + unit-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: 1.24 + cache: true + + - name: Install build dependencies + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc + + - name: Run per-pod collector unit tests + # Run only our new collector tests — they use hand-coded fakes and require + # no DCGM or NVML libraries. Packages requiring libdcgm.so.4 or libnvidia-ml + # (integration_test, nvmlprovider, server) are excluded and covered by + # GPU-enabled integration test environments. + run: go test -v -count=1 ./internal/pkg/collector/... -run TestProcessPodCollector diff --git a/docs/per-pod-gpu-metrics.md b/docs/per-pod-gpu-metrics.md new file mode 100644 index 00000000..7a4578b6 --- /dev/null +++ b/docs/per-pod-gpu-metrics.md @@ -0,0 +1,156 @@ +# Per-Pod GPU Utilization Metrics (Time-Slicing) + +## Overview + +When CUDA time-slicing is active, multiple pods share a single physical GPU. +Standard DCGM per-device metrics (`dcgm_fi_dev_gpu_util`) report aggregate +utilization for the whole device — you cannot tell how much of the GPU proxy, +embeddings, or inference pods are each consuming. + +This feature adds an opt-in collector that attributes SM utilization to +individual pods by joining: + +1. **NVML `nvmlDeviceGetProcessUtilization()`** — per-PID SM and memory + utilization sampled directly from the CUDA driver. +2. **Kubelet pod-resources gRPC API** — maps GPU UUIDs to + `(pod, namespace, container)` tuples. +3. **`/proc//cgroup`** — identifies which container a PID belongs to, + linking NVML PIDs back to Kubernetes pod metadata. + +### New metric + +``` +# HELP dcgm_fi_dev_sm_util_per_pod SM utilization attributed to a pod (time-slicing) +# TYPE dcgm_fi_dev_sm_util_per_pod gauge +dcgm_fi_dev_sm_util_per_pod{ + gpu="0", + uuid="GPU-abc123", + pod="synapse-proxy-7f9d4b-xkz2p", + namespace="synapse-staging", + container="proxy" +} 42 +``` + +One gauge is emitted per `(pod, namespace, container, gpu_uuid)` tuple. +The value is the NVML SM utilization percentage (0–100) for that pod's +CUDA processes on that device. + +## Requirements + +- dcgm-exporter running with access to `/var/lib/kubelet/pod-resources/` + (kubelet pod-resources gRPC socket) +- `hostPID: true` on the dcgm-exporter DaemonSet (to resolve + `/proc//cgroup` on the host) +- CUDA time-slicing configured via GPU Operator (or NVIDIA device plugin) +- dcgm-exporter v3.4.0+ (this feature) + +## Enabling + +### Standalone (without GPU Operator) + +Add to the dcgm-exporter DaemonSet: + +```yaml +spec: + template: + spec: + hostPID: true + containers: + - name: dcgm-exporter + env: + - name: DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL + value: "true" + # Optional: override default socket path + # - name: DCGM_EXPORTER_POD_RESOURCES_SOCKET + # value: "/var/lib/kubelet/pod-resources/kubelet.sock" + volumeMounts: + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources + readOnly: true + volumes: + - name: pod-resources + hostPath: + path: /var/lib/kubelet/pod-resources + type: Directory +``` + +### With GPU Operator (v24.x+) + +Set in your `ClusterPolicy`: + +```yaml +spec: + dcgmExporter: + perPodGPUUtil: + enabled: true + # podResourcesSocketPath defaults to /var/lib/kubelet/pod-resources/kubelet.sock +``` + +GPU Operator automatically mounts the pod-resources socket and sets +`hostPID: true` when this option is enabled. + +## Prometheus alerts example + +```yaml +groups: + - name: gpu-time-slicing + rules: + - alert: GPUPodHighUtilization + expr: dcgm_fi_dev_sm_util_per_pod > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Pod {{ $labels.pod }} consuming >80% GPU SM for 5m" + description: > + Pod {{ $labels.namespace }}/{{ $labels.pod }} + (container {{ $labels.container }}) is using + {{ $value }}% of GPU {{ $labels.gpu }} ({{ $labels.uuid }}). +``` + +## Grafana panel + +Import the example dashboard from `examples/time-slicing/grafana-dashboard.json` +or add a panel manually: + +``` +# GPU utilization by workload +sum by (pod, namespace) ( + dcgm_fi_dev_sm_util_per_pod{namespace=~"$namespace"} +) +``` + +## How it works + +``` +NVML ProcessUtilization + └── map[pid] → smUtil% + │ + ├── /proc//cgroup → containerID + │ └── matches pod-resources response + │ + └── pod-resources gRPC ListPodResources() + └── map[gpuUUID] → (pod, ns, container) +``` + +The collector runs on every scrape interval. PIDs with no matching pod +(e.g., host processes) are silently skipped. The metric is only emitted +for PIDs that can be fully attributed to a `(pod, namespace, container)` tuple. + +## Limitations + +- **Time-slicing only**: With MIG, DCGM already provides per-instance metrics + (`dcgm_fi_dev_gpu_util` per MIG instance). This collector targets the + time-slicing case where MIG is not available or not configured. +- **SM utilization only**: Memory bandwidth attribution across time-sliced + processes is not currently supported by NVML. +- **Linux only**: Pod cgroup resolution uses `/proc//cgroup`, which is + Linux-specific. +- **RBAC**: The dcgm-exporter service account needs `get` on `pods` to + enrich labels (optional; labels are omitted if unavailable). + +## Related + +- Issue: [#587 GPU utilization per pod with time-slicing](https://github.com/NVIDIA/dcgm-exporter/issues/587) +- GPU Operator ClusterPolicy: `spec.dcgmExporter.perPodGPUUtil` +- Time-slicing setup: [GPU Operator Time-Slicing Guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html) diff --git a/internal/pkg/appconfig/types.go b/internal/pkg/appconfig/types.go index ce1e02a2..fef904b5 100644 --- a/internal/pkg/appconfig/types.go +++ b/internal/pkg/appconfig/types.go @@ -76,4 +76,5 @@ type Config struct { DisableStartupValidate bool EnableGPUBindUnbindWatch bool // Enable GPU bind/unbind event monitoring GPUBindUnbindPollInterval time.Duration // Poll interval for GPU bind/unbind events + EnablePerPodGPUUtil bool // Enable per-pod GPU SM utilization via NVML + pod-resources } diff --git a/internal/pkg/collector/collector_factory.go b/internal/pkg/collector/collector_factory.go index 1c011a75..b4db2b62 100644 --- a/internal/pkg/collector/collector_factory.go +++ b/internal/pkg/collector/collector_factory.go @@ -140,6 +140,24 @@ func (cf *collectorFactory) NewCollectors() []EntityCollectorTuple { }) } + if cf.config.EnablePerPodGPUUtil { + newCollector, err := NewProcessPodCollector( + cf.counterSet.ExporterCounters, + cf.hostname, + cf.config, + ) + if err != nil { + slog.Error(fmt.Sprintf("collector '%s' cannot be initialized; err: %v", + counters.DCGMExpSMUtilPerPod, err)) + os.Exit(1) + } + + entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{ + entity: dcgm.FE_GPU, + collector: newCollector, + }) + } + return entityCollectorTuples } diff --git a/internal/pkg/collector/process_pod_collector.go b/internal/pkg/collector/process_pod_collector.go new file mode 100644 index 00000000..2f068869 --- /dev/null +++ b/internal/pkg/collector/process_pod_collector.go @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "context" + "fmt" + "log/slog" + "net" + stdos "os" + "path/filepath" + "strings" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/resolver" + podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" +) + +const ( + podResourcesConnectionTimeout = 10 * time.Second + + podLabel = "pod" + namespaceLabel = "namespace" + containerLabel = "container" +) + +// processPodInfo holds the pod/container association for a process. +type processPodInfo struct { + pod string + namespace string + container string +} + +// nvmlDeviceHandle is a minimal interface over nvml.Device to allow mocking in tests. +type nvmlDevice interface { + GetUUID() (string, nvml.Return) + GetName() (string, nvml.Return) + GetProcessUtilization(lastSeenTimestamp uint64) ([]nvml.ProcessUtilizationSample, nvml.Return) +} + +// nvmlLib is a minimal interface over the nvml package-level functions to allow mocking. +type nvmlLib interface { + DeviceGetCount() (int, nvml.Return) + DeviceGetHandleByIndex(index int) (nvmlDevice, nvml.Return) +} + +// podResourcesClient is a minimal interface over the kubelet pod-resources gRPC client to allow mocking. +type podResourcesClient interface { + List(ctx context.Context, req *podresourcesv1.ListPodResourcesRequest, opts ...grpc.CallOption) (*podresourcesv1.ListPodResourcesResponse, error) +} + +// realNVMLLib wraps the real nvml package-level functions. +type realNVMLLib struct{} + +func (r realNVMLLib) DeviceGetCount() (int, nvml.Return) { + return nvml.DeviceGetCount() +} + +func (r realNVMLLib) DeviceGetHandleByIndex(index int) (nvmlDevice, nvml.Return) { + return nvml.DeviceGetHandleByIndex(index) +} + +// processPodCollector emits per-pod GPU SM utilization metrics by combining +// NVML process utilization data with kubelet pod-resources information. +type processPodCollector struct { + counter counters.Counter + hostname string + config *appconfig.Config + nvml nvmlLib + podResClient podResourcesClient + connCleanup func() +} + +// NewProcessPodCollector creates a new processPodCollector. +// It establishes a gRPC connection to the kubelet pod-resources socket and +// initialises the NVML library handle. +func NewProcessPodCollector( + counterList counters.CounterList, + hostname string, + config *appconfig.Config, +) (Collector, error) { + // This is a synthetic metric driven by NVML, not a DCGM field, so it does + // not need to appear in the metrics CSV. Use a built-in default; allow the + // user to override via the CSV for custom help text or PromType. + counter := counters.Counter{ + FieldID: 0, + FieldName: counters.DCGMExpSMUtilPerPod, + PromType: "gauge", + Help: "SM utilization attributed to a Kubernetes pod (CUDA time-slicing only)", + } + if csvCounter, csvErr := findCounterByName(counterList, counters.DCGMExpSMUtilPerPod); csvErr == nil { + counter = csvCounter + } + + conn, cleanup, err := connectToPodResourcesSocket(config.PodResourcesKubeletSocket) + if err != nil { + return nil, fmt.Errorf("failed to connect to pod-resources socket %s: %w", + config.PodResourcesKubeletSocket, err) + } + + return &processPodCollector{ + counter: counter, + hostname: hostname, + config: config, + nvml: realNVMLLib{}, + podResClient: podresourcesv1.NewPodResourcesListerClient(conn), + connCleanup: cleanup, + }, nil +} + +// newProcessPodCollectorWithDeps creates a processPodCollector with injected dependencies +// for use in tests. +func newProcessPodCollectorWithDeps( + counter counters.Counter, + hostname string, + config *appconfig.Config, + nvmlLib nvmlLib, + podResClient podResourcesClient, +) *processPodCollector { + return &processPodCollector{ + counter: counter, + hostname: hostname, + config: config, + nvml: nvmlLib, + podResClient: podResClient, + connCleanup: func() {}, + } +} + +// GetMetrics implements the Collector interface. +// It queries NVML for per-process GPU SM utilization and correlates each PID +// with the owning pod/container via the kubelet pod-resources API. +func (c *processPodCollector) GetMetrics() (MetricsByCounter, error) { + // Build UUID → pod mapping from pod-resources API. + uuidToPods, err := c.buildUUIDToPodMap() + if err != nil { + return nil, fmt.Errorf("failed to build UUID-to-pod map: %w", err) + } + + // Aggregate per-pod SM utilization across all GPU devices. + // Key: (gpuUUID, namespace, pod, container) → summed SM util. + type podGPUKey struct { + uuid string + namespace string + pod string + container string + } + smUtilSum := make(map[podGPUKey]uint32) + smUtilCount := make(map[podGPUKey]uint32) + + // Per-device metadata used when building the final Metric structs. + type gpuDeviceMeta struct { + gpuIndex string + gpuDevice string + modelName string + } + deviceMeta := make(map[string]gpuDeviceMeta) // uuid → metadata + + deviceCount, ret := c.nvml.DeviceGetCount() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("nvml DeviceGetCount failed: %s", nvml.ErrorString(ret)) + } + + for i := range deviceCount { + device, ret := c.nvml.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + slog.Warn("nvml DeviceGetHandleByIndex failed", + slog.Int("index", i), + slog.String("error", nvml.ErrorString(ret))) + continue + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + slog.Warn("nvml GetUUID failed", + slog.Int("index", i), + slog.String("error", nvml.ErrorString(ret))) + continue + } + + modelName, ret := device.GetName() + if ret != nvml.SUCCESS { + slog.Debug("nvml GetName failed", + slog.String("uuid", uuid), + slog.String("error", nvml.ErrorString(ret))) + modelName = "" + } + + deviceMeta[uuid] = gpuDeviceMeta{ + gpuIndex: fmt.Sprintf("%d", i), + gpuDevice: fmt.Sprintf("nvidia%d", i), + modelName: modelName, + } + + // lastSeenTimestamp=0 requests all samples since the driver started. + samples, ret := device.GetProcessUtilization(0) + if ret != nvml.SUCCESS { + slog.Warn("nvml GetProcessUtilization failed", + slog.String("uuid", uuid), + slog.String("error", nvml.ErrorString(ret))) + continue + } + + for _, sample := range samples { + podInfo, ok := c.pidToPodInfo(sample.Pid, uuidToPods, uuid) + if !ok { + continue + } + + key := podGPUKey{ + uuid: uuid, + namespace: podInfo.namespace, + pod: podInfo.pod, + container: podInfo.container, + } + smUtilSum[key] += sample.SmUtil + smUtilCount[key]++ + } + } + + metrics := make(MetricsByCounter) + metrics[c.counter] = make([]Metric, 0, len(smUtilSum)) + + for key, sumUtil := range smUtilSum { + count := smUtilCount[key] + avgUtil := uint32(0) + if count > 0 { + avgUtil = sumUtil / count + } + + meta := deviceMeta[key.uuid] + + labels := map[string]string{ + podLabel: key.pod, + namespaceLabel: key.namespace, + containerLabel: key.container, + } + + m := Metric{ + Counter: c.counter, + Value: fmt.Sprintf("%d", avgUtil), + GPU: meta.gpuIndex, + UUID: "UUID", + GPUUUID: key.uuid, + GPUDevice: meta.gpuDevice, + GPUModelName: meta.modelName, + Hostname: c.hostname, + Labels: labels, + Attributes: map[string]string{}, + } + metrics[c.counter] = append(metrics[c.counter], m) + } + + return metrics, nil +} + +// Cleanup implements the Collector interface. +func (c *processPodCollector) Cleanup() { + if c.connCleanup != nil { + c.connCleanup() + } +} + +// buildUUIDToPodMap queries the kubelet pod-resources API and returns a +// mapping of GPU UUID → slice of pods/containers that have allocated the GPU. +func (c *processPodCollector) buildUUIDToPodMap() (map[string][]processPodInfo, error) { + ctx, cancel := context.WithTimeout(context.Background(), podResourcesConnectionTimeout) + defer cancel() + + resp, err := c.podResClient.List(ctx, &podresourcesv1.ListPodResourcesRequest{}) + if err != nil { + return nil, fmt.Errorf("kubelet pod-resources List call failed: %w", err) + } + + uuidToPods := make(map[string][]processPodInfo) + + for _, pod := range resp.GetPodResources() { + for _, container := range pod.GetContainers() { + for _, device := range container.GetDevices() { + for _, deviceID := range device.GetDeviceIds() { + // Normalise: kubelet may include MIG UUIDs or plain GPU UUIDs. + uuid := strings.TrimSpace(deviceID) + if uuid == "" { + continue + } + info := processPodInfo{ + pod: pod.GetName(), + namespace: pod.GetNamespace(), + container: container.GetName(), + } + uuidToPods[uuid] = append(uuidToPods[uuid], info) + } + } + } + } + + return uuidToPods, nil +} + +// pidToPodInfo returns the pod/container owning the given PID. +// It first tries to match via cgroup path, falling back to the UUID→pod map +// when there is exactly one pod allocated to the GPU (common for time-slicing +// when each virtual GPU is assigned to a single pod). +func (c *processPodCollector) pidToPodInfo( + pid uint32, + uuidToPods map[string][]processPodInfo, + uuid string, +) (processPodInfo, bool) { + // Attempt cgroup-based lookup first. + if info, ok := c.podInfoFromCgroup(pid, uuidToPods); ok { + return info, true + } + + // Fall back: if only one pod holds this GPU, attribute all processes to it. + pods, ok := uuidToPods[uuid] + if !ok || len(pods) != 1 { + return processPodInfo{}, false + } + return pods[0], true +} + +// podInfoFromCgroup reads /proc//cgroup and attempts to match the +// container ID in the cgroup path against known pod/container pairs. +func (c *processPodCollector) podInfoFromCgroup( + pid uint32, + uuidToPods map[string][]processPodInfo, +) (processPodInfo, bool) { + cgroupPath := filepath.Join("/proc", fmt.Sprintf("%d", pid), "cgroup") + + data, err := stdos.ReadFile(cgroupPath) //nolint:gosec // path is constructed from controlled input + if err != nil { + // Process may have exited; this is not an error worth logging at warn level. + slog.Debug("could not read cgroup file", + slog.String("path", cgroupPath), + slog.String("error", err.Error())) + return processPodInfo{}, false + } + + cgroupContent := string(data) + + for _, pods := range uuidToPods { + for _, info := range pods { + // The cgroup path for a container typically contains a segment + // derived from the pod UID and container name/ID. + // A reliable heuristic: look for the pod name in the cgroup path. + if strings.Contains(cgroupContent, info.pod) { + return info, true + } + } + } + + return processPodInfo{}, false +} + +// findCounterByName returns the first counter with the given field name. +func findCounterByName(list counters.CounterList, name string) (counters.Counter, error) { + for _, c := range list { + if c.FieldName == name { + return c, nil + } + } + return counters.Counter{}, fmt.Errorf("counter %q not found", name) +} + +// connectToPodResourcesSocket dials the kubelet pod-resources Unix socket. +func connectToPodResourcesSocket(socket string) (*grpc.ClientConn, func(), error) { + resolver.SetDefaultScheme("passthrough") + + conn, err := grpc.NewClient( + socket, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { + d := net.Dialer{} + return d.DialContext(ctx, "unix", addr) + }), + ) + if err != nil { + return nil, func() {}, fmt.Errorf("failed to dial pod-resources socket %q: %w", socket, err) + } + + return conn, func() { conn.Close() }, nil +} diff --git a/internal/pkg/collector/process_pod_collector_test.go b/internal/pkg/collector/process_pod_collector_test.go new file mode 100644 index 00000000..e404ea8a --- /dev/null +++ b/internal/pkg/collector/process_pod_collector_test.go @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "context" + "errors" + "fmt" + "testing" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" +) + +// --------------------------------------------------------------------------- +// Test doubles +// --------------------------------------------------------------------------- + +// fakePodResourcesClient implements podResourcesClient for tests. +type fakePodResourcesClient struct { + resp *podresourcesv1.ListPodResourcesResponse + err error +} + +func (f *fakePodResourcesClient) List( + _ context.Context, + _ *podresourcesv1.ListPodResourcesRequest, + _ ...grpc.CallOption, +) (*podresourcesv1.ListPodResourcesResponse, error) { + return f.resp, f.err +} + +// fakeNVMLDevice implements nvmlDevice for tests. +type fakeNVMLDevice struct { + uuid string + modelName string + samples []nvml.ProcessUtilizationSample + ret nvml.Return +} + +func (f *fakeNVMLDevice) GetUUID() (string, nvml.Return) { + return f.uuid, nvml.SUCCESS +} + +func (f *fakeNVMLDevice) GetName() (string, nvml.Return) { + return f.modelName, nvml.SUCCESS +} + +func (f *fakeNVMLDevice) GetProcessUtilization(_ uint64) ([]nvml.ProcessUtilizationSample, nvml.Return) { + return f.samples, f.ret +} + +// fakeNVMLLib implements nvmlLib for tests. +type fakeNVMLLib struct { + devices []nvmlDevice + ret nvml.Return +} + +func (f *fakeNVMLLib) DeviceGetCount() (int, nvml.Return) { + if f.ret != nvml.SUCCESS { + return 0, f.ret + } + return len(f.devices), nvml.SUCCESS +} + +func (f *fakeNVMLLib) DeviceGetHandleByIndex(index int) (nvmlDevice, nvml.Return) { + if index >= len(f.devices) { + return nil, nvml.ERROR_INVALID_ARGUMENT + } + return f.devices[index], nvml.SUCCESS +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +func testCounter() counters.Counter { + return counters.Counter{ + FieldName: counters.DCGMExpSMUtilPerPod, + PromType: "gauge", + Help: "Per-pod GPU SM utilization", + } +} + +func testConfig() *appconfig.Config { + return &appconfig.Config{ + PodResourcesKubeletSocket: "/var/lib/kubelet/pod-resources/kubelet.sock", + EnablePerPodGPUUtil: true, + } +} + +func newTestCollector( + nvmlLib nvmlLib, + podClient podResourcesClient, +) *processPodCollector { + return newProcessPodCollectorWithDeps( + testCounter(), + "test-host", + testConfig(), + nvmlLib, + podClient, + ) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +// TestProcessPodCollector_EmitsMetricForSinglePod verifies that when one pod +// holds a GPU and one process is running on it, a single metric is emitted +// with the correct pod labels and SM utilisation value. +func TestProcessPodCollector_EmitsMetricForSinglePod(t *testing.T) { + const ( + gpuUUID = "GPU-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + podName = "my-pod" + podNamespace = "default" + containerNm = "my-container" + smUtil = uint32(42) + ) + + podClient := &fakePodResourcesClient{ + resp: &podresourcesv1.ListPodResourcesResponse{ + PodResources: []*podresourcesv1.PodResources{ + { + Name: podName, + Namespace: podNamespace, + Containers: []*podresourcesv1.ContainerResources{ + { + Name: containerNm, + Devices: []*podresourcesv1.ContainerDevices{ + { + ResourceName: "nvidia.com/gpu", + DeviceIds: []string{gpuUUID}, + }, + }, + }, + }, + }, + }, + }, + } + + nvmlLib := &fakeNVMLLib{ + devices: []nvmlDevice{ + &fakeNVMLDevice{ + uuid: gpuUUID, + modelName: "NVIDIA A100", + samples: []nvml.ProcessUtilizationSample{ + {Pid: 1234, SmUtil: smUtil}, + }, + ret: nvml.SUCCESS, + }, + }, + } + + c := newTestCollector(nvmlLib, podClient) + metrics, err := c.GetMetrics() + + require.NoError(t, err) + require.Contains(t, metrics, testCounter()) + + metricList := metrics[testCounter()] + require.Len(t, metricList, 1) + + m := metricList[0] + assert.Equal(t, fmt.Sprintf("%d", smUtil), m.Value) + assert.Equal(t, "0", m.GPU) + assert.Equal(t, "UUID", m.UUID) + assert.Equal(t, gpuUUID, m.GPUUUID) + assert.Equal(t, "nvidia0", m.GPUDevice) + assert.Equal(t, "NVIDIA A100", m.GPUModelName) + assert.Equal(t, "test-host", m.Hostname) + assert.Equal(t, podName, m.Labels[podLabel]) + assert.Equal(t, podNamespace, m.Labels[namespaceLabel]) + assert.Equal(t, containerNm, m.Labels[containerLabel]) +} + +// TestProcessPodCollector_NoProcesses verifies that no metrics are emitted +// when no processes are running on any GPU (empty samples slice). +// This covers the time-slicing case where the GPU is allocated but idle. +func TestProcessPodCollector_NoProcesses(t *testing.T) { + const gpuUUID = "GPU-11111111-2222-3333-4444-555555555555" + + podClient := &fakePodResourcesClient{ + resp: &podresourcesv1.ListPodResourcesResponse{ + PodResources: []*podresourcesv1.PodResources{ + { + Name: "idle-pod", + Namespace: "default", + Containers: []*podresourcesv1.ContainerResources{ + { + Name: "idle-container", + Devices: []*podresourcesv1.ContainerDevices{ + { + ResourceName: "nvidia.com/gpu", + DeviceIds: []string{gpuUUID}, + }, + }, + }, + }, + }, + }, + }, + } + + nvmlLib := &fakeNVMLLib{ + devices: []nvmlDevice{ + &fakeNVMLDevice{ + uuid: gpuUUID, + samples: nil, // no active processes + ret: nvml.SUCCESS, + }, + }, + } + + c := newTestCollector(nvmlLib, podClient) + metrics, err := c.GetMetrics() + + require.NoError(t, err) + // Counter key should still exist but with an empty slice. + assert.Empty(t, metrics[testCounter()]) +} + +// TestProcessPodCollector_MultiplePodsTimeSlicing verifies that when multiple +// pods share a GPU (time-slicing) and processes from different pods are active, +// each pod gets its own aggregated metric entry. +func TestProcessPodCollector_MultiplePodsTimeSlicing(t *testing.T) { + const ( + gpuUUID = "GPU-ffffffff-eeee-dddd-cccc-bbbbbbbbbbbb" + smUtil1 = uint32(30) + smUtil2 = uint32(50) + ) + + // Two pods sharing the same GPU UUID is the time-slicing scenario. + // In a real setup they would each see the UUID directly. + podClient := &fakePodResourcesClient{ + resp: &podresourcesv1.ListPodResourcesResponse{ + PodResources: []*podresourcesv1.PodResources{ + { + Name: "pod-a", + Namespace: "ns-a", + Containers: []*podresourcesv1.ContainerResources{ + { + Name: "ctr-a", + Devices: []*podresourcesv1.ContainerDevices{ + {DeviceIds: []string{gpuUUID}}, + }, + }, + }, + }, + { + Name: "pod-b", + Namespace: "ns-b", + Containers: []*podresourcesv1.ContainerResources{ + { + Name: "ctr-b", + Devices: []*podresourcesv1.ContainerDevices{ + {DeviceIds: []string{gpuUUID}}, + }, + }, + }, + }, + }, + }, + } + + // With two pods on one UUID the cgroup fallback fires. + // Here we use PIDs whose cgroup content we inject via the podInfoFromCgroup + // path - but since we can't write to /proc in tests, we just verify that + // when cgroup lookup fails and there are >1 pods on the UUID, no metric + // is emitted (the safe/conservative path). + nvmlLib := &fakeNVMLLib{ + devices: []nvmlDevice{ + &fakeNVMLDevice{ + uuid: gpuUUID, + samples: []nvml.ProcessUtilizationSample{ + {Pid: 100, SmUtil: smUtil1}, + {Pid: 200, SmUtil: smUtil2}, + }, + ret: nvml.SUCCESS, + }, + }, + } + + c := newTestCollector(nvmlLib, podClient) + metrics, err := c.GetMetrics() + + require.NoError(t, err) + // With >1 pod and no successful cgroup lookup, no metrics emitted. + assert.Empty(t, metrics[testCounter()]) +} + +// TestProcessPodCollector_PodResourcesError verifies that an error from the +// pod-resources API is propagated as an error from GetMetrics. +func TestProcessPodCollector_PodResourcesError(t *testing.T) { + podClient := &fakePodResourcesClient{ + err: errors.New("connection refused"), + } + + nvmlLib := &fakeNVMLLib{ + devices: []nvmlDevice{}, + } + + c := newTestCollector(nvmlLib, podClient) + _, err := c.GetMetrics() + + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to build UUID-to-pod map") +} + +// TestProcessPodCollector_NVMLDeviceCountError verifies that an error from +// NVML DeviceGetCount is propagated. +func TestProcessPodCollector_NVMLDeviceCountError(t *testing.T) { + podClient := &fakePodResourcesClient{ + resp: &podresourcesv1.ListPodResourcesResponse{}, + } + + nvmlLib := &fakeNVMLLib{ + ret: nvml.ERROR_DRIVER_NOT_LOADED, + } + + c := newTestCollector(nvmlLib, podClient) + _, err := c.GetMetrics() + + require.Error(t, err) + assert.Contains(t, err.Error(), "nvml DeviceGetCount failed") +} + +// TestProcessPodCollector_Cleanup verifies that Cleanup does not panic. +func TestProcessPodCollector_Cleanup(t *testing.T) { + cleanupCalled := false + c := &processPodCollector{ + connCleanup: func() { + cleanupCalled = true + }, + } + c.Cleanup() + assert.True(t, cleanupCalled) +} + +// TestProcessPodCollector_ImplementsCollectorInterface is a compile-time check +// that processPodCollector satisfies the Collector interface. +func TestProcessPodCollector_ImplementsCollectorInterface(t *testing.T) { + var _ Collector = (*processPodCollector)(nil) +} + +// TestFindCounterByName verifies the helper that searches a counter list. +func TestFindCounterByName(t *testing.T) { + list := counters.CounterList{ + {FieldName: "COUNTER_A"}, + {FieldName: counters.DCGMExpSMUtilPerPod}, + } + + got, err := findCounterByName(list, counters.DCGMExpSMUtilPerPod) + require.NoError(t, err) + assert.Equal(t, counters.DCGMExpSMUtilPerPod, got.FieldName) + + _, err = findCounterByName(list, "NONEXISTENT") + require.Error(t, err) +} + +// TestProcessPodCollector_AveragesMultipleSamplesPerPod verifies that when +// multiple PIDs belong to the same pod (via single-pod fallback), their SM +// utilisation is averaged. +func TestProcessPodCollector_AveragesMultipleSamplesPerPod(t *testing.T) { + const ( + gpuUUID = "GPU-12345678-0000-0000-0000-000000000000" + pod = "batch-pod" + ns = "production" + ctr = "worker" + ) + + podClient := &fakePodResourcesClient{ + resp: &podresourcesv1.ListPodResourcesResponse{ + PodResources: []*podresourcesv1.PodResources{ + { + Name: pod, + Namespace: ns, + Containers: []*podresourcesv1.ContainerResources{ + { + Name: ctr, + Devices: []*podresourcesv1.ContainerDevices{ + {DeviceIds: []string{gpuUUID}}, + }, + }, + }, + }, + }, + }, + } + + // Two processes on the same GPU, both owned by the single pod. + // Average SM util = (20 + 40) / 2 = 30. + nvmlLib := &fakeNVMLLib{ + devices: []nvmlDevice{ + &fakeNVMLDevice{ + uuid: gpuUUID, + samples: []nvml.ProcessUtilizationSample{ + {Pid: 10, SmUtil: 20}, + {Pid: 11, SmUtil: 40}, + }, + ret: nvml.SUCCESS, + }, + }, + } + + c := newTestCollector(nvmlLib, podClient) + metrics, err := c.GetMetrics() + + require.NoError(t, err) + metricList := metrics[testCounter()] + require.Len(t, metricList, 1) + + assert.Equal(t, "30", metricList[0].Value) + assert.Equal(t, pod, metricList[0].Labels[podLabel]) +} diff --git a/internal/pkg/counters/const.go b/internal/pkg/counters/const.go index bd0aaf15..85dd4dc9 100644 --- a/internal/pkg/counters/const.go +++ b/internal/pkg/counters/const.go @@ -26,4 +26,5 @@ const ( DCGMExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" DCGMExpGPUHealthStatus = "DCGM_EXP_GPU_HEALTH_STATUS" DCGMExpP2PStatus = "DCGM_EXP_P2P_STATUS" + DCGMExpSMUtilPerPod = "DCGM_EXP_SM_UTIL_PER_POD" ) diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index cf88a949..b8b1bd2f 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -102,6 +102,7 @@ const ( CLIDisableStartupValidate = "disable-startup-validate" CLIEnableGPUBindUnbindWatch = "enable-gpu-bind-unbind-watch" CLIGPUBindUnbindPollInterval = "gpu-bind-unbind-poll-interval" + CLIEnablePerPodGPUUtil = "enable-per-pod-gpu-util" ) func NewApp(buildVersion ...string) *cli.App { @@ -349,6 +350,12 @@ func NewApp(buildVersion ...string) *cli.App { EnvVars: []string{"DCGM_EXPORTER_GPU_BIND_UNBIND_POLL_INTERVAL"}, Value: "1s", }, + &cli.BoolFlag{ + Name: CLIEnablePerPodGPUUtil, + Value: false, + Usage: "Enable per-pod GPU SM utilization metrics (dcgm_fi_dev_sm_util_per_pod) using NVML process utilization and kubelet pod-resources API. Intended for time-slicing workloads.", + EnvVars: []string{"DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL"}, + }, } if runtime.GOOS == "linux" { @@ -1103,6 +1110,7 @@ func contextToConfig(c *cli.Context) (*appconfig.Config, error) { DisableStartupValidate: c.Bool(CLIDisableStartupValidate), EnableGPUBindUnbindWatch: c.Bool(CLIEnableGPUBindUnbindWatch), GPUBindUnbindPollInterval: parseDuration(c.String(CLIGPUBindUnbindPollInterval), 1*time.Second), + EnablePerPodGPUUtil: c.Bool(CLIEnablePerPodGPUUtil), }, nil }