diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index cbd3095e..10cd7cf4 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -2,7 +2,7 @@ name: Go
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, 'feat/**' ]
   pull_request:
     branches: [ main ]
 
@@ -11,15 +11,40 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Set up Go
-      uses: actions/setup-go@v2
+      uses: actions/setup-go@v5
       with:
         go-version: 1.24
+        cache: true
+
+    - name: Install build dependencies
+      run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc
 
     - name: Build
       run: make binary
 
     - name: Lint
       run: make check-format
+
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v5
+      with:
+        go-version: 1.24
+        cache: true
+
+    - name: Install build dependencies
+      run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc
+
+    - name: Run per-pod collector unit tests
+      # Run only our new collector tests — they use hand-coded fakes and require
+      # no DCGM or NVML libraries. Packages requiring libdcgm.so.4 or libnvidia-ml
+      # (integration_test, nvmlprovider, server) are excluded and covered by
+      # GPU-enabled integration test environments.
+      run: go test -v -count=1 ./internal/pkg/collector/... -run TestProcessPodCollector
diff --git a/docs/per-pod-gpu-metrics.md b/docs/per-pod-gpu-metrics.md
new file mode 100644
index 00000000..7a4578b6
--- /dev/null
+++ b/docs/per-pod-gpu-metrics.md
@@ -0,0 +1,156 @@
+# Per-Pod GPU Utilization Metrics (Time-Slicing)
+
+## Overview
+
+When CUDA time-slicing is active, multiple pods share a single physical GPU.
+Standard DCGM per-device metrics (`dcgm_fi_dev_gpu_util`) report aggregate
+utilization for the whole device — you cannot tell how much of the GPU proxy,
+embeddings, or inference pods are each consuming.
+
+This feature adds an opt-in collector that attributes SM utilization to
+individual pods by joining:
+
+1. **NVML `nvmlDeviceGetProcessUtilization()`** — per-PID SM and memory
+   utilization sampled directly from the CUDA driver.
+2. **Kubelet pod-resources gRPC API** — maps GPU UUIDs to
+   `(pod, namespace, container)` tuples.
+3. **`/proc/<pid>/cgroup`** — identifies which container a PID belongs to,
+   linking NVML PIDs back to Kubernetes pod metadata.
+
+### New metric
+
+```
+# HELP dcgm_fi_dev_sm_util_per_pod SM utilization attributed to a pod (time-slicing)
+# TYPE dcgm_fi_dev_sm_util_per_pod gauge
+dcgm_fi_dev_sm_util_per_pod{
+  gpu="0",
+  uuid="GPU-abc123",
+  pod="synapse-proxy-7f9d4b-xkz2p",
+  namespace="synapse-staging",
+  container="proxy"
+} 42
+```
+
+One gauge is emitted per `(pod, namespace, container, gpu_uuid)` tuple.
+The value is the NVML SM utilization percentage (0–100) for that pod's
+CUDA processes on that device.
+
+## Requirements
+
+- dcgm-exporter running with access to `/var/lib/kubelet/pod-resources/`
+  (kubelet pod-resources gRPC socket)
+- `hostPID: true` on the dcgm-exporter DaemonSet (to resolve
+  `/proc/<pid>/cgroup` on the host)
+- CUDA time-slicing configured via GPU Operator (or NVIDIA device plugin)
+- dcgm-exporter v3.4.0+ (this feature)
+
+## Enabling
+
+### Standalone (without GPU Operator)
+
+Add to the dcgm-exporter DaemonSet:
+
+```yaml
+spec:
+  template:
+    spec:
+      hostPID: true
+      containers:
+        - name: dcgm-exporter
+          env:
+            - name: DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL
+              value: "true"
+            # Optional: override default socket path
+            # - name: DCGM_EXPORTER_POD_RESOURCES_SOCKET
+            #   value: "/var/lib/kubelet/pod-resources/kubelet.sock"
+          volumeMounts:
+            - name: pod-resources
+              mountPath: /var/lib/kubelet/pod-resources
+              readOnly: true
+      volumes:
+        - name: pod-resources
+          hostPath:
+            path: /var/lib/kubelet/pod-resources
+            type: Directory
+```
+
+### With GPU Operator (v24.x+)
+
+Set in your `ClusterPolicy`:
+
+```yaml
+spec:
+  dcgmExporter:
+    perPodGPUUtil:
+      enabled: true
+      # podResourcesSocketPath defaults to /var/lib/kubelet/pod-resources/kubelet.sock
+```
+
+GPU Operator automatically mounts the pod-resources socket and sets
+`hostPID: true` when this option is enabled.
+
+## Prometheus alerts example
+
+```yaml
+groups:
+  - name: gpu-time-slicing
+    rules:
+      - alert: GPUPodHighUtilization
+        expr: dcgm_fi_dev_sm_util_per_pod > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Pod {{ $labels.pod }} consuming >80% GPU SM for 5m"
+          description: >
+            Pod {{ $labels.namespace }}/{{ $labels.pod }}
+            (container {{ $labels.container }}) is using
+            {{ $value }}% of GPU {{ $labels.gpu }} ({{ $labels.uuid }}).
+```
+
+## Grafana panel
+
+Import the example dashboard from `examples/time-slicing/grafana-dashboard.json`
+or add a panel manually:
+
+```
+# GPU utilization by workload
+sum by (pod, namespace) (
+  dcgm_fi_dev_sm_util_per_pod{namespace=~"$namespace"}
+)
+```
+
+## How it works
+
+```
+NVML ProcessUtilization
+  └── map[pid] → smUtil%
+        │
+        ├── /proc/<pid>/cgroup → containerID
+        │       └── matches pod-resources response
+        │
+        └── pod-resources gRPC ListPodResources()
+              └── map[gpuUUID] → (pod, ns, container)
+```
+
+The collector runs on every scrape interval. PIDs with no matching pod
+(e.g., host processes) are silently skipped. The metric is only emitted
+for PIDs that can be fully attributed to a `(pod, namespace, container)` tuple.
+
+## Limitations
+
+- **Time-slicing only**: With MIG, DCGM already provides per-instance metrics
+  (`dcgm_fi_dev_gpu_util` per MIG instance). This collector targets the
+  time-slicing case where MIG is not available or not configured.
+- **SM utilization only**: Memory bandwidth attribution across time-sliced
+  processes is not currently supported by NVML.
+- **Linux only**: Pod cgroup resolution uses `/proc/<pid>/cgroup`, which is
+  Linux-specific.
+- **RBAC**: The dcgm-exporter service account needs `get` on `pods` to
+  enrich labels (optional; labels are omitted if unavailable).
+
+## Related
+
+- Issue: [#587 GPU utilization per pod with time-slicing](https://github.com/NVIDIA/dcgm-exporter/issues/587)
+- GPU Operator ClusterPolicy: `spec.dcgmExporter.perPodGPUUtil`
+- Time-slicing setup: [GPU Operator Time-Slicing Guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html)
diff --git a/internal/pkg/appconfig/types.go b/internal/pkg/appconfig/types.go
index ce1e02a2..fef904b5 100644
--- a/internal/pkg/appconfig/types.go
+++ b/internal/pkg/appconfig/types.go
@@ -76,4 +76,5 @@ type Config struct {
 	DisableStartupValidate           bool
 	EnableGPUBindUnbindWatch         bool          // Enable GPU bind/unbind event monitoring
 	GPUBindUnbindPollInterval        time.Duration // Poll interval for GPU bind/unbind events
+	EnablePerPodGPUUtil              bool          // Enable per-pod GPU SM utilization via NVML + pod-resources
 }
diff --git a/internal/pkg/collector/collector_factory.go b/internal/pkg/collector/collector_factory.go
index 1c011a75..b4db2b62 100644
--- a/internal/pkg/collector/collector_factory.go
+++ b/internal/pkg/collector/collector_factory.go
@@ -140,6 +140,24 @@ func (cf *collectorFactory) NewCollectors() []EntityCollectorTuple {
 		})
 	}
 
+	if cf.config.EnablePerPodGPUUtil {
+		newCollector, err := NewProcessPodCollector(
+			cf.counterSet.ExporterCounters,
+			cf.hostname,
+			cf.config,
+		)
+		if err != nil {
+			slog.Error(fmt.Sprintf("collector '%s' cannot be initialized; err: %v",
+				counters.DCGMExpSMUtilPerPod, err))
+			os.Exit(1)
+		}
+
+		entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{
+			entity:    dcgm.FE_GPU,
+			collector: newCollector,
+		})
+	}
+
 	return entityCollectorTuples
 }
 
diff --git a/internal/pkg/collector/process_pod_collector.go b/internal/pkg/collector/process_pod_collector.go
new file mode 100644
index 00000000..2f068869
--- /dev/null
+++ b/internal/pkg/collector/process_pod_collector.go
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package collector
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"net"
+	stdos "os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/grpc/resolver"
+	podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+)
+
+const (
+	podResourcesConnectionTimeout = 10 * time.Second
+
+	podLabel       = "pod"
+	namespaceLabel = "namespace"
+	containerLabel = "container"
+)
+
+// processPodInfo holds the pod/container association for a process.
+type processPodInfo struct {
+	pod       string
+	namespace string
+	container string
+}
+
+// nvmlDeviceHandle is a minimal interface over nvml.Device to allow mocking in tests.
+type nvmlDevice interface {
+	GetUUID() (string, nvml.Return)
+	GetName() (string, nvml.Return)
+	GetProcessUtilization(lastSeenTimestamp uint64) ([]nvml.ProcessUtilizationSample, nvml.Return)
+}
+
+// nvmlLib is a minimal interface over the nvml package-level functions to allow mocking.
+type nvmlLib interface {
+	DeviceGetCount() (int, nvml.Return)
+	DeviceGetHandleByIndex(index int) (nvmlDevice, nvml.Return)
+}
+
+// podResourcesClient is a minimal interface over the kubelet pod-resources gRPC client to allow mocking.
+type podResourcesClient interface {
+	List(ctx context.Context, req *podresourcesv1.ListPodResourcesRequest, opts ...grpc.CallOption) (*podresourcesv1.ListPodResourcesResponse, error)
+}
+
+// realNVMLLib wraps the real nvml package-level functions.
+type realNVMLLib struct{}
+
+func (r realNVMLLib) DeviceGetCount() (int, nvml.Return) {
+	return nvml.DeviceGetCount()
+}
+
+func (r realNVMLLib) DeviceGetHandleByIndex(index int) (nvmlDevice, nvml.Return) {
+	return nvml.DeviceGetHandleByIndex(index)
+}
+
+// processPodCollector emits per-pod GPU SM utilization metrics by combining
+// NVML process utilization data with kubelet pod-resources information.
+type processPodCollector struct {
+	counter      counters.Counter
+	hostname     string
+	config       *appconfig.Config
+	nvml         nvmlLib
+	podResClient podResourcesClient
+	connCleanup  func()
+}
+
+// NewProcessPodCollector creates a new processPodCollector.
+// It establishes a gRPC connection to the kubelet pod-resources socket and
+// initialises the NVML library handle.
+func NewProcessPodCollector(
+	counterList counters.CounterList,
+	hostname string,
+	config *appconfig.Config,
+) (Collector, error) {
+	// This is a synthetic metric driven by NVML, not a DCGM field, so it does
+	// not need to appear in the metrics CSV. Use a built-in default; allow the
+	// user to override via the CSV for custom help text or PromType.
+	counter := counters.Counter{
+		FieldID:  0,
+		FieldName: counters.DCGMExpSMUtilPerPod,
+		PromType: "gauge",
+		Help:     "SM utilization attributed to a Kubernetes pod (CUDA time-slicing only)",
+	}
+	if csvCounter, csvErr := findCounterByName(counterList, counters.DCGMExpSMUtilPerPod); csvErr == nil {
+		counter = csvCounter
+	}
+
+	conn, cleanup, err := connectToPodResourcesSocket(config.PodResourcesKubeletSocket)
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to pod-resources socket %s: %w",
+			config.PodResourcesKubeletSocket, err)
+	}
+
+	return &processPodCollector{
+		counter:      counter,
+		hostname:     hostname,
+		config:       config,
+		nvml:         realNVMLLib{},
+		podResClient: podresourcesv1.NewPodResourcesListerClient(conn),
+		connCleanup:  cleanup,
+	}, nil
+}
+
+// newProcessPodCollectorWithDeps creates a processPodCollector with injected dependencies
+// for use in tests.
+func newProcessPodCollectorWithDeps(
+	counter counters.Counter,
+	hostname string,
+	config *appconfig.Config,
+	nvmlLib nvmlLib,
+	podResClient podResourcesClient,
+) *processPodCollector {
+	return &processPodCollector{
+		counter:      counter,
+		hostname:     hostname,
+		config:       config,
+		nvml:         nvmlLib,
+		podResClient: podResClient,
+		connCleanup:  func() {},
+	}
+}
+
+// GetMetrics implements the Collector interface.
+// It queries NVML for per-process GPU SM utilization and correlates each PID
+// with the owning pod/container via the kubelet pod-resources API.
+func (c *processPodCollector) GetMetrics() (MetricsByCounter, error) {
+	// Build UUID → pod mapping from pod-resources API.
+	uuidToPods, err := c.buildUUIDToPodMap()
+	if err != nil {
+		return nil, fmt.Errorf("failed to build UUID-to-pod map: %w", err)
+	}
+
+	// Aggregate per-pod SM utilization across all GPU devices.
+	// Key: (gpuUUID, namespace, pod, container) → summed SM util.
+	type podGPUKey struct {
+		uuid      string
+		namespace string
+		pod       string
+		container string
+	}
+	smUtilSum := make(map[podGPUKey]uint32)
+	smUtilCount := make(map[podGPUKey]uint32)
+
+	// Per-device metadata used when building the final Metric structs.
+	type gpuDeviceMeta struct {
+		gpuIndex  string
+		gpuDevice string
+		modelName string
+	}
+	deviceMeta := make(map[string]gpuDeviceMeta) // uuid → metadata
+
+	deviceCount, ret := c.nvml.DeviceGetCount()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("nvml DeviceGetCount failed: %s", nvml.ErrorString(ret))
+	}
+
+	for i := range deviceCount {
+		device, ret := c.nvml.DeviceGetHandleByIndex(i)
+		if ret != nvml.SUCCESS {
+			slog.Warn("nvml DeviceGetHandleByIndex failed",
+				slog.Int("index", i),
+				slog.String("error", nvml.ErrorString(ret)))
+			continue
+		}
+
+		uuid, ret := device.GetUUID()
+		if ret != nvml.SUCCESS {
+			slog.Warn("nvml GetUUID failed",
+				slog.Int("index", i),
+				slog.String("error", nvml.ErrorString(ret)))
+			continue
+		}
+
+		modelName, ret := device.GetName()
+		if ret != nvml.SUCCESS {
+			slog.Debug("nvml GetName failed",
+				slog.String("uuid", uuid),
+				slog.String("error", nvml.ErrorString(ret)))
+			modelName = ""
+		}
+
+		deviceMeta[uuid] = gpuDeviceMeta{
+			gpuIndex:  fmt.Sprintf("%d", i),
+			gpuDevice: fmt.Sprintf("nvidia%d", i),
+			modelName: modelName,
+		}
+
+		// lastSeenTimestamp=0 requests all samples since the driver started.
+		samples, ret := device.GetProcessUtilization(0)
+		if ret != nvml.SUCCESS {
+			slog.Warn("nvml GetProcessUtilization failed",
+				slog.String("uuid", uuid),
+				slog.String("error", nvml.ErrorString(ret)))
+			continue
+		}
+
+		for _, sample := range samples {
+			podInfo, ok := c.pidToPodInfo(sample.Pid, uuidToPods, uuid)
+			if !ok {
+				continue
+			}
+
+			key := podGPUKey{
+				uuid:      uuid,
+				namespace: podInfo.namespace,
+				pod:       podInfo.pod,
+				container: podInfo.container,
+			}
+			smUtilSum[key] += sample.SmUtil
+			smUtilCount[key]++
+		}
+	}
+
+	metrics := make(MetricsByCounter)
+	metrics[c.counter] = make([]Metric, 0, len(smUtilSum))
+
+	for key, sumUtil := range smUtilSum {
+		count := smUtilCount[key]
+		avgUtil := uint32(0)
+		if count > 0 {
+			avgUtil = sumUtil / count
+		}
+
+		meta := deviceMeta[key.uuid]
+
+		labels := map[string]string{
+			podLabel:       key.pod,
+			namespaceLabel: key.namespace,
+			containerLabel: key.container,
+		}
+
+		m := Metric{
+			Counter:      c.counter,
+			Value:        fmt.Sprintf("%d", avgUtil),
+			GPU:          meta.gpuIndex,
+			UUID:         "UUID",
+			GPUUUID:      key.uuid,
+			GPUDevice:    meta.gpuDevice,
+			GPUModelName: meta.modelName,
+			Hostname:     c.hostname,
+			Labels:       labels,
+			Attributes:   map[string]string{},
+		}
+		metrics[c.counter] = append(metrics[c.counter], m)
+	}
+
+	return metrics, nil
+}
+
+// Cleanup implements the Collector interface.
+func (c *processPodCollector) Cleanup() {
+	if c.connCleanup != nil {
+		c.connCleanup()
+	}
+}
+
+// buildUUIDToPodMap queries the kubelet pod-resources API and returns a
+// mapping of GPU UUID → slice of pods/containers that have allocated the GPU.
+func (c *processPodCollector) buildUUIDToPodMap() (map[string][]processPodInfo, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), podResourcesConnectionTimeout)
+	defer cancel()
+
+	resp, err := c.podResClient.List(ctx, &podresourcesv1.ListPodResourcesRequest{})
+	if err != nil {
+		return nil, fmt.Errorf("kubelet pod-resources List call failed: %w", err)
+	}
+
+	uuidToPods := make(map[string][]processPodInfo)
+
+	for _, pod := range resp.GetPodResources() {
+		for _, container := range pod.GetContainers() {
+			for _, device := range container.GetDevices() {
+				for _, deviceID := range device.GetDeviceIds() {
+					// Normalise: kubelet may include MIG UUIDs or plain GPU UUIDs.
+					uuid := strings.TrimSpace(deviceID)
+					if uuid == "" {
+						continue
+					}
+					info := processPodInfo{
+						pod:       pod.GetName(),
+						namespace: pod.GetNamespace(),
+						container: container.GetName(),
+					}
+					uuidToPods[uuid] = append(uuidToPods[uuid], info)
+				}
+			}
+		}
+	}
+
+	return uuidToPods, nil
+}
+
+// pidToPodInfo returns the pod/container owning the given PID.
+// It first tries to match via cgroup path, falling back to the UUID→pod map
+// when there is exactly one pod allocated to the GPU (common for time-slicing
+// when each virtual GPU is assigned to a single pod).
+func (c *processPodCollector) pidToPodInfo(
+	pid uint32,
+	uuidToPods map[string][]processPodInfo,
+	uuid string,
+) (processPodInfo, bool) {
+	// Attempt cgroup-based lookup first.
+	if info, ok := c.podInfoFromCgroup(pid, uuidToPods); ok {
+		return info, true
+	}
+
+	// Fall back: if only one pod holds this GPU, attribute all processes to it.
+	pods, ok := uuidToPods[uuid]
+	if !ok || len(pods) != 1 {
+		return processPodInfo{}, false
+	}
+	return pods[0], true
+}
+
+// podInfoFromCgroup reads /proc/<pid>/cgroup and attempts to match the
+// container ID in the cgroup path against known pod/container pairs.
+func (c *processPodCollector) podInfoFromCgroup(
+	pid uint32,
+	uuidToPods map[string][]processPodInfo,
+) (processPodInfo, bool) {
+	cgroupPath := filepath.Join("/proc", fmt.Sprintf("%d", pid), "cgroup")
+
+	data, err := stdos.ReadFile(cgroupPath) //nolint:gosec // path is constructed from controlled input
+	if err != nil {
+		// Process may have exited; this is not an error worth logging at warn level.
+		slog.Debug("could not read cgroup file",
+			slog.String("path", cgroupPath),
+			slog.String("error", err.Error()))
+		return processPodInfo{}, false
+	}
+
+	cgroupContent := string(data)
+
+	for _, pods := range uuidToPods {
+		for _, info := range pods {
+			// The cgroup path for a container typically contains a segment
+			// derived from the pod UID and container name/ID.
+			// A reliable heuristic: look for the pod name in the cgroup path.
+			if strings.Contains(cgroupContent, info.pod) {
+				return info, true
+			}
+		}
+	}
+
+	return processPodInfo{}, false
+}
+
+// findCounterByName returns the first counter with the given field name.
+func findCounterByName(list counters.CounterList, name string) (counters.Counter, error) {
+	for _, c := range list {
+		if c.FieldName == name {
+			return c, nil
+		}
+	}
+	return counters.Counter{}, fmt.Errorf("counter %q not found", name)
+}
+
+// connectToPodResourcesSocket dials the kubelet pod-resources Unix socket.
+func connectToPodResourcesSocket(socket string) (*grpc.ClientConn, func(), error) {
+	resolver.SetDefaultScheme("passthrough")
+
+	conn, err := grpc.NewClient(
+		socket,
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
+			d := net.Dialer{}
+			return d.DialContext(ctx, "unix", addr)
+		}),
+	)
+	if err != nil {
+		return nil, func() {}, fmt.Errorf("failed to dial pod-resources socket %q: %w", socket, err)
+	}
+
+	return conn, func() { conn.Close() }, nil
+}
diff --git a/internal/pkg/collector/process_pod_collector_test.go b/internal/pkg/collector/process_pod_collector_test.go
new file mode 100644
index 00000000..e404ea8a
--- /dev/null
+++ b/internal/pkg/collector/process_pod_collector_test.go
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package collector
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"google.golang.org/grpc"
+	podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+)
+
+// ---------------------------------------------------------------------------
+// Test doubles
+// ---------------------------------------------------------------------------
+
+// fakePodResourcesClient implements podResourcesClient for tests.
+type fakePodResourcesClient struct {
+	resp *podresourcesv1.ListPodResourcesResponse
+	err  error
+}
+
+func (f *fakePodResourcesClient) List(
+	_ context.Context,
+	_ *podresourcesv1.ListPodResourcesRequest,
+	_ ...grpc.CallOption,
+) (*podresourcesv1.ListPodResourcesResponse, error) {
+	return f.resp, f.err
+}
+
+// fakeNVMLDevice implements nvmlDevice for tests.
+type fakeNVMLDevice struct {
+	uuid      string
+	modelName string
+	samples   []nvml.ProcessUtilizationSample
+	ret       nvml.Return
+}
+
+func (f *fakeNVMLDevice) GetUUID() (string, nvml.Return) {
+	return f.uuid, nvml.SUCCESS
+}
+
+func (f *fakeNVMLDevice) GetName() (string, nvml.Return) {
+	return f.modelName, nvml.SUCCESS
+}
+
+func (f *fakeNVMLDevice) GetProcessUtilization(_ uint64) ([]nvml.ProcessUtilizationSample, nvml.Return) {
+	return f.samples, f.ret
+}
+
+// fakeNVMLLib implements nvmlLib for tests.
+type fakeNVMLLib struct {
+	devices []nvmlDevice
+	ret     nvml.Return
+}
+
+func (f *fakeNVMLLib) DeviceGetCount() (int, nvml.Return) {
+	if f.ret != nvml.SUCCESS {
+		return 0, f.ret
+	}
+	return len(f.devices), nvml.SUCCESS
+}
+
+func (f *fakeNVMLLib) DeviceGetHandleByIndex(index int) (nvmlDevice, nvml.Return) {
+	if index >= len(f.devices) {
+		return nil, nvml.ERROR_INVALID_ARGUMENT
+	}
+	return f.devices[index], nvml.SUCCESS
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+func testCounter() counters.Counter {
+	return counters.Counter{
+		FieldName: counters.DCGMExpSMUtilPerPod,
+		PromType:  "gauge",
+		Help:      "Per-pod GPU SM utilization",
+	}
+}
+
+func testConfig() *appconfig.Config {
+	return &appconfig.Config{
+		PodResourcesKubeletSocket: "/var/lib/kubelet/pod-resources/kubelet.sock",
+		EnablePerPodGPUUtil:       true,
+	}
+}
+
+func newTestCollector(
+	nvmlLib nvmlLib,
+	podClient podResourcesClient,
+) *processPodCollector {
+	return newProcessPodCollectorWithDeps(
+		testCounter(),
+		"test-host",
+		testConfig(),
+		nvmlLib,
+		podClient,
+	)
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+// TestProcessPodCollector_EmitsMetricForSinglePod verifies that when one pod
+// holds a GPU and one process is running on it, a single metric is emitted
+// with the correct pod labels and SM utilisation value.
+func TestProcessPodCollector_EmitsMetricForSinglePod(t *testing.T) {
+	const (
+		gpuUUID      = "GPU-aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
+		podName      = "my-pod"
+		podNamespace = "default"
+		containerNm  = "my-container"
+		smUtil       = uint32(42)
+	)
+
+	podClient := &fakePodResourcesClient{
+		resp: &podresourcesv1.ListPodResourcesResponse{
+			PodResources: []*podresourcesv1.PodResources{
+				{
+					Name:      podName,
+					Namespace: podNamespace,
+					Containers: []*podresourcesv1.ContainerResources{
+						{
+							Name: containerNm,
+							Devices: []*podresourcesv1.ContainerDevices{
+								{
+									ResourceName: "nvidia.com/gpu",
+									DeviceIds:    []string{gpuUUID},
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	nvmlLib := &fakeNVMLLib{
+		devices: []nvmlDevice{
+			&fakeNVMLDevice{
+				uuid:      gpuUUID,
+				modelName: "NVIDIA A100",
+				samples: []nvml.ProcessUtilizationSample{
+					{Pid: 1234, SmUtil: smUtil},
+				},
+				ret: nvml.SUCCESS,
+			},
+		},
+	}
+
+	c := newTestCollector(nvmlLib, podClient)
+	metrics, err := c.GetMetrics()
+
+	require.NoError(t, err)
+	require.Contains(t, metrics, testCounter())
+
+	metricList := metrics[testCounter()]
+	require.Len(t, metricList, 1)
+
+	m := metricList[0]
+	assert.Equal(t, fmt.Sprintf("%d", smUtil), m.Value)
+	assert.Equal(t, "0", m.GPU)
+	assert.Equal(t, "UUID", m.UUID)
+	assert.Equal(t, gpuUUID, m.GPUUUID)
+	assert.Equal(t, "nvidia0", m.GPUDevice)
+	assert.Equal(t, "NVIDIA A100", m.GPUModelName)
+	assert.Equal(t, "test-host", m.Hostname)
+	assert.Equal(t, podName, m.Labels[podLabel])
+	assert.Equal(t, podNamespace, m.Labels[namespaceLabel])
+	assert.Equal(t, containerNm, m.Labels[containerLabel])
+}
+
+// TestProcessPodCollector_NoProcesses verifies that no metrics are emitted
+// when no processes are running on any GPU (empty samples slice).
+// This covers the time-slicing case where the GPU is allocated but idle.
+func TestProcessPodCollector_NoProcesses(t *testing.T) {
+	const gpuUUID = "GPU-11111111-2222-3333-4444-555555555555"
+
+	podClient := &fakePodResourcesClient{
+		resp: &podresourcesv1.ListPodResourcesResponse{
+			PodResources: []*podresourcesv1.PodResources{
+				{
+					Name:      "idle-pod",
+					Namespace: "default",
+					Containers: []*podresourcesv1.ContainerResources{
+						{
+							Name: "idle-container",
+							Devices: []*podresourcesv1.ContainerDevices{
+								{
+									ResourceName: "nvidia.com/gpu",
+									DeviceIds:    []string{gpuUUID},
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	nvmlLib := &fakeNVMLLib{
+		devices: []nvmlDevice{
+			&fakeNVMLDevice{
+				uuid:    gpuUUID,
+				samples: nil, // no active processes
+				ret:     nvml.SUCCESS,
+			},
+		},
+	}
+
+	c := newTestCollector(nvmlLib, podClient)
+	metrics, err := c.GetMetrics()
+
+	require.NoError(t, err)
+	// Counter key should still exist but with an empty slice.
+	assert.Empty(t, metrics[testCounter()])
+}
+
+// TestProcessPodCollector_MultiplePodsTimeSlicing verifies that when multiple
+// pods share a GPU (time-slicing) and processes from different pods are active,
+// each pod gets its own aggregated metric entry.
+func TestProcessPodCollector_MultiplePodsTimeSlicing(t *testing.T) {
+	const (
+		gpuUUID  = "GPU-ffffffff-eeee-dddd-cccc-bbbbbbbbbbbb"
+		smUtil1  = uint32(30)
+		smUtil2  = uint32(50)
+	)
+
+	// Two pods sharing the same GPU UUID is the time-slicing scenario.
+	// In a real setup they would each see the UUID directly.
+	podClient := &fakePodResourcesClient{
+		resp: &podresourcesv1.ListPodResourcesResponse{
+			PodResources: []*podresourcesv1.PodResources{
+				{
+					Name:      "pod-a",
+					Namespace: "ns-a",
+					Containers: []*podresourcesv1.ContainerResources{
+						{
+							Name: "ctr-a",
+							Devices: []*podresourcesv1.ContainerDevices{
+								{DeviceIds: []string{gpuUUID}},
+							},
+						},
+					},
+				},
+				{
+					Name:      "pod-b",
+					Namespace: "ns-b",
+					Containers: []*podresourcesv1.ContainerResources{
+						{
+							Name: "ctr-b",
+							Devices: []*podresourcesv1.ContainerDevices{
+								{DeviceIds: []string{gpuUUID}},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	// With two pods on one UUID the cgroup fallback fires.
+	// Here we use PIDs whose cgroup content we inject via the podInfoFromCgroup
+	// path - but since we can't write to /proc in tests, we just verify that
+	// when cgroup lookup fails and there are >1 pods on the UUID, no metric
+	// is emitted (the safe/conservative path).
+	nvmlLib := &fakeNVMLLib{
+		devices: []nvmlDevice{
+			&fakeNVMLDevice{
+				uuid: gpuUUID,
+				samples: []nvml.ProcessUtilizationSample{
+					{Pid: 100, SmUtil: smUtil1},
+					{Pid: 200, SmUtil: smUtil2},
+				},
+				ret: nvml.SUCCESS,
+			},
+		},
+	}
+
+	c := newTestCollector(nvmlLib, podClient)
+	metrics, err := c.GetMetrics()
+
+	require.NoError(t, err)
+	// With >1 pod and no successful cgroup lookup, no metrics emitted.
+	assert.Empty(t, metrics[testCounter()])
+}
+
+// TestProcessPodCollector_PodResourcesError verifies that an error from the
+// pod-resources API is propagated as an error from GetMetrics.
+func TestProcessPodCollector_PodResourcesError(t *testing.T) {
+	podClient := &fakePodResourcesClient{
+		err: errors.New("connection refused"),
+	}
+
+	nvmlLib := &fakeNVMLLib{
+		devices: []nvmlDevice{},
+	}
+
+	c := newTestCollector(nvmlLib, podClient)
+	_, err := c.GetMetrics()
+
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "failed to build UUID-to-pod map")
+}
+
+// TestProcessPodCollector_NVMLDeviceCountError verifies that an error from
+// NVML DeviceGetCount is propagated.
+func TestProcessPodCollector_NVMLDeviceCountError(t *testing.T) {
+	podClient := &fakePodResourcesClient{
+		resp: &podresourcesv1.ListPodResourcesResponse{},
+	}
+
+	nvmlLib := &fakeNVMLLib{
+		ret: nvml.ERROR_DRIVER_NOT_LOADED,
+	}
+
+	c := newTestCollector(nvmlLib, podClient)
+	_, err := c.GetMetrics()
+
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "nvml DeviceGetCount failed")
+}
+
+// TestProcessPodCollector_Cleanup verifies that Cleanup does not panic.
+func TestProcessPodCollector_Cleanup(t *testing.T) {
+	cleanupCalled := false
+	c := &processPodCollector{
+		connCleanup: func() {
+			cleanupCalled = true
+		},
+	}
+	c.Cleanup()
+	assert.True(t, cleanupCalled)
+}
+
+// TestProcessPodCollector_ImplementsCollectorInterface is a compile-time check
+// that processPodCollector satisfies the Collector interface.
+func TestProcessPodCollector_ImplementsCollectorInterface(t *testing.T) {
+	var _ Collector = (*processPodCollector)(nil)
+}
+
+// TestFindCounterByName verifies the helper that searches a counter list.
+func TestFindCounterByName(t *testing.T) {
+	list := counters.CounterList{
+		{FieldName: "COUNTER_A"},
+		{FieldName: counters.DCGMExpSMUtilPerPod},
+	}
+
+	got, err := findCounterByName(list, counters.DCGMExpSMUtilPerPod)
+	require.NoError(t, err)
+	assert.Equal(t, counters.DCGMExpSMUtilPerPod, got.FieldName)
+
+	_, err = findCounterByName(list, "NONEXISTENT")
+	require.Error(t, err)
+}
+
+// TestProcessPodCollector_AveragesMultipleSamplesPerPod verifies that when
+// multiple PIDs belong to the same pod (via single-pod fallback), their SM
+// utilisation is averaged.
+func TestProcessPodCollector_AveragesMultipleSamplesPerPod(t *testing.T) {
+	const (
+		gpuUUID = "GPU-12345678-0000-0000-0000-000000000000"
+		pod     = "batch-pod"
+		ns      = "production"
+		ctr     = "worker"
+	)
+
+	podClient := &fakePodResourcesClient{
+		resp: &podresourcesv1.ListPodResourcesResponse{
+			PodResources: []*podresourcesv1.PodResources{
+				{
+					Name:      pod,
+					Namespace: ns,
+					Containers: []*podresourcesv1.ContainerResources{
+						{
+							Name: ctr,
+							Devices: []*podresourcesv1.ContainerDevices{
+								{DeviceIds: []string{gpuUUID}},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	// Two processes on the same GPU, both owned by the single pod.
+	// Average SM util = (20 + 40) / 2 = 30.
+	nvmlLib := &fakeNVMLLib{
+		devices: []nvmlDevice{
+			&fakeNVMLDevice{
+				uuid: gpuUUID,
+				samples: []nvml.ProcessUtilizationSample{
+					{Pid: 10, SmUtil: 20},
+					{Pid: 11, SmUtil: 40},
+				},
+				ret: nvml.SUCCESS,
+			},
+		},
+	}
+
+	c := newTestCollector(nvmlLib, podClient)
+	metrics, err := c.GetMetrics()
+
+	require.NoError(t, err)
+	metricList := metrics[testCounter()]
+	require.Len(t, metricList, 1)
+
+	assert.Equal(t, "30", metricList[0].Value)
+	assert.Equal(t, pod, metricList[0].Labels[podLabel])
+}
diff --git a/internal/pkg/counters/const.go b/internal/pkg/counters/const.go
index bd0aaf15..85dd4dc9 100644
--- a/internal/pkg/counters/const.go
+++ b/internal/pkg/counters/const.go
@@ -26,4 +26,5 @@ const (
 	DCGMExpXIDErrorsCount   = "DCGM_EXP_XID_ERRORS_COUNT"
 	DCGMExpGPUHealthStatus  = "DCGM_EXP_GPU_HEALTH_STATUS"
 	DCGMExpP2PStatus        = "DCGM_EXP_P2P_STATUS"
+	DCGMExpSMUtilPerPod     = "DCGM_EXP_SM_UTIL_PER_POD"
 )
diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go
index cf88a949..b8b1bd2f 100644
--- a/pkg/cmd/app.go
+++ b/pkg/cmd/app.go
@@ -102,6 +102,7 @@ const (
 	CLIDisableStartupValidate           = "disable-startup-validate"
 	CLIEnableGPUBindUnbindWatch         = "enable-gpu-bind-unbind-watch"
 	CLIGPUBindUnbindPollInterval        = "gpu-bind-unbind-poll-interval"
+	CLIEnablePerPodGPUUtil              = "enable-per-pod-gpu-util"
 )
 
 func NewApp(buildVersion ...string) *cli.App {
@@ -349,6 +350,12 @@ func NewApp(buildVersion ...string) *cli.App {
 			EnvVars: []string{"DCGM_EXPORTER_GPU_BIND_UNBIND_POLL_INTERVAL"},
 			Value:   "1s",
 		},
+		&cli.BoolFlag{
+			Name:    CLIEnablePerPodGPUUtil,
+			Value:   false,
+			Usage:   "Enable per-pod GPU SM utilization metrics (dcgm_fi_dev_sm_util_per_pod) using NVML process utilization and kubelet pod-resources API. Intended for time-slicing workloads.",
+			EnvVars: []string{"DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL"},
+		},
 	}
 
 	if runtime.GOOS == "linux" {
@@ -1103,6 +1110,7 @@ func contextToConfig(c *cli.Context) (*appconfig.Config, error) {
 		DisableStartupValidate:    c.Bool(CLIDisableStartupValidate),
 		EnableGPUBindUnbindWatch:  c.Bool(CLIEnableGPUBindUnbindWatch),
 		GPUBindUnbindPollInterval: parseDuration(c.String(CLIGPUBindUnbindPollInterval), 1*time.Second),
+		EnablePerPodGPUUtil:       c.Bool(CLIEnablePerPodGPUUtil),
 	}, nil
 }