Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ require (
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chai2010/gettext-go v1.0.3 // indirect
github.com/containerd/cgroups/v3 v3.1.1 // indirect
github.com/containerd/containerd v1.7.27 // indirect
github.com/containerd/errdefs v1.0.0 // indirect
github.com/containerd/log v0.1.0 // indirect
Expand Down Expand Up @@ -103,6 +104,7 @@ require (
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/sys/userns v0.1.0 // indirect
github.com/moby/term v0.5.2 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chai2010/gettext-go v1.0.3 h1:9liNh8t+u26xl5ddmWLmsOsdNLwkdRTg5AG+JnTiM80=
github.com/chai2010/gettext-go v1.0.3/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA=
github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM=
github.com/containerd/cgroups/v3 v3.1.1 h1:ASZmQGfOHbRj43/1aMn5QcWIsv0R/AuHHDNCguRY0p0=
github.com/containerd/cgroups/v3 v3.1.1/go.mod h1:PKZ2AcWmSBsY/tJUVhtS/rluX0b1uq1GmPO1ElCmbOw=
github.com/containerd/containerd v1.7.27 h1:yFyEyojddO3MIGVER2xJLWoCIn+Up4GaHFquP7hsFII=
github.com/containerd/containerd v1.7.27/go.mod h1:xZmPnl75Vc+BLGt4MIfu6bp+fy03gdHAn9bz+FreFR0=
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
Expand Down Expand Up @@ -211,6 +214,8 @@ github.com/mittwald/go-helm-client v0.12.16 h1:YTyJX6L0SI/O7HNTG0qDZI2/jyGELxJOQ
github.com/mittwald/go-helm-client v0.12.16/go.mod h1:PDF7Ra8bmJ2YTNzoehoMMi+gW/EJBk/4TLz7j52rehY=
github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU=
github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI=
github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g=
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
Expand Down
45 changes: 45 additions & 0 deletions internal/mocks/pkg/nvmlprovider/mock_client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion internal/pkg/deviceinfo/device_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (s *Info) initializeGPUInfo(gOpt appconfig.DeviceOptions, useFakeGPUs bool)
links, err := dcgmprovider.Client().GetNvLinkLinkStatus()
if err == nil {
for i := 0; i < len(s.gpus); i++ {
// monitor only the nvlinks as per the device options input
// monitor only the nvlinks as per the device options input
if gOpt.Flex || s.shouldMonitor(gOpt.MajorRange, s.gpus[i].DeviceInfo.GPU) {
var matchingLinks []dcgm.NvLinkStatus
var linkCount uint = 1
Expand Down
102 changes: 102 additions & 0 deletions internal/pkg/nvmlprovider/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,108 @@ func getMIGDeviceInfoForOldDriver(uuid string) (*MIGDeviceInfo, error) {
}, nil
}

// GetDeviceProcessMemory reports, per PID, the GPU memory used by the compute
// processes running on the GPU identified by gpuUUID.
//
// The map values are the UsedGpuMemory figures NVML returns for each process.
// An ERROR_NOT_SUPPORTED answer from NVML is not treated as a failure: the
// process list that accompanied it is converted as usual.
//
// An error is returned when the provider pre-check fails, when the device
// handle cannot be resolved, or when NVML reports any other status.
func (n nvmlProvider) GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error) {
	if preCheckErr := n.preCheck(); preCheckErr != nil {
		return nil, fmt.Errorf("failed to get device process memory: %w", preCheckErr)
	}

	handle, status := nvml.DeviceGetHandleByUUID(gpuUUID)
	if status != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device handle for UUID %s: %s", gpuUUID, nvml.ErrorString(status))
	}

	procs, status := handle.GetComputeRunningProcesses()
	switch status {
	case nvml.SUCCESS, nvml.ERROR_NOT_SUPPORTED:
		// Both statuses are acceptable; continue and build the map below.
	default:
		return nil, fmt.Errorf("failed to get compute running processes: %s", nvml.ErrorString(status))
	}

	usage := make(map[uint32]uint64, len(procs))
	for idx := range procs {
		usage[procs[idx].Pid] = procs[idx].UsedGpuMemory
	}

	return usage, nil
}

// GetDeviceProcessUtilization returns SM utilization for processes running on
// the GPU identified by gpuUUID, keyed by PID.
//
// Possible results:
//   - (map, nil): samples were read. The map is empty when NVML answers
//     ERROR_NOT_FOUND, which the NVML API uses to signal that no sample
//     entries exist (for example, no process is using the GPU).
//   - (nil, nil): NVML answered ERROR_NOT_SUPPORTED for this device.
//   - (nil, err): the provider pre-check failed, the device handle could not
//     be resolved, or NVML reported any other status.
func (n nvmlProvider) GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error) {
	if err := n.preCheck(); err != nil {
		return nil, fmt.Errorf("failed to get device process utilization: %w", err)
	}

	device, ret := nvml.DeviceGetHandleByUUID(gpuUUID)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device handle for UUID %s: %s", gpuUUID, nvml.ErrorString(ret))
	}

	// A last-seen timestamp of 0 asks NVML for every sample it currently holds.
	samples, ret := device.GetProcessUtilization(0)
	switch ret {
	case nvml.SUCCESS:
		// Samples available; converted below.
	case nvml.ERROR_NOT_FOUND:
		// No sample entries: an idle GPU is a normal state, not a failure.
		return map[uint32]uint32{}, nil
	case nvml.ERROR_NOT_SUPPORTED:
		return nil, nil
	default:
		return nil, fmt.Errorf("failed to get process utilization: %s", nvml.ErrorString(ret))
	}

	result := make(map[uint32]uint32, len(samples))
	for _, s := range samples {
		result[s.Pid] = s.SmUtil
	}

	return result, nil
}

// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU.
// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes.
//
// The parent GPU is looked up by parentGPUUUID and every MIG device index up
// to the device's maximum MIG device count is probed. Indices with no MIG
// device (ERROR_NOT_FOUND / ERROR_INVALID_ARGUMENT) are skipped silently;
// any other per-device failure is logged at debug level and skipped, so one
// bad MIG device does not hide the others.
//
// Several MIG devices can report the same GPU instance ID; their process
// lists are merged into one entry rather than the later device replacing the
// earlier one.
//
// An error is returned only when the provider pre-check fails, the parent
// handle cannot be resolved, or the maximum MIG device count cannot be read.
func (n nvmlProvider) GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error) {
	if err := n.preCheck(); err != nil {
		return nil, fmt.Errorf("failed to get MIG device process memory: %w", err)
	}

	parentDevice, ret := nvml.DeviceGetHandleByUUID(parentGPUUUID)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get parent device handle for UUID %s: %s", parentGPUUUID, nvml.ErrorString(ret))
	}

	migCount, ret := parentDevice.GetMaxMigDeviceCount()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get MIG device count for UUID %s: %s", parentGPUUUID, nvml.ErrorString(ret))
	}

	result := make(map[uint]map[uint32]uint64)

	for i := 0; i < migCount; i++ {
		migDevice, ret := parentDevice.GetMigDeviceHandleByIndex(i)
		if ret == nvml.ERROR_NOT_FOUND || ret == nvml.ERROR_INVALID_ARGUMENT {
			// No MIG device is configured at this index.
			continue
		}
		if ret != nvml.SUCCESS {
			slog.Debug("Failed to get MIG device handle", "index", i, "error", nvml.ErrorString(ret))
			continue
		}

		giID, ret := migDevice.GetGpuInstanceId()
		if ret != nvml.SUCCESS {
			slog.Debug("Failed to get GPU instance ID for MIG device", "index", i, "error", nvml.ErrorString(ret))
			continue
		}

		processes, ret := migDevice.GetComputeRunningProcesses()
		if ret != nvml.SUCCESS && ret != nvml.ERROR_NOT_SUPPORTED {
			slog.Debug("Failed to get running processes for MIG device", "gpuInstanceID", giID, "error", nvml.ErrorString(ret))
			continue
		}

		// Reuse the entry for this GPU instance if an earlier MIG device
		// already created it, so processes are accumulated, not overwritten.
		key := uint(giID)
		pidToMemory, seen := result[key]
		if !seen {
			pidToMemory = make(map[uint32]uint64, len(processes))
			result[key] = pidToMemory
		}
		for _, p := range processes {
			pidToMemory[p.Pid] = p.UsedGpuMemory
		}
	}

	return result, nil
}

// Cleanup performs cleanup operations for the NVML provider
func (n nvmlProvider) Cleanup() {
if !n.initialized {
Expand Down
24 changes: 24 additions & 0 deletions internal/pkg/nvmlprovider/provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,30 @@ func TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized(t *testing.T) {
assert.Error(t, err, "uuid: %v, Device Info: %+v", validMIGUUID, deviceInfo)
}

// TestGetDeviceProcessMemory_When_NVML_Not_Initialized checks that calling
// GetDeviceProcessMemory on a zero-value provider yields no result and an
// error carrying the method's "failed to get device process memory" prefix.
func TestGetDeviceProcessMemory_When_NVML_Not_Initialized(t *testing.T) {
	provider := nvmlProvider{}
	result, err := provider.GetDeviceProcessMemory("GPU-test-uuid")
	assert.Nil(t, result)
	// assert.Error does not abort the test on failure, so only inspect the
	// message when an error is actually present; otherwise err.Error() would
	// panic on a nil error instead of reporting a clean assertion failure.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "failed to get device process memory")
	}
}

// TestGetDeviceProcessUtilization_When_NVML_Not_Initialized checks that
// calling GetDeviceProcessUtilization on a zero-value provider yields no
// result and an error carrying the method's
// "failed to get device process utilization" prefix.
func TestGetDeviceProcessUtilization_When_NVML_Not_Initialized(t *testing.T) {
	provider := nvmlProvider{}
	result, err := provider.GetDeviceProcessUtilization("GPU-test-uuid")
	assert.Nil(t, result)
	// assert.Error does not abort the test on failure, so only inspect the
	// message when an error is actually present; otherwise err.Error() would
	// panic on a nil error instead of reporting a clean assertion failure.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "failed to get device process utilization")
	}
}

// TestGetAllMIGDevicesProcessMemory_When_NVML_Not_Initialized checks that
// calling GetAllMIGDevicesProcessMemory on a zero-value provider yields no
// result and an error carrying the method's
// "failed to get MIG device process memory" prefix.
func TestGetAllMIGDevicesProcessMemory_When_NVML_Not_Initialized(t *testing.T) {
	provider := nvmlProvider{}
	result, err := provider.GetAllMIGDevicesProcessMemory("GPU-test-uuid")
	assert.Nil(t, result)
	// assert.Error does not abort the test on failure, so only inspect the
	// message when an error is actually present; otherwise err.Error() would
	// panic on a nil error instead of reporting a clean assertion failure.
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "failed to get MIG device process memory")
	}
}

func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) {
_ = Initialize()
assert.NotNil(t, Client(), "expected NVML Client to be not nil")
Expand Down
9 changes: 9 additions & 0 deletions internal/pkg/nvmlprovider/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,14 @@ package nvmlprovider

// NVML is the set of NVML-backed queries exposed by this package: MIG device
// lookup, per-process GPU memory and SM utilization, and provider cleanup.
type NVML interface {
// GetMIGDeviceInfoByID returns the MIG device information for the given
// identifier string (presumably a MIG device UUID — confirm against the
// implementation).
GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error)
// GetDeviceProcessMemory returns memory usage for processes running on the GPU.
// Returns a map from PID to memory used in bytes.
GetDeviceProcessMemory(gpuUUID string) (map[uint32]uint64, error)
// GetDeviceProcessUtilization returns SM utilization for processes running on the GPU.
// Returns a map from PID to SM utilization percentage.
GetDeviceProcessUtilization(gpuUUID string) (map[uint32]uint32, error)
// GetAllMIGDevicesProcessMemory returns per-process memory usage for all MIG instances on a GPU.
// Returns map[gpuInstanceID (MIG instance)]map[PID]memoryBytes.
GetAllMIGDevicesProcessMemory(parentGPUUUID string) (map[uint]map[uint32]uint64, error)
// Cleanup performs cleanup operations for the NVML provider.
Cleanup()
}
3 changes: 3 additions & 0 deletions internal/pkg/transformation/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,7 @@ const (
draMigDeviceUUID = "dra_mig_device_uuid"

DRAGPUDriverName = "gpu.nvidia.com"

metricGPUUtil = "DCGM_FI_DEV_GPU_UTIL"
metricFBUsed = "DCGM_FI_DEV_FB_USED"
)
Loading
Loading