From bc4e0f87e2c1a2a6631cc082f5345201fc04e664 Mon Sep 17 00:00:00 2001 From: Jackson McKay Date: Mon, 15 Dec 2025 19:08:35 +0000 Subject: [PATCH 1/3] feat: Add MIG device support for hpc labels modified: README.md modified: internal/pkg/transformation/hpc.go modified: internal/pkg/transformation/hpc_test.go Signed-off-by: Jackson Jay McKay --- README.md | 4 +-- internal/pkg/transformation/hpc.go | 15 ++++++++-- internal/pkg/transformation/hpc_test.go | 38 ++++++++++++++++++++++++- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fb832cbd..48c53ec3 100644 --- a/README.md +++ b/README.md @@ -100,8 +100,8 @@ The DCGM-exporter can include High-Performance Computing (HPC) job information i These mapping files follow a specific format: -* Each file is named after a unique GPU ID (e.g., 0, 1, 2, etc.). -* Each line in the file contains JOB IDs that run on the corresponding GPU. +* Each file is named after either a unique GPU ID or a unique GPU ID and a GPU instance (MIG) ID separated with a "." (e.g., 0, 1, 2.0, 2.1, 3, etc.). +* Each line in the file contains JOB IDs that run on the corresponding GPU/MIG instance. #### Enabling HPC Job Mapping on DCGM-Exporter diff --git a/internal/pkg/transformation/hpc.go b/internal/pkg/transformation/hpc.go index 5902bb2a..33d664b4 100644 --- a/internal/pkg/transformation/hpc.go +++ b/internal/pkg/transformation/hpc.go @@ -94,7 +94,16 @@ func (p *hpcMapper) Process(metrics collector.MetricsByCounter, _ deviceinfo.Pro for counter := range metrics { var modifiedMetrics []collector.Metric for _, metric := range metrics[counter] { - jobs, exists := gpuToJobMap[metric.GPU] + + var jobMapKey string + if metric.MigProfile != "" { + jobMapKey = fmt.Sprintf("%s.%s", metric.GPU, metric.GPUInstanceID) + } else { + jobMapKey = metric.GPU + } + + jobs, exists := gpuToJobMap[jobMapKey] + if exists && len(jobs) != 0 { for _, job := range jobs { modifiedMetric, err := utils.DeepCopy(metric) @@ -173,9 +182,9 @@ func getGPUFiles(dirPath string) ([]string, error) { continue // Skip directories } - _, err = strconv.Atoi(file.Name()) + _, err = strconv.ParseFloat(file.Name(), 64) if err != nil { - slog.Debug(fmt.Sprintf("HPC mapper: file %q name doesn't match with GPU ID convention", file.Name())) + slog.Debug(fmt.Sprintf("HPC mapper: file %q name doesn't match with or . convention", file.Name())) continue } mappingFiles = append(mappingFiles, file.Name()) diff --git a/internal/pkg/transformation/hpc_test.go b/internal/pkg/transformation/hpc_test.go index 0d507620..77a8250c 100644 --- a/internal/pkg/transformation/hpc_test.go +++ b/internal/pkg/transformation/hpc_test.go @@ -66,6 +66,13 @@ func TestHPCProcess(t *testing.T) { mDirEntryGPU1.EXPECT().Info().Return(mFileInfoGPU1, nil).AnyTimes() mDirEntryGPU1.EXPECT().Name().Return("1").AnyTimes() + mFileInfoGPU3MIG0 := mockos.NewMockFileInfo(ctrl) + mFileInfoGPU3MIG0.EXPECT().IsDir().Return(false).AnyTimes() + + mDirEntryGPU3MIG0 := mockos.NewMockDirEntry(ctrl) + mDirEntryGPU3MIG0.EXPECT().Info().Return(mFileInfoGPU3MIG0, nil).AnyTimes() + mDirEntryGPU3MIG0.EXPECT().Name().Return("3.0").AnyTimes() + mFileInfoDir := mockos.NewMockFileInfo(ctrl) mFileInfoDir.EXPECT().IsDir().Return(true).AnyTimes() @@ -86,6 +93,7 @@ func TestHPCProcess(t *testing.T) { Return([]fs.DirEntry{ mDirEntryGPU0, mDirEntryGPU1, + mDirEntryGPU3MIG0, mDirEntryDir, mDirEntryDamagedFile, }, nil).AnyTimes() @@ -101,8 +109,14 @@ func TestHPCProcess(t *testing.T) { _, _ = slurm1.WriteString("job2-1\n") slurm1.Close() + slurm3dot0, err := realOS.CreateTemp("", "slurm3dot0") + require.NoError(t, err) + _, _ = slurm3dot0.WriteString("job3.0-0") + slurm3dot0.Close() + mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/0")).Return(realOS.Open(slurm0.Name())) mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/1")).Return(realOS.Open(slurm1.Name())) + mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/3.0")).Return(realOS.Open(slurm3dot0.Name())) os = mOS return func() { @@ -111,13 +125,15 @@ func TestHPCProcess(t *testing.T) { _ = realOS.Remove(slurm0.Name()) slurm1.Close() _ = realOS.Remove(slurm1.Name()) + slurm3dot0.Close() + _ = realOS.Remove(slurm3dot0.Name()) } }, assertion: func(t *testing.T, mbc collector.MetricsByCounter) { require.Len(t, mbc, 1, "metrics are expected for a single counter only.") // We get metric value with 0 index metricValues := mbc[reflect.ValueOf(mbc).MapKeys()[0].Interface().(counters.Counter)] - require.Len(t, metricValues, 4, "received unexpected number of metric values.") + require.Len(t, metricValues, 5, "received unexpected number of metric values.") // Sort metrics by GPU ID slices.SortFunc(metricValues, func(a, b collector.Metric) int { return cmp.Compare(a.GPU, b.GPU) @@ -137,6 +153,11 @@ func TestHPCProcess(t *testing.T) { assert.Equal(t, "2", metricValues[3].GPU) assert.Equal(t, "1984", metricValues[3].Value) assert.NotContains(t, metricValues[3].Attributes, hpcJobAttribute) + + assert.Equal(t, "3", metricValues[4].GPU) + assert.Equal(t, "0", metricValues[4].GPUInstanceID) + assert.Equal(t, "123", metricValues[4].Value) + assert.Equal(t, "job3.0-0", metricValues[4].Attributes[hpcJobAttribute]) }, }, } @@ -196,6 +217,21 @@ func TestHPCProcess(t *testing.T) { Attributes: map[string]string{}, }) + metrics[counter] = append(metrics[counter], collector.Metric{ + GPU: "3", + GPUUUID: uuid.New().String(), + GPUDevice: "nvidia2", + GPUInstanceID: "0", + Value: "123", + MigProfile: "3g.70gb", + Counter: counters.Counter{ + FieldID: 1001, + FieldName: "DCGM_FI_PROF_GR_ENGINE_ACTIVE", + PromType: "gauge", + }, + Attributes: map[string]string{}, + }) + mapper := newHPCMapper(tt.config) err := mapper.Process(metrics, nil) if tt.wantErr != nil && !tt.wantErr(t, err, fmt.Sprintf("hpcMapper.Process(%v,%v)", metrics, nil)) { From 24e05d93724c3e707f83d153da76b789f1a788e4 Mon Sep 17 00:00:00 2001 From: Nick Curry Date: Tue, 7 Apr 2026 14:37:52 -0500 Subject: [PATCH 2/3] Use regex for file name parsing --- internal/pkg/transformation/hpc.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/internal/pkg/transformation/hpc.go b/internal/pkg/transformation/hpc.go index 33d664b4..36e6f051 100644 --- a/internal/pkg/transformation/hpc.go +++ b/internal/pkg/transformation/hpc.go @@ -22,7 +22,7 @@ import ( "log/slog" sysOS "os" "path" - "strconv" + "regexp" "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" @@ -160,6 +160,10 @@ func readFile(path string) ([]string, error) { return jobs, nil } +// validGPUFile matches Slurm job mapping filenames: plain GPU IDs (e.g. "0", "1") +// or MIG instance IDs in . format (e.g. "2.0", "3.10"). +var validGPUFile = regexp.MustCompile(`^\d+(\.\d+)?$`) + func getGPUFiles(dirPath string) ([]string, error) { files, err := os.ReadDir(dirPath) if err != nil { @@ -182,8 +186,7 @@ func getGPUFiles(dirPath string) ([]string, error) { continue // Skip directories } - _, err = strconv.ParseFloat(file.Name(), 64) - if err != nil { + if !validGPUFile.MatchString(file.Name()) { slog.Debug(fmt.Sprintf("HPC mapper: file %q name doesn't match with or . convention", file.Name())) continue } From 9899254e3393e42a27ec25532a1bb1dca12c8ac5 Mon Sep 17 00:00:00 2001 From: Nick Curry Date: Tue, 7 Apr 2026 14:37:58 -0500 Subject: [PATCH 3/3] Add unit tests --- internal/pkg/transformation/hpc_test.go | 72 +++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/internal/pkg/transformation/hpc_test.go b/internal/pkg/transformation/hpc_test.go index 77a8250c..950f22b9 100644 --- a/internal/pkg/transformation/hpc_test.go +++ b/internal/pkg/transformation/hpc_test.go @@ -242,6 +242,78 @@ func TestHPCProcess(t *testing.T) { } } +func TestGetGPUFiles(t *testing.T) { + tests := []struct { + name string + files []string + expected []string + }{ + { + name: "plain GPU IDs", + files: []string{"0", "1", "42"}, + expected: []string{"0", "1", "42"}, + }, + { + name: "MIG GPU.instance IDs", + files: []string{"0.0", "2.1", "3.10"}, + expected: []string{"0.0", "2.1", "3.10"}, + }, + { + name: "mixed valid files", + files: []string{"0", "1", "2.0", "3.1"}, + expected: []string{"0", "1", "2.0", "3.1"}, + }, + { + name: "rejects non-numeric names", + files: []string{"foo", "bar.txt", ".gitkeep"}, + expected: nil, + }, + { + name: "rejects float-like strings", + files: []string{"1e5", "NaN", "Inf", "+3", "-2", ".5"}, + expected: nil, + }, + { + name: "rejects too many dots", + files: []string{"1.2.3", "0.1.2"}, + expected: nil, + }, + { + name: "mixed valid and invalid", + files: []string{"0", "NaN", "2.0", "foo", "1e5", "3"}, + expected: []string{"0", "2.0", "3"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mOS := mockos.NewMockOS(ctrl) + + var dirEntries []fs.DirEntry + for _, name := range tt.files { + mFileInfo := mockos.NewMockFileInfo(ctrl) + mFileInfo.EXPECT().IsDir().Return(false).AnyTimes() + + mDirEntry := mockos.NewMockDirEntry(ctrl) + mDirEntry.EXPECT().Info().Return(mFileInfo, nil).AnyTimes() + mDirEntry.EXPECT().Name().Return(name).AnyTimes() + + dirEntries = append(dirEntries, mDirEntry) + } + + mOS.EXPECT().ReadDir(gomock.Eq("/test/dir")).Return(dirEntries, nil) + + os = mOS + defer func() { os = osinterface.RealOS{} }() + + result, err := getGPUFiles("/test/dir") + require.NoError(t, err) + assert.Equal(t, tt.expected, result) + }) + } +} + func TestHPCName(t *testing.T) { assert.Equal(t, "hpcMapper", newHPCMapper(&appconfig.Config{}).Name()) }