Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ The DCGM-exporter can include High-Performance Computing (HPC) job information i

These mapping files follow a specific format:

* Each file is named after a unique GPU ID (e.g., 0, 1, 2, etc.).
* Each line in the file contains JOB IDs that run on the corresponding GPU.
* Each file is named after either a unique GPU ID or a unique GPU ID and a GPU instance (MIG) ID separated with a "." (e.g., 0, 1, 2.0, 2.1, 3, etc.).
* Each line in the file contains JOB IDs that run on the corresponding GPU/MIG instance.

#### Enabling HPC Job Mapping on DCGM-Exporter

Expand Down
22 changes: 17 additions & 5 deletions internal/pkg/transformation/hpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (
"log/slog"
sysOS "os"
"path"
"strconv"
"regexp"

"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/collector"
Expand Down Expand Up @@ -94,7 +94,16 @@ func (p *hpcMapper) Process(metrics collector.MetricsByCounter, _ deviceinfo.Pro
for counter := range metrics {
var modifiedMetrics []collector.Metric
for _, metric := range metrics[counter] {
jobs, exists := gpuToJobMap[metric.GPU]

var jobMapKey string
if metric.MigProfile != "" {
jobMapKey = fmt.Sprintf("%s.%s", metric.GPU, metric.GPUInstanceID)
} else {
jobMapKey = metric.GPU
}

jobs, exists := gpuToJobMap[jobMapKey]

if exists && len(jobs) != 0 {
for _, job := range jobs {
modifiedMetric, err := utils.DeepCopy(metric)
Expand Down Expand Up @@ -151,6 +160,10 @@ func readFile(path string) ([]string, error) {
return jobs, nil
}

// validGPUFile matches Slurm job mapping filenames: plain GPU IDs (e.g. "0", "1")
// or MIG instance IDs in <GPU_ID>.<GPU_INSTANCE_ID> format (e.g. "2.0", "3.10").
var validGPUFile = regexp.MustCompile(`^\d+(\.\d+)?$`)

func getGPUFiles(dirPath string) ([]string, error) {
files, err := os.ReadDir(dirPath)
if err != nil {
Expand All @@ -173,9 +186,8 @@ func getGPUFiles(dirPath string) ([]string, error) {
continue // Skip directories
}

_, err = strconv.Atoi(file.Name())
if err != nil {
slog.Debug(fmt.Sprintf("HPC mapper: file %q name doesn't match with GPU ID convention", file.Name()))
if !validGPUFile.MatchString(file.Name()) {
slog.Debug(fmt.Sprintf("HPC mapper: file %q name doesn't match with <GPU_ID> or <GPU_ID>.<GPU_INSTANCE_ID> convention", file.Name()))
continue
}
mappingFiles = append(mappingFiles, file.Name())
Expand Down
110 changes: 109 additions & 1 deletion internal/pkg/transformation/hpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ func TestHPCProcess(t *testing.T) {
mDirEntryGPU1.EXPECT().Info().Return(mFileInfoGPU1, nil).AnyTimes()
mDirEntryGPU1.EXPECT().Name().Return("1").AnyTimes()

mFileInfoGPU3MIG0 := mockos.NewMockFileInfo(ctrl)
mFileInfoGPU3MIG0.EXPECT().IsDir().Return(false).AnyTimes()

mDirEntryGPU3MIG0 := mockos.NewMockDirEntry(ctrl)
mDirEntryGPU3MIG0.EXPECT().Info().Return(mFileInfoGPU3MIG0, nil).AnyTimes()
mDirEntryGPU3MIG0.EXPECT().Name().Return("3.0").AnyTimes()

mFileInfoDir := mockos.NewMockFileInfo(ctrl)
mFileInfoDir.EXPECT().IsDir().Return(true).AnyTimes()

Expand All @@ -86,6 +93,7 @@ func TestHPCProcess(t *testing.T) {
Return([]fs.DirEntry{
mDirEntryGPU0,
mDirEntryGPU1,
mDirEntryGPU3MIG0,
mDirEntryDir,
mDirEntryDamagedFile,
}, nil).AnyTimes()
Expand All @@ -101,8 +109,14 @@ func TestHPCProcess(t *testing.T) {
_, _ = slurm1.WriteString("job2-1\n")
slurm1.Close()

slurm3dot0, err := realOS.CreateTemp("", "slurm3dot0")
require.NoError(t, err)
_, _ = slurm3dot0.WriteString("job3.0-0")
slurm3dot0.Close()

mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/0")).Return(realOS.Open(slurm0.Name()))
mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/1")).Return(realOS.Open(slurm1.Name()))
mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/3.0")).Return(realOS.Open(slurm3dot0.Name()))

os = mOS
return func() {
Expand All @@ -111,13 +125,15 @@ func TestHPCProcess(t *testing.T) {
_ = realOS.Remove(slurm0.Name())
slurm1.Close()
_ = realOS.Remove(slurm1.Name())
slurm3dot0.Close()
_ = realOS.Remove(slurm3dot0.Name())
}
},
assertion: func(t *testing.T, mbc collector.MetricsByCounter) {
require.Len(t, mbc, 1, "metrics are expected for a single counter only.")
// We get metric value with 0 index
metricValues := mbc[reflect.ValueOf(mbc).MapKeys()[0].Interface().(counters.Counter)]
require.Len(t, metricValues, 4, "received unexpected number of metric values.")
require.Len(t, metricValues, 5, "received unexpected number of metric values.")
// Sort metrics by GPU ID
slices.SortFunc(metricValues, func(a, b collector.Metric) int {
return cmp.Compare(a.GPU, b.GPU)
Expand All @@ -137,6 +153,11 @@ func TestHPCProcess(t *testing.T) {
assert.Equal(t, "2", metricValues[3].GPU)
assert.Equal(t, "1984", metricValues[3].Value)
assert.NotContains(t, metricValues[3].Attributes, hpcJobAttribute)

assert.Equal(t, "3", metricValues[4].GPU)
assert.Equal(t, "0", metricValues[4].GPUInstanceID)
assert.Equal(t, "123", metricValues[4].Value)
assert.Equal(t, "job3.0-0", metricValues[4].Attributes[hpcJobAttribute])
},
},
}
Expand Down Expand Up @@ -196,6 +217,21 @@ func TestHPCProcess(t *testing.T) {
Attributes: map[string]string{},
})

metrics[counter] = append(metrics[counter], collector.Metric{
GPU: "3",
GPUUUID: uuid.New().String(),
GPUDevice: "nvidia2",
GPUInstanceID: "0",
Value: "123",
MigProfile: "3g.70gb",
Counter: counters.Counter{
FieldID: 1001,
FieldName: "DCGM_FI_PROF_GR_ENGINE_ACTIVE",
PromType: "gauge",
},
Attributes: map[string]string{},
})

mapper := newHPCMapper(tt.config)
err := mapper.Process(metrics, nil)
if tt.wantErr != nil && !tt.wantErr(t, err, fmt.Sprintf("hpcMapper.Process(%v,%v)", metrics, nil)) {
Expand All @@ -206,6 +242,78 @@ func TestHPCProcess(t *testing.T) {
}
}

func TestGetGPUFiles(t *testing.T) {
tests := []struct {
name string
files []string
expected []string
}{
{
name: "plain GPU IDs",
files: []string{"0", "1", "42"},
expected: []string{"0", "1", "42"},
},
{
name: "MIG GPU.instance IDs",
files: []string{"0.0", "2.1", "3.10"},
expected: []string{"0.0", "2.1", "3.10"},
},
{
name: "mixed valid files",
files: []string{"0", "1", "2.0", "3.1"},
expected: []string{"0", "1", "2.0", "3.1"},
},
{
name: "rejects non-numeric names",
files: []string{"foo", "bar.txt", ".gitkeep"},
expected: nil,
},
{
name: "rejects float-like strings",
files: []string{"1e5", "NaN", "Inf", "+3", "-2", ".5"},
expected: nil,
},
{
name: "rejects too many dots",
files: []string{"1.2.3", "0.1.2"},
expected: nil,
},
{
name: "mixed valid and invalid",
files: []string{"0", "NaN", "2.0", "foo", "1e5", "3"},
expected: []string{"0", "2.0", "3"},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ctrl := gomock.NewController(t)
mOS := mockos.NewMockOS(ctrl)

var dirEntries []fs.DirEntry
for _, name := range tt.files {
mFileInfo := mockos.NewMockFileInfo(ctrl)
mFileInfo.EXPECT().IsDir().Return(false).AnyTimes()

mDirEntry := mockos.NewMockDirEntry(ctrl)
mDirEntry.EXPECT().Info().Return(mFileInfo, nil).AnyTimes()
mDirEntry.EXPECT().Name().Return(name).AnyTimes()

dirEntries = append(dirEntries, mDirEntry)
}

mOS.EXPECT().ReadDir(gomock.Eq("/test/dir")).Return(dirEntries, nil)

os = mOS
defer func() { os = osinterface.RealOS{} }()

result, err := getGPUFiles("/test/dir")
require.NoError(t, err)
assert.Equal(t, tt.expected, result)
})
}
}

func TestHPCName(t *testing.T) {
assert.Equal(t, "hpcMapper", newHPCMapper(&appconfig.Config{}).Name())
}
Loading