From 261ae034713ac9afbc0fae13ee7024e493f6e84c Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Mon, 18 May 2026 09:16:23 -0700 Subject: [PATCH] [toolkit] add support for injecting /dev/dri* devices nodes for gfx MIGs Signed-off-by: Tariq Ibrahim --- go.mod | 2 +- go.sum | 4 +- .../internal/platform-support/dgpu/dgpu.go | 5 ++ .../internal/platform-support/dgpu/nvml.go | 50 +++++++++++++++++-- .../platform-support/dgpu/nvsandboxutils.go | 11 ++-- .../internal/platform-support/dgpu/options.go | 4 +- vendor/modules.txt | 2 +- 7 files changed, 65 insertions(+), 13 deletions(-) diff --git a/go.mod b/go.mod index 50a73b84c..be3da3652 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/NVIDIA/go-gpuallocator v0.6.0 github.com/NVIDIA/go-nvlib v0.10.0 github.com/NVIDIA/go-nvml v0.13.0-1.0.20260212130905-92cf8c963449 - github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260512181421-f3d822e54693 + github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260515194426-db3eab6e775b github.com/fsnotify/fsnotify v1.9.0 github.com/google/renameio v1.0.1 github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index 6fb429bd8..76ff14115 100644 --- a/go.sum +++ b/go.sum @@ -8,8 +8,8 @@ github.com/NVIDIA/go-nvlib v0.10.0 h1:2jbAFmvLBntIc/4iUChI9DzxyYNI92pohXU4kFuNrg github.com/NVIDIA/go-nvlib v0.10.0/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= github.com/NVIDIA/go-nvml v0.13.0-1.0.20260212130905-92cf8c963449 h1:UrArFAaPhj9av2yzEN35CvzWw68BeQjp2MaQFUIoJSU= github.com/NVIDIA/go-nvml v0.13.0-1.0.20260212130905-92cf8c963449/go.mod h1:ahi2psRYoa+wYUBIrZPRO+wJs9lcvMhxSSkjjvsJJNQ= -github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260512181421-f3d822e54693 h1:f0fxvFODhPudJqfiPNNvQ/UfXseivUmN7WQPvJJ0VYo= -github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260512181421-f3d822e54693/go.mod h1:WMJdrwb3QWMjua4xAEg9n0NjyWCRKMSfCXF7lpPsdjg= +github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260515194426-db3eab6e775b h1:41nL4Cqw6WvzwrP5U1IVLHSBTIrS1J0ryaqdbr4ZnvA= +github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260515194426-db3eab6e775b/go.mod h1:WMJdrwb3QWMjua4xAEg9n0NjyWCRKMSfCXF7lpPsdjg= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/dgpu.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/dgpu.go index 6d3636f52..190154662 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/dgpu.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/dgpu.go @@ -72,6 +72,11 @@ func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (dis return nil, err } o.isMigDevice = true + migProfile, err := mig.GetProfile() + if err != nil { + return nil, fmt.Errorf("error getting MIG Profile attributes: %w", err) + } + o.migAttributes = migProfile.GetInfo().Attributes var discoverers []discover.Discover var errs error diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go index bf2fec86f..57c3bcf13 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvml.go @@ -18,6 +18,7 @@ package dgpu import ( "fmt" + "slices" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvml/pkg/nvml" @@ -75,6 +76,7 @@ func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, erro type requiredMigInfo interface { getPlacementInfo() (int, int, int, error) getDevNodePath() (string, error) + getPCIBusID() (string, error) } func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) { @@ -82,6 +84,14 @@ func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, er return nil, fmt.Errorf("error getting MIG capability device paths: %v", o.migCapsError) } + var charDevicePaths []string + + parentPath, err := d.getDevNodePath() + if err != nil { + return nil, err + } + charDevicePaths = append(charDevicePaths, parentPath) + gpu, gi, ci, err := d.getPlacementInfo() if err != nil { return nil, fmt.Errorf("error getting placement info: %w", err) @@ -92,16 +102,44 @@ func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, er if err != nil { return nil, fmt.Errorf("failed to get GI cap device path: %v", err) } + charDevicePaths = append(charDevicePaths, giCapDevicePath) ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) ciCapDevicePath, err := o.migCaps.GetCapDevicePath(ciCap) if err != nil { return nil, fmt.Errorf("failed to get CI cap device path: %v", err) } - - parentPath, err := d.getDevNodePath() - if err != nil { - return nil, err + charDevicePaths = append(charDevicePaths, ciCapDevicePath) + + supportsDRI := slices.Contains(o.migAttributes, "gfx") + if supportsDRI { + pciBusID, err := d.getPCIBusID() + if err != nil { + return nil, fmt.Errorf("error getting PCI info for device: %w", err) + } + + drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) + if err != nil { + return nil, fmt.Errorf("failed to determine DRM devices for %q: %w", pciBusID, err) + } + + charDevicePaths = append(charDevicePaths, drmDeviceNodes...) + deviceNodes := discover.NewCharDeviceDiscoverer( + o.logger, + o.driver.DevRoot, + charDevicePaths, + ) + byPathHooks := &byPathHookDiscoverer{ + logger: o.logger, + devRoot: o.driver.DevRoot, + hookCreator: o.hookCreator, + pciBusID: pciBusID, + deviceNodes: deviceNodes, + } + return discover.Merge( + deviceNodes, + byPathHooks, + ), nil } deviceNodes := discover.NewCharDeviceDiscoverer( @@ -165,3 +203,7 @@ func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) { func (d *toRequiredMigInfo) getDevNodePath() (string, error) { return d.parent.getDevNodePath() } + +func (d *toRequiredMigInfo) getPCIBusID() (string, error) { + return d.parent.GetPCIBusID() +} diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvsandboxutils.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvsandboxutils.go index 4e8eb2c80..620cc1b5b 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvsandboxutils.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/nvsandboxutils.go @@ -19,6 +19,7 @@ package dgpu import ( "fmt" "path/filepath" + "slices" "strings" "github.com/NVIDIA/go-nvml/pkg/nvml" @@ -31,7 +32,7 @@ type nvsandboxutilsDGPU struct { lib nvsandboxutils.Interface uuid string devRoot string - isMig bool + supportsDRI bool hookCreator discover.HookCreator deviceLinks []string } @@ -52,11 +53,13 @@ func (o *options) newNvsandboxutilsDGPUDiscoverer(d UUIDer) (discover.Discover, return nil, fmt.Errorf("failed to get device UUID: %w", nvmlRet) } + supportsDRI := !o.isMigDevice || slices.Contains(o.migAttributes, "gfx") + nvd := nvsandboxutilsDGPU{ lib: o.nvsandboxutilslib, uuid: uuid, devRoot: strings.TrimSuffix(filepath.Clean(o.driver.DevRoot), "/dev"), - isMig: o.isMigDevice, + supportsDRI: supportsDRI, hookCreator: o.hookCreator, } @@ -73,7 +76,7 @@ func (d *nvsandboxutilsDGPU) Devices() ([]discover.Device, error) { for _, info := range gpuFileInfos { switch info.SubType { case nvsandboxutils.NV_DEV_DRI_CARD, nvsandboxutils.NV_DEV_DRI_RENDERD: - if d.isMig { + if !d.supportsDRI { continue } fallthrough @@ -90,7 +93,7 @@ func (d *nvsandboxutilsDGPU) Devices() ([]discover.Device, error) { } devices = append(devices, device) case nvsandboxutils.NV_DEV_DRI_CARD_SYMLINK, nvsandboxutils.NV_DEV_DRI_RENDERD_SYMLINK: - if d.isMig { + if !d.supportsDRI { continue } if info.Flags == nvsandboxutils.NV_FILE_FLAG_CONTENT { diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/options.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/options.go index 7f8bf2403..6301e4ccb 100644 --- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/options.go +++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu/options.go @@ -29,7 +29,9 @@ type options struct { driver *root.Driver hookCreator discover.HookCreator - isMigDevice bool + isMigDevice bool + migAttributes []string + // migCaps stores the MIG capabilities for the system. // If MIG is not available, this is nil. migCaps nvcaps.MigCaps diff --git a/vendor/modules.txt b/vendor/modules.txt index 04df96f47..5083111ec 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -20,7 +20,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids ## explicit; go 1.20 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml -# github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260512181421-f3d822e54693 +# github.com/NVIDIA/nvidia-container-toolkit v1.19.1-0.20260515194426-db3eab6e775b ## explicit; go 1.25.0 github.com/NVIDIA/nvidia-container-toolkit/internal/config/image github.com/NVIDIA/nvidia-container-toolkit/internal/devices