Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions api/config/v1/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,13 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) {
return config, nil
}

// DisableResourceNamingInConfig temporarily disable the resource renaming feature of the plugin.
// This may be reenabled in a future release.
// DisableResourceNamingInConfig disables the top-level resource renaming
// feature (config.Resources.GPUs / config.Resources.MIGs) and applies
// default values to the per-resource sharing entries.
//
// Per-entry Rename and Devices selection in Sharing.TimeSlicing and
// Sharing.MPS are preserved as the user configured them; only unset fields
// receive defaults.
func DisableResourceNamingInConfig(config *Config) {
// Disable resource renaming through config.Resource
if len(config.Resources.GPUs) > 0 || len(config.Resources.MIGs) > 0 {
Expand All @@ -88,10 +93,9 @@ func DisableResourceNamingInConfig(config *Config) {
config.Resources.GPUs = nil
config.Resources.MIGs = nil

// Disable renaming / device selection in Sharing.TimeSlicing.Resources
config.Sharing.TimeSlicing.disableResoureRenaming("timeSlicing")
// Disable renaming / device selection in Sharing.MPS.Resources
config.Sharing.MPS.disableResoureRenaming("mps")
// Apply defaults (auto-rename, default Devices.All) to sharing entries.
config.Sharing.TimeSlicing.applyDefaults()
config.Sharing.MPS.applyDefaults()
}

// parseConfig parses a config file as either YAML of JSON and unmarshals it into a Config struct.
Expand Down
78 changes: 78 additions & 0 deletions api/config/v1/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,84 @@ sharing:
require.False(t, *config.Sharing.MPS.FailRequestsGreaterThanOne)
}

func TestDisableResourceNamingInConfig_PreservesPerDeviceTimeSlicingSelection(t *testing.T) {
cfg := &Config{
Sharing: Sharing{
TimeSlicing: ReplicatedResources{
Resources: []ReplicatedResource{
{
Name: ResourceName("nvidia.com/gpu"),
Rename: ResourceName("nvidia.com/gpu-light"),
Devices: ReplicatedDevices{List: []ReplicatedDeviceRef{"0"}},
Replicas: 2,
},
{
Name: ResourceName("nvidia.com/gpu"),
Rename: ResourceName("nvidia.com/gpu-heavy"),
Devices: ReplicatedDevices{List: []ReplicatedDeviceRef{"1"}},
Replicas: 4,
},
},
},
},
}

DisableResourceNamingInConfig(cfg)

require.Len(t, cfg.Sharing.TimeSlicing.Resources, 2)

first := cfg.Sharing.TimeSlicing.Resources[0]
require.Equal(t, ResourceName("nvidia.com/gpu-light"), first.Rename,
"expected per-entry Rename to be preserved")
require.False(t, first.Devices.All,
"expected per-entry Devices.List to remain (Devices.All should NOT be forced)")
require.Equal(t, []ReplicatedDeviceRef{"0"}, first.Devices.List,
"expected Devices.List to be preserved")

second := cfg.Sharing.TimeSlicing.Resources[1]
require.Equal(t, ResourceName("nvidia.com/gpu-heavy"), second.Rename)
require.False(t, second.Devices.All)
require.Equal(t, []ReplicatedDeviceRef{"1"}, second.Devices.List)
}

func TestDisableResourceNamingInConfig_PreservesPerDeviceMPSSelection(t *testing.T) {
cfg := &Config{
Sharing: Sharing{
MPS: &ReplicatedResources{
Resources: []ReplicatedResource{
{
Name: ResourceName("nvidia.com/gpu"),
Rename: ResourceName("nvidia.com/gpu-light"),
Devices: ReplicatedDevices{List: []ReplicatedDeviceRef{"0"}},
Replicas: 2,
},
{
Name: ResourceName("nvidia.com/gpu"),
Rename: ResourceName("nvidia.com/gpu-heavy"),
Devices: ReplicatedDevices{List: []ReplicatedDeviceRef{"1"}},
Replicas: 4,
},
},
},
},
}

DisableResourceNamingInConfig(cfg)

require.NotNil(t, cfg.Sharing.MPS)
require.Len(t, cfg.Sharing.MPS.Resources, 2)

first := cfg.Sharing.MPS.Resources[0]
require.Equal(t, ResourceName("nvidia.com/gpu-light"), first.Rename)
require.False(t, first.Devices.All)
require.Equal(t, []ReplicatedDeviceRef{"0"}, first.Devices.List)

second := cfg.Sharing.MPS.Resources[1]
require.Equal(t, ResourceName("nvidia.com/gpu-heavy"), second.Rename)
require.False(t, second.Devices.All)
require.Equal(t, []ReplicatedDeviceRef{"1"}, second.Devices.List)
}

func newConfigForTest(t *testing.T, contents string) (*Config, error) {
t.Helper()

Expand Down
34 changes: 12 additions & 22 deletions api/config/v1/replicas.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"strings"

"github.com/google/uuid"
"k8s.io/klog/v2"
)

// ReplicatedResources defines generic options for replicating devices.
Expand All @@ -33,36 +32,27 @@ type ReplicatedResources struct {
Resources []ReplicatedResource `json:"resources,omitempty" yaml:"resources,omitempty"`
}

func (rrs *ReplicatedResources) disableResoureRenaming(id string) {
// applyDefaults fills in default values for entries in rrs.Resources, leaving
// any user-supplied Rename or Devices selection in place. A nil receiver is a
// no-op.
//
// Two defaults are applied:
// - If RenameByDefault is true and an entry has no explicit Rename, the
// entry's Rename is set to the default shared name for its Name.
// - If an entry leaves Devices unset (no All / Count / List), Devices.All
// is set to true so that the entry replicates every matching device.
func (rrs *ReplicatedResources) applyDefaults() {
if rrs == nil {
return
}
renameByDefault := rrs.RenameByDefault
setsNonDefaultRename := false
setsDevices := false
for i, r := range rrs.Resources {
if !renameByDefault && r.Rename != "" {
setsNonDefaultRename = true
rrs.Resources[i].Rename = ""
}
if renameByDefault && r.Rename != r.Name.DefaultSharedRename() {
setsNonDefaultRename = true
if rrs.RenameByDefault && r.Rename == "" {
rrs.Resources[i].Rename = r.Name.DefaultSharedRename()
}
if !r.Devices.All {
setsDevices = true
if !r.Devices.All && r.Devices.Count == 0 && len(r.Devices.List) == 0 {
rrs.Resources[i].Devices.All = true
rrs.Resources[i].Devices.Count = 0
rrs.Resources[i].Devices.List = nil
}
}
if setsNonDefaultRename {
klog.Warningf("Setting the 'rename' field in sharing.%s.resources is not yet supported in the config. Ignoring...", id)
}
if setsDevices {
klog.Warningf("Customizing the 'devices' field in sharing.%s.resources is not yet supported in the config. Ignoring...", id)
}

}

func (rrs *ReplicatedResources) isReplicated() bool {
Expand Down
34 changes: 9 additions & 25 deletions internal/rm/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,42 +33,26 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
return nil, fmt.Errorf("not enough available devices to satisfy allocation")
}

// For each candidate device, build a mapping of (stripped) device ID to
// total / available replicas for that device.
replicas := make(map[string]*struct{ total, available int })
// Count the number of currently-available replicas per underlying device.
replicas := make(map[string]int)
for _, c := range candidates {
id := AnnotatedID(c).GetID()
if _, exists := replicas[id]; !exists {
replicas[id] = &struct{ total, available int }{}
}
replicas[id].available++
}
for d := range r.devices {
id := AnnotatedID(d).GetID()
if _, exists := replicas[id]; !exists {
continue
}
replicas[id].total++
replicas[AnnotatedID(c).GetID()]++
}

// Grab the set of 'needed' devices one-by-one from the candidates list.
// Before selecting each candidate, first sort the candidate list using the
// replicas map above. After sorting, the first element in the list will
// contain the device with the least difference between total and available
// replications (based on what's already been allocated). Add this device
// to the list of devices to allocate, remove it from the candidate list,
// down its available count in the replicas map, and repeat.
// Before selecting each candidate, sort the list so that the device with
// the most remaining replicas comes first. This balances allocations
// across devices, including across devices with heterogeneous replica
// counts.
var devices []string
for i := 0; i < needed; i++ {
sort.Slice(candidates, func(i, j int) bool {
iid := AnnotatedID(candidates[i]).GetID()
jid := AnnotatedID(candidates[j]).GetID()
idiff := replicas[iid].total - replicas[iid].available
jdiff := replicas[jid].total - replicas[jid].available
return idiff < jdiff
return replicas[iid] > replicas[jid]
})
id := AnnotatedID(candidates[0]).GetID()
replicas[id].available--
replicas[id]--
devices = append(devices, candidates[0])
candidates = candidates[1:]
}
Expand Down
117 changes: 117 additions & 0 deletions internal/rm/allocate_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package rm

import (
"testing"

"github.com/stretchr/testify/require"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

func makeReplicatedDevices(t *testing.T, gpuToReplicas map[string]int) Devices {
t.Helper()
ds := make(Devices)
for gpu, n := range gpuToReplicas {
for i := 0; i < n; i++ {
annotated := string(NewAnnotatedID(gpu, i))
ds[annotated] = &Device{
Device: pluginapi.Device{ID: annotated},
Index: gpu,
Replicas: n,
}
}
}
return ds
}

func countPerGPU(annotatedIDs []string) map[string]int {
counts := make(map[string]int)
for _, id := range annotatedIDs {
counts[AnnotatedID(id).GetID()]++
}
return counts
}

func TestDistributedAlloc_HeterogeneousReplicas_RespectsCapacity(t *testing.T) {
devices := makeReplicatedDevices(t, map[string]int{
"GPU-0": 2,
"GPU-1": 8,
})
r := &resourceManager{devices: devices}

available := []string{
"GPU-0::0", "GPU-0::1",
"GPU-1::0", "GPU-1::1", "GPU-1::2", "GPU-1::3",
"GPU-1::4", "GPU-1::5", "GPU-1::6", "GPU-1::7",
}

allocated, err := r.distributedAlloc(available, nil, 4)
require.NoError(t, err)
require.Len(t, allocated, 4)

counts := countPerGPU(allocated)
require.Equalf(t, 0, counts["GPU-0"],
"expected the 2-replica GPU to be left untouched while the 8-replica GPU still has capacity; got: %v",
counts)
require.Equalf(t, 4, counts["GPU-1"],
"expected all 4 picks from the 8-replica GPU; got: %v", counts)
}

func TestDistributedAlloc_HomogeneousReplicas_BalancesEvenly(t *testing.T) {
devices := makeReplicatedDevices(t, map[string]int{
"GPU-0": 2,
"GPU-1": 2,
})
r := &resourceManager{devices: devices}

available := []string{
"GPU-0::0", "GPU-0::1",
"GPU-1::0", "GPU-1::1",
}

allocated, err := r.distributedAlloc(available, nil, 2)
require.NoError(t, err)
require.Len(t, allocated, 2)

counts := countPerGPU(allocated)
require.Equalf(t, 1, counts["GPU-0"],
"expected balanced distribution; got: %v", counts)
require.Equalf(t, 1, counts["GPU-1"],
"expected balanced distribution; got: %v", counts)
}

func TestDistributedAlloc_HomogeneousReplicas_FullCapacity(t *testing.T) {
devices := makeReplicatedDevices(t, map[string]int{
"GPU-0": 2,
"GPU-1": 2,
})
r := &resourceManager{devices: devices}

available := []string{
"GPU-0::0", "GPU-0::1",
"GPU-1::0", "GPU-1::1",
}

allocated, err := r.distributedAlloc(available, nil, 4)
require.NoError(t, err)
require.Len(t, allocated, 4)

counts := countPerGPU(allocated)
require.Equalf(t, 2, counts["GPU-0"], "expected 2 from GPU-0; got: %v", counts)
require.Equalf(t, 2, counts["GPU-1"], "expected 2 from GPU-1; got: %v", counts)
}