Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions internal/rm/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,35 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
replicas[id].total++
}

// Track how many slots have already been picked from each physical device
// during this allocation. Used as the tie-break sort key below so the
// allocator rotates to a sibling physical device when the underlying
// "used" counts would otherwise tie.
pickedFrom := make(map[string]int)

// Grab the set of 'needed' devices one-by-one from the candidates list.
// Before selecting each candidate, first sort the candidate list using the
// replicas map above. After sorting, the first element in the list will
// contain the device with the least difference between total and available
// replications (based on what's already been allocated). Add this device
// to the list of devices to allocate, remove it from the candidate list,
// down its available count in the replicas map, and repeat.
// replications (based on what's already been allocated). When two devices
// tie on that count, prefer the physical device we have not touched (or
// have touched the least) during this allocation. Add this device to the
// list of devices to allocate, remove it from the candidate list, down
// its available count in the replicas map, and repeat.
var devices []string
for i := 0; i < needed; i++ {
sort.Slice(candidates, func(i, j int) bool {
iid := AnnotatedID(candidates[i]).GetID()
jid := AnnotatedID(candidates[j]).GetID()
idiff := replicas[iid].total - replicas[iid].available
jdiff := replicas[jid].total - replicas[jid].available
return idiff < jdiff
if idiff != jdiff {
return idiff < jdiff
}
return pickedFrom[iid] < pickedFrom[jid]
})
id := AnnotatedID(candidates[0]).GetID()
pickedFrom[id]++
replicas[id].available--
devices = append(devices, candidates[0])
candidates = candidates[1:]
Expand Down
74 changes: 74 additions & 0 deletions internal/rm/allocate_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package rm

import (
"testing"

"github.com/stretchr/testify/require"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// makeReplicatedDevices builds a Devices set for tests from a map of GPU
// identifier to replica count. For each GPU it registers one Device per
// replica index in [0, n), keyed by the string form of NewAnnotatedID(gpu, i).
// Every created Device records the GPU identifier in Index and the replica
// count in Replicas.
func makeReplicatedDevices(t *testing.T, gpuToReplicas map[string]int) Devices {
	t.Helper()

	result := Devices{}
	for gpuID, replicaCount := range gpuToReplicas {
		for replica := 0; replica < replicaCount; replica++ {
			// The annotated ID serves both as the map key and as the
			// plugin-API device ID.
			key := string(NewAnnotatedID(gpuID, replica))

			dev := new(Device)
			dev.Device = pluginapi.Device{ID: key}
			dev.Index = gpuID
			dev.Replicas = replicaCount

			result[key] = dev
		}
	}
	return result
}

// countPerGPU tallies the given annotated IDs by the ID that
// AnnotatedID.GetID returns for each of them. The result maps that ID to the
// number of entries in annotatedIDs sharing it.
func countPerGPU(annotatedIDs []string) map[string]int {
	tally := map[string]int{}
	for idx := range annotatedIDs {
		gpu := AnnotatedID(annotatedIDs[idx]).GetID()
		tally[gpu] = tally[gpu] + 1
	}
	return tally
}

func TestDistributedAlloc_PartiallyAllocated_DistributesAcrossDistinctGPUs(t *testing.T) {
devices := makeReplicatedDevices(t, map[string]int{
"GPU-0": 2,
"GPU-1": 2,
})
r := &resourceManager{devices: devices}

available := []string{
"GPU-0::0", "GPU-0::1",
"GPU-1::1",
}

allocated, err := r.distributedAlloc(available, nil, 2)
require.NoError(t, err)
require.Len(t, allocated, 2)

counts := countPerGPU(allocated)
require.Equalf(t, 1, counts["GPU-0"],
"expected 1 slot from GPU-0 to keep allocations distributed across physical GPUs; got: %v",
counts)
require.Equalf(t, 1, counts["GPU-1"],
"expected 1 slot from GPU-1 (the still-available second physical GPU) instead of stacking both on GPU-0; got: %v",
counts)
}