From 126890129f301e61ac1dce67fa38b97f1a7f3b41 Mon Sep 17 00:00:00 2001 From: Jonathan Meiri <33288957+Meiri28@users.noreply.github.com> Date: Tue, 19 May 2026 17:07:37 +0300 Subject: [PATCH 1/2] test: add failing test for tie-break concentration in distributedAlloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TDD red phase for an issue independent of #1787. Setup: a node with two physical GPUs of equal advertised replica counts, where one slot on the "second" GPU has already been allocated to another pod. A new pod requests two more slots. The function name and docstring of distributedAlloc promise an even spread, but today the function deterministically picks both of the new pod's slots from the GPU with the most remaining replicas — leaving the other physical GPU's available slot untouched. The bug is in the sort tie-break. After the first pick the per-GPU 'used' counts tie across the candidates, and sort.Slice is unstable, so the next iteration ends up picking the next slot on the GPU we just picked from rather than rotating to the sibling GPU that still has capacity. Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: runatom-ai <258621014+runatom-ai@users.noreply.github.com> Signed-off-by: Jonathan Meiri <33288957+Meiri28@users.noreply.github.com> --- internal/rm/allocate_test.go | 74 ++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 internal/rm/allocate_test.go diff --git a/internal/rm/allocate_test.go b/internal/rm/allocate_test.go new file mode 100644 index 000000000..f30a6250d --- /dev/null +++ b/internal/rm/allocate_test.go @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "testing" + + "github.com/stretchr/testify/require" + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +func makeReplicatedDevices(t *testing.T, gpuToReplicas map[string]int) Devices { + t.Helper() + ds := make(Devices) + for gpu, n := range gpuToReplicas { + for i := 0; i < n; i++ { + annotated := string(NewAnnotatedID(gpu, i)) + ds[annotated] = &Device{ + Device: pluginapi.Device{ID: annotated}, + Index: gpu, + Replicas: n, + } + } + } + return ds +} + +func countPerGPU(annotatedIDs []string) map[string]int { + counts := make(map[string]int) + for _, id := range annotatedIDs { + counts[AnnotatedID(id).GetID()]++ + } + return counts +} + +func TestDistributedAlloc_PartiallyAllocated_DistributesAcrossDistinctGPUs(t *testing.T) { + devices := makeReplicatedDevices(t, map[string]int{ + "GPU-0": 2, + "GPU-1": 2, + }) + r := &resourceManager{devices: devices} + + available := []string{ + "GPU-0::0", "GPU-0::1", + "GPU-1::1", + } + + allocated, err := r.distributedAlloc(available, nil, 2) + require.NoError(t, err) + require.Len(t, allocated, 2) + + counts := countPerGPU(allocated) + require.Equalf(t, 1, counts["GPU-0"], + "expected 1 slot from GPU-0 to keep allocations distributed across physical GPUs; got: %v", + counts) + require.Equalf(t, 1, counts["GPU-1"], + "expected 1 slot from GPU-1 (the still-available second physical GPU) instead of stacking both on GPU-0; got: %v", + counts) +} + From 326e38f5839036bb1ec151b7ecdf7f41fd26aa4a Mon Sep 17 00:00:00 2001 From: Jonathan Meiri <33288957+Meiri28@users.noreply.github.com> Date: Tue, 19 May 2026 17:08:41 +0300 Subject: [PATCH 2/2] fix(rm): break distributedAlloc sort-key ties by preferring untouched physical GPUs distributedAlloc sorts candidate replicas by 'used' (total - available) per underlying physical device. When two physical devices end up with the same 'used' count after some picks, sort.Slice is unstable and breaks the tie arbitrarily, which in practice causes the loop to keep picking from whichever device's slots happened to land first in the candidate list rather than rotating to a sibling physical device. That manifests when the available pool starts uneven (e.g., one slot on GPU-1 has already been allocated to another pod). The function name and docstring promise an even spread across replicated GPUs; the tie-break failure deterministically concentrates the new pod's slots on the GPU that had more available replicas, leaving the other physical device(s) untouched. Introduce a pickedFrom map tracking how many slots have been taken from each physical device during this allocation, and consult it as a tie-break sort key. The existing 'used' ordering remains primary; when two devices tie on that, the one we have not touched (or have touched the least) this allocation comes first. Behavior is unchanged whenever 'used' counts differ, including for fresh state and for cases where only one physical device has free slots. Failing tests added in the preceding commit now pass. Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: runatom-ai <258621014+runatom-ai@users.noreply.github.com> Signed-off-by: Jonathan Meiri <33288957+Meiri28@users.noreply.github.com> --- internal/rm/allocate.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/internal/rm/allocate.go b/internal/rm/allocate.go index 166b68e84..8f994c8cf 100644 --- a/internal/rm/allocate.go +++ b/internal/rm/allocate.go @@ -51,13 +51,21 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in replicas[id].total++ } + // Track how many slots have already been picked from each physical device + // during this allocation. Used as the tie-break sort key below so the + // allocator rotates to a sibling physical device when the underlying + // "used" counts would otherwise tie. + pickedFrom := make(map[string]int) + // Grab the set of 'needed' devices one-by-one from the candidates list. // Before selecting each candidate, first sort the candidate list using the // replicas map above. After sorting, the first element in the list will // contain the device with the least difference between total and available - // replications (based on what's already been allocated). Add this device - // to the list of devices to allocate, remove it from the candidate list, - // down its available count in the replicas map, and repeat. + // replications (based on what's already been allocated). When two devices + // tie on that count, prefer the physical device we have not touched (or + // have touched the least) during this allocation. Add this device to the + // list of devices to allocate, remove it from the candidate list, down + // its available count in the replicas map, and repeat. var devices []string for i := 0; i < needed; i++ { sort.Slice(candidates, func(i, j int) bool { @@ -65,9 +73,13 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in jid := AnnotatedID(candidates[j]).GetID() idiff := replicas[iid].total - replicas[iid].available jdiff := replicas[jid].total - replicas[jid].available - return idiff < jdiff + if idiff != jdiff { + return idiff < jdiff + } + return pickedFrom[iid] < pickedFrom[jid] }) id := AnnotatedID(candidates[0]).GetID() + pickedFrom[id]++ replicas[id].available-- devices = append(devices, candidates[0]) candidates = candidates[1:]