From bf628a11bb8cc10743f7669a8b29f9344391c02f Mon Sep 17 00:00:00 2001 From: Mesut Oezdil <114185853+mesutoezdil@users.noreply.github.com> Date: Fri, 22 May 2026 05:40:25 +0300 Subject: [PATCH] fix(quota): clamp Used to zero in RmUsage to prevent negative tracking (#1880) * fix(quota): clamp Used to zero in RmUsage to prevent negative tracking Quota.Used could go negative when RmUsage was called for a pod that had no matching AddUsage entry. FitQuota checks Used+alloc>Limit so a negative Used value allows scheduling beyond the namespace quota. PR 1400 added a second RmUsage call site in the terminated-pod path of onAddPod. In edge cases both onAddPod and onDelPod can attempt to remove the same pod from the quota, making the underflow reachable. Guard against this by clamping Used to zero on subtraction. Signed-off-by: mesutoezdil * fix(quota): single map lookup and nil guard in RmUsage Replace the double lookup with a single one and add a nil check before dereferencing the Quota pointer. Signed-off-by: mesutoezdil * fix(quota): prevent double RmUsage via atomic TakeAndDeletePod Add TakeAndDeletePod to PodManager: get and delete in a single lock acquisition so only one caller wins. Use it in both onAddPod (terminated path) and onDelPod to guarantee RmUsage is called exactly once per pod. Revert the max(0,...) clamp in RmUsage since the double-call is now prevented at the source. Signed-off-by: mesutoezdil * style: fix gofmt formatting in quota_test.go Signed-off-by: mesutoezdil --------- Signed-off-by: mesutoezdil --- pkg/device/pod_test.go | 18 ++++++++++++++++++ pkg/device/pods.go | 12 ++++++++++++ pkg/device/quota.go | 5 ++--- pkg/scheduler/scheduler.go | 8 ++------ 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pkg/device/pod_test.go b/pkg/device/pod_test.go index e6072531b..fd267042a 100644 --- a/pkg/device/pod_test.go +++ b/pkg/device/pod_test.go @@ -542,3 +542,21 @@ func TestContainerDeviceDeepCopy(t *testing.T) { _, exists := original.CustomInfo["key2"] assert.False(t, exists, "original CustomInfo should not have key2") } + +func TestTakeAndDeletePodIsAtomic(t *testing.T) { + pm := NewPodManager() + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + Name: "p", + Namespace: "ns", + UID: k8stypes.UID("uid-1"), + }} + pm.AddPod(pod, "node-1", PodDevices{}) + + pi1, ok1 := pm.TakeAndDeletePod(pod) + pi2, ok2 := pm.TakeAndDeletePod(pod) + + assert.True(t, ok1) + assert.NotNil(t, pi1) + assert.False(t, ok2) + assert.Nil(t, pi2) +} diff --git a/pkg/device/pods.go b/pkg/device/pods.go index 5b0d916ad..431ea9f4f 100644 --- a/pkg/device/pods.go +++ b/pkg/device/pods.go @@ -119,6 +119,18 @@ func (m *PodManager) GetPod(pod *corev1.Pod) (*PodInfo, bool) { return pi, ok } +func (m *PodManager) TakeAndDeletePod(pod *corev1.Pod) (*PodInfo, bool) { + m.mutex.Lock() + defer m.mutex.Unlock() + + pi, ok := m.pods[pod.UID] + if ok { + delete(m.pods, pod.UID) + klog.InfoS("Pod taken and deleted", "pod", klog.KRef(pod.Namespace, pod.Name), "nodeID", pi.NodeID) + } + return pi, ok +} + func (m *PodManager) ListPodsUID() ([]*corev1.Pod, error) { m.mutex.RLock() defer m.mutex.RUnlock() diff --git a/pkg/device/quota.go b/pkg/device/quota.go index 1a01fe0c9..288a03f71 100644 --- a/pkg/device/quota.go +++ b/pkg/device/quota.go @@ -155,9 +155,8 @@ func (q *QuotaManager) RmUsage(pod *corev1.Pod, podDev PodDevices) { return } for idx, val := range usage { - _, ok = (*dp)[idx] - if ok { - (*dp)[idx].Used -= val + if qInfo, ok := (*dp)[idx]; ok && qInfo != nil { + qInfo.Used -= val } } if klog.V(4).Enabled() { diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index f68f2f1c8..a2c999b63 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -145,11 +145,9 @@ func (s *Scheduler) onAddPod(obj any) { return } if util.IsPodInTerminatedState(pod) { - pi, ok := s.podManager.GetPod(pod) - if ok { + if pi, ok := s.podManager.TakeAndDeletePod(pod); ok { s.quotaManager.RmUsage(pod, pi.Devices) } - s.podManager.DelPod(pod) return } if util.IsPodTerminating(pod) { @@ -195,10 +193,8 @@ func (s *Scheduler) onDelPod(obj any) { if !ok { return } - pi, ok := s.podManager.GetPod(pod) - if ok { + if pi, ok := s.podManager.TakeAndDeletePod(pod); ok { s.quotaManager.RmUsage(pod, pi.Devices) - s.podManager.DelPod(pod) } }