Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 55 additions & 11 deletions pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,9 @@ func (s *Scheduler) collectConsumedGPUUUIDsByApp(appName string, currentPod *cor
}

func (s *Scheduler) selectDynamicGPUCandidates(
nodes map[string]*util.NodeInfo,
eligibleNodes map[string]struct{},
uuidToNode map[string]string,
appBoundUUIDs map[string]struct{},
alreadySelected map[string]struct{},
consumedByApp map[string]struct{},
Expand All @@ -752,16 +755,6 @@ func (s *Scheduler) selectDynamicGPUCandidates(
if requiredCount <= 0 {
return nil, nil
}
nodes, err := s.ListNodes()
if err != nil {
return nil, err
}
uuidToNode := make(map[string]string)
for _, n := range nodes {
for _, d := range n.Devices {
uuidToNode[d.ID] = n.Node.Name
}
}
// TODO: this needs more flexibility once we allow an app to be bound to
// multiple nodes; GPUs already consumed by this app should not be
// treated as constraints.
Expand All @@ -772,6 +765,9 @@ func (s *Scheduler) selectDynamicGPUCandidates(
pinnedNode := ""
for uuid := range appBoundUUIDs {
if nodeName, ok := uuidToNode[uuid]; ok {
if _, eligible := eligibleNodes[nodeName]; !eligible {
continue
}
pinnedNode = nodeName
break
}
Expand All @@ -794,6 +790,9 @@ func (s *Scheduler) selectDynamicGPUCandidates(
timeSlicingCandidates := make([]string, 0)

for _, n := range nodes {
if _, eligible := eligibleNodes[n.Node.Name]; !eligible {
continue
}
if pinnedNode != "" && n.Node.Name != pinnedNode {
continue
}
Expand Down Expand Up @@ -961,12 +960,31 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
}, nil
}

eligibleNodes := make(map[string]struct{})
if args.NodeNames != nil {
for _, nodeName := range *args.NodeNames {
eligibleNodes[nodeName] = struct{}{}
}
}

bindings, err := s.ListGPUBindings()
if err != nil {
klog.ErrorS(err, "Failed to list GPUBindings for Filter", "pod", klog.KObj(args.Pod))
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err)
return nil, err
}
nodes, err := s.ListNodes()
if err != nil {
klog.ErrorS(err, "Failed to list nodes for Filter", "pod", klog.KObj(args.Pod))
s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err)
return nil, err
}
uuidToNode := make(map[string]string)
for _, node := range nodes {
for _, dev := range node.Devices {
uuidToNode[dev.ID] = node.Node.Name
}
}

appBoundByUUID := make(map[string]*v1alpha1.GPUBinding)
appBoundUUIDs := make(map[string]struct{})
Expand All @@ -975,12 +993,35 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi
if b.Spec.AppName != appName || b.Spec.UUID == "" {
continue
}
matchedPod := b.MatchPod(args.Pod)
// todo: restrict binding operation on specific nodes
// bindingNode, ok := uuidToNode[b.Spec.UUID]
// if !ok {
// if matchedPod {
// err := fmt.Errorf("GPU binding %s references unknown GPU %s for pod %s/%s", b.Name, b.Spec.UUID, args.Pod.Namespace, args.Pod.Name)
// s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err)
// return &extenderv1.ExtenderFilterResult{
// FailedNodes: map[string]string{},
// }, nil
// }
// continue
// }
// if _, eligible := eligibleNodes[bindingNode]; !eligible {
// if matchedPod {
// err := fmt.Errorf("GPU binding %s (uuid=%s) targets node %s, which conflicts with scheduler filtered nodes for pod %s/%s", b.Name, b.Spec.UUID, bindingNode, args.Pod.Namespace, args.Pod.Name)
// s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err)
// return &extenderv1.ExtenderFilterResult{
// FailedNodes: map[string]string{},
// }, nil
// }
// continue
// }
appBoundUUIDs[b.Spec.UUID] = struct{}{}
if _, ok := appBoundByUUID[b.Spec.UUID]; !ok {
appBoundByUUID[b.Spec.UUID] = b
}
// todo: maybe we can remove this check, because the pod selector currently only matches the app name
if !b.MatchPod(args.Pod) {
if !matchedPod {
continue
}
matchedBindings = append(matchedBindings, b)
Expand Down Expand Up @@ -1042,6 +1083,9 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi

if nvidiaSummary.requested > 0 && len(selectedUUIDs) < nvidiaSummary.requested {
dynamicCandidates, err := s.selectDynamicGPUCandidates(
nodes,
eligibleNodes,
uuidToNode,
appBoundUUIDs,
selectedUUIDSet,
consumedByApp,
Expand Down