diff --git a/cmd/vGPUmonitor/metrics.go b/cmd/vGPUmonitor/metrics.go index 24574d002..54973df8a 100644 --- a/cmd/vGPUmonitor/metrics.go +++ b/cmd/vGPUmonitor/metrics.go @@ -260,7 +260,7 @@ func (cc ClusterManagerCollector) collectGPUDeviceMetrics(ch chan<- prometheus.M } func (cc ClusterManagerCollector) collectGPUMemoryMetrics(ch chan<- prometheus.Metric, hdev nvml.Device, index int) error { - memory, ret := hdev.GetMemoryInfo() + memory, ret := util.GetCompatibleNVMLMemoryInfo(hdev) if ret != nvml.SUCCESS { return fmt.Errorf("nvml get memory error ret=%d", ret) } diff --git a/docker/Dockerfile.withlib b/docker/Dockerfile.withlib index 900066134..e610c7926 100644 --- a/docker/Dockerfile.withlib +++ b/docker/Dockerfile.withlib @@ -15,6 +15,7 @@ ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_DRIVER_CAPABILITIES=utility ARG VERSION +ARG TARGETARCH LABEL version="$VERSION" LABEL maintainer="opensource@4paradigm.com" COPY ./LICENSE /k8s-vgpu/LICENSE @@ -22,6 +23,12 @@ COPY --from=gobuild /k8s-vgpu/bin /k8s-vgpu/bin COPY --from=gobuild /go/bin/nvidia-mig-parted /k8s-vgpu/bin/ COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh COPY ./lib /k8s-vgpu/lib +RUN set -eux; \ + if [ "$TARGETARCH" = "arm64" ]; then \ + mv -f /k8s-vgpu/lib/nvidia/libvgpu.arm64.so /k8s-vgpu/lib/nvidia/libvgpu.so; \ + else \ + rm -f /k8s-vgpu/lib/nvidia/libvgpu.arm64.so; \ + fi COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh ENV PATH="/k8s-vgpu/bin:${PATH}" diff --git a/lib/nvidia/libvgpu.arm64.so b/lib/nvidia/libvgpu.arm64.so new file mode 100755 index 000000000..932dcc1e8 Binary files /dev/null and b/lib/nvidia/libvgpu.arm64.so differ diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go index f1b765c47..d325c3e68 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go @@ -130,7 +130,7 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { panic(0) } memoryTotal := 0 - memory, ret := ndev.GetMemoryInfo() + memory, ret := util.GetCompatibleNVMLMemoryInfo(ndev) if ret == nvml.SUCCESS { memoryTotal = int(memory.Total) } else { @@ -142,6 +142,11 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { klog.Error("nvml get name error ret=", ret) panic(0) } + defaultShareMode := util.ShareModeTimeSlicing + config, ok := util.GetCompatibleConfigsByDeviceName(Model) + if ok && config.DefaultShareMode != "" { + defaultShareMode = config.DefaultShareMode + } if !strings.Contains(Model, "NVIDIA") { Model = fmt.Sprintf("%v-%v", "NVIDIA", Model) } @@ -184,6 +189,7 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo { Mode: plugin.operatingMode, Health: health, Architecture: int32(architecture), + ShareMode: defaultShareMode, }) klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredmem, Model, numa) } @@ -216,6 +222,20 @@ func (plugin *NvidiaDevicePlugin) RegistrInAnnotation() error { klog.V(4).InfoS("patch nvidia topo score to node", "hami.io/node-nvidia-score", string(data)) annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String() annos[nvidia.RegisterAnnos] = encodeddevices + + // Ensure each discovered device has a sharemode annotation key present. + // Do NOT override an existing value potentially set by scheduler/API handlers. + // If missing, default to time-slicing. + for _, dev := range *devices { + shareModeKey := fmt.Sprintf(util.ShareModeAnnotationTpl, dev.ID) + if node.Annotations != nil { + if _, ok := node.Annotations[shareModeKey]; ok { + continue + } + } + annos[shareModeKey] = dev.ShareMode + } + if len(data) > 0 { annos[nvidia.RegisterGPUPairScore] = string(data) } diff --git a/pkg/scheduler/routes/gpu_manage.go b/pkg/scheduler/routes/gpu_manage.go index 2e3b8cb99..4ffa4c4e3 100644 --- a/pkg/scheduler/routes/gpu_manage.go +++ b/pkg/scheduler/routes/gpu_manage.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "slices" "sort" "strings" "time" @@ -34,9 +35,10 @@ type GPUAppInfo struct { type GPUDetail struct { GPUInfo - Apps []GPUAppInfo `json:"apps"` - MemoryAllocated *int64 `json:"memoryAllocated,omitempty"` - MemoryAvailable *int64 `json:"memoryAvailable,omitempty"` + AllowedShareModes []string `json:"allowedShareModes,omitempty"` + Apps []GPUAppInfo `json:"apps"` + MemoryAllocated *int64 `json:"memoryAllocated,omitempty"` + MemoryAvailable *int64 `json:"memoryAvailable,omitempty"` } type AssignGPURequest struct { @@ -111,11 +113,17 @@ func ListGPUDetails(s *scheduler.Scheduler) httprouter.Handle { for _, node := range nodes { for _, device := range node.Devices { + allowedShareModes := util.DefaultAllowedShareModes + config, ok := util.GetCompatibleConfigsByDeviceName(device.Type) + if ok && len(config.AllowedShareModes) > 0 { + allowedShareModes = config.AllowedShareModes + } uuidToGPUDetails[device.ID] = &GPUDetail{ GPUInfo: GPUInfo{ NodeName: node.Node.Name, DeviceInfo: device, }, + AllowedShareModes: allowedShareModes, } } } @@ -426,6 +434,14 @@ func SwitchGPUMode(s *scheduler.Scheduler) httprouter.Handle { for _, node := range nodes { for _, device := range node.Devices { if device.ID == uuid { + config, ok := util.GetCompatibleConfigsByDeviceName(device.Type) + if ok && len(config.AllowedShareModes) > 0 { + if !slices.Contains(config.AllowedShareModes, req.Mode) { + klog.Warningf("GPU %s does not support mode %s, refusing to switch", uuid, req.Mode) + http.Error(w, fmt.Sprintf("GPU %s does not support mode %s", uuid, req.Mode), http.StatusBadRequest) + return + } + } targetNode = node.Node break } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 0a8f99cc0..ce3f5aa18 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -18,7 +18,6 @@ package scheduler import ( "context" - "errors" "fmt" "maps" "sort" @@ -30,6 +29,7 @@ import ( corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" @@ -662,6 +662,200 @@ func (s *Scheduler) getPodUsage() (map[string]PodUseDeviceStat, error) { return podUsageStat, nil } +type nvidiaRequestSummary struct { + requested int + hasMemory bool + memoryByte int64 + memoryPercent int32 +} + +func summarizeNVIDIARequests(resourceReqs util.PodDeviceRequests) nvidiaRequestSummary { + sum := nvidiaRequestSummary{} + for _, ctrReqs := range resourceReqs { + for _, req := range ctrReqs { + if req.Type != nvidia.NvidiaGPUDevice || req.Nums <= 0 { + continue + } + sum.requested += int(req.Nums) + if req.Memreq > 0 { + sum.hasMemory = true + // similar to the memory request in pod spec, we only consider the maximum memory request for now + // this works with our current assumption that only one container in the pod has a memory request + if int64(req.Memreq) > sum.memoryByte { + sum.memoryByte = int64(req.Memreq) + } + continue + } + if req.MemPercentagereq != 0 && req.MemPercentagereq != 101 { + sum.hasMemory = true + // use the max percentage across containers for a conservative single-value summary + if req.MemPercentagereq > sum.memoryPercent { + sum.memoryPercent = req.MemPercentagereq + } + } + } + } + return sum +} + +func requiredNvidiaMemoryBytes(sum nvidiaRequestSummary, totalMemory int64) int64 { + if !sum.hasMemory { + return 0 + } + if sum.memoryByte > 0 { + return sum.memoryByte + } + if sum.memoryPercent > 0 && totalMemory > 0 { + return totalMemory * int64(sum.memoryPercent) / 100 + } + return 0 +} + +func normalizeGPUUUID(uuid string) string { + if strings.Contains(uuid, "[") { + return strings.Split(uuid, "[")[0] + } + return uuid +} + +func (s *Scheduler) collectConsumedGPUUUIDsByApp(appName string, currentPod *corev1.Pod) map[string]struct{} { + consumed := make(map[string]struct{}) + for _, p := range s.ListPodsInfo() { + if p.Labels == nil || p.Labels[util.AppNameLabelKey] != appName { + continue + } + if currentPod != nil && p.Namespace == currentPod.Namespace && p.Name == currentPod.Name { + continue + } + for _, podDevices := range p.Devices { + for _, containerDevices := range podDevices { + for _, assigned := range containerDevices { + uuid := normalizeGPUUUID(assigned.UUID) + if uuid != "" { + consumed[uuid] = struct{}{} + } + } + } + } + } + return consumed +} + +func (s *Scheduler) selectDynamicGPUCandidates( + appBoundUUIDs map[string]struct{}, + alreadySelected map[string]struct{}, + consumedByApp map[string]struct{}, + allBindings []*v1alpha1.GPUBinding, + requiredCount int, + requestSummary nvidiaRequestSummary, +) ([]string, error) { + if requiredCount <= 0 { + return nil, nil + } + nodes, err := s.ListNodes() + if err != nil { + return nil, err + } + uuidToNode := make(map[string]string) + for _, n := range nodes { + for _, d := range n.Devices { + uuidToNode[d.ID] = n.Node.Name + } + } + // todo: needs more flexibility + // when we allow an app to be bound to multiple nodes + // already consumed GPUs by this app should not be considered as constraints + // and not consumed GPUs may be used with or without other candidates + // e.g. if an app has 3 GPUs, with 1 consumed, 2 not consumed, it can be bound to any of the 2 not consumed GPUs + // with another not bound GPU + // or totally other 2 GPUs + pinnedNode := "" + for uuid := range appBoundUUIDs { + if nodeName, ok := uuidToNode[uuid]; ok { + pinnedNode = nodeName + break + } + } + + bindingCount := make(map[string]int) + bindingAllocatedMemory := make(map[string]int64) + for _, b := range allBindings { + if b.Spec.UUID == "" { + continue + } + bindingCount[b.Spec.UUID]++ + if b.Spec.Memory != nil { + bindingAllocatedMemory[b.Spec.UUID] += b.Spec.Memory.Value() + } + } + + memSlicingCandidates := make([]string, 0) + exclusiveCandidates := make([]string, 0) + timeSlicingCandidates := make([]string, 0) + + for _, n := range nodes { + if pinnedNode != "" && n.Node.Name != pinnedNode { + continue + } + for _, d := range n.Devices { + uuid := d.ID + if uuid == "" || !d.Health { + continue + } + if _, ok := appBoundUUIDs[uuid]; ok { + continue + } + if _, ok := alreadySelected[uuid]; ok { + continue + } + if _, ok := consumedByApp[uuid]; ok { + continue + } + + switch d.ShareMode { + case util.ShareModeMemSlicing: + if !requestSummary.hasMemory { + continue + } + requiredMemory := requiredNvidiaMemoryBytes(requestSummary, int64(d.Devmem)) + remaining := int64(d.Devmem) - bindingAllocatedMemory[uuid] + if remaining >= requiredMemory { + memSlicingCandidates = append(memSlicingCandidates, uuid) + } + case util.ShareModeExclusive: + if bindingCount[uuid] > 0 { + continue + } + if requestSummary.hasMemory { + requiredMemory := requiredNvidiaMemoryBytes(requestSummary, int64(d.Devmem)) + if int64(d.Devmem) < requiredMemory { + continue + } + } + exclusiveCandidates = append(exclusiveCandidates, uuid) + case util.ShareModeTimeSlicing: + if requestSummary.hasMemory { + requiredMemory := requiredNvidiaMemoryBytes(requestSummary, int64(d.Devmem)) + if int64(d.Devmem) < requiredMemory { + continue + } + } + timeSlicingCandidates = append(timeSlicingCandidates, uuid) + } + } + } + + result := make([]string, 0) + if requestSummary.hasMemory && len(memSlicingCandidates) > 0 { + result = append(result, memSlicingCandidates...) + } else if len(exclusiveCandidates) > 0 { + result = append(result, exclusiveCandidates...) + } else { + result = append(result, timeSlicingCandidates...) + } + return result, nil +} + func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.ExtenderBindingResult, error) { klog.InfoS("Attempting to bind pod to node", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node) var res *extenderv1.ExtenderBindingResult @@ -749,35 +943,131 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi if annos == nil { annos = make(map[string]string) } - appName := args.Pod.Labels[util.AppNameLabelKey] - hasBindings := false - if appName != "" { - bindings, err := s.ListGPUBindings() - if err != nil { - klog.ErrorS(err, "Failed to list GPUBindings for Filter", "pod", klog.KObj(args.Pod)) - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err - } - var uuids []string - for _, b := range bindings { - if b.Spec.AppName != appName { - continue + appName := "" + if args.Pod.Labels != nil { + appName = args.Pod.Labels[util.AppNameLabelKey] + } + if appName == "" { + err := fmt.Errorf("cannot schedule pod without %s label", util.AppNameLabelKey) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + failedNodes := make(map[string]string) + if args.NodeNames != nil { + for _, nodeName := range *args.NodeNames { + failedNodes[nodeName] = "pod has no owner application" } - if !b.MatchPod(args.Pod) { - continue + } + return &extenderv1.ExtenderFilterResult{ + FailedNodes: failedNodes, + }, nil + } + + bindings, err := s.ListGPUBindings() + if err != nil { + klog.ErrorS(err, "Failed to list GPUBindings for Filter", "pod", klog.KObj(args.Pod)) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return nil, err + } + + appBoundByUUID := make(map[string]*v1alpha1.GPUBinding) + appBoundUUIDs := make(map[string]struct{}) + matchedBindings := make([]*v1alpha1.GPUBinding, 0) + for _, b := range bindings { + if b.Spec.AppName != appName || b.Spec.UUID == "" { + continue + } + appBoundUUIDs[b.Spec.UUID] = struct{}{} + if _, ok := appBoundByUUID[b.Spec.UUID]; !ok { + appBoundByUUID[b.Spec.UUID] = b + } + // todo: maybe we can remove this check, because the pod selector currently only matches the app name + if !b.MatchPod(args.Pod) { + continue + } + matchedBindings = append(matchedBindings, b) + // todo: currently this will conflict if the pod has multiple containers, or requires multiple GPUs with different memory requests in bindings + if b.Spec.Memory != nil { + annos[fmt.Sprintf(nvidia.AppGPUMemAnnotationTpl, b.Spec.UUID)] = b.Spec.Memory.String() + } + } + + policyMode := "" + if args.Pod.Labels != nil { + policyMode = args.Pod.Labels[nvidia.AppPodGPUConsumePolicyKey] + } + consumedByApp := s.collectConsumedGPUUUIDsByApp(appName, args.Pod) + if policyMode == "" || policyMode == nvidia.AppPodGPUConsumePolicyAll { + if len(matchedBindings) > 0 { + for _, b := range matchedBindings { + if _, occupied := consumedByApp[b.Spec.UUID]; occupied { + err := fmt.Errorf("bound GPU %s of app %s is already consumed by another pod", b.Spec.UUID, appName) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return &extenderv1.ExtenderFilterResult{ + FailedNodes: map[string]string{}, + }, nil + } } - hasBindings = true - uuids = append(uuids, b.Spec.UUID) - if b.Spec.Memory != nil { - annos[fmt.Sprintf(nvidia.AppGPUMemAnnotationTpl, b.Spec.UUID)] = b.Spec.Memory.String() + for ctrIdx := range resourceReqs { + for reqIdx, req := range resourceReqs[ctrIdx] { + if req.Type != nvidia.NvidiaGPUDevice || req.Nums <= 0 { + continue + } + // this assumes only one container in the pod has a gpu request + req.Nums = int32(len(matchedBindings)) + resourceReqs[ctrIdx][reqIdx] = req + } } } - if len(uuids) > 0 { - annos[nvidia.GPUUseUUID] = strings.Join(uuids, ",") - } else { - // Ensure the hint is empty if nothing matches - annos[nvidia.GPUUseUUID] = "" + } + + nvidiaSummary := summarizeNVIDIARequests(resourceReqs) + selectedUUIDs := make([]string, 0) + selectedUUIDSet := make(map[string]struct{}) + appendSelectedUUID := func(uuid string) { + if uuid == "" { + return + } + if _, ok := selectedUUIDSet[uuid]; ok { + return } + selectedUUIDSet[uuid] = struct{}{} + selectedUUIDs = append(selectedUUIDs, uuid) + } + + for _, b := range matchedBindings { + if _, occupied := consumedByApp[b.Spec.UUID]; occupied { + continue + } + appendSelectedUUID(b.Spec.UUID) + } + + if nvidiaSummary.requested > 0 && len(selectedUUIDs) < nvidiaSummary.requested { + dynamicCandidates, err := s.selectDynamicGPUCandidates( + appBoundUUIDs, + selectedUUIDSet, + consumedByApp, + bindings, + nvidiaSummary.requested-len(selectedUUIDs), + nvidiaSummary, + ) + if err != nil { + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return nil, err + } + for _, uuid := range dynamicCandidates { + appendSelectedUUID(uuid) + } + } + if nvidiaSummary.requested > 0 && len(selectedUUIDs) < nvidiaSummary.requested { + err := fmt.Errorf("insufficient GPU candidates for app %s, requested=%d, available=%d", appName, nvidiaSummary.requested, len(selectedUUIDs)) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return &extenderv1.ExtenderFilterResult{ + FailedNodes: map[string]string{}, + }, nil + } + if len(selectedUUIDs) > 0 { + annos[nvidia.GPUUseUUID] = strings.Join(selectedUUIDs, ",") + } else { + annos[nvidia.GPUUseUUID] = "" } s.delPod(args.Pod) nodeUsage, failedNodes, err := s.getNodesUsage(args.NodeNames, args.Pod) @@ -808,49 +1098,82 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1] devlist, ok := m.Devices[nvidia.NvidiaGPUDevice] - if ok && len(devlist) > 0 && !hasBindings { - appName := args.Pod.Labels[util.AppNameLabelKey] - if appName == "" { - klog.V(4).InfoS("Cannot find the owner Application to create GPUBinding automatically", - "pod", args.Pod.Name) - err := errors.New("Cannot find the owner Application to create GPUBinding automatically") + if ok && len(devlist) > 0 { + nodeInfo, err := s.GetNode(m.NodeID) + if err != nil { s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) return nil, err } - var uuid string + shareModeByUUID := make(map[string]string) + for _, d := range nodeInfo.Devices { + shareModeByUUID[d.ID] = d.ShareMode + } + + allocatedUUIDs := make(map[string]struct{}) for _, cdev := range devlist { for _, dev := range cdev { - if dev.ShareMode == util.ShareModeTimeSlicing && dev.UUID != "" { - uuid = dev.UUID + uuid := normalizeGPUUUID(dev.UUID) + if uuid == "" { + continue } + allocatedUUIDs[uuid] = struct{}{} } } - if uuid == "" { - klog.V(4).InfoS("Cannot find a GPU UUID to create GPUBinding automatically", - "pod", args.Pod.Name) - err := errors.New("Cannot find a GPU UUID to create GPUBinding automatically") - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err + bindingAllocatedMemory := make(map[string]int64) + for _, b := range bindings { + if b.Spec.UUID == "" || b.Spec.Memory == nil { + continue + } + bindingAllocatedMemory[b.Spec.UUID] += b.Spec.Memory.Value() + } + deviceTotalMemByUUID := make(map[string]int64) + for _, d := range nodeInfo.Devices { + deviceTotalMemByUUID[d.ID] = int64(d.Devmem) } - autoBinding := &v1alpha1.GPUBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: strings.ToLower(fmt.Sprintf("%s-%s-%d", appName, uuid, time.Now().Unix())), - }, - Spec: v1alpha1.GPUBindingSpec{ - UUID: uuid, - AppName: appName, - PodSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - util.AppNameLabelKey: appName, + for uuid := range allocatedUUIDs { + if _, exists := appBoundByUUID[uuid]; exists { + continue + } + autoBinding := &v1alpha1.GPUBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: strings.ToLower(fmt.Sprintf("%s-%s-%d", appName, uuid, time.Now().Unix())), + }, + Spec: v1alpha1.GPUBindingSpec{ + UUID: uuid, + AppName: appName, + PodSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + util.AppNameLabelKey: appName, + }, }, }, - }, - } - err := s.CreateGPUBinding(context.Background(), autoBinding) - if err != nil { - klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name) - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err + } + if shareModeByUUID[uuid] == util.ShareModeMemSlicing { + totalMem := deviceTotalMemByUUID[uuid] + requiredMem := requiredNvidiaMemoryBytes(nvidiaSummary, totalMem) + if requiredMem <= 0 { + err := fmt.Errorf("invalid mem-slicing GPU memory request for binding on %s: request=%d", uuid, requiredMem) + klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name, "uuid", uuid) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return nil, err + } + if totalMem > 0 && bindingAllocatedMemory[uuid]+requiredMem > totalMem { + err := fmt.Errorf("insufficient mem-slicing GPU memory for binding on %s: allocated=%d, request=%d, total=%d", uuid, bindingAllocatedMemory[uuid], requiredMem, totalMem) + klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name, "uuid", uuid) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return nil, err + } + memQ := resource.NewQuantity(requiredMem, resource.BinarySI) + autoBinding.Spec.Memory = memQ + bindingAllocatedMemory[uuid] += requiredMem + } + err := s.CreateGPUBinding(context.Background(), autoBinding) + if err != nil { + klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name, "uuid", uuid) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) + return nil, err + } + appBoundByUUID[uuid] = autoBinding } } diff --git a/pkg/scheduler/score.go b/pkg/scheduler/score.go index 8843b2d8e..7fa3b1907 100644 --- a/pkg/scheduler/score.go +++ b/pkg/scheduler/score.go @@ -25,7 +25,6 @@ import ( "k8s.io/klog/v2" "github.com/Project-HAMi/HAMi/pkg/device" - "github.com/Project-HAMi/HAMi/pkg/device/nvidia" "github.com/Project-HAMi/HAMi/pkg/scheduler/config" "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" "github.com/Project-HAMi/HAMi/pkg/util" @@ -119,84 +118,6 @@ func fitInDevices(node *NodeUsage, requests util.ContainerDeviceRequests, annos } func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, resourceReqs util.PodDeviceRequests, annos map[string]string, task *corev1.Pod, failedNodes map[string]string) (*policy.NodeScoreList, error) { - appName := "" - if task.Labels != nil { - appName = task.Labels[util.AppNameLabelKey] - } - - if appName != "" { - policyMode := task.Labels[nvidia.AppPodGPUConsumePolicyKey] - boundCSV := annos[nvidia.GPUUseUUID] - boundSet := map[string]struct{}{} - if boundCSV != "" { - for _, u := range strings.Split(boundCSV, ",") { - u = strings.TrimSpace(u) - if u != "" { - boundSet[u] = struct{}{} - } - } - } - // compute consumed UUIDs by live pods of the same app (in scheduler memory) - consumed := map[string]struct{}{} - for _, p := range s.ListPodsInfo() { - if p.Labels == nil || p.Labels[util.AppNameLabelKey] != appName { - continue - } - for _, pdev := range p.Devices { - for _, cdevs := range pdev { - for _, cdev := range cdevs { - uuid := cdev.UUID - if strings.Contains(uuid, "[") { - uuid = strings.Split(uuid, "[")[0] - } - if uuid != "" { - consumed[uuid] = struct{}{} - } - } - } - } - } - - if policyMode == "" || policyMode == nvidia.AppPodGPUConsumePolicyAll { - if len(consumed) > 0 { - empty := policy.NodeScoreList{Policy: config.NodeSchedulerPolicy, NodeList: []*policy.NodeScore{}} - return &empty, nil - } - if len(boundSet) > 0 { - for ctrIdx := range resourceReqs { - for key, req := range resourceReqs[ctrIdx] { - if req.Type == nvidia.NvidiaGPUDevice { - req.Nums = int32(len(boundSet)) - resourceReqs[ctrIdx][key] = req - } - } - } - } - } else if policyMode == nvidia.AppPodGPUConsumePolicySingle { - // allow only unconsumed bound UUIDs by filter out those already consumed by other pods of the same app - if len(boundSet) > 0 { - pruned := make([]string, 0, len(boundSet)) - for u := range boundSet { - if _, ok := consumed[u]; !ok { - pruned = append(pruned, u) - } - } - if len(pruned) == 0 { - empty := policy.NodeScoreList{Policy: config.NodeSchedulerPolicy, NodeList: []*policy.NodeScore{}} - return &empty, nil - } - annos[nvidia.GPUUseUUID] = strings.Join(pruned, ",") - } - for ctrIdx := range resourceReqs { - for key, req := range resourceReqs[ctrIdx] { - if req.Type == nvidia.NvidiaGPUDevice && req.Nums != 1 { - req.Nums = 1 - resourceReqs[ctrIdx][key] = req - } - } - } - } - } userNodePolicy := config.NodeSchedulerPolicy if annos != nil { if value, ok := annos[policy.NodeSchedulerPolicyAnnotationKey]; ok { diff --git a/pkg/util/compatible.go b/pkg/util/compatible.go new file mode 100644 index 000000000..8aa93636c --- /dev/null +++ b/pkg/util/compatible.go @@ -0,0 +1,64 @@ +package util + +import ( + "regexp" + "strings" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// GetCompatibleNVMLMemoryInfo wraps nvml.Device.GetMemoryInfo(). +// +// Some environments/drivers return nvml.ERROR_NOT_SUPPORTED for GetMemoryInfo. +// In that case, we fall back to GetName() and derive total memory by matching +// the device name against a (currently hardcoded) mapping table. +// +// Fallback behavior: +// - Total: derived from the mapping table (bytes) +// - Free: 0 +// - Used: Total +// +// NOTE: The mapping table is intentionally hardcoded for now; later it can be +// moved to configuration. +func GetCompatibleNVMLMemoryInfo(dev nvml.Device) (nvml.Memory, nvml.Return) { + mem, ret := dev.GetMemoryInfo() + if ret == nvml.SUCCESS || ret != nvml.ERROR_NOT_SUPPORTED { + return mem, ret + } + + name, nret := dev.GetName() + if nret != nvml.SUCCESS { + return mem, nret + } + + config, ok := GetCompatibleConfigsByDeviceName(name) + if !ok { + return mem, ret + } + return nvml.Memory{ + Total: config.TotalMemory, + Free: 0, + Used: config.TotalMemory, + }, nvml.SUCCESS +} + +type DeviceCompatibleConfigPattern struct { + Pattern *regexp.Regexp + TotalMemory uint64 // bytes + DefaultShareMode string + AllowedShareModes []string +} + +var compatibleConfigPatterns = []DeviceCompatibleConfigPattern{ + {Pattern: regexp.MustCompile(`^NVIDIA\s+GB10$`), TotalMemory: 96 * 1024 * 1024 * 1024, DefaultShareMode: ShareModeMemSlicing, AllowedShareModes: []string{ShareModeMemSlicing, ShareModeExclusive}}, +} + +func GetCompatibleConfigsByDeviceName(name string) (DeviceCompatibleConfigPattern, bool) { + n := strings.TrimSpace(name) + for _, rule := range compatibleConfigPatterns { + if rule.Pattern.MatchString(n) { + return rule, true + } + } + return DeviceCompatibleConfigPattern{}, false +} diff --git a/pkg/util/types.go b/pkg/util/types.go index 1af3c39dc..56f2b749c 100644 --- a/pkg/util/types.go +++ b/pkg/util/types.go @@ -55,7 +55,8 @@ const ( ) var ( - DebugMode bool + DefaultAllowedShareModes = []string{ShareModeTimeSlicing, ShareModeMemSlicing, ShareModeExclusive} + DebugMode bool NodeName string RuntimeSocketFlag string @@ -69,7 +70,7 @@ type ContainerDevice struct { Usedmem int32 Usedcores int32 CustomInfo map[string]any - ShareMode string + ShareMode string } type ContainerDeviceRequest struct { @@ -134,20 +135,20 @@ type DeviceUsage struct { } type DeviceInfo struct { - ID string `json:"id,omitempty"` - Index uint `json:"index,omitempty"` - Count int32 `json:"count,omitempty"` - Devmem int32 `json:"devmem,omitempty"` - Devcore int32 `json:"devcore,omitempty"` - Type string `json:"type,omitempty"` - Numa int `json:"numa,omitempty"` - Mode string `json:"mode,omitempty"` - Architecture int32 `json:"architecture,omitempty"` - MIGTemplate []Geometry `json:"migtemplate,omitempty"` - Health bool `json:"health,omitempty"` - DeviceVendor string `json:"devicevendor,omitempty"` - CustomInfo map[string]any `json:"custominfo,omitempty"` - ShareMode string `json:"sharemode,omitempty"` + ID string `json:"id,omitempty"` + Index uint `json:"index,omitempty"` + Count int32 `json:"count,omitempty"` + Devmem int32 `json:"devmem,omitempty"` + Devcore int32 `json:"devcore,omitempty"` + Type string `json:"type,omitempty"` + Numa int `json:"numa,omitempty"` + Mode string `json:"mode,omitempty"` + Architecture int32 `json:"architecture,omitempty"` + MIGTemplate []Geometry `json:"migtemplate,omitempty"` + Health bool `json:"health,omitempty"` + DeviceVendor string `json:"devicevendor,omitempty"` + CustomInfo map[string]any `json:"custominfo,omitempty"` + ShareMode string `json:"sharemode,omitempty"` DevicePairScore DevicePairScore `json:"devicepairscore,omitempty"` }