Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/vGPUmonitor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ func (cc ClusterManagerCollector) collectGPUDeviceMetrics(ch chan<- prometheus.M
}

func (cc ClusterManagerCollector) collectGPUMemoryMetrics(ch chan<- prometheus.Metric, hdev nvml.Device, index int) error {
memory, ret := hdev.GetMemoryInfo()
memory, ret := util.GetCompatibleNVMLMemoryInfo(hdev)
if ret != nvml.SUCCESS {
return fmt.Errorf("nvml get memory error ret=%d", ret)
}
Expand Down
7 changes: 7 additions & 0 deletions docker/Dockerfile.withlib
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,20 @@ ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=utility

ARG VERSION
ARG TARGETARCH
LABEL version="$VERSION"
LABEL maintainer="opensource@4paradigm.com"
COPY ./LICENSE /k8s-vgpu/LICENSE
COPY --from=gobuild /k8s-vgpu/bin /k8s-vgpu/bin
COPY --from=gobuild /go/bin/nvidia-mig-parted /k8s-vgpu/bin/
COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh
COPY ./lib /k8s-vgpu/lib
RUN set -eux; \
if [ "$TARGETARCH" = "arm64" ]; then \
mv -f /k8s-vgpu/lib/nvidia/libvgpu.arm64.so /k8s-vgpu/lib/nvidia/libvgpu.so; \
else \
rm -f /k8s-vgpu/lib/nvidia/libvgpu.arm64.so; \
fi
COPY ./docker/vgpu-init.sh /k8s-vgpu/bin/vgpu-init.sh

ENV PATH="/k8s-vgpu/bin:${PATH}"
Expand Down
Binary file added lib/nvidia/libvgpu.arm64.so
Binary file not shown.
22 changes: 21 additions & 1 deletion pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo {
panic(0)
}
memoryTotal := 0
memory, ret := ndev.GetMemoryInfo()
memory, ret := util.GetCompatibleNVMLMemoryInfo(ndev)
if ret == nvml.SUCCESS {
memoryTotal = int(memory.Total)
} else {
Expand All @@ -142,6 +142,11 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo {
klog.Error("nvml get name error ret=", ret)
panic(0)
}
defaultShareMode := util.ShareModeTimeSlicing
config, ok := util.GetCompatibleConfigsByDeviceName(Model)
if ok && config.DefaultShareMode != "" {
defaultShareMode = config.DefaultShareMode
}
if !strings.Contains(Model, "NVIDIA") {
Model = fmt.Sprintf("%v-%v", "NVIDIA", Model)
}
Expand Down Expand Up @@ -184,6 +189,7 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo {
Mode: plugin.operatingMode,
Health: health,
Architecture: int32(architecture),
ShareMode: defaultShareMode,
})
klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredmem, Model, numa)
}
Expand Down Expand Up @@ -216,6 +222,20 @@ func (plugin *NvidiaDevicePlugin) RegistrInAnnotation() error {
klog.V(4).InfoS("patch nvidia topo score to node", "hami.io/node-nvidia-score", string(data))
annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String()
annos[nvidia.RegisterAnnos] = encodeddevices

// Ensure each discovered device has a sharemode annotation key present.
// Do NOT override an existing value potentially set by scheduler/API handlers.
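// If missing, seed it with the device's own default share mode (dev.ShareMode),
// which is time-slicing unless the device's compatibility config specifies a
// different DefaultShareMode.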
for _, dev := range *devices {
shareModeKey := fmt.Sprintf(util.ShareModeAnnotationTpl, dev.ID)
if node.Annotations != nil {
if _, ok := node.Annotations[shareModeKey]; ok {
continue
}
}
annos[shareModeKey] = dev.ShareMode
}

if len(data) > 0 {
annos[nvidia.RegisterGPUPairScore] = string(data)
}
Expand Down
22 changes: 19 additions & 3 deletions pkg/scheduler/routes/gpu_manage.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"slices"
"sort"
"strings"
"time"
Expand Down Expand Up @@ -34,9 +35,10 @@ type GPUAppInfo struct {

type GPUDetail struct {
GPUInfo
Apps []GPUAppInfo `json:"apps"`
MemoryAllocated *int64 `json:"memoryAllocated,omitempty"`
MemoryAvailable *int64 `json:"memoryAvailable,omitempty"`
AllowedShareModes []string `json:"allowedShareModes,omitempty"`
Apps []GPUAppInfo `json:"apps"`
MemoryAllocated *int64 `json:"memoryAllocated,omitempty"`
MemoryAvailable *int64 `json:"memoryAvailable,omitempty"`
}

type AssignGPURequest struct {
Expand Down Expand Up @@ -111,11 +113,17 @@ func ListGPUDetails(s *scheduler.Scheduler) httprouter.Handle {

for _, node := range nodes {
for _, device := range node.Devices {
allowedShareModes := util.DefaultAllowedShareModes
config, ok := util.GetCompatibleConfigsByDeviceName(device.Type)
if ok && len(config.AllowedShareModes) > 0 {
allowedShareModes = config.AllowedShareModes
}
uuidToGPUDetails[device.ID] = &GPUDetail{
GPUInfo: GPUInfo{
NodeName: node.Node.Name,
DeviceInfo: device,
},
AllowedShareModes: allowedShareModes,
}
}
}
Expand Down Expand Up @@ -426,6 +434,14 @@ func SwitchGPUMode(s *scheduler.Scheduler) httprouter.Handle {
for _, node := range nodes {
for _, device := range node.Devices {
if device.ID == uuid {
config, ok := util.GetCompatibleConfigsByDeviceName(device.Type)
if ok && len(config.AllowedShareModes) > 0 {
if !slices.Contains(config.AllowedShareModes, req.Mode) {
klog.Warningf("GPU %s does not support mode %s, refusing to switch", uuid, req.Mode)
http.Error(w, fmt.Sprintf("GPU %s does not support mode %s", uuid, req.Mode), http.StatusBadRequest)
return
}
}
targetNode = node.Node
break
}
Expand Down
Loading