From 40b10daa382427019481d5b61f0b55fbfc9067b0 Mon Sep 17 00:00:00 2001 From: Aditya Singh Date: Wed, 29 Apr 2026 05:17:24 +0000 Subject: [PATCH 1/2] feat: Add dual API support for ResourceSlice (v1 and v1beta1) --- go.mod | 35 +- go.sum | 87 ++-- internal/pkg/testutils/test_utils.go | 1 + internal/pkg/transformation/dra.go | 477 ++++++++++++++---- internal/pkg/transformation/dra_test.go | 423 ++++++++++++++++ internal/pkg/transformation/kubernetes.go | 38 +- .../pkg/transformation/kubernetes_test.go | 295 ++++++++++- .../pkg/transformation/test_helpers_test.go | 71 +++ internal/pkg/transformation/types.go | 15 +- 9 files changed, 1259 insertions(+), 183 deletions(-) create mode 100644 internal/pkg/transformation/dra_test.go create mode 100644 internal/pkg/transformation/test_helpers_test.go diff --git a/go.mod b/go.mod index cbab2f11..4dc06e8f 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,8 @@ require ( github.com/NVIDIA/go-nvml v0.12.4-1 github.com/avast/retry-go/v4 v4.6.0 github.com/bits-and-blooms/bitset v1.22.0 - github.com/fsnotify/fsnotify v1.7.0 + github.com/containerd/cgroups/v3 v3.1.1 + github.com/fsnotify/fsnotify v1.9.0 github.com/google/uuid v1.6.0 github.com/gorilla/mux v1.8.1 github.com/mittwald/go-helm-client v0.12.16 @@ -25,13 +26,13 @@ require ( go.uber.org/goleak v1.3.0 go.uber.org/mock v0.5.0 golang.org/x/sync v0.16.0 - google.golang.org/grpc v1.71.1 + google.golang.org/grpc v1.72.1 helm.sh/helm/v3 v3.18.5 - k8s.io/api v0.33.3 - k8s.io/apimachinery v0.33.3 - k8s.io/client-go v0.33.3 - k8s.io/kubelet v0.32.3 - k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e + k8s.io/api v0.34.1 + k8s.io/apimachinery v0.34.1 + k8s.io/client-go v0.34.1 + k8s.io/kubelet v0.34.1 + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 ) require ( @@ -49,7 +50,6 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.3 // indirect - github.com/containerd/cgroups/v3 v3.1.1 // indirect github.com/containerd/containerd v1.7.27 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -63,7 +63,7 @@ require ( github.com/evanphx/json-patch v5.9.11+incompatible // indirect github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect github.com/fatih/color v1.18.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.5.1 // indirect github.com/go-gorp/gorp/v3 v3.1.0 // indirect github.com/go-logr/logr v1.4.2 // indirect @@ -74,7 +74,7 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect @@ -107,7 +107,7 @@ require ( github.com/moby/sys/userns v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect @@ -131,10 +131,9 @@ require ( github.com/xlab/treeprint v1.2.0 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect - go.yaml.in/yaml/v3 v3.0.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.40.0 // indirect golang.org/x/net v0.41.0 // indirect golang.org/x/oauth2 v0.28.0 // indirect @@ -150,17 +149,17 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.33.3 // indirect - k8s.io/apiserver v0.33.3 // indirect + k8s.io/apiserver v0.34.1 // indirect k8s.io/cli-runtime v0.33.3 // indirect - k8s.io/component-base v0.33.3 // indirect + k8s.io/component-base v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect + k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect k8s.io/kubectl v0.33.3 // indirect oras.land/oras-go/v2 v2.6.0 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/kustomize/api v0.19.0 // indirect sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index cd741e4b..42c4d5df 100644 --- a/go.sum +++ b/go.sum @@ -44,7 +44,6 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.3 h1:9liNh8t+u26xl5ddmWLmsOsdNLwkdRTg5AG+JnTiM80= github.com/chai2010/gettext-go v1.0.3/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= -github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= github.com/containerd/cgroups/v3 v3.1.1 h1:ASZmQGfOHbRj43/1aMn5QcWIsv0R/AuHHDNCguRY0p0= github.com/containerd/cgroups/v3 v3.1.1/go.mod h1:PKZ2AcWmSBsY/tJUVhtS/rluX0b1uq1GmPO1ElCmbOw= github.com/containerd/containerd v1.7.27 h1:yFyEyojddO3MIGVER2xJLWoCIn+Up4GaHFquP7hsFII= @@ -95,10 +94,10 @@ github.com/foxcpp/go-mockdns v1.1.0 h1:jI0rD8M0wuYAxL7r/ynTrCQQq0BVqfB99Vgk7Dlme github.com/foxcpp/go-mockdns v1.1.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk= github.com/go-errors/errors v1.5.1/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-gorp/gorp/v3 v3.1.0 h1:ItKF/Vbuj31dmV4jxA1qblpSwkl9g1typ24xoe70IGs= @@ -126,9 +125,8 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -149,8 +147,8 @@ github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16 github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -221,8 +219,9 @@ github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFL github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= @@ -327,10 +326,10 @@ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 h1:j7Z go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0/go.mod h1:WXbYJTUaZXAbYd8lbgGuvih0yuCfOFC5RJoYnoLcGz8= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0 h1:t/Qur3vKSkUCcDVaSumWF2PKHt85pc7fRvFuoVT8qFU= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0/go.mod h1:Rl61tySSdcOJWoEgYZVtmnKdA0GeKrSqkHC1t+91CH8= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0 h1:cMyu9O88joYEaI47CnQkxO1XZdpoTF9fEnW2duIddhw= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0/go.mod h1:6Am3rn7P9TVVeXYG+wtcGE7IE1tsQ+bP3AuWcKt/gOI= go.opentelemetry.io/otel/exporters/prometheus v0.54.0 h1:rFwzp68QMgtzu9PgP3jm9XaMICI6TsofWWPcBDKwlsU= @@ -353,16 +352,16 @@ go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5J go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= +go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -412,12 +411,12 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/genproto v0.0.0-20240123012728-ef4313101c80 h1:KAeGQVN3M9nD0/bQXnr/ClcEMJ968gUXJQ9pwfSynuQ= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= +google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= +google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 h1:e0AIkUUhxyBKh6ssZNrAMeqhA7RKUj42346d1y02i2g= google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= +google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -434,30 +433,30 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= helm.sh/helm/v3 v3.18.5 h1:Cc3Z5vd6kDrZq9wO9KxKLNEickiTho6/H/dBNRVSos4= helm.sh/helm/v3 v3.18.5/go.mod h1:L/dXDR2r539oPlFP1PJqKAC1CUgqHJDLkxKpDGrWnyg= -k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8= -k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= k8s.io/apiextensions-apiserver v0.33.3 h1:qmOcAHN6DjfD0v9kxL5udB27SRP6SG/MTopmge3MwEs= k8s.io/apiextensions-apiserver v0.33.3/go.mod h1:oROuctgo27mUsyp9+Obahos6CWcMISSAPzQ77CAQGz8= -k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA= -k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.3 h1:Wv0hGc+QFdMJB4ZSiHrCgN3zL3QRatu56+rpccKC3J4= -k8s.io/apiserver v0.33.3/go.mod h1:05632ifFEe6TxwjdAIrwINHWE2hLwyADFk5mBsQa15E= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apiserver v0.34.1 h1:U3JBGdgANK3dfFcyknWde1G6X1F4bg7PXuvlqt8lITA= +k8s.io/apiserver v0.34.1/go.mod h1:eOOc9nrVqlBI1AFCvVzsob0OxtPZUCPiUJL45JOTBG0= k8s.io/cli-runtime v0.33.3 h1:Dgy4vPjNIu8LMJBSvs8W0LcdV0PX/8aGG1DA1W8lklA= k8s.io/cli-runtime v0.33.3/go.mod h1:yklhLklD4vLS8HNGgC9wGiuHWze4g7x6XQZ+8edsKEo= -k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA= -k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg= -k8s.io/component-base v0.33.3 h1:mlAuyJqyPlKZM7FyaoM/LcunZaaY353RXiOd2+B5tGA= -k8s.io/component-base v0.33.3/go.mod h1:ktBVsBzkI3imDuxYXmVxZ2zxJnYTZ4HAsVj9iF09qp4= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= +k8s.io/component-base v0.34.1 h1:v7xFgG+ONhytZNFpIz5/kecwD+sUhVE6HU7qQUiRM4A= +k8s.io/component-base v0.34.1/go.mod h1:mknCpLlTSKHzAQJJnnHVKqjxR7gBeHRv0rPXA7gdtQ0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= -k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= k8s.io/kubectl v0.33.3 h1:r/phHvH1iU7gO/l7tTjQk2K01ER7/OAJi8uFHHyWSac= k8s.io/kubectl v0.33.3/go.mod h1:euj2bG56L6kUGOE/ckZbCoudPwuj4Kud7BR0GzyNiT0= -k8s.io/kubelet v0.32.3 h1:B9HzW4yB67flx8tN2FYuDwZvxnmK3v5EjxxFvOYjmc8= -k8s.io/kubelet v0.32.3/go.mod h1:yyAQSCKC+tjSlaFw4HQG7Jein+vo+GeKBGdXdQGvL1U= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e h1:KqK5c/ghOm8xkHYhlodbp6i6+r+ChV2vuAuVRdFbLro= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kubelet v0.34.1 h1:doAaTA9/Yfzbdq/u/LveZeONp96CwX9giW6b+oHn4m4= +k8s.io/kubelet v0.34.1/go.mod h1:PtV3Ese8iOM19gSooFoQT9iyRisbmJdAPuDImuccbbA= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= @@ -466,11 +465,9 @@ sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ sigs.k8s.io/kustomize/api v0.19.0/go.mod h1:/BbwnivGVcBh1r+8m3tH1VNxJmHSk1PzP5fkP6lbL1o= sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/pkg/testutils/test_utils.go b/internal/pkg/testutils/test_utils.go index ed738aca..c51a0620 100644 --- a/internal/pkg/testutils/test_utils.go +++ b/internal/pkg/testutils/test_utils.go @@ -195,6 +195,7 @@ func CreateTmpDir(t *testing.T) (string, func()) { } type MockPodResourcesServer struct { + v1.UnimplementedPodResourcesListerServer resourceName string gpus []string } diff --git a/internal/pkg/transformation/dra.go b/internal/pkg/transformation/dra.go index 6d4755ef..9efcb21d 100644 --- a/internal/pkg/transformation/dra.go +++ b/internal/pkg/transformation/dra.go @@ -22,9 +22,13 @@ import ( "log/slog" "time" + resourcev1 "k8s.io/api/resource/v1" resourcev1beta1 "k8s.io/api/resource/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1" "github.com/NVIDIA/dcgm-exporter/internal/pkg/kubeclient" ) @@ -33,45 +37,234 @@ const ( informerResyncPeriod = 10 * time.Minute ) +// resourceSliceAdapter provides a unified interface for accessing ResourceSlice data +// from both v1 and v1beta1 API versions +type resourceSliceAdapter interface { + // GetDevices returns a list of device adapters + GetDevices() []deviceAdapter +} + +// deviceAdapter provides a unified interface for accessing device data +// from both v1 and v1beta1 API versions +type deviceAdapter interface { + // GetName returns the device name + GetName() string + // GetAttribute returns the string value of an attribute by key, or empty string if not found + GetAttribute(key string) string + // HasAttributes returns true if the device has attributes + HasAttributes() bool +} + +// v1ResourceSliceAdapter adapts resourcev1.ResourceSlice to resourceSliceAdapter +type v1ResourceSliceAdapter struct { + slice *resourcev1.ResourceSlice +} + +func (a *v1ResourceSliceAdapter) GetDevices() []deviceAdapter { + devices := make([]deviceAdapter, len(a.slice.Spec.Devices)) + for i := range a.slice.Spec.Devices { + devices[i] = &v1DeviceAdapter{device: &a.slice.Spec.Devices[i]} + } + return devices +} + +// v1DeviceAdapter adapts resourcev1.Device to deviceAdapter +type v1DeviceAdapter struct { + device *resourcev1.Device +} + +func (a *v1DeviceAdapter) GetName() string { + return a.device.Name +} + +func (a *v1DeviceAdapter) HasAttributes() bool { + return a.device.Attributes != nil +} + +func (a *v1DeviceAdapter) GetAttribute(key string) string { + if a.device.Attributes == nil { + return "" + } + attrKey := resourcev1.QualifiedName(key) + if attr, ok := a.device.Attributes[attrKey]; ok && attr.StringValue != nil { + return *attr.StringValue + } + return "" +} + +// v1beta1ResourceSliceAdapter adapts resourcev1beta1.ResourceSlice to resourceSliceAdapter +type v1beta1ResourceSliceAdapter struct { + slice *resourcev1beta1.ResourceSlice +} + +func (a *v1beta1ResourceSliceAdapter) GetDevices() []deviceAdapter { + devices := make([]deviceAdapter, len(a.slice.Spec.Devices)) + for i := range a.slice.Spec.Devices { + devices[i] = &v1beta1DeviceAdapter{device: &a.slice.Spec.Devices[i]} + } + return devices +} + +// v1beta1DeviceAdapter adapts resourcev1beta1.Device to deviceAdapter +type v1beta1DeviceAdapter struct { + device *resourcev1beta1.Device +} + +func (a *v1beta1DeviceAdapter) GetName() string { + return a.device.Name +} + +func (a *v1beta1DeviceAdapter) HasAttributes() bool { + return a.device.Basic != nil && a.device.Basic.Attributes != nil +} + +func (a *v1beta1DeviceAdapter) GetAttribute(key string) string { + if a.device.Basic == nil || a.device.Basic.Attributes == nil { + return "" + } + attrKey := resourcev1beta1.QualifiedName(key) + if attr, ok := a.device.Basic.Attributes[attrKey]; ok && attr.StringValue != nil { + return *attr.StringValue + } + return "" +} + +func supportsResourceSliceGV(client kubernetes.Interface, groupVersion string) bool { + resources, err := client.Discovery().ServerResourcesForGroupVersion(groupVersion) + if err != nil { + // Discovery returns errors when the group/version isn't served. + slog.Debug("Discovery failed for groupVersion", "groupVersion", groupVersion, "error", err) + return false + } + + for _, r := range resources.APIResources { + // Match the primary resource only (not subresources like "resourceslices/status"). + if r.Name == "resourceslices" { + return true + } + } + return false +} + +// NewDRAResourceSliceManager creates a new DRA ResourceSlice manager. +// The API version is auto-detected by checking which version has NVIDIA DRA ResourceSlices. func NewDRAResourceSliceManager() (*DRAResourceSliceManager, error) { client, err := kubeclient.GetKubeClient() if err != nil { return nil, fmt.Errorf("error getting kube client: %w", err) } + // Decide which API version to use. + // Prefer v1 only when it actually has NVIDIA DRA ResourceSlices; otherwise fall back + // to v1beta1 when that version has NVIDIA DRA ResourceSlices. + const ( + resourceGVV1 = "resource.k8s.io/v1" + resourceGVV1beta1 = "resource.k8s.io/v1beta1" + ) + + v1Served := supportsResourceSliceGV(client, resourceGVV1) + v1beta1Served := supportsResourceSliceGV(client, resourceGVV1beta1) + if !v1Served && !v1beta1Served { + slog.Warn("Neither resource.k8s.io/v1 nor v1beta1 ResourceSlice API is served; DRA labels will not be available") + return nil, nil + } + + // Determine which served API version actually has NVIDIA DRA ResourceSlices. + ctx := context.Background() + v1HasNvidiaSlices := false + if v1Served { + resourceSlicesList, err := client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list ResourceSlices for v1: %v", err) + } + items := make([]interface{}, 0, len(resourceSlicesList.Items)) + for i := range resourceSlicesList.Items { + items = append(items, &resourceSlicesList.Items[i]) + } + v1HasNvidiaSlices = countGPUSlices(items) > 0 + } + + v1beta1HasNvidiaSlices := false + if v1beta1Served { + resourceSlicesList, err := client.ResourceV1beta1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list ResourceSlices for v1beta1: %v", err) + } + items := make([]interface{}, 0, len(resourceSlicesList.Items)) + for i := range resourceSlicesList.Items { + items = append(items, &resourceSlicesList.Items[i]) + } + v1beta1HasNvidiaSlices = countGPUSlices(items) > 0 + } + + var selected string + switch { + case v1HasNvidiaSlices: + selected = "v1" + case v1beta1HasNvidiaSlices: + selected = "v1beta1" + default: + slog.Warn("No NVIDIA DRA ResourceSlices found; DRA labels will not be available") + return nil, nil + } factory := informers.NewSharedInformerFactory(client, informerResyncPeriod) - informer := factory.Resource().V1beta1().ResourceSlices().Informer() + + var informer cache.SharedIndexInformer + switch selected { + case "v1": + informer = factory.Resource().V1().ResourceSlices().Informer() + err = informer.AddIndexers(cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + rs, ok := obj.(*resourcev1.ResourceSlice) + if !ok { + return nil, nil + } + return []string{rs.Spec.Pool.Name}, nil + }, + }) + if err != nil { + return nil, fmt.Errorf("error adding pool indexer to v1 ResourceSlice informer: %w", err) + } + case "v1beta1": + informer = factory.Resource().V1beta1().ResourceSlices().Informer() + err = informer.AddIndexers(cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + rs, ok := obj.(*resourcev1beta1.ResourceSlice) + if !ok { + return nil, nil + } + return []string{rs.Spec.Pool.Name}, nil + }, + }) + if err != nil { + return nil, fmt.Errorf("error adding pool indexer to v1beta1 ResourceSlice informer: %w", err) + } + default: + return nil, fmt.Errorf("unsupported API version selection: %s", selected) + } m := &DRAResourceSliceManager{ - factory: factory, - informer: informer, - deviceToUUID: make(map[string]string), - migDevices: make(map[string]*DRAMigDeviceInfo), - } - - _, err = informer.AddEventHandler(&cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - s := obj.(*resourcev1beta1.ResourceSlice) - return s.Spec.Driver == DRAGPUDriverName - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: m.onAddOrUpdate, - UpdateFunc: func(_, o interface{}) { m.onAddOrUpdate(o) }, - DeleteFunc: m.onDelete, - }, - }) - if err != nil { - return nil, fmt.Errorf("error adding event handler: %w", err) + factory: factory, + preferredAPIVersion: selected, + } + if selected == "v1" { + m.v1Informer = informer + } else { + m.v1beta1Informer = informer } ctx, cancel := context.WithCancel(context.Background()) m.cancelContext = cancel factory.Start(ctx.Done()) - if !cache.WaitForCacheSync(ctx.Done(), informer.HasSynced) { + // Wait for cache sync on the selected informer. + synced := cache.WaitForCacheSync(ctx.Done(), informer.HasSynced) + if !synced { cancel() return nil, fmt.Errorf("ResourceSlice informer cache sync failed") } + + slog.Info("ResourceSlice API informer synced successfully", "apiVersion", selected) return m, nil } @@ -85,94 +278,204 @@ func (m *DRAResourceSliceManager) Stop() { } } -// GetDeviceInfo returns the mapping UUID and MIG device info if applicable -// For MIG devices: returns (parentUUID, *DRAMigDeviceInfo) -// For full GPUs: returns (deviceUUID, nil) -func (m *DRAResourceSliceManager) GetDeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { - key := pool + "/" + device - m.mu.RLock() - defer m.mu.RUnlock() +// countGPUSlices counts the number of ResourceSlice objects with GPU devices +// (matching the DRAGPUDriverName) in the given items. +func countGPUSlices(items []interface{}) int { + count := 0 + for _, item := range items { + switch obj := item.(type) { + case *resourcev1.ResourceSlice: + if obj.Spec.Driver == DRAGPUDriverName && len(obj.Spec.Devices) > 0 { + count++ + } + case *resourcev1beta1.ResourceSlice: + if obj.Spec.Driver == DRAGPUDriverName && len(obj.Spec.Devices) > 0 { + count++ + } + } + } + return count +} - // Check if this is a MIG device - if migInfo, exists := m.migDevices[key]; exists { - // MIG device - return parent UUID and MIG info - slog.Debug(fmt.Sprintf("Found MIG device for %s with parent UUID: %s", key, migInfo.ParentUUID)) - return migInfo.ParentUUID, migInfo +func (m *DRAResourceSliceManager) getV1DeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { + if m.v1Informer == nil { + return "", nil } - // Full GPU device - return device UUID with no MIG info - if uuid, exists := m.deviceToUUID[key]; exists { - slog.Debug(fmt.Sprintf("Found GPU device for %s with UUID: %s", uuid, key)) - return uuid, nil + items, err := m.v1Informer.GetIndexer().ByIndex("poolName", pool) + if err != nil { + slog.Error(fmt.Sprintf("Error listing v1 ResourceSlices by pool index for pool %s: %v", pool, err)) + return "", nil } - slog.Info(fmt.Sprintf("No UUID found for %s", key)) - return "", nil + return m.getDeviceInfoFromResourceSliceItems(pool, device, items) } -func getAttrString(attrs map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute, key resourcev1beta1.QualifiedName) string { - if attr, ok := attrs[key]; ok && attr.StringValue != nil { - return *attr.StringValue +func (m *DRAResourceSliceManager) getV1beta1DeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { + if m.v1beta1Informer == nil { + return "", nil } - return "" -} -func (m *DRAResourceSliceManager) onAddOrUpdate(obj interface{}) { - slice := obj.(*resourcev1beta1.ResourceSlice) - pool := slice.Spec.Pool.Name + items, err := m.v1beta1Informer.GetIndexer().ByIndex("poolName", pool) + if err != nil { + slog.Error(fmt.Sprintf("Error listing v1beta1 ResourceSlices by pool index for pool %s: %v", pool, err)) + return "", nil + } - m.mu.Lock() - defer m.mu.Unlock() + return m.getDeviceInfoFromResourceSliceItems(pool, device, items) +} - for _, dev := range slice.Spec.Devices { - if dev.Basic == nil || dev.Basic.Attributes == nil { +// getDeviceInfoFromResourceSliceItems resolves device UUIDs/MIG info from a set of +// ResourceSlice objects. It does not select an API version — callers already do +// that by choosing which informer indexer to query. +func (m *DRAResourceSliceManager) getDeviceInfoFromResourceSliceItems(pool, device string, items []interface{}) (string, *DRAMigDeviceInfo) { + // Search for the device in the selected slices + for _, item := range items { + var adapter resourceSliceAdapter + switch obj := item.(type) { + case *resourcev1.ResourceSlice: + // NOTE: dcgm-exporter's DRA handling currently assumes the schema used by + // the NVIDIA GPU DRA driver (for example, "type", "uuid", "parentUUID", "profile" + // attributes). Other GPU DRA drivers with different schemas may not work + // correctly with this implementation. + if obj.Spec.Driver != DRAGPUDriverName { + continue + } + adapter = &v1ResourceSliceAdapter{slice: obj} + case *resourcev1beta1.ResourceSlice: + if obj.Spec.Driver != DRAGPUDriverName { + continue + } + adapter = &v1beta1ResourceSliceAdapter{slice: obj} + default: continue } - key := pool + "/" + dev.Name - attr := dev.Basic.Attributes - - deviceType := getAttrString(attr, "type") - switch deviceType { - case "gpu": - if uuid := getAttrString(attr, "uuid"); uuid != "" { - m.deviceToUUID[key] = uuid - slog.Debug(fmt.Sprintf("Added gpu device [key:%s] with UUID: %s", key, uuid)) + + // Search for the device in this slice + for _, dev := range adapter.GetDevices() { + if !dev.HasAttributes() { + continue + } + if dev.GetName() != device { + continue } - case "mig": - parentUUID := getAttrString(attr, "parentUUID") - profile := getAttrString(attr, "profile") - migUUID := getAttrString(attr, "uuid") - - // Only create MIG device if we have required parent UUID - if parentUUID != "" { - m.migDevices[key] = &DRAMigDeviceInfo{ - MIGDeviceUUID: migUUID, - Profile: profile, - ParentUUID: parentUUID, + deviceType := dev.GetAttribute("type") + switch deviceType { + case "mig": + parentUUID := dev.GetAttribute("parentUUID") + profile := dev.GetAttribute("profile") + migUUID := dev.GetAttribute("uuid") + if parentUUID != "" { + migInfo := &DRAMigDeviceInfo{ + MIGDeviceUUID: migUUID, + Profile: profile, + ParentUUID: parentUUID, + } + slog.Debug(fmt.Sprintf("Found MIG device %s/%s with parent UUID: %s", pool, device, parentUUID)) + return parentUUID, migInfo + } + case "gpu": + uuid := dev.GetAttribute("uuid") + if uuid != "" { + slog.Debug(fmt.Sprintf("Found GPU device %s/%s with UUID: %s", pool, device, uuid)) + return uuid, nil } - slog.Debug(fmt.Sprintf("Added MIG device %s (profile: %s) with parent: %s", migUUID, profile, parentUUID)) - } else { - slog.Debug(fmt.Sprintf("MIG device %s missing parent UUID", migUUID)) + default: + // Log unknown device types to help users understand why a device might not be handled + slog.Warn(fmt.Sprintf("Device [%s/%s] has unknown type: %s", pool, device, deviceType)) } - - default: - slog.Warn(fmt.Sprintf("Device [key:%s] has unknown type: %s", key, deviceType)) } } + + slog.Debug(fmt.Sprintf("No UUID found for pool %s, device %s", pool, device)) + return "", nil } -func (m *DRAResourceSliceManager) onDelete(obj interface{}) { - slice := obj.(*resourcev1beta1.ResourceSlice) - pool := slice.Spec.Pool.Name +// GetDeviceInfo returns the mapping UUID and MIG device info if applicable +// by querying the informer cache directly. This avoids maintaining redundant +// local caches and ensures we always have the latest state from the API server. +// For MIG devices: returns (parentUUID, *DRAMigDeviceInfo) +// For full GPUs: returns (deviceUUID, nil) +func (m *DRAResourceSliceManager) GetDeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { + m.mu.RLock() + defer m.mu.RUnlock() - m.mu.Lock() - defer m.mu.Unlock() + switch m.preferredAPIVersion { + case "v1": + return m.getV1DeviceInfo(pool, device) + case "v1beta1": + return m.getV1beta1DeviceInfo(pool, device) + default: + slog.Error("Unsupported preferred ResourceSlice API version", "apiVersion", m.preferredAPIVersion) + return "", nil + } +} - for _, dev := range slice.Spec.Devices { - key := pool + "/" + dev.Name - slog.Debug(fmt.Sprintf("Removing device for %s", key)) - delete(m.deviceToUUID, key) - delete(m.migDevices, key) +type DynamicResourceMapping struct { + MappingKey string + Info *DynamicResourceInfo +} + +// GetDynamicResourceMappings converts a DynamicResource into one or more +// DynamicResourceInfo entries and resolves the backing GPU or MIG device UUIDs +// using the ResourceSlice informer. +// +// A single DynamicResource can contain multiple ClaimResources (devices). This +// method returns a mapping entry for each matching NVIDIA GPU DRA claim. +func (m *DRAResourceSliceManager) GetDynamicResourceMappings(resource *podresourcesapi.DynamicResource) []DynamicResourceMapping { + if resource == nil { + return nil } + + mappings := make([]DynamicResourceMapping, 0, len(resource.GetClaimResources())) + for _, claimResource := range resource.GetClaimResources() { + draDriverName := claimResource.GetDriverName() + if draDriverName != DRAGPUDriverName { + continue + } + + draPoolName := claimResource.GetPoolName() + draDeviceName := claimResource.GetDeviceName() + + mappingKey, migInfo := m.GetDeviceInfo(draPoolName, draDeviceName) + if mappingKey == "" { + slog.Debug(fmt.Sprintf("No UUID for %s/%s", draPoolName, draDeviceName)) + continue + } + + drInfo := &DynamicResourceInfo{ + ClaimName: resource.GetClaimName(), + ClaimNamespace: resource.GetClaimNamespace(), + DriverName: draDriverName, + PoolName: draPoolName, + DeviceName: draDeviceName, + } + if migInfo != nil { + drInfo.MIGInfo = migInfo + } + + mappings = append(mappings, DynamicResourceMapping{ + MappingKey: mappingKey, + Info: drInfo, + }) + } + + return mappings } + +// GetDynamicResourceInfo converts a DynamicResource into a DynamicResourceInfo and +// resolves the backing GPU or MIG device UUID using the ResourceSlice informer. +// It returns the mapping key (device UUID or parent UUID for MIG devices) and +// the populated DynamicResourceInfo. If the DynamicResource is not for the +// NVIDIA GPU DRA driver or no matching device can be found, it returns "" and nil. +// +// Deprecated behavior: this returns only the first mapping. Prefer +// GetDynamicResourceMappings when a DynamicResource may contain multiple devices. +func (m *DRAResourceSliceManager) GetDynamicResourceInfo(resource *podresourcesapi.DynamicResource) (string, *DynamicResourceInfo) { + mappings := m.GetDynamicResourceMappings(resource) + if len(mappings) == 0 { + return "", nil + } + return mappings[0].MappingKey, mappings[0].Info +} \ No newline at end of file diff --git a/internal/pkg/transformation/dra_test.go b/internal/pkg/transformation/dra_test.go new file mode 100644 index 00000000..25587002 --- /dev/null +++ b/internal/pkg/transformation/dra_test.go @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + package transformation + + import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + resourcev1 "k8s.io/api/resource/v1" + resourcev1beta1 "k8s.io/api/resource/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + ) + + // testInformer is a simple test implementation of SharedIndexInformer + type testInformer struct { + store cache.Store + } + + func (t *testInformer) GetStore() cache.Store { + return t.store + } + + func (t *testInformer) GetIndexer() cache.Indexer { + return t.store.(cache.Indexer) + } + + // newDRAIndexer creates an Indexer with a poolName index matching the production + // informer configuration so tests can exercise GetDeviceInfo without relying on + // informer.AddIndexers. + func newDRAIndexer() cache.Indexer { + return cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + switch rs := obj.(type) { + case *resourcev1.ResourceSlice: + return []string{rs.Spec.Pool.Name}, nil + case *resourcev1beta1.ResourceSlice: + return []string{rs.Spec.Pool.Name}, nil + default: + return nil, nil + } + }, + }) + } + + func (t *testInformer) AddIndexers(indexers cache.Indexers) error { + return nil + } + + func (t *testInformer) GetController() cache.Controller { + return nil + } + + func (t *testInformer) LastSyncResourceVersion() string { + return "" + } + + func (t *testInformer) AddEventHandler(handler cache.ResourceEventHandler) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformer) AddEventHandlerWithResyncPeriod(handler cache.ResourceEventHandler, resyncPeriod time.Duration) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformer) AddEventHandlerWithOptions(handler cache.ResourceEventHandler, options cache.HandlerOptions) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformer) RemoveEventHandler(handle cache.ResourceEventHandlerRegistration) error { + return nil + } + + func (t *testInformer) IsStopped() bool { + return false + } + + func (t *testInformer) SetWatchErrorHandler(handler cache.WatchErrorHandler) error { + return nil + } + + func (t *testInformer) SetWatchErrorHandlerWithContext(handler cache.WatchErrorHandlerWithContext) error { + return nil + } + + func (t *testInformer) SetTransform(handler cache.TransformFunc) error { + return nil + } + + func (t *testInformer) HasSynced() bool { + return true + } + + func (t *testInformer) Run(stopCh <-chan struct{}) { + } + + func (t *testInformer) RunWithContext(ctx context.Context) { + } + + func TestGetDeviceInfo_GPUDevice(t *testing.T) { + // Create a store with a ResourceSlice containing a GPU device + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: store}, + preferredAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + require.NotEmpty(t, uuid, "expected UUID to be found") + assert.Equal(t, "GPU-UUID-0", uuid) + assert.Nil(t, migInfo, "expected no MIG info for GPU device") + } + + func TestGetDeviceInfo_MIGDevice(t *testing.T) { + // Create a store with a ResourceSlice containing a MIG device + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "mig0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: stringPtr("MIG-UUID-0")}, + "profile": {StringValue: stringPtr("1g.10gb")}, + "parentUUID": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: store}, + preferredAPIVersion: "v1", + } + + parentUUID, migInfo := m.GetDeviceInfo("gpu-pool", "mig0") + require.NotEmpty(t, parentUUID, "expected parent UUID to be found") + assert.Equal(t, "GPU-UUID-0", parentUUID) + require.NotNil(t, migInfo, "expected MIG info to be present") + assert.Equal(t, "MIG-UUID-0", migInfo.MIGDeviceUUID) + assert.Equal(t, "1g.10gb", migInfo.Profile) + assert.Equal(t, "GPU-UUID-0", migInfo.ParentUUID) + } + + func TestGetDeviceInfo_NotFound(t *testing.T) { + // Create an empty store + store := newDRAIndexer() + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: store}, + preferredAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID for non-existent device") + assert.Nil(t, migInfo, "expected no MIG info for non-existent device") + } + + func TestGetDeviceInfo_WrongPool(t *testing.T) { + // Create a store with a ResourceSlice in a different pool + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "other-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: store}, + preferredAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when pool doesn't match") + assert.Nil(t, migInfo, "expected no MIG info when pool doesn't match") + } + + func stringPtr(s string) *string { + return &s + } + + // TestVersionSelection_V1Preferred_NoSlices_ReturnEmpty tests that when v1 is preferred, + // we do not fall back to v1beta1. + func TestVersionSelection_V1Preferred_NoSlices_ReturnEmpty(t *testing.T) { + // Create empty v1 store (v1 served but no GPU slices) + v1Store := newDRAIndexer() + + // Create v1beta1 store with GPU slices + v1beta1Store := newDRAIndexer() + v1beta1Slice := &resourcev1beta1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1beta1-slice", + Namespace: "default", + }, + Spec: resourcev1beta1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1beta1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1beta1.Device{ + { + Name: "gpu0", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + }, + } + v1beta1Store.Add(v1beta1Slice) + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: v1Store}, + v1beta1Informer: &testInformer{store: v1beta1Store}, + preferredAPIVersion: "v1", + } + + // GetDeviceInfo should not fall back to v1beta1 when v1 is preferred + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when preferred version has no slices") + assert.Nil(t, migInfo, "expected no MIG info for GPU device") + } + + // TestVersionSelection_BothServedAndBothHaveObjects_PreferV1 tests that when both + // v1 and v1beta1 are served and both have objects, we prefer v1. + func TestVersionSelection_BothServedAndBothHaveObjects_PreferV1(t *testing.T) { + // Create v1 store with GPU slices + v1Store := newDRAIndexer() + v1Slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-V1")}, + }, + }, + }, + }, + } + v1Store.Add(v1Slice) + + // Create v1beta1 store with GPU slices + v1beta1Store := newDRAIndexer() + v1beta1Slice := &resourcev1beta1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1beta1-slice", + Namespace: "default", + }, + Spec: resourcev1beta1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1beta1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1beta1.Device{ + { + Name: "gpu0", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-V1BETA1")}, + }, + }, + }, + }, + }, + } + v1beta1Store.Add(v1beta1Slice) + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: v1Store}, + v1beta1Informer: &testInformer{store: v1beta1Store}, + preferredAPIVersion: "v1", + } + + // GetDeviceInfo should prefer v1 since both have slices + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + require.NotEmpty(t, uuid, "expected UUID to be found from v1") + assert.Equal(t, "GPU-UUID-V1", uuid, "should prefer v1 when both have slices") + assert.Nil(t, migInfo, "expected no MIG info for GPU device") + } + + func TestGetDeviceInfo_InvalidPreferredVersion_ReturnsEmpty(t *testing.T) { + v1Store := newDRAIndexer() + v1Slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-V1")}, + }, + }, + }, + }, + } + v1Store.Add(v1Slice) + + v1beta1Store := newDRAIndexer() + v1beta1Slice := &resourcev1beta1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1beta1-slice", + Namespace: "default", + }, + Spec: resourcev1beta1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1beta1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1beta1.Device{ + { + Name: "gpu0", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-V1BETA1")}, + }, + }, + }, + }, + }, + } + v1beta1Store.Add(v1beta1Slice) + + m := &DRAResourceSliceManager{ + v1Informer: &testInformer{store: v1Store}, + v1beta1Informer: &testInformer{store: v1beta1Store}, + preferredAPIVersion: "invalid", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when preferred version is invalid") + assert.Nil(t, migInfo, "expected no MIG info when preferred version is invalid") + } \ No newline at end of file diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 5f9f6324..7e15cec1 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -145,8 +145,12 @@ func NewPodMapper(c *appconfig.Config) *PodMapper { slog.Warn("Failed to get DRAResourceSliceManager, DRA pod labels will not be available", "error", err) return podMapper } + if resourceSliceManager == nil { + slog.Info("DRAResourceSliceManager not started (no NVIDIA DRA ResourceSlices found)") + return podMapper + } podMapper.ResourceSliceManager = resourceSliceManager - slog.Info("Started DRAResourceSliceManager") + slog.Info("Started DRAResourceSliceManager with auto-detected API version") } return podMapper } @@ -589,17 +593,10 @@ func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourc "containerName", cntName) if dynamicResources := container.GetDynamicResources(); len(dynamicResources) > 0 && p.ResourceSliceManager != nil { for _, dr := range dynamicResources { - for _, claimResource := range dr.GetClaimResources() { - draDriverName := claimResource.GetDriverName() - if draDriverName != DRAGPUDriverName { - continue - } - draPoolName := claimResource.GetPoolName() - draDeviceName := claimResource.GetDeviceName() - - mappingKey, migInfo := p.ResourceSliceManager.GetDeviceInfo(draPoolName, draDeviceName) - if mappingKey == "" { - slog.Debug(fmt.Sprintf("No UUID for %s/%s", draPoolName, draDeviceName)) + for _, mapping := range p.ResourceSliceManager.GetDynamicResourceMappings(dr) { + mappingKey := mapping.MappingKey + drInfo := mapping.Info + if mappingKey == "" || drInfo == nil { continue } @@ -615,21 +612,12 @@ func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourc if processedPods[mappingKey][podContainerKey] { continue } - podInfo := p.createPodInfo(pod, container) - drInfo := DynamicResourceInfo{ - ClaimName: dr.GetClaimName(), - ClaimNamespace: dr.GetClaimNamespace(), - DriverName: draDriverName, - PoolName: draPoolName, - DeviceName: draDeviceName, - } - if migInfo != nil { - drInfo.MIGInfo = migInfo + if drInfo.MIGInfo != nil { slog.Debug("Added MIG pod mapping", "parentUUID", mappingKey, - "migDevice", migInfo.MIGDeviceUUID, - "migProfile", migInfo.Profile, + "migDevice", drInfo.MIGInfo.MIGDeviceUUID, + "migProfile", drInfo.MIGInfo.Profile, "pod", podContainerKey) } else { slog.Debug("Added GPU pod mapping", @@ -637,7 +625,7 @@ func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourc "pod", podContainerKey) } - podInfo.DynamicResources = &drInfo + podInfo.DynamicResources = drInfo deviceToPodsMap[mappingKey] = append(deviceToPodsMap[mappingKey], podInfo) processedPods[mappingKey][podContainerKey] = true } diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index 09e654a7..99de0ed7 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -31,6 +31,8 @@ import ( "go.uber.org/mock/gomock" "google.golang.org/grpc" v1 "k8s.io/api/core/v1" + resourcev1 "k8s.io/api/resource/v1" + resourcev1beta1 "k8s.io/api/resource/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/informers" @@ -675,11 +677,222 @@ func TestPodDRAInfo(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + // Create an indexer with ResourceSlice objects based on test case. + // We use the same poolName index as the production informer. + store := newDRAIndexer() + if len(tc.deviceToUUID) > 0 || len(tc.migDevices) > 0 { + // Create a ResourceSlice with the device from the test case + devices := []resourcev1.Device{} + if uuid, exists := tc.deviceToUUID["poolA/gpu-x"]; exists { + if migInfo, isMIG := tc.migDevices["poolA/gpu-x"]; isMIG { + // MIG device + devices = append(devices, resourcev1.Device{ + Name: "gpu-x", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: &migInfo.MIGDeviceUUID}, + "profile": {StringValue: &migInfo.Profile}, + "parentUUID": {StringValue: &migInfo.ParentUUID}, + }, + }) + } else { + // GPU device + devices = append(devices, resourcev1.Device{ + Name: "gpu-x", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: &uuid}, + }, + }) + } + } + if len(devices) > 0 { + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "poolA", + }, + Devices: devices, + }, + } + store.Add(slice) + } + } + + // Create test informer backed by the indexer. + testInformer := &testInformerForDRA{store: store} draMgr := &DRAResourceSliceManager{ - deviceToUUID: tc.deviceToUUID, - migDevices: tc.migDevices, + v1Informer: testInformer, + preferredAPIVersion: "v1", + } + + pm := &PodMapper{ + Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, + ResourceSliceManager: draMgr, + } + + resp := &podresourcesapi.ListPodResourcesResponse{ + PodResources: []*podresourcesapi.PodResources{{ + Name: "pod1", + Namespace: "default", + Containers: []*podresourcesapi.ContainerResources{{ + Name: "ctr1", + DynamicResources: []*podresourcesapi.DynamicResource{dra}, + }}, + }}, + } + + got := pm.toDeviceToPodsDRA(resp) + + assert.Len(t, got, len(tc.wantUUIDs), "map size") + for _, want := range tc.wantUUIDs { + assert.Contains(t, got, want, "expected key %q", want) + } + + if len(tc.wantUUIDs) == 1 { + pi := got[tc.wantUUIDs[0]] + require.Len(t, pi, 1, "should have one pod info") + + dr := *pi[0].DynamicResources + require.NotNil(t, dr, "dynamic resources should not be nil") + + assert.Equal(t, "claim1", dr.ClaimName) + assert.Equal(t, "ns1", dr.ClaimNamespace) + assert.Equal(t, DRAGPUDriverName, dr.DriverName) + assert.Equal(t, "poolA", dr.PoolName) + assert.Equal(t, "gpu-x", dr.DeviceName) + + if tc.isMIG { + require.NotNil(t, dr.MIGInfo, "MIG info should not be nil for MIG device") + assert.Equal(t, "MIG-12345", dr.MIGInfo.MIGDeviceUUID) + assert.Equal(t, "1g.12gb", dr.MIGInfo.Profile) + assert.Equal(t, "GPU-parent-uuid", dr.MIGInfo.ParentUUID) + } else { + assert.Nil(t, dr.MIGInfo, "MIG info should be nil for full GPU device") + } + } + }) + } +} + +func TestPodDRAInfo_V1beta1Preferred(t *testing.T) { + dra := &podresourcesapi.DynamicResource{ + ClaimName: "claim1", + ClaimNamespace: "ns1", + ClaimResources: []*podresourcesapi.ClaimResource{{ + DriverName: DRAGPUDriverName, + PoolName: "poolA", + DeviceName: "gpu-x", + }}, + } + + tests := []struct { + name string + deviceToUUID map[string]string + migDevices map[string]*DRAMigDeviceInfo + wantUUIDs []string + isMIG bool + }{ + { + name: "uuid-exists", + deviceToUUID: map[string]string{"poolA/gpu-x": "GPU-8a748984-0fe7-297f-916c-4b998ce202d1"}, + migDevices: map[string]*DRAMigDeviceInfo{}, + wantUUIDs: []string{"GPU-8a748984-0fe7-297f-916c-4b998ce202d1"}, + isMIG: false, + }, + { + name: "uuid-updated", + deviceToUUID: map[string]string{"poolA/gpu-x": "GPU-UUID-Updated"}, + migDevices: map[string]*DRAMigDeviceInfo{}, + wantUUIDs: []string{"GPU-UUID-Updated"}, + isMIG: false, + }, + { + name: "no-uuid", + deviceToUUID: map[string]string{}, + migDevices: map[string]*DRAMigDeviceInfo{}, + wantUUIDs: nil, + isMIG: false, + }, + { + name: "mig-device", + deviceToUUID: map[string]string{"poolA/gpu-x": "MIG-12345"}, + migDevices: map[string]*DRAMigDeviceInfo{ + "poolA/gpu-x": { + MIGDeviceUUID: "MIG-12345", + Profile: "1g.12gb", + ParentUUID: "GPU-parent-uuid", + }, + }, + wantUUIDs: []string{"GPU-parent-uuid"}, // Should map to parent UUID + isMIG: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // Create an indexer with v1beta1 ResourceSlice objects based on test case. + // We use the same poolName index as the production informer. + store := newDRAIndexer() + if len(tc.deviceToUUID) > 0 || len(tc.migDevices) > 0 { + devices := []resourcev1beta1.Device{} + if uuid, exists := tc.deviceToUUID["poolA/gpu-x"]; exists { + if migInfo, isMIG := tc.migDevices["poolA/gpu-x"]; isMIG { + // MIG device + devices = append(devices, resourcev1beta1.Device{ + Name: "gpu-x", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: &migInfo.MIGDeviceUUID}, + "profile": {StringValue: &migInfo.Profile}, + "parentUUID": {StringValue: &migInfo.ParentUUID}, + }, + }, + }) + } else { + // GPU device + devices = append(devices, resourcev1beta1.Device{ + Name: "gpu-x", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: &uuid}, + }, + }, + }) + } + } + + if len(devices) > 0 { + slice := &resourcev1beta1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1beta1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1beta1.ResourcePool{ + Name: "poolA", + }, + Devices: devices, + }, + } + store.Add(slice) + } } + // Create test informer backed by the indexer. + testInformer := &testInformerForDRA{store: store} + draMgr := &DRAResourceSliceManager{ + v1beta1Informer: testInformer, + preferredAPIVersion: "v1beta1", + } pm := &PodMapper{ Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, ResourceSliceManager: draMgr, @@ -729,6 +942,84 @@ func TestPodDRAInfo(t *testing.T) { } } +func TestPodDRAInfo_MultipleClaimResources(t *testing.T) { + dra := &podresourcesapi.DynamicResource{ + ClaimName: "claim1", + ClaimNamespace: "ns1", + ClaimResources: []*podresourcesapi.ClaimResource{ + { + DriverName: DRAGPUDriverName, + PoolName: "poolA", + DeviceName: "gpu-x", + }, + { + DriverName: DRAGPUDriverName, + PoolName: "poolA", + DeviceName: "gpu-y", + }, + }, + } + + store := newDRAIndexer() + store.Add(&resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "poolA", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu-x", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-X")}, + }, + }, + { + Name: "gpu-y", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-Y")}, + }, + }, + }, + }, + }) + + testInformer := &testInformerForDRA{store: store} + draMgr := &DRAResourceSliceManager{ + v1Informer: testInformer, + preferredAPIVersion: "v1", + } + + pm := &PodMapper{ + Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, + ResourceSliceManager: draMgr, + } + + resp := &podresourcesapi.ListPodResourcesResponse{ + PodResources: []*podresourcesapi.PodResources{{ + Name: "pod1", + Namespace: "default", + Containers: []*podresourcesapi.ContainerResources{{ + Name: "ctr1", + DynamicResources: []*podresourcesapi.DynamicResource{dra}, + }}, + }}, + } + + got := pm.toDeviceToPodsDRA(resp) + + assert.Contains(t, got, "GPU-UUID-X") + assert.Contains(t, got, "GPU-UUID-Y") + assert.Len(t, got["GPU-UUID-X"], 1) + assert.Len(t, got["GPU-UUID-Y"], 1) +} + func TestProcessPodMapper_WithUID(t *testing.T) { testutils.RequireLinux(t) diff --git a/internal/pkg/transformation/test_helpers_test.go b/internal/pkg/transformation/test_helpers_test.go new file mode 100644 index 00000000..29296d64 --- /dev/null +++ b/internal/pkg/transformation/test_helpers_test.go @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + package transformation + + import ( + "context" + "time" + + "k8s.io/client-go/tools/cache" + ) + + // testInformerForDRA is a minimal SharedIndexInformer implementation for tests that + // want to inject a pre-populated cache.Indexer (matching the production indexers). + type testInformerForDRA struct { + store cache.Store + } + + func (t *testInformerForDRA) GetStore() cache.Store { return t.store } + + func (t *testInformerForDRA) GetIndexer() cache.Indexer { return t.store.(cache.Indexer) } + + func (t *testInformerForDRA) AddIndexers(indexers cache.Indexers) error { return nil } + + func (t *testInformerForDRA) GetController() cache.Controller { return nil } + + func (t *testInformerForDRA) LastSyncResourceVersion() string { return "" } + + func (t *testInformerForDRA) AddEventHandler(handler cache.ResourceEventHandler) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformerForDRA) AddEventHandlerWithResyncPeriod(handler cache.ResourceEventHandler, resyncPeriod time.Duration) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformerForDRA) AddEventHandlerWithOptions(handler cache.ResourceEventHandler, options cache.HandlerOptions) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformerForDRA) RemoveEventHandler(handle cache.ResourceEventHandlerRegistration) error { return nil } + + func (t *testInformerForDRA) IsStopped() bool { return false } + + func (t *testInformerForDRA) SetWatchErrorHandler(handler cache.WatchErrorHandler) error { return nil } + + func (t *testInformerForDRA) SetWatchErrorHandlerWithContext(handler cache.WatchErrorHandlerWithContext) error { + return nil + } + + func (t *testInformerForDRA) SetTransform(handler cache.TransformFunc) error { return nil } + + func (t *testInformerForDRA) HasSynced() bool { return true } + + func (t *testInformerForDRA) Run(stopCh <-chan struct{}) {} + + func (t *testInformerForDRA) RunWithContext(ctx context.Context) {} + \ No newline at end of file diff --git a/internal/pkg/transformation/types.go b/internal/pkg/transformation/types.go index 7d572df6..581c08e3 100644 --- a/internal/pkg/transformation/types.go +++ b/internal/pkg/transformation/types.go @@ -77,12 +77,15 @@ type PodInfo struct { } type DRAResourceSliceManager struct { - factory informers.SharedInformerFactory - informer cache.SharedIndexInformer - cancelContext context.CancelFunc - mu sync.RWMutex - deviceToUUID map[string]string // pool/device -> UUID (for full GPUs) - migDevices map[string]*DRAMigDeviceInfo // pool/device -> MIG info (for MIG devices) + factory informers.SharedInformerFactory + v1Informer cache.SharedIndexInformer + v1beta1Informer cache.SharedIndexInformer + // preferredAPIVersion is selected during initialization: + // - "v1" if v1 has NVIDIA DRA ResourceSlices + // - "v1beta1" if v1 does not, but v1beta1 does + preferredAPIVersion string + cancelContext context.CancelFunc + mu sync.RWMutex } // PodMetadata holds pod metadata from API server From 6b4bc4a70c4458bb885b6837fba14ddadd855f01 Mon Sep 17 00:00:00 2001 From: Aditya Singh Date: Sun, 3 May 2026 16:27:12 +0000 Subject: [PATCH 2/2] fix(dra): address review feedback for ResourceSlice manager --- internal/pkg/transformation/dra.go | 251 ++++--- internal/pkg/transformation/dra_test.go | 627 ++++++------------ .../pkg/transformation/kubernetes_test.go | 12 +- internal/pkg/transformation/types.go | 14 +- 4 files changed, 349 insertions(+), 555 deletions(-) diff --git a/internal/pkg/transformation/dra.go b/internal/pkg/transformation/dra.go index 9efcb21d..9b20c2cd 100644 --- a/internal/pkg/transformation/dra.go +++ b/internal/pkg/transformation/dra.go @@ -25,6 +25,7 @@ import ( resourcev1 "k8s.io/api/resource/v1" resourcev1beta1 "k8s.io/api/resource/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" @@ -146,6 +147,38 @@ func supportsResourceSliceGV(client kubernetes.Interface, groupVersion string) b return false } +// hasNvidiaDRASlices reports whether the cluster currently exposes any +// NVIDIA GPU DRA ResourceSlices on the given API version. +func hasNvidiaDRASlices(ctx context.Context, client kubernetes.Interface, apiVersion string) (bool, error) { + switch apiVersion { + case "v1": + list, err := client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, fmt.Errorf("listing v1 ResourceSlices: %w", err) + } + for i := range list.Items { + s := &list.Items[i] + if s.Spec.Driver == DRAGPUDriverName && len(s.Spec.Devices) > 0 { + return true, nil + } + } + case "v1beta1": + list, err := client.ResourceV1beta1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, fmt.Errorf("listing v1beta1 ResourceSlices: %w", err) + } + for i := range list.Items { + s := &list.Items[i] + if s.Spec.Driver == DRAGPUDriverName && len(s.Spec.Devices) > 0 { + return true, nil + } + } + default: + return false, fmt.Errorf("unsupported ResourceSlice API version: %q", apiVersion) + } + return false, nil +} + // NewDRAResourceSliceManager creates a new DRA ResourceSlice manager. // The API version is auto-detected by checking which version has NVIDIA DRA ResourceSlices. func NewDRAResourceSliceManager() (*DRAResourceSliceManager, error) { @@ -172,28 +205,20 @@ func NewDRAResourceSliceManager() (*DRAResourceSliceManager, error) { ctx := context.Background() v1HasNvidiaSlices := false if v1Served { - resourceSlicesList, err := client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + has, err := hasNvidiaDRASlices(ctx, client, "v1") if err != nil { - return nil, fmt.Errorf("failed to list ResourceSlices for v1: %v", err) - } - items := make([]interface{}, 0, len(resourceSlicesList.Items)) - for i := range resourceSlicesList.Items { - items = append(items, &resourceSlicesList.Items[i]) + return nil, err } - v1HasNvidiaSlices = countGPUSlices(items) > 0 + v1HasNvidiaSlices = has } v1beta1HasNvidiaSlices := false if v1beta1Served { - resourceSlicesList, err := client.ResourceV1beta1().ResourceSlices().List(ctx, metav1.ListOptions{}) + has, err := hasNvidiaDRASlices(ctx, client, "v1beta1") if err != nil { - return nil, fmt.Errorf("failed to list ResourceSlices for v1beta1: %v", err) + return nil, err } - items := make([]interface{}, 0, len(resourceSlicesList.Items)) - for i := range resourceSlicesList.Items { - items = append(items, &resourceSlicesList.Items[i]) - } - v1beta1HasNvidiaSlices = countGPUSlices(items) > 0 + v1beta1HasNvidiaSlices = has } var selected string @@ -244,151 +269,124 @@ func NewDRAResourceSliceManager() (*DRAResourceSliceManager, error) { } m := &DRAResourceSliceManager{ - factory: factory, - preferredAPIVersion: selected, - } - if selected == "v1" { - m.v1Informer = informer - } else { - m.v1beta1Informer = informer + factory: factory, + informer: informer, + sliceAPIVersion: selected, } - ctx, cancel := context.WithCancel(context.Background()) - m.cancelContext = cancel - factory.Start(ctx.Done()) + factory.Start(wait.NeverStop) // Wait for cache sync on the selected informer. - synced := cache.WaitForCacheSync(ctx.Done(), informer.HasSynced) + synced := cache.WaitForCacheSync(wait.NeverStop, informer.HasSynced) if !synced { - cancel() + factory.Shutdown() return nil, fmt.Errorf("ResourceSlice informer cache sync failed") } - + slog.Info("ResourceSlice API informer synced successfully", "apiVersion", selected) return m, nil } func (m *DRAResourceSliceManager) Stop() { - if m.cancelContext != nil { - m.cancelContext() - } - // Ensure factory informers are fully stopped if m.factory != nil { m.factory.Shutdown() } } -// countGPUSlices counts the number of ResourceSlice objects with GPU devices -// (matching the DRAGPUDriverName) in the given items. -func countGPUSlices(items []interface{}) int { - count := 0 - for _, item := range items { - switch obj := item.(type) { - case *resourcev1.ResourceSlice: - if obj.Spec.Driver == DRAGPUDriverName && len(obj.Spec.Devices) > 0 { - count++ - } - case *resourcev1beta1.ResourceSlice: - if obj.Spec.Driver == DRAGPUDriverName && len(obj.Spec.Devices) > 0 { - count++ - } - } - } - return count -} - func (m *DRAResourceSliceManager) getV1DeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { - if m.v1Informer == nil { + if m.informer == nil { return "", nil } - items, err := m.v1Informer.GetIndexer().ByIndex("poolName", pool) + items, err := m.informer.GetIndexer().ByIndex("poolName", pool) if err != nil { slog.Error(fmt.Sprintf("Error listing v1 ResourceSlices by pool index for pool %s: %v", pool, err)) return "", nil } - return m.getDeviceInfoFromResourceSliceItems(pool, device, items) + for _, item := range items { + rs, ok := item.(*resourcev1.ResourceSlice) + if !ok { + continue + } + if rs.Spec.Driver != DRAGPUDriverName { + continue + } + adapter := &v1ResourceSliceAdapter{slice: rs} + if mappingKey, migInfo := lookupDRADeviceInAdapter(pool, device, adapter); mappingKey != "" { + return mappingKey, migInfo + } + } + + slog.Debug(fmt.Sprintf("No UUID found for pool %s, device %s", pool, device)) + return "", nil } func (m *DRAResourceSliceManager) getV1beta1DeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { - if m.v1beta1Informer == nil { + if m.informer == nil { return "", nil } - items, err := m.v1beta1Informer.GetIndexer().ByIndex("poolName", pool) + items, err := m.informer.GetIndexer().ByIndex("poolName", pool) if err != nil { slog.Error(fmt.Sprintf("Error listing v1beta1 ResourceSlices by pool index for pool %s: %v", pool, err)) return "", nil } - return m.getDeviceInfoFromResourceSliceItems(pool, device, items) -} - -// getDeviceInfoFromResourceSliceItems resolves device UUIDs/MIG info from a set of -// ResourceSlice objects. It does not select an API version — callers already do -// that by choosing which informer indexer to query. -func (m *DRAResourceSliceManager) getDeviceInfoFromResourceSliceItems(pool, device string, items []interface{}) (string, *DRAMigDeviceInfo) { - // Search for the device in the selected slices for _, item := range items { - var adapter resourceSliceAdapter - switch obj := item.(type) { - case *resourcev1.ResourceSlice: - // NOTE: dcgm-exporter's DRA handling currently assumes the schema used by - // the NVIDIA GPU DRA driver (for example, "type", "uuid", "parentUUID", "profile" - // attributes). Other GPU DRA drivers with different schemas may not work - // correctly with this implementation. - if obj.Spec.Driver != DRAGPUDriverName { - continue - } - adapter = &v1ResourceSliceAdapter{slice: obj} - case *resourcev1beta1.ResourceSlice: - if obj.Spec.Driver != DRAGPUDriverName { - continue - } - adapter = &v1beta1ResourceSliceAdapter{slice: obj} - default: + rs, ok := item.(*resourcev1beta1.ResourceSlice) + if !ok { + continue + } + if rs.Spec.Driver != DRAGPUDriverName { continue } + adapter := &v1beta1ResourceSliceAdapter{slice: rs} + if mappingKey, migInfo := lookupDRADeviceInAdapter(pool, device, adapter); mappingKey != "" { + return mappingKey, migInfo + } + } - // Search for the device in this slice - for _, dev := range adapter.GetDevices() { - if !dev.HasAttributes() { - continue - } - if dev.GetName() != device { - continue - } + slog.Debug(fmt.Sprintf("No UUID found for pool %s, device %s", pool, device)) + return "", nil +} - deviceType := dev.GetAttribute("type") - switch deviceType { - case "mig": - parentUUID := dev.GetAttribute("parentUUID") - profile := dev.GetAttribute("profile") - migUUID := dev.GetAttribute("uuid") - if parentUUID != "" { - migInfo := &DRAMigDeviceInfo{ - MIGDeviceUUID: migUUID, - Profile: profile, - ParentUUID: parentUUID, - } - slog.Debug(fmt.Sprintf("Found MIG device %s/%s with parent UUID: %s", pool, device, parentUUID)) - return parentUUID, migInfo - } - case "gpu": - uuid := dev.GetAttribute("uuid") - if uuid != "" { - slog.Debug(fmt.Sprintf("Found GPU device %s/%s with UUID: %s", pool, device, uuid)) - return uuid, nil +// lookupDRADeviceInAdapter applies NVIDIA GPU DRA driver device attributes ("type", +// "uuid", "parentUUID", "profile"). Other drivers with different schemas may not work. +func lookupDRADeviceInAdapter(pool, device string, adapter resourceSliceAdapter) (string, *DRAMigDeviceInfo) { + for _, dev := range adapter.GetDevices() { + if !dev.HasAttributes() { + continue + } + if dev.GetName() != device { + continue + } + + deviceType := dev.GetAttribute("type") + switch deviceType { + case "mig": + parentUUID := dev.GetAttribute("parentUUID") + profile := dev.GetAttribute("profile") + migUUID := dev.GetAttribute("uuid") + if parentUUID != "" { + migInfo := &DRAMigDeviceInfo{ + MIGDeviceUUID: migUUID, + Profile: profile, + ParentUUID: parentUUID, } - default: - // Log unknown device types to help users understand why a device might not be handled - slog.Warn(fmt.Sprintf("Device [%s/%s] has unknown type: %s", pool, device, deviceType)) + slog.Debug(fmt.Sprintf("Found MIG device %s/%s with parent UUID: %s", pool, device, parentUUID)) + return parentUUID, migInfo + } + case "gpu": + uuid := dev.GetAttribute("uuid") + if uuid != "" { + slog.Debug(fmt.Sprintf("Found GPU device %s/%s with UUID: %s", pool, device, uuid)) + return uuid, nil } + default: + slog.Warn(fmt.Sprintf("Device [%s/%s] has unknown type: %s", pool, device, deviceType)) } } - - slog.Debug(fmt.Sprintf("No UUID found for pool %s, device %s", pool, device)) return "", nil } @@ -398,16 +396,19 @@ func (m *DRAResourceSliceManager) getDeviceInfoFromResourceSliceItems(pool, devi // For MIG devices: returns (parentUUID, *DRAMigDeviceInfo) // For full GPUs: returns (deviceUUID, nil) func (m *DRAResourceSliceManager) GetDeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { - m.mu.RLock() - defer m.mu.RUnlock() + if m.informer == nil { + return "", nil + } - switch m.preferredAPIVersion { + switch m.sliceAPIVersion { case "v1": return m.getV1DeviceInfo(pool, device) case "v1beta1": return m.getV1beta1DeviceInfo(pool, device) default: - slog.Error("Unsupported preferred ResourceSlice API version", "apiVersion", m.preferredAPIVersion) + if m.sliceAPIVersion != "" { + slog.Error("Unsupported ResourceSlice API version", "apiVersion", m.sliceAPIVersion) + } return "", nil } } @@ -463,19 +464,3 @@ func (m *DRAResourceSliceManager) GetDynamicResourceMappings(resource *podresour return mappings } - -// GetDynamicResourceInfo converts a DynamicResource into a DynamicResourceInfo and -// resolves the backing GPU or MIG device UUID using the ResourceSlice informer. -// It returns the mapping key (device UUID or parent UUID for MIG devices) and -// the populated DynamicResourceInfo. If the DynamicResource is not for the -// NVIDIA GPU DRA driver or no matching device can be found, it returns "" and nil. -// -// Deprecated behavior: this returns only the first mapping. Prefer -// GetDynamicResourceMappings when a DynamicResource may contain multiple devices. -func (m *DRAResourceSliceManager) GetDynamicResourceInfo(resource *podresourcesapi.DynamicResource) (string, *DynamicResourceInfo) { - mappings := m.GetDynamicResourceMappings(resource) - if len(mappings) == 0 { - return "", nil - } - return mappings[0].MappingKey, mappings[0].Info -} \ No newline at end of file diff --git a/internal/pkg/transformation/dra_test.go b/internal/pkg/transformation/dra_test.go index 25587002..8ee60f0f 100644 --- a/internal/pkg/transformation/dra_test.go +++ b/internal/pkg/transformation/dra_test.go @@ -14,410 +14,225 @@ * limitations under the License. */ - package transformation +package transformation - import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - resourcev1 "k8s.io/api/resource/v1" - resourcev1beta1 "k8s.io/api/resource/v1beta1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/cache" - ) - - // testInformer is a simple test implementation of SharedIndexInformer - type testInformer struct { - store cache.Store - } - - func (t *testInformer) GetStore() cache.Store { - return t.store - } - - func (t *testInformer) GetIndexer() cache.Indexer { - return t.store.(cache.Indexer) - } - - // newDRAIndexer creates an Indexer with a poolName index matching the production - // informer configuration so tests can exercise GetDeviceInfo without relying on - // informer.AddIndexers. - func newDRAIndexer() cache.Indexer { - return cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ - "poolName": func(obj interface{}) ([]string, error) { - switch rs := obj.(type) { - case *resourcev1.ResourceSlice: - return []string{rs.Spec.Pool.Name}, nil - case *resourcev1beta1.ResourceSlice: - return []string{rs.Spec.Pool.Name}, nil - default: - return nil, nil - } - }, - }) - } - - func (t *testInformer) AddIndexers(indexers cache.Indexers) error { - return nil - } - - func (t *testInformer) GetController() cache.Controller { - return nil - } - - func (t *testInformer) LastSyncResourceVersion() string { - return "" - } - - func (t *testInformer) AddEventHandler(handler cache.ResourceEventHandler) (cache.ResourceEventHandlerRegistration, error) { - return nil, nil - } - - func (t *testInformer) AddEventHandlerWithResyncPeriod(handler cache.ResourceEventHandler, resyncPeriod time.Duration) (cache.ResourceEventHandlerRegistration, error) { - return nil, nil - } - - func (t *testInformer) AddEventHandlerWithOptions(handler cache.ResourceEventHandler, options cache.HandlerOptions) (cache.ResourceEventHandlerRegistration, error) { - return nil, nil - } - - func (t *testInformer) RemoveEventHandler(handle cache.ResourceEventHandlerRegistration) error { - return nil - } - - func (t *testInformer) IsStopped() bool { - return false - } - - func (t *testInformer) SetWatchErrorHandler(handler cache.WatchErrorHandler) error { - return nil - } - - func (t *testInformer) SetWatchErrorHandlerWithContext(handler cache.WatchErrorHandlerWithContext) error { - return nil - } - - func (t *testInformer) SetTransform(handler cache.TransformFunc) error { - return nil - } - - func (t *testInformer) HasSynced() bool { - return true - } - - func (t *testInformer) Run(stopCh <-chan struct{}) { - } - - func (t *testInformer) RunWithContext(ctx context.Context) { - } - - func TestGetDeviceInfo_GPUDevice(t *testing.T) { - // Create a store with a ResourceSlice containing a GPU device - store := newDRAIndexer() - slice := &resourcev1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-slice", - Namespace: "default", - }, - Spec: resourcev1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1.Device{ - { - Name: "gpu0", - Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-0")}, - }, - }, - }, - }, - } - store.Add(slice) - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: store}, - preferredAPIVersion: "v1", - } - - uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") - require.NotEmpty(t, uuid, "expected UUID to be found") - assert.Equal(t, "GPU-UUID-0", uuid) - assert.Nil(t, migInfo, "expected no MIG info for GPU device") - } - - func TestGetDeviceInfo_MIGDevice(t *testing.T) { - // Create a store with a ResourceSlice containing a MIG device - store := newDRAIndexer() - slice := &resourcev1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-slice", - Namespace: "default", - }, - Spec: resourcev1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1.Device{ - { - Name: "mig0", - Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ - "type": {StringValue: stringPtr("mig")}, - "uuid": {StringValue: stringPtr("MIG-UUID-0")}, - "profile": {StringValue: stringPtr("1g.10gb")}, - "parentUUID": {StringValue: stringPtr("GPU-UUID-0")}, - }, - }, - }, - }, - } - store.Add(slice) - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: store}, - preferredAPIVersion: "v1", - } - - parentUUID, migInfo := m.GetDeviceInfo("gpu-pool", "mig0") - require.NotEmpty(t, parentUUID, "expected parent UUID to be found") - assert.Equal(t, "GPU-UUID-0", parentUUID) - require.NotNil(t, migInfo, "expected MIG info to be present") - assert.Equal(t, "MIG-UUID-0", migInfo.MIGDeviceUUID) - assert.Equal(t, "1g.10gb", migInfo.Profile) - assert.Equal(t, "GPU-UUID-0", migInfo.ParentUUID) - } - - func TestGetDeviceInfo_NotFound(t *testing.T) { - // Create an empty store - store := newDRAIndexer() - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: store}, - preferredAPIVersion: "v1", - } - - uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") - assert.Empty(t, uuid, "expected no UUID for non-existent device") - assert.Nil(t, migInfo, "expected no MIG info for non-existent device") - } - - func TestGetDeviceInfo_WrongPool(t *testing.T) { - // Create a store with a ResourceSlice in a different pool - store := newDRAIndexer() - slice := &resourcev1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-slice", - Namespace: "default", - }, - Spec: resourcev1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1.ResourcePool{ - Name: "other-pool", - }, - Devices: []resourcev1.Device{ - { - Name: "gpu0", - Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-0")}, - }, - }, - }, - }, - } - store.Add(slice) - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: store}, - preferredAPIVersion: "v1", - } - - uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") - assert.Empty(t, uuid, "expected no UUID when pool doesn't match") - assert.Nil(t, migInfo, "expected no MIG info when pool doesn't match") - } - - func stringPtr(s string) *string { - return &s - } - - // TestVersionSelection_V1Preferred_NoSlices_ReturnEmpty tests that when v1 is preferred, - // we do not fall back to v1beta1. - func TestVersionSelection_V1Preferred_NoSlices_ReturnEmpty(t *testing.T) { - // Create empty v1 store (v1 served but no GPU slices) - v1Store := newDRAIndexer() - - // Create v1beta1 store with GPU slices - v1beta1Store := newDRAIndexer() - v1beta1Slice := &resourcev1beta1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "v1beta1-slice", - Namespace: "default", - }, - Spec: resourcev1beta1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1beta1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1beta1.Device{ - { - Name: "gpu0", - Basic: &resourcev1beta1.BasicDevice{ - Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-0")}, - }, - }, - }, - }, - }, - } - v1beta1Store.Add(v1beta1Slice) - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: v1Store}, - v1beta1Informer: &testInformer{store: v1beta1Store}, - preferredAPIVersion: "v1", - } - - // GetDeviceInfo should not fall back to v1beta1 when v1 is preferred - uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") - assert.Empty(t, uuid, "expected no UUID when preferred version has no slices") - assert.Nil(t, migInfo, "expected no MIG info for GPU device") - } - - // TestVersionSelection_BothServedAndBothHaveObjects_PreferV1 tests that when both - // v1 and v1beta1 are served and both have objects, we prefer v1. - func TestVersionSelection_BothServedAndBothHaveObjects_PreferV1(t *testing.T) { - // Create v1 store with GPU slices - v1Store := newDRAIndexer() - v1Slice := &resourcev1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "v1-slice", - Namespace: "default", - }, - Spec: resourcev1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1.Device{ - { - Name: "gpu0", - Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-V1")}, - }, - }, - }, - }, - } - v1Store.Add(v1Slice) - - // Create v1beta1 store with GPU slices - v1beta1Store := newDRAIndexer() - v1beta1Slice := &resourcev1beta1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "v1beta1-slice", - Namespace: "default", - }, - Spec: resourcev1beta1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1beta1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1beta1.Device{ - { - Name: "gpu0", - Basic: &resourcev1beta1.BasicDevice{ - Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-V1BETA1")}, - }, - }, - }, - }, - }, - } - v1beta1Store.Add(v1beta1Slice) - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: v1Store}, - v1beta1Informer: &testInformer{store: v1beta1Store}, - preferredAPIVersion: "v1", - } - - // GetDeviceInfo should prefer v1 since both have slices - uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") - require.NotEmpty(t, uuid, "expected UUID to be found from v1") - assert.Equal(t, "GPU-UUID-V1", uuid, "should prefer v1 when both have slices") - assert.Nil(t, migInfo, "expected no MIG info for GPU device") - } - - func TestGetDeviceInfo_InvalidPreferredVersion_ReturnsEmpty(t *testing.T) { - v1Store := newDRAIndexer() - v1Slice := &resourcev1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "v1-slice", - Namespace: "default", - }, - Spec: resourcev1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1.Device{ - { - Name: "gpu0", - Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-V1")}, - }, - }, - }, - }, - } - v1Store.Add(v1Slice) - - v1beta1Store := newDRAIndexer() - v1beta1Slice := &resourcev1beta1.ResourceSlice{ - ObjectMeta: metav1.ObjectMeta{ - Name: "v1beta1-slice", - Namespace: "default", - }, - Spec: resourcev1beta1.ResourceSliceSpec{ - Driver: DRAGPUDriverName, - Pool: resourcev1beta1.ResourcePool{ - Name: "gpu-pool", - }, - Devices: []resourcev1beta1.Device{ - { - Name: "gpu0", - Basic: &resourcev1beta1.BasicDevice{ - Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ - "type": {StringValue: stringPtr("gpu")}, - "uuid": {StringValue: stringPtr("GPU-UUID-V1BETA1")}, - }, - }, - }, - }, - }, - } - v1beta1Store.Add(v1beta1Slice) - - m := &DRAResourceSliceManager{ - v1Informer: &testInformer{store: v1Store}, - v1beta1Informer: &testInformer{store: v1beta1Store}, - preferredAPIVersion: "invalid", - } - - uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") - assert.Empty(t, uuid, "expected no UUID when preferred version is invalid") - assert.Nil(t, migInfo, "expected no MIG info when preferred version is invalid") - } \ No newline at end of file +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + resourcev1 "k8s.io/api/resource/v1" + resourcev1beta1 "k8s.io/api/resource/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" +) + +// newDRAIndexer creates an Indexer with a poolName index matching the production +// informer configuration so tests can exercise GetDeviceInfo without relying on +// informer.AddIndexers. +func newDRAIndexer() cache.Indexer { + return cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + switch rs := obj.(type) { + case *resourcev1.ResourceSlice: + return []string{rs.Spec.Pool.Name}, nil + case *resourcev1beta1.ResourceSlice: + return []string{rs.Spec.Pool.Name}, nil + default: + return nil, nil + } + }, + }) +} + +func TestGetDeviceInfo_GPUDevice(t *testing.T) { + // Create a store with a ResourceSlice containing a GPU device + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + require.NotEmpty(t, uuid, "expected UUID to be found") + assert.Equal(t, "GPU-UUID-0", uuid) + assert.Nil(t, migInfo, "expected no MIG info for GPU device") +} + +func TestGetDeviceInfo_MIGDevice(t *testing.T) { + // Create a store with a ResourceSlice containing a MIG device + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "mig0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: stringPtr("MIG-UUID-0")}, + "profile": {StringValue: stringPtr("1g.10gb")}, + "parentUUID": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + parentUUID, migInfo := m.GetDeviceInfo("gpu-pool", "mig0") + require.NotEmpty(t, parentUUID, "expected parent UUID to be found") + assert.Equal(t, "GPU-UUID-0", parentUUID) + require.NotNil(t, migInfo, "expected MIG info to be present") + assert.Equal(t, "MIG-UUID-0", migInfo.MIGDeviceUUID) + assert.Equal(t, "1g.10gb", migInfo.Profile) + assert.Equal(t, "GPU-UUID-0", migInfo.ParentUUID) +} + +func TestGetDeviceInfo_NotFound(t *testing.T) { + // Create an empty store + store := newDRAIndexer() + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID for non-existent device") + assert.Nil(t, migInfo, "expected no MIG info for non-existent device") +} + +func TestGetDeviceInfo_WrongPool(t *testing.T) { + // Create a store with a ResourceSlice in a different pool + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "other-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when pool doesn't match") + assert.Nil(t, migInfo, "expected no MIG info when pool doesn't match") +} + +func stringPtr(s string) *string { + return &s +} + +// TestGetDeviceInfo_EmptyInformerStore_ReturnsEmpty verifies an empty informer store yields no mapping. +func TestGetDeviceInfo_EmptyInformerStore_ReturnsEmpty(t *testing.T) { + v1Store := newDRAIndexer() + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: v1Store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when informer store has no matching slices") + assert.Nil(t, migInfo, "expected no MIG info for GPU device") +} + +// TestGetDeviceInfo_V1SliceInStore resolves UUID from v1 ResourceSlice objects in the informer. +func TestGetDeviceInfo_V1SliceInStore(t *testing.T) { + v1Store := newDRAIndexer() + v1Slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-V1")}, + }, + }, + }, + }, + } + v1Store.Add(v1Slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: v1Store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + require.NotEmpty(t, uuid, "expected UUID to be found from v1") + assert.Equal(t, "GPU-UUID-V1", uuid) + assert.Nil(t, migInfo, "expected no MIG info for GPU device") +} + +func TestGetDeviceInfo_NilInformer_ReturnsEmpty(t *testing.T) { + m := &DRAResourceSliceManager{} + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when informer is nil") + assert.Nil(t, migInfo, "expected no MIG info when informer is nil") +} diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index 99de0ed7..a25ff775 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -727,8 +727,8 @@ func TestPodDRAInfo(t *testing.T) { // Create test informer backed by the indexer. testInformer := &testInformerForDRA{store: store} draMgr := &DRAResourceSliceManager{ - v1Informer: testInformer, - preferredAPIVersion: "v1", + informer: testInformer, + sliceAPIVersion: "v1", } pm := &PodMapper{ @@ -890,8 +890,8 @@ func TestPodDRAInfo_V1beta1Preferred(t *testing.T) { // Create test informer backed by the indexer. testInformer := &testInformerForDRA{store: store} draMgr := &DRAResourceSliceManager{ - v1beta1Informer: testInformer, - preferredAPIVersion: "v1beta1", + informer: testInformer, + sliceAPIVersion: "v1beta1", } pm := &PodMapper{ Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, @@ -992,8 +992,8 @@ func TestPodDRAInfo_MultipleClaimResources(t *testing.T) { testInformer := &testInformerForDRA{store: store} draMgr := &DRAResourceSliceManager{ - v1Informer: testInformer, - preferredAPIVersion: "v1", + informer: testInformer, + sliceAPIVersion: "v1", } pm := &PodMapper{ diff --git a/internal/pkg/transformation/types.go b/internal/pkg/transformation/types.go index 581c08e3..4cd34e1d 100644 --- a/internal/pkg/transformation/types.go +++ b/internal/pkg/transformation/types.go @@ -18,7 +18,6 @@ package transformation import ( "container/list" - "context" "regexp" "sync" @@ -77,15 +76,10 @@ type PodInfo struct { } type DRAResourceSliceManager struct { - factory informers.SharedInformerFactory - v1Informer cache.SharedIndexInformer - v1beta1Informer cache.SharedIndexInformer - // preferredAPIVersion is selected during initialization: - // - "v1" if v1 has NVIDIA DRA ResourceSlices - // - "v1beta1" if v1 does not, but v1beta1 does - preferredAPIVersion string - cancelContext context.CancelFunc - mu sync.RWMutex + factory informers.SharedInformerFactory + informer cache.SharedIndexInformer + // sliceAPIVersion is "v1" or "v1beta1", matching the started ResourceSlice informer. + sliceAPIVersion string } // PodMetadata holds pod metadata from API server