diff --git a/go.mod b/go.mod index cbab2f11..4dc06e8f 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,8 @@ require ( github.com/NVIDIA/go-nvml v0.12.4-1 github.com/avast/retry-go/v4 v4.6.0 github.com/bits-and-blooms/bitset v1.22.0 - github.com/fsnotify/fsnotify v1.7.0 + github.com/containerd/cgroups/v3 v3.1.1 + github.com/fsnotify/fsnotify v1.9.0 github.com/google/uuid v1.6.0 github.com/gorilla/mux v1.8.1 github.com/mittwald/go-helm-client v0.12.16 @@ -25,13 +26,13 @@ require ( go.uber.org/goleak v1.3.0 go.uber.org/mock v0.5.0 golang.org/x/sync v0.16.0 - google.golang.org/grpc v1.71.1 + google.golang.org/grpc v1.72.1 helm.sh/helm/v3 v3.18.5 - k8s.io/api v0.33.3 - k8s.io/apimachinery v0.33.3 - k8s.io/client-go v0.33.3 - k8s.io/kubelet v0.32.3 - k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e + k8s.io/api v0.34.1 + k8s.io/apimachinery v0.34.1 + k8s.io/client-go v0.34.1 + k8s.io/kubelet v0.34.1 + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 ) require ( @@ -49,7 +50,6 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.3 // indirect - github.com/containerd/cgroups/v3 v3.1.1 // indirect github.com/containerd/containerd v1.7.27 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -63,7 +63,7 @@ require ( github.com/evanphx/json-patch v5.9.11+incompatible // indirect github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect github.com/fatih/color v1.18.0 // indirect - github.com/fxamacker/cbor/v2 v2.8.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.5.1 // indirect github.com/go-gorp/gorp/v3 v3.1.0 // indirect github.com/go-logr/logr v1.4.2 // indirect @@ -74,7 +74,7 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect @@ -107,7 +107,7 @@ require ( github.com/moby/sys/userns v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect @@ -131,10 +131,9 @@ require ( github.com/xlab/treeprint v1.2.0 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect - go.yaml.in/yaml/v3 v3.0.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.40.0 // indirect golang.org/x/net v0.41.0 // indirect golang.org/x/oauth2 v0.28.0 // indirect @@ -150,17 +149,17 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.33.3 // indirect - k8s.io/apiserver v0.33.3 // indirect + k8s.io/apiserver v0.34.1 // indirect k8s.io/cli-runtime v0.33.3 // indirect - k8s.io/component-base v0.33.3 // indirect + k8s.io/component-base v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect + k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect k8s.io/kubectl v0.33.3 // indirect oras.land/oras-go/v2 v2.6.0 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/kustomize/api v0.19.0 // indirect sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect sigs.k8s.io/randfill v1.0.0 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect - sigs.k8s.io/yaml v1.5.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index cd741e4b..42c4d5df 100644 --- a/go.sum +++ b/go.sum @@ -44,7 +44,6 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.3 h1:9liNh8t+u26xl5ddmWLmsOsdNLwkdRTg5AG+JnTiM80= github.com/chai2010/gettext-go v1.0.3/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= -github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= github.com/containerd/cgroups/v3 v3.1.1 h1:ASZmQGfOHbRj43/1aMn5QcWIsv0R/AuHHDNCguRY0p0= github.com/containerd/cgroups/v3 v3.1.1/go.mod h1:PKZ2AcWmSBsY/tJUVhtS/rluX0b1uq1GmPO1ElCmbOw= github.com/containerd/containerd v1.7.27 h1:yFyEyojddO3MIGVER2xJLWoCIn+Up4GaHFquP7hsFII= @@ -95,10 +94,10 @@ github.com/foxcpp/go-mockdns v1.1.0 h1:jI0rD8M0wuYAxL7r/ynTrCQQq0BVqfB99Vgk7Dlme github.com/foxcpp/go-mockdns v1.1.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= -github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk= github.com/go-errors/errors v1.5.1/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-gorp/gorp/v3 v3.1.0 h1:ItKF/Vbuj31dmV4jxA1qblpSwkl9g1typ24xoe70IGs= @@ -126,9 +125,8 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -149,8 +147,8 @@ github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16 github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -221,8 +219,9 @@ github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFL github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= @@ -327,10 +326,10 @@ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 h1:j7Z go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0/go.mod h1:WXbYJTUaZXAbYd8lbgGuvih0yuCfOFC5RJoYnoLcGz8= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0 h1:t/Qur3vKSkUCcDVaSumWF2PKHt85pc7fRvFuoVT8qFU= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0/go.mod h1:Rl61tySSdcOJWoEgYZVtmnKdA0GeKrSqkHC1t+91CH8= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0 h1:cMyu9O88joYEaI47CnQkxO1XZdpoTF9fEnW2duIddhw= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0/go.mod h1:6Am3rn7P9TVVeXYG+wtcGE7IE1tsQ+bP3AuWcKt/gOI= go.opentelemetry.io/otel/exporters/prometheus v0.54.0 h1:rFwzp68QMgtzu9PgP3jm9XaMICI6TsofWWPcBDKwlsU= @@ -353,16 +352,16 @@ go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5J go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= -go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= +go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -412,12 +411,12 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/genproto v0.0.0-20240123012728-ef4313101c80 h1:KAeGQVN3M9nD0/bQXnr/ClcEMJ968gUXJQ9pwfSynuQ= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= +google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= +google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 h1:e0AIkUUhxyBKh6ssZNrAMeqhA7RKUj42346d1y02i2g= google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= -google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= +google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -434,30 +433,30 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= helm.sh/helm/v3 v3.18.5 h1:Cc3Z5vd6kDrZq9wO9KxKLNEickiTho6/H/dBNRVSos4= helm.sh/helm/v3 v3.18.5/go.mod h1:L/dXDR2r539oPlFP1PJqKAC1CUgqHJDLkxKpDGrWnyg= -k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8= -k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= k8s.io/apiextensions-apiserver v0.33.3 h1:qmOcAHN6DjfD0v9kxL5udB27SRP6SG/MTopmge3MwEs= k8s.io/apiextensions-apiserver v0.33.3/go.mod h1:oROuctgo27mUsyp9+Obahos6CWcMISSAPzQ77CAQGz8= -k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA= -k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.33.3 h1:Wv0hGc+QFdMJB4ZSiHrCgN3zL3QRatu56+rpccKC3J4= -k8s.io/apiserver v0.33.3/go.mod h1:05632ifFEe6TxwjdAIrwINHWE2hLwyADFk5mBsQa15E= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apiserver v0.34.1 h1:U3JBGdgANK3dfFcyknWde1G6X1F4bg7PXuvlqt8lITA= +k8s.io/apiserver v0.34.1/go.mod h1:eOOc9nrVqlBI1AFCvVzsob0OxtPZUCPiUJL45JOTBG0= k8s.io/cli-runtime v0.33.3 h1:Dgy4vPjNIu8LMJBSvs8W0LcdV0PX/8aGG1DA1W8lklA= k8s.io/cli-runtime v0.33.3/go.mod h1:yklhLklD4vLS8HNGgC9wGiuHWze4g7x6XQZ+8edsKEo= -k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA= -k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg= -k8s.io/component-base v0.33.3 h1:mlAuyJqyPlKZM7FyaoM/LcunZaaY353RXiOd2+B5tGA= -k8s.io/component-base v0.33.3/go.mod h1:ktBVsBzkI3imDuxYXmVxZ2zxJnYTZ4HAsVj9iF09qp4= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= +k8s.io/component-base v0.34.1 h1:v7xFgG+ONhytZNFpIz5/kecwD+sUhVE6HU7qQUiRM4A= +k8s.io/component-base v0.34.1/go.mod h1:mknCpLlTSKHzAQJJnnHVKqjxR7gBeHRv0rPXA7gdtQ0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= -k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= k8s.io/kubectl v0.33.3 h1:r/phHvH1iU7gO/l7tTjQk2K01ER7/OAJi8uFHHyWSac= k8s.io/kubectl v0.33.3/go.mod h1:euj2bG56L6kUGOE/ckZbCoudPwuj4Kud7BR0GzyNiT0= -k8s.io/kubelet v0.32.3 h1:B9HzW4yB67flx8tN2FYuDwZvxnmK3v5EjxxFvOYjmc8= -k8s.io/kubelet v0.32.3/go.mod h1:yyAQSCKC+tjSlaFw4HQG7Jein+vo+GeKBGdXdQGvL1U= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e h1:KqK5c/ghOm8xkHYhlodbp6i6+r+ChV2vuAuVRdFbLro= -k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kubelet v0.34.1 h1:doAaTA9/Yfzbdq/u/LveZeONp96CwX9giW6b+oHn4m4= +k8s.io/kubelet v0.34.1/go.mod h1:PtV3Ese8iOM19gSooFoQT9iyRisbmJdAPuDImuccbbA= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= @@ -466,11 +465,9 @@ sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ sigs.k8s.io/kustomize/api v0.19.0/go.mod h1:/BbwnivGVcBh1r+8m3tH1VNxJmHSk1PzP5fkP6lbL1o= sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= -sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= -sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/pkg/testutils/test_utils.go b/internal/pkg/testutils/test_utils.go index ed738aca..c51a0620 100644 --- a/internal/pkg/testutils/test_utils.go +++ b/internal/pkg/testutils/test_utils.go @@ -195,6 +195,7 @@ func CreateTmpDir(t *testing.T) (string, func()) { } type MockPodResourcesServer struct { + v1.UnimplementedPodResourcesListerServer resourceName string gpus []string } diff --git a/internal/pkg/transformation/dra.go b/internal/pkg/transformation/dra.go index 6d4755ef..9b20c2cd 100644 --- a/internal/pkg/transformation/dra.go +++ b/internal/pkg/transformation/dra.go @@ -22,9 +22,14 @@ import ( "log/slog" "time" + resourcev1 "k8s.io/api/resource/v1" resourcev1beta1 "k8s.io/api/resource/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1" "github.com/NVIDIA/dcgm-exporter/internal/pkg/kubeclient" ) @@ -33,146 +38,429 @@ const ( informerResyncPeriod = 10 * time.Minute ) +// resourceSliceAdapter provides a unified interface for accessing ResourceSlice data +// from both v1 and v1beta1 API versions +type resourceSliceAdapter interface { + // GetDevices returns a list of device adapters + GetDevices() []deviceAdapter +} + +// deviceAdapter provides a unified interface for accessing device data +// from both v1 and v1beta1 API versions +type deviceAdapter interface { + // GetName returns the device name + GetName() string + // GetAttribute returns the string value of an attribute by key, or empty string if not found + GetAttribute(key string) string + // HasAttributes returns true if the device has attributes + HasAttributes() bool +} + +// v1ResourceSliceAdapter adapts resourcev1.ResourceSlice to resourceSliceAdapter +type v1ResourceSliceAdapter struct { + slice *resourcev1.ResourceSlice +} + +func (a *v1ResourceSliceAdapter) GetDevices() []deviceAdapter { + devices := make([]deviceAdapter, len(a.slice.Spec.Devices)) + for i := range a.slice.Spec.Devices { + devices[i] = &v1DeviceAdapter{device: &a.slice.Spec.Devices[i]} + } + return devices +} + +// v1DeviceAdapter adapts resourcev1.Device to deviceAdapter +type v1DeviceAdapter struct { + device *resourcev1.Device +} + +func (a *v1DeviceAdapter) GetName() string { + return a.device.Name +} + +func (a *v1DeviceAdapter) HasAttributes() bool { + return a.device.Attributes != nil +} + +func (a *v1DeviceAdapter) GetAttribute(key string) string { + if a.device.Attributes == nil { + return "" + } + attrKey := resourcev1.QualifiedName(key) + if attr, ok := a.device.Attributes[attrKey]; ok && attr.StringValue != nil { + return *attr.StringValue + } + return "" +} + +// v1beta1ResourceSliceAdapter adapts resourcev1beta1.ResourceSlice to resourceSliceAdapter +type v1beta1ResourceSliceAdapter struct { + slice *resourcev1beta1.ResourceSlice +} + +func (a *v1beta1ResourceSliceAdapter) GetDevices() []deviceAdapter { + devices := make([]deviceAdapter, len(a.slice.Spec.Devices)) + for i := range a.slice.Spec.Devices { + devices[i] = &v1beta1DeviceAdapter{device: &a.slice.Spec.Devices[i]} + } + return devices +} + +// v1beta1DeviceAdapter adapts resourcev1beta1.Device to deviceAdapter +type v1beta1DeviceAdapter struct { + device *resourcev1beta1.Device +} + +func (a *v1beta1DeviceAdapter) GetName() string { + return a.device.Name +} + +func (a *v1beta1DeviceAdapter) HasAttributes() bool { + return a.device.Basic != nil && a.device.Basic.Attributes != nil +} + +func (a *v1beta1DeviceAdapter) GetAttribute(key string) string { + if a.device.Basic == nil || a.device.Basic.Attributes == nil { + return "" + } + attrKey := resourcev1beta1.QualifiedName(key) + if attr, ok := a.device.Basic.Attributes[attrKey]; ok && attr.StringValue != nil { + return *attr.StringValue + } + return "" +} + +func supportsResourceSliceGV(client kubernetes.Interface, groupVersion string) bool { + resources, err := client.Discovery().ServerResourcesForGroupVersion(groupVersion) + if err != nil { + // Discovery returns errors when the group/version isn't served. + slog.Debug("Discovery failed for groupVersion", "groupVersion", groupVersion, "error", err) + return false + } + + for _, r := range resources.APIResources { + // Match the primary resource only (not subresources like "resourceslices/status"). + if r.Name == "resourceslices" { + return true + } + } + return false +} + +// hasNvidiaDRASlices reports whether the cluster currently exposes any +// NVIDIA GPU DRA ResourceSlices on the given API version. +func hasNvidiaDRASlices(ctx context.Context, client kubernetes.Interface, apiVersion string) (bool, error) { + switch apiVersion { + case "v1": + list, err := client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, fmt.Errorf("listing v1 ResourceSlices: %w", err) + } + for i := range list.Items { + s := &list.Items[i] + if s.Spec.Driver == DRAGPUDriverName && len(s.Spec.Devices) > 0 { + return true, nil + } + } + case "v1beta1": + list, err := client.ResourceV1beta1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, fmt.Errorf("listing v1beta1 ResourceSlices: %w", err) + } + for i := range list.Items { + s := &list.Items[i] + if s.Spec.Driver == DRAGPUDriverName && len(s.Spec.Devices) > 0 { + return true, nil + } + } + default: + return false, fmt.Errorf("unsupported ResourceSlice API version: %q", apiVersion) + } + return false, nil +} + +// NewDRAResourceSliceManager creates a new DRA ResourceSlice manager. +// The API version is auto-detected by checking which version has NVIDIA DRA ResourceSlices. func NewDRAResourceSliceManager() (*DRAResourceSliceManager, error) { client, err := kubeclient.GetKubeClient() if err != nil { return nil, fmt.Errorf("error getting kube client: %w", err) } + // Decide which API version to use. + // Prefer v1 only when it actually has NVIDIA DRA ResourceSlices; otherwise fall back + // to v1beta1 when that version has NVIDIA DRA ResourceSlices. + const ( + resourceGVV1 = "resource.k8s.io/v1" + resourceGVV1beta1 = "resource.k8s.io/v1beta1" + ) + + v1Served := supportsResourceSliceGV(client, resourceGVV1) + v1beta1Served := supportsResourceSliceGV(client, resourceGVV1beta1) + if !v1Served && !v1beta1Served { + slog.Warn("Neither resource.k8s.io/v1 nor v1beta1 ResourceSlice API is served; DRA labels will not be available") + return nil, nil + } + + // Determine which served API version actually has NVIDIA DRA ResourceSlices. + ctx := context.Background() + v1HasNvidiaSlices := false + if v1Served { + has, err := hasNvidiaDRASlices(ctx, client, "v1") + if err != nil { + return nil, err + } + v1HasNvidiaSlices = has + } + + v1beta1HasNvidiaSlices := false + if v1beta1Served { + has, err := hasNvidiaDRASlices(ctx, client, "v1beta1") + if err != nil { + return nil, err + } + v1beta1HasNvidiaSlices = has + } + + var selected string + switch { + case v1HasNvidiaSlices: + selected = "v1" + case v1beta1HasNvidiaSlices: + selected = "v1beta1" + default: + slog.Warn("No NVIDIA DRA ResourceSlices found; DRA labels will not be available") + return nil, nil + } factory := informers.NewSharedInformerFactory(client, informerResyncPeriod) - informer := factory.Resource().V1beta1().ResourceSlices().Informer() + + var informer cache.SharedIndexInformer + switch selected { + case "v1": + informer = factory.Resource().V1().ResourceSlices().Informer() + err = informer.AddIndexers(cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + rs, ok := obj.(*resourcev1.ResourceSlice) + if !ok { + return nil, nil + } + return []string{rs.Spec.Pool.Name}, nil + }, + }) + if err != nil { + return nil, fmt.Errorf("error adding pool indexer to v1 ResourceSlice informer: %w", err) + } + case "v1beta1": + informer = factory.Resource().V1beta1().ResourceSlices().Informer() + err = informer.AddIndexers(cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + rs, ok := obj.(*resourcev1beta1.ResourceSlice) + if !ok { + return nil, nil + } + return []string{rs.Spec.Pool.Name}, nil + }, + }) + if err != nil { + return nil, fmt.Errorf("error adding pool indexer to v1beta1 ResourceSlice informer: %w", err) + } + default: + return nil, fmt.Errorf("unsupported API version selection: %s", selected) + } m := &DRAResourceSliceManager{ - factory: factory, - informer: informer, - deviceToUUID: make(map[string]string), - migDevices: make(map[string]*DRAMigDeviceInfo), - } - - _, err = informer.AddEventHandler(&cache.FilteringResourceEventHandler{ - FilterFunc: func(obj interface{}) bool { - s := obj.(*resourcev1beta1.ResourceSlice) - return s.Spec.Driver == DRAGPUDriverName - }, - Handler: cache.ResourceEventHandlerFuncs{ - AddFunc: m.onAddOrUpdate, - UpdateFunc: func(_, o interface{}) { m.onAddOrUpdate(o) }, - DeleteFunc: m.onDelete, - }, - }) - if err != nil { - return nil, fmt.Errorf("error adding event handler: %w", err) + factory: factory, + informer: informer, + sliceAPIVersion: selected, } - ctx, cancel := context.WithCancel(context.Background()) - m.cancelContext = cancel - factory.Start(ctx.Done()) + factory.Start(wait.NeverStop) - if !cache.WaitForCacheSync(ctx.Done(), informer.HasSynced) { - cancel() + // Wait for cache sync on the selected informer. + synced := cache.WaitForCacheSync(wait.NeverStop, informer.HasSynced) + if !synced { + factory.Shutdown() return nil, fmt.Errorf("ResourceSlice informer cache sync failed") } + + slog.Info("ResourceSlice API informer synced successfully", "apiVersion", selected) return m, nil } func (m *DRAResourceSliceManager) Stop() { - if m.cancelContext != nil { - m.cancelContext() - } - // Ensure factory informers are fully stopped if m.factory != nil { m.factory.Shutdown() } } -// GetDeviceInfo returns the mapping UUID and MIG device info if applicable -// For MIG devices: returns (parentUUID, *DRAMigDeviceInfo) -// For full GPUs: returns (deviceUUID, nil) -func (m *DRAResourceSliceManager) GetDeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { - key := pool + "/" + device - m.mu.RLock() - defer m.mu.RUnlock() +func (m *DRAResourceSliceManager) getV1DeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { + if m.informer == nil { + return "", nil + } - // Check if this is a MIG device - if migInfo, exists := m.migDevices[key]; exists { - // MIG device - return parent UUID and MIG info - slog.Debug(fmt.Sprintf("Found MIG device for %s with parent UUID: %s", key, migInfo.ParentUUID)) - return migInfo.ParentUUID, migInfo + items, err := m.informer.GetIndexer().ByIndex("poolName", pool) + if err != nil { + slog.Error(fmt.Sprintf("Error listing v1 ResourceSlices by pool index for pool %s: %v", pool, err)) + return "", nil } - // Full GPU device - return device UUID with no MIG info - if uuid, exists := m.deviceToUUID[key]; exists { - slog.Debug(fmt.Sprintf("Found GPU device for %s with UUID: %s", uuid, key)) - return uuid, nil + for _, item := range items { + rs, ok := item.(*resourcev1.ResourceSlice) + if !ok { + continue + } + if rs.Spec.Driver != DRAGPUDriverName { + continue + } + adapter := &v1ResourceSliceAdapter{slice: rs} + if mappingKey, migInfo := lookupDRADeviceInAdapter(pool, device, adapter); mappingKey != "" { + return mappingKey, migInfo + } } - slog.Info(fmt.Sprintf("No UUID found for %s", key)) + slog.Debug(fmt.Sprintf("No UUID found for pool %s, device %s", pool, device)) return "", nil } -func getAttrString(attrs map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute, key resourcev1beta1.QualifiedName) string { - if attr, ok := attrs[key]; ok && attr.StringValue != nil { - return *attr.StringValue +func (m *DRAResourceSliceManager) getV1beta1DeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { + if m.informer == nil { + return "", nil } - return "" -} -func (m *DRAResourceSliceManager) onAddOrUpdate(obj interface{}) { - slice := obj.(*resourcev1beta1.ResourceSlice) - pool := slice.Spec.Pool.Name + items, err := m.informer.GetIndexer().ByIndex("poolName", pool) + if err != nil { + slog.Error(fmt.Sprintf("Error listing v1beta1 ResourceSlices by pool index for pool %s: %v", pool, err)) + return "", nil + } + + for _, item := range items { + rs, ok := item.(*resourcev1beta1.ResourceSlice) + if !ok { + continue + } + if rs.Spec.Driver != DRAGPUDriverName { + continue + } + adapter := &v1beta1ResourceSliceAdapter{slice: rs} + if mappingKey, migInfo := lookupDRADeviceInAdapter(pool, device, adapter); mappingKey != "" { + return mappingKey, migInfo + } + } - m.mu.Lock() - defer m.mu.Unlock() + slog.Debug(fmt.Sprintf("No UUID found for pool %s, device %s", pool, device)) + return "", nil +} - for _, dev := range slice.Spec.Devices { - if dev.Basic == nil || dev.Basic.Attributes == nil { +// lookupDRADeviceInAdapter applies NVIDIA GPU DRA driver device attributes ("type", +// "uuid", "parentUUID", "profile"). Other drivers with different schemas may not work. +func lookupDRADeviceInAdapter(pool, device string, adapter resourceSliceAdapter) (string, *DRAMigDeviceInfo) { + for _, dev := range adapter.GetDevices() { + if !dev.HasAttributes() { + continue + } + if dev.GetName() != device { continue } - key := pool + "/" + dev.Name - attr := dev.Basic.Attributes - deviceType := getAttrString(attr, "type") + deviceType := dev.GetAttribute("type") switch deviceType { - case "gpu": - if uuid := getAttrString(attr, "uuid"); uuid != "" { - m.deviceToUUID[key] = uuid - slog.Debug(fmt.Sprintf("Added gpu device [key:%s] with UUID: %s", key, uuid)) - } - case "mig": - parentUUID := getAttrString(attr, "parentUUID") - profile := getAttrString(attr, "profile") - migUUID := getAttrString(attr, "uuid") - - // Only create MIG device if we have required parent UUID + parentUUID := dev.GetAttribute("parentUUID") + profile := dev.GetAttribute("profile") + migUUID := dev.GetAttribute("uuid") if parentUUID != "" { - m.migDevices[key] = &DRAMigDeviceInfo{ + migInfo := &DRAMigDeviceInfo{ MIGDeviceUUID: migUUID, Profile: profile, ParentUUID: parentUUID, } - slog.Debug(fmt.Sprintf("Added MIG device %s (profile: %s) with parent: %s", migUUID, profile, parentUUID)) - } else { - slog.Debug(fmt.Sprintf("MIG device %s missing parent UUID", migUUID)) + slog.Debug(fmt.Sprintf("Found MIG device %s/%s with parent UUID: %s", pool, device, parentUUID)) + return parentUUID, migInfo + } + case "gpu": + uuid := dev.GetAttribute("uuid") + if uuid != "" { + slog.Debug(fmt.Sprintf("Found GPU device %s/%s with UUID: %s", pool, device, uuid)) + return uuid, nil } - default: - slog.Warn(fmt.Sprintf("Device [key:%s] has unknown type: %s", key, deviceType)) + slog.Warn(fmt.Sprintf("Device [%s/%s] has unknown type: %s", pool, device, deviceType)) + } + } + return "", nil +} + +// GetDeviceInfo returns the mapping UUID and MIG device info if applicable +// by querying the informer cache directly. This avoids maintaining redundant +// local caches and ensures we always have the latest state from the API server. +// For MIG devices: returns (parentUUID, *DRAMigDeviceInfo) +// For full GPUs: returns (deviceUUID, nil) +func (m *DRAResourceSliceManager) GetDeviceInfo(pool, device string) (string, *DRAMigDeviceInfo) { + if m.informer == nil { + return "", nil + } + + switch m.sliceAPIVersion { + case "v1": + return m.getV1DeviceInfo(pool, device) + case "v1beta1": + return m.getV1beta1DeviceInfo(pool, device) + default: + if m.sliceAPIVersion != "" { + slog.Error("Unsupported ResourceSlice API version", "apiVersion", m.sliceAPIVersion) } + return "", nil } } -func (m *DRAResourceSliceManager) onDelete(obj interface{}) { - slice := obj.(*resourcev1beta1.ResourceSlice) - pool := slice.Spec.Pool.Name +type DynamicResourceMapping struct { + MappingKey string + Info *DynamicResourceInfo +} + +// GetDynamicResourceMappings converts a DynamicResource into one or more +// DynamicResourceInfo entries and resolves the backing GPU or MIG device UUIDs +// using the ResourceSlice informer. +// +// A single DynamicResource can contain multiple ClaimResources (devices). This +// method returns a mapping entry for each matching NVIDIA GPU DRA claim. +func (m *DRAResourceSliceManager) GetDynamicResourceMappings(resource *podresourcesapi.DynamicResource) []DynamicResourceMapping { + if resource == nil { + return nil + } + + mappings := make([]DynamicResourceMapping, 0, len(resource.GetClaimResources())) + for _, claimResource := range resource.GetClaimResources() { + draDriverName := claimResource.GetDriverName() + if draDriverName != DRAGPUDriverName { + continue + } + + draPoolName := claimResource.GetPoolName() + draDeviceName := claimResource.GetDeviceName() + + mappingKey, migInfo := m.GetDeviceInfo(draPoolName, draDeviceName) + if mappingKey == "" { + slog.Debug(fmt.Sprintf("No UUID for %s/%s", draPoolName, draDeviceName)) + continue + } - m.mu.Lock() - defer m.mu.Unlock() + drInfo := &DynamicResourceInfo{ + ClaimName: resource.GetClaimName(), + ClaimNamespace: resource.GetClaimNamespace(), + DriverName: draDriverName, + PoolName: draPoolName, + DeviceName: draDeviceName, + } + if migInfo != nil { + drInfo.MIGInfo = migInfo + } - for _, dev := range slice.Spec.Devices { - key := pool + "/" + dev.Name - slog.Debug(fmt.Sprintf("Removing device for %s", key)) - delete(m.deviceToUUID, key) - delete(m.migDevices, key) + mappings = append(mappings, DynamicResourceMapping{ + MappingKey: mappingKey, + Info: drInfo, + }) } + + return mappings } diff --git a/internal/pkg/transformation/dra_test.go b/internal/pkg/transformation/dra_test.go new file mode 100644 index 00000000..8ee60f0f --- /dev/null +++ b/internal/pkg/transformation/dra_test.go @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + resourcev1 "k8s.io/api/resource/v1" + resourcev1beta1 "k8s.io/api/resource/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" +) + +// newDRAIndexer creates an Indexer with a poolName index matching the production +// informer configuration so tests can exercise GetDeviceInfo without relying on +// informer.AddIndexers. +func newDRAIndexer() cache.Indexer { + return cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ + "poolName": func(obj interface{}) ([]string, error) { + switch rs := obj.(type) { + case *resourcev1.ResourceSlice: + return []string{rs.Spec.Pool.Name}, nil + case *resourcev1beta1.ResourceSlice: + return []string{rs.Spec.Pool.Name}, nil + default: + return nil, nil + } + }, + }) +} + +func TestGetDeviceInfo_GPUDevice(t *testing.T) { + // Create a store with a ResourceSlice containing a GPU device + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + require.NotEmpty(t, uuid, "expected UUID to be found") + assert.Equal(t, "GPU-UUID-0", uuid) + assert.Nil(t, migInfo, "expected no MIG info for GPU device") +} + +func TestGetDeviceInfo_MIGDevice(t *testing.T) { + // Create a store with a ResourceSlice containing a MIG device + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "mig0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: stringPtr("MIG-UUID-0")}, + "profile": {StringValue: stringPtr("1g.10gb")}, + "parentUUID": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + parentUUID, migInfo := m.GetDeviceInfo("gpu-pool", "mig0") + require.NotEmpty(t, parentUUID, "expected parent UUID to be found") + assert.Equal(t, "GPU-UUID-0", parentUUID) + require.NotNil(t, migInfo, "expected MIG info to be present") + assert.Equal(t, "MIG-UUID-0", migInfo.MIGDeviceUUID) + assert.Equal(t, "1g.10gb", migInfo.Profile) + assert.Equal(t, "GPU-UUID-0", migInfo.ParentUUID) +} + +func TestGetDeviceInfo_NotFound(t *testing.T) { + // Create an empty store + store := newDRAIndexer() + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID for non-existent device") + assert.Nil(t, migInfo, "expected no MIG info for non-existent device") +} + +func TestGetDeviceInfo_WrongPool(t *testing.T) { + // Create a store with a ResourceSlice in a different pool + store := newDRAIndexer() + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "other-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-0")}, + }, + }, + }, + }, + } + store.Add(slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when pool doesn't match") + assert.Nil(t, migInfo, "expected no MIG info when pool doesn't match") +} + +func stringPtr(s string) *string { + return &s +} + +// TestGetDeviceInfo_EmptyInformerStore_ReturnsEmpty verifies an empty informer store yields no mapping. +func TestGetDeviceInfo_EmptyInformerStore_ReturnsEmpty(t *testing.T) { + v1Store := newDRAIndexer() + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: v1Store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when informer store has no matching slices") + assert.Nil(t, migInfo, "expected no MIG info for GPU device") +} + +// TestGetDeviceInfo_V1SliceInStore resolves UUID from v1 ResourceSlice objects in the informer. +func TestGetDeviceInfo_V1SliceInStore(t *testing.T) { + v1Store := newDRAIndexer() + v1Slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "v1-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "gpu-pool", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu0", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-V1")}, + }, + }, + }, + }, + } + v1Store.Add(v1Slice) + + m := &DRAResourceSliceManager{ + informer: &testInformerForDRA{store: v1Store}, + sliceAPIVersion: "v1", + } + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + require.NotEmpty(t, uuid, "expected UUID to be found from v1") + assert.Equal(t, "GPU-UUID-V1", uuid) + assert.Nil(t, migInfo, "expected no MIG info for GPU device") +} + +func TestGetDeviceInfo_NilInformer_ReturnsEmpty(t *testing.T) { + m := &DRAResourceSliceManager{} + + uuid, migInfo := m.GetDeviceInfo("gpu-pool", "gpu0") + assert.Empty(t, uuid, "expected no UUID when informer is nil") + assert.Nil(t, migInfo, "expected no MIG info when informer is nil") +} diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 5f9f6324..7e15cec1 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -145,8 +145,12 @@ func NewPodMapper(c *appconfig.Config) *PodMapper { slog.Warn("Failed to get DRAResourceSliceManager, DRA pod labels will not be available", "error", err) return podMapper } + if resourceSliceManager == nil { + slog.Info("DRAResourceSliceManager not started (no NVIDIA DRA ResourceSlices found)") + return podMapper + } podMapper.ResourceSliceManager = resourceSliceManager - slog.Info("Started DRAResourceSliceManager") + slog.Info("Started DRAResourceSliceManager with auto-detected API version") } return podMapper } @@ -589,17 +593,10 @@ func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourc "containerName", cntName) if dynamicResources := container.GetDynamicResources(); len(dynamicResources) > 0 && p.ResourceSliceManager != nil { for _, dr := range dynamicResources { - for _, claimResource := range dr.GetClaimResources() { - draDriverName := claimResource.GetDriverName() - if draDriverName != DRAGPUDriverName { - continue - } - draPoolName := claimResource.GetPoolName() - draDeviceName := claimResource.GetDeviceName() - - mappingKey, migInfo := p.ResourceSliceManager.GetDeviceInfo(draPoolName, draDeviceName) - if mappingKey == "" { - slog.Debug(fmt.Sprintf("No UUID for %s/%s", draPoolName, draDeviceName)) + for _, mapping := range p.ResourceSliceManager.GetDynamicResourceMappings(dr) { + mappingKey := mapping.MappingKey + drInfo := mapping.Info + if mappingKey == "" || drInfo == nil { continue } @@ -615,21 +612,12 @@ func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourc if processedPods[mappingKey][podContainerKey] { continue } - podInfo := p.createPodInfo(pod, container) - drInfo := DynamicResourceInfo{ - ClaimName: dr.GetClaimName(), - ClaimNamespace: dr.GetClaimNamespace(), - DriverName: draDriverName, - PoolName: draPoolName, - DeviceName: draDeviceName, - } - if migInfo != nil { - drInfo.MIGInfo = migInfo + if drInfo.MIGInfo != nil { slog.Debug("Added MIG pod mapping", "parentUUID", mappingKey, - "migDevice", migInfo.MIGDeviceUUID, - "migProfile", migInfo.Profile, + "migDevice", drInfo.MIGInfo.MIGDeviceUUID, + "migProfile", drInfo.MIGInfo.Profile, "pod", podContainerKey) } else { slog.Debug("Added GPU pod mapping", @@ -637,7 +625,7 @@ func (p *PodMapper) toDeviceToPodsDRA(devicePods *podresourcesapi.ListPodResourc "pod", podContainerKey) } - podInfo.DynamicResources = &drInfo + podInfo.DynamicResources = drInfo deviceToPodsMap[mappingKey] = append(deviceToPodsMap[mappingKey], podInfo) processedPods[mappingKey][podContainerKey] = true } diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index 09e654a7..a25ff775 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -31,6 +31,8 @@ import ( "go.uber.org/mock/gomock" "google.golang.org/grpc" v1 "k8s.io/api/core/v1" + resourcev1 "k8s.io/api/resource/v1" + resourcev1beta1 "k8s.io/api/resource/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/informers" @@ -675,11 +677,222 @@ func TestPodDRAInfo(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + // Create an indexer with ResourceSlice objects based on test case. + // We use the same poolName index as the production informer. + store := newDRAIndexer() + if len(tc.deviceToUUID) > 0 || len(tc.migDevices) > 0 { + // Create a ResourceSlice with the device from the test case + devices := []resourcev1.Device{} + if uuid, exists := tc.deviceToUUID["poolA/gpu-x"]; exists { + if migInfo, isMIG := tc.migDevices["poolA/gpu-x"]; isMIG { + // MIG device + devices = append(devices, resourcev1.Device{ + Name: "gpu-x", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: &migInfo.MIGDeviceUUID}, + "profile": {StringValue: &migInfo.Profile}, + "parentUUID": {StringValue: &migInfo.ParentUUID}, + }, + }) + } else { + // GPU device + devices = append(devices, resourcev1.Device{ + Name: "gpu-x", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: &uuid}, + }, + }) + } + } + if len(devices) > 0 { + slice := &resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "poolA", + }, + Devices: devices, + }, + } + store.Add(slice) + } + } + + // Create test informer backed by the indexer. + testInformer := &testInformerForDRA{store: store} draMgr := &DRAResourceSliceManager{ - deviceToUUID: tc.deviceToUUID, - migDevices: tc.migDevices, + informer: testInformer, + sliceAPIVersion: "v1", + } + + pm := &PodMapper{ + Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, + ResourceSliceManager: draMgr, + } + + resp := &podresourcesapi.ListPodResourcesResponse{ + PodResources: []*podresourcesapi.PodResources{{ + Name: "pod1", + Namespace: "default", + Containers: []*podresourcesapi.ContainerResources{{ + Name: "ctr1", + DynamicResources: []*podresourcesapi.DynamicResource{dra}, + }}, + }}, + } + + got := pm.toDeviceToPodsDRA(resp) + + assert.Len(t, got, len(tc.wantUUIDs), "map size") + for _, want := range tc.wantUUIDs { + assert.Contains(t, got, want, "expected key %q", want) + } + + if len(tc.wantUUIDs) == 1 { + pi := got[tc.wantUUIDs[0]] + require.Len(t, pi, 1, "should have one pod info") + + dr := *pi[0].DynamicResources + require.NotNil(t, dr, "dynamic resources should not be nil") + + assert.Equal(t, "claim1", dr.ClaimName) + assert.Equal(t, "ns1", dr.ClaimNamespace) + assert.Equal(t, DRAGPUDriverName, dr.DriverName) + assert.Equal(t, "poolA", dr.PoolName) + assert.Equal(t, "gpu-x", dr.DeviceName) + + if tc.isMIG { + require.NotNil(t, dr.MIGInfo, "MIG info should not be nil for MIG device") + assert.Equal(t, "MIG-12345", dr.MIGInfo.MIGDeviceUUID) + assert.Equal(t, "1g.12gb", dr.MIGInfo.Profile) + assert.Equal(t, "GPU-parent-uuid", dr.MIGInfo.ParentUUID) + } else { + assert.Nil(t, dr.MIGInfo, "MIG info should be nil for full GPU device") + } + } + }) + } +} + +func TestPodDRAInfo_V1beta1Preferred(t *testing.T) { + dra := &podresourcesapi.DynamicResource{ + ClaimName: "claim1", + ClaimNamespace: "ns1", + ClaimResources: []*podresourcesapi.ClaimResource{{ + DriverName: DRAGPUDriverName, + PoolName: "poolA", + DeviceName: "gpu-x", + }}, + } + + tests := []struct { + name string + deviceToUUID map[string]string + migDevices map[string]*DRAMigDeviceInfo + wantUUIDs []string + isMIG bool + }{ + { + name: "uuid-exists", + deviceToUUID: map[string]string{"poolA/gpu-x": "GPU-8a748984-0fe7-297f-916c-4b998ce202d1"}, + migDevices: map[string]*DRAMigDeviceInfo{}, + wantUUIDs: []string{"GPU-8a748984-0fe7-297f-916c-4b998ce202d1"}, + isMIG: false, + }, + { + name: "uuid-updated", + deviceToUUID: map[string]string{"poolA/gpu-x": "GPU-UUID-Updated"}, + migDevices: map[string]*DRAMigDeviceInfo{}, + wantUUIDs: []string{"GPU-UUID-Updated"}, + isMIG: false, + }, + { + name: "no-uuid", + deviceToUUID: map[string]string{}, + migDevices: map[string]*DRAMigDeviceInfo{}, + wantUUIDs: nil, + isMIG: false, + }, + { + name: "mig-device", + deviceToUUID: map[string]string{"poolA/gpu-x": "MIG-12345"}, + migDevices: map[string]*DRAMigDeviceInfo{ + "poolA/gpu-x": { + MIGDeviceUUID: "MIG-12345", + Profile: "1g.12gb", + ParentUUID: "GPU-parent-uuid", + }, + }, + wantUUIDs: []string{"GPU-parent-uuid"}, // Should map to parent UUID + isMIG: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // Create an indexer with v1beta1 ResourceSlice objects based on test case. + // We use the same poolName index as the production informer. + store := newDRAIndexer() + if len(tc.deviceToUUID) > 0 || len(tc.migDevices) > 0 { + devices := []resourcev1beta1.Device{} + if uuid, exists := tc.deviceToUUID["poolA/gpu-x"]; exists { + if migInfo, isMIG := tc.migDevices["poolA/gpu-x"]; isMIG { + // MIG device + devices = append(devices, resourcev1beta1.Device{ + Name: "gpu-x", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("mig")}, + "uuid": {StringValue: &migInfo.MIGDeviceUUID}, + "profile": {StringValue: &migInfo.Profile}, + "parentUUID": {StringValue: &migInfo.ParentUUID}, + }, + }, + }) + } else { + // GPU device + devices = append(devices, resourcev1beta1.Device{ + Name: "gpu-x", + Basic: &resourcev1beta1.BasicDevice{ + Attributes: map[resourcev1beta1.QualifiedName]resourcev1beta1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: &uuid}, + }, + }, + }) + } + } + + if len(devices) > 0 { + slice := &resourcev1beta1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1beta1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1beta1.ResourcePool{ + Name: "poolA", + }, + Devices: devices, + }, + } + store.Add(slice) + } } + // Create test informer backed by the indexer. + testInformer := &testInformerForDRA{store: store} + draMgr := &DRAResourceSliceManager{ + informer: testInformer, + sliceAPIVersion: "v1beta1", + } pm := &PodMapper{ Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, ResourceSliceManager: draMgr, @@ -729,6 +942,84 @@ func TestPodDRAInfo(t *testing.T) { } } +func TestPodDRAInfo_MultipleClaimResources(t *testing.T) { + dra := &podresourcesapi.DynamicResource{ + ClaimName: "claim1", + ClaimNamespace: "ns1", + ClaimResources: []*podresourcesapi.ClaimResource{ + { + DriverName: DRAGPUDriverName, + PoolName: "poolA", + DeviceName: "gpu-x", + }, + { + DriverName: DRAGPUDriverName, + PoolName: "poolA", + DeviceName: "gpu-y", + }, + }, + } + + store := newDRAIndexer() + store.Add(&resourcev1.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-slice", + Namespace: "default", + }, + Spec: resourcev1.ResourceSliceSpec{ + Driver: DRAGPUDriverName, + Pool: resourcev1.ResourcePool{ + Name: "poolA", + }, + Devices: []resourcev1.Device{ + { + Name: "gpu-x", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-X")}, + }, + }, + { + Name: "gpu-y", + Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ + "type": {StringValue: stringPtr("gpu")}, + "uuid": {StringValue: stringPtr("GPU-UUID-Y")}, + }, + }, + }, + }, + }) + + testInformer := &testInformerForDRA{store: store} + draMgr := &DRAResourceSliceManager{ + informer: testInformer, + sliceAPIVersion: "v1", + } + + pm := &PodMapper{ + Config: &appconfig.Config{NvidiaResourceNames: []string{appconfig.NvidiaResourceName}}, + ResourceSliceManager: draMgr, + } + + resp := &podresourcesapi.ListPodResourcesResponse{ + PodResources: []*podresourcesapi.PodResources{{ + Name: "pod1", + Namespace: "default", + Containers: []*podresourcesapi.ContainerResources{{ + Name: "ctr1", + DynamicResources: []*podresourcesapi.DynamicResource{dra}, + }}, + }}, + } + + got := pm.toDeviceToPodsDRA(resp) + + assert.Contains(t, got, "GPU-UUID-X") + assert.Contains(t, got, "GPU-UUID-Y") + assert.Len(t, got["GPU-UUID-X"], 1) + assert.Len(t, got["GPU-UUID-Y"], 1) +} + func TestProcessPodMapper_WithUID(t *testing.T) { testutils.RequireLinux(t) diff --git a/internal/pkg/transformation/test_helpers_test.go b/internal/pkg/transformation/test_helpers_test.go new file mode 100644 index 00000000..29296d64 --- /dev/null +++ b/internal/pkg/transformation/test_helpers_test.go @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + package transformation + + import ( + "context" + "time" + + "k8s.io/client-go/tools/cache" + ) + + // testInformerForDRA is a minimal SharedIndexInformer implementation for tests that + // want to inject a pre-populated cache.Indexer (matching the production indexers). + type testInformerForDRA struct { + store cache.Store + } + + func (t *testInformerForDRA) GetStore() cache.Store { return t.store } + + func (t *testInformerForDRA) GetIndexer() cache.Indexer { return t.store.(cache.Indexer) } + + func (t *testInformerForDRA) AddIndexers(indexers cache.Indexers) error { return nil } + + func (t *testInformerForDRA) GetController() cache.Controller { return nil } + + func (t *testInformerForDRA) LastSyncResourceVersion() string { return "" } + + func (t *testInformerForDRA) AddEventHandler(handler cache.ResourceEventHandler) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformerForDRA) AddEventHandlerWithResyncPeriod(handler cache.ResourceEventHandler, resyncPeriod time.Duration) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformerForDRA) AddEventHandlerWithOptions(handler cache.ResourceEventHandler, options cache.HandlerOptions) (cache.ResourceEventHandlerRegistration, error) { + return nil, nil + } + + func (t *testInformerForDRA) RemoveEventHandler(handle cache.ResourceEventHandlerRegistration) error { return nil } + + func (t *testInformerForDRA) IsStopped() bool { return false } + + func (t *testInformerForDRA) SetWatchErrorHandler(handler cache.WatchErrorHandler) error { return nil } + + func (t *testInformerForDRA) SetWatchErrorHandlerWithContext(handler cache.WatchErrorHandlerWithContext) error { + return nil + } + + func (t *testInformerForDRA) SetTransform(handler cache.TransformFunc) error { return nil } + + func (t *testInformerForDRA) HasSynced() bool { return true } + + func (t *testInformerForDRA) Run(stopCh <-chan struct{}) {} + + func (t *testInformerForDRA) RunWithContext(ctx context.Context) {} + \ No newline at end of file diff --git a/internal/pkg/transformation/types.go b/internal/pkg/transformation/types.go index 7d572df6..4cd34e1d 100644 --- a/internal/pkg/transformation/types.go +++ b/internal/pkg/transformation/types.go @@ -18,7 +18,6 @@ package transformation import ( "container/list" - "context" "regexp" "sync" @@ -77,12 +76,10 @@ type PodInfo struct { } type DRAResourceSliceManager struct { - factory informers.SharedInformerFactory - informer cache.SharedIndexInformer - cancelContext context.CancelFunc - mu sync.RWMutex - deviceToUUID map[string]string // pool/device -> UUID (for full GPUs) - migDevices map[string]*DRAMigDeviceInfo // pool/device -> MIG info (for MIG devices) + factory informers.SharedInformerFactory + informer cache.SharedIndexInformer + // sliceAPIVersion is "v1" or "v1beta1", matching the started ResourceSlice informer. + sliceAPIVersion string } // PodMetadata holds pod metadata from API server