Skip to content

Commit 0fa652f

Browse files
committed
backport partial #2406
This is required so that newer version of dcgm-exporter can have correct rbac permissions if needed Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 355f43a commit 0fa652f

5 files changed

Lines changed: 126 additions & 1 deletion

File tree

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRole
3+
metadata:
4+
name: nvidia-dcgm-exporter-read-pods
5+
labels:
6+
app: nvidia-dcgm-exporter
7+
rules:
8+
- apiGroups:
9+
- ""
10+
resources:
11+
- pods
12+
verbs:
13+
- get
14+
- list
15+
- watch
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRoleBinding
3+
metadata:
4+
name: nvidia-dcgm-exporter-read-pods
5+
labels:
6+
app: nvidia-dcgm-exporter
7+
roleRef:
8+
apiGroup: rbac.authorization.k8s.io
9+
kind: ClusterRole
10+
name: nvidia-dcgm-exporter-read-pods
11+
subjects:
12+
- kind: ServiceAccount
13+
name: nvidia-dcgm-exporter
14+
namespace: "FILLED BY THE OPERATOR"

assets/state-dcgm-exporter/0800_daemonset.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ spec:
2424
effect: NoSchedule
2525
priorityClassName: system-node-critical
2626
serviceAccountName: nvidia-dcgm-exporter
27+
automountServiceAccountToken: false
2728
initContainers:
2829
- name: toolkit-validation
2930
image: "FILLED BY THE OPERATOR"

controllers/object_controls.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,33 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
445445
return gpuv1.Ready, nil
446446
}
447447

448+
var rbacGates = map[string]func(*gpuv1.ClusterPolicySpec) bool{
449+
"nvidia-dcgm-exporter-read-pods": func(config *gpuv1.ClusterPolicySpec) bool {
450+
return isDCGMExporterKubernetesPodMetadataEnabled(&config.DCGMExporter)
451+
},
452+
}
453+
454+
func isRBACEnabled(name string, config *gpuv1.ClusterPolicySpec) bool {
455+
gate, ok := rbacGates[name]
456+
if !ok {
457+
return true
458+
}
459+
return gate(config)
460+
}
461+
462+
func isDCGMExporterKubernetesPodMetadataEnabled(config *gpuv1.DCGMExporterSpec) bool {
463+
for _, env := range config.Env {
464+
switch env.Name {
465+
case "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID":
466+
enabled, err := strconv.ParseBool(env.Value)
467+
if err == nil && enabled {
468+
return true
469+
}
470+
}
471+
}
472+
return false
473+
}
474+
448475
// ClusterRole creates ClusterRole resource
449476
func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
450477
ctx := n.ctx
@@ -464,6 +491,15 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
464491
return gpuv1.Disabled, nil
465492
}
466493

494+
if !isRBACEnabled(obj.Name, &n.singleton.Spec) {
495+
err := n.client.Delete(ctx, obj)
496+
if err != nil && !apierrors.IsNotFound(err) {
497+
logger.Info("Couldn't delete", "Error", err)
498+
return gpuv1.NotReady, err
499+
}
500+
return gpuv1.Disabled, nil
501+
}
502+
467503
if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
468504
return gpuv1.NotReady, err
469505
}
@@ -505,6 +541,15 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
505541
return gpuv1.Disabled, nil
506542
}
507543

544+
if !isRBACEnabled(obj.Name, &n.singleton.Spec) {
545+
err := n.client.Delete(ctx, obj)
546+
if err != nil && !apierrors.IsNotFound(err) {
547+
logger.Info("Couldn't delete", "Error", err)
548+
return gpuv1.NotReady, err
549+
}
550+
return gpuv1.Disabled, nil
551+
}
552+
508553
for idx := range obj.Subjects {
509554
obj.Subjects[idx].Namespace = n.operatorNamespace
510555
}
@@ -1820,6 +1865,10 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
18201865
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
18211866
}
18221867

1868+
if isDCGMExporterKubernetesPodMetadataEnabled(&config.DCGMExporter) {
1869+
obj.Spec.Template.Spec.AutomountServiceAccountToken = ptr.To(true)
1870+
}
1871+
18231872
// mount configmap for custom metrics if provided by user
18241873
if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
18251874
metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}

controllers/object_controls_test.go

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,7 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string
526526
require.Equal(t, spec.args, mainCtr.Args, "unexpected Args")
527527
}
528528
for _, env := range spec.env {
529-
require.Contains(t, mainCtr.Env, env, "env var not present")
529+
require.Equal(t, env.Value, getContainerEnv(&mainCtr, env.Name), "unexpected value for env var %s", env.Name)
530530
}
531531
// TODO: implement checks for other common fields (i.e. Resources, securityContext, Tolerations, etc.)
532532

@@ -1151,6 +1151,14 @@ func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy {
11511151
case "standalone-dcgm":
11521152
dcgmEnabled := true
11531153
cp.Spec.DCGM.Enabled = &dcgmEnabled
1154+
case "pod-labels-enabled":
1155+
cp.Spec.DCGMExporter.Env = []gpuv1.EnvVar{
1156+
{Name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", Value: "true"},
1157+
}
1158+
case "pod-uid-enabled":
1159+
cp.Spec.DCGMExporter.Env = []gpuv1.EnvVar{
1160+
{Name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID", Value: "true"},
1161+
}
11541162
default:
11551163
return nil
11561164
}
@@ -1166,6 +1174,7 @@ func getDCGMExporterTestOutput(testCase string) map[string]interface{} {
11661174
"numDaemonsets": 1,
11671175
"dcgmExporterImage": "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04",
11681176
"imagePullSecret": "ngc-secret",
1177+
"clusterRoleExists": false,
11691178
}
11701179

11711180
switch testCase {
@@ -1175,6 +1184,16 @@ func getDCGMExporterTestOutput(testCase string) map[string]interface{} {
11751184
output["env"] = map[string]string{
11761185
"DCGM_REMOTE_HOSTENGINE_INFO": "nvidia-dcgm:5555",
11771186
}
1187+
case "pod-labels-enabled":
1188+
output["env"] = map[string]string{
1189+
"DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS": "true",
1190+
}
1191+
output["clusterRoleExists"] = true
1192+
case "pod-uid-enabled":
1193+
output["env"] = map[string]string{
1194+
"DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID": "true",
1195+
}
1196+
output["clusterRoleExists"] = true
11781197
default:
11791198
return nil
11801199
}
@@ -1200,6 +1219,16 @@ func TestDCGMExporter(t *testing.T) {
12001219
getDCGMExporterTestInput("standalone-dcgm"),
12011220
getDCGMExporterTestOutput("standalone-dcgm"),
12021221
},
1222+
{
1223+
"PodLabelsEnabled",
1224+
getDCGMExporterTestInput("pod-labels-enabled"),
1225+
getDCGMExporterTestOutput("pod-labels-enabled"),
1226+
},
1227+
{
1228+
"PodUIDEnabled",
1229+
getDCGMExporterTestInput("pod-uid-enabled"),
1230+
getDCGMExporterTestOutput("pod-uid-enabled"),
1231+
},
12031232
}
12041233

12051234
for _, tc := range testCases {
@@ -1231,6 +1260,23 @@ func TestDCGMExporter(t *testing.T) {
12311260
}
12321261
}
12331262

1263+
clusterRoleKey := client.ObjectKey{Name: "nvidia-dcgm-exporter-read-pods"}
1264+
clusterRole := &rbacv1.ClusterRole{}
1265+
clusterRoleGetErr := clusterPolicyController.client.Get(context.Background(), clusterRoleKey, clusterRole)
1266+
clusterRoleBinding := &rbacv1.ClusterRoleBinding{}
1267+
clusterRoleBindingGetErr := clusterPolicyController.client.Get(context.Background(), clusterRoleKey, clusterRoleBinding)
1268+
if tc.output["clusterRoleExists"].(bool) {
1269+
require.NoError(t, clusterRoleGetErr, "ClusterRole should exist when pod metadata enrichment is enabled")
1270+
require.NoError(t, clusterRoleBindingGetErr, "ClusterRoleBinding should exist when pod metadata enrichment is enabled")
1271+
require.NotNil(t, ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be set when pod metadata enrichment is enabled")
1272+
require.True(t, *ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be true when pod metadata enrichment is enabled")
1273+
} else {
1274+
require.True(t, apierrors.IsNotFound(clusterRoleGetErr), "ClusterRole should not exist when pod metadata enrichment is disabled (got err=%v)", clusterRoleGetErr)
1275+
require.True(t, apierrors.IsNotFound(clusterRoleBindingGetErr), "ClusterRoleBinding should not exist when pod metadata enrichment is disabled (got err=%v)", clusterRoleBindingGetErr)
1276+
require.NotNil(t, ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be explicitly set false when pod metadata enrichment is disabled")
1277+
require.False(t, *ds.Spec.Template.Spec.AutomountServiceAccountToken, "AutomountServiceAccountToken should be false when pod metadata enrichment is disabled")
1278+
}
1279+
12341280
require.Equal(t, tc.output["dcgmExporterImage"], dcgmExporterImage, "Unexpected configuration for dcgm-exporter image")
12351281

12361282
// cleanup by deleting all kubernetes objects

0 commit comments

Comments
 (0)