From 12847b90cf7617d8b1044a8fb3cc477e144f9d85 Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Wed, 2 Jul 2025 22:35:50 +0700 Subject: [PATCH 1/3] [dcgm-exporter] Add configurable externalTrafficPolicy for dcgmExporter service Signed-off-by: aldi.j --- api/nvidia/v1/clusterpolicy_types.go | 8 ++++++++ api/nvidia/v1/zz_generated.deepcopy.go | 5 +++++ bundle/manifests/nvidia.com_clusterpolicies.yaml | 8 ++++++++ config/crd/bases/nvidia.com_clusterpolicies.yaml | 8 ++++++++ controllers/object_controls.go | 4 ++++ .../gpu-operator/crds/nvidia.com_clusterpolicies.yaml | 8 ++++++++ deployments/gpu-operator/values.yaml | 2 ++ 7 files changed, 43 insertions(+) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 6e466aef8..4f23f9bca 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -935,6 +935,14 @@ type DCGMExporterServiceConfig struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" Type corev1.ServiceType `json:"type,omitempty"` + // ExternalTrafficPolicy controls how external traffic is handled by the Kubernetes service. + // Acceptable values are "Cluster" and "Local". This setting only applies when service type is NodePort or LoadBalancer. + // +kubebuilder:validation:Enum=Cluster;Local + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="External Traffic Policy" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:Cluster,Local" + ExternalTrafficPolicy string `json:"externalTrafficPolicy,omitempty"` + // InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. 
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Internal Traffic Policy for the DCGM Exporter K8s Service" diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 1735b0699..1192ed208 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -276,6 +276,11 @@ func (in *DCGMExporterMetricsConfig) DeepCopy() *DCGMExporterMetricsConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DCGMExporterServiceConfig) DeepCopyInto(out *DCGMExporterServiceConfig) { *out = *in + if in.ExternalTrafficPolicy != nil { + in, out := &in.ExternalTrafficPolicy, &out.ExternalTrafficPolicy + *out = new(corev1.ServiceExternalTrafficPolicyType) + **out = **in + } if in.InternalTrafficPolicy != nil { in, out := &in.InternalTrafficPolicy, &out.InternalTrafficPolicy *out = new(corev1.ServiceInternalTrafficPolicy) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 032be5190..0ae2a6230 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -385,6 +385,14 @@ spec: description: 'Optional: Service configuration for NVIDIA DCGM Exporter' properties: + externalTrafficPolicy: + description: |- + Controls how external traffic is routed to the service. + Valid values are "Cluster" (default) and "Local". Applies to NodePort and LoadBalancer service types only. + enum: + - Cluster + - Local + type: string internalTrafficPolicy: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. 
diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 032be5190..0ae2a6230 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -385,6 +385,14 @@ spec: description: 'Optional: Service configuration for NVIDIA DCGM Exporter' properties: + externalTrafficPolicy: + description: |- + Controls how external traffic is routed to the service. + Valid values are "Cluster" (default) and "Local". Applies to NodePort and LoadBalancer service types only. + enum: + - Cluster + - Local + type: string internalTrafficPolicy: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 30b52e81a..5e8562493 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -998,6 +998,10 @@ func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPoli if serviceConfig.InternalTrafficPolicy != nil { obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy } + + if serviceConfig.ExternalTrafficPolicy != nil { + obj.Spec.ExternalTrafficPolicy = *serviceConfig.ExternalTrafficPolicy + } } return nil } diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 032be5190..0ae2a6230 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -385,6 +385,14 @@ spec: description: 'Optional: Service configuration for NVIDIA DCGM Exporter' properties: + externalTrafficPolicy: + description: |- + Controls how external traffic is routed to the service. + Valid values are "Cluster" (default) and "Local". Applies to NodePort and LoadBalancer service types only. 
+ enum: + - Cluster + - Local + type: string internalTrafficPolicy: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 5f4b64c07..d52926962 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -324,6 +324,8 @@ dcgmExporter: value: "/etc/dcgm-exporter/dcp-metrics-included.csv" resources: {} service: + type: ClusterIP + externalTrafficPolicy: Cluster internalTrafficPolicy: Cluster serviceMonitor: enabled: false From e6a05b5fbc29c2863eaf21062a4c9d08f3b66076 Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Wed, 2 Jul 2025 22:55:54 +0700 Subject: [PATCH 2/3] [dcgm-exporter] Revise the struct field for ExternalTrafficPolicy Signed-off-by: aldi.j --- api/nvidia/v1/clusterpolicy_types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 4f23f9bca..f5775a333 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -941,7 +941,7 @@ type DCGMExporterServiceConfig struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="External Traffic Policy" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:Cluster,Local" - ExternalTrafficPolicy string `json:"externalTrafficPolicy,omitempty"` + ExternalTrafficPolicy *corev1.ServiceExternalTrafficPolicyType `json:"externalTrafficPolicy,omitempty"` // InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. 
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true From a9ffd601e77ee4f387cbea17de5e0f9db284c037 Mon Sep 17 00:00:00 2001 From: "aldi.j" Date: Thu, 3 Jul 2025 01:22:17 +0700 Subject: [PATCH 3/3] [dcgm-exporter] Allow custom number for nodePort Signed-off-by: aldi.j --- api/nvidia/v1/clusterpolicy_types.go | 7 +++++++ api/nvidia/v1/zz_generated.deepcopy.go | 5 +++++ bundle/manifests/nvidia.com_clusterpolicies.yaml | 7 +++++++ config/crd/bases/nvidia.com_clusterpolicies.yaml | 7 +++++++ controllers/object_controls.go | 4 ++++ .../gpu-operator/crds/nvidia.com_clusterpolicies.yaml | 7 +++++++ deployments/gpu-operator/values.yaml | 1 + 7 files changed, 38 insertions(+) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index f5775a333..dd1dfa29f 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -935,6 +935,13 @@ type DCGMExporterServiceConfig struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" Type corev1.ServiceType `json:"type,omitempty"` + // NodePort allows setting a custom port number for the NodePort service. + // Must be within the range configured in the Kubernetes API server (usually 30000-32767). + // +kubebuilder:validation:Minimum=30000 + // +kubebuilder:validation:Maximum=32767 + // +optional + NodePort *int32 `json:"nodePort,omitempty"` + // ExternalTrafficPolicy controls how external traffic is handled by the Kubernetes service. // Acceptable values are "Cluster" and "Local". This setting only applies when service type is NodePort or LoadBalancer. 
// +kubebuilder:validation:Enum=Cluster;Local diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 1192ed208..93b9d7d4f 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -276,6 +276,11 @@ func (in *DCGMExporterMetricsConfig) DeepCopy() *DCGMExporterMetricsConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DCGMExporterServiceConfig) DeepCopyInto(out *DCGMExporterServiceConfig) { *out = *in + if in.NodePort != nil { + in, out := &in.NodePort, &out.NodePort + *out = new(int32) + **out = **in + } if in.ExternalTrafficPolicy != nil { in, out := &in.ExternalTrafficPolicy, &out.ExternalTrafficPolicy *out = new(corev1.ServiceExternalTrafficPolicyType) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 0ae2a6230..d5f06899d 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -397,6 +397,13 @@ spec: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. type: string + nodePort: + description: |- + Custom port number to expose for the NodePort service. + Must be between 30000 and 32767. + minimum: 30000 + maximum: 32767 + type: integer type: description: Type represents the ServiceType which describes ingress methods for a service diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 0ae2a6230..d5f06899d 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -397,6 +397,13 @@ spec: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. 
type: string + nodePort: + description: |- + Custom port number to expose for the NodePort service. + Must be between 30000 and 32767. + minimum: 30000 + maximum: 32767 + type: integer type: description: Type represents the ServiceType which describes ingress methods for a service diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 5e8562493..8ad717df5 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -995,6 +995,10 @@ func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPoli obj.Spec.Type = serviceConfig.Type } + if serviceConfig.NodePort != nil && len(obj.Spec.Ports) > 0 { + obj.Spec.Ports[0].NodePort = *serviceConfig.NodePort + } + if serviceConfig.InternalTrafficPolicy != nil { obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy } diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 0ae2a6230..d5f06899d 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -397,6 +397,13 @@ spec: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. type: string + nodePort: + description: |- + Custom port number to expose for the NodePort service. + Must be between 30000 and 32767. + minimum: 30000 + maximum: 32767 + type: integer type: description: Type represents the ServiceType which describes ingress methods for a service diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index d52926962..a8e313fb9 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -325,6 +325,7 @@ dcgmExporter: resources: {} service: type: ClusterIP + nodePort: null externalTrafficPolicy: Cluster internalTrafficPolicy: Cluster serviceMonitor: