Skip to content

Commit bfd8a3d

Browse files
Add scrapeTimeout to DCGM Exporter ServiceMonitor
Expose scrapeTimeout on DCGMExporterServiceMonitorConfig in the ClusterPolicy CR and propagate it to the rendered ServiceMonitor's endpoint. The field is also surfaced in the gpu-operator Helm values under dcgmExporter.serviceMonitor. An empty string means the field is unset; in that case the operator does not set scrapeTimeout on the endpoint and Prometheus' global scrape timeout applies.
Signed-off-by: Rajath Agasthya <ragasthya@nvidia.com>
1 parent da2b563 commit bfd8a3d

7 files changed

Lines changed: 38 additions & 2 deletions

File tree

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,6 +1082,14 @@ type DCGMExporterServiceMonitorConfig struct {
10821082
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
10831083
Interval promv1.Duration `json:"interval,omitempty"`
10841084

1085+
// ScrapeTimeout to use when scraping metrics from NVIDIA DCGM Exporter. Must not be greater than Interval.
1086+
// If not specified, Prometheus' global scrape timeout is used.
1087+
// Supported units: y, w, d, h, m, s, ms
1088+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
1089+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Scrape timeout for NVIDIA DCGM Exporter"
1090+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
1091+
ScrapeTimeout promv1.Duration `json:"scrapeTimeout,omitempty"`
1092+
10851093
// HonorLabels chooses the metric’s labels on collisions with target labels.
10861094
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
10871095
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,13 @@ spec:
785785
type: string
786786
type: object
787787
type: array
788+
scrapeTimeout:
789+
description: |-
790+
ScrapeTimeout to use when scraping metrics from NVIDIA DCGM Exporter. Must not be greater than Interval.
791+
If not specified, Prometheus' global scrape timeout is used.
792+
Supported units: y, w, d, h, m, s, ms
793+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
794+
type: string
788795
type: object
789796
version:
790797
description: NVIDIA DCGM Exporter image tag

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,13 @@ spec:
785785
type: string
786786
type: object
787787
type: array
788+
scrapeTimeout:
789+
description: |-
790+
ScrapeTimeout to use when scraping metrics from NVIDIA DCGM Exporter. Must not be greater than Interval.
791+
If not specified, Prometheus' global scrape timeout is used.
792+
Supported units: y, w, d, h, m, s, ms
793+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
794+
type: string
788795
type: object
789796
version:
790797
description: NVIDIA DCGM Exporter image tag

controllers/object_controls.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4963,6 +4963,10 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
49634963
obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval
49644964
}
49654965

4966+
if serviceMonitor.ScrapeTimeout != "" {
4967+
obj.Spec.Endpoints[0].ScrapeTimeout = serviceMonitor.ScrapeTimeout
4968+
}
4969+
49664970
if serviceMonitor.HonorLabels != nil {
49674971
obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels
49684972
}

controllers/object_controls_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,7 @@ func TestServiceMonitor(t *testing.T) {
13781378
ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{
13791379
Enabled: ptr.To(true),
13801380
Interval: promv1.Duration("15s"),
1381+
ScrapeTimeout: promv1.Duration("10s"),
13811382
HonorLabels: ptr.To(true),
13821383
AdditionalLabels: map[string]string{"a": "b"},
13831384
Relabelings: []*promv1.RelabelConfig{{Action: "keep"}},
@@ -1394,8 +1395,9 @@ func TestServiceMonitor(t *testing.T) {
13941395
Spec: promv1.ServiceMonitorSpec{
13951396
NamespaceSelector: promv1.NamespaceSelector{MatchNames: []string{"test-namespace"}},
13961397
Endpoints: []promv1.Endpoint{{
1397-
Interval: promv1.Duration("15s"),
1398-
HonorLabels: true,
1398+
Interval: promv1.Duration("15s"),
1399+
ScrapeTimeout: promv1.Duration("10s"),
1400+
HonorLabels: true,
13991401
RelabelConfigs: []promv1.RelabelConfig{{
14001402
Action: "keep",
14011403
}},

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,13 @@ spec:
785785
type: string
786786
type: object
787787
type: array
788+
scrapeTimeout:
789+
description: |-
790+
ScrapeTimeout to use when scraping metrics from NVIDIA DCGM Exporter. Must not be greater than Interval.
791+
If not specified, Prometheus' global scrape timeout is used.
792+
Supported units: y, w, d, h, m, s, ms
793+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
794+
type: string
788795
type: object
789796
version:
790797
description: NVIDIA DCGM Exporter image tag

deployments/gpu-operator/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ dcgmExporter:
302302
serviceMonitor:
303303
enabled: false
304304
interval: 15s
305+
scrapeTimeout: ""
305306
honorLabels: false
306307
additionalLabels: {}
307308
relabelings: []

0 commit comments

Comments
 (0)