Skip to content

Commit 9060e5f

Browse files
committed
nest serviceMonitor under metrics in operator
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 8e45835 commit 9060e5f

9 files changed

Lines changed: 475 additions & 137 deletions

File tree

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ type ServiceMonitorConfig struct {
148148
// Interval at which metrics should be scraped. If not specified, Prometheus’ global scrape interval is used.
149149
// Supported units: y, w, d, h, m, s, ms
150150
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
151-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from"
151+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval at which metrics should be scraped"
152152
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
153153
Interval promv1.Duration `json:"interval,omitempty"`
154154

@@ -193,10 +193,8 @@ type OperatorSpec struct {
193193
// queryable and should be preserved when modifying objects.
194194
Annotations map[string]string `json:"annotations,omitempty"`
195195

196-
// Optional: ServiceMonitor configuration for NVIDIA GPU Operator
197-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
198-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA GPU Operator"
199-
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
196+
// Metrics configuration for NVIDIA GPU Operator
197+
Metrics OperatorMetricsSpec `json:"metrics,omitempty"`
200198

201199
// UseOpenShiftDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules
202200
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
@@ -205,6 +203,13 @@ type OperatorSpec struct {
205203
UseOpenShiftDriverToolkit *bool `json:"use_ocp_driver_toolkit,omitempty"`
206204
}
207205

206+
type OperatorMetricsSpec struct {
207+
// Optional: ServiceMonitor configuration for NVIDIA GPU Operator
208+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
209+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA GPU Operator"
210+
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
211+
}
212+
208213
// HostPathsSpec defines various paths on the host needed by GPU Operator components
209214
type HostPathsSpec struct {
210215
// RootFS represents the path to the root filesystem of the host.

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 21 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -681,25 +681,23 @@ spec:
681681
additionalProperties:
682682
type: string
683683
description: AdditionalLabels to add to ServiceMonitor instance
684-
for NVIDIA DCGM Exporter
685684
type: object
686685
enabled:
687686
description: Enabled indicates if ServiceMonitor is deployed
688-
for NVIDIA DCGM Exporter
689687
type: boolean
690688
honorLabels:
691689
description: HonorLabels chooses the metric’s labels on collisions
692690
with target labels.
693691
type: boolean
694692
interval:
695693
description: |-
696-
Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
694+
Interval at which metrics should be scraped. If not specified, Prometheus’ global scrape interval is used.
697695
Supported units: y, w, d, h, m, s, ms
698696
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
699697
type: string
700698
relabelings:
701699
description: Relabelings allows to rewrite labels on metric
702-
sets for NVIDIA DCGM Exporter
700+
sets
703701
items:
704702
description: |-
705703
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
@@ -1971,6 +1969,122 @@ spec:
19711969
(scope and select) objects. May match selectors of replication controllers
19721970
and services.
19731971
type: object
1972+
metrics:
1973+
description: Metrics configuration for NVIDIA GPU Operator
1974+
properties:
1975+
serviceMonitor:
1976+
description: 'Optional: ServiceMonitor configuration for NVIDIA
1977+
GPU Operator'
1978+
properties:
1979+
additionalLabels:
1980+
additionalProperties:
1981+
type: string
1982+
description: AdditionalLabels to add to ServiceMonitor
1983+
instance
1984+
type: object
1985+
enabled:
1986+
description: Enabled indicates if ServiceMonitor is deployed
1987+
type: boolean
1988+
honorLabels:
1989+
description: HonorLabels chooses the metric’s labels on
1990+
collisions with target labels.
1991+
type: boolean
1992+
interval:
1993+
description: |-
1994+
Interval at which metrics should be scraped. If not specified, Prometheus’ global scrape interval is used.
1995+
Supported units: y, w, d, h, m, s, ms
1996+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
1997+
type: string
1998+
relabelings:
1999+
description: Relabelings allows to rewrite labels on metric
2000+
sets
2001+
items:
2002+
description: |-
2003+
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
2004+
scraped samples and remote write samples.
2005+
2006+
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
2007+
properties:
2008+
action:
2009+
default: replace
2010+
description: |-
2011+
action to perform based on the regex matching.
2012+
2013+
`Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
2014+
`DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
2015+
2016+
Default: "Replace"
2017+
enum:
2018+
- replace
2019+
- Replace
2020+
- keep
2021+
- Keep
2022+
- drop
2023+
- Drop
2024+
- hashmod
2025+
- HashMod
2026+
- labelmap
2027+
- LabelMap
2028+
- labeldrop
2029+
- LabelDrop
2030+
- labelkeep
2031+
- LabelKeep
2032+
- lowercase
2033+
- Lowercase
2034+
- uppercase
2035+
- Uppercase
2036+
- keepequal
2037+
- KeepEqual
2038+
- dropequal
2039+
- DropEqual
2040+
type: string
2041+
modulus:
2042+
description: |-
2043+
modulus to take of the hash of the source label values.
2044+
2045+
Only applicable when the action is `HashMod`.
2046+
format: int64
2047+
type: integer
2048+
regex:
2049+
description: regex defines the regular expression
2050+
against which the extracted value is matched.
2051+
type: string
2052+
replacement:
2053+
description: |-
2054+
replacement value against which a Replace action is performed if the
2055+
regular expression matches.
2056+
2057+
Regex capture groups are available.
2058+
type: string
2059+
separator:
2060+
description: separator defines the string between
2061+
concatenated SourceLabels.
2062+
type: string
2063+
sourceLabels:
2064+
description: |-
2065+
sourceLabels defines the source labels select values from existing labels. Their content is
2066+
concatenated using the configured Separator and matched against the
2067+
configured regular expression.
2068+
items:
2069+
description: |-
2070+
LabelName is a valid Prometheus label name.
2071+
For Prometheus 3.x, a label name is valid if it contains UTF-8 characters.
2072+
For Prometheus 2.x, a label name is only valid if it contains ASCII characters, letters, numbers, as well as underscores.
2073+
type: string
2074+
type: array
2075+
targetLabel:
2076+
description: |-
2077+
targetLabel defines the label to which the resulting string is written in a replacement.
2078+
2079+
It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
2080+
`KeepEqual` and `DropEqual` actions.
2081+
2082+
Regex capture groups are available.
2083+
type: string
2084+
type: object
2085+
type: array
2086+
type: object
2087+
type: object
19742088
runtimeClass:
19752089
default: nvidia
19762090
type: string

0 commit comments

Comments
 (0)