diff --git a/.github/workflows/alerts-test.yml b/.github/workflows/alerts-test.yml
new file mode 100644
index 00000000..0ee5652a
--- /dev/null
+++ b/.github/workflows/alerts-test.yml
@@ -0,0 +1,61 @@
+# Renders the Kafka alert rules from the Helm chart and unit-tests them with vmalert-tool.
+name: Alerts-test-kafka-operator
+on:
+  workflow_run:
+    workflows: ["Build Artifacts"]
+    types:
+      - completed
+  pull_request:
+    # Branch filters are glob patterns: '**' matches every base branch, whereas a
+    # bare `all` only matches a branch that is literally named "all".
+    branches:
+      - '**'
+
+env:
+  max_attempts: 30
+  delay: 10
+
+permissions:
+  contents: read
+
+jobs:
+  Run-Alerts-Test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Check yq version
+        run: yq --version
+
+      - name: Install Helm
+        # -f makes curl fail on an HTTP error instead of piping the error page into bash.
+        run: |
+          curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+
+      - name: Render rules file from helm chart
+        # Keep only the span from the first "prometheus_rules.yaml" line to the next
+        # `---`, then delete everything up to and including the first `spec:` line.
+        run: |
+          helm template kafka-montemplate ./operator/charts/helm/kafka-service/ > ./operator/tests/alerts-tests/rules.yaml
+          sed -n '/prometheus_rules.yaml/,/---/p' -i ./operator/tests/alerts-tests/rules.yaml
+          sed '0,/spec:/d' -i ./operator/tests/alerts-tests/rules.yaml
+
+      - name: Check that all necessary tests exists
+        # continue-on-error: a failing coverage check does not fail the job.
+        run: |
+          chmod +x ./operator/tests/alerts-tests/tests-checker.sh
+          cd ./operator/tests/alerts-tests/
+          ./tests-checker.sh
+        continue-on-error: true
+
+      - name: Install vmalert-tool
+        run: |
+          wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.122.4/vmutils-linux-amd64-v1.122.4-enterprise.tar.gz
+          tar -xvf vmutils-linux-amd64-v1.122.4-enterprise.tar.gz
+          chmod +x vmalert-tool-prod
+
+      - name: Run test
+        run: |
+          ./vmalert-tool-prod unittest --files ./operator/tests/alerts-tests/test.yaml
diff --git a/monitoring/README.md b/monitoring/README.md index f07186be..c7f72e1c 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -101,3 +101,14 @@ Triggers for tracking following problems are included in template: If you have Kafka which is deployed in DR mode you need to create two hosts: for left and for right side and to
specify the side as value (`left`, `right`) for the macros `{$DR_SIDE}`. If you have Kafka without DR just leave this macros empty. + +### Deep alerts tuning using subchart + +If you want to make deep customizations to alerts (add new ones, override any alert fields, disable alerts, etc.), you can use the v2 alerts functionality. +To use it you need to: + +1) Set the `alertsPackVersion: v2` value in the `monitoring` section of the values YAML for kafka-service. +2) Use the subchart's values YAML (/operator/charts/helm/kafka-service/charts/prometheusrules) to set overrides for alerts. Overrides are merged with the default alerts described in the subchart's `_helpers.tpl` and take priority over them. + +If you set any value other than "v2" for alertsPackVersion, or do not set it at all, the installation uses the old flavour. +Alert groups in the subchart are supported in the same manner as described above. \ No newline at end of file diff --git a/operator/charts/helm/kafka-service/Chart.yaml b/operator/charts/helm/kafka-service/Chart.yaml index 789999a5..f474a4ab 100644 --- a/operator/charts/helm/kafka-service/Chart.yaml +++ b/operator/charts/helm/kafka-service/Chart.yaml @@ -19,3 +19,11 @@ version: 1.0.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. appVersion: 1.0.0 + +dependencies: + # Prometheus alert rules +- name: monitoring + condition: monitoring.install + version: ~0 + repository: "file://charts/prometheusrules" + \ No newline at end of file diff --git a/operator/charts/helm/kafka-service/charts/prometheusrules/Chart.yaml b/operator/charts/helm/kafka-service/charts/prometheusrules/Chart.yaml new file mode 100644 index 00000000..92ce7a9f --- /dev/null +++ b/operator/charts/helm/kafka-service/charts/prometheusrules/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: monitoring +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart.
+# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0"
diff --git a/operator/charts/helm/kafka-service/charts/prometheusrules/templates/_helpers.tpl b/operator/charts/helm/kafka-service/charts/prometheusrules/templates/_helpers.tpl
new file mode 100644
index 00000000..d20e878e
--- /dev/null
+++ b/operator/charts/helm/kafka-service/charts/prometheusrules/templates/_helpers.tpl
@@ -0,0 +1,139 @@
+{{- /*
+  defaultAlerts: YAML mapping holding the default alert definitions.
+  The single top-level key "<release namespace>-<release name>" is the rule group;
+  each key under `rules` is an alert name holding its annotations, expr, `for` and labels.
+  The PartitionCount, BrokerSkew and BrokerLeaderSkew alerts are emitted only when the
+  matching .Values.thresholds entry is truthy.
+*/ -}}
+{{- define "defaultAlerts" -}}
+  {{ .Release.Namespace }}-{{ .Release.Name }}:
+    rules:
+      KafkaIsDegradedAlert:
+        annotations:
+          description: 'Kafka is Degraded'
+          summary: Some of Kafka Service pods are down
+        expr: kafka_cluster_status{namespace="{{ .Release.Namespace }}",container="{{ template "kafka.name" . }}-monitoring"} == 6
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaMetricsAreAbsent:
+        annotations:
+          description: 'Kafka metrics are absent on {{ .Release.Namespace }}.'
+          summary: Kafka metrics are absent
+        expr: absent(kafka_cluster_status{namespace="{{ .Release.Namespace }}"}) == 1
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaIsDownAlert:
+        annotations:
+          description: 'Kafka is Down'
+          summary: All of Kafka Service pods are down
+        expr: kafka_cluster_status{namespace="{{ .Release.Namespace }}",container="{{ template "kafka.name" . }}-monitoring"} == 10
+        for: 3m
+        labels:
+          severity: critical
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaCPUUsageAlert:
+        annotations:
+          description: 'Kafka CPU usage is higher than 95 percents'
+          summary: Some of Kafka Service pods load CPU higher then 95 percents
+        expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}",pod=~"{{ template "kafka.name" . }}-[0-9].*",container="kafka"}[5m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}",exported_pod=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaMemoryUsageAlert:
+        annotations:
+          description: 'Kafka memory usage is higher than 95 percents'
+          summary: Some of Kafka Service pods use memory higher then 95 percents
+        expr: max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}",pod=~"{{ template "kafka.name" . }}-[0-9].*",container="kafka"}) / max(kube_pod_container_resource_limits_memory_bytes{exported_namespace="{{ .Release.Namespace }}",exported_pod=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaHeapMemoryUsageAlert:
+        annotations:
+          description: 'Kafka heap memory usage is higher than 95 percents'
+          summary: Some of Kafka Service pods use heap memory higher then 95 percents
+        expr: max(java_Memory_HeapMemoryUsage_used{namespace="{{ .Release.Namespace }}",broker=~"{{ template "kafka.name" . }}-[0-9].*"}) / max(java_Memory_HeapMemoryUsage_max{namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaGCCountAlert:
+        annotations:
+          description: 'Some of Kafka Service pods have Garbage collections count rate higher than {{ .Values.thresholds.gcCountAlert }}'
+          summary: Some of Kafka Service pods have Garbage collections count rate higher than {{ .Values.thresholds.gcCountAlert }}
+        expr: max(rate(java_GarbageCollector_CollectionCount_total{namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"}[5m])) > {{ .Values.thresholds.gcCountAlert }}
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      KafkaLagAlert:
+        annotations:
+          description: 'Some of Kafka Service pods have partition lag higher than {{ .Values.thresholds.lagAlert }}'
+          summary: Some of Kafka Service pods have partition lag higher than {{ .Values.thresholds.lagAlert }}
+        expr: max(kafka_consumergroup_group_lag{namespace="{{ .Release.Namespace }}"}) > {{ .Values.thresholds.lagAlert }}
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      {{- if .Values.thresholds.partitionCountAlert }}
+      KafkaPartitionCountAlert:
+        annotations:
+          description: 'Kafka Partition count for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.thresholds.partitionCountAlert }}'
+          summary: Some of Kafka Partition count is higher than {{ .Values.thresholds.partitionCountAlert }}
+        expr: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.thresholds.partitionCountAlert }}
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      {{- end }}
+      {{- if .Values.thresholds.brokerSkewAlert }}
+      KafkaBrokerSkewAlert:
+        annotations:
+          description: 'Kafka Broker Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.thresholds.brokerSkewAlert }} percent'
+          summary: Some of Kafka Broker Skew is higher than {{ .Values.thresholds.brokerSkewAlert }} percent
+        expr: (kafka_broker_skew{namespace="{{ .Release.Namespace }}", container="{{ template "kafka.name" . }}-monitoring", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.thresholds.brokerSkewAlert }}) and on(broker, namespace) (kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > 3 )
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      {{- end }}
+      {{- if .Values.thresholds.brokerLeaderSkewAlert }}
+      KafkaBrokerLeaderSkewAlert:
+        annotations:
+          description: 'Kafka Broker Leader Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.thresholds.brokerLeaderSkewAlert }} percent'
+          summary: Some of Kafka Broker Leader Skew is higher than {{ .Values.thresholds.brokerLeaderSkewAlert }} percent
+        expr: (kafka_broker_leader_skew{namespace="{{ .Release.Namespace }}", container="{{ template "kafka.name" . }}-monitoring", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.thresholds.brokerLeaderSkewAlert }}) and on(broker, namespace) (kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > 3 )
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+      {{- end }}
+      SupplementaryServicesCompatibilityAlert:
+        annotations:
+          description: 'Kafka supplementary services in namespace {{`{{ $labels.namespace }}`}} is not compatible with Kafka version {{`{{ $labels.application_version }}`}}'
+          summary: 'Kafka supplementary services in namespace {{`{{ $labels.namespace }}`}} is not compatible with Kafka version {{`{{ $labels.application_version }}`}}, allowed range is {{`{{ $labels.min_version }}`}} - {{`{{ $labels.max_version }}`}}'
+        expr: supplementary_services_version_compatible{application="kafka", namespace="{{ .Release.Namespace }}"} != 1
+        for: 3m
+        labels:
+          severity: warning
+          namespace: {{ .Release.Namespace }}
+          service: {{ .Release.Name }}
+{{- end }}
+
+
diff --git a/operator/charts/helm/kafka-service/charts/prometheusrules/templates/prometheus_rules.yaml b/operator/charts/helm/kafka-service/charts/prometheusrules/templates/prometheus_rules.yaml new file mode 100644 index 00000000..ff61cee4 --- /dev/null +++
b/operator/charts/helm/kafka-service/charts/prometheusrules/templates/prometheus_rules.yaml
@@ -0,0 +1,65 @@
+{{- /*
+  Renders a VMRule from the "defaultAlerts" defaults merged with .Values.alerts
+  (values from .Values.alerts take precedence). When .Values.ruleGroups is
+  non-empty only the groups listed there are rendered; otherwise all groups are.
+*/ -}}
+{{- if and ( .Values.install) (eq .Values.alertsPackVersion "v2") }}
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: prometheusrules
+spec:
+  groups:
+
+{{- $defaultConfig := fromYaml (include "defaultAlerts" . ) -}}
+{{- $overrideConfig := .Values.alerts | default dict -}}
+{{- $finalConfig := merge $overrideConfig $defaultConfig -}}
+{{- $alertGroups := .Values.ruleGroups -}}
+
+{{- range $defaultGroupName, $defaultGroup := $finalConfig }}
+{{- /* Nested blocks must assign with `=`; `:=` would declare a new, block-local $found. */}}
+{{- $found := true }}
+{{- if $alertGroups }}
+{{- $found = false }}
+{{- range $alertGroups }}
+  {{- if eq $defaultGroupName . }}
+    {{- $found = true }}
+  {{- end }}
+{{- end }}
+{{- end }}
+
+{{- if $found }}
+    - name: {{ $defaultGroupName }}
+      {{- if $defaultGroup.labels }}
+      labels:
+        {{- range $defaultLabelName, $defaultLabelValue := $defaultGroup.labels }}
+        {{ $defaultLabelName }}: {{ $defaultLabelValue }}
+        {{- end }}
+      {{- end }}
+      {{- if $defaultGroup.interval }}
+      interval: {{ $defaultGroup.interval }}
+      {{- end }}
+      {{- if $defaultGroup.concurrency }}
+      concurrency: {{ $defaultGroup.concurrency }}
+      {{- end }}
+      rules:
+{{- range $defaultRuleName, $defaultRule := $defaultGroup.rules }}
+        - alert: {{ $defaultRuleName }}
+          expr: {{ $defaultRule.expr }}
+          {{- if $defaultRule.for }}
+          for: {{ $defaultRule.for }}
+          {{- end }}
+          labels:
+{{- range $defaultLabelName, $defaultLabelValue := $defaultRule.labels }}
+            {{ $defaultLabelName }}: {{ $defaultLabelValue }}
+{{- end }}
+          annotations:
+{{- range $defaultAnnotationName, $defaultAnnotationValue := $defaultRule.annotations }}
+{{- /* The text is used as data, not as a printf format, so a literal % is emitted unchanged. */}}
+            {{ $defaultAnnotationName }}: {{ $defaultAnnotationValue | trimAll "\n" | toJson | replace "\\u0026" "&" | replace "\\u003e" ">" | nindent 14 }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{- end }}
diff --git
a/operator/charts/helm/kafka-service/charts/prometheusrules/values.yaml b/operator/charts/helm/kafka-service/charts/prometheusrules/values.yaml new file mode 100644 index 00000000..e69de29b diff --git a/operator/charts/helm/kafka-service/templates/prometheus_rules.yaml b/operator/charts/helm/kafka-service/templates/prometheus_rules.yaml index 9ac81fae..e276028a 100644 --- a/operator/charts/helm/kafka-service/templates/prometheus_rules.yaml +++ b/operator/charts/helm/kafka-service/templates/prometheus_rules.yaml @@ -1,4 +1,4 @@ -{{- if (and (eq (include "monitoring.install" .) "true") (ne (include "monitoring.type" .) "influxdb") .Values.global.installDashboard (ne (.Values.monitoring.installGrafanaDashboard | toString) "false")) }} +{{- if and (eq (include "monitoring.install" .) "true") (ne .Values.monitoring.alertsPackVersion "v2") (ne (include "monitoring.type" .) "influxdb") }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -19,7 +19,7 @@ spec: expr: kafka_cluster_status{namespace="{{ .Release.Namespace }}",container="{{ template "kafka.name" . }}-monitoring"} == 6 for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: KafkaMetricsAreAbsent @@ -29,7 +29,7 @@ spec: expr: absent(kafka_cluster_status{namespace="{{ .Release.Namespace }}"}) == 1 for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: KafkaIsDownAlert @@ -49,7 +49,7 @@ spec: expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}",pod=~"{{ template "kafka.name" . }}-[0-9].*",container="kafka"}[5m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}",exported_pod=~"{{ template "kafka.name" . 
}}-[0-9].*"}) > 0.95 for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: KafkaMemoryUsageAlert @@ -59,7 +59,7 @@ spec: expr: max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}",pod=~"{{ template "kafka.name" . }}-[0-9].*",container="kafka"}) / max(kube_pod_container_resource_limits_memory_bytes{exported_namespace="{{ .Release.Namespace }}",exported_pod=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95 for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: KafkaHeapMemoryUsageAlert @@ -69,7 +69,7 @@ spec: expr: max(java_Memory_HeapMemoryUsage_used{namespace="{{ .Release.Namespace }}",broker=~"{{ template "kafka.name" . }}-[0-9].*"}) / max(java_Memory_HeapMemoryUsage_max{namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95 for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: KafkaGCCountAlert @@ -79,7 +79,7 @@ spec: expr: max(rate(java_GarbageCollector_CollectionCount_total{namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"}[5m])) > {{ .Values.monitoring.thresholds.gcCountAlert }} for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} - alert: KafkaLagAlert @@ -89,7 +89,7 @@ spec: expr: max(kafka_consumergroup_group_lag{namespace="{{ .Release.Namespace }}"}) > {{ .Values.monitoring.thresholds.lagAlert }} for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- if .Values.monitoring.thresholds.partitionCountAlert }} @@ -100,31 +100,31 @@ spec: expr: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . 
}}-[0-9].*"} > {{ .Values.monitoring.thresholds.partitionCountAlert }} for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- end }} {{- if .Values.monitoring.thresholds.brokerSkewAlert }} - alert: KafkaBrokerSkewAlert annotations: - description: 'Kafka Broker Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.monitoring.thresholds.brokerSkewAlert }}%' - summary: Some of Kafka Broker Skew is higher than {{ .Values.monitoring.thresholds.brokerSkewAlert }}% + description: 'Kafka Broker Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.monitoring.thresholds.brokerSkewAlert }} percent' + summary: Some of Kafka Broker Skew is higher than {{ .Values.monitoring.thresholds.brokerSkewAlert }} percent expr: (kafka_broker_skew{namespace="{{ .Release.Namespace }}", container="{{ template "kafka.name" . }}-monitoring", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.monitoring.thresholds.brokerSkewAlert }}) and on(broker, namespace) (kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ coalesce .Values.monitoring.thresholds.brokerSkewAlertPartitionCount (include "kafka.replicas" . 
) }}) for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- end }} {{- if .Values.monitoring.thresholds.brokerLeaderSkewAlert }} - alert: KafkaBrokerLeaderSkewAlert annotations: - description: 'Kafka Broker Leader Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.monitoring.thresholds.brokerLeaderSkewAlert }}%' - summary: Some of Kafka Broker Leader Skew is higher than {{ .Values.monitoring.thresholds.brokerLeaderSkewAlert }}% + description: 'Kafka Broker Leader Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.monitoring.thresholds.brokerLeaderSkewAlert }} percent' + summary: Some of Kafka Broker Leader Skew is higher than {{ .Values.monitoring.thresholds.brokerLeaderSkewAlert }} percent expr: (kafka_broker_leader_skew{namespace="{{ .Release.Namespace }}", container="{{ template "kafka.name" . }}-monitoring", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.monitoring.thresholds.brokerLeaderSkewAlert }}) and on(broker, namespace) (kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ coalesce .Values.monitoring.thresholds.brokerLeaderSkewAlertPartitionCount (include "kafka.replicas" . 
) }}) for: 3m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- end }} @@ -138,7 +138,7 @@ spec: } > 0 for: 5m labels: - severity: high + severity: warning namespace: {{ .Release.Namespace }} service: {{ .Release.Name }} {{- end }} diff --git a/operator/charts/helm/kafka-service/values.yaml b/operator/charts/helm/kafka-service/values.yaml index 2e8d1d82..10894b46 100644 --- a/operator/charts/helm/kafka-service/values.yaml +++ b/operator/charts/helm/kafka-service/values.yaml @@ -194,6 +194,7 @@ kafka: monitoring: install: true + alertsPackVersion: v1 dockerImage: ghcr.io/netcracker/qubership-kafka-monitoring:main serviceMonitorEnabled: true # affinity: { diff --git a/operator/tests/alerts-tests/test.yaml b/operator/tests/alerts-tests/test.yaml new file mode 100644 index 00000000..c23b42ce --- /dev/null +++ b/operator/tests/alerts-tests/test.yaml @@ -0,0 +1,356 @@ +rule_files: +- rules.yaml +evaluation_interval: 1m +tests: +- interval: 1m + input_series: + - series: kafka_cluster_status{namespace="default",container="kafka-monitoring"} + values: "6x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaIsDegradedAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + container: kafka-monitoring + exp_annotations: + description: Kafka is Degraded + summary: Some of Kafka Service pods are down + +- interval: 1m + input_series: + - series: kafka_cluster_status{namespace="default",container="kafka-monitoring"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaIsDegradedAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: kafka_cluster_status + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaMetricsAreAbsent + exp_alerts: + - exp_labels: + severity: warning + namespace: default + 
service: kafka-montemplate + exp_annotations: + description: Kafka metrics are absent on default. + summary: Kafka metrics are absent + +- interval: 1m + input_series: + - series: kafka_cluster_status{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaMetricsAreAbsent + exp_alerts: [] + +- interval: 1m + input_series: + - series: kafka_cluster_status{namespace="default",container="kafka-monitoring"} + values: "10x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaIsDownAlert + exp_alerts: + - exp_labels: + severity: critical + namespace: default + service: kafka-montemplate + container: kafka-monitoring + exp_annotations: + description: Kafka is Down + summary: All of Kafka Service pods are down + +- interval: 1m + input_series: + - series: kafka_cluster_status{namespace="default",container="kafka-monitoring"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaIsDownAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default",pod="kafka-0",container="kafka"} + values: "300+300x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default",exported_pod="kafka-0"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaCPUUsageAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + exp_annotations: + description: Kafka CPU usage is higher than 95 percents + summary: Some of Kafka Service pods load CPU higher then 95 percents + +- interval: 1m + input_series: + - series: container_cpu_usage_seconds_total{namespace="default",pod="kafka-0",container="kafka"} + values: "0x5" + - series: kube_pod_container_resource_limits_cpu_cores{exported_namespace="default",exported_pod="kafka-0"} + values: "1x5" 
+ alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaCPUUsageAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default",pod="kafka-0",container="kafka"} + values: "1x5" + - series: kube_pod_container_resource_limits_memory_bytes{exported_namespace="default",exported_pod="kafka-0"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaMemoryUsageAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + exp_annotations: + description: Kafka memory usage is higher than 95 percents + summary: Some of Kafka Service pods use memory higher then 95 percents + +- interval: 1m + input_series: + - series: container_memory_working_set_bytes{namespace="default",pod="kafka-0",container="kafka"} + values: "0x5" + - series: kube_pod_container_resource_limits_memory_bytes{exported_namespace="default",exported_pod="kafka-0"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaMemoryUsageAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: java_Memory_HeapMemoryUsage_used{namespace="default",broker="kafka-0"} + values: "1x5" + - series: java_Memory_HeapMemoryUsage_max{namespace="default",broker="kafka-0"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaHeapMemoryUsageAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + exp_annotations: + description: Kafka heap memory usage is higher than 95 percents + summary: Some of Kafka Service pods use heap memory higher then 95 percents + +- interval: 1m + input_series: + - series: java_Memory_HeapMemoryUsage_used{namespace="default",broker="kafka-0"} + values: "0x5" + - series: java_Memory_HeapMemoryUsage_max{namespace="default",broker="kafka-0"} 
+ values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaHeapMemoryUsageAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: java_GarbageCollector_CollectionCount_total{namespace="default", broker="kafka-0"} + values: "3001+3001x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaGCCountAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + exp_annotations: + description: Some of Kafka Service pods have Garbage collections count rate higher than 10 + summary: Some of Kafka Service pods have Garbage collections count rate higher than 10 + +- interval: 1m + input_series: + - series: java_GarbageCollector_CollectionCount_total{namespace="default", broker="kafka-0"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaGCCountAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: kafka_consumergroup_group_lag{namespace="default"} + values: "1001x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaLagAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + exp_annotations: + description: Some of Kafka Service pods have partition lag higher than 1000 + summary: Some of Kafka Service pods have partition lag higher than 1000 + +- interval: 1m + input_series: + - series: kafka_consumergroup_group_lag{namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaLagAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="default", broker="kafka-0"} + values: "4001x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaPartitionCountAlert + exp_alerts: + - 
exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + broker: kafka-0 + name: PartitionCount + exp_annotations: + description: Kafka Partition count for kafka-0 broker is higher than 4000 + summary: Some of Kafka Partition count is higher than 4000 + +- interval: 1m + input_series: + - series: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="default", broker="kafka-0"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaPartitionCountAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: kafka_broker_skew{namespace="default", container="kafka-monitoring", broker="kafka-0"} + values: "51x5" + - series: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="default", broker="kafka-0"} + values: "4x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaBrokerSkewAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + broker: kafka-0 + container: kafka-monitoring + exp_annotations: + description: Kafka Broker Skew for kafka-0 broker is higher than 50 percent + summary: Some of Kafka Broker Skew is higher than 50 percent + +- interval: 1m + input_series: + - series: kafka_broker_skew{namespace="default", container="kafka-monitoring", broker="kafka-0"} + values: "0x5" + - series: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="default", broker="kafka-0"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaBrokerSkewAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: kafka_broker_leader_skew{namespace="default", container="kafka-monitoring", broker="kafka-0"} + values: "51x5" + - series: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="default", broker="kafka-0"} + values: "4x5" + alert_rule_test: + - eval_time: 5m + groupname: 
default-kafka-montemplate + alertname: KafkaBrokerLeaderSkewAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + broker: kafka-0 + container: kafka-monitoring + exp_annotations: + description: Kafka Broker Leader Skew for kafka-0 broker is higher than 50 percent + summary: Some of Kafka Broker Leader Skew is higher than 50 percent + +- interval: 1m + input_series: + - series: kafka_broker_leader_skew{namespace="default", container="kafka-monitoring", broker="kafka-0"} + values: "0x5" + - series: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="default", broker="kafka-0"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: KafkaBrokerLeaderSkewAlert + exp_alerts: [] + +- interval: 1m + input_series: + - series: supplementary_services_version_compatible{application="kafka", namespace="default"} + values: "0x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: SupplementaryServicesCompatibilityAlert + exp_alerts: + - exp_labels: + severity: warning + namespace: default + service: kafka-montemplate + application: kafka + exp_annotations: + description: "Kafka supplementary services in namespace default is not compatible with Kafka version " + summary: "Kafka supplementary services in namespace default is not compatible with Kafka version , allowed range is - " + +- interval: 1m + input_series: + - series: supplementary_services_version_compatible{application="kafka", namespace="default"} + values: "1x5" + alert_rule_test: + - eval_time: 5m + groupname: default-kafka-montemplate + alertname: SupplementaryServicesCompatibilityAlert + exp_alerts: [] \ No newline at end of file diff --git a/operator/tests/alerts-tests/tests-checker.sh b/operator/tests/alerts-tests/tests-checker.sh new file mode 100644 index 00000000..21fa71e8 --- /dev/null +++ b/operator/tests/alerts-tests/tests-checker.sh @@ -0,0 +1,33 @@ 
+#!/usr/bin/env bash
+# Verifies that every alert in ./rules.yaml is named by at least two alert_rule_test
+# entries in ./test.yaml. Exits 0 when every rule is covered and 1 otherwise.
+readarray -t rules < <(yq eval '.groups[].rules[].alert' ./rules.yaml)
+readarray -t tests < <(yq eval '.tests[].alert_rule_test[].alertname' ./test.yaml)
+# No alert names at all (e.g. rules.yaml missing or empty) must not be reported as success.
+if [[ "${#rules[@]}" -eq 0 ]]; then
+  echo "No alert rules found in ./rules.yaml" >&2
+  exit 1
+fi
+errorrules=() errorcount=()
+for item in "${rules[@]}"; do
+  count=0
+  for j in "${tests[@]}"; do
+    if [[ "$j" == "$item" ]]; then
+      count=$((count + 1))
+    fi
+  done
+  if [[ "$count" -lt 2 ]]; then
+    errorrules+=("$item")
+    errorcount+=("$count")
+  fi
+done
+
+if [[ "${#errorrules[@]}" -gt 0 ]]; then
+  echo "This alert rules dont have all required tests (minimum 2 tests per rule needed):"
+  for k in "${!errorrules[@]}"; do
+    echo "Alert: ${errorrules[k]}, Tests found: ${errorcount[k]}"
+  done
+  exit 1
+fi
+echo "All alert rules has required tests"
+exit 0