Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions .github/workflows/alerts-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Renders the alert rules from the kafka-service Helm chart and runs the
# vmalert-tool unit tests against them.
name: Alerts-test-kafka-operator
on:
  # Runs after the "Build Artifacts" workflow completes.
  workflow_run:
    workflows: ["Build Artifacts"]
    types:
      - completed
  pull_request:
    branches:
      # `branches` filters on the PR base branch using glob patterns.
      # '**' matches every branch; a bare `all` would only match a branch
      # that is literally named "all".
      - '**'

# NOTE(review): these variables are not referenced by any step visible in this
# workflow; presumably they are read by the scripts it calls - confirm.
env:
  max_attempts: 30
  delay: 10

permissions:
  contents: read

jobs:
  Run-Alerts-Test:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Check yq version
        run: yq --version

      - name: Install Helm
        run: |
          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

      - name: Render rules file from helm chart
        # 1. Render the whole chart into rules.yaml.
        # 2. Keep only the span from the first line mentioning
        #    prometheus_rules.yaml up to the next `---` separator.
        # 3. Drop everything up to and including the first `spec:` line.
        run: |
          helm template kafka-montemplate ./operator/charts/helm/kafka-service/ > ./operator/tests/alerts-tests/rules.yaml
          sed -n '/prometheus_rules.yaml/,/---/p' -i ./operator/tests/alerts-tests/rules.yaml
          sed '0,/spec:/d' -i ./operator/tests/alerts-tests/rules.yaml

      - name: Check that all necessary tests exists
        run: |
          chmod +x ./operator/tests/alerts-tests/tests-checker.sh
          cd ./operator/tests/alerts-tests/
          ./tests-checker.sh
        # A failing checker is reported but does not fail the job.
        continue-on-error: true

      - name: Install vmalert-tool
        run: |
          wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.122.4/vmutils-linux-amd64-v1.122.4-enterprise.tar.gz
          tar -xvf vmutils-linux-amd64-v1.122.4-enterprise.tar.gz
          chmod +x vmalert-tool-prod

      - name: Run test
        run: |
          ./vmalert-tool-prod unittest --files ./operator/tests/alerts-tests/test.yaml
11 changes: 11 additions & 0 deletions monitoring/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,14 @@ Triggers for tracking following problems are included in template:
If Kafka is deployed in DR mode, you need to create two hosts, one for the left side and one for the right side,
and specify the side (`left` or `right`) as the value of the `{$DR_SIDE}` macro.
If Kafka is deployed without DR, leave this macro empty.

### Deep alerts tuning using subchart

To customize alerts in depth (add new alerts, override any alert field, disable alerts, etc.), you can use the v2 alerts functionality.
To use it:

1) Set `alertsPackVersion: v2` in the `monitoring` section of the values YAML for kafka-service.
2) Use the subchart's values YAML (`/operator/charts/helm/kafka-service/charts/prometheusrules`) to set alert overrides. Overrides are merged with the default alerts defined in the subchart's `helpers.tpl` and take priority over them.

If you set `alertsPackVersion` to any value other than `v2`, or do not set it at all, the old alerts flavour is installed.
Alert groups in the subchart are supported in the same manner as described above.
8 changes: 8 additions & 0 deletions operator/charts/helm/kafka-service/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,11 @@ version: 1.0.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application.
appVersion: 1.0.0

dependencies:
  # Alert rules subchart: the local `monitoring` chart stored under
  # charts/prometheusrules, enabled through the `monitoring.install` value.
  - name: monitoring
    version: "~0"
    repository: "file://charts/prometheusrules"
    condition: monitoring.install

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
apiVersion: v2
name: monitoring
description: A Helm chart for Kubernetes

# Chart type: 'application' or 'library'.
#  - Application charts bundle templates that can be packaged into versioned
#    archives and deployed.
#  - Library charts only expose utilities/functions to the charts that depend
#    on them; they define no templates and cannot be deployed on their own.
type: application

# Chart version. Bump it on every change to the chart or its templates
# (including appVersion changes). Must follow Semantic Versioning
# (https://semver.org/).
version: 0.1.0

# Version of the application being deployed. Bump it on every application
# change. It does not have to follow Semantic Versioning; it should mirror the
# version the application itself uses. Quoting the value is recommended.
appVersion: "1.16.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{{- /*
defaultAlerts - default alert definitions, rendered as a YAML mapping.

Shape of the output: a single top-level key "<release namespace>-<release name>"
(used as the rule group name) holding a `rules` mapping of alert name to
annotations / expr / for / labels.
The output is parsed back with `fromYaml (include "defaultAlerts" .)` by the
rules template, so it must stay valid YAML after rendering.

Inputs read from the context:
  .Release.Namespace, .Release.Name  - used in selectors, labels and the group key
  template "kafka.name"              - defined outside this block
  .Values.thresholds.gcCountAlert, .lagAlert                               - always rendered
  .Values.thresholds.partitionCountAlert, .brokerSkewAlert, .brokerLeaderSkewAlert
                                     - each guards its own optional alert

Backtick-quoted actions around $labels references emit the inner text literally,
so those placeholders are NOT resolved by Helm.
NOTE(review): presumably they are resolved later by the alerting engine - confirm.
*/ -}}
{{- define "defaultAlerts" -}}
{{ .Release.Namespace }}-{{ .Release.Name }}:
rules:
KafkaIsDegradedAlert:
annotations:
description: 'Kafka is Degraded'
summary: Some of Kafka Service pods are down
expr: kafka_cluster_status{namespace="{{ .Release.Namespace }}",container="{{ template "kafka.name" . }}-monitoring"} == 6
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaMetricsAreAbsent:
annotations:
description: 'Kafka metrics are absent on {{ .Release.Namespace }}.'
summary: Kafka metrics are absent
expr: absent(kafka_cluster_status{namespace="{{ .Release.Namespace }}"}) == 1
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaIsDownAlert:
annotations:
description: 'Kafka is Down'
summary: All of Kafka Service pods are down
expr: kafka_cluster_status{namespace="{{ .Release.Namespace }}",container="{{ template "kafka.name" . }}-monitoring"} == 10
for: 3m
labels:
severity: critical
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaCPUUsageAlert:
annotations:
description: 'Kafka CPU usage is higher than 95 percents'
summary: Some of Kafka Service pods load CPU higher then 95 percents
expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}",pod=~"{{ template "kafka.name" . }}-[0-9].*",container="kafka"}[5m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}",exported_pod=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaMemoryUsageAlert:
annotations:
description: 'Kafka memory usage is higher than 95 percents'
summary: Some of Kafka Service pods use memory higher then 95 percents
expr: max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}",pod=~"{{ template "kafka.name" . }}-[0-9].*",container="kafka"}) / max(kube_pod_container_resource_limits_memory_bytes{exported_namespace="{{ .Release.Namespace }}",exported_pod=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaHeapMemoryUsageAlert:
annotations:
description: 'Kafka heap memory usage is higher than 95 percents'
summary: Some of Kafka Service pods use heap memory higher then 95 percents
expr: max(java_Memory_HeapMemoryUsage_used{namespace="{{ .Release.Namespace }}",broker=~"{{ template "kafka.name" . }}-[0-9].*"}) / max(java_Memory_HeapMemoryUsage_max{namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"}) > 0.95
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaGCCountAlert:
annotations:
description: 'Some of Kafka Service pods have Garbage collections count rate higher than {{ .Values.thresholds.gcCountAlert }}'
summary: Some of Kafka Service pods have Garbage collections count rate higher than {{ .Values.thresholds.gcCountAlert }}
expr: max(rate(java_GarbageCollector_CollectionCount_total{namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"}[5m])) > {{ .Values.thresholds.gcCountAlert }}
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
KafkaLagAlert:
annotations:
description: 'Some of Kafka Service pods have partition lag higher than {{ .Values.thresholds.lagAlert }}'
summary: Some of Kafka Service pods have partition lag higher than {{ .Values.thresholds.lagAlert }}
expr: max(kafka_consumergroup_group_lag{namespace="{{ .Release.Namespace }}"}) > {{ .Values.thresholds.lagAlert }}
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
{{- /* Optional alert: rendered only when thresholds.partitionCountAlert is set to a non-empty value. */}}
{{- if .Values.thresholds.partitionCountAlert }}
KafkaPartitionCountAlert:
annotations:
description: 'Kafka Partition count for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.thresholds.partitionCountAlert }}'
summary: Some of Kafka Partition count is higher than {{ .Values.thresholds.partitionCountAlert }}
expr: kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.thresholds.partitionCountAlert }}
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
{{- end }}
{{- /* Optional alert: rendered only when thresholds.brokerSkewAlert is set. The expr additionally requires the broker's PartitionCount to be above 3. */}}
{{- if .Values.thresholds.brokerSkewAlert }}
KafkaBrokerSkewAlert:
annotations:
description: 'Kafka Broker Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.thresholds.brokerSkewAlert }} percent'
summary: Some of Kafka Broker Skew is higher than {{ .Values.thresholds.brokerSkewAlert }} percent
expr: (kafka_broker_skew{namespace="{{ .Release.Namespace }}", container="{{ template "kafka.name" . }}-monitoring", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.thresholds.brokerSkewAlert }}) and on(broker, namespace) (kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > 3 )
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
{{- end }}
{{- /* Optional alert: rendered only when thresholds.brokerLeaderSkewAlert is set. Same PartitionCount guard as the broker skew alert. */}}
{{- if .Values.thresholds.brokerLeaderSkewAlert }}
KafkaBrokerLeaderSkewAlert:
annotations:
description: 'Kafka Broker Leader Skew for {{`{{ $labels.broker }}`}} broker is higher than {{ .Values.thresholds.brokerLeaderSkewAlert }} percent'
summary: Some of Kafka Broker Leader Skew is higher than {{ .Values.thresholds.brokerLeaderSkewAlert }} percent
expr: (kafka_broker_leader_skew{namespace="{{ .Release.Namespace }}", container="{{ template "kafka.name" . }}-monitoring", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > {{ .Values.thresholds.brokerLeaderSkewAlert }}) and on(broker, namespace) (kafka_server_ReplicaManager_Value{name="PartitionCount", namespace="{{ .Release.Namespace }}", broker=~"{{ template "kafka.name" . }}-[0-9].*"} > 3 )
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
{{- end }}
SupplementaryServicesCompatibilityAlert:
annotations:
description: 'Kafka supplementary services in namespace {{`{{ $labels.namespace }}`}} is not compatible with Kafka version {{`{{ $labels.application_version }}`}}'
summary: 'Kafka supplementary services in namespace {{`{{ $labels.namespace }}`}} is not compatible with Kafka version {{`{{ $labels.application_version }}`}}, allowed range is {{`{{ $labels.min_version }}`}} - {{`{{ $labels.max_version }}`}}'
expr: supplementary_services_version_compatible{application="kafka", namespace="{{ .Release.Namespace }}"} != 1
for: 3m
labels:
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
{{- end }}


Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{{- /*
Renders the "v2" alerts pack as a single VMRule.

Rendered only when `.Values.install` is truthy and `.Values.alertsPackVersion`
is "v2". The value is passed through `toString` so that an unset or non-string
alertsPackVersion simply fails the comparison.

Inputs:
  "defaultAlerts" template - default groups/rules, parsed with fromYaml
  .Values.alerts           - overrides; merged over the defaults (override wins)
  .Values.ruleGroups       - optional list of group names; when non-empty,
                             only the listed groups are rendered
*/ -}}
{{- if and (.Values.install) (eq (toString .Values.alertsPackVersion) "v2") }}
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: prometheusrules
spec:
  groups:
    {{- $defaultConfig := fromYaml (include "defaultAlerts" .) -}}
    {{- $overrideConfig := .Values.alerts -}}
    {{- /* `merge` gives precedence to its first argument, so overrides beat defaults. */ -}}
    {{- $finalConfig := merge $overrideConfig $defaultConfig -}}
    {{- $alertGroups := .Values.ruleGroups -}}
    {{- range $groupName, $group := $finalConfig }}
    {{- /*
    Group filter. `$found` must be updated with `=`: using `:=` inside the
    `if` would declare a new, block-scoped variable and leave the outer one
    permanently true, turning the filter into a no-op.
    */}}
    {{- $found := true }}
    {{- if $alertGroups }}
    {{- $found = has $groupName $alertGroups }}
    {{- end }}
    {{- if $found }}
    - name: {{ $groupName }}
      {{- if $group.labels }}
      labels:
        {{- range $labelName, $labelValue := $group.labels }}
        {{- /* Quoted so boolean- or number-looking values stay strings. */}}
        {{ $labelName }}: {{ $labelValue | quote }}
        {{- end }}
      {{- end }}
      {{- if $group.interval }}
      interval: {{ $group.interval }}
      {{- end }}
      {{- if $group.concurrency }}
      concurrency: {{ $group.concurrency }}
      {{- end }}
      rules:
        {{- range $ruleName, $rule := $group.rules }}
        - alert: {{ $ruleName }}
          expr: {{ $rule.expr }}
          {{- if $rule.for }}
          for: {{ $rule.for }}
          {{- end }}
          labels:
            {{- range $labelName, $labelValue := $rule.labels }}
            {{ $labelName }}: {{ $labelValue | quote }}
            {{- end }}
          annotations:
            {{- range $annotationName, $annotationValue := $rule.annotations }}
            {{- /*
            The value is emitted as a JSON string on its own line. `toString`
            replaces the former `printf <value>`, which treated the annotation
            text as a format string and mangled any literal percent sign.
            The `replace` calls undo the JSON escaping of "&" and ">".
            */}}
            {{ $annotationName }}: {{ $annotationValue | toString | trimAll "\n" | toJson | replace "\\u0026" "&" | replace "\\u003e" ">" | nindent 14 }}
            {{- end }}
        {{- end }}
    {{- end }}
    {{- end }}
{{- end }}
Empty file.
Loading
Loading