From f303249a26613d3e24e8b8a184f90de4e7638dc7 Mon Sep 17 00:00:00 2001 From: saja0219 Date: Thu, 14 May 2026 12:49:42 +0530 Subject: [PATCH] fix: rabbitmq auto cluster restart --- .../helm/rabbitmq/templates/deployment.yaml | 8 ++++++ operator/charts/helm/rabbitmq/values.yaml | 2 ++ operator/src/handler.py | 26 +++++++++++++++++++ operator/src/rabbitconstants.py | 23 +++++++++++----- 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/operator/charts/helm/rabbitmq/templates/deployment.yaml b/operator/charts/helm/rabbitmq/templates/deployment.yaml index efffa4c6..ebdff6a3 100644 --- a/operator/charts/helm/rabbitmq/templates/deployment.yaml +++ b/operator/charts/helm/rabbitmq/templates/deployment.yaml @@ -125,6 +125,14 @@ spec: - name: ENABLE_SHOVEL_MONITORING value: "{{ .Values.operator.enableOperatorShovelMonitoring }}" {{- end }} + {{- if .Values.operator.enableClusterRestart }} + - name: ENABLE_CLUSTER_RESTART + value: "{{ .Values.operator.enableClusterRestart }}" + {{- end }} + {{- if .Values.operator.clusterRestartThreshold }} + - name: CLUSTER_RESTART_THRESHOLD + value: "{{ .Values.operator.clusterRestartThreshold }}" + {{- end }} - name: API_GROUP value: {{ .Values.operator.apiGroup | quote }} {{- if or (eq (include "backupDaemon.enableTls" .) "true") (and (eq (include "rabbitmq.enableTls" .) "true") (include "rabbitmq.tlsSecretName" .)) }} diff --git a/operator/charts/helm/rabbitmq/values.yaml b/operator/charts/helm/rabbitmq/values.yaml index fb0845b4..0b046c0f 100644 --- a/operator/charts/helm/rabbitmq/values.yaml +++ b/operator/charts/helm/rabbitmq/values.yaml @@ -42,6 +42,8 @@ operator: apiGroup: "netcracker.com" pullPolicy: "Always" enableOperatorShovelMonitoring: true + enableClusterRestart: false + clusterRestartThreshold: 10800 restartScheduler: dockerImage: ghcr.io/netcracker/qubership-docker-kubectl:main enabled: true diff --git a/operator/src/handler.py b/operator/src/handler.py index bbb70e38..f3fe6c48 100644 --- a/operator/src/handler.py +++ b/operator/src/handler.py @@ -72,6 +72,7 @@ logger.setLevel(LOGLEVEL) logger.info("loglevel is set to " + str(LOGLEVEL)) +CLUSTER_DOWN_SINCE = 0 class FakeKubeResponse: def __init__(self, obj): @@ -1620,6 +1621,31 @@ def shovel_monitoring(spec,retry, **kwargs): logger.info("All shovels are running properly after restart.") +@kopf.timer(api_group, cr_version, 'rabbitmqservices', interval=900, initial_delay=900) +def cluster_monitoring(spec, **kwargs): + global CLUSTER_DOWN_SINCE + enabled = False + try: + enabled = bool(util.strtobool(os.getenv("ENABLE_CLUSTER_RESTART", "false"))) + except Exception: + enabled = False + if enabled: + kub_helper = KubernetesHelper(spec) + threshold_seconds = int(os.getenv("CLUSTER_RESTART_THRESHOLD", "10800")) + try: + cluster_status = kub_helper.check_cluster_state() + if cluster_status == "error": + current_time = int(time.time()) + if CLUSTER_DOWN_SINCE == 0: + CLUSTER_DOWN_SINCE = current_time + outage_duration = current_time - CLUSTER_DOWN_SINCE + logger.warning(f"RabbitMQ cluster unavailable for {outage_duration} seconds") + if outage_duration >= threshold_seconds: + logger.error("RabbitMQ cluster exceeded outage threshold. Restarting cluster.") + kub_helper.reboot_pods() + except Exception as ex: + logger.warning(f"RabbitMQ cluster monitoring failed: {ex}") + @kopf.on.create(api_group, cr_version, 'rabbitmqservices') def on_create(body, meta, spec, status, **kwargs): kub_helper = KubernetesHelper(spec) diff --git a/operator/src/rabbitconstants.py b/operator/src/rabbitconstants.py index 8c6228c4..f6dc563b 100644 --- a/operator/src/rabbitconstants.py +++ b/operator/src/rabbitconstants.py @@ -18,7 +18,8 @@ # TODO $(get_user):$(get_password) was changed to guest:guest """ -rabbitmq-diagnostics -q check_running && rabbitmq-diagnostics -q check_local_alarms || exit 1 +rabbitmq-diagnostics -q check_running && \ +rabbitmq-diagnostics -q check_local_alarms || exit 1 if [[ "$HOSTNAME" != "rmqlocal-0-0" ]] && [[ ! -f /var/lib/rabbitmq/started_at_least_once ]] ; then FD_OUTPUT="/proc/1/fd/1" @@ -45,7 +46,13 @@ """ ] -rabbitmq_hostpath_readiness_probe_command = ['rabbitmq-diagnostics', 'ping', '-q'] +rabbitmq_hostpath_readiness_probe_command = ['bin/bash', '-c', + """ +rabbitmq-diagnostics -q check_running && \ +rabbitmq-diagnostics -q check_local_alarms && \ +rabbitmq-diagnostics -q check_virtual_hosts +""" + ] rabbitmq_storageclass_liveness_probe_command = ['bin/bash', '-c', """ if [ -f /var/lib/rabbitmq/started_at_least_once ]; then @@ -56,7 +63,7 @@ echo "http-liveness probe failed" exit 1 fi -elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 1 ; then +elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 30 ; then echo "awaiting nodes succeeded" else echo "awaiting nodes liveness probe failed" exit 1 @@ -67,8 +74,10 @@ rabbitmq_storageclass_readiness_probe_command = ['bin/bash', '-c', """ if [ -f /var/lib/rabbitmq/started_at_least_once ]; then echo "executing rabbitmq-diagnostics ping -q" - rabbitmq-diagnostics ping -q; -elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 1 ; then + rabbitmq-diagnostics -q check_running && \ + rabbitmq-diagnostics -q check_local_alarms && \ + rabbitmq-diagnostics -q check_virtual_hosts +elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 30 ; then echo "awaiting nodes succeeded" touch /var/lib/rabbitmq/started_at_least_once else @@ -94,7 +103,7 @@ storageclass_liveness_probe = V1Probe(failure_threshold=30, initial_delay_seconds=10, period_seconds=30, success_threshold=1, - timeout_seconds=15, + timeout_seconds=20, _exec=V1ExecAction( command=rabbitmq_storageclass_liveness_probe_command)) @@ -108,6 +117,6 @@ hostpath_liveness_probe = V1Probe(failure_threshold=30, initial_delay_seconds=10, period_seconds=30, success_threshold=1, - timeout_seconds=15, + timeout_seconds=20, _exec=V1ExecAction( command=rabbitmq_hostpath_liveness_probe_command))