Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions operator/charts/helm/rabbitmq/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ spec:
- name: ENABLE_SHOVEL_MONITORING
value: "{{ .Values.operator.enableOperatorShovelMonitoring }}"
{{- end }}
{{- if .Values.operator.enableClusterRestart }}
- name: ENABLE_CLUSTER_RESTART
value: "{{ .Values.operator.enableClusterRestart }}"
{{- end }}
{{- if .Values.operator.clusterRestartThreshold }}
- name: CLUSTER_RESTART_THRESHOLD
value: "{{ .Values.operator.clusterRestartThreshold }}"
{{- end }}
- name: API_GROUP
value: {{ .Values.operator.apiGroup | quote }}
{{- if or (eq (include "backupDaemon.enableTls" .) "true") (and (eq (include "rabbitmq.enableTls" .) "true") (include "rabbitmq.tlsSecretName" .)) }}
Expand Down
2 changes: 2 additions & 0 deletions operator/charts/helm/rabbitmq/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ operator:
apiGroup: "netcracker.com"
pullPolicy: "Always"
enableOperatorShovelMonitoring: true
enableClusterRestart: false
clusterRestartThreshold: 10800
restartScheduler:
dockerImage: ghcr.io/netcracker/qubership-docker-kubectl:main
enabled: true
Expand Down
26 changes: 26 additions & 0 deletions operator/src/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
logger.setLevel(LOGLEVEL)
logger.info("loglevel is set to " + str(LOGLEVEL))

CLUSTER_DOWN_SINCE = 0

class FakeKubeResponse:
def __init__(self, obj):
Expand Down Expand Up @@ -1620,6 +1621,31 @@ def shovel_monitoring(spec,retry, **kwargs):
logger.info("All shovels are running properly after restart.")


@kopf.timer(api_group, cr_version, 'rabbitmqservices', interval=900, initial_delay=900)
def cluster_monitoring(spec, **kwargs):
global CLUSTER_DOWN_SINCE
enabled = False
try:
enabled = bool(util.strtobool(os.getenv("ENABLE_CLUSTER_RESTART", "false")))
except Exception:
enabled = False
if enabled:
kub_helper = KubernetesHelper(spec)
threshold_seconds = int(os.getenv("CLUSTER_RESTART_THRESHOLD", "10800"))
try:
cluster_status = kub_helper.check_cluster_state()
if cluster_status == "error":
current_time = int(time.time())
if CLUSTER_DOWN_SINCE == 0:
CLUSTER_DOWN_SINCE = current_time
outage_duration = current_time - CLUSTER_DOWN_SINCE
logger.warning(f"RabbitMQ cluster unavailable for {outage_duration} seconds")
if outage_duration >= threshold_seconds:
logger.error("RabbitMQ cluster exceeded outage threshold. Restarting cluster.")
kub_helper.reboot_pods()
except Exception as ex:
logger.warning(f"RabbitMQ cluster monitoring failed: {ex}")

@kopf.on.create(api_group, cr_version, 'rabbitmqservices')
def on_create(body, meta, spec, status, **kwargs):
kub_helper = KubernetesHelper(spec)
Expand Down
23 changes: 16 additions & 7 deletions operator/src/rabbitconstants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
# TODO $(get_user):$(get_password) was changed to guest:guest
"""

rabbitmq-diagnostics -q check_running && rabbitmq-diagnostics -q check_local_alarms || exit 1
rabbitmq-diagnostics -q check_running && \
rabbitmq-diagnostics -q check_local_alarms || exit 1

if [[ "$HOSTNAME" != "rmqlocal-0-0" ]] && [[ ! -f /var/lib/rabbitmq/started_at_least_once ]] ; then
FD_OUTPUT="/proc/1/fd/1"
Expand All @@ -45,7 +46,13 @@
"""
]

rabbitmq_hostpath_readiness_probe_command = ['rabbitmq-diagnostics', 'ping', '-q']
rabbitmq_hostpath_readiness_probe_command = ['bin/bash', '-c',
"""
rabbitmq-diagnostics -q check_running && \
rabbitmq-diagnostics -q check_local_alarms && \
rabbitmq-diagnostics -q check_virtual_hosts
"""
]

rabbitmq_storageclass_liveness_probe_command = ['bin/bash', '-c', """
if [ -f /var/lib/rabbitmq/started_at_least_once ]; then
Expand All @@ -56,7 +63,7 @@
echo "http-liveness probe failed"
exit 1
fi
elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 1 ; then
elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 30 ; then
echo "awaiting nodes succeeded"
else echo "awaiting nodes liveness probe failed"
exit 1
Expand All @@ -67,8 +74,10 @@
rabbitmq_storageclass_readiness_probe_command = ['bin/bash', '-c', """
if [ -f /var/lib/rabbitmq/started_at_least_once ]; then
echo "executing rabbitmq-diagnostics ping -q"
rabbitmq-diagnostics ping -q;
elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 1 ; then
rabbitmq-diagnostics -q check_running && \
rabbitmq-diagnostics -q check_local_alarms && \
rabbitmq-diagnostics -q check_virtual_hosts
elif rabbitmqctl await_online_nodes $(( ( $(echo -n ${MY_POD_NAME##*-}) + 1 ) / 2 + 1 )) -t 30 ; then
echo "awaiting nodes succeeded"
touch /var/lib/rabbitmq/started_at_least_once
else
Expand All @@ -94,7 +103,7 @@
storageclass_liveness_probe = V1Probe(failure_threshold=30,
initial_delay_seconds=10,
period_seconds=30, success_threshold=1,
timeout_seconds=15,
timeout_seconds=20,
_exec=V1ExecAction(
command=rabbitmq_storageclass_liveness_probe_command))

Expand All @@ -108,6 +117,6 @@
hostpath_liveness_probe = V1Probe(failure_threshold=30,
initial_delay_seconds=10,
period_seconds=30, success_threshold=1,
timeout_seconds=15,
timeout_seconds=20,
_exec=V1ExecAction(
command=rabbitmq_hostpath_liveness_probe_command))
Loading