From 057e5dfdddfd21598cfef92d3d3a9d29c59f9af6 Mon Sep 17 00:00:00 2001
From: Alexander Morozov <dodomorozov@10.211.89.18-vpn.dhcp.yndx.net>
Date: Fri, 5 Jun 2026 21:05:55 +0300
Subject: [PATCH 1/2] feat: limit switchover attempts

---
 internal/app/app.go                       |  6 ++++
 internal/config/config.go                 |  2 ++
 tests/features/switchover_from.84.feature | 44 +++++++++++++++++++++++
 tests/features/switchover_from.feature    | 44 +++++++++++++++++++++++
 tests/images/docker-compose.yaml          |  3 ++
 tests/images/mysql/mysync.yaml            |  1 +
 6 files changed, 100 insertions(+)

diff --git a/internal/app/app.go b/internal/app/app.go
index bde78a09..02d05a67 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -964,6 +964,12 @@ func (app *App) emulateError(pos string) bool {
 }
 
 func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string, clusterState map[string]*nodestate.NodeState) error {
+	// Limit the number of switchover attempts; failover transitions are exempt from the limit
+	if switchover.MasterTransition != FailoverTransition &&
+		app.config.SwitchoverMaxAttempts > 0 && switchover.RunCount >= app.config.SwitchoverMaxAttempts {
+		return fmt.Errorf("switchover failed %d times, giving up after reaching switchover_max_attempts (%d)",
+			switchover.RunCount, app.config.SwitchoverMaxAttempts)
+	}
 	if switchover.RunCount > 0 {
 		return nil
 	}
diff --git a/internal/config/config.go b/internal/config/config.go
index 344a6fda..12aed4a8 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -108,6 +108,7 @@ type Config struct {
 	ForceSwitchover                         bool                         `config:"force_switchover" yaml:"force_switchover"` // TODO: Remove when we will be sure it's right way to do switchover
 	ReplicationConvergenceTimeoutSwitchover time.Duration                `config:"replication_convergence_timeout_switchover" yaml:"replication_convergence_timeout_switchover"`
 	SwitchoverTimeout                       time.Duration                `config:"switchover_timeout" yaml:"switchover_timeout"`
+	SwitchoverMaxAttempts                   int                          `config:"switchover_max_attempts" yaml:"switchover_max_attempts"`
 	DSNSettings                             string                       `config:"dsn_settings" yaml:"dsn_settings"`
 	OptimizationConfig                      OptimizationConfig           `config:"optimization_config" yaml:"optimization_config"`
 }
@@ -210,6 +211,7 @@ func DefaultConfig() (Config, error) {
 		ManagerSwitchover:                       false,
 		ForceSwitchover:                         false,
 		SwitchoverTimeout:                       30 * time.Minute,
+		SwitchoverMaxAttempts:                   3,
 		ReplicationConvergenceTimeoutSwitchover: 300 * time.Second,
 		DSNSettings:                             "?autocommit=1&sql_log_off=1",
 		OptimizationConfig: OptimizationConfig{
diff --git a/tests/features/switchover_from.84.feature b/tests/features/switchover_from.84.feature
index 63f4ee6b..252ef532 100644
--- a/tests/features/switchover_from.84.feature
+++ b/tests/features/switchover_from.84.feature
@@ -43,6 +43,7 @@ Feature: manual switchover from old master
     Given cluster environment is
       """
       FORCE_SWITCHOVER=<force_switchover>
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000
       """
     Given cluster is up and running
     Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
@@ -84,6 +85,49 @@ Feature: manual switchover from old master
       | true              |
       | false             |
 
+  Scenario: switchover gives up and releases the master after reaching max attempts
+    Given cluster environment is
+      """
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3
+      """
+    And cluster is up and running
+    Then mysql host "mysql1" should be master
+    And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
+      """
+      ["mysql1","mysql2","mysql3"]
+      """
+    When mysql on host "mysql3" is killed
+    And mysql on host "mysql2" is killed
+    # pre-set run_count so the switchover is already approved and enters the retry loop;
+    # with both replicas dead it can never reach quorum and keeps failing
+    And I set zookeeper node "/test/switch" to
+      """
+      {
+          "from": "mysql1",
+          "to": "",
+          "cause": "manual",
+          "initiated_by": "mysql1",
+          "run_count": 1,
+          "master_transition": "switchover"
+      }
+      """
+    # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover
+    Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds
+      """
+      {
+          "from": "mysql1",
+          "master_transition": "switchover",
+          "result": {
+              "ok": false,
+              "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*"
+          }
+      }
+      """
+    And zookeeper node "/test/switch" should not exist
+    # the old master must be released from read-only instead of being held there forever
+    And mysql host "mysql1" should be master
+    And mysql host "mysql1" should become writable within "30" seconds
+
   Scenario Outline: switchover from works on healthy cluster
     Given cluster environment is
       """
diff --git a/tests/features/switchover_from.feature b/tests/features/switchover_from.feature
index a35814fa..93529b32 100644
--- a/tests/features/switchover_from.feature
+++ b/tests/features/switchover_from.feature
@@ -43,6 +43,7 @@ Feature: manual switchover from old master
     Given cluster environment is
       """
       FORCE_SWITCHOVER=<force_switchover>
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000
       """
     Given cluster is up and running
     Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
@@ -84,6 +85,49 @@ Feature: manual switchover from old master
       | true              |
       | false             |
 
+  Scenario: switchover gives up and releases the master after reaching max attempts
+    Given cluster environment is
+      """
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3
+      """
+    And cluster is up and running
+    Then mysql host "mysql1" should be master
+    And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
+      """
+      ["mysql1","mysql2","mysql3"]
+      """
+    When mysql on host "mysql3" is killed
+    And mysql on host "mysql2" is killed
+    # pre-set run_count so the switchover is already approved and enters the retry loop;
+    # with both replicas dead it can never reach quorum and keeps failing
+    And I set zookeeper node "/test/switch" to
+      """
+      {
+          "from": "mysql1",
+          "to": "",
+          "cause": "manual",
+          "initiated_by": "mysql1",
+          "run_count": 1,
+          "master_transition": "switchover"
+      }
+      """
+    # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover
+    Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds
+      """
+      {
+          "from": "mysql1",
+          "master_transition": "switchover",
+          "result": {
+              "ok": false,
+              "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*"
+          }
+      }
+      """
+    And zookeeper node "/test/switch" should not exist
+    # the old master must be released from read-only instead of being held there forever
+    And mysql host "mysql1" should be master
+    And mysql host "mysql1" should become writable within "30" seconds
+
   Scenario Outline: switchover from works on healthy cluster
     Given cluster environment is
       """
diff --git a/tests/images/docker-compose.yaml b/tests/images/docker-compose.yaml
index 37999998..ff796bb5 100644
--- a/tests/images/docker-compose.yaml
+++ b/tests/images/docker-compose.yaml
@@ -106,6 +106,7 @@ services:
       LOW_REPLICATION_MARK:
       RESETUP_HOST_LAG:
       OFFLINE_MODE_MAX_OFFLINE_PCT:
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
     healthcheck:
       test: "mysql --user=admin --password=admin_pwd -e 'SELECT 1'"
       start_period: 30s
@@ -161,6 +162,7 @@ services:
       LOW_REPLICATION_MARK:
       RESETUP_HOST_LAG:
       OFFLINE_MODE_MAX_OFFLINE_PCT:
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
     depends_on:
       mysql1:
         condition: service_healthy
@@ -210,6 +212,7 @@ services:
       LOW_REPLICATION_MARK:
       RESETUP_HOST_LAG:
       OFFLINE_MODE_MAX_OFFLINE_PCT:
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
     depends_on:
       mysql1:
         condition: service_healthy
diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml
index f0d83d16..f83c9a2f 100644
--- a/tests/images/mysql/mysync.yaml
+++ b/tests/images/mysql/mysync.yaml
@@ -66,6 +66,7 @@ repl_mon: ${REPL_MON:-false}
 force_switchover: ${FORCE_SWITCHOVER:-false}
 manager_switchover: ${MANAGER_SWITCHOVER:-true}
 replication_convergence_timeout_switchover: 300s
+switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-3}
 manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s}
 resetup_host_lag: ${RESETUP_HOST_LAG:-30000s}
 manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}

From b39601371f3b44e33dc77e189bfdec3553280809 Mon Sep 17 00:00:00 2001
From: Alexander Morozov <dodomorozov@10.217.129.116-red3.dhcp.yndx.net>
Date: Mon, 8 Jun 2026 19:40:23 +0300
Subject: [PATCH 2/2] change limits: raise default switchover_max_attempts from 3 to 60

---
 internal/config/config.go                 | 2 +-
 tests/features/switchover_from.84.feature | 4 ++--
 tests/features/switchover_from.feature    | 4 ++--
 tests/images/mysql/mysync.yaml            | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/internal/config/config.go b/internal/config/config.go
index 12aed4a8..5b56d601 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -211,7 +211,7 @@ func DefaultConfig() (Config, error) {
 		ManagerSwitchover:                       false,
 		ForceSwitchover:                         false,
 		SwitchoverTimeout:                       30 * time.Minute,
-		SwitchoverMaxAttempts:                   3,
+		SwitchoverMaxAttempts:                   60,
 		ReplicationConvergenceTimeoutSwitchover: 300 * time.Second,
 		DSNSettings:                             "?autocommit=1&sql_log_off=1",
 		OptimizationConfig: OptimizationConfig{
diff --git a/tests/features/switchover_from.84.feature b/tests/features/switchover_from.84.feature
index 252ef532..d643b568 100644
--- a/tests/features/switchover_from.84.feature
+++ b/tests/features/switchover_from.84.feature
@@ -88,7 +88,7 @@ Feature: manual switchover from old master
   Scenario: switchover gives up and releases the master after reaching max attempts
     Given cluster environment is
       """
-      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60
       """
     And cluster is up and running
     Then mysql host "mysql1" should be master
@@ -112,7 +112,7 @@ Feature: manual switchover from old master
       }
       """
     # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover
-    Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds
+    Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds
       """
       {
           "from": "mysql1",
diff --git a/tests/features/switchover_from.feature b/tests/features/switchover_from.feature
index 93529b32..294813be 100644
--- a/tests/features/switchover_from.feature
+++ b/tests/features/switchover_from.feature
@@ -88,7 +88,7 @@ Feature: manual switchover from old master
   Scenario: switchover gives up and releases the master after reaching max attempts
     Given cluster environment is
       """
-      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60
       """
     And cluster is up and running
     Then mysql host "mysql1" should be master
@@ -112,7 +112,7 @@ Feature: manual switchover from old master
       }
       """
     # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover
-    Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds
+    Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds
       """
       {
           "from": "mysql1",
diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml
index f83c9a2f..950a8ecc 100644
--- a/tests/images/mysql/mysync.yaml
+++ b/tests/images/mysql/mysync.yaml
@@ -66,7 +66,7 @@ repl_mon: ${REPL_MON:-false}
 force_switchover: ${FORCE_SWITCHOVER:-false}
 manager_switchover: ${MANAGER_SWITCHOVER:-true}
 replication_convergence_timeout_switchover: 300s
-switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-3}
+switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-60}
 manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s}
 resetup_host_lag: ${RESETUP_HOST_LAG:-30000s}
 manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}