yandex · dodokek · Jun 9, 2026 · Jun 5, 2026 · Jun 8, 2026
diff --git a/internal/app/app.go b/internal/app/app.go
@@ -964,6 +964,12 @@ func (app *App) emulateError(pos string) bool {
 }
 
 func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string, clusterState map[string]*nodestate.NodeState) error {
+	// Limit the number of switchover retries; failovers are exempt, and a non-positive SwitchoverMaxAttempts disables the limit.
+	if switchover.MasterTransition != FailoverTransition &&
+		app.config.SwitchoverMaxAttempts > 0 && switchover.RunCount >= app.config.SwitchoverMaxAttempts {
+		return fmt.Errorf("switchover failed %d times, giving up after reaching switchover_max_attempts (%d)",
+			switchover.RunCount, app.config.SwitchoverMaxAttempts)
+	}
 	if switchover.RunCount > 0 {
 		return nil
 	}

diff --git a/internal/config/config.go b/internal/config/config.go
@@ -108,6 +108,7 @@ type Config struct {
 	ForceSwitchover                         bool                         `config:"force_switchover" yaml:"force_switchover"` // TODO: Remove when we will be sure it's right way to do switchover
 	ReplicationConvergenceTimeoutSwitchover time.Duration                `config:"replication_convergence_timeout_switchover" yaml:"replication_convergence_timeout_switchover"`
 	SwitchoverTimeout                       time.Duration                `config:"switchover_timeout" yaml:"switchover_timeout"`
+	SwitchoverMaxAttempts                   int                          `config:"switchover_max_attempts" yaml:"switchover_max_attempts"`
 	DSNSettings                             string                       `config:"dsn_settings" yaml:"dsn_settings"`
 	OptimizationConfig                      OptimizationConfig           `config:"optimization_config" yaml:"optimization_config"`
 }
@@ -210,6 +211,7 @@ func DefaultConfig() (Config, error) {
 		ManagerSwitchover:                       false,
 		ForceSwitchover:                         false,
 		SwitchoverTimeout:                       30 * time.Minute,
+		SwitchoverMaxAttempts:                   60,
 		ReplicationConvergenceTimeoutSwitchover: 300 * time.Second,
 		DSNSettings:                             "?autocommit=1&sql_log_off=1",
 		OptimizationConfig: OptimizationConfig{

diff --git a/tests/features/switchover_from.84.feature b/tests/features/switchover_from.84.feature
@@ -43,6 +43,7 @@ Feature: manual switchover from old master
     Given cluster environment is
       """
       FORCE_SWITCHOVER=<force_switchover>
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000
       """
     Given cluster is up and running
     Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
@@ -84,6 +85,49 @@ Feature: manual switchover from old master
       | true              |
       | false             |
 
+  Scenario: switchover gives up and releases the master after reaching max attempts
+    Given cluster environment is
+      """
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60
+      """
+    And cluster is up and running
+    Then mysql host "mysql1" should be master
+    And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
+      """
+      ["mysql1","mysql2","mysql3"]
+      """
+    When mysql on host "mysql3" is killed
+    And mysql on host "mysql2" is killed
+    # pre-set run_count so the switchover is already approved and enters the retry loop;
+    # with both replicas dead it can never reach quorum and keeps failing
+    And I set zookeeper node "/test/switch" to
+      """
+      {
+          "from": "mysql1",
+          "to": "",
+          "cause": "manual",
+          "initiated_by": "mysql1",
+          "run_count": 1,
+          "master_transition": "switchover"
+      }
+      """
+    # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover
+    Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds
+      """
+      {
+          "from": "mysql1",
+          "master_transition": "switchover",
+          "result": {
+              "ok": false,
+              "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*"
+          }
+      }
+      """
+    And zookeeper node "/test/switch" should not exist
+    # the old master must be released from read-only instead of being held there forever
+    And mysql host "mysql1" should be master
+    And mysql host "mysql1" should become writable within "30" seconds
+
   Scenario Outline: switchover from works on healthy cluster
     Given cluster environment is
       """

diff --git a/tests/features/switchover_from.feature b/tests/features/switchover_from.feature
@@ -43,6 +43,7 @@ Feature: manual switchover from old master
     Given cluster environment is
       """
       FORCE_SWITCHOVER=<force_switchover>
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000
       """
     Given cluster is up and running
     Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
@@ -84,6 +85,49 @@ Feature: manual switchover from old master
       | true              |
       | false             |
 
+  Scenario: switchover gives up and releases the master after reaching max attempts
+    Given cluster environment is
+      """
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60
+      """
+    And cluster is up and running
+    Then mysql host "mysql1" should be master
+    And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
+      """
+      ["mysql1","mysql2","mysql3"]
+      """
+    When mysql on host "mysql3" is killed
+    And mysql on host "mysql2" is killed
+    # pre-set run_count so the switchover is already approved and enters the retry loop;
+    # with both replicas dead it can never reach quorum and keeps failing
+    And I set zookeeper node "/test/switch" to
+      """
+      {
+          "from": "mysql1",
+          "to": "",
+          "cause": "manual",
+          "initiated_by": "mysql1",
+          "run_count": 1,
+          "master_transition": "switchover"
+      }
+      """
+    # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover
+    Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds
+      """
+      {
+          "from": "mysql1",
+          "master_transition": "switchover",
+          "result": {
+              "ok": false,
+              "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*"
+          }
+      }
+      """
+    And zookeeper node "/test/switch" should not exist
+    # the old master must be released from read-only instead of being held there forever
+    And mysql host "mysql1" should be master
+    And mysql host "mysql1" should become writable within "30" seconds
+
   Scenario Outline: switchover from works on healthy cluster
     Given cluster environment is
       """

diff --git a/tests/images/docker-compose.yaml b/tests/images/docker-compose.yaml
@@ -106,6 +106,7 @@ services:
       LOW_REPLICATION_MARK:
       RESETUP_HOST_LAG:
       OFFLINE_MODE_MAX_OFFLINE_PCT:
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
     healthcheck:
       test: "mysql --user=admin --password=admin_pwd -e 'SELECT 1'"
       start_period: 30s
@@ -161,6 +162,7 @@ services:
       LOW_REPLICATION_MARK:
       RESETUP_HOST_LAG:
       OFFLINE_MODE_MAX_OFFLINE_PCT:
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
     depends_on:
       mysql1:
         condition: service_healthy
@@ -210,6 +212,7 @@ services:
       LOW_REPLICATION_MARK:
       RESETUP_HOST_LAG:
       OFFLINE_MODE_MAX_OFFLINE_PCT:
+      MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
     depends_on:
       mysql1:
         condition: service_healthy

diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml
@@ -66,6 +66,7 @@ repl_mon: ${REPL_MON:-false}
 force_switchover: ${FORCE_SWITCHOVER:-false}
 manager_switchover: ${MANAGER_SWITCHOVER:-true}
 replication_convergence_timeout_switchover: 300s
+switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-60}
 manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s}
 resetup_host_lag: ${RESETUP_HOST_LAG:-30000s}
 manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}