From 057e5dfdddfd21598cfef92d3d3a9d29c59f9af6 Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Fri, 5 Jun 2026 21:05:55 +0300 Subject: [PATCH 1/2] feat: limit switchover attempts --- internal/app/app.go | 6 ++++ internal/config/config.go | 2 ++ tests/features/switchover_from.84.feature | 44 +++++++++++++++++++++++ tests/features/switchover_from.feature | 44 +++++++++++++++++++++++ tests/images/docker-compose.yaml | 3 ++ tests/images/mysql/mysync.yaml | 1 + 6 files changed, 100 insertions(+) diff --git a/internal/app/app.go b/internal/app/app.go index bde78a09..02d05a67 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -964,6 +964,12 @@ func (app *App) emulateError(pos string) bool { } func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string, clusterState map[string]*nodestate.NodeState) error { + // Limit amount of switchover retries + if switchover.MasterTransition != FailoverTransition && + app.config.SwitchoverMaxAttempts > 0 && switchover.RunCount >= app.config.SwitchoverMaxAttempts { + return fmt.Errorf("switchover failed %d times, giving up after reaching switchover_max_attempts (%d)", + switchover.RunCount, app.config.SwitchoverMaxAttempts) + } if switchover.RunCount > 0 { return nil } diff --git a/internal/config/config.go b/internal/config/config.go index 344a6fda..12aed4a8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -108,6 +108,7 @@ type Config struct { ForceSwitchover bool `config:"force_switchover" yaml:"force_switchover"` // TODO: Remove when we will be sure it's right way to do switchover ReplicationConvergenceTimeoutSwitchover time.Duration `config:"replication_convergence_timeout_switchover" yaml:"replication_convergence_timeout_switchover"` SwitchoverTimeout time.Duration `config:"switchover_timeout" yaml:"switchover_timeout"` + SwitchoverMaxAttempts int `config:"switchover_max_attempts" yaml:"switchover_max_attempts"` DSNSettings string `config:"dsn_settings" 
yaml:"dsn_settings"` OptimizationConfig OptimizationConfig `config:"optimization_config" yaml:"optimization_config"` } @@ -210,6 +211,7 @@ func DefaultConfig() (Config, error) { ManagerSwitchover: false, ForceSwitchover: false, SwitchoverTimeout: 30 * time.Minute, + SwitchoverMaxAttempts: 3, ReplicationConvergenceTimeoutSwitchover: 300 * time.Second, DSNSettings: "?autocommit=1&sql_log_off=1", OptimizationConfig: OptimizationConfig{ diff --git a/tests/features/switchover_from.84.feature b/tests/features/switchover_from.84.feature index 63f4ee6b..252ef532 100644 --- a/tests/features/switchover_from.84.feature +++ b/tests/features/switchover_from.84.feature @@ -43,6 +43,7 @@ Feature: manual switchover from old master Given cluster environment is """ FORCE_SWITCHOVER= + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000 """ Given cluster is up and running Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds @@ -84,6 +85,49 @@ Feature: manual switchover from old master | true | | false | + Scenario: switchover gives up and releases the master after reaching max attempts + Given cluster environment is + """ + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3 + """ + And cluster is up and running + Then mysql host "mysql1" should be master + And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds + """ + ["mysql1","mysql2","mysql3"] + """ + When mysql on host "mysql3" is killed + And mysql on host "mysql2" is killed + # pre-set run_count so the switchover is already approved and enters the retry loop; + # with both replicas dead it can never reach quorum and keeps failing + And I set zookeeper node "/test/switch" to + """ + { + "from": "mysql1", + "to": "", + "cause": "manual", + "initiated_by": "mysql1", + "run_count": 1, + "master_transition": "switchover" + } + """ + # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover + Then zookeeper node "/test/last_rejected_switch" should match json 
within "60" seconds + """ + { + "from": "mysql1", + "master_transition": "switchover", + "result": { + "ok": false, + "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*" + } + } + """ + And zookeeper node "/test/switch" should not exist + # the old master must be released from read-only instead of being held there forever + And mysql host "mysql1" should be master + And mysql host "mysql1" should become writable within "30" seconds + Scenario Outline: switchover from works on healthy cluster Given cluster environment is """ diff --git a/tests/features/switchover_from.feature b/tests/features/switchover_from.feature index a35814fa..93529b32 100644 --- a/tests/features/switchover_from.feature +++ b/tests/features/switchover_from.feature @@ -43,6 +43,7 @@ Feature: manual switchover from old master Given cluster environment is """ FORCE_SWITCHOVER= + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000 """ Given cluster is up and running Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds @@ -84,6 +85,49 @@ Feature: manual switchover from old master | true | | false | + Scenario: switchover gives up and releases the master after reaching max attempts + Given cluster environment is + """ + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3 + """ + And cluster is up and running + Then mysql host "mysql1" should be master + And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds + """ + ["mysql1","mysql2","mysql3"] + """ + When mysql on host "mysql3" is killed + And mysql on host "mysql2" is killed + # pre-set run_count so the switchover is already approved and enters the retry loop; + # with both replicas dead it can never reach quorum and keeps failing + And I set zookeeper node "/test/switch" to + """ + { + "from": "mysql1", + "to": "", + "cause": "manual", + "initiated_by": "mysql1", + "run_count": 1, + "master_transition": "switchover" + } + """ + # after switchover_max_attempts unsuccessful attempts mysync stops 
retrying and rejects the switchover + Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds + """ + { + "from": "mysql1", + "master_transition": "switchover", + "result": { + "ok": false, + "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*" + } + } + """ + And zookeeper node "/test/switch" should not exist + # the old master must be released from read-only instead of being held there forever + And mysql host "mysql1" should be master + And mysql host "mysql1" should become writable within "30" seconds + Scenario Outline: switchover from works on healthy cluster Given cluster environment is """ diff --git a/tests/images/docker-compose.yaml b/tests/images/docker-compose.yaml index 37999998..ff796bb5 100644 --- a/tests/images/docker-compose.yaml +++ b/tests/images/docker-compose.yaml @@ -106,6 +106,7 @@ services: LOW_REPLICATION_MARK: RESETUP_HOST_LAG: OFFLINE_MODE_MAX_OFFLINE_PCT: + MYSYNC_SWITCHOVER_MAX_ATTEMPTS: healthcheck: test: "mysql --user=admin --password=admin_pwd -e 'SELECT 1'" start_period: 30s @@ -161,6 +162,7 @@ services: LOW_REPLICATION_MARK: RESETUP_HOST_LAG: OFFLINE_MODE_MAX_OFFLINE_PCT: + MYSYNC_SWITCHOVER_MAX_ATTEMPTS: depends_on: mysql1: condition: service_healthy @@ -210,6 +212,7 @@ services: LOW_REPLICATION_MARK: RESETUP_HOST_LAG: OFFLINE_MODE_MAX_OFFLINE_PCT: + MYSYNC_SWITCHOVER_MAX_ATTEMPTS: depends_on: mysql1: condition: service_healthy diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml index f0d83d16..f83c9a2f 100644 --- a/tests/images/mysql/mysync.yaml +++ b/tests/images/mysql/mysync.yaml @@ -66,6 +66,7 @@ repl_mon: ${REPL_MON:-false} force_switchover: ${FORCE_SWITCHOVER:-false} manager_switchover: ${MANAGER_SWITCHOVER:-true} replication_convergence_timeout_switchover: 300s +switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-3} manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s} resetup_host_lag: 
${RESETUP_HOST_LAG:-30000s} manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s} From b39601371f3b44e33dc77e189bfdec3553280809 Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Mon, 8 Jun 2026 19:40:23 +0300 Subject: [PATCH 2/2] change limits --- internal/config/config.go | 2 +- tests/features/switchover_from.84.feature | 4 ++-- tests/features/switchover_from.feature | 4 ++-- tests/images/mysql/mysync.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 12aed4a8..5b56d601 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -211,7 +211,7 @@ func DefaultConfig() (Config, error) { ManagerSwitchover: false, ForceSwitchover: false, SwitchoverTimeout: 30 * time.Minute, - SwitchoverMaxAttempts: 3, + SwitchoverMaxAttempts: 60, ReplicationConvergenceTimeoutSwitchover: 300 * time.Second, DSNSettings: "?autocommit=1&sql_log_off=1", OptimizationConfig: OptimizationConfig{ diff --git a/tests/features/switchover_from.84.feature b/tests/features/switchover_from.84.feature index 252ef532..d643b568 100644 --- a/tests/features/switchover_from.84.feature +++ b/tests/features/switchover_from.84.feature @@ -88,7 +88,7 @@ Feature: manual switchover from old master Scenario: switchover gives up and releases the master after reaching max attempts Given cluster environment is """ - MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3 + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60 """ And cluster is up and running Then mysql host "mysql1" should be master @@ -112,7 +112,7 @@ Feature: manual switchover from old master } """ # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover - Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds + Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds """ { "from": "mysql1", diff --git 
a/tests/features/switchover_from.feature b/tests/features/switchover_from.feature index 93529b32..294813be 100644 --- a/tests/features/switchover_from.feature +++ b/tests/features/switchover_from.feature @@ -88,7 +88,7 @@ Feature: manual switchover from old master Scenario: switchover gives up and releases the master after reaching max attempts Given cluster environment is """ - MYSYNC_SWITCHOVER_MAX_ATTEMPTS=3 + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60 """ And cluster is up and running Then mysql host "mysql1" should be master @@ -112,7 +112,7 @@ Feature: manual switchover from old master } """ # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover - Then zookeeper node "/test/last_rejected_switch" should match json within "60" seconds + Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds """ { "from": "mysql1", diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml index f83c9a2f..950a8ecc 100644 --- a/tests/images/mysql/mysync.yaml +++ b/tests/images/mysql/mysync.yaml @@ -66,7 +66,7 @@ repl_mon: ${REPL_MON:-false} force_switchover: ${FORCE_SWITCHOVER:-false} manager_switchover: ${MANAGER_SWITCHOVER:-true} replication_convergence_timeout_switchover: 300s -switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-3} +switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-60} manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s} resetup_host_lag: ${RESETUP_HOST_LAG:-30000s} manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}