diff --git a/internal/app/app.go b/internal/app/app.go index bde78a09..02d05a67 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -964,6 +964,12 @@ func (app *App) emulateError(pos string) bool { } func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string, clusterState map[string]*nodestate.NodeState) error { + // Limit amount of switchover retries + if switchover.MasterTransition != FailoverTransition && + app.config.SwitchoverMaxAttempts > 0 && switchover.RunCount >= app.config.SwitchoverMaxAttempts { + return fmt.Errorf("switchover failed %d times, giving up after reaching switchover_max_attempts (%d)", + switchover.RunCount, app.config.SwitchoverMaxAttempts) + } if switchover.RunCount > 0 { return nil } diff --git a/internal/config/config.go b/internal/config/config.go index 344a6fda..5b56d601 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -108,6 +108,7 @@ type Config struct { ForceSwitchover bool `config:"force_switchover" yaml:"force_switchover"` // TODO: Remove when we will be sure it's right way to do switchover ReplicationConvergenceTimeoutSwitchover time.Duration `config:"replication_convergence_timeout_switchover" yaml:"replication_convergence_timeout_switchover"` SwitchoverTimeout time.Duration `config:"switchover_timeout" yaml:"switchover_timeout"` + SwitchoverMaxAttempts int `config:"switchover_max_attempts" yaml:"switchover_max_attempts"` DSNSettings string `config:"dsn_settings" yaml:"dsn_settings"` OptimizationConfig OptimizationConfig `config:"optimization_config" yaml:"optimization_config"` } @@ -210,6 +211,7 @@ func DefaultConfig() (Config, error) { ManagerSwitchover: false, ForceSwitchover: false, SwitchoverTimeout: 30 * time.Minute, + SwitchoverMaxAttempts: 60, ReplicationConvergenceTimeoutSwitchover: 300 * time.Second, DSNSettings: "?autocommit=1&sql_log_off=1", OptimizationConfig: OptimizationConfig{ diff --git a/tests/features/switchover_from.84.feature b/tests/features/switchover_from.84.feature index 63f4ee6b..d643b568 100644 --- a/tests/features/switchover_from.84.feature +++ b/tests/features/switchover_from.84.feature @@ -43,6 +43,7 @@ Feature: manual switchover from old master Given cluster environment is """ FORCE_SWITCHOVER= + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000 """ Given cluster is up and running Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds @@ -84,6 +85,49 @@ Feature: manual switchover from old master | true | | false | + Scenario: switchover gives up and releases the master after reaching max attempts + Given cluster environment is + """ + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60 + """ + And cluster is up and running + Then mysql host "mysql1" should be master + And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds + """ + ["mysql1","mysql2","mysql3"] + """ + When mysql on host "mysql3" is killed + And mysql on host "mysql2" is killed + # pre-set run_count so the switchover is already approved and enters the retry loop; + # with both replicas dead it can never reach quorum and keeps failing + And I set zookeeper node "/test/switch" to + """ + { + "from": "mysql1", + "to": "", + "cause": "manual", + "initiated_by": "mysql1", + "run_count": 1, + "master_transition": "switchover" + } + """ + # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover + Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds + """ + { + "from": "mysql1", + "master_transition": "switchover", + "result": { + "ok": false, + "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*" + } + } + """ + And zookeeper node "/test/switch" should not exist + # the old master must be released from read-only instead of being held there forever + And mysql host "mysql1" should be master + And mysql host "mysql1" should become writable within "30" seconds + Scenario Outline: switchover from works on healthy cluster Given cluster environment is """ diff --git a/tests/features/switchover_from.feature b/tests/features/switchover_from.feature index a35814fa..294813be 100644 --- a/tests/features/switchover_from.feature +++ b/tests/features/switchover_from.feature @@ -43,6 +43,7 @@ Feature: manual switchover from old master Given cluster environment is """ FORCE_SWITCHOVER= + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000 """ Given cluster is up and running Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds @@ -84,6 +85,49 @@ Feature: manual switchover from old master | true | | false | + Scenario: switchover gives up and releases the master after reaching max attempts + Given cluster environment is + """ + MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60 + """ + And cluster is up and running + Then mysql host "mysql1" should be master + And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds + """ + ["mysql1","mysql2","mysql3"] + """ + When mysql on host "mysql3" is killed + And mysql on host "mysql2" is killed + # pre-set run_count so the switchover is already approved and enters the retry loop; + # with both replicas dead it can never reach quorum and keeps failing + And I set zookeeper node "/test/switch" to + """ + { + "from": "mysql1", + "to": "", + "cause": "manual", + "initiated_by": "mysql1", + "run_count": 1, + "master_transition": "switchover" + } + """ + # after switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover + Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds + """ + { + "from": "mysql1", + "master_transition": "switchover", + "result": { + "ok": false, + "error": "REGEXP:.*giving up after reaching switchover_max_attempts.*" + } + } + """ + And zookeeper node "/test/switch" should not exist + # the old master must be released from read-only instead of being held there forever + And mysql host "mysql1" should be master + And mysql host "mysql1" should become writable within "30" seconds + Scenario Outline: switchover from works on healthy cluster Given cluster environment is """ diff --git a/tests/images/docker-compose.yaml b/tests/images/docker-compose.yaml index 37999998..ff796bb5 100644 --- a/tests/images/docker-compose.yaml +++ b/tests/images/docker-compose.yaml @@ -106,6 +106,7 @@ services: LOW_REPLICATION_MARK: RESETUP_HOST_LAG: OFFLINE_MODE_MAX_OFFLINE_PCT: + MYSYNC_SWITCHOVER_MAX_ATTEMPTS: healthcheck: test: "mysql --user=admin --password=admin_pwd -e 'SELECT 1'" start_period: 30s @@ -161,6 +162,7 @@ services: LOW_REPLICATION_MARK: RESETUP_HOST_LAG: OFFLINE_MODE_MAX_OFFLINE_PCT: + MYSYNC_SWITCHOVER_MAX_ATTEMPTS: depends_on: mysql1: condition: service_healthy @@ -210,6 +212,7 @@ services: LOW_REPLICATION_MARK: RESETUP_HOST_LAG: OFFLINE_MODE_MAX_OFFLINE_PCT: + MYSYNC_SWITCHOVER_MAX_ATTEMPTS: depends_on: mysql1: condition: service_healthy diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml index f0d83d16..950a8ecc 100644 --- a/tests/images/mysql/mysync.yaml +++ b/tests/images/mysql/mysync.yaml @@ -66,6 +66,7 @@ repl_mon: ${REPL_MON:-false} force_switchover: ${FORCE_SWITCHOVER:-false} manager_switchover: ${MANAGER_SWITCHOVER:-true} replication_convergence_timeout_switchover: 300s +switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-60} manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s} resetup_host_lag: ${RESETUP_HOST_LAG:-30000s} manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}