Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions internal/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,12 @@ func (app *App) emulateError(pos string) bool {
}

func (app *App) approveSwitchover(switchover *Switchover, activeNodes []string, clusterState map[string]*nodestate.NodeState) error {
// Limit the number of switchover retries
if switchover.MasterTransition != FailoverTransition &&
app.config.SwitchoverMaxAttempts > 0 && switchover.RunCount >= app.config.SwitchoverMaxAttempts {
return fmt.Errorf("switchover failed %d times, giving up after reaching switchover_max_attempts (%d)",
switchover.RunCount, app.config.SwitchoverMaxAttempts)
}
if switchover.RunCount > 0 {
return nil
}
Expand Down
2 changes: 2 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ type Config struct {
ForceSwitchover bool `config:"force_switchover" yaml:"force_switchover"` // TODO: Remove when we will be sure it's right way to do switchover
ReplicationConvergenceTimeoutSwitchover time.Duration `config:"replication_convergence_timeout_switchover" yaml:"replication_convergence_timeout_switchover"`
SwitchoverTimeout time.Duration `config:"switchover_timeout" yaml:"switchover_timeout"`
SwitchoverMaxAttempts int `config:"switchover_max_attempts" yaml:"switchover_max_attempts"`
DSNSettings string `config:"dsn_settings" yaml:"dsn_settings"`
OptimizationConfig OptimizationConfig `config:"optimization_config" yaml:"optimization_config"`
}
Expand Down Expand Up @@ -210,6 +211,7 @@ func DefaultConfig() (Config, error) {
ManagerSwitchover: false,
ForceSwitchover: false,
SwitchoverTimeout: 30 * time.Minute,
SwitchoverMaxAttempts: 60,
ReplicationConvergenceTimeoutSwitchover: 300 * time.Second,
DSNSettings: "?autocommit=1&sql_log_off=1",
OptimizationConfig: OptimizationConfig{
Expand Down
44 changes: 44 additions & 0 deletions tests/features/switchover_from.84.feature
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Feature: manual switchover from old master
Given cluster environment is
"""
FORCE_SWITCHOVER=<force_switchover>
MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000
"""
Given cluster is up and running
Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
Expand Down Expand Up @@ -84,6 +85,49 @@ Feature: manual switchover from old master
| true |
| false |

Scenario: switchover gives up and releases the master after reaching max attempts
# Checks that a manual switchover which keeps failing is eventually rejected with the
# switchover_max_attempts error, that the switch node is removed, and that mysql1 is
# still the master and becomes writable again.
# The retry limit is set explicitly for this scenario.
Given cluster environment is
"""
MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60
"""
And cluster is up and running
Then mysql host "mysql1" should be master
And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
"""
["mysql1","mysql2","mysql3"]
"""
# Kill both replicas so that no switchover target is left.
When mysql on host "mysql3" is killed
And mysql on host "mysql2" is killed
# Pre-set run_count so the switchover is already approved and enters the retry loop;
# with both replicas dead it can never reach quorum and keeps failing.
And I set zookeeper node "/test/switch" to
"""
{
"from": "mysql1",
"to": "",
"cause": "manual",
"initiated_by": "mysql1",
"run_count": 1,
"master_transition": "switchover"
}
"""
# After switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover.
# NOTE(review): run_count starts at 1 and the limit is 60, so the 250-second window presumably
# has to cover all remaining failed attempts — confirm it is long enough for the retry interval.
Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds
"""
{
"from": "mysql1",
"master_transition": "switchover",
"result": {
"ok": false,
"error": "REGEXP:.*giving up after reaching switchover_max_attempts.*"
}
}
"""
And zookeeper node "/test/switch" should not exist
# The old master must be released from read-only instead of being held there forever.
And mysql host "mysql1" should be master
And mysql host "mysql1" should become writable within "30" seconds

Scenario Outline: switchover from works on healthy cluster
Given cluster environment is
"""
Expand Down
44 changes: 44 additions & 0 deletions tests/features/switchover_from.feature
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Feature: manual switchover from old master
Given cluster environment is
"""
FORCE_SWITCHOVER=<force_switchover>
MYSYNC_SWITCHOVER_MAX_ATTEMPTS=1000
"""
Given cluster is up and running
Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
Expand Down Expand Up @@ -84,6 +85,49 @@ Feature: manual switchover from old master
| true |
| false |

Scenario: switchover gives up and releases the master after reaching max attempts
# Checks that a manual switchover which keeps failing is eventually rejected with the
# switchover_max_attempts error, that the switch node is removed, and that mysql1 is
# still the master and becomes writable again.
# The retry limit is set explicitly for this scenario.
Given cluster environment is
"""
MYSYNC_SWITCHOVER_MAX_ATTEMPTS=60
"""
And cluster is up and running
Then mysql host "mysql1" should be master
And zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
"""
["mysql1","mysql2","mysql3"]
"""
# Kill both replicas so that no switchover target is left.
When mysql on host "mysql3" is killed
And mysql on host "mysql2" is killed
# Pre-set run_count so the switchover is already approved and enters the retry loop;
# with both replicas dead it can never reach quorum and keeps failing.
And I set zookeeper node "/test/switch" to
"""
{
"from": "mysql1",
"to": "",
"cause": "manual",
"initiated_by": "mysql1",
"run_count": 1,
"master_transition": "switchover"
}
"""
# After switchover_max_attempts unsuccessful attempts mysync stops retrying and rejects the switchover.
# NOTE(review): run_count starts at 1 and the limit is 60, so the 250-second window presumably
# has to cover all remaining failed attempts — confirm it is long enough for the retry interval.
Then zookeeper node "/test/last_rejected_switch" should match json within "250" seconds
"""
{
"from": "mysql1",
"master_transition": "switchover",
"result": {
"ok": false,
"error": "REGEXP:.*giving up after reaching switchover_max_attempts.*"
}
}
"""
And zookeeper node "/test/switch" should not exist
# The old master must be released from read-only instead of being held there forever.
And mysql host "mysql1" should be master
And mysql host "mysql1" should become writable within "30" seconds

Scenario Outline: switchover from works on healthy cluster
Given cluster environment is
"""
Expand Down
3 changes: 3 additions & 0 deletions tests/images/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ services:
LOW_REPLICATION_MARK:
RESETUP_HOST_LAG:
OFFLINE_MODE_MAX_OFFLINE_PCT:
MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
healthcheck:
test: "mysql --user=admin --password=admin_pwd -e 'SELECT 1'"
start_period: 30s
Expand Down Expand Up @@ -161,6 +162,7 @@ services:
LOW_REPLICATION_MARK:
RESETUP_HOST_LAG:
OFFLINE_MODE_MAX_OFFLINE_PCT:
MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
depends_on:
mysql1:
condition: service_healthy
Expand Down Expand Up @@ -210,6 +212,7 @@ services:
LOW_REPLICATION_MARK:
RESETUP_HOST_LAG:
OFFLINE_MODE_MAX_OFFLINE_PCT:
MYSYNC_SWITCHOVER_MAX_ATTEMPTS:
depends_on:
mysql1:
condition: service_healthy
Expand Down
1 change: 1 addition & 0 deletions tests/images/mysql/mysync.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ repl_mon: ${REPL_MON:-false}
force_switchover: ${FORCE_SWITCHOVER:-false}
manager_switchover: ${MANAGER_SWITCHOVER:-true}
replication_convergence_timeout_switchover: 300s
switchover_max_attempts: ${MYSYNC_SWITCHOVER_MAX_ATTEMPTS:-60}
manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s}
resetup_host_lag: ${RESETUP_HOST_LAG:-30000s}
manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}
Expand Down
Loading