Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions internal/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,7 @@ func (app *App) stateManager() appState {
} else {
if !switchover.InitiatedAt.IsZero() && time.Since(switchover.InitiatedAt) > app.config.SwitchoverTimeout {
app.logger.Error().Msgf("switchover %s => %s timed out after %s", switchover.From, switchover.To, time.Since(switchover.InitiatedAt))
app.logSwitchoverFailure(switchover)
err = app.FailSwitchover(switchover, fmt.Errorf("switchover timed out after %s", time.Since(switchover.InitiatedAt)))
if err != nil {
app.logger.Error().Err(err).Msg("failed to report switchover timeout")
Expand Down Expand Up @@ -692,7 +693,12 @@ func (app *App) stateManager() appState {

if !clusterStateDcs[master].PingOk || clusterStateDcs[master].IsFileSystemReadonly {
app.logger.Error().Msgf("MASTER FAILURE")
app.t.SetIfZero(NodeFailedAt, master, time.Now())
if app.t.Get(NodeFailedAt, master).IsZero() {
now := time.Now()
app.t.Set(NodeFailedAt, master, now)
app.startTiming(timingDowntime, now)
app.startTiming(timingFailover, now)
}
if lightMaintenance {
app.logger.Info().Msgf("failover suppressed by light maintenance mode")
} else {
Expand All @@ -710,7 +716,11 @@ func (app *App) stateManager() appState {
return stateManager
}
} else {
app.t.Clean(NodeFailedAt, master)
if !app.t.Get(NodeFailedAt, master).IsZero() {
app.stopTiming(timingDowntime)
app.stopTiming(timingFailover)
app.t.Clean(NodeFailedAt, master)
}
}

if !clusterState[master].PingOk {
Expand Down Expand Up @@ -1373,6 +1383,11 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState,
}
}

// downtime for failover starts at master loss (see IssueFailover); for switchover it starts here
if switchover.MasterTransition == SwitchoverTransition {
app.startTiming(timingDowntime, time.Time{})
}

// set read only everywhere (all HA-nodes) and stop replication
app.logger.Info().Msg("switchover: phase 1: enter read only")
errs := util.RunParallel(func(host string) error {
Expand Down Expand Up @@ -1614,6 +1629,8 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState,
}
app.logger.Info().Msgf("switchover: new master %s set writable", newMaster)

app.stopTiming(timingDowntime)

// reenable events
events, err := newMasterNode.ReenableEventsRetry()
if err != nil || app.emulateError("promote_reenable_events") {
Expand Down
11 changes: 11 additions & 0 deletions internal/app/app_dcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ func (app *App) FinishSwitchover(switchover *Switchover, switchErr error) error
switchover.Result.Error = switchErr.Error()
}

if switchErr != nil {
app.logSwitchoverFailure(switchover)
} else if switchover.MasterTransition == SwitchoverTransition {
app.stopTiming(timingSwitchover)
} else {
app.stopTiming(timingFailover)
}

err := app.dcs.Delete(pathCurrentSwitch)
if err != nil {
return err
Expand All @@ -171,6 +179,9 @@ func (app *App) StartSwitchover(switchover *Switchover) error {
app.logger.Info().Msgf("switchover: %s => %s starting...", switchover.From, switchover.To)
switchover.StartedAt = time.Now()
switchover.StartedBy = app.config.Hostname
if switchover.MasterTransition == SwitchoverTransition {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switchover may be started from external source

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at InitiatedAt field then

app.startTiming(timingSwitchover, switchover.InitiatedAt)
}
return app.dcs.Set(pathCurrentSwitch, switchover)
}

Expand Down
3 changes: 3 additions & 0 deletions internal/app/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ const (

// last known timestamp from repl_mon table
pathMasterReplMonTS = "master_repl_mon_ts"

// timing start timestamps, structure: pathTimings/<name> -> time.Time
pathTimings = "timing"
)

var (
Expand Down
94 changes: 94 additions & 0 deletions internal/app/timing_tracker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package app

import (
"context"
"fmt"
"os/exec"
"strconv"
"time"

"github.com/yandex/mysync/internal/dcs"
"github.com/yandex/mysync/internal/util"
)

// names of tracked timings
const (
timingDowntime = "downtime"
timingFailover = "failover"
timingSwitchover = "switchover"
)

func (app *App) timingPath(name string) string {
return dcs.JoinPath(pathTimings, name)
}

// getTimingStart returns stored start time and whether it exists
func (app *App) getTimingStart(name string) (time.Time, bool) {
var ts time.Time
if err := app.dcs.Get(app.timingPath(name), &ts); err != nil {
return time.Time{}, false
}
return ts, true
}

// startTiming stores timing start in dcs (zero ts means now)
func (app *App) startTiming(name string, ts time.Time) {
if ts.IsZero() {
ts = time.Now()
}
if err := app.dcs.Set(app.timingPath(name), ts); err != nil {
app.logger.Warn().Err(err).Msgf("failed to start timing %s", name)
}
}

func (app *App) clearTiming(name string) {
if err := app.dcs.Delete(app.timingPath(name)); err != nil {
app.logger.Warn().Err(err).Msgf("failed to clear timing %s", name)
}
}

// stopTiming logs elapsed time since start and clears it
func (app *App) stopTiming(name string) {
now := time.Now()
start, ok := app.getTimingStart(name)
if !ok {
return
}
app.clearTiming(name)
app.logTiming(name, now.Sub(start))
}

// logSwitchoverFailure records time spent on a failed switchover as a negative value.
// Marker-guarded so it is logged once, and only when the switchover actually started.
func (app *App) logSwitchoverFailure(sw *Switchover) {
now := time.Now()
if sw.MasterTransition != SwitchoverTransition {
return
}
start, ok := app.getTimingStart(timingSwitchover)
if !ok {
return
}
app.clearTiming(timingSwitchover)
app.clearTiming(timingDowntime)
since := sw.InitiatedAt
if since.IsZero() {
since = start
}
app.logTiming(timingSwitchover+"_failure", -now.Sub(since))
}

// logTiming runs the configured log_timing command with name and elapsed seconds
func (app *App) logTiming(name string, d time.Duration) {
command, ok := app.config.Commands["log_timing"]
if !ok || command == "" {
return
}
command = fmt.Sprintf(command, name, strconv.FormatFloat(d.Seconds(), 'f', 3, 64))
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
shell := util.GetEnvVariable("SHELL", "sh")
if err := exec.CommandContext(ctx, shell, "-c", command).Run(); err != nil {
app.logger.Warn().Err(err).Msgf("failed to execute log_timing command for %s", name)
}
}
70 changes: 70 additions & 0 deletions tests/features/timings.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
Feature: failover and switchover timings are logged via log_timing command

Scenario: failover logs downtime and failover timings
Given cluster environment is
"""
MYSYNC_FAILOVER=true
MYSYNC_FAILOVER_DELAY=0s
"""
Given cluster is up and running
Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
"""
["mysql1","mysql2","mysql3"]
"""
When host "mysql1" is stopped
Then mysql host "mysql1" should become unavailable within "10" seconds
Then zookeeper node "/test/last_switch" should match json within "30" seconds
"""
{
"cause": "auto",
"from": "mysql1",
"master_transition": "failover",
"result": {
"ok": true
}
}
"""
When I get zookeeper node "/test/manager"
And I save zookeeper query result as "manager"
When I run command on host "{{.manager.hostname}}" until result match regexp "failover:" with timeout "30" seconds
"""
cat /tmp/timing.log
"""
Then command output should match regexp
"""
downtime:
"""

Scenario: switchover logs downtime and switchover timings
Given cluster is up and running
Then mysql host "mysql1" should be master
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["mysql1","mysql2","mysql3"]
"""
When I run command on host "mysql1"
"""
mysync switch --to mysql2 --wait=0s
"""
Then command return code should be "0"
Then zookeeper node "/test/last_switch" should match json within "120" seconds
"""
{
"to": "mysql2",
"master_transition": "switchover",
"result": {
"ok": true
}
}
"""
Then mysql host "mysql2" should be master
When I get zookeeper node "/test/manager"
And I save zookeeper query result as "manager"
When I run command on host "{{.manager.hostname}}" until result match regexp "switchover:" with timeout "30" seconds
"""
cat /tmp/timing.log
"""
Then command output should match regexp
"""
downtime:
"""
4 changes: 4 additions & 0 deletions tests/images/mysql/mysync.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ resetupfile: /tmp/mysync.resetup
resetup_crashed_hosts: ${MYSYNC_RESETUP_CRASHED_HOSTS:-false}
zookeeper:
session_timeout: 3s
random_host_provider:
retry_jitter: 1s
namespace: /test
hosts: [ $ZK_SERVERS ]
auth: true
Expand All @@ -37,6 +39,8 @@ mysql:
error_log: /var/log/mysql/error.log
queries:
replication_lag: $MYSYNC_REPLICATION_LAG_QUERY
commands:
log_timing: 'echo "%s: %s" >> /tmp/timing.log'
disable_semi_sync_replication_on_maintenance: ${MYSYNC_DISABLE_REPLICATION_ON_MAINT:-false}
rpl_semi_sync_master_wait_for_slave_count: ${MYSYNC_WAIT_FOR_SLAVE_COUNT:-1}
critical_disk_usage: ${MYSYNC_CRITICAL_DISK_USAGE:-100}
Expand Down
Loading