From ee634d0c5794d6e1d4764c50ab163af0a2b3751f Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Fri, 5 Jun 2026 10:27:36 +0300 Subject: [PATCH 1/5] feat: send timings metrics --- internal/app/app.go | 13 +++++ internal/app/app_dcs.go | 13 +++++ internal/app/data.go | 3 ++ internal/app/timing_tracker.go | 91 ++++++++++++++++++++++++++++++++++ tests/features/timings.feature | 70 ++++++++++++++++++++++++++ tests/images/mysql/mysync.yaml | 2 + 6 files changed, 192 insertions(+) create mode 100644 internal/app/timing_tracker.go create mode 100644 tests/features/timings.feature diff --git a/internal/app/app.go b/internal/app/app.go index bde78a09..9b75db2f 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -641,6 +641,7 @@ func (app *App) stateManager() appState { } else { if !switchover.InitiatedAt.IsZero() && time.Since(switchover.InitiatedAt) > app.config.SwitchoverTimeout { app.logger.Error().Msgf("switchover %s => %s timed out after %s", switchover.From, switchover.To, time.Since(switchover.InitiatedAt)) + app.logSwitchoverFailure(switchover) err = app.FailSwitchover(switchover, fmt.Errorf("switchover timed out after %s", time.Since(switchover.InitiatedAt))) if err != nil { app.logger.Error().Err(err).Msg("failed to report switchover timeout") @@ -711,6 +712,9 @@ func (app *App) stateManager() appState { } } else { app.t.Clean(NodeFailedAt, master) + // flush timings left behind by an aborted failover/switchover + app.stopTiming(timingDowntime) + app.stopTiming(timingFailover) } if !clusterState[master].PingOk { @@ -1367,6 +1371,11 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState, } } + // downtime for failover starts at master loss (see IssueFailover); for switchover it starts here + if switchover.MasterTransition == SwitchoverTransition { + app.startTiming(timingDowntime, time.Time{}) + } + // set read only everywhere (all HA-nodes) and stop replication app.logger.Info().Msg("switchover: phase 1: enter read only") errs := util.RunParallel(func(host string) error { @@ -1608,6 +1617,10 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState, } app.logger.Info().Msgf("switchover: new master %s set writable", newMaster) + // cluster is open for writes again, failover/downtime are over + app.stopTiming(timingDowntime) + app.stopTiming(timingFailover) + // reenable events events, err := newMasterNode.ReenableEventsRetry() if err != nil || app.emulateError("promote_reenable_events") { diff --git a/internal/app/app_dcs.go b/internal/app/app_dcs.go index 639579e1..92b73410 100644 --- a/internal/app/app_dcs.go +++ b/internal/app/app_dcs.go @@ -149,6 +149,12 @@ func (app *App) FinishSwitchover(switchover *Switchover, switchErr error) error switchover.Result.Error = switchErr.Error() } + if switchErr != nil { + app.logSwitchoverFailure(switchover) + } else if switchover.MasterTransition == SwitchoverTransition { + app.stopTiming(timingSwitchover) + } + err := app.dcs.Delete(pathCurrentSwitch) if err != nil { return err @@ -171,6 +177,9 @@ func (app *App) StartSwitchover(switchover *Switchover) error { app.logger.Info().Msgf("switchover: %s => %s starting...", switchover.From, switchover.To) switchover.StartedAt = time.Now() switchover.StartedBy = app.config.Hostname + if switchover.MasterTransition == SwitchoverTransition { + app.startTiming(timingSwitchover, time.Time{}) + } return app.dcs.Set(pathCurrentSwitch, switchover) } @@ -199,6 +208,10 @@ func (app *App) IssueFailover(master string) error { switchover.InitiatedAt = time.Now() switchover.Cause = CauseAuto switchover.MasterTransition = FailoverTransition + // downtime/failover are measured from the moment the master was lost + failedAt := app.t.Get(NodeFailedAt, master) + app.startTiming(timingDowntime, failedAt) + app.startTiming(timingFailover, failedAt) return app.dcs.Create(pathCurrentSwitch, switchover) } diff --git a/internal/app/data.go b/internal/app/data.go index 20d60949..fb6fd6b6 100644 --- a/internal/app/data.go +++ b/internal/app/data.go @@ -65,6 +65,9 @@ const ( // last known timestamp from repl_mon table pathMasterReplMonTS = "master_repl_mon_ts" + + // timing start timestamps, structure: pathTimings/ -> time.Time + pathTimings = "timing" ) var ( diff --git a/internal/app/timing_tracker.go b/internal/app/timing_tracker.go new file mode 100644 index 00000000..09fa0346 --- /dev/null +++ b/internal/app/timing_tracker.go @@ -0,0 +1,91 @@ +package app + +import ( + "context" + "fmt" + "os/exec" + "strconv" + "time" + + "github.com/yandex/mysync/internal/dcs" + "github.com/yandex/mysync/internal/util" +) + +// names of tracked timings +const ( + timingDowntime = "downtime" + timingFailover = "failover" + timingSwitchover = "switchover" +) + +func (app *App) timingPath(name string) string { + return dcs.JoinPath(pathTimings, name) +} + +// getTimingStart returns stored start time and whether it exists +func (app *App) getTimingStart(name string) (time.Time, bool) { + var ts time.Time + if err := app.dcs.Get(app.timingPath(name), &ts); err != nil { + return time.Time{}, false + } + return ts, true +} + +// startTiming stores timing start in dcs (zero ts means now) +func (app *App) startTiming(name string, ts time.Time) { + if ts.IsZero() { + ts = time.Now() + } + if err := app.dcs.Set(app.timingPath(name), ts); err != nil { + app.logger.Warn().Err(err).Msgf("failed to start timing %s", name) + } +} + +func (app *App) clearTiming(name string) { + if err := app.dcs.Delete(app.timingPath(name)); err != nil { + app.logger.Warn().Err(err).Msgf("failed to clear timing %s", name) + } +} + +// stopTiming logs elapsed time since start and clears it +func (app *App) stopTiming(name string) { + start, ok := app.getTimingStart(name) + if !ok { + return + } + app.clearTiming(name) + app.logTiming(name, time.Since(start)) +} + +// logSwitchoverFailure records time spent on a failed switchover as a negative value. +// Marker-guarded so it is logged once, and only when the switchover actually started. +func (app *App) logSwitchoverFailure(sw *Switchover) { + if sw.MasterTransition != SwitchoverTransition { + return + } + start, ok := app.getTimingStart(timingSwitchover) + if !ok { + return + } + app.clearTiming(timingSwitchover) + since := sw.InitiatedAt + if since.IsZero() { + since = start + } + app.logTiming(timingSwitchover+"_failure", -time.Since(since)) +} + +// logTiming runs the configured log_timing command with name and elapsed seconds +func (app *App) logTiming(name string, d time.Duration) { + command, ok := app.config.Commands["log_timing"] + if !ok || command == "" { + return + } + command = fmt.Sprintf(command, name, strconv.FormatFloat(d.Seconds(), 'f', 3, 64)) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + shell := util.GetEnvVariable("SHELL", "sh") + if err := exec.CommandContext(ctx, shell, "-c", command).Run(); err != nil { + app.logger.Warn().Err(err).Msgf("failed to execute log_timing command for %s", name) + } +} diff --git a/tests/features/timings.feature b/tests/features/timings.feature new file mode 100644 index 00000000..44f7bac1 --- /dev/null +++ b/tests/features/timings.feature @@ -0,0 +1,70 @@ +Feature: failover and switchover timings are logged via log_timing command + + Scenario: failover logs downtime and failover timings + Given cluster environment is + """ + MYSYNC_FAILOVER=true + MYSYNC_FAILOVER_DELAY=0s + """ + Given cluster is up and running + Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds + """ + ["mysql1","mysql2","mysql3"] + """ + When host "mysql1" is stopped + Then mysql host "mysql1" should become unavailable within "10" seconds + Then zookeeper node "/test/last_switch" should match json within "30" seconds + """ + { + "cause": "auto", + "from": "mysql1", + "master_transition": "failover", + "result": { + "ok": true + } + } + """ + When I get zookeeper node "/test/manager" + And I save zookeeper query result as "manager" + When I run command on host "{{.manager.hostname}}" until result match regexp "failover:" with timeout "30" seconds + """ + cat /tmp/timing.log + """ + Then command output should match regexp + """ + downtime: + """ + + Scenario: switchover logs downtime and switchover timings + Given cluster is up and running + Then mysql host "mysql1" should be master + And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + """ + ["mysql1","mysql2","mysql3"] + """ + When I run command on host "mysql1" + """ + mysync switch --to mysql2 --wait=0s + """ + Then command return code should be "0" + Then zookeeper node "/test/last_switch" should match json within "120" seconds + """ + { + "to": "mysql2", + "master_transition": "switchover", + "result": { + "ok": true + } + } + """ + Then mysql host "mysql2" should be master + When I get zookeeper node "/test/manager" + And I save zookeeper query result as "manager" + When I run command on host "{{.manager.hostname}}" until result match regexp "switchover:" with timeout "30" seconds + """ + cat /tmp/timing.log + """ + Then command output should match regexp + """ + downtime: + """ diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml index f0d83d16..32aedf3d 100644 --- a/tests/images/mysql/mysync.yaml +++ b/tests/images/mysql/mysync.yaml @@ -37,6 +37,8 @@ mysql: error_log: /var/log/mysql/error.log queries: replication_lag: $MYSYNC_REPLICATION_LAG_QUERY +commands: + log_timing: 'echo "%s: %s" >> /tmp/timing.log' disable_semi_sync_replication_on_maintenance: ${MYSYNC_DISABLE_REPLICATION_ON_MAINT:-false} rpl_semi_sync_master_wait_for_slave_count: ${MYSYNC_WAIT_FOR_SLAVE_COUNT:-1} critical_disk_usage: ${MYSYNC_CRITICAL_DISK_USAGE:-100} From d0042470da33effcc16b351398aa20d3e72bae11 Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Mon, 8 Jun 2026 13:59:23 +0300 Subject: [PATCH 2/5] fix timings --- internal/app/timing_tracker.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/app/timing_tracker.go b/internal/app/timing_tracker.go index 09fa0346..95753c48 100644 --- a/internal/app/timing_tracker.go +++ b/internal/app/timing_tracker.go @@ -68,6 +68,8 @@ func (app *App) logSwitchoverFailure(sw *Switchover) { return } app.clearTiming(timingSwitchover) + // a failed switchover must not leave a downtime marker for the safety net to mislog later + app.clearTiming(timingDowntime) since := sw.InitiatedAt if since.IsZero() { since = start From 0ca6eb16f5dd050c3d7d9ce06f1ca4ad21bbf0ac Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Tue, 9 Jun 2026 15:18:51 +0300 Subject: [PATCH 3/5] test: fix zk jitter in tests --- tests/images/mysql/mysync.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml index 32aedf3d..fe4ddbb8 100644 --- a/tests/images/mysql/mysync.yaml +++ b/tests/images/mysql/mysync.yaml @@ -16,6 +16,8 @@ resetupfile: /tmp/mysync.resetup resetup_crashed_hosts: ${MYSYNC_RESETUP_CRASHED_HOSTS:-false} zookeeper: session_timeout: 3s + random_host_provider: + retry_jitter: 1s namespace: /test hosts: [ $ZK_SERVERS ] auth: true From 3b3462b126fc488dd1ff471a3680a1e3fb55b105 Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Wed, 10 Jun 2026 13:16:11 +0300 Subject: [PATCH 4/5] fxp --- internal/app/app.go | 16 +++++++++++----- internal/app/app_dcs.go | 6 +----- internal/app/timing_tracker.go | 7 ++++--- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/internal/app/app.go b/internal/app/app.go index 9b75db2f..18a3c492 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -693,7 +693,12 @@ func (app *App) stateManager() appState { if !clusterStateDcs[master].PingOk || clusterStateDcs[master].IsFileSystemReadonly { app.logger.Error().Msgf("MASTER FAILURE") - app.t.SetIfZero(NodeFailedAt, master, time.Now()) + if app.t.Get(NodeFailedAt, master).IsZero() { + now := time.Now() + app.t.Set(NodeFailedAt, master, now) + app.startTiming(timingDowntime, now) + app.startTiming(timingFailover, now) + } if lightMaintenance { app.logger.Info().Msgf("failover suppressed by light maintenance mode") } else { @@ -711,10 +716,11 @@ func (app *App) stateManager() appState { return stateManager } } else { - app.t.Clean(NodeFailedAt, master) - // flush timings left behind by an aborted failover/switchover - app.stopTiming(timingDowntime) - app.stopTiming(timingFailover) + if !app.t.Get(NodeFailedAt, master).IsZero() { + app.stopTiming(timingDowntime) + app.stopTiming(timingFailover) + app.t.Clean(NodeFailedAt, master) + } } if !clusterState[master].PingOk { diff --git a/internal/app/app_dcs.go b/internal/app/app_dcs.go index 92b73410..c7c68128 100644 --- a/internal/app/app_dcs.go +++ b/internal/app/app_dcs.go @@ -178,7 +178,7 @@ func (app *App) StartSwitchover(switchover *Switchover) error { switchover.StartedAt = time.Now() switchover.StartedBy = app.config.Hostname if switchover.MasterTransition == SwitchoverTransition { - app.startTiming(timingSwitchover, time.Time{}) + app.startTiming(timingSwitchover, switchover.InitiatedAt) } return app.dcs.Set(pathCurrentSwitch, switchover) } @@ -208,10 +208,6 @@ func (app *App) IssueFailover(master string) error { switchover.InitiatedAt = time.Now() switchover.Cause = CauseAuto switchover.MasterTransition = FailoverTransition - // downtime/failover are measured from the moment the master was lost - failedAt := app.t.Get(NodeFailedAt, master) - app.startTiming(timingDowntime, failedAt) - app.startTiming(timingFailover, failedAt) return app.dcs.Create(pathCurrentSwitch, switchover) } diff --git a/internal/app/timing_tracker.go b/internal/app/timing_tracker.go index 95753c48..fd1af971 100644 --- a/internal/app/timing_tracker.go +++ b/internal/app/timing_tracker.go @@ -49,17 +49,19 @@ func (app *App) clearTiming(name string) { // stopTiming logs elapsed time since start and clears it func (app *App) stopTiming(name string) { + now := time.Now() start, ok := app.getTimingStart(name) if !ok { return } app.clearTiming(name) - app.logTiming(name, time.Since(start)) + app.logTiming(name, now.Sub(start)) } // logSwitchoverFailure records time spent on a failed switchover as a negative value. // Marker-guarded so it is logged once, and only when the switchover actually started. func (app *App) logSwitchoverFailure(sw *Switchover) { + now := time.Now() if sw.MasterTransition != SwitchoverTransition { return } @@ -68,13 +70,12 @@ func (app *App) logSwitchoverFailure(sw *Switchover) { return } app.clearTiming(timingSwitchover) - // a failed switchover must not leave a downtime marker for the safety net to mislog later app.clearTiming(timingDowntime) since := sw.InitiatedAt if since.IsZero() { since = start } - app.logTiming(timingSwitchover+"_failure", -time.Since(since)) + app.logTiming(timingSwitchover+"_failure", -now.Sub(since)) } // logTiming runs the configured log_timing command with name and elapsed seconds From 3086190b0a3fb6ff986d0077b120d2f93d6bad68 Mon Sep 17 00:00:00 2001 From: Alexander Morozov Date: Wed, 10 Jun 2026 18:08:47 +0300 Subject: [PATCH 5/5] move failover timing --- internal/app/app.go | 2 -- internal/app/app_dcs.go | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/app/app.go b/internal/app/app.go index 18a3c492..6e74b2f1 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -1623,9 +1623,7 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState, } app.logger.Info().Msgf("switchover: new master %s set writable", newMaster) - // cluster is open for writes again, failover/downtime are over app.stopTiming(timingDowntime) - app.stopTiming(timingFailover) // reenable events events, err := newMasterNode.ReenableEventsRetry() diff --git a/internal/app/app_dcs.go b/internal/app/app_dcs.go index c7c68128..a32f769d 100644 --- a/internal/app/app_dcs.go +++ b/internal/app/app_dcs.go @@ -153,6 +153,8 @@ func (app *App) FinishSwitchover(switchover *Switchover, switchErr error) error app.logSwitchoverFailure(switchover) } else if switchover.MasterTransition == SwitchoverTransition { app.stopTiming(timingSwitchover) + } else { + app.stopTiming(timingFailover) } err := app.dcs.Delete(pathCurrentSwitch)