yandex · teem0n · Jun 10, 2026 · Jun 5, 2026 · Jun 8, 2026 · Jun 9, 2026
diff --git a/internal/app/app.go b/internal/app/app.go
@@ -641,6 +641,7 @@ func (app *App) stateManager() appState {
 		} else {
 			if !switchover.InitiatedAt.IsZero() && time.Since(switchover.InitiatedAt) > app.config.SwitchoverTimeout {
 				app.logger.Error().Msgf("switchover %s => %s timed out after %s", switchover.From, switchover.To, time.Since(switchover.InitiatedAt))
+				app.logSwitchoverFailure(switchover)
 				err = app.FailSwitchover(switchover, fmt.Errorf("switchover timed out after %s", time.Since(switchover.InitiatedAt)))
 				if err != nil {
 					app.logger.Error().Err(err).Msg("failed to report switchover timeout")
@@ -692,7 +693,12 @@ func (app *App) stateManager() appState {
 
 	if !clusterStateDcs[master].PingOk || clusterStateDcs[master].IsFileSystemReadonly {
 		app.logger.Error().Msgf("MASTER FAILURE")
-		app.t.SetIfZero(NodeFailedAt, master, time.Now())
+		if app.t.Get(NodeFailedAt, master).IsZero() {
+			now := time.Now()
+			app.t.Set(NodeFailedAt, master, now)
+			app.startTiming(timingDowntime, now)
+			app.startTiming(timingFailover, now)
+		}
 		if lightMaintenance {
 			app.logger.Info().Msgf("failover suppressed by light maintenance mode")
 		} else {
@@ -710,7 +716,11 @@ func (app *App) stateManager() appState {
 			return stateManager
 		}
 	} else {
-		app.t.Clean(NodeFailedAt, master)
+		if !app.t.Get(NodeFailedAt, master).IsZero() {
+			app.stopTiming(timingDowntime)
+			app.stopTiming(timingFailover)
+			app.t.Clean(NodeFailedAt, master)
+		}
 	}
 
 	if !clusterState[master].PingOk {
@@ -1373,6 +1383,11 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState,
 		}
 	}
 
+	// downtime for failover starts at master loss (see IssueFailover); for switchover it starts here
+	if switchover.MasterTransition == SwitchoverTransition {
+		app.startTiming(timingDowntime, time.Time{})
+	}
+
 	// set read only everywhere (all HA-nodes) and stop replication
 	app.logger.Info().Msg("switchover: phase 1: enter read only")
 	errs := util.RunParallel(func(host string) error {
@@ -1614,6 +1629,8 @@ func (app *App) performSwitchover(clusterState map[string]*nodestate.NodeState,
 	}
 	app.logger.Info().Msgf("switchover: new master %s set writable", newMaster)
 
+	app.stopTiming(timingDowntime)
+
 	// reenable events
 	events, err := newMasterNode.ReenableEventsRetry()
 	if err != nil || app.emulateError("promote_reenable_events") {

diff --git a/internal/app/app_dcs.go b/internal/app/app_dcs.go
@@ -149,6 +149,14 @@ func (app *App) FinishSwitchover(switchover *Switchover, switchErr error) error
 		switchover.Result.Error = switchErr.Error()
 	}
 
+	if switchErr != nil {
+		app.logSwitchoverFailure(switchover)
+	} else if switchover.MasterTransition == SwitchoverTransition {
+		app.stopTiming(timingSwitchover)
+	} else {
+		app.stopTiming(timingFailover)
+	}
+
 	err := app.dcs.Delete(pathCurrentSwitch)
 	if err != nil {
 		return err
@@ -171,6 +179,9 @@ func (app *App) StartSwitchover(switchover *Switchover) error {
 	app.logger.Info().Msgf("switchover: %s => %s starting...", switchover.From, switchover.To)
 	switchover.StartedAt = time.Now()
 	switchover.StartedBy = app.config.Hostname
+	if switchover.MasterTransition == SwitchoverTransition {
+		app.startTiming(timingSwitchover, switchover.InitiatedAt)
+	}
 	return app.dcs.Set(pathCurrentSwitch, switchover)
 }
 

diff --git a/internal/app/data.go b/internal/app/data.go
@@ -65,6 +65,9 @@ const (
 
 	// last known timestamp from repl_mon table
 	pathMasterReplMonTS = "master_repl_mon_ts"
+
+	// timing start timestamps, structure: pathTimings/<name> -> time.Time
+	pathTimings = "timing"
 )
 
 var (

diff --git a/internal/app/timing_tracker.go b/internal/app/timing_tracker.go
@@ -0,0 +1,94 @@
+package app
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"strconv"
+	"time"
+
+	"github.com/yandex/mysync/internal/dcs"
+	"github.com/yandex/mysync/internal/util"
+)
+
+// names of tracked timings
+const (
+	timingDowntime   = "downtime"
+	timingFailover   = "failover"
+	timingSwitchover = "switchover"
+)
+
+func (app *App) timingPath(name string) string {
+	return dcs.JoinPath(pathTimings, name)
+}
+
+// getTimingStart returns stored start time and whether it exists
+func (app *App) getTimingStart(name string) (time.Time, bool) {
+	var ts time.Time
+	if err := app.dcs.Get(app.timingPath(name), &ts); err != nil {
+		return time.Time{}, false
+	}
+	return ts, true
+}
+
+// startTiming stores timing start in dcs (zero ts means now)
+func (app *App) startTiming(name string, ts time.Time) {
+	if ts.IsZero() {
+		ts = time.Now()
+	}
+	if err := app.dcs.Set(app.timingPath(name), ts); err != nil {
+		app.logger.Warn().Err(err).Msgf("failed to start timing %s", name)
+	}
+}
+
+func (app *App) clearTiming(name string) {
+	if err := app.dcs.Delete(app.timingPath(name)); err != nil {
+		app.logger.Warn().Err(err).Msgf("failed to clear timing %s", name)
+	}
+}
+
+// stopTiming logs elapsed time since start and clears it
+func (app *App) stopTiming(name string) {
+	now := time.Now()
+	start, ok := app.getTimingStart(name)
+	if !ok {
+		return
+	}
+	app.clearTiming(name)
+	app.logTiming(name, now.Sub(start))
+}
+
+// logSwitchoverFailure records time spent on a failed switchover as a negative value.
+// Marker-guarded so it is logged once, and only when the switchover actually started.
+func (app *App) logSwitchoverFailure(sw *Switchover) {
+	now := time.Now()
+	if sw.MasterTransition != SwitchoverTransition {
+		return
+	}
+	start, ok := app.getTimingStart(timingSwitchover)
+	if !ok {
+		return
+	}
+	app.clearTiming(timingSwitchover)
+	app.clearTiming(timingDowntime)
+	since := sw.InitiatedAt
+	if since.IsZero() {
+		since = start
+	}
+	app.logTiming(timingSwitchover+"_failure", -now.Sub(since))
+}
+
+// logTiming runs the configured log_timing command with name and elapsed seconds
+func (app *App) logTiming(name string, d time.Duration) {
+	command, ok := app.config.Commands["log_timing"]
+	if !ok || command == "" {
+		return
+	}
+	command = fmt.Sprintf(command, name, strconv.FormatFloat(d.Seconds(), 'f', 3, 64))
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	shell := util.GetEnvVariable("SHELL", "sh")
+	if err := exec.CommandContext(ctx, shell, "-c", command).Run(); err != nil {
+		app.logger.Warn().Err(err).Msgf("failed to execute log_timing command for %s", name)
+	}
+}
diff --git a/tests/features/timings.feature b/tests/features/timings.feature
@@ -0,0 +1,70 @@
+Feature: failover and switchover timings are logged via log_timing command
+
+  Scenario: failover logs downtime and failover timings
+    Given cluster environment is
+      """
+      MYSYNC_FAILOVER=true
+      MYSYNC_FAILOVER_DELAY=0s
+      """
+    Given cluster is up and running
+    Then zookeeper node "/test/active_nodes" should match json_exactly within "20" seconds
+      """
+      ["mysql1","mysql2","mysql3"]
+      """
+    When host "mysql1" is stopped
+    Then mysql host "mysql1" should become unavailable within "10" seconds
+    Then zookeeper node "/test/last_switch" should match json within "30" seconds
+      """
+      {
+        "cause": "auto",
+        "from": "mysql1",
+        "master_transition": "failover",
+        "result": {
+          "ok": true
+        }
+      }
+      """
+    When I get zookeeper node "/test/manager"
+    And I save zookeeper query result as "manager"
+    When I run command on host "{{.manager.hostname}}" until result match regexp "failover:" with timeout "30" seconds
+      """
+      cat /tmp/timing.log
+      """
+    Then command output should match regexp
+      """
+      downtime:
+      """
+
+  Scenario: switchover logs downtime and switchover timings
+    Given cluster is up and running
+    Then mysql host "mysql1" should be master
+    And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
+      """
+      ["mysql1","mysql2","mysql3"]
+      """
+    When I run command on host "mysql1"
+      """
+      mysync switch --to mysql2 --wait=0s
+      """
+    Then command return code should be "0"
+    Then zookeeper node "/test/last_switch" should match json within "120" seconds
+      """
+      {
+        "to": "mysql2",
+        "master_transition": "switchover",
+        "result": {
+          "ok": true
+        }
+      }
+      """
+    Then mysql host "mysql2" should be master
+    When I get zookeeper node "/test/manager"
+    And I save zookeeper query result as "manager"
+    When I run command on host "{{.manager.hostname}}" until result match regexp "switchover:" with timeout "30" seconds
+      """
+      cat /tmp/timing.log
+      """
+    Then command output should match regexp
+      """
+      downtime:
+      """
diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml
@@ -16,6 +16,8 @@ resetupfile: /tmp/mysync.resetup
 resetup_crashed_hosts: ${MYSYNC_RESETUP_CRASHED_HOSTS:-false}
 zookeeper:
   session_timeout: 3s
+  random_host_provider:
+    retry_jitter: 1s
   namespace: /test
   hosts: [ $ZK_SERVERS ]
   auth: true
@@ -37,6 +39,8 @@ mysql:
   error_log: /var/log/mysql/error.log
 queries:
   replication_lag: $MYSYNC_REPLICATION_LAG_QUERY
+commands:
+  log_timing: 'echo "%s: %s" >> /tmp/timing.log'
 disable_semi_sync_replication_on_maintenance: ${MYSYNC_DISABLE_REPLICATION_ON_MAINT:-false}
 rpl_semi_sync_master_wait_for_slave_count: ${MYSYNC_WAIT_FOR_SLAVE_COUNT:-1}
 critical_disk_usage: ${MYSYNC_CRITICAL_DISK_USAGE:-100}