From 5233eee2b0bc923e18faa7a856ac3a4132281cc8 Mon Sep 17 00:00:00 2001 From: Haifeng He Date: Wed, 13 May 2026 14:04:38 -0700 Subject: [PATCH] Split connection-provider metrics from mux metrics Replace MuxDialFailed / MuxDialSuccess with EstablisherError, and the listener-side use of MuxErrors with ReceiverError. The new counters name the layer they describe (the connection provider), which is distinct from the mux session metrics they were previously bundled with. classifyError's fallback label is also renamed from "unclassified error" to "unknown" for shorter and OCI/Prometheus-friendlier tag values. Co-Authored-By: Claude Opus 4.7 (1M context) --- metrics/prometheus_defs.go | 11 ++++++----- transport/mux/establisher.go | 8 ++++---- transport/mux/receiver.go | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/metrics/prometheus_defs.go b/metrics/prometheus_defs.go index b14ca927..289f71be 100644 --- a/metrics/prometheus_defs.go +++ b/metrics/prometheus_defs.go @@ -58,15 +58,16 @@ var ( muxSessionLabels...) // Mux Manager - muxManagerLabels = []string{"addr", "mode", "config_name"} MuxErrors = DefaultCounterVec("mux_errors", "Number of errors observed from mux", append(muxManagerLabels, "error")...) MuxConnectionEstablish = DefaultCounterVec("mux_connection_establish", "Number of times mux has established", muxManagerLabels...) - MuxDialFailed = DefaultCounterVec("mux_dial_failed", "Mux failed when dialing", muxManagerLabels...) - MuxDialSuccess = DefaultCounterVec("mux_dial_success", "Mux succeeded on dial", muxManagerLabels...) MuxServerDisconnected = DefaultCounterVec("mux_server_disconnected", "Mux server disconnected", muxManagerLabels...) NumMuxesActive = DefaultGaugeVec("num_muxes_active", "Host-local number of active muxes for config", muxManagerLabels...) + // Connection provider + ReceiverError = DefaultCounterVec("receiver_error", "Number of errors observed from connection receiver", append(muxManagerLabels, "error")...) + EstablisherError = DefaultCounterVec("establisher_error", "Number of errors observed from connection establisher", muxManagerLabels...) + // Translation interceptor translationLabels = []string{"kind", "message_type"} @@ -128,9 +129,9 @@ func init() { // Mux Manager prometheus.MustRegister(MuxErrors) + prometheus.MustRegister(ReceiverError) prometheus.MustRegister(MuxConnectionEstablish) - prometheus.MustRegister(MuxDialFailed) - prometheus.MustRegister(MuxDialSuccess) + prometheus.MustRegister(EstablisherError) prometheus.MustRegister(MuxServerDisconnected) prometheus.MustRegister(NumMuxesActive) diff --git a/transport/mux/establisher.go b/transport/mux/establisher.go index b6cee678..a3e78def 100644 --- a/transport/mux/establisher.go +++ b/transport/mux/establisher.go @@ -73,8 +73,7 @@ func NewMuxEstablisherProvider(lifetime context.Context, name string, transportF return yamux.Client(conn, cfg) } // pre-initialize the MuxDial metrics - metrics.MuxDialFailed.WithLabelValues(metricLabels...) - metrics.MuxDialSuccess.WithLabelValues(metricLabels...) + metrics.EstablisherError.WithLabelValues(metricLabels...) return NewMuxProvider(lifetime, name, connPv, sessionFn, connectionsCapacity, transportFn, metricLabels, logger), nil } @@ -100,16 +99,17 @@ func (p *establishingConnProvider) NewConnection() (net.Conn, error) { p.logger.Info("mux client failed to dial", tag.Error(err)) return true } + if err := backoff.ThrottleRetry(dialFn, retryPolicy, retryable); err != nil { if p.lifetime.Err() != nil { // shutting down, just exit return nil, p.lifetime.Err() } p.logger.Error("mux client failed to dial with retry", tag.Error(err)) - metrics.MuxDialFailed.WithLabelValues(p.metricLabels...).Inc() + metrics.EstablisherError.WithLabelValues(p.metricLabels...).Inc() return nil, err } - metrics.MuxDialSuccess.WithLabelValues(p.metricLabels...).Inc() + return client, nil } diff --git a/transport/mux/receiver.go b/transport/mux/receiver.go index 9e1ae6de..c8d4549a 100644 --- a/transport/mux/receiver.go +++ b/transport/mux/receiver.go @@ -86,7 +86,7 @@ func (r *receivingConnProvider) NewConnection() (net.Conn, error) { } if err != nil { r.logger.Fatal("listener.Accept failed", tag.Error(err)) - metrics.MuxErrors.WithLabelValues(append(r.metricLabels, classifyError(err))...).Inc() + metrics.ReceiverError.WithLabelValues(append(r.metricLabels, classifyError(err))...).Inc() return nil, err } r.logger.Info("Accept new connection", tag.NewStringTag("remoteAddr", conn.RemoteAddr().String())) @@ -98,7 +98,7 @@ func classifyError(err error) string { if err == io.EOF { return "eof" } else { - return "unclassified error" + return "unknown" } }