From 1d615705f07a8ad9655f48edbf4cba4a2b7ff3f2 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Tue, 7 Apr 2026 16:05:34 +0800 Subject: [PATCH 01/24] bump sarama to avoid verbose logs --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index ec9f8a7ce4..4929bce08d 100644 --- a/go.mod +++ b/go.mod @@ -439,7 +439,7 @@ require ( sourcegraph.com/sourcegraph/appdash-data v0.0.0-20151005221446-73f23eafcf67 // indirect ) -replace github.com/IBM/sarama v1.41.2 => github.com/pingcap/sarama v1.41.2-pingcap-20250415 +replace github.com/IBM/sarama v1.41.2 => github.com/pingcap/sarama v1.41.2-pingcap-20251202-x // Fix https://github.com/pingcap/tiflow/issues/4961 replace github.com/benbjohnson/clock v1.3.5 => github.com/benbjohnson/clock v1.1.0 diff --git a/go.sum b/go.sum index 1cabd51f1c..d7c28da508 100644 --- a/go.sum +++ b/go.sum @@ -888,8 +888,8 @@ github.com/pingcap/log v1.1.1-0.20250917021125-19901e015dc9 h1:qG9BSvlWFEE5otQGa github.com/pingcap/log v1.1.1-0.20250917021125-19901e015dc9/go.mod h1:ORfBOFp1eteu2odzsyaxI+b8TzJwgjwyQcGhI+9SfEA= github.com/pingcap/metering_sdk v0.0.0-20251110022152-dac449ac5389 h1:bqbE3bwFSrUDSiN5M4EG+IXmm5eWLJnGRy/caXnxuHA= github.com/pingcap/metering_sdk v0.0.0-20251110022152-dac449ac5389/go.mod h1:zie1N5PRttgtqkZmRtpIDM7CuyWtvlX9LTxRd3fVSc4= -github.com/pingcap/sarama v1.41.2-pingcap-20250415 h1:jc/31lgAuSWLh8zr3y5bL0atpBFHAjch5H1Nnlb04J0= -github.com/pingcap/sarama v1.41.2-pingcap-20250415/go.mod h1:Kwi9CT6CuDYad3KS7HqjsbmD2DWkIKI7qI6a8PKlGb4= +github.com/pingcap/sarama v1.41.2-pingcap-20251202-x h1:9Vi3qqyDNZxG6fnXQhpeTsnwzSBWNpMeb8o02JkL9JM= +github.com/pingcap/sarama v1.41.2-pingcap-20251202-x/go.mod h1:xdpu7sd6OE1uxNdjYTSKUfY8FaKkJES9/+EyjSgiGQk= github.com/pingcap/sysutil v1.0.1-0.20240311050922-ae81ee01f3a5 h1:T4pXRhBflzDeAhmOQHNPRRogMYxP13V7BkYw3ZsoSfE= github.com/pingcap/sysutil v1.0.1-0.20240311050922-ae81ee01f3a5/go.mod h1:rlimy0GcTvjiJqvD5mXTRr8O2eNZPBrcUgiWVYp9530= github.com/pingcap/tidb v1.1.0-beta.0.20260325043212-0c4df2e19ecc h1:2Ateg1PUQqozi+TtbL+Bx62qn4IpQ48VTOpqCh2d3AI= From bd14141d258abe49abc788ae3e6197ebcc579273 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Tue, 7 Apr 2026 16:56:17 +0800 Subject: [PATCH 02/24] remove verbose log --- cdc/kv/shared_client.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index 6953cbc32a..5ce8744fb8 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -832,13 +832,10 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { return ctx.Err() case <-ticker.C: } - log.Info("event feed starts to check locked regions", - zap.String("namespace", s.changefeed.Namespace), - zap.String("changefeed", s.changefeed.ID)) currTime := s.pdClock.CurrentTime() - s.totalSpans.RLock() var slowInitializeRegionCount int + s.totalSpans.RLock() for subscriptionID, rt := range s.totalSpans.v { attr := rt.rangeLock.IterAll(nil) ckptTime := oracle.GetTimeFromTS(attr.SlowestRegion.ResolvedTs) From 109caa87c0b6dd6ecd58838808d69f904f041bf7 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Tue, 7 Apr 2026 17:17:00 +0800 Subject: [PATCH 03/24] remove some useless logs --- cdc/owner/owner.go | 2 +- cdc/processor/sinkmanager/manager.go | 21 +++++++++++-------- .../sinkmanager/table_sink_wrapper.go | 8 ------- .../v3/replication/replication_set.go | 5 ++--- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/cdc/owner/owner.go b/cdc/owner/owner.go index 
7c7307a714..0c2f17762a 100644 --- a/cdc/owner/owner.go +++ b/cdc/owner/owner.go @@ -315,7 +315,7 @@ func preflightCheck(changefeed *orchestrator.ChangefeedReactorState, log.Info("changefeed preflight check failed, will skip this tick", zap.String("namespace", changefeed.ID.Namespace), zap.String("changefeed", changefeed.ID.ID), - zap.Any("status", changefeed.Status), zap.Bool("ok", ok), + zap.Any("status", changefeed.Status), ) } diff --git a/cdc/processor/sinkmanager/manager.go b/cdc/processor/sinkmanager/manager.go index 171443fa1b..2ad5334b64 100644 --- a/cdc/processor/sinkmanager/manager.go +++ b/cdc/processor/sinkmanager/manager.go @@ -863,12 +863,6 @@ func (m *SinkManager) AddTable(span tablepb.Span, startTs model.Ts, targetTs mod // StartTable sets the table(TableSink) state to replicating. func (m *SinkManager) StartTable(span tablepb.Span, startTs model.Ts) error { - log.Info("Start table sink", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), - zap.Uint64("startTs", startTs), - ) tableSink, ok := m.tableSinks.Load(span) if !ok { log.Panic("Table sink not found when starting table stats", @@ -877,22 +871,31 @@ func (m *SinkManager) StartTable(span tablepb.Span, startTs model.Ts) error { zap.Stringer("span", &span)) } - if err := tableSink.(*tableSinkWrapper).start(m.managerCtx, startTs); err != nil { + t := tableSink.(*tableSinkWrapper) + if err := t.start(m.managerCtx, startTs); err != nil { return err } m.sinkProgressHeap.push(&progress{ span: span, nextLowerBoundPos: sorter.Position{StartTs: 0, CommitTs: startTs + 1}, - version: tableSink.(*tableSinkWrapper).version, + version: t.version, }) if m.redoDMLMgr != nil { m.redoProgressHeap.push(&progress{ span: span, nextLowerBoundPos: sorter.Position{StartTs: 0, CommitTs: startTs + 1}, - version: tableSink.(*tableSinkWrapper).version, + version: t.version, }) } + + log.Info("Sink is started", + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Stringer("span", &span), + zap.Uint64("startTs", startTs), + zap.Uint64("replicateTs", t.GetReplicaTs()), + ) return nil } diff --git a/cdc/processor/sinkmanager/table_sink_wrapper.go b/cdc/processor/sinkmanager/table_sink_wrapper.go index 3dfa10c7f9..580a0f5581 100644 --- a/cdc/processor/sinkmanager/table_sink_wrapper.go +++ b/cdc/processor/sinkmanager/table_sink_wrapper.go @@ -160,14 +160,6 @@ func (t *tableSinkWrapper) start(ctx context.Context, startTs model.Ts) (err err } t.replicateTs.Store(ts) - log.Info("Sink is started", - zap.String("namespace", t.changefeed.Namespace), - zap.String("changefeed", t.changefeed.ID), - zap.Stringer("span", &t.span), - zap.Uint64("startTs", startTs), - zap.Uint64("replicateTs", ts), - ) - // This start ts maybe greater than the initial start ts of the table sink. // Because in two phase scheduling, the table sink may be advanced to a later ts. // And we can just continue to replicate the table sink from the new start ts. diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index f3662cf419..0883d31143 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -908,9 +908,8 @@ func (r *ReplicationSet) handleAddTable( func (r *ReplicationSet) handleMoveTable( dest model.CaptureID, ) ([]*schedulepb.Message, error) { - // Ignore move table if it has been removed already. 
if r.hasRemoved() { - log.Warn("schedulerv3: move table is ignored", + log.Warn("schedulerv3: move table is ignored, since it removed already", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.Int64("tableID", r.Span.TableID), @@ -921,7 +920,7 @@ func (r *ReplicationSet) handleMoveTable( // 1) it's not in Replicating state or // 2) the dest capture is the primary. if r.State != ReplicationSetStateReplicating || r.Primary == dest { - log.Warn("schedulerv3: move table is ignored", + log.Warn("schedulerv3: move table is ignored, since it's not replicating or the primary is the same as the move destination", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.Int64("tableID", r.Span.TableID), From 2b91a12fad3bd4205d3fe508bfc7513275f483c6 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Tue, 7 Apr 2026 18:06:54 +0800 Subject: [PATCH 04/24] remove some useless logs --- cdc/owner/changefeed.go | 1 - cdc/processor/processor.go | 1 - 2 files changed, 2 deletions(-) diff --git a/cdc/owner/changefeed.go b/cdc/owner/changefeed.go index 950fa2fa27..1813243d44 100755 --- a/cdc/owner/changefeed.go +++ b/cdc/owner/changefeed.go @@ -665,7 +665,6 @@ LOOP2: return errors.Trace(err) } cfInfo.Config.Sink.TiDBSourceID = sourceID - log.Info("get sourceID from PD", zap.Uint64("sourceID", sourceID), zap.Stringer("changefeedID", c.id)) c.ddlSink = c.newSink(c.id, cfInfo, c.Throw(ctx), func(err error) { select { diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index faa32e1348..22f941f216 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -668,7 +668,6 @@ func (p *processor) lazyInitImpl(_ context.Context) (err error) { if err != nil { return errors.Trace(err) } - log.Info("get sourceID from PD", zap.Uint64("sourceID", sourceID), zap.Stringer("changefeedID", p.changefeedID)) cfConfig.Sink.TiDBSourceID = sourceID p.redo.r = redo.NewDMLManager(p.changefeedID, cfConfig.Consistent) From cf888cb0f26881469dd25307dfa721eea91c0c03 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 9 Apr 2026 11:43:18 +0800 Subject: [PATCH 05/24] remove useless logs --- .../v3/replication/replication_manager.go | 58 ++++++----- .../replication/replication_manager_test.go | 85 +++++------------ .../v3/replication/replication_set.go | 43 --------- .../v3/replication/replication_set_test.go | 95 ------------------- pkg/txnutil/gc/gc_manager.go | 3 - 5 files changed, 51 insertions(+), 233 deletions(-) diff --git a/cdc/scheduler/internal/v3/replication/replication_manager.go b/cdc/scheduler/internal/v3/replication/replication_manager.go index 8f200f2d35..6688fb1457 100644 --- a/cdc/scheduler/internal/v3/replication/replication_manager.go +++ b/cdc/scheduler/internal/v3/replication/replication_manager.go @@ -15,7 +15,6 @@ package replication import ( "bytes" - "container/heap" "fmt" "math" "time" @@ -35,9 +34,8 @@ import ( const ( checkpointCannotProceed = internal.CheckpointCannotProceed - defaultSlowTableHeapSize = 4 logSlowTablesLagThreshold = 30 * time.Second - logSlowTablesInterval = 1 * time.Minute + logSlowTablesInterval = 10 * time.Minute logMissingTableInterval = 30 * time.Second ) @@ -154,7 +152,6 @@ type Manager struct { //nolint:revive acceptMoveTableTask int acceptBurstBalanceTask int - slowTableHeap SetHeap lastLogSlowTablesTime time.Time lastMissTableID tablepb.TableID lastLogMissTime time.Time @@ -719,8 +716,7 @@ func (r *Manager) AdvanceCheckpoint( } // If changefeed's checkpoint lag is 
larger than 30s, - // log the 4 slowlest table infos every minute, which can - // help us find the problematic tables. + // log the slowest table info which can help us find the problematic table. checkpointLag := currentPDTime.Sub(oracle.GetTimeFromTS(watermark.CheckpointTs)) if checkpointLag > logSlowTablesLagThreshold && time.Since(r.lastLogSlowTablesTime) > logSlowTablesInterval { @@ -744,32 +740,32 @@ func (r *Manager) AdvanceCheckpoint( return watermark } -func (r *Manager) logSlowTableInfo(currentPDTime time.Time) { - // find the slow tables - r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool { - lag := currentPDTime.Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs)) - if lag > logSlowTablesLagThreshold { - heap.Push(&r.slowTableHeap, table) - if r.slowTableHeap.Len() > defaultSlowTableHeapSize { - heap.Pop(&r.slowTableHeap) - } - } - return true - }) - - num := r.slowTableHeap.Len() - for i := 0; i < num; i++ { - table := heap.Pop(&r.slowTableHeap).(*ReplicationSet) - log.Info("schedulerv3: slow table", - zap.String("namespace", r.changefeedID.Namespace), - zap.String("changefeed", r.changefeedID.ID), - zap.Int64("tableID", table.Span.TableID), - zap.String("tableStatus", table.State.String()), - zap.Uint64("checkpointTs", table.Checkpoint.CheckpointTs), - zap.Uint64("resolvedTs", table.Checkpoint.ResolvedTs), - zap.Duration("checkpointLag", currentPDTime. - Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs)))) +func (r *Manager) getSlowestTableForLog(currentPDTime time.Time) *ReplicationSet { + table, ok := r.spans.Get(r.slowestSink) + if !ok { + return nil + } + checkpointLag := currentPDTime.Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs)) + if checkpointLag <= logSlowTablesLagThreshold { + return nil } + return table +} + +func (r *Manager) logSlowTableInfo(currentPDTime time.Time) { + table := r.getSlowestTableForLog(currentPDTime) + if table == nil { + return + } + log.Info("schedulerv3: slow table", + zap.String("namespace", r.changefeedID.Namespace), + zap.String("changefeed", r.changefeedID.ID), + zap.Int64("tableID", table.Span.TableID), + zap.String("tableStatus", table.State.String()), + zap.Uint64("checkpointTs", table.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", table.Checkpoint.ResolvedTs), + zap.Duration("checkpointLag", currentPDTime. + Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs)))) } // CollectMetrics collects metrics. diff --git a/cdc/scheduler/internal/v3/replication/replication_manager_test.go b/cdc/scheduler/internal/v3/replication/replication_manager_test.go index a899bc4a98..4f659c224e 100644 --- a/cdc/scheduler/internal/v3/replication/replication_manager_test.go +++ b/cdc/scheduler/internal/v3/replication/replication_manager_test.go @@ -26,6 +26,7 @@ import ( "github.com/pingcap/tiflow/pkg/spanz" "github.com/pingcap/tiflow/pkg/util" "github.com/stretchr/testify/require" + "github.com/tikv/client-go/v2/oracle" ) func TestReplicationManagerHandleAddTableTask(t *testing.T) { @@ -678,6 +679,7 @@ func TestReplicationManagerAdvanceCheckpoint(t *testing.T) { require.Equal(t, model.Ts(20), watermark.ResolvedTs) require.Equal(t, model.Ts(20), watermark.LastSyncedTs) require.Equal(t, model.Ts(30), watermark.PullerResolvedTs) + require.Equal(t, span, r.slowestSink) // some table not exist yet. 
currentTables.UpdateTables([]model.TableID{1, 2, 3}) @@ -852,6 +854,28 @@ func TestReplicationManagerAdvanceCheckpoint(t *testing.T) { require.Equal(t, model.Ts(12), watermark.LastSyncedTs) require.Equal(t, model.Ts(16), watermark.PullerResolvedTs) require.Equal(t, model.Ts(9), barrier.GetGlobalBarrierTs()) + + redoMetaManager.enable = false + currentPDTime := time.Now() + rsRedo := r.spans.GetV(spanRedo) + rsRedo.Checkpoint.CheckpointTs = oracle.GoTimeToTS( + currentPDTime.Add(-logSlowTablesLagThreshold - time.Second)) + watermark = r.AdvanceCheckpoint( + currentTables, currentPDTime, schedulepb.NewBarrierWithMinTs(math.MaxUint64), redoMetaManager) + require.Equal(t, rsRedo.Checkpoint.CheckpointTs, watermark.CheckpointTs) + require.Equal(t, spanRedo, r.slowestSink) + require.Equal(t, rsRedo, r.getSlowestTableForLog(currentPDTime)) + + r.spans.Delete(spanRedo) + require.Nil(t, r.getSlowestTableForLog(currentPDTime)) + + r.spans.ReplaceOrInsert(spanRedo, rsRedo) + rsRedo.Checkpoint.CheckpointTs = oracle.GoTimeToTS(currentPDTime.Add(-time.Second)) + watermark = r.AdvanceCheckpoint( + currentTables, currentPDTime, schedulepb.NewBarrierWithMinTs(math.MaxUint64), redoMetaManager) + require.Equal(t, rsRedo.Checkpoint.CheckpointTs, watermark.CheckpointTs) + require.Equal(t, spanRedo, r.slowestSink) + require.Nil(t, r.getSlowestTableForLog(currentPDTime)) } func TestReplicationManagerAdvanceCheckpointWithRedoEnabled(t *testing.T) { @@ -1007,64 +1031,3 @@ func TestReplicationManagerHandleCaptureChangesDuringAddTable(t *testing.T) { require.NotNil(t, r.runningTasks.Has(spanz.TableIDToComparableSpan(1))) require.Equal(t, 1, <-addTableCh) } - -func TestLogSlowTableInfo(t *testing.T) { - t.Parallel() - r := NewReplicationManager(1, model.ChangeFeedID{}) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(1), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(1), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 1}, - State: ReplicationSetStateReplicating, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(2), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(2), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 2}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(3), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(3), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 3}, - State: ReplicationSetStatePrepare, - }) - r.logSlowTableInfo(time.Now()) - // make sure all tables are will be pop out from heal after logged - require.Equal(t, r.slowTableHeap.Len(), 0) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(4), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(4), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 4}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(5), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(5), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 5}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(6), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(6), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 6}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(7), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(7), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 7}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(8), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(8), - Checkpoint: 
tablepb.Checkpoint{CheckpointTs: 8}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(9), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(9), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 9}, - State: ReplicationSetStatePrepare, - }) - r.spans.ReplaceOrInsert(spanz.TableIDToComparableSpan(1), &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(10), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 10}, - State: ReplicationSetStatePrepare, - }) - r.logSlowTableInfo(time.Now()) - // make sure the slowTableHeap's capacity will not extend - require.Equal(t, cap(r.slowTableHeap), 8) -} diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index 0883d31143..8e1b3d4574 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -1059,46 +1059,3 @@ func (r *ReplicationSet) updateCheckpointAndStats( r.Stats = stats } } - -// SetHeap is a max-heap, it implements heap.Interface. -type SetHeap []*ReplicationSet - -// NewReplicationSetHeap creates a new SetHeap. -func NewReplicationSetHeap(capacity int) SetHeap { - if capacity <= 0 { - panic("capacity must be positive") - } - return make(SetHeap, 0, capacity) -} - -// Len returns the length of the heap. -func (h SetHeap) Len() int { return len(h) } - -// Less returns true if the element at i is less than the element at j. -func (h SetHeap) Less(i, j int) bool { - if h[i].Checkpoint.CheckpointTs > h[j].Checkpoint.CheckpointTs { - return true - } - if h[i].Checkpoint.CheckpointTs == h[j].Checkpoint.CheckpointTs { - return h[i].Checkpoint.ResolvedTs > h[j].Checkpoint.ResolvedTs - } - return false -} - -// Swap swaps the elements with indexes i and j. -func (h SetHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } - -// Push pushes an element to the heap. -func (h *SetHeap) Push(x interface{}) { - *h = append(*h, x.(*ReplicationSet)) -} - -// Pop pops an element from the heap. 
-func (h *SetHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - old[n-1] = nil - *h = old[0 : n-1] - return x -} diff --git a/cdc/scheduler/internal/v3/replication/replication_set_test.go b/cdc/scheduler/internal/v3/replication/replication_set_test.go index d7ab23f739..7e0ae3cc0d 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set_test.go +++ b/cdc/scheduler/internal/v3/replication/replication_set_test.go @@ -14,7 +14,6 @@ package replication import ( - "container/heap" "encoding/json" "math/rand" "testing" @@ -23,7 +22,6 @@ import ( "github.com/pingcap/tiflow/cdc/model" "github.com/pingcap/tiflow/cdc/processor/tablepb" "github.com/pingcap/tiflow/cdc/scheduler/schedulepb" - "github.com/pingcap/tiflow/pkg/spanz" "github.com/stretchr/testify/require" ) @@ -1595,99 +1593,6 @@ func TestReplicationSetRemoveRestart(t *testing.T) { require.True(t, r.hasRemoved()) } -func TestReplicationSetHeap_Len(t *testing.T) { - t.Parallel() - - h := NewReplicationSetHeap(defaultSlowTableHeapSize) - require.Equal(t, 0, h.Len()) - - h = append(h, &ReplicationSet{Span: spanz.TableIDToComparableSpan(0)}) - require.Equal(t, 1, h.Len()) - - h = append(h, &ReplicationSet{Span: spanz.TableIDToComparableSpan(1)}) - require.Equal(t, 2, h.Len()) -} - -func TestReplicationSetHeap_Less(t *testing.T) { - t.Parallel() - - h := NewReplicationSetHeap(defaultSlowTableHeapSize) - h = append(h, &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(0), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 1}, - }) - h = append(h, &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(1), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 2, ResolvedTs: 3}, - }) - h = append(h, &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(2), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 2, ResolvedTs: 4}, - }) - require.True(t, h.Less(1, 0)) - require.True(t, h.Less(2, 1)) -} - -func TestReplicationSetHeap_Basic(t *testing.T) { - t.Parallel() - - h := NewReplicationSetHeap(defaultSlowTableHeapSize) - heap.Init(&h) - heap.Push(&h, &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(0), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 1}, - }) - heap.Push(&h, &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(1), - Checkpoint: tablepb.Checkpoint{CheckpointTs: 2}, - }) - require.Equal(t, 2, h.Len()) - - require.Equal(t, int64(1), heap.Pop(&h).(*ReplicationSet).Span.TableID) - require.Equal(t, 1, h.Len()) - - require.Equal(t, int64(0), heap.Pop(&h).(*ReplicationSet).Span.TableID) - require.Equal(t, 0, h.Len()) -} - -// TestReplicationSetHeap_MinK tests that the heap can be -// used to keep the min K elements. 
-func TestReplicationSetHeap_MinK(t *testing.T) { - t.Parallel() - - // K = defaultSlowTableHeapSize - h := NewReplicationSetHeap(defaultSlowTableHeapSize) - heap.Init(&h) - - for i := 2 * defaultSlowTableHeapSize; i > 0; i-- { - replicationSet := &ReplicationSet{ - Span: spanz.TableIDToComparableSpan(int64(i)), - Checkpoint: tablepb.Checkpoint{CheckpointTs: uint64(i)}, - } - heap.Push(&h, replicationSet) - if h.Len() > defaultSlowTableHeapSize { - heap.Pop(&h) - } - } - - require.Equal(t, defaultSlowTableHeapSize, h.Len()) - - expectedTables := make([]int64, 0) - for i := defaultSlowTableHeapSize; i > 0; i-- { - expectedTables = append(expectedTables, int64(i)) - } - - tables := make([]model.TableID, 0) - tableCounts := h.Len() - for i := 0; i < tableCounts; i++ { - element := heap.Pop(&h).(*ReplicationSet) - t.Log(element.Span) - tables = append(tables, element.Span.TableID) - } - require.Equal(t, expectedTables, tables) - require.Equal(t, 0, h.Len()) -} - func TestUpdateCheckpointAndStats(t *testing.T) { cases := []struct { checkpoint tablepb.Checkpoint diff --git a/pkg/txnutil/gc/gc_manager.go b/pkg/txnutil/gc/gc_manager.go index 64fa462e35..a7aafeb500 100644 --- a/pkg/txnutil/gc/gc_manager.go +++ b/pkg/txnutil/gc/gc_manager.go @@ -90,9 +90,6 @@ func (m *gcManager) TryUpdateGCSafePoint( failpoint.Inject("InjectActualGCSafePoint", func(val failpoint.Value) { actual = uint64(val.(int)) }) - if actual == checkpointTs { - log.Info("update gc safe point success", zap.Uint64("gcSafePointTs", checkpointTs)) - } if actual > checkpointTs { log.Warn("update gc safe point failed, the gc safe point is larger than checkpointTs", zap.Uint64("actual", actual), zap.Uint64("checkpointTs", checkpointTs)) From a39a9d93c2262b6372c38d0c8982830edadda238 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 9 Apr 2026 14:03:39 +0800 Subject: [PATCH 06/24] simplify some logs --- cdc/kv/shared_client.go | 2 +- cdc/model/changefeed.go | 3 --- cdc/processor/processor.go | 2 +- cdc/processor/sinkmanager/manager.go | 12 ++---------- cdc/processor/sourcemanager/sorter/factory/pebble.go | 3 --- cdc/server/server.go | 1 - cdc/sink/dmlsink/txn/worker.go | 6 ------ pkg/causality/txn_cache.go | 4 ---- pkg/config/sink.go | 4 ---- pkg/etcd/client.go | 2 -- pkg/migrate/migrate.go | 2 -- 11 files changed, 4 insertions(+), 37 deletions(-) diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index 5ce8744fb8..d9fbd0699b 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -824,7 +824,7 @@ func (s *SharedClient) handleResolveLockTasks(ctx context.Context) error { } func (s *SharedClient) logSlowRegions(ctx context.Context) error { - ticker := time.NewTicker(30 * time.Second) + ticker := time.NewTicker(5 * time.Minute) defer ticker.Stop() for { select { diff --git a/cdc/model/changefeed.go b/cdc/model/changefeed.go index 1f009d11f7..dfd315a0ec 100644 --- a/cdc/model/changefeed.go +++ b/cdc/model/changefeed.go @@ -381,9 +381,6 @@ func (info *ChangeFeedInfo) RmUnusedFields() { } func (info *ChangeFeedInfo) rmMQOnlyFields() { - log.Info("since the downstream is not a MQ, remove MQ only fields", - zap.String("namespace", info.Namespace), - zap.String("changefeed", info.ID)) info.Config.Sink.DispatchRules = nil info.Config.Sink.SchemaRegistry = nil info.Config.Sink.EncoderConcurrency = nil diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 22f941f216..be1befb4a3 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -303,7 +303,7 @@ func (p *processor) 
IsAddTableSpanFinished(span tablepb.Span, isPrepare bool) bo zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), zap.Uint64("tableResolvedTs", tableResolvedTs), zap.Uint64("tableCheckpointTs", tableCheckpointTs), zap.Uint64("globalCheckpointTs", globalCheckpointTs), diff --git a/cdc/processor/sinkmanager/manager.go b/cdc/processor/sinkmanager/manager.go index 2ad5334b64..2d1e174ad9 100644 --- a/cdc/processor/sinkmanager/manager.go +++ b/cdc/processor/sinkmanager/manager.go @@ -865,10 +865,10 @@ func (m *SinkManager) AddTable(span tablepb.Span, startTs model.Ts, targetTs mod func (m *SinkManager) StartTable(span tablepb.Span, startTs model.Ts) error { tableSink, ok := m.tableSinks.Load(span) if !ok { - log.Panic("Table sink not found when starting table stats", + log.Panic("table sink not found when start it", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span)) + zap.Int64("tableID", span.TableID)) } t := tableSink.(*tableSinkWrapper) @@ -888,14 +888,6 @@ func (m *SinkManager) StartTable(span tablepb.Span, startTs model.Ts) error { version: t.version, }) } - - log.Info("Sink is started", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), - zap.Uint64("startTs", startTs), - zap.Uint64("replicateTs", t.GetReplicaTs()), - ) return nil } diff --git a/cdc/processor/sourcemanager/sorter/factory/pebble.go b/cdc/processor/sourcemanager/sorter/factory/pebble.go index 3b4ff1ad07..62a7bb3b97 100644 --- a/cdc/processor/sourcemanager/sorter/factory/pebble.go +++ b/cdc/processor/sourcemanager/sorter/factory/pebble.go @@ -79,9 +79,6 @@ func createPebbleDBs( log.Error("create pebble fails", zap.String("dir", dir), zap.Int("id", id), zap.Error(err)) return } - log.Info("create pebble instance success", - zap.Int("id", id+1), - zap.Uint64("sharedCacheSize", memQuotaInBytes)) dbs = append(dbs, db) } err = tableCache.Unref() diff --git a/cdc/server/server.go b/cdc/server/server.go index 92b1294dca..63afa0c7ea 100644 --- a/cdc/server/server.go +++ b/cdc/server/server.go @@ -160,7 +160,6 @@ func (s *server) prepare(ctx context.Context) error { if err != nil { return errors.Trace(err) } - log.Info("create etcdCli", zap.Strings("endpoints", s.pdEndpoints)) // we do not pass a `context` to create a the etcd client, // to prevent it's cancelled when the server is closing. 
// For example, when the non-owner node goes offline, diff --git a/cdc/sink/dmlsink/txn/worker.go b/cdc/sink/dmlsink/txn/worker.go index d9727618b7..2936fb490d 100644 --- a/cdc/sink/dmlsink/txn/worker.go +++ b/cdc/sink/dmlsink/txn/worker.go @@ -87,9 +87,6 @@ func (w *worker) runLoop(txnCh <-chan causality.TxnWithNotifier[*txnEvent]) erro zap.Error(err)) } }() - log.Info("Transaction dmlSink worker starts", - zap.String("changefeedID", w.changefeed), - zap.Int("workerID", w.ID)) cleanSlowLogHistory := time.NewTicker(time.Hour) defer cleanSlowLogHistory.Stop() @@ -98,9 +95,6 @@ func (w *worker) runLoop(txnCh <-chan causality.TxnWithNotifier[*txnEvent]) erro for { select { case <-w.ctx.Done(): - log.Info("Transaction dmlSink worker exits as canceled", - zap.String("changefeedID", w.changefeed), - zap.Int("workerID", w.ID)) return nil case <-cleanSlowLogHistory.C: lastSlowConflictDetectLog := w.lastSlowConflictDetectLog diff --git a/pkg/causality/txn_cache.go b/pkg/causality/txn_cache.go index 86d064f010..108057b616 100644 --- a/pkg/causality/txn_cache.go +++ b/pkg/causality/txn_cache.go @@ -17,7 +17,6 @@ import ( "sync/atomic" "github.com/pingcap/log" - "go.uber.org/zap" ) const ( @@ -68,9 +67,6 @@ type txnCache[Txn txnEvent] interface { } func newTxnCache[Txn txnEvent](opt TxnCacheOption) txnCache[Txn] { - log.Info("create new worker cache in conflict detector", - zap.Int("cacheCount", opt.Count), - zap.Int("cacheSize", opt.Size), zap.String("BlockStrategy", string(opt.BlockStrategy))) if opt.Size <= 0 { log.Panic("WorkerOption.CacheSize should be greater than 0, please report a bug") } diff --git a/pkg/config/sink.go b/pkg/config/sink.go index ff8c1236ff..6590ea9d0e 100644 --- a/pkg/config/sink.go +++ b/pkg/config/sink.go @@ -845,10 +845,6 @@ func (s *SinkConfig) validateAndAdjustSinkURI(sinkURI *url.URL) error { return err } - log.Info("succeed to parse parameter from sink uri", - zap.String("protocol", util.GetOrZero(s.Protocol)), - zap.String("txnAtomicity", string(util.GetOrZero(s.TxnAtomicity)))) - // Check that protocol config is compatible with the scheme. if sink.IsMySQLCompatibleScheme(sinkURI.Scheme) && s.Protocol != nil { return cerror.ErrSinkURIInvalid.GenWithStackByArgs(fmt.Sprintf("protocol %s "+ diff --git a/pkg/etcd/client.go b/pkg/etcd/client.go index ce52911096..20072f45b6 100644 --- a/pkg/etcd/client.go +++ b/pkg/etcd/client.go @@ -389,8 +389,6 @@ func newClient(tlsConfig *tls.Config, grpcDialOption grpc.DialOption, endpoints // CreateRawEtcdClient creates etcd v3 client with detecting endpoints. // It will check the health of endpoints periodically, and update endpoints if needed. 
func CreateRawEtcdClient(securityConf *security.Credential, grpcDialOption grpc.DialOption, endpoints ...string) (*clientv3.Client, error) { - log.Info("create etcdCli", zap.Strings("endpoints", endpoints)) - tlsConfig, err := securityConf.ToTLSConfig() if err != nil { return nil, err diff --git a/pkg/migrate/migrate.go b/pkg/migrate/migrate.go index d7723a15ab..7b64ae015c 100644 --- a/pkg/migrate/migrate.go +++ b/pkg/migrate/migrate.go @@ -279,9 +279,7 @@ func (m *migrator) migrate(ctx context.Context, etcdNoMetaVersion bool, oldVersi log.Error("update meta version failed, etcd meta data migration failed", zap.Error(err)) return cerror.WrapError(cerror.ErrEtcdMigrateFailed, err) } - log.Info("etcd data migration successful") cleanOldData(ctx, m.cli.GetEtcdClient()) - log.Info("clean old etcd data successful") return nil } From 2b28a4628f66b1d01c527eb48d0b8a325fbafe4d Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 9 Apr 2026 14:47:29 +0800 Subject: [PATCH 07/24] simplify some logs --- cdc/entry/schema_storage.go | 8 ++++---- cdc/kv/shared_client.go | 3 --- cdc/owner/ddl_sink.go | 3 --- cdc/owner/owner.go | 1 - cdc/puller/ddl_puller.go | 9 --------- cdc/puller/multiplexing_puller.go | 5 ----- pkg/config/sink.go | 1 - pkg/orchestrator/reactor_state.go | 2 -- pkg/upstream/manager.go | 1 - 9 files changed, 4 insertions(+), 29 deletions(-) diff --git a/cdc/entry/schema_storage.go b/cdc/entry/schema_storage.go index 438f68fae0..afb17a9045 100644 --- a/cdc/entry/schema_storage.go +++ b/cdc/entry/schema_storage.go @@ -219,11 +219,11 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { log.Error("schemaStorage: update snapshot by the DDL job failed", zap.String("namespace", s.id.Namespace), zap.String("changefeed", s.id.ID), + zap.String("role", s.role.String()), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.String("query", job.Query), zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), - zap.String("role", s.role.String()), + zap.String("query", job.Query), zap.Error(err)) return errors.Trace(err) } @@ -232,11 +232,11 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { log.Info("schemaStorage: update snapshot by the DDL job", zap.String("namespace", s.id.Namespace), zap.String("changefeed", s.id.ID), + zap.String("role", s.role.String()), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.String("query", job.Query), zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), - zap.String("role", s.role.String())) + zap.String("query", job.Query)) return nil } diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index d9fbd0699b..29e2a7bd68 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -361,9 +361,6 @@ func (s *SharedClient) Run(ctx context.Context) error { g.Go(func() error { return s.handleResolveLockTasks(ctx) }) g.Go(func() error { return s.logSlowRegions(ctx) }) - log.Info("event feed started", - zap.String("namespace", s.changefeed.Namespace), - zap.String("changefeed", s.changefeed.ID)) defer log.Info("event feed exits", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID)) diff --git a/cdc/owner/ddl_sink.go b/cdc/owner/ddl_sink.go index 8fb0079610..8ee2590ba9 100644 --- a/cdc/owner/ddl_sink.go +++ b/cdc/owner/ddl_sink.go @@ -277,9 +277,6 @@ func (s *ddlSinkImpl) run(ctx context.Context) { s.wg.Add(1) go func() { var err error - log.Info("owner ddl sink background loop is started", - zap.String("namespace", 
s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID)) defer func() { s.wg.Done() log.Info("owner ddl sink background loop exits", diff --git a/cdc/owner/owner.go b/cdc/owner/owner.go index 0c2f17762a..f9982ac3e4 100644 --- a/cdc/owner/owner.go +++ b/cdc/owner/owner.go @@ -474,7 +474,6 @@ func (o *ownerImpl) cleanStaleMetrics() { // Bootstrap checks if the state contains incompatible or incorrect information and tries to fix it. func (o *ownerImpl) Bootstrap(state *orchestrator.GlobalReactorState) { - log.Info("Start bootstrapping") o.cleanStaleMetrics() fixChangefeedInfos(state) } diff --git a/cdc/puller/ddl_puller.go b/cdc/puller/ddl_puller.go index baf9c7f824..764ae02df2 100644 --- a/cdc/puller/ddl_puller.go +++ b/cdc/puller/ddl_puller.go @@ -215,15 +215,6 @@ func (p *ddlJobPullerImpl) handleRawKVEntry(ctx context.Context, ddlRawKV *model if skip { return nil } - log.Info("a new ddl job is received", - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), - zap.Uint64("startTs", job.StartTS), - zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), - zap.String("query", job.Query), - zap.Any("job", job)) } jobEntry := &model.DDLJobEntry{ diff --git a/cdc/puller/multiplexing_puller.go b/cdc/puller/multiplexing_puller.go index c6ee56d074..36ef5c2513 100644 --- a/cdc/puller/multiplexing_puller.go +++ b/cdc/puller/multiplexing_puller.go @@ -353,11 +353,6 @@ func (p *MultiplexingPuller) run(ctx context.Context, includeClient bool) error eg.Go(func() error { return p.runResolvedTsAdvancer(ctx) }) } - log.Info("MultiplexingPuller starts", - zap.String("namespace", p.changefeed.Namespace), - zap.String("changefeed", p.changefeed.ID), - zap.Int("workerConcurrent", len(p.inputChs)), - zap.Int("frontierConcurrent", p.resolvedTsAdvancerCount)) return eg.Wait() } diff --git a/pkg/config/sink.go b/pkg/config/sink.go index 6590ea9d0e..9b2f77b601 100644 --- a/pkg/config/sink.go +++ b/pkg/config/sink.go @@ -238,7 +238,6 @@ func (s *SinkConfig) ShouldSendAllBootstrapAtStart() bool { return false } should := s.ShouldSendBootstrapMsg() && util.GetOrZero(s.SendAllBootstrapAtStart) - log.Info("should send all bootstrap at start", zap.Bool("should", should)) return should } diff --git a/pkg/orchestrator/reactor_state.go b/pkg/orchestrator/reactor_state.go index ccbde53e1c..e31501a5f9 100644 --- a/pkg/orchestrator/reactor_state.go +++ b/pkg/orchestrator/reactor_state.go @@ -150,8 +150,6 @@ func (s *GlobalReactorState) Update(key util.EtcdKey, value []byte, _ bool) erro if err != nil { return cerrors.ErrUnmarshalFailed.Wrap(err).GenWithStackByArgs() } - log.Info("new upstream is add", zap.Uint64("upstream", k.UpstreamID), - zap.Any("info", newUpstreamInfo), zap.String("role", s.Role)) s.Upstreams[k.UpstreamID] = &newUpstreamInfo case etcd.CDCKeyTypeMetaVersion: default: diff --git a/pkg/upstream/manager.go b/pkg/upstream/manager.go index 93c9bf7f57..12f45d8ffa 100644 --- a/pkg/upstream/manager.go +++ b/pkg/upstream/manager.go @@ -157,7 +157,6 @@ func (m *Manager) add(upstreamID uint64, up.err.Store(err) }() up.resetIdleTime() - log.Info("new upstream is added", zap.Uint64("id", up.ID)) return up } From 7635863c2b308a8f126b0c382e22db0697a06ebd Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 9 Apr 2026 19:43:04 +0800 Subject: [PATCH 08/24] add more code --- cdc/entry/schema_storage.go | 28 ++------ cdc/owner/ddl_manager.go | 71 +++++++++---------- cdc/owner/ddl_sink.go 
| 53 +++++++------- cdc/processor/processor.go | 8 +-- cdc/puller/ddl_puller.go | 62 +++++++--------- .../ddlsink/blackhole/black_hole_ddl_sink.go | 4 -- .../cloudstorage/cloud_storage_ddl_sink.go | 5 -- cdc/sink/ddlsink/mq/mq_ddl_sink.go | 14 ---- cdc/sink/ddlsink/mysql/mysql_ddl_sink.go | 71 +++++++++++++++---- pkg/txnutil/lock_resolver.go | 22 ------ 10 files changed, 154 insertions(+), 184 deletions(-) diff --git a/cdc/entry/schema_storage.go b/cdc/entry/schema_storage.go index afb17a9045..8b91aaf79e 100644 --- a/cdc/entry/schema_storage.go +++ b/cdc/entry/schema_storage.go @@ -200,15 +200,6 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { lastSnap := s.snaps[len(s.snaps)-1] // already-executed DDL could filted by finishedTs. if job.BinlogInfo.FinishedTS <= lastSnap.CurrentTs() { - log.Info("schemaStorage: ignore foregone DDL", - zap.String("namespace", s.id.Namespace), - zap.String("changefeed", s.id.ID), - zap.String("DDL", job.Query), - zap.String("state", job.State.String()), - zap.Int64("jobID", job.ID), - zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), - zap.Int64("jobSchemaVersion", job.BinlogInfo.SchemaVersion), - zap.String("role", s.role.String())) return nil } snap = lastSnap.Copy() @@ -220,6 +211,7 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { zap.String("namespace", s.id.Namespace), zap.String("changefeed", s.id.ID), zap.String("role", s.role.String()), + zap.Int64("jobID", job.ID), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), @@ -229,14 +221,15 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { } s.snaps = append(s.snaps, snap) s.AdvanceResolvedTs(job.BinlogInfo.FinishedTS) - log.Info("schemaStorage: update snapshot by the DDL job", + log.Info("ddl job applied to schema storage", zap.String("namespace", s.id.Namespace), zap.String("changefeed", s.id.ID), zap.String("role", s.role.String()), + zap.Int64("jobID", job.ID), + zap.String("type", job.Type.String()), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), - zap.String("query", job.Query)) + zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS)) return nil } @@ -262,11 +255,6 @@ func (s *schemaStorage) AllPhysicalTables(ctx context.Context, ts model.Ts) ([]m res = append(res, tblInfo.ID) } }) - log.Debug("get new schema snapshot", - zap.Uint64("ts", ts), - zap.Uint64("snapTs", snap.CurrentTs()), - zap.Any("tables", res)) - return res, nil } @@ -380,11 +368,6 @@ func (s *schemaStorage) DoGC(ts uint64) (lastSchemaTs uint64) { // job is changed to *done* (before change to *synced*) // At state *done*, it will be always and only changed to *synced*. func (s *schemaStorage) skipJob(job *timodel.Job) bool { - log.Debug("handle DDL new commit", - zap.String("DDL", job.Query), zap.Stringer("job", job), - zap.String("namespace", s.id.Namespace), - zap.String("changefeed", s.id.ID), - zap.String("role", s.role.String())) return !job.IsDone() } @@ -431,7 +414,6 @@ func (s *schemaStorage) BuildDDLEvents( var tableInfo *model.TableInfo err = preSnap.FillSchemaName(job) if err != nil { - log.Error("build DDL event fail", zap.Any("job", job), zap.Error(err)) return nil, errors.Trace(err) } // TODO: find a better way to refactor this. 
For example, drop table job should not diff --git a/cdc/owner/ddl_manager.go b/cdc/owner/ddl_manager.go index da53481af1..2e2237c8e9 100644 --- a/cdc/owner/ddl_manager.go +++ b/cdc/owner/ddl_manager.go @@ -300,7 +300,9 @@ func (m *ddlManager) tick( zap.String("changefeed", m.changfeedID.ID), zap.Int64("tableID", job.TableID), zap.Int64("jobID", job.ID), - zap.String("query", job.Query), + zap.String("type", job.Type.String()), + zap.String("schema", job.SchemaName), + zap.String("table", job.TableName), zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), ) events, err := m.schema.BuildDDLEvents(ctx, job) @@ -316,8 +318,10 @@ func (m *ddlManager) tick( log.Info("table is ineligible, skip the ddl", zap.String("namespace", m.changfeedID.Namespace), zap.String("changefeed", m.changfeedID.ID), - zap.String("query", job.Query), - zap.Any("table", event.TableInfo)) + zap.String("schema", event.TableInfo.TableName.Schema), + zap.String("table", event.TableInfo.TableName.Table), + zap.Uint64("commitTs", event.CommitTs), + zap.String("query", event.Query)) continue } } @@ -369,10 +373,20 @@ func (m *ddlManager) tick( if m.shouldExecDDL(nextDDL) { if m.executingDDL == nil { + tableInfo := nextDDL.TableInfo + if tableInfo == nil { + tableInfo = nextDDL.PreTableInfo + } + schemaName, tableName := "", "" + if tableInfo != nil { + schemaName = tableInfo.TableName.Schema + tableName = tableInfo.TableName.Table + } log.Info("execute a ddl event", - zap.String("query", nextDDL.Query), + zap.String("schema", schemaName), + zap.String("table", tableName), zap.Uint64("commitTs", nextDDL.CommitTs), - zap.Uint64("checkpointTs", m.checkpointTs)) + zap.String("query", nextDDL.Query)) m.executingDDL = nextDDL skip, cleanMsg, err := m.shouldSkipDDL(m.executingDDL) if err != nil { @@ -481,8 +495,6 @@ func (m *ddlManager) getNextDDL() *model.DDLEvent { var res *model.DDLEvent for tb, ddls := range m.pendingDDLs { if len(ddls) == 0 { - log.Debug("no more ddl event, gc the table from pendingDDLs", - zap.String("table", tb.String())) delete(m.pendingDDLs, tb) continue } @@ -589,13 +601,6 @@ func (m *ddlManager) allTables(ctx context.Context) ([]*model.TableInfo, error) return nil, err } m.tableInfoCache = tableInfoCache - log.Debug("changefeed current tables updated", - zap.String("namespace", m.changfeedID.Namespace), - zap.String("changefeed", m.changfeedID.ID), - zap.Uint64("checkpointTs", m.checkpointTs), - zap.Uint64("snapshotTs", ts), - zap.Any("tables", m.tableInfoCache), - ) } return m.tableInfoCache, nil @@ -610,13 +615,6 @@ func (m *ddlManager) allPhysicalTables(ctx context.Context) ([]model.TableID, er if err != nil { return nil, err } - log.Debug("changefeed physical tables updated", - zap.String("namespace", m.changfeedID.Namespace), - zap.String("changefeed", m.changfeedID.ID), - zap.Uint64("checkpointTs", m.checkpointTs), - zap.Uint64("snapshotTs", ts), - zap.Any("tables", m.physicalTablesCache), - ) m.physicalTablesCache = cache } return m.physicalTablesCache, nil @@ -628,34 +626,35 @@ func (m *ddlManager) allPhysicalTables(ctx context.Context) ([]model.TableID, er // otherwise we use the checkpointTs. func (m *ddlManager) getSnapshotTs() (ts uint64) { ts = m.checkpointTs - if m.ddlResolvedTs == m.startTs { // If ddlResolvedTs is equal to startTs it means that the changefeed is just started, // So we need to get all tables from the snapshot at the startTs. 
ts = m.startTs - log.Debug("changefeed is just started, use startTs to get snapshot", - zap.String("namespace", m.changfeedID.Namespace), - zap.String("changefeed", m.changfeedID.ID), - zap.Uint64("startTs", m.startTs), - zap.Uint64("checkpointTs", m.checkpointTs), - zap.Uint64("ddlResolvedTs", m.ddlResolvedTs), - ) - return } - - log.Debug("snapshotTs", zap.Uint64("ts", ts)) return ts } // cleanCache cleans the tableInfoCache and physicalTablesCache. // It should be called after a DDL is skipped or sent to downstream successfully. func (m *ddlManager) cleanCache(msg string) { - tableName := m.executingDDL.TableInfo.TableName - log.Info(msg, zap.String("ddl", m.executingDDL.Query), + tableInfo := m.executingDDL.TableInfo + if tableInfo == nil { + tableInfo = m.executingDDL.PreTableInfo + } + var tableName model.TableName + schemaName, table := "", "" + if tableInfo != nil { + tableName = tableInfo.TableName + schemaName = tableInfo.TableName.Schema + table = tableInfo.TableName.Table + } + log.Info(msg, zap.String("namespace", m.changfeedID.Namespace), zap.String("changefeed", m.changfeedID.ID), - zap.String("bdrRole", m.executingDDL.BDRRole), - zap.Any("ddlEvent", m.executingDDL)) + zap.String("schema", schemaName), + zap.String("table", table), + zap.Uint64("commitTs", m.executingDDL.CommitTs), + zap.String("query", m.executingDDL.Query)) // Set it to nil first to accelerate GC. m.pendingDDLs[tableName][0] = nil diff --git a/cdc/owner/ddl_sink.go b/cdc/owner/ddl_sink.go index 8ee2590ba9..9c996e1783 100644 --- a/cdc/owner/ddl_sink.go +++ b/cdc/owner/ddl_sink.go @@ -240,11 +240,6 @@ func (s *ddlSinkImpl) writeCheckpointTs(ctx context.Context, lastCheckpointTs *m } func (s *ddlSinkImpl) writeDDLEvent(ctx context.Context, ddl *model.DDLEvent) error { - log.Info("begin emit ddl event", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Any("DDL", ddl)) - doWrite := func() (err error) { if err = s.makeSinkReady(ctx); err == nil { err = s.sink.WriteDDLEvent(ctx, ddl) @@ -256,14 +251,17 @@ func (s *ddlSinkImpl) writeDDLEvent(ctx context.Context, ddl *model.DDLEvent) er log.Error("Execute DDL failed", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), - zap.Any("DDL", ddl), - zap.Error(err)) + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.Error(err), + ) } else { ddl.Done.Store(true) log.Info("Execute DDL succeeded", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), - zap.Any("DDL", ddl)) + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query)) } return } @@ -330,10 +328,11 @@ func (s *ddlSinkImpl) emitDDLEvent(ctx context.Context, ddl *model.DDLEvent) (bo s.mu.Lock() if ddl.Done.Load() { // the DDL event is executed successfully, and done is true - log.Info("ddl already executed, skip it", + log.Debug("ddl already executed, skip it", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), - zap.Any("DDL", ddl)) + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query)) delete(s.ddlSentTsMap, ddl) s.mu.Unlock() return true, nil @@ -344,7 +343,10 @@ func (s *ddlSinkImpl) emitDDLEvent(ctx context.Context, ddl *model.DDLEvent) (bo log.Debug("ddl is not finished yet", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), - zap.Uint64("ddlSentTs", ddlSentTs), zap.Any("DDL", ddl)) + zap.Uint64("commitTs", 
ddl.CommitTs), + zap.String("query", ddl.Query), + zap.Uint64("ddlSentTs", ddlSentTs), + ) // the DDL event is executing and not finished yet, return false s.mu.Unlock() return false, nil @@ -355,8 +357,10 @@ func (s *ddlSinkImpl) emitDDLEvent(ctx context.Context, ddl *model.DDLEvent) (bo log.Error("Add special comment failed", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), zap.Error(err), - zap.Any("ddl", ddl)) + ) s.mu.Unlock() return false, errors.Trace(err) } @@ -368,16 +372,14 @@ func (s *ddlSinkImpl) emitDDLEvent(ctx context.Context, ddl *model.DDLEvent) (bo return false, errors.Trace(ctx.Err()) case s.ddlCh <- ddl: s.ddlSentTsMap[ddl] = ddl.CommitTs - log.Info("ddl is sent", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Uint64("ddlSentTs", ddl.CommitTs)) default: log.Warn("ddl chan full, send it the next round", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), zap.Uint64("ddlSentTs", ddlSentTs), - zap.Any("DDL", ddl)) + ) // if this hit, we think that ddlCh is full, // just return false and send the ddl in the next round. } @@ -463,13 +465,16 @@ func (s *ddlSinkImpl) addSpecialComment(ddl *model.DDLEvent) (string, error) { } result := sb.String() - log.Info("add special comment to DDL", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.String("DDL", ddl.Query), - zap.String("charset", ddl.Charset), - zap.String("collate", ddl.Collate), - zap.String("result", result)) + if result != ddl.Query { + log.Info("add special comment to DDL", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.String("newQuery", result), + zap.String("charset", ddl.Charset), + zap.String("collate", ddl.Collate)) + } return result, nil } diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index be1befb4a3..38ac7f5064 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -788,9 +788,8 @@ func (p *processor) initDDLHandler() error { } serverCfg := config.GetGlobalServerConfig() - changefeedID := model.DefaultChangeFeedID(p.changefeedID.ID + "_processor_ddl_puller") ddlPuller := puller.NewDDLJobPuller( - p.upstream, ddlStartTs, serverCfg, changefeedID, schemaStorage, p.filter, + p.upstream, ddlStartTs, serverCfg, p.changefeedID, util.RoleProcessor, schemaStorage, p.filter, ) p.ddlHandler.r = &ddlHandler{puller: ddlPuller, schemaStorage: schemaStorage} return nil @@ -809,11 +808,6 @@ func (p *processor) updateBarrierTs(barrier *schedulepb.Barrier) { // may pile up in memory, as they have to wait DDL. globalBarrierTs = schemaResolvedTs } - log.Debug("update barrierTs", - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID), - zap.Any("tableBarriers", barrier.GetTableBarriers()), - zap.Uint64("globalBarrierTs", globalBarrierTs)) p.sinkManager.r.UpdateBarrierTs(globalBarrierTs, tableBarrier) } diff --git a/cdc/puller/ddl_puller.go b/cdc/puller/ddl_puller.go index 764ae02df2..f4a77412c7 100644 --- a/cdc/puller/ddl_puller.go +++ b/cdc/puller/ddl_puller.go @@ -68,6 +68,7 @@ type DDLJobPuller interface { // be called in the same one goroutine. 
type ddlJobPullerImpl struct { changefeedID model.ChangeFeedID + role util.Role mp *MultiplexingPuller // memorysorter is used to sort the DDL events. sorter *memorysorter.EntrySorter @@ -91,6 +92,7 @@ func NewDDLJobPuller( checkpointTs uint64, cfg *config.ServerConfig, changefeed model.ChangeFeedID, + role util.Role, schemaStorage entry.SchemaStorage, filter filter.Filter, ) DDLJobPuller { @@ -107,6 +109,7 @@ func NewDDLJobPuller( ddlJobPuller := &ddlJobPullerImpl{ changefeedID: changefeed, + role: role, schemaStorage: schemaStorage, kvStorage: kvStorage, filter: filter, @@ -302,35 +305,33 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { "discard the ddl job", zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), + zap.Int64("jobID", job.ID), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), - zap.String("query", job.Query), zap.Uint64("pullerResolvedTs", p.getResolvedTs())) return true, nil } defer func() { if skip && err == nil { - log.Info("ddl job schema or table does not match, discard it", + log.Debug("ddl job schema or table does not match, discard it", zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), + zap.Int64("jobID", job.ID), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.String("query", job.Query), - zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS)) } if err != nil { log.Warn("handle ddl job failed", zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), + zap.Int64("jobID", job.ID), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.String("query", job.Query), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), + zap.String("query", job.Query), zap.Error(err)) } }() @@ -340,11 +341,12 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { log.Info("failed to fill schema name for ddl job", zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), + zap.Int64("jobID", job.ID), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.String("query", job.Query), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), + zap.String("query", job.Query), zap.Error(err)) if p.filter.ShouldDiscardDDL(job.Type, job.SchemaName, job.TableName, job.StartTS) { return true, nil @@ -360,11 +362,12 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { log.Warn("handle rename tables ddl job failed", zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), + zap.Int64("jobID", job.ID), zap.String("schema", job.SchemaName), zap.String("table", job.TableName), - zap.String("query", job.Query), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), + zap.String("query", job.Query), zap.Error(err)) return false, cerror.WrapError(cerror.ErrHandleDDLFailed, errors.Trace(err), job.Query, job.StartTS, job.StartTS) @@ -438,14 +441,6 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { } return true, nil } - log.Info("ddl puller receive rename table ddl job", - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", 
p.changefeedID.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), - zap.String("query", job.Query), - zap.Uint64("startTs", job.StartTS), - zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS)) default: // nil means it is a schema ddl job, it's no need to fill the table name. if job.BinlogInfo.TableInfo != nil { @@ -458,6 +453,13 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { return true, nil } + log.Info("ddl job received by ddl puller", + zap.String("namespace", p.changefeedID.Namespace), + zap.String("changefeed", p.changefeedID.ID), + zap.String("role", p.role.String()), + zap.Int64("jobID", job.ID), + zap.Any("job", job)) + err = p.schemaStorage.HandleDDLJob(job) if err != nil { return false, cerror.WrapError(cerror.ErrHandleDDLFailed, @@ -518,7 +520,11 @@ func (p *ddlJobPullerImpl) checkIneligibleTableDDL(snapBefore *schema.Snapshot, isIneligibleBefore := snapBefore.IsIneligibleTableID(oldTableID) if isIneligibleBefore { log.Warn("Ignore the DDL event of ineligible table", - zap.String("changefeed", p.changefeedID.ID), zap.Any("ddl", job)) + zap.String("changefeed", p.changefeedID.ID), + zap.Int64("jobID", job.ID), + zap.String("schema", job.SchemaName), + zap.String("table", job.TableName), + zap.Uint64("finishTs", job.BinlogInfo.FinishedTS)) return true, nil } @@ -643,9 +649,8 @@ func NewDDLPuller( var puller DDLJobPuller // storage can be nil only in the test if up.KVStorage != nil { - changefeed.ID += "_owner_ddl_puller" puller = NewDDLJobPuller(up, startTs, config.GetGlobalServerConfig(), - changefeed, schemaStorage, filter) + changefeed, util.RoleOwner, schemaStorage, filter) } return &ddlPullerImpl{ @@ -664,28 +669,15 @@ func (h *ddlPullerImpl) addToPending(job *timodel.Job) { log.Warn("ignore duplicated DDL job", zap.String("namespace", h.changefeedID.Namespace), zap.String("changefeed", h.changefeedID.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), - - zap.String("query", job.Query), + zap.Int64("jobID", job.ID), zap.Uint64("startTs", job.StartTS), - zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), - zap.Int64("jobID", job.ID)) + zap.Uint64("finishTs", job.BinlogInfo.FinishedTS)) return } h.mu.Lock() defer h.mu.Unlock() h.pendingDDLJobs = append(h.pendingDDLJobs, job) h.lastDDLJobID = job.ID - log.Info("ddl puller receives new pending job", - zap.String("namespace", h.changefeedID.Namespace), - zap.String("changefeed", h.changefeedID.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), - zap.String("query", job.Query), - zap.Uint64("startTs", job.StartTS), - zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), - zap.Int64("jobID", job.ID)) } // Run the ddl puller to receive DDL events diff --git a/cdc/sink/ddlsink/blackhole/black_hole_ddl_sink.go b/cdc/sink/ddlsink/blackhole/black_hole_ddl_sink.go index 39e1e37079..59d37ae113 100644 --- a/cdc/sink/ddlsink/blackhole/black_hole_ddl_sink.go +++ b/cdc/sink/ddlsink/blackhole/black_hole_ddl_sink.go @@ -16,10 +16,8 @@ package blackhole import ( "context" - "github.com/pingcap/log" "github.com/pingcap/tiflow/cdc/model" "github.com/pingcap/tiflow/cdc/sink/ddlsink" - "go.uber.org/zap" ) // Assert Sink implementation @@ -37,7 +35,6 @@ func NewDDLSink() *DDLSink { func (d *DDLSink) WriteDDLEvent(ctx context.Context, ddl *model.DDLEvent, ) error { - log.Debug("BlackHoleSink: DDL Event", zap.Any("ddl", ddl)) return nil } @@ -45,7 +42,6 @@ func (d *DDLSink) WriteDDLEvent(ctx context.Context, func (d *DDLSink) 
WriteCheckpointTs(ctx context.Context, ts uint64, tables []*model.TableInfo, ) error { - log.Debug("BlackHoleSink: Checkpoint Ts Event", zap.Uint64("ts", ts), zap.Any("tables", tables)) return nil } diff --git a/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go b/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go index 8147bd89b5..39c6965f6e 100644 --- a/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go +++ b/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go @@ -109,8 +109,6 @@ func (d *DDLSink) WriteDDLEvent(ctx context.Context, ddl *model.DDLEvent) error if err != nil { return errors.Trace(err) } - log.Debug("write ddl event to external storage", - zap.String("path", path), zap.Any("ddl", ddl)) return d.statistics.RecordDDLExecution(func() error { err1 := d.storage.WriteFile(ctx, path, encodedDef) if err1 != nil { @@ -141,9 +139,6 @@ func (d *DDLSink) WriteCheckpointTs(ctx context.Context, ts uint64, tables []*model.TableInfo, ) error { if time.Since(d.lastSendCheckpointTsTime) < 2*time.Second { - log.Debug("skip write checkpoint ts to external storage", - zap.Any("changefeedID", d.id), - zap.Uint64("ts", ts)) return nil } diff --git a/cdc/sink/ddlsink/mq/mq_ddl_sink.go b/cdc/sink/ddlsink/mq/mq_ddl_sink.go index 0bf09117d9..b9c2e9703c 100644 --- a/cdc/sink/ddlsink/mq/mq_ddl_sink.go +++ b/cdc/sink/ddlsink/mq/mq_ddl_sink.go @@ -16,7 +16,6 @@ package mq import ( "context" - "github.com/pingcap/log" "github.com/pingcap/tiflow/cdc/model" "github.com/pingcap/tiflow/cdc/sink/ddlsink" "github.com/pingcap/tiflow/cdc/sink/ddlsink/mq/ddlproducer" @@ -27,7 +26,6 @@ import ( "github.com/pingcap/tiflow/pkg/sink" "github.com/pingcap/tiflow/pkg/sink/codec" "github.com/pingcap/tiflow/pkg/sink/kafka" - "go.uber.org/zap" ) // DDLDispatchRule is the dispatch rule for DDL event. @@ -106,21 +104,11 @@ func (k *DDLSink) WriteDDLEvent(ctx context.Context, ddl *model.DDLEvent) error return err } if msg == nil { - log.Info("Skip ddl event", zap.Uint64("commitTs", ddl.CommitTs), - zap.String("query", ddl.Query), - zap.String("protocol", k.protocol.String()), - zap.String("namespace", k.id.Namespace), - zap.String("changefeed", k.id.ID)) return nil } topic := k.eventRouter.GetTopicForDDL(ddl) partitionRule := getDDLDispatchRule(k.protocol) - log.Debug("Emit ddl event", - zap.Uint64("commitTs", ddl.CommitTs), - zap.String("query", ddl.Query), - zap.String("namespace", k.id.Namespace), - zap.String("changefeed", k.id.ID)) // Notice: We must call GetPartitionNum here, // which will be responsible for automatically creating topics when they don't exist. // If it is not called here and kafka has `auto.create.topics.enable` turned on, @@ -174,8 +162,6 @@ func (k *DDLSink) WriteCheckpointTs(ctx context.Context, if err != nil { return err } - log.Debug("Emit checkpointTs to default topic", - zap.String("topic", topic), zap.Uint64("checkpointTs", ts)) return k.producer.SyncBroadcastMessage(ctx, topic, partitionNum, msg) } var tableNames []model.TableName diff --git a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go index 2199eeb50f..0cdaa20403 100644 --- a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go +++ b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go @@ -138,22 +138,30 @@ func (m *DDLSink) execDDLWithMaxRetries(ctx context.Context, ddl *model.DDLEvent if errorutil.IsIgnorableMySQLDDLError(err) { // NOTE: don't change the log, some tests depend on it. 
log.Info("Execute DDL failed, but error can be ignored", - zap.Uint64("startTs", ddl.StartTs), zap.String("ddl", ddl.Query), zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), zap.Error(err)) // If the error is ignorable, we will ignore the error directly. return nil } if m.cfg.IsTiDB && ddlCreateTime != "" && errors.Cause(err) == mysql.ErrInvalidConn { - log.Warn("Wait the asynchronous ddl to synchronize", zap.String("ddl", ddl.Query), zap.String("ddlCreateTime", ddlCreateTime), - zap.String("readTimeout", m.cfg.ReadTimeout), zap.Error(err)) + log.Warn("Wait the asynchronous ddl to synchronize", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.String("ddlCreateTime", ddlCreateTime), + zap.String("readTimeout", m.cfg.ReadTimeout), + zap.Error(err)) return m.waitDDLDone(ctx, ddl, ddlCreateTime) } log.Warn("Execute DDL with error, retry later", - zap.Uint64("startTs", ddl.StartTs), zap.String("ddl", ddl.Query), zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), zap.Error(err)) return err } @@ -171,7 +179,14 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { // Convert vector type to string type for unsupport database if m.needFormat { if newQuery := formatQuery(ddl.Query); newQuery != ddl.Query { - log.Warn("format ddl query", zap.String("newQuery", newQuery), zap.String("query", ddl.Query), zap.String("collate", ddl.Collate), zap.String("charset", ddl.Charset)) + log.Info("format ddl query", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("newQuery", newQuery), + zap.String("query", ddl.Query), + zap.String("collate", ddl.Collate), + zap.String("charset", ddl.Charset)) ddl.Query = newQuery } } @@ -186,8 +201,6 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { }) start := time.Now() - log.Info("Start exec DDL", zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), - zap.Uint64("commitTs", ddl.CommitTs), zap.String("DDL", ddl.Query)) tx, err := m.db.BeginTx(ctx, nil) if err != nil { return err @@ -242,6 +255,7 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { log.Warn("Skip setting session timestamp due to failpoint", zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), zap.String("query", ddl.Query)) } if useSessionTimestamp && !skipSetTimestamp { @@ -249,7 +263,6 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { if err := setSessionTimestamp(ctx, tx, ddlTimestamp); err != nil { log.Error("Fail to set session timestamp for DDL", zap.Float64("timestamp", ddlTimestamp), - zap.Uint64("startTs", ddl.StartTs), zap.Uint64("commitTs", ddl.CommitTs), zap.String("query", ddl.Query), zap.Error(err)) @@ -261,12 +274,18 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { } if _, err = tx.ExecContext(ctx, ddl.Query); err != nil { - log.Error("Failed to ExecContext", zap.Any("err", err), zap.Any("query", ddl.Query)) + log.Error("Failed to ExecContext", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", 
ddl.Query), + zap.Error(err)) if useSessionTimestamp { if skipResetAfterDDL { log.Warn("Skip resetting session timestamp after DDL execution failure due to failpoint", zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), zap.String("query", ddl.Query)) } else if tsErr := resetSessionTimestamp(ctx, tx); tsErr != nil { log.Warn("Failed to reset session timestamp after DDL execution failure", @@ -291,6 +310,7 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { log.Warn("Skip resetting session timestamp after DDL execution due to failpoint", zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), zap.String("query", ddl.Query)) } else if err := resetSessionTimestamp(ctx, tx); err != nil { log.Error("Failed to reset session timestamp after DDL execution", zap.Error(err)) @@ -308,8 +328,9 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { logFields := []zap.Field{ zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), zap.Duration("duration", time.Since(start)), - zap.String("sql", ddl.Query), } if useSessionTimestamp { @@ -332,7 +353,12 @@ func (m *DDLSink) waitDDLDone(ctx context.Context, ddl *model.DDLEvent, ddlCreat return ctx.Err() case <-ticker.C: case <-ticker1.C: - log.Info("DDL is still running downstream, it blocks other DDL or DML events", zap.String("ddl", ddl.Query), zap.String("ddlCreateTime", ddlCreateTime)) + log.Info("DDL is still running downstream, it blocks other DDL or DML events", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.String("ddlCreateTime", ddlCreateTime)) } state, err := getDDLStateFromTiDB(ctx, m.db, ddl.Query, ddlCreateTime) @@ -341,7 +367,12 @@ func (m *DDLSink) waitDDLDone(ctx context.Context, ddl *model.DDLEvent, ddlCreat } switch state { case timodel.JobStateDone, timodel.JobStateSynced: - log.Info("DDL replicate success", zap.String("ddl", ddl.Query), zap.String("ddlCreateTime", ddlCreateTime)) + log.Info("DDL replicate success", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.String("ddlCreateTime", ddlCreateTime)) return nil case timodel.JobStateCancelled, timodel.JobStateRollingback, timodel.JobStateRollbackDone, timodel.JobStateCancelling: return errors.ErrExecDDLFailed.GenWithStackByArgs(ddl.Query) @@ -349,11 +380,23 @@ func (m *DDLSink) waitDDLDone(ctx context.Context, ddl *model.DDLEvent, ddlCreat switch ddl.Type { // returned immediately if not block dml case timodel.ActionAddIndex: - log.Info("DDL is running downstream", zap.String("ddl", ddl.Query), zap.String("ddlCreateTime", ddlCreateTime), zap.Any("ddlState", state)) + log.Info("DDL is running downstream", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.String("ddlCreateTime", ddlCreateTime), + zap.Any("ddlState", state)) return nil } default: - log.Warn("Unexpected DDL state, may not be found downstream, retry later", zap.String("ddl", ddl.Query), zap.String("ddlCreateTime", ddlCreateTime), zap.Any("ddlState", state)) + log.Warn("Unexpected DDL state, may not be found downstream, retry 
later", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.String("ddlCreateTime", ddlCreateTime), + zap.Any("ddlState", state)) return errors.ErrDDLStateNotFound.GenWithStackByArgs(state) } } diff --git a/pkg/txnutil/lock_resolver.go b/pkg/txnutil/lock_resolver.go index 6184dc0045..f89494d849 100644 --- a/pkg/txnutil/lock_resolver.go +++ b/pkg/txnutil/lock_resolver.go @@ -16,17 +16,14 @@ package txnutil import ( "bytes" "context" - "time" "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/kvrpcpb" - "github.com/pingcap/log" "github.com/pingcap/tiflow/cdc/model" tikverr "github.com/tikv/client-go/v2/error" "github.com/tikv/client-go/v2/tikv" "github.com/tikv/client-go/v2/tikvrpc" "github.com/tikv/client-go/v2/txnkv" - "go.uber.org/zap" ) // LockResolver resolves lock in the given region. @@ -53,25 +50,6 @@ const scanLockLimit = 1024 func (r *resolver) Resolve(ctx context.Context, regionID uint64, maxVersion uint64) (err error) { var totalLocks []*txnkv.Lock - - start := time.Now() - - defer func() { - // Only log when there are locks or error to avoid log flooding. - if len(totalLocks) != 0 || err != nil { - cost := time.Since(start) - log.Info("resolve lock finishes", - zap.Uint64("regionID", regionID), - zap.Int("lockCount", len(totalLocks)), - zap.Any("locks", totalLocks), - zap.Uint64("maxVersion", maxVersion), - zap.String("namespace", r.changefeed.Namespace), - zap.String("changefeed", r.changefeed.ID), - zap.Duration("duration", cost), - zap.Error(err)) - } - }() - // TODO test whether this function will kill active transaction req := tikvrpc.NewRequest(tikvrpc.CmdScanLock, &kvrpcpb.ScanLockRequest{ MaxVersion: maxVersion, From 305d006c8a615cdeb0a89fa9d04327c39d83d276 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 9 Apr 2026 22:03:00 +0800 Subject: [PATCH 09/24] add more code --- cdc/entry/schema_storage.go | 2 - cdc/owner/ddl_manager.go | 20 +----- cdc/puller/ddl_puller.go | 9 +-- cdc/sink/ddlsink/mysql/mysql_ddl_sink.go | 81 +++++++++++++++++++----- 4 files changed, 68 insertions(+), 44 deletions(-) diff --git a/cdc/entry/schema_storage.go b/cdc/entry/schema_storage.go index 8b91aaf79e..e2f4de1f34 100644 --- a/cdc/entry/schema_storage.go +++ b/cdc/entry/schema_storage.go @@ -212,8 +212,6 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { zap.String("changefeed", s.id.ID), zap.String("role", s.role.String()), zap.Int64("jobID", job.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS), zap.String("query", job.Query), zap.Error(err)) diff --git a/cdc/owner/ddl_manager.go b/cdc/owner/ddl_manager.go index 2e2237c8e9..77c43f1334 100644 --- a/cdc/owner/ddl_manager.go +++ b/cdc/owner/ddl_manager.go @@ -318,8 +318,6 @@ func (m *ddlManager) tick( log.Info("table is ineligible, skip the ddl", zap.String("namespace", m.changfeedID.Namespace), zap.String("changefeed", m.changfeedID.ID), - zap.String("schema", event.TableInfo.TableName.Schema), - zap.String("table", event.TableInfo.TableName.Table), zap.Uint64("commitTs", event.CommitTs), zap.String("query", event.Query)) continue @@ -373,18 +371,9 @@ func (m *ddlManager) tick( if m.shouldExecDDL(nextDDL) { if m.executingDDL == nil { - tableInfo := nextDDL.TableInfo - if tableInfo == nil { - tableInfo = nextDDL.PreTableInfo - } - schemaName, tableName := "", "" - if tableInfo != nil { - schemaName = tableInfo.TableName.Schema 
- tableName = tableInfo.TableName.Table - } log.Info("execute a ddl event", - zap.String("schema", schemaName), - zap.String("table", tableName), + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), zap.Uint64("commitTs", nextDDL.CommitTs), zap.String("query", nextDDL.Query)) m.executingDDL = nextDDL @@ -642,17 +631,12 @@ func (m *ddlManager) cleanCache(msg string) { tableInfo = m.executingDDL.PreTableInfo } var tableName model.TableName - schemaName, table := "", "" if tableInfo != nil { tableName = tableInfo.TableName - schemaName = tableInfo.TableName.Schema - table = tableInfo.TableName.Table } log.Info(msg, zap.String("namespace", m.changfeedID.Namespace), zap.String("changefeed", m.changfeedID.ID), - zap.String("schema", schemaName), - zap.String("table", table), zap.Uint64("commitTs", m.executingDDL.CommitTs), zap.String("query", m.executingDDL.Query)) diff --git a/cdc/puller/ddl_puller.go b/cdc/puller/ddl_puller.go index f4a77412c7..c82ac3fcc8 100644 --- a/cdc/puller/ddl_puller.go +++ b/cdc/puller/ddl_puller.go @@ -327,8 +327,6 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Int64("jobID", job.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), zap.String("query", job.Query), @@ -342,8 +340,6 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Int64("jobID", job.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), zap.String("query", job.Query), @@ -363,8 +359,6 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Int64("jobID", job.ID), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), zap.Uint64("startTs", job.StartTS), zap.Uint64("finishTs", job.BinlogInfo.FinishedTS), zap.String("query", job.Query), @@ -580,8 +574,9 @@ func (p *ddlJobPullerImpl) handleRenameTables(job *timodel.Job) (skip bool, err if shouldDiscardOldTable && shouldDiscardNewTable { // skip a rename table ddl only when its old table name and new table name are both filtered. log.Info("RenameTables is filtered", + zap.String("namespace", p.changefeedID.Namespace), + zap.String("changefeed", p.changefeedID.ID), zap.Int64("tableID", tableInfo.ID), - zap.String("schema", info.OldSchemaName.O), zap.String("query", job.Query)) continue } diff --git a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go index 0cdaa20403..bb3f5955a1 100644 --- a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go +++ b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go @@ -131,7 +131,7 @@ func (m *DDLSink) WriteDDLEvent(ctx context.Context, ddl *model.DDLEvent) error // If the downstream is TiDB, it will query the DDL and wait until it finishes. // For 'add index' ddl, it will return immediately without waiting and will query it during the next DDL execution. 
func (m *DDLSink) execDDLWithMaxRetries(ctx context.Context, ddl *model.DDLEvent) error { - ddlCreateTime := getDDLCreateTime(ctx, m.db) + ddlCreateTime := getDDLCreateTime(ctx, m.id, m.db) return retry.Do(ctx, func() error { err := m.statistics.RecordDDLExecution(func() error { return m.execDDL(ctx, ddl) }) if err != nil { @@ -236,7 +236,10 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { zap.String("changefeed", m.id.ID), zap.Error(err)) if rbErr := tx.Rollback(); rbErr != nil { - log.Error("Failed to rollback", zap.String("changefeed", m.id.ID), zap.Error(rbErr)) + log.Error("Failed to rollback", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Error(rbErr)) } return err } @@ -262,12 +265,17 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { // set the session timestamp to match upstream DDL execution time if err := setSessionTimestamp(ctx, tx, ddlTimestamp); err != nil { log.Error("Fail to set session timestamp for DDL", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), zap.Float64("timestamp", ddlTimestamp), zap.Uint64("commitTs", ddl.CommitTs), zap.String("query", ddl.Query), zap.Error(err)) if rbErr := tx.Rollback(); rbErr != nil { - log.Error("Failed to rollback", zap.String("changefeed", m.id.ID), zap.Error(rbErr)) + log.Error("Failed to rollback", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Error(rbErr)) } return err } @@ -298,8 +306,8 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { log.Error("Failed to rollback", zap.String("namespace", m.id.Namespace), zap.String("changefeed", m.id.ID), - zap.String("sql", ddl.Query), - zap.Error(err)) + zap.String("query", ddl.Query), + zap.Error(rbErr)) } return err } @@ -313,9 +321,18 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { zap.Uint64("commitTs", ddl.CommitTs), zap.String("query", ddl.Query)) } else if err := resetSessionTimestamp(ctx, tx); err != nil { - log.Error("Failed to reset session timestamp after DDL execution", zap.Error(err)) + log.Error("Failed to reset session timestamp after DDL execution", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.Error(err)) if rbErr := tx.Rollback(); rbErr != nil { - log.Error("Failed to rollback", zap.String("sql", ddl.Query), zap.Error(rbErr)) + log.Error("Failed to rollback", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.String("query", ddl.Query), + zap.Error(rbErr)) } return errors.WrapError(errors.ErrMySQLTxnError, errors.WithMessage(err, fmt.Sprintf("Query info: %s; ", ddl.Query))) } @@ -361,9 +378,14 @@ func (m *DDLSink) waitDDLDone(ctx context.Context, ddl *model.DDLEvent, ddlCreat zap.String("ddlCreateTime", ddlCreateTime)) } - state, err := getDDLStateFromTiDB(ctx, m.db, ddl.Query, ddlCreateTime) + state, err := getDDLStateFromTiDB(ctx, m.id, m.db, ddl, ddlCreateTime) if err != nil { - log.Error("Error when getting DDL state from TiDB", zap.Error(err)) + log.Error("Error when getting DDL state from TiDB", + zap.String("namespace", m.id.Namespace), + zap.String("changefeed", m.id.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.Error(err)) } switch state { case timodel.JobStateDone, timodel.JobStateSynced: @@ -454,16 +476,22 @@ func needFormatDDL(db *sql.DB, cfg 
*pmysql.Config) bool { return false } -func getDDLCreateTime(ctx context.Context, db *sql.DB) string { +func getDDLCreateTime(ctx context.Context, changefeedID model.ChangeFeedID, db *sql.DB) string { ddlCreateTime := "" // default when scan failed row, err := db.QueryContext(ctx, "BEGIN; SET @ticdc_ts := TIDB_PARSE_TSO(@@tidb_current_ts); ROLLBACK; SELECT @ticdc_ts; SET @ticdc_ts=NULL;") if err != nil { - log.Warn("selecting tidb current timestamp failed", zap.Error(err)) + log.Warn("selecting tidb current timestamp failed", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Error(err)) } else { for row.Next() { err = row.Scan(&ddlCreateTime) if err != nil { - log.Warn("getting ddlCreateTime failed", zap.Error(err)) + log.Warn("getting ddlCreateTime failed", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Error(err)) } } //nolint:sqlclosecheck @@ -474,21 +502,37 @@ func getDDLCreateTime(ctx context.Context, db *sql.DB) string { } // getDDLStateFromTiDB retrieves the ddl job status of the ddl query from downstream tidb based on the ddl query and the approximate ddl create time. -func getDDLStateFromTiDB(ctx context.Context, db *sql.DB, ddl string, createTime string) (timodel.JobState, error) { +func getDDLStateFromTiDB( + ctx context.Context, + changefeedID model.ChangeFeedID, + db *sql.DB, + ddl *model.DDLEvent, + createTime string, +) (timodel.JobState, error) { // ddlCreateTime and createTime are both based on UTC timezone of downstream showJobs := fmt.Sprintf(`SELECT JOB_ID, JOB_TYPE, SCHEMA_STATE, SCHEMA_ID, TABLE_ID, STATE, QUERY FROM information_schema.ddl_jobs - WHERE CREATE_TIME >= "%s" AND QUERY = "%s";`, createTime, ddl) + WHERE CREATE_TIME >= "%s" AND QUERY = "%s";`, createTime, ddl.Query) var jobsResults [][]string err := retry.Do(ctx, func() error { //nolint:rowserrcheck jobsRows, err := db.QueryContext(ctx, showJobs) if err != nil { - log.Warn("failed to query from downstream to get ddl state", zap.Error(err)) + log.Warn("failed to query from downstream to get ddl state", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.Error(err)) return err } jobsResults, err = export.GetSpecifiedColumnValuesAndClose(jobsRows, "QUERY", "STATE", "JOB_ID", "JOB_TYPE", "SCHEMA_STATE") if err != nil { - log.Warn("get jobs results failed", zap.Error(err)) + log.Warn("get jobs results failed", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Uint64("commitTs", ddl.CommitTs), + zap.String("query", ddl.Query), + zap.Error(err)) return err } return nil @@ -503,10 +547,13 @@ func getDDLStateFromTiDB(ctx context.Context, db *sql.DB, ddl string, createTime result := jobsResults[0] state, jobID, jobType, schemaState := result[1], result[2], result[3], result[4] log.Debug("Find ddl state in downstream", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Uint64("commitTs", ddl.CommitTs), zap.String("jobID", jobID), zap.String("jobType", jobType), zap.String("schemaState", schemaState), - zap.String("ddl", ddl), + zap.String("query", ddl.Query), zap.String("state", state), zap.Any("jobsResults", jobsResults), ) From 6ce888c65a74a00ed82d11d2719dc39ea0946dff Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Thu, 9 Apr 2026 23:29:58 +0800 Subject: [PATCH 10/24] fix code --- 
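For reference, the timing checks touched in pkg/sink/kafka/sarama_factory.go below move from float-second comparisons (time.Since(start).Seconds() > 2) to direct time.Duration comparisons logged with zap.Duration, and SubscriptionID values are logged as zap.Uint64(..., uint64(id)) instead of zap.Any. A minimal sketch of the duration pattern, using a hypothetical warnIfSlow helper that is not part of this change:

	package example

	import (
		"time"

		"github.com/pingcap/log"
		"go.uber.org/zap"
	)

	// warnIfSlow runs fn and emits a warning when it takes longer than threshold.
	// Comparing time.Duration values directly avoids the lossy float conversion,
	// and zap.Duration keeps the unit explicit in the log output.
	func warnIfSlow(name string, threshold time.Duration, fn func() error) error {
		start := time.Now()
		err := fn()
		if elapsed := time.Since(start); elapsed > threshold {
			log.Warn(name+" cost too much time",
				zap.Duration("duration", elapsed))
		}
		return err
	}
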
cdc/kv/shared_client.go | 38 ++++++++++++++++---------------- cdc/kv/shared_region_worker.go | 4 ++-- cdc/kv/shared_stream.go | 6 ++--- pkg/sink/kafka/sarama_factory.go | 18 +++++++-------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index 29e2a7bd68..eff9b7c7e5 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -296,7 +296,7 @@ func (s *SharedClient) Subscribe(subID SubscriptionID, span tablepb.Span, startT log.Info("event feed subscribes table success", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", rt.subscriptionID), + zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), zap.String("span", rt.span.String())) } @@ -312,14 +312,14 @@ func (s *SharedClient) Unsubscribe(subID SubscriptionID) { log.Info("event feed unsubscribes table", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", rt.subscriptionID), + zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), zap.String("span", rt.span.String())) return } log.Warn("event feed unsubscribes table, but not found", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subID)) + zap.Uint64("subscriptionID", uint64(subID))) } // ResolveLock is a function. If outsider subscribers find a span resolved timestamp is @@ -386,7 +386,7 @@ func (s *SharedClient) setTableStopped(rt *subscribedTable) { log.Info("event feed starts to stop table", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", rt.subscriptionID), + zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), zap.Int64("tableID", rt.span.TableID)) // Set stopped to true so we can stop handling region events from the table. 
@@ -404,7 +404,7 @@ func (s *SharedClient) onTableDrained(rt *subscribedTable) { log.Info("event feed stop table is finished", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", rt.subscriptionID), + zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), zap.Int64("tableID", rt.span.TableID)) s.totalSpans.Lock() @@ -445,7 +445,7 @@ func (s *SharedClient) handleRegions(ctx context.Context, eg *errgroup.Group) er zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("streamID", stream.streamID), - zap.Any("subscriptionID", region.subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(region.subscribedTable.subscriptionID)), zap.Uint64("regionID", region.verID.GetID()), zap.String("span", region.span.String()), zap.String("addr", store.storeAddr)) @@ -466,7 +466,7 @@ func (s *SharedClient) attachRPCContextForRegion(ctx context.Context, region reg log.Debug("event feed get RPC context fail", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", region.subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(region.subscribedTable.subscriptionID)), zap.Uint64("regionID", region.verID.GetID()), zap.Error(err)) } @@ -549,7 +549,7 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( log.Debug("event feed is going to load regions", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), zap.Any("span", nextSpan)) backoff := tikv.NewBackoffer(ctx, tikvRequestMaxBackoff) @@ -558,7 +558,7 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( log.Warn("event feed load regions failed", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), zap.String("span", nextSpan.String()), zap.Error(err)) backoffBeforeLoad = true @@ -576,7 +576,7 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( log.Warn("event feed load regions with holes", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), zap.String("span", nextSpan.String())) backoffBeforeLoad = true continue @@ -595,7 +595,7 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( log.Panic("event feed check spans intersect shouldn't fail", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), zap.String("span", nextSpan.String())) } @@ -681,7 +681,7 @@ func (s *SharedClient) doHandleError(ctx context.Context, errInfo regionErrorInf s.logRegionDetails("cdc region error", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", errInfo.subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(errInfo.subscribedTable.subscriptionID)), zap.Uint64("regionID", errInfo.verID.GetID()), zap.Int64("tableID", errInfo.span.TableID), zap.Stringer("error", innerErr)) @@ -727,7 
+727,7 @@ func (s *SharedClient) doHandleError(ctx context.Context, errInfo regionErrorInf log.Warn("empty or unknown cdc error", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", errInfo.subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(errInfo.subscribedTable.subscriptionID)), zap.Uint64("regionID", errInfo.verID.GetID()), zap.Int64("tableID", errInfo.span.TableID), zap.Stringer("error", innerErr)) @@ -756,7 +756,7 @@ func (s *SharedClient) doHandleError(ctx context.Context, errInfo regionErrorInf log.Warn("event feed meets an internal error, fail the changefeed", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", errInfo.subscribedTable.subscriptionID), + zap.Uint64("subscriptionID", uint64(errInfo.subscribedTable.subscriptionID)), zap.Uint64("regionID", errInfo.verID.GetID()), zap.Int64("tableID", errInfo.span.TableID), zap.Error(err)) @@ -841,7 +841,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { log.Info("event feed finds a initialized slow region", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), zap.Any("slowRegion", attr.SlowestRegion)) } @@ -850,14 +850,14 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { log.Info("event feed initializes a region too slow", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), zap.Any("slowRegion", attr.SlowestRegion)) } else if currTime.Sub(ckptTime) > 10*time.Minute { log.Info("event feed finds a uninitialized slow region", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), zap.Any("slowRegion", attr.SlowestRegion)) } @@ -865,7 +865,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { log.Info("event feed holes exist", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), zap.Any("holes", attr.UnLockedRanges)) } @@ -910,7 +910,7 @@ func (r *subscribedTable) resolveStaleLocks(s *SharedClient, targetTs uint64) { s.logRegionDetails("event feed finds slow locked ranges", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), - zap.Any("subscriptionID", r.subscriptionID), + zap.Uint64("subscriptionID", uint64(r.subscriptionID)), zap.Any("ranges", res)) } diff --git a/cdc/kv/shared_region_worker.go b/cdc/kv/shared_region_worker.go index f00b131d3d..1efa0db8b8 100644 --- a/cdc/kv/shared_region_worker.go +++ b/cdc/kv/shared_region_worker.go @@ -173,7 +173,7 @@ func (w *sharedRegionWorker) handleSingleRegionError(state *regionFeedState, str zap.String("namespace", w.changefeed.Namespace), zap.String("changefeed", w.changefeed.ID), zap.Uint64("streamID", stream.streamID), - zap.Any("subscriptionID", state.getRegionID()), + zap.Uint64("subscriptionID", state.getRegionID()), 
zap.Uint64("regionID", state.region.verID.GetID()), zap.Int64("tableID", state.region.span.TableID), zap.Bool("reschedule", stepsToRemoved), @@ -427,7 +427,7 @@ func (w *sharedRegionWorker) forwardResolvedTsToPullerFrontier(ctx context.Conte log.Debug("region worker get a ResolvedTs", zap.String("namespace", w.changefeed.Namespace), zap.String("changefeed", w.changefeed.ID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Uint64("ResolvedTs", batch.ts), zap.Int("spanCount", len(spansAndChan.spans))) if len(spansAndChan.spans) > 0 { diff --git a/cdc/kv/shared_stream.go b/cdc/kv/shared_stream.go index cdf9429158..cfc0c7b137 100644 --- a/cdc/kv/shared_stream.go +++ b/cdc/kv/shared_stream.go @@ -323,7 +323,7 @@ func (s *requestedStream) send(ctx context.Context, c *SharedClient, rs *request zap.String("namespace", c.changefeed.Namespace), zap.String("changefeed", c.changefeed.ID), zap.Uint64("streamID", s.streamID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", region.span.TableID), zap.Uint64("regionID", req.RegionId), zap.String("addr", rs.storeAddr), @@ -372,7 +372,7 @@ func (s *requestedStream) send(ctx context.Context, c *SharedClient, rs *request zap.String("namespace", c.changefeed.Namespace), zap.String("changefeed", c.changefeed.ID), zap.Uint64("streamID", s.streamID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Uint64("regionID", region.verID.GetID()), zap.Int64("tableID", region.span.TableID), zap.String("addr", rs.storeAddr), @@ -477,7 +477,7 @@ func (s *requestedStream) sendRegionChangeEvents( zap.String("namespace", c.changefeed.Namespace), zap.String("changefeed", c.changefeed.ID), zap.Uint64("streamID", s.streamID), - zap.Any("subscriptionID", subscriptionID), + zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Uint64("regionID", event.RegionId), zap.Bool("stateIsNil", state == nil), zap.Any("error", x.Error), diff --git a/pkg/sink/kafka/sarama_factory.go b/pkg/sink/kafka/sarama_factory.go index 8811f75b18..548d2fda6c 100644 --- a/pkg/sink/kafka/sarama_factory.go +++ b/pkg/sink/kafka/sarama_factory.go @@ -48,9 +48,9 @@ func NewSaramaFactory( func (f *saramaFactory) AdminClient(ctx context.Context) (ClusterAdminClient, error) { start := time.Now() config, err := NewSaramaConfig(ctx, f.option) - duration := time.Since(start).Seconds() - if duration > 2 { - log.Warn("new sarama config cost too much time", zap.Any("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) + duration := time.Since(start) + if duration > 2*time.Second { + log.Warn("new sarama config cost too much time", zap.Duration("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) } if err != nil { return nil, err @@ -58,9 +58,9 @@ func (f *saramaFactory) AdminClient(ctx context.Context) (ClusterAdminClient, er start = time.Now() client, err := sarama.NewClient(f.option.BrokerEndpoints, config) - duration = time.Since(start).Seconds() - if duration > 2 { - log.Warn("new sarama client cost too much time", zap.Any("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) + duration = time.Since(start) + if duration > 2*time.Second { + log.Warn("new sarama client cost too much time", zap.Duration("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) } if err != nil { return nil, errors.Trace(err) @@ -68,9 +68,9 @@ func (f *saramaFactory) AdminClient(ctx 
context.Context) (ClusterAdminClient, er start = time.Now() admin, err := sarama.NewClusterAdminFromClient(client) - duration = time.Since(start).Seconds() - if duration > 2 { - log.Warn("new sarama cluster admin cost too much time", zap.Any("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) + duration = time.Since(start) + if duration > 2*time.Second { + log.Warn("new sarama cluster admin cost too much time", zap.Duration("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) } if err != nil { return nil, errors.Trace(err) From caa49e043ad888954a3e3d3db9b1f951229ee0e7 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 10:25:23 +0800 Subject: [PATCH 11/24] remove a lot of verbose logs. --- cdc/processor/processor.go | 14 ++--------- cdc/processor/sinkmanager/manager.go | 14 +---------- cdc/scheduler/internal/v3/agent/agent.go | 9 +------- cdc/scheduler/internal/v3/agent/table.go | 16 ------------- cdc/scheduler/internal/v3/coordinator.go | 8 ------- .../v3/replication/replication_manager.go | 23 ------------------- .../internal/v3/transport/transport.go | 7 ------ 7 files changed, 4 insertions(+), 87 deletions(-) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 38ac7f5064..5f8c17fd8f 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -286,16 +286,6 @@ func (p *processor) IsAddTableSpanFinished(span tablepb.Span, isPrepare bool) bo return state == tablepb.TableStateReplicating } if !done() { - log.Debug("Add Table not finished", - zap.String("captureID", p.captureInfo.ID), - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), - zap.Uint64("tableResolvedTs", tableResolvedTs), - zap.Uint64("tableCheckpointTs", tableCheckpointTs), - zap.Uint64("globalCheckpointTs", globalCheckpointTs), - zap.Any("state", state), - zap.Bool("isPrepare", isPrepare)) return false } @@ -349,8 +339,8 @@ func (p *processor) IsRemoveTableSpanFinished(span tablepb.Span) (model.Ts, bool zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), - zap.Uint64("checkpointTs", stats.CheckpointTs)) + zap.Uint64("checkpointTs", stats.CheckpointTs) + zap.Stringer("span", &span)) return stats.CheckpointTs, true } diff --git a/cdc/processor/sinkmanager/manager.go b/cdc/processor/sinkmanager/manager.go index 2d1e174ad9..2da347c38d 100644 --- a/cdc/processor/sinkmanager/manager.go +++ b/cdc/processor/sinkmanager/manager.go @@ -852,12 +852,6 @@ func (m *SinkManager) AddTable(span tablepb.Span, startTs model.Ts, targetTs mod } m.sinkMemQuota.AddTable(span) m.redoMemQuota.AddTable(span) - log.Info("Add table sink", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), - zap.Uint64("startTs", startTs), - zap.Uint64("version", sinkWrapper.version)) return sinkWrapper } @@ -920,19 +914,13 @@ func (m *SinkManager) RemoveTable(span tablepb.Span) { // NOTICE: It is safe to only remove the table sink from the map. // Because if we found the table sink is closed, we will not add it back to the heap. // Also, no need to GC the SortEngine. Because the SortEngine also removes this table. 
- value, exists := m.tableSinks.LoadAndDelete(span) + _, exists := m.tableSinks.LoadAndDelete(span) if !exists { log.Panic("Remove an unexist table sink", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), zap.Stringer("span", &span)) } - checkpointTs := value.(*tableSinkWrapper).getCheckpointTs() - log.Info("Remove table sink successfully", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), - zap.Uint64("checkpointTs", checkpointTs.Ts)) } // GetAllCurrentTableSpans returns all spans in the sinkManager. diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go index d2dd1bbfa7..6ac7435c79 100644 --- a/cdc/scheduler/internal/v3/agent/agent.go +++ b/cdc/scheduler/internal/v3/agent/agent.go @@ -302,13 +302,6 @@ func (a *agent) handleMessageHeartbeat(request *schedulepb.Heartbeat) (*schedule MsgType: schedulepb.MsgHeartbeatResponse, HeartbeatResponse: response, } - - log.Debug("schedulerv3: agent generate heartbeat response", - zap.String("capture", a.CaptureID), - zap.String("namespace", a.ChangeFeedID.Namespace), - zap.String("changefeed", a.ChangeFeedID.ID), - zap.Any("message", message)) - return message, request.GetBarrier() } @@ -442,7 +435,7 @@ func (a *agent) handleOwnerInfo(id model.CaptureID, revision int64, version stri } // staled owner heartbeat, just ignore it. - log.Info("schedulerv3: message from staled owner", + log.Debug("schedulerv3: message from staled owner", zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index 63fecbafc8..6691e67bb0 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -58,12 +58,6 @@ func (t *tableSpan) getAndUpdateTableSpanState() (tablepb.TableState, bool) { t.state = meta.State if oldState != t.state { - log.Debug("schedulerv3: table state changed", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), - zap.Stringer("oldState", oldState), - zap.Stringer("state", t.state)) return t.state, true } return t.state, false @@ -264,11 +258,6 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { zap.Stringer("task.TableID", &task.Span)) } if t.task == nil { - log.Info("schedulerv3: table found new task", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), - zap.Any("task", task)) t.task = task return } @@ -375,11 +364,6 @@ func (tm *tableSpanManager) dropTableSpan(span tablepb.Span) { zap.String("span", span.String()), zap.Stringer("state", table.state)) } - - log.Debug("schedulerv3: tableManager drop table", - zap.String("namespace", tm.changefeedID.Namespace), - zap.String("changefeed", tm.changefeedID.ID), - zap.String("span", span.String())) tm.tables.Delete(span) } diff --git a/cdc/scheduler/internal/v3/coordinator.go b/cdc/scheduler/internal/v3/coordinator.go index 2ff15e5162..8b8eb577f7 100644 --- a/cdc/scheduler/internal/v3/coordinator.go +++ b/cdc/scheduler/internal/v3/coordinator.go @@ -273,14 +273,6 @@ func (c *coordinator) poll( barrier *schedulepb.BarrierWithMinTs, ) (watermark schedulepb.Watermark, err error) { c.maybeCollectMetrics() - if c.compat.UpdateCaptureInfo(aliveCaptures) { 
- spanReplicationEnabled := c.compat.CheckSpanReplicationEnabled() - log.Info("schedulerv3: compat update capture info", - zap.String("namespace", c.changefeedID.Namespace), - zap.String("changefeed", c.changefeedID.ID), - zap.Any("captures", aliveCaptures), - zap.Bool("spanReplicationEnabled", spanReplicationEnabled)) - } recvMsgs, err := c.recvMsgs(ctx) if err != nil { diff --git a/cdc/scheduler/internal/v3/replication/replication_manager.go b/cdc/scheduler/internal/v3/replication/replication_manager.go index 6688fb1457..654a8bf5b5 100644 --- a/cdc/scheduler/internal/v3/replication/replication_manager.go +++ b/cdc/scheduler/internal/v3/replication/replication_manager.go @@ -284,11 +284,6 @@ func (r *Manager) handleMessageHeartbeatResponse( return nil, errors.Trace(err) } if table.hasRemoved() { - log.Info("schedulerv3: table has removed", - zap.String("namespace", r.changefeedID.Namespace), - zap.String("changefeed", r.changefeedID.ID), - zap.Any("from", from), - zap.Int64("tableID", status.Span.TableID)) r.spans.Delete(status.Span) } sentMsgs = append(sentMsgs, msgs...) @@ -326,10 +321,6 @@ func (r *Manager) handleMessageDispatchTableResponse( return nil, errors.Trace(err) } if table.hasRemoved() { - log.Info("schedulerv3: table has removed", - zap.String("namespace", r.changefeedID.Namespace), - zap.String("changefeed", r.changefeedID.ID), - zap.Int64("tableID", status.Span.TableID)) r.spans.Delete(status.Span) } return msgs, nil @@ -452,10 +443,6 @@ func (r *Manager) handleRemoveTableTask( r.acceptRemoveTableTask++ table, _ := r.spans.Get(task.Span) if table.hasRemoved() { - log.Info("schedulerv3: table has removed", - zap.String("namespace", r.changefeedID.Namespace), - zap.String("changefeed", r.changefeedID.ID), - zap.Int64("tableID", task.Span.TableID)) r.spans.Delete(task.Span) return nil, nil } @@ -481,16 +468,6 @@ func (r *Manager) handleBurstBalanceTasks( for _, task := range task.RemoveTables { perCapture[task.CaptureID]++ } - fields := make([]zap.Field, 0) - for captureID, count := range perCapture { - fields = append(fields, zap.Int(captureID, count)) - } - fields = append(fields, zap.Int("addTable", len(task.AddTables))) - fields = append(fields, zap.Int("removeTable", len(task.RemoveTables))) - fields = append(fields, zap.Int("moveTable", len(task.MoveTables))) - fields = append(fields, zap.String("namespace", r.changefeedID.Namespace)) - fields = append(fields, zap.String("changefeed", r.changefeedID.ID)) - log.Info("schedulerv3: handle burst balance task", fields...) sentMsgs := make([]*schedulepb.Message, 0, len(task.AddTables)) for i := range task.AddTables { diff --git a/cdc/scheduler/internal/v3/transport/transport.go b/cdc/scheduler/internal/v3/transport/transport.go index ade03146df..8fb2d86754 100644 --- a/cdc/scheduler/internal/v3/transport/transport.go +++ b/cdc/scheduler/internal/v3/transport/transport.go @@ -159,13 +159,6 @@ func (t *p2pTransport) Send( return errors.Trace(err) } } - - if len(msgs) != 0 { - log.Debug("schedulerv3: all messages sent", - zap.String("namespace", t.changefeed.Namespace), - zap.String("changefeed", t.changefeed.ID), - zap.Int("len", len(msgs))) - } return nil } From 633b69b58f8b80df75634117beab51c66f51a8e6 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 11:09:22 +0800 Subject: [PATCH 12/24] remove a lot of verbose logs. 
--- cdc/processor/processor.go | 2 +- cdc/scheduler/internal/v3/agent/table.go | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 5f8c17fd8f..b8e5f44923 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -339,7 +339,7 @@ func (p *processor) IsRemoveTableSpanFinished(span tablepb.Span) (model.Ts, bool zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Uint64("checkpointTs", stats.CheckpointTs) + zap.Uint64("checkpointTs", stats.CheckpointTs), zap.Stringer("span", &span)) return stats.CheckpointTs, true diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index 6691e67bb0..f17e49eb5f 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -181,20 +181,12 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. } state, changed = t.getAndUpdateTableSpanState() case tablepb.TableStateReplicating: - log.Info("schedulerv3: table is replicating", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), zap.Stringer("state", state)) t.task = nil status := t.getTableSpanStatus(false) return newAddTableResponseMessage(status), nil case tablepb.TableStatePrepared: if t.task.IsPrepare { // `prepared` is a stable state, if the task was to prepare the table. - log.Info("schedulerv3: table is prepared", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), zap.Stringer("state", state)) t.task = nil return newAddTableResponseMessage(t.getTableSpanStatus(false)), nil } @@ -226,10 +218,6 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. 
return nil, nil } state, changed = t.getAndUpdateTableSpanState() - log.Info("schedulerv3: add table finished", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), zap.Stringer("state", state)) case tablepb.TableStateStopping, tablepb.TableStateStopped: log.Warn("schedulerv3: ignore add table", @@ -258,6 +246,11 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { zap.Stringer("task.TableID", &task.Span)) } if t.task == nil { + log.Info("schedulerv3: table found new task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Any("tableSpan", t.span), + zap.Any("task", task)) t.task = task return } From e3e114d96cfe89fa0eaf1d169e27aac77c374a2f Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 11:28:43 +0800 Subject: [PATCH 13/24] refactor burst logs --- .../internal/v3/scheduler/scheduler_basic.go | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go index 1ce7244b69..8c3906f4e5 100644 --- a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go +++ b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go @@ -144,6 +144,7 @@ func newBurstAddTables( ) *replication.ScheduleTask { idx := 0 tables := make([]replication.AddTable, 0, len(newSpans)) + tablesPerCapture := make(map[model.CaptureID][]int64, len(captureIDs)) for _, span := range newSpans { targetCapture := captureIDs[idx] tables = append(tables, replication.AddTable{ @@ -151,17 +152,19 @@ func newBurstAddTables( CaptureID: targetCapture, CheckpointTs: checkpointTs, }) - log.Info("schedulerv3: burst add table", - zap.String("namespace", changefeedID.Namespace), - zap.String("changefeed", changefeedID.ID), - zap.String("captureID", targetCapture), - zap.Any("tableID", span.TableID)) - + tablesPerCapture[targetCapture] = append(tablesPerCapture[targetCapture], span.TableID) idx++ if idx >= len(captureIDs) { idx = 0 } } + for captureID, tableIDs := range tablesPerCapture { + log.Info("schedulerv3: burst add tables", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.String("captureID", captureID), + zap.Int64s("tableIDs", tableIDs)) + } return &replication.ScheduleTask{ BurstBalance: &replication.BurstBalance{ AddTables: tables, @@ -174,6 +177,7 @@ func newBurstRemoveTables( changefeedID model.ChangeFeedID, ) *replication.ScheduleTask { tables := make([]replication.RemoveTable, 0, len(rmSpans)) + tablesPerCapture := make(map[model.CaptureID][]int64) for _, span := range rmSpans { rep := replications.GetV(span) var captureID model.CaptureID @@ -193,17 +197,21 @@ func newBurstRemoveTables( Span: span, CaptureID: captureID, }) - log.Info("schedulerv3: burst remove table", - zap.String("namespace", changefeedID.Namespace), - zap.String("changefeed", changefeedID.ID), - zap.String("captureID", captureID), - zap.Any("tableID", span.TableID)) + tablesPerCapture[captureID] = append(tablesPerCapture[captureID], span.TableID) } if len(tables) == 0 { return nil } + for captureID, tableIDs := range tablesPerCapture { + log.Info("schedulerv3: burst remove table", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.String("captureID", captureID), + zap.Int64s("tableIDs", tableIDs)) + } + return &replication.ScheduleTask{ BurstBalance: 
&replication.BurstBalance{ RemoveTables: tables, From 04b03c0bbb7cb1030c5a86618c852a833e31588a Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 13:48:34 +0800 Subject: [PATCH 14/24] add more code --- .../v3/replication/replication_set.go | 302 +++++++++++++----- 1 file changed, 224 insertions(+), 78 deletions(-) diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index 8e1b3d4574..22914ccab2 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -198,9 +198,7 @@ func NewReplicationSet( case tablepb.TableStateReplicating: if len(r.Primary) != 0 { return nil, r.multiplePrimaryError( - table, captureID, "schedulerv3: multiple primary", - zap.Any("primary", r.Primary), - zap.Any("status", tableStatus)) + table, captureID, "schedulerv3: multiple primary") } // Recognize primary if it's table is in replicating state. err := r.setCapture(captureID, RoleSecondary) @@ -229,12 +227,17 @@ func NewReplicationSet( // capture is primary, and is still replicating data to downstream. // We need to wait its state becomes Stopped or Absent before // proceeding further scheduling. + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: found a stopping capture during initializing", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", table.Span.TableID), - zap.Any("replicationSet", r), - zap.Any("status", tableStatus)) + zap.String("captureID", captureID), + zap.Any("checkpoint", table.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", table), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) err := r.setCapture(captureID, RoleUndetermined) if err != nil { return nil, errors.Trace(err) @@ -244,12 +247,17 @@ func NewReplicationSet( tablepb.TableStateStopped: // Ignore stop state. 
default: + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: unknown table state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", table.Span.TableID), - zap.Any("replicationSet", r), - zap.Any("status", tableStatus)) + zap.String("captureID", captureID), + zap.Any("checkpoint", table.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", table), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) } } @@ -271,11 +279,6 @@ func NewReplicationSet( if r.State == ReplicationSetStateUnknown && len(r.Captures) == stoppingCount { r.State = ReplicationSetStateRemoving } - log.Info("schedulerv3: initialize replication set", - zap.String("namespace", r.Changefeed.Namespace), - zap.String("changefeed", r.Changefeed.ID), - zap.Any("replicationSet", r)) - return r, nil } @@ -325,11 +328,16 @@ func (r *ReplicationSet) clearCapture(captureID model.CaptureID, role Role) erro func (r *ReplicationSet) promoteSecondary(captureID model.CaptureID) error { if r.Primary == captureID { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: capture is already promoted as the primary", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil } role, ok := r.Captures[captureID] @@ -355,12 +363,17 @@ func (r *ReplicationSet) clearPrimary() { func (r *ReplicationSet) inconsistentError( input *tablepb.TableStatus, captureID model.CaptureID, msg string, fields ...zap.Field, ) error { + secondary, _ := r.getRole(RoleSecondary) fields = append(fields, []zap.Field{ zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), zap.Stringer("tableState", input), - zap.Any("replicationSet", r), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span), }...) log.L().WithOptions(zap.AddCallerSkip(1)).Error(msg, fields...) return errors.ErrReplicationSetInconsistent.GenWithStackByArgs( @@ -370,12 +383,17 @@ func (r *ReplicationSet) inconsistentError( func (r *ReplicationSet) multiplePrimaryError( input *tablepb.TableStatus, captureID model.CaptureID, msg string, fields ...zap.Field, ) error { + secondary, _ := r.getRole(RoleSecondary) fields = append(fields, []zap.Field{ zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), zap.Stringer("tableState", input), - zap.Any("replicationSet", r), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span), }...) log.L().WithOptions(zap.AddCallerSkip(1)).Error(msg, fields...) 
return errors.ErrReplicationSetMultiplePrimaryError.GenWithStackByArgs( @@ -458,10 +476,10 @@ func (r *ReplicationSet) poll( log.Info("schedulerv3: replication state transition, poll", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), zap.Stringer("old", oldState), - zap.Stringer("new", r.State)) + zap.Stringer("new", r.State), + zap.Stringer("tableState", input)) } } @@ -486,12 +504,17 @@ func (r *ReplicationSet) pollOnAbsent( tablepb.TableStateReplicating, tablepb.TableStateStopping: } + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return false, nil } @@ -533,24 +556,34 @@ func (r *ReplicationSet) pollOnPrepare( } case tablepb.TableStateStopping, tablepb.TableStateStopped: if r.Primary == captureID { + secondary, _ := r.getRole(RoleSecondary) // Primary is stopped, but we may still has secondary. // Clear primary and promote secondary when it's prepared. log.Info("schedulerv3: primary is stopped during Prepare", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) r.clearPrimary() return nil, false, nil } if r.isInRole(captureID, RoleSecondary) { + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: capture is stopped during Prepare", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) err := r.clearCapture(captureID, RoleSecondary) if err != nil { return nil, false, errors.Trace(err) @@ -567,12 +600,17 @@ func (r *ReplicationSet) pollOnPrepare( return nil, true, nil } } + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, false, nil } @@ -605,9 +643,13 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: there are unknown captures during commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("replicationSet", r), + 
zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), zap.Stringer("tableState", input), - zap.String("captureID", captureID)) + zap.String("primary", r.Primary), + zap.String("secondary", captureID), + zap.Stringer("span", &r.Span)) return nil, false, nil } // No primary, promote secondary to primary. @@ -619,9 +661,13 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: promote secondary, no primary", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("replicationSet", r), + zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), zap.Stringer("tableState", input), - zap.String("captureID", captureID)) + zap.String("primary", r.Primary), + zap.String("secondary", captureID), + zap.Stringer("span", &r.Span)) } // Secondary has been promoted, retry AddTableRequest. if r.Primary == captureID && !r.hasRole(RoleSecondary) { @@ -646,13 +692,18 @@ func (r *ReplicationSet) pollOnCommit( original := r.Primary r.clearPrimary() if !r.hasRole(RoleSecondary) { + secondary, _ := r.getRole(RoleSecondary) // If there is no secondary, transit to Absent. log.Info("schedulerv3: primary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) r.State = ReplicationSetStateAbsent return nil, true, nil } @@ -665,10 +716,15 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: replication state promote secondary", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("replicationSet", r), + zap.String("captureID", secondary), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span), zap.String("original", original), - zap.String("captureID", secondary)) + ) return &schedulepb.Message{ To: r.Primary, MsgType: schedulepb.MsgDispatchTableRequest, @@ -689,9 +745,13 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: secondary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", captureID), + zap.Stringer("span", &r.Span)) err := r.clearCapture(captureID, RoleSecondary) if err != nil { return nil, false, errors.Trace(err) @@ -702,12 +762,17 @@ func (r *ReplicationSet) pollOnCommit( } return nil, true, nil } else if r.isInRole(captureID, RoleUndetermined) { + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: capture is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + 
zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) err := r.clearCapture(captureID, RoleUndetermined) return nil, false, errors.Trace(err) } @@ -752,23 +817,33 @@ func (r *ReplicationSet) pollOnCommit( r.updateCheckpointAndStats(input.Checkpoint, input.Stats) return nil, false, nil } else if r.isInRole(captureID, RoleUndetermined) { + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: capture is stopping during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, false, nil } case tablepb.TableStatePreparing: } + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, false, nil } @@ -792,25 +867,35 @@ func (r *ReplicationSet) pollOnReplicating( case tablepb.TableStateStopped: if r.Primary == captureID { r.updateCheckpointAndStats(input.Checkpoint, input.Stats) + secondary, _ := r.getRole(RoleSecondary) // Primary is stopped, but we still has secondary. // Clear primary and promote secondary when it's prepared. 
log.Info("schedulerv3: primary is stopped during Replicating", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) r.clearPrimary() r.State = ReplicationSetStateAbsent return nil, true, nil } } + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, false, nil } @@ -843,24 +928,34 @@ func (r *ReplicationSet) pollOnRemoving( err = r.clearCapture(captureID, RoleUndetermined) } if err != nil { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: replication state remove capture with error", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("replicationSet", r), - zap.Stringer("tableState", input), zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span), zap.Error(err)) } return nil, false, nil case tablepb.TableStateStopping: return nil, false, nil } + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Stringer("tableState", input), zap.String("captureID", captureID), - zap.Any("replicationSet", r)) + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.Stringer("tableState", input), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, false, nil } @@ -875,11 +970,16 @@ func (r *ReplicationSet) handleAddTable( ) ([]*schedulepb.Message, error) { // Ignore add table if it's not in Absent state. 
if r.State != ReplicationSetStateAbsent { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: add table is ignored", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r)) + zap.String("captureID", captureID), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, nil } err := r.setCapture(captureID, RoleSecondary) @@ -896,12 +996,18 @@ func (r *ReplicationSet) handleAddTable( if err != nil { return nil, errors.Trace(err) } + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, add table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("replicationSet", r), - zap.Stringer("old", oldState), zap.Stringer("new", r.State)) + zap.String("captureID", captureID), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return msgs, nil } @@ -909,22 +1015,32 @@ func (r *ReplicationSet) handleMoveTable( dest model.CaptureID, ) ([]*schedulepb.Message, error) { if r.hasRemoved() { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: move table is ignored, since it removed already", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r)) + zap.String("captureID", dest), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, nil } // Ignore move table if // 1) it's not in Replicating state or // 2) the dest capture is the primary. if r.State != ReplicationSetStateReplicating || r.Primary == dest { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: move table is ignored, since it's not replicating or the primary is the same as the move destination", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r)) + zap.String("captureID", dest), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, nil } oldState := r.State @@ -933,12 +1049,17 @@ func (r *ReplicationSet) handleMoveTable( if err != nil { return nil, errors.Trace(err) } + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, move table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", dest), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("old", oldState), zap.Stringer("new", r.State), - zap.Any("replicationSet", r), - zap.Stringer("old", oldState)) + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) status := tablepb.TableStatus{ Span: r.Span, State: tablepb.TableStateAbsent, @@ -950,30 +1071,45 @@ func (r *ReplicationSet) handleMoveTable( func (r *ReplicationSet) handleRemoveTable() ([]*schedulepb.Message, error) { // Ignore remove table if it has been removed already. 
if r.hasRemoved() { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: remove table is ignored", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r)) + zap.String("captureID", r.Primary), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, nil } // Ignore remove table if it's not in Replicating state. if r.State != ReplicationSetStateReplicating { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: remove table is ignored", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r)) + zap.String("captureID", r.Primary), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, nil } oldState := r.State r.State = ReplicationSetStateRemoving + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, remove table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r), - zap.Stringer("old", oldState)) + zap.String("captureID", r.Primary), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) status := tablepb.TableStatus{ Span: r.Span, State: tablepb.TableStateReplicating, @@ -1009,12 +1145,17 @@ func (r *ReplicationSet) handleCaptureShutdown( } oldState := r.State msgs, err := r.poll(&status, captureID) + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, capture shutdown", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r), - zap.Stringer("old", oldState), zap.Stringer("new", r.State)) + zap.String("captureID", captureID), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return msgs, true, errors.Trace(err) } @@ -1022,12 +1163,15 @@ func (r *ReplicationSet) updateCheckpointAndStats( checkpoint tablepb.Checkpoint, stats tablepb.Stats, ) { if checkpoint.ResolvedTs < checkpoint.CheckpointTs { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r), - zap.Any("checkpoint", checkpoint)) + zap.Any("checkpoint", checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) // TODO: resolvedTs should not be zero, but we have to handle it for now. 
if checkpoint.ResolvedTs == 0 { @@ -1041,13 +1185,15 @@ func (r *ReplicationSet) updateCheckpointAndStats( r.Checkpoint.ResolvedTs = checkpoint.ResolvedTs } if r.Checkpoint.ResolvedTs < r.Checkpoint.CheckpointTs { + secondary, _ := r.getRole(RoleSecondary) log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Int64("tableID", r.Span.TableID), - zap.Any("replicationSet", r), - zap.Any("checkpointTs", r.Checkpoint.CheckpointTs), - zap.Any("resolvedTs", r.Checkpoint.ResolvedTs)) + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) } if r.Checkpoint.LastSyncedTs < checkpoint.LastSyncedTs { From 165bb72b406cdc0772f03b7f3c3c1cef1c3bd549 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 15:15:04 +0800 Subject: [PATCH 15/24] fix the logs of the ddl execution line --- cdc/entry/schema_storage.go | 29 ++++++++++++++------ cdc/owner/ddl_manager.go | 21 ++++++++------ cdc/owner/ddl_sink.go | 19 +------------ cdc/puller/ddl_puller.go | 12 ++++---- cdc/sink/ddlsink/mysql/mysql_ddl_sink.go | 20 +++----------- tests/integration_tests/ddl_reentrant/run.sh | 4 +-- 6 files changed, 45 insertions(+), 60 deletions(-) diff --git a/cdc/entry/schema_storage.go b/cdc/entry/schema_storage.go index e2f4de1f34..d7cabbb219 100644 --- a/cdc/entry/schema_storage.go +++ b/cdc/entry/schema_storage.go @@ -200,6 +200,13 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { lastSnap := s.snaps[len(s.snaps)-1] // already-executed DDL could filted by finishedTs. if job.BinlogInfo.FinishedTS <= lastSnap.CurrentTs() { + log.Info("ddl job already applied in schema storage, skip", + zap.String("namespace", s.id.Namespace), + zap.String("changefeed", s.id.ID), + zap.String("role", s.role.String()), + zap.Int64("jobID", job.ID), + zap.String("type", job.Type.String()), + zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS)) return nil } snap = lastSnap.Copy() @@ -219,15 +226,19 @@ func (s *schemaStorage) HandleDDLJob(job *timodel.Job) error { } s.snaps = append(s.snaps, snap) s.AdvanceResolvedTs(job.BinlogInfo.FinishedTS) - log.Info("ddl job applied to schema storage", - zap.String("namespace", s.id.Namespace), - zap.String("changefeed", s.id.ID), - zap.String("role", s.role.String()), - zap.Int64("jobID", job.ID), - zap.String("type", job.Type.String()), - zap.String("schema", job.SchemaName), - zap.String("table", job.TableName), - zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS)) + // Owner has a later, owner-specific progress log in ddlManager. + // Processor has no equivalent stage, so keep the shared apply log only there. 
+ if s.role == util.RoleProcessor { + log.Info("ddl job applied to schema storage", + zap.String("namespace", s.id.Namespace), + zap.String("changefeed", s.id.ID), + zap.String("role", s.role.String()), + zap.Int64("jobID", job.ID), + zap.String("type", job.Type.String()), + zap.String("schema", job.SchemaName), + zap.String("table", job.TableName), + zap.Uint64("finishedTs", job.BinlogInfo.FinishedTS)) + } return nil } diff --git a/cdc/owner/ddl_manager.go b/cdc/owner/ddl_manager.go index 77c43f1334..f32cb64a20 100644 --- a/cdc/owner/ddl_manager.go +++ b/cdc/owner/ddl_manager.go @@ -382,7 +382,12 @@ func (m *ddlManager) tick( return nil, nil, errors.Trace(err) } if skip { - m.cleanCache(cleanMsg) + log.Info(cleanMsg, + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), + zap.Uint64("commitTs", m.executingDDL.CommitTs), + zap.String("query", m.executingDDL.Query)) + m.cleanCache() } } err := m.executeDDL(ctx) @@ -471,7 +476,12 @@ func (m *ddlManager) executeDDL(ctx context.Context) error { return errors.Trace(err) } if done { - m.cleanCache("execute a ddl event successfully") + log.Info("execute a ddl event successfully", + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), + zap.Uint64("commitTs", m.executingDDL.CommitTs), + zap.String("query", m.executingDDL.Query)) + m.cleanCache() } return nil } @@ -625,7 +635,7 @@ func (m *ddlManager) getSnapshotTs() (ts uint64) { // cleanCache cleans the tableInfoCache and physicalTablesCache. // It should be called after a DDL is skipped or sent to downstream successfully. -func (m *ddlManager) cleanCache(msg string) { +func (m *ddlManager) cleanCache() { tableInfo := m.executingDDL.TableInfo if tableInfo == nil { tableInfo = m.executingDDL.PreTableInfo @@ -634,11 +644,6 @@ func (m *ddlManager) cleanCache(msg string) { if tableInfo != nil { tableName = tableInfo.TableName } - log.Info(msg, - zap.String("namespace", m.changfeedID.Namespace), - zap.String("changefeed", m.changfeedID.ID), - zap.Uint64("commitTs", m.executingDDL.CommitTs), - zap.String("query", m.executingDDL.Query)) // Set it to nil first to accelerate GC. 
m.pendingDDLs[tableName][0] = nil diff --git a/cdc/owner/ddl_sink.go b/cdc/owner/ddl_sink.go index 9c996e1783..3d111c20dc 100644 --- a/cdc/owner/ddl_sink.go +++ b/cdc/owner/ddl_sink.go @@ -257,11 +257,6 @@ func (s *ddlSinkImpl) writeDDLEvent(ctx context.Context, ddl *model.DDLEvent) er ) } else { ddl.Done.Store(true) - log.Info("Execute DDL succeeded", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Uint64("commitTs", ddl.CommitTs), - zap.String("query", ddl.Query)) } return } @@ -464,19 +459,7 @@ func (s *ddlSinkImpl) addSpecialComment(ddl *model.DDLEvent) (string, error) { return "", errors.Trace(err) } - result := sb.String() - if result != ddl.Query { - log.Info("add special comment to DDL", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Uint64("commitTs", ddl.CommitTs), - zap.String("query", ddl.Query), - zap.String("newQuery", result), - zap.String("charset", ddl.Charset), - zap.String("collate", ddl.Collate)) - } - - return result, nil + return sb.String(), nil } func (s *ddlSinkImpl) emitBootstrap(ctx context.Context, bootstrap *model.DDLEvent) error { diff --git a/cdc/puller/ddl_puller.go b/cdc/puller/ddl_puller.go index c82ac3fcc8..21eddc0398 100644 --- a/cdc/puller/ddl_puller.go +++ b/cdc/puller/ddl_puller.go @@ -447,13 +447,6 @@ func (p *ddlJobPullerImpl) handleJob(job *timodel.Job) (skip bool, err error) { return true, nil } - log.Info("ddl job received by ddl puller", - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID), - zap.String("role", p.role.String()), - zap.Int64("jobID", job.ID), - zap.Any("job", job)) - err = p.schemaStorage.HandleDDLJob(job) if err != nil { return false, cerror.WrapError(cerror.ErrHandleDDLFailed, @@ -671,6 +664,11 @@ func (h *ddlPullerImpl) addToPending(job *timodel.Job) { } h.mu.Lock() defer h.mu.Unlock() + log.Info("ddl job received by ddl puller", + zap.String("namespace", h.changefeedID.Namespace), + zap.String("changefeed", h.changefeedID.ID), + zap.Int64("jobID", job.ID), + zap.Any("job", job)) h.pendingDDLJobs = append(h.pendingDDLJobs, job) h.lastDDLJobID = job.ID } diff --git a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go index bb3f5955a1..ffff4d1084 100644 --- a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go +++ b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go @@ -131,7 +131,10 @@ func (m *DDLSink) WriteDDLEvent(ctx context.Context, ddl *model.DDLEvent) error // If the downstream is TiDB, it will query the DDL and wait until it finishes. // For 'add index' ddl, it will return immediately without waiting and will query it during the next DDL execution. 
func (m *DDLSink) execDDLWithMaxRetries(ctx context.Context, ddl *model.DDLEvent) error { - ddlCreateTime := getDDLCreateTime(ctx, m.id, m.db) + ddlCreateTime := "" + if m.cfg.IsTiDB { + ddlCreateTime = getDDLCreateTime(ctx, m.id, m.db) + } return retry.Do(ctx, func() error { err := m.statistics.RecordDDLExecution(func() error { return m.execDDL(ctx, ddl) }) if err != nil { @@ -200,7 +203,6 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { failpoint.Return(nil) }) - start := time.Now() tx, err := m.db.BeginTx(ctx, nil) if err != nil { return err @@ -342,20 +344,6 @@ func (m *DDLSink) execDDL(pctx context.Context, ddl *model.DDLEvent) error { return errors.WrapError(errors.ErrMySQLTxnError, errors.WithMessage(err, fmt.Sprintf("Query info: %s; ", ddl.Query))) } - logFields := []zap.Field{ - zap.String("namespace", m.id.Namespace), - zap.String("changefeed", m.id.ID), - zap.Uint64("commitTs", ddl.CommitTs), - zap.String("query", ddl.Query), - zap.Duration("duration", time.Since(start)), - } - - if useSessionTimestamp { - logFields = append(logFields, zap.Float64("sessionTimestamp", ddlTimestamp)) - } - - log.Info("Exec DDL succeeded", logFields...) - return nil } diff --git a/tests/integration_tests/ddl_reentrant/run.sh b/tests/integration_tests/ddl_reentrant/run.sh index 50dc3198ae..2f7aa491b6 100644 --- a/tests/integration_tests/ddl_reentrant/run.sh +++ b/tests/integration_tests/ddl_reentrant/run.sh @@ -60,7 +60,7 @@ function check_ddl_executed() { ddl=$(cat $2) success="$3" if [[ $success == "true" ]]; then - key_word="Exec DDL succeeded" + key_word="execute a ddl event successfully" else key_word="Execute DDL failed, but error can be ignored" fi @@ -92,7 +92,7 @@ function ddl_test() { echo $restored_sql >${WORK_DIR}/ddl_temp.sql ensure 10 check_ddl_executed "${WORK_DIR}/cdc.log" "${WORK_DIR}/ddl_temp.sql" true - ddl_finished_ts=$(grep "Execute DDL succeeded" ${WORK_DIR}/cdc.log | tail -n 1 | grep -oE '"CommitTs\\":[0-9]{18}' | awk -F: '{print $(NF)}') + ddl_finished_ts=$(grep "execute a ddl event successfully" ${WORK_DIR}/cdc.log | tail -n 1 | grep -oE 'commitTs=[0-9]{18}' | awk -F= '{print $(NF)}') cdc cli changefeed pause --changefeed-id=${changefeedid} cdc cli changefeed resume --no-confirm --changefeed-id=${changefeedid} --overwrite-checkpoint-ts=${ddl_finished_ts} echo "resume changefeed ${changefeedid} from ${ddl_finished_ts}" From 5f7f9eaf9a845fbb715ab272cc278372cc74a188 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 15:51:32 +0800 Subject: [PATCH 16/24] fix logs about the scheduler --- cdc/processor/processor.go | 5 ----- cdc/scheduler/internal/v3/agent/table.go | 7 +++++-- .../v3/replication/replication_set.go | 19 +++++-------------- .../internal/v3/scheduler/scheduler_basic.go | 13 ++++++++++++- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index b8e5f44923..902955e4c6 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -310,11 +310,6 @@ func (p *processor) IsRemoveTableSpanFinished(span tablepb.Span) (model.Ts, bool state, ok := p.sinkManager.r.GetTableState(span) if !ok { - log.Warn("table has been stopped", - zap.String("captureID", p.captureInfo.ID), - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span)) return 0, true } diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index f17e49eb5f..a43fb85a25 100644 
--- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -249,8 +249,11 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { log.Info("schedulerv3: table found new task", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), - zap.Any("task", task)) + zap.Stringer("span", &t.span), + zap.Any("checkpoint", task.Checkpoint), + zap.Bool("isRemove", task.IsRemove), + zap.Bool("isPrepare", task.IsPrepare), + zap.String("epoch", task.Epoch.Epoch)) t.task = task return } diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index 22914ccab2..075c786db6 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -473,13 +473,17 @@ func (r *ReplicationSet) poll( msgBuf = append(msgBuf, msg) } if stateChanged { + secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, poll", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), zap.Stringer("old", oldState), zap.Stringer("new", r.State), - zap.Stringer("tableState", input)) + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) } } @@ -986,7 +990,6 @@ func (r *ReplicationSet) handleAddTable( if err != nil { return nil, errors.Trace(err) } - oldState := r.State status := tablepb.TableStatus{ Span: r.Span, State: tablepb.TableStateAbsent, @@ -996,18 +999,6 @@ func (r *ReplicationSet) handleAddTable( if err != nil { return nil, errors.Trace(err) } - secondary, _ := r.getRole(RoleSecondary) - - log.Info("schedulerv3: replication state transition, add table", - zap.String("namespace", r.Changefeed.Namespace), - zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), - zap.Any("checkpoint", r.Checkpoint), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), - zap.String("primary", r.Primary), - zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) return msgs, nil } diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go index 8c3906f4e5..9fb9481034 100644 --- a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go +++ b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go @@ -186,11 +186,22 @@ func newBurstRemoveTables( break } if captureID == "" { + secondary := "" + for id, role := range rep.Captures { + if role == replication.RoleSecondary { + secondary = id + break + } + } log.Warn("schedulerv3: primary or secondary not found for removed table,"+ "this may happen if the capture shutdown", zap.String("namespace", changefeedID.Namespace), zap.String("changefeed", changefeedID.ID), - zap.Any("table", rep)) + zap.Any("checkpoint", rep.Checkpoint), + zap.Stringer("state", rep.State), + zap.String("primary", rep.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &span)) continue } tables = append(tables, replication.RemoveTable{ From 1142ae107c53121a8f1b87e9be09b3ce88049f07 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 16:58:14 +0800 Subject: [PATCH 17/24] fix logs about the scheduler --- cdc/processor/processor.go | 3 +- cdc/scheduler/internal/v3/agent/agent.go | 2 +- 
cdc/scheduler/internal/v3/agent/agent_test.go | 12 ++ cdc/scheduler/internal/v3/agent/table.go | 20 ++- .../v3/replication/replication_set.go | 124 ++++++++++++------ 5 files changed, 111 insertions(+), 50 deletions(-) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 902955e4c6..d54d9a1a3b 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -297,8 +297,7 @@ func (p *processor) IsAddTableSpanFinished(span tablepb.Span, isPrepare bool) bo zap.Uint64("tableResolvedTs", tableResolvedTs), zap.Uint64("tableCheckpointTs", tableCheckpointTs), zap.Uint64("globalCheckpointTs", globalCheckpointTs), - zap.Any("state", state), - zap.Bool("isPrepare", isPrepare)) + zap.Any("state", state)) return true } diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go index 6ac7435c79..d9fb6c7987 100644 --- a/cdc/scheduler/internal/v3/agent/agent.go +++ b/cdc/scheduler/internal/v3/agent/agent.go @@ -60,7 +60,7 @@ type agentInfo struct { changefeedEpoch uint64 } -func (a agentInfo) resetEpoch() { +func (a *agentInfo) resetEpoch() { a.Epoch = schedulepb.ProcessorEpoch{Epoch: uuid.New().String()} } diff --git a/cdc/scheduler/internal/v3/agent/agent_test.go b/cdc/scheduler/internal/v3/agent/agent_test.go index 1b9617b3d4..c4ee8c6b9d 100644 --- a/cdc/scheduler/internal/v3/agent/agent_test.go +++ b/cdc/scheduler/internal/v3/agent/agent_test.go @@ -305,6 +305,18 @@ func TestAgentHandleMessageDispatchTable(t *testing.T) { require.False(t, a.tableM.tables.Has(spanz.TableIDToComparableSpan(1))) } +func TestAgentInfoResetEpoch(t *testing.T) { + t.Parallel() + + info := newAgentInfo(model.DefaultChangeFeedID("changefeed-test"), "capture-test", 0) + require.NotEmpty(t, info.Epoch.Epoch) + + oldEpoch := info.Epoch.Epoch + info.resetEpoch() + require.NotEmpty(t, info.Epoch.Epoch) + require.NotEqual(t, oldEpoch, info.Epoch.Epoch) +} + func TestAgentHandleMessageHeartbeat(t *testing.T) { t.Parallel() diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index a43fb85a25..8e56b877ea 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -252,18 +252,28 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { zap.Stringer("span", &t.span), zap.Any("checkpoint", task.Checkpoint), zap.Bool("isRemove", task.IsRemove), - zap.Bool("isPrepare", task.IsPrepare), - zap.String("epoch", task.Epoch.Epoch)) + zap.Bool("isPrepare", task.IsPrepare)) t.task = task return } + if t.task.Span.Eq(&task.Span) && + t.task.Checkpoint == task.Checkpoint && + t.task.IsRemove == task.IsRemove && + t.task.IsPrepare == task.IsPrepare && + t.task.Epoch.Epoch == task.Epoch.Epoch { + return + } log.Warn("schedulerv3: table inject dispatch table task ignored,"+ "since there is one not finished yet", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), - zap.Any("nowTask", t.task), - zap.Any("ignoredTask", task)) + zap.Stringer("span", &t.span), + zap.Any("currentCheckpoint", t.task.Checkpoint), + zap.Bool("currentIsRemove", t.task.IsRemove), + zap.Bool("currentIsPrepare", t.task.IsPrepare), + zap.Any("ignoredCheckpoint", task.Checkpoint), + zap.Bool("ignoredIsRemove", task.IsRemove), + zap.Bool("ignoredIsPrepare", task.IsPrepare)) } func (t *tableSpan) poll(ctx context.Context) (*schedulepb.Message, error) { diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go 
b/cdc/scheduler/internal/v3/replication/replication_set.go index 075c786db6..a80829ccaa 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -449,7 +449,6 @@ func (r *ReplicationSet) poll( if err != nil { return nil, errors.Trace(err) } - oldState := r.State var msg *schedulepb.Message switch r.State { case ReplicationSetStateAbsent: @@ -472,19 +471,6 @@ func (r *ReplicationSet) poll( if msg != nil { msgBuf = append(msgBuf, msg) } - if stateChanged { - secondary, _ := r.getRole(RoleSecondary) - log.Info("schedulerv3: replication state transition, poll", - zap.String("namespace", r.Changefeed.Namespace), - zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), - zap.String("primary", r.Primary), - zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) - } } return msgBuf, nil @@ -496,9 +482,24 @@ func (r *ReplicationSet) pollOnAbsent( ) (bool, error) { switch input.State { case tablepb.TableStateAbsent: + oldState := r.State r.State = ReplicationSetStatePrepare err := r.setCapture(captureID, RoleSecondary) - return true, errors.Trace(err) + if err != nil { + return true, errors.Trace(err) + } + secondary, _ := r.getRole(RoleSecondary) + log.Info("schedulerv3: replication state transition, add table", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", captureID), + zap.Any("checkpoint", r.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) + return true, nil case tablepb.TableStateStopped: // Ignore stopped table state as a capture may shutdown unexpectedly. @@ -550,7 +551,19 @@ func (r *ReplicationSet) pollOnPrepare( case tablepb.TableStatePrepared: if r.isInRole(captureID, RoleSecondary) { // Secondary is prepared, transit to Commit state. + oldState := r.State r.State = ReplicationSetStateCommit + secondary, _ := r.getRole(RoleSecondary) + log.Info("schedulerv3: replication state transition, table prepared", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, true, nil } case tablepb.TableStateReplicating: @@ -577,17 +590,8 @@ func (r *ReplicationSet) pollOnPrepare( return nil, false, nil } if r.isInRole(captureID, RoleSecondary) { + oldState := r.State secondary, _ := r.getRole(RoleSecondary) - log.Info("schedulerv3: capture is stopped during Prepare", - zap.String("namespace", r.Changefeed.Namespace), - zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), - zap.Stringer("state", r.State), - zap.Stringer("tableState", input), - zap.String("primary", r.Primary), - zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) err := r.clearCapture(captureID, RoleSecondary) if err != nil { return nil, false, errors.Trace(err) @@ -601,6 +605,16 @@ func (r *ReplicationSet) pollOnPrepare( // Transit to Absent. 
r.State = ReplicationSetStateAbsent } + log.Info("schedulerv3: capture is stopped during Prepare", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Stringer("span", &r.Span)) return nil, true, nil } } @@ -696,19 +710,20 @@ func (r *ReplicationSet) pollOnCommit( original := r.Primary r.clearPrimary() if !r.hasRole(RoleSecondary) { + oldState := r.State secondary, _ := r.getRole(RoleSecondary) // If there is no secondary, transit to Absent. + r.State = ReplicationSetStateAbsent log.Info("schedulerv3: primary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), zap.Any("checkpoint", input.Checkpoint), - zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), zap.Stringer("span", &r.Span)) - r.State = ReplicationSetStateAbsent return nil, true, nil } // Primary is stopped, promote secondary to primary. @@ -746,16 +761,7 @@ func (r *ReplicationSet) pollOnCommit( // As it sends RemoveTableRequest to the original primary // upon entering Commit state. Do not change state and wait // the original primary reports its table. - log.Info("schedulerv3: secondary is stopped during Commit", - zap.String("namespace", r.Changefeed.Namespace), - zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), - zap.Stringer("state", r.State), - zap.Stringer("tableState", input), - zap.String("primary", r.Primary), - zap.String("secondary", captureID), - zap.Stringer("span", &r.Span)) + oldState := r.State err := r.clearCapture(captureID, RoleSecondary) if err != nil { return nil, false, errors.Trace(err) @@ -764,6 +770,28 @@ func (r *ReplicationSet) pollOnCommit( // If there is no primary, transit to Absent. r.State = ReplicationSetStateAbsent } + if r.State != oldState { + log.Info("schedulerv3: secondary is stopped during Commit", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", captureID), + zap.Stringer("span", &r.Span)) + } else { + log.Info("schedulerv3: secondary is stopped during Commit", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", captureID), + zap.Stringer("span", &r.Span)) + } return nil, true, nil } else if r.isInRole(captureID, RoleUndetermined) { secondary, _ := r.getRole(RoleSecondary) @@ -810,7 +838,18 @@ func (r *ReplicationSet) pollOnCommit( // before the original primary receives RemoveTable request. // Transit to Replicating, and wait for the next table state of // the primary, Stopping or Stopped. 
+ oldState := r.State r.State = ReplicationSetStateReplicating + log.Info("schedulerv3: replication state transition, commit finished", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", captureID), + zap.Any("checkpoint", input.Checkpoint), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", ""), + zap.Stringer("span", &r.Span)) return nil, true, nil } return nil, false, r.multiplePrimaryError( @@ -871,21 +910,22 @@ func (r *ReplicationSet) pollOnReplicating( case tablepb.TableStateStopped: if r.Primary == captureID { r.updateCheckpointAndStats(input.Checkpoint, input.Stats) + oldState := r.State secondary, _ := r.getRole(RoleSecondary) // Primary is stopped, but we still has secondary. // Clear primary and promote secondary when it's prepared. + r.clearPrimary() + r.State = ReplicationSetStateAbsent log.Info("schedulerv3: primary is stopped during Replicating", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), zap.Any("checkpoint", input.Checkpoint), - zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("old", oldState), + zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), zap.Stringer("span", &r.Span)) - r.clearPrimary() - r.State = ReplicationSetStateAbsent return nil, true, nil } } From 89bbb19de3514a4ba8ddaf75cbc7cbc6ffcbaa55 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 18:23:42 +0800 Subject: [PATCH 18/24] fix logs about the scheduler --- cdc/processor/processor.go | 4 +-- cdc/puller/multiplexing_puller.go | 20 +++++++------- cdc/scheduler/internal/v3/agent/table.go | 2 +- .../v3/replication/replication_set.go | 26 ++++--------------- 4 files changed, 17 insertions(+), 35 deletions(-) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index d54d9a1a3b..32692fedc1 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -290,14 +290,12 @@ func (p *processor) IsAddTableSpanFinished(span tablepb.Span, isPrepare bool) bo } log.Info("Add Table finished", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Int64("tableID", span.TableID), zap.Uint64("tableResolvedTs", tableResolvedTs), zap.Uint64("tableCheckpointTs", tableCheckpointTs), - zap.Uint64("globalCheckpointTs", globalCheckpointTs), - zap.Any("state", state)) + zap.Uint64("globalCheckpointTs", globalCheckpointTs)) return true } diff --git a/cdc/puller/multiplexing_puller.go b/cdc/puller/multiplexing_puller.go index 36ef5c2513..2cd48a5fac 100644 --- a/cdc/puller/multiplexing_puller.go +++ b/cdc/puller/multiplexing_puller.go @@ -91,16 +91,16 @@ func (p *tableProgress) handleResolvedSpans(ctx context.Context, e *model.Resolv } resolvedTs := p.tsTracker.Frontier() - if resolvedTs > 0 && p.initialized.CompareAndSwap(false, true) { - log.Info("puller is initialized", - zap.String("namespace", p.changefeed.Namespace), - zap.String("changefeed", p.changefeed.ID), - zap.String("tableName", p.tableName), - zap.Any("tableID", p.spans), - zap.Uint64("resolvedTs", resolvedTs), - zap.Duration("duration", time.Since(p.start)), - ) - } + if resolvedTs > 0 && p.initialized.CompareAndSwap(false, true) { + log.Info("puller is initialized", + zap.String("namespace", p.changefeed.Namespace), + 
zap.String("changefeed", p.changefeed.ID), + zap.String("tableName", p.tableName), + zap.Uint64("resolvedTs", resolvedTs), + zap.Duration("duration", time.Since(p.start)), + zap.Any("spans", p.spans), + ) + } if resolvedTs > p.resolvedTs.Load() { p.resolvedTs.Store(resolvedTs) p.resolvedTsUpdated.Store(time.Now().Unix()) diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index 8e56b877ea..82e25b0ee9 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -250,7 +250,7 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), zap.Stringer("span", &t.span), - zap.Any("checkpoint", task.Checkpoint), + zap.Uint64("checkpointTs", task.Checkpoint.CheckpointTs), zap.Bool("isRemove", task.IsRemove), zap.Bool("isPrepare", task.IsPrepare)) t.task = task diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index a80829ccaa..901fe11cc4 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -488,16 +488,13 @@ func (r *ReplicationSet) pollOnAbsent( if err != nil { return true, errors.Trace(err) } - secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, add table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), - zap.String("primary", r.Primary), - zap.String("secondary", secondary), zap.Stringer("span", &r.Span)) return true, nil @@ -553,16 +550,14 @@ func (r *ReplicationSet) pollOnPrepare( // Secondary is prepared, transit to Commit state. oldState := r.State r.State = ReplicationSetStateCommit - secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, table prepared", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("checkpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), - zap.String("primary", r.Primary), - zap.String("secondary", secondary), zap.Stringer("span", &r.Span)) return nil, true, nil } @@ -675,17 +670,6 @@ func (r *ReplicationSet) pollOnCommit( if err != nil { return nil, false, errors.Trace(err) } - - log.Info("schedulerv3: promote secondary, no primary", - zap.String("namespace", r.Changefeed.Namespace), - zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), - zap.Stringer("state", r.State), - zap.Stringer("tableState", input), - zap.String("primary", r.Primary), - zap.String("secondary", captureID), - zap.Stringer("span", &r.Span)) } // Secondary has been promoted, retry AddTableRequest. 
if r.Primary == captureID && !r.hasRole(RoleSecondary) { @@ -844,11 +828,11 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("checkpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), - zap.String("secondary", ""), zap.Stringer("span", &r.Span)) return nil, true, nil } From 6a5a2a643709e74c6ad60839bfd4af985fc3688a Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Fri, 10 Apr 2026 19:59:17 +0800 Subject: [PATCH 19/24] fix logs about the scheduler --- cdc/kv/shared_client.go | 3 ++- cdc/processor/processor.go | 1 + cdc/puller/multiplexing_puller.go | 25 ++++++++++++------- cdc/scheduler/internal/v3/agent/agent.go | 23 +++++++++++++++++ cdc/scheduler/internal/v3/agent/table.go | 14 +++-------- .../internal/v3/keyspan/reconciler.go | 8 +++--- .../v3/replication/replication_set.go | 22 +++++++--------- 7 files changed, 59 insertions(+), 37 deletions(-) diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index eff9b7c7e5..b31a9d2821 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -297,7 +297,8 @@ func (s *SharedClient) Subscribe(subID SubscriptionID, span tablepb.Span, startT zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), - zap.String("span", rt.span.String())) + zap.Int64("tableID", rt.span.TableID), + zap.Stringer("startKey", rt.span.StartKey)) } // Unsubscribe the given table span. All covered regions will be deregistered asynchronously. 
diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 32692fedc1..b3efaf9d00 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -293,6 +293,7 @@ func (p *processor) IsAddTableSpanFinished(span tablepb.Span, isPrepare bool) bo zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("tableResolvedTs", tableResolvedTs), zap.Uint64("tableCheckpointTs", tableCheckpointTs), zap.Uint64("globalCheckpointTs", globalCheckpointTs)) diff --git a/cdc/puller/multiplexing_puller.go b/cdc/puller/multiplexing_puller.go index 2cd48a5fac..85ea1cc04f 100644 --- a/cdc/puller/multiplexing_puller.go +++ b/cdc/puller/multiplexing_puller.go @@ -91,16 +91,23 @@ func (p *tableProgress) handleResolvedSpans(ctx context.Context, e *model.Resolv } resolvedTs := p.tsTracker.Frontier() - if resolvedTs > 0 && p.initialized.CompareAndSwap(false, true) { - log.Info("puller is initialized", - zap.String("namespace", p.changefeed.Namespace), - zap.String("changefeed", p.changefeed.ID), - zap.String("tableName", p.tableName), - zap.Uint64("resolvedTs", resolvedTs), - zap.Duration("duration", time.Since(p.start)), - zap.Any("spans", p.spans), - ) + if resolvedTs > 0 && p.initialized.CompareAndSwap(false, true) { + tableID := int64(0) + var startKey tablepb.Key + if len(p.spans) != 0 { + tableID = p.spans[0].TableID + startKey = p.spans[0].StartKey } + log.Info("puller is initialized", + zap.String("namespace", p.changefeed.Namespace), + zap.String("changefeed", p.changefeed.ID), + zap.String("tableName", p.tableName), + zap.Int64("tableID", tableID), + zap.Stringer("startKey", startKey), + zap.Uint64("resolvedTs", resolvedTs), + zap.Duration("duration", time.Since(p.start)), + ) + } if resolvedTs > p.resolvedTs.Load() { p.resolvedTs.Store(resolvedTs) p.resolvedTsUpdated.Store(time.Now().Unix()) diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go index d9fb6c7987..58a9a7cbfa 100644 --- a/cdc/scheduler/internal/v3/agent/agent.go +++ b/cdc/scheduler/internal/v3/agent/agent.go @@ -354,6 +354,23 @@ func (a *agent) handleMessageDispatchTableRequest( status: dispatchTableTaskReceived, } table = a.tableM.addTableSpan(span) + if req.AddTable.GetIsSecondary() { + log.Info("schedulerv3: agent received prepare table task", + zap.String("capture", a.CaptureID), + zap.String("namespace", a.ChangeFeedID.Namespace), + zap.String("changefeed", a.ChangeFeedID.ID), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Uint64("checkpointTs", req.AddTable.GetCheckpoint().CheckpointTs)) + } else { + log.Info("schedulerv3: agent received replicate table task", + zap.String("capture", a.CaptureID), + zap.String("namespace", a.ChangeFeedID.Namespace), + zap.String("changefeed", a.ChangeFeedID.ID), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Uint64("checkpointTs", req.AddTable.GetCheckpoint().CheckpointTs)) + } case *schedulepb.DispatchTableRequest_RemoveTable: span := req.RemoveTable.GetSpan() table, ok = a.tableM.getTableSpan(span) @@ -373,6 +390,12 @@ func (a *agent) handleMessageDispatchTableRequest( Epoch: epoch, status: dispatchTableTaskReceived, } + log.Info("schedulerv3: agent received remove table task", + zap.String("capture", a.CaptureID), + zap.String("namespace", a.ChangeFeedID.Namespace), + zap.String("changefeed", a.ChangeFeedID.ID), + 
zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) default: log.Warn("schedulerv3: agent ignore unknown dispatch table request", zap.String("capture", a.CaptureID), diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index 82e25b0ee9..11dc369939 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -246,13 +246,6 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { zap.Stringer("task.TableID", &task.Span)) } if t.task == nil { - log.Info("schedulerv3: table found new task", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Stringer("span", &t.span), - zap.Uint64("checkpointTs", task.Checkpoint.CheckpointTs), - zap.Bool("isRemove", task.IsRemove), - zap.Bool("isPrepare", task.IsPrepare)) t.task = task return } @@ -267,11 +260,12 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { "since there is one not finished yet", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Stringer("span", &t.span), - zap.Any("currentCheckpoint", t.task.Checkpoint), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("currentCheckpointTs", t.task.Checkpoint.CheckpointTs), zap.Bool("currentIsRemove", t.task.IsRemove), zap.Bool("currentIsPrepare", t.task.IsPrepare), - zap.Any("ignoredCheckpoint", task.Checkpoint), + zap.Uint64("ignoredCheckpointTs", task.Checkpoint.CheckpointTs), zap.Bool("ignoredIsRemove", task.IsRemove), zap.Bool("ignoredIsPrepare", task.IsPrepare)) } diff --git a/cdc/scheduler/internal/v3/keyspan/reconciler.go b/cdc/scheduler/internal/v3/keyspan/reconciler.go index 021cc28aa6..65f735a7e6 100644 --- a/cdc/scheduler/internal/v3/keyspan/reconciler.go +++ b/cdc/scheduler/internal/v3/keyspan/reconciler.go @@ -150,10 +150,10 @@ func (m *Reconciler) Reconcile( zap.String("namespace", m.changefeedID.Namespace), zap.Int64("tableID", tableID), zap.Int("holes", len(holes)), - zap.String("spanStart", tableStart.String()), - zap.String("spanEnd", tableEnd.String()), - zap.String("foundStart", coveredSpans[0].String()), - zap.String("foundEnd", coveredSpans[len(coveredSpans)-1].String())) + zap.Stringer("startKey", tableStart.StartKey), + zap.Stringer("endKey", tableEnd.EndKey), + zap.Stringer("foundStartKey", coveredSpans[0].StartKey), + zap.Stringer("foundEndKey", coveredSpans[len(coveredSpans)-1].EndKey)) spans := make([]tablepb.Span, 0, len(coveredSpans)+len(holes)) spans = append(spans, coveredSpans...) 
for _, s := range holes { diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index 901fe11cc4..d51c22f395 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -482,7 +482,6 @@ func (r *ReplicationSet) pollOnAbsent( ) (bool, error) { switch input.State { case tablepb.TableStateAbsent: - oldState := r.State r.State = ReplicationSetStatePrepare err := r.setCapture(captureID, RoleSecondary) if err != nil { @@ -492,10 +491,10 @@ func (r *ReplicationSet) pollOnAbsent( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), - zap.Stringer("span", &r.Span)) + ) return true, nil case tablepb.TableStateStopped: @@ -548,17 +547,16 @@ func (r *ReplicationSet) pollOnPrepare( case tablepb.TableStatePrepared: if r.isInRole(captureID, RoleSecondary) { // Secondary is prepared, transit to Commit state. - oldState := r.State r.State = ReplicationSetStateCommit log.Info("schedulerv3: replication state transition, table prepared", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), zap.Uint64("checkpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", input.Checkpoint.ResolvedTs), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), - zap.Stringer("span", &r.Span)) + ) return nil, true, nil } case tablepb.TableStateReplicating: @@ -822,18 +820,16 @@ func (r *ReplicationSet) pollOnCommit( // before the original primary receives RemoveTable request. // Transit to Replicating, and wait for the next table state of // the primary, Stopping or Stopped. 
- oldState := r.State r.State = ReplicationSetStateReplicating log.Info("schedulerv3: replication state transition, commit finished", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), zap.Uint64("checkpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", input.Checkpoint.ResolvedTs), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), - zap.String("primary", r.Primary), - zap.Stringer("span", &r.Span)) + ) return nil, true, nil } return nil, false, r.multiplePrimaryError( From 55204f142fbd6624175cb53e6027c3d25a9a9364 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Mon, 13 Apr 2026 14:42:57 +0800 Subject: [PATCH 20/24] fix all log issues --- cdc/entry/mounter.go | 103 +++++-- cdc/entry/schema/snapshot.go | 10 +- cdc/kv/regionlock/region_range_lock.go | 37 ++- cdc/kv/shared_client.go | 50 +++- cdc/model/sink.go | 16 +- cdc/owner/changefeed.go | 38 ++- cdc/owner/ddl_manager.go | 32 ++- cdc/owner/feed_state_manager.go | 16 +- cdc/owner/owner.go | 12 +- cdc/processor/manager.go | 2 - cdc/processor/processor.go | 53 ++-- .../sinkmanager/table_sink_wrapper.go | 39 ++- cdc/puller/multiplexing_puller.go | 18 +- cdc/scheduler/internal/v3/agent/agent.go | 86 +++--- cdc/scheduler/internal/v3/agent/table.go | 100 +++++-- cdc/scheduler/internal/v3/coordinator.go | 31 ++- .../internal/v3/keyspan/reconciler.go | 2 +- .../v3/keyspan/splitter_region_count.go | 20 +- .../internal/v3/keyspan/splitter_write.go | 20 +- .../v3/replication/replication_manager.go | 95 +++++-- .../v3/replication/replication_set.go | 253 +++++++++++------- .../internal/v3/scheduler/scheduler_basic.go | 37 +-- .../v3/scheduler/scheduler_drain_capture.go | 11 +- .../v3/scheduler/scheduler_manager.go | 3 +- .../v3/scheduler/scheduler_move_table.go | 26 +- .../cloudstorage/cloud_storage_ddl_sink.go | 18 +- .../mq/ddlproducer/pulsar_ddl_producer.go | 22 +- cdc/sink/ddlsink/mq/kafka_ddl_sink.go | 5 +- cdc/sink/ddlsink/mq/pulsar_ddl_sink.go | 13 +- cdc/sink/ddlsink/mysql/mysql_ddl_sink.go | 22 +- cdc/sink/dmlsink/cloudstorage/dml_worker.go | 23 +- .../dmlsink/cloudstorage/encoding_worker.go | 5 +- .../mq/dmlproducer/pulsar_dml_producer.go | 13 +- cdc/sink/dmlsink/mq/kafka_dml_sink.go | 2 +- cdc/sink/dmlsink/mq/mq_dml_sink.go | 4 +- cdc/sink/dmlsink/mq/pulsar_dml_sink.go | 12 +- cdc/sink/dmlsink/txn/mysql/mysql.go | 118 +++++--- cdc/sink/dmlsink/txn/worker.go | 26 +- pkg/etcd/etcd.go | 4 +- pkg/sink/cloudstorage/path.go | 10 +- pkg/sink/codec/bootstraper.go | 8 +- pkg/sink/kafka/sarama_factory.go | 15 +- pkg/sink/pulsar/factory.go | 7 +- 43 files changed, 962 insertions(+), 475 deletions(-) diff --git a/cdc/entry/mounter.go b/cdc/entry/mounter.go index 07fd0baacb..e718a8b278 100644 --- a/cdc/entry/mounter.go +++ b/cdc/entry/mounter.go @@ -134,7 +134,12 @@ func (m *mounter) DecodeEvent(ctx context.Context, event *model.PolymorphicEvent func (m *mounter) unmarshalAndMountRowChanged(ctx context.Context, raw *model.RawKVEntry) (*model.RowChangedEvent, error) { if !bytes.HasPrefix(raw.Key, tablePrefix) { - log.Error("unexpected key prefix found in row kv entry", zap.String("key", hex.EncodeToString(raw.Key)), zap.Any("eventCommitTs", raw.CRTs), zap.Any("eventStartTs", raw.StartTs)) + log.Error("unexpected key prefix found in row kv entry", + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.String("key", 
hex.EncodeToString(raw.Key)), + zap.Uint64("eventCommitTs", raw.CRTs), + zap.Uint64("eventStartTs", raw.StartTs)) return nil, nil } // checksumKey is only used to calculate raw checksum if necessary. @@ -177,6 +182,8 @@ func (m *mounter) unmarshalAndMountRowChanged(ctx context.Context, raw *model.Ra return nil, nil } log.Error("can not found table schema", + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), zap.Uint64("ts", raw.CRTs), zap.String("key", hex.EncodeToString(raw.Key)), zap.Int64("tableID", physicalTableID)) @@ -216,8 +223,17 @@ func (m *mounter) unmarshalAndMountRowChanged(ctx context.Context, raw *model.Ra return nil, nil }() if err != nil && !cerror.ShouldFailChangefeed(err) { - log.Error("failed to mount and unmarshals entry, start to print debug info", zap.Error(err)) - snap.PrintStatus(log.Error) + log.Error("failed to mount and unmarshals entry, start to print debug info", + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Error(err)) + snap.PrintStatus(func(msg string, fields ...zap.Field) { + fields = append([]zap.Field{ + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + }, fields...) + log.Error(msg, fields...) + }) } return row, err } @@ -369,7 +385,8 @@ func datum2Column( return nil, nil, nil, errors.Trace(err) } if warn != "" { - log.Warn(warn, zap.String("table", tableInfo.TableName.String()), + log.Warn(warn, + zap.String("table", tableInfo.TableName.String()), zap.String("column", colInfo.Name.String())) } @@ -427,8 +444,12 @@ func (m *mounter) verifyColumnChecksum( checksum, err := calculateColumnChecksum(columnInfos, rawColumns, m.tz) if err != nil { log.Error("failed to calculate the checksum", - zap.Uint32("first", first), zap.Any("columnInfos", columnInfos), - zap.Any("rawColumns", rawColumns), zap.Error(err)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Uint32("first", first), + zap.Any("columnInfos", columnInfos), + zap.Any("rawColumns", rawColumns), + zap.Error(err)) return 0, false, err } @@ -444,16 +465,28 @@ func (m *mounter) verifyColumnChecksum( if !skipFail { log.Error("cannot found the extra checksum, the first checksum mismatched", - zap.Uint32("checksum", checksum), zap.Uint32("first", first), zap.Uint32("extra", extra), - zap.Any("columnInfos", columnInfos), zap.Any("rawColumns", rawColumns), zap.Any("tz", m.tz)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Uint32("checksum", checksum), + zap.Uint32("first", first), + zap.Uint32("extra", extra), + zap.Any("columnInfos", columnInfos), + zap.Any("rawColumns", rawColumns), + zap.Any("tz", m.tz)) return checksum, false, nil } if time.Since(m.lastSkipOldValueTime) > time.Minute { log.Warn("checksum mismatch on the old value, "+ "this may caused by Add Column / Drop Column executed, skip verification", - zap.Uint32("checksum", checksum), zap.Uint32("first", first), zap.Uint32("extra", extra), - zap.Any("columnInfos", columnInfos), zap.Any("rawColumns", rawColumns), zap.Any("tz", m.tz)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Uint32("checksum", checksum), + zap.Uint32("first", first), + zap.Uint32("extra", extra), + zap.Any("columnInfos", columnInfos), + zap.Any("rawColumns", rawColumns), + zap.Any("tz", m.tz)) m.lastSkipOldValueTime = time.Now() } return checksum, 
true, nil @@ -559,7 +592,9 @@ func verifyRawBytesChecksum( datum, err := newDatum(col.Value, columnInfo.FieldType) if err != nil { log.Error("build datum for raw checksum calculation failed", - zap.Any("col", col), zap.Any("columnInfo", columnInfo), zap.Error(err)) + zap.Any("col", col), + zap.Any("columnInfo", columnInfo), + zap.Error(err)) return 0, false, errors.Trace(err) } datums = append(datums, &datum) @@ -575,9 +610,12 @@ func verifyRawBytesChecksum( log.Error("raw bytes checksum mismatch", zap.Int("version", decoder.ChecksumVersion()), - zap.Uint32("expected", expected), zap.Uint32("obtained", obtained), - zap.Any("tableInfo", tableInfo), zap.Any("columns", columns), - zap.Any("handle", handle.String()), zap.Any("tz", tz)) + zap.Uint32("expected", expected), + zap.Uint32("obtained", obtained), + zap.Any("tableInfo", tableInfo), + zap.Any("columns", columns), + zap.String("handle", handle.String()), + zap.Any("tz", tz)) return expected, false, nil } @@ -611,8 +649,14 @@ func (m *mounter) verifyChecksum( expected, matched, err := verifyRawBytesChecksum(tableInfo, columns, decoder, handle, key, m.tz) if err != nil { log.Error("calculate raw checksum failed", - zap.Int("version", version), zap.Any("tz", m.tz), zap.Any("handle", handle.String()), - zap.Any("key", key), zap.Any("columns", columns), zap.Error(err)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Int("version", version), + zap.Any("tz", m.tz), + zap.String("handle", handle.String()), + zap.Binary("key", key), + zap.Any("columns", columns), + zap.Error(err)) return 0, false, errors.Trace(err) } if !matched { @@ -621,8 +665,12 @@ func (m *mounter) verifyChecksum( columnChecksum, err := calculateColumnChecksum(columnInfos, rawColumns, m.tz) if err != nil { log.Error("failed to calculate column-level checksum, after raw checksum verification passed", - zap.Any("columnsInfo", columnInfos), zap.Any("rawColumns", rawColumns), - zap.Any("tz", m.tz), zap.Error(err)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Any("columnsInfo", columnInfos), + zap.Any("rawColumns", rawColumns), + zap.Any("tz", m.tz), + zap.Error(err)) return 0, false, errors.Trace(err) } return columnChecksum, true, nil @@ -673,8 +721,12 @@ func (m *mounter) mountRowKVEntry( if !matched { log.Error("previous columns checksum mismatch", - zap.Uint32("checksum", preChecksum), zap.Any("tableInfo", tableInfo), - zap.Any("preCols", preCols), zap.Any("rawCols", preRawCols)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Uint32("checksum", preChecksum), + zap.Any("tableInfo", tableInfo), + zap.Any("preCols", preCols), + zap.Any("rawCols", preRawCols)) if m.integrity.ErrorHandle() { return nil, rawRow, cerror.ErrCorruptedDataMutation. GenWithStackByArgs(m.changefeedID.Namespace, m.changefeedID.ID) @@ -700,8 +752,12 @@ func (m *mounter) mountRowKVEntry( } if !matched { log.Error("current columns checksum mismatch", - zap.Uint32("checksum", currentChecksum), zap.Any("tableInfo", tableInfo), - zap.Any("cols", cols), zap.Any("rawCols", rawCols)) + zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), + zap.Uint32("checksum", currentChecksum), + zap.Any("tableInfo", tableInfo), + zap.Any("cols", cols), + zap.Any("rawCols", rawCols)) if m.integrity.ErrorHandle() { return nil, rawRow, cerror.ErrCorruptedDataMutation. 
GenWithStackByArgs(m.changefeedID.Namespace, m.changefeedID.ID) @@ -900,7 +956,8 @@ func getDefaultOrZeroValue( default: d = table.GetZeroValue(col) if d.IsNull() { - log.Error("meet unsupported column type", zap.String("columnInfo", col.FieldType.String())) + log.Error("meet unsupported column type", + zap.String("columnInfo", col.FieldType.String())) } } } diff --git a/cdc/entry/schema/snapshot.go b/cdc/entry/schema/snapshot.go index f91b351f90..c31d9b57d0 100644 --- a/cdc/entry/schema/snapshot.go +++ b/cdc/entry/schema/snapshot.go @@ -135,7 +135,10 @@ func NewSnapshotFromMeta( tag := negative(currentTs) for _, dbinfo := range dbinfos { if filter.ShouldIgnoreSchema(dbinfo.Name.O) { - log.Debug("ignore database", zap.Stringer("db", dbinfo.Name), zap.Stringer("changefeed", id)) + log.Debug("ignore database", + zap.String("namespace", id.Namespace), + zap.String("changefeed", id.ID), + zap.Stringer("db", dbinfo.Name)) continue } vid := newVersionedID(dbinfo.ID, tag) @@ -211,10 +214,11 @@ func NewSnapshotFromMeta( snap.inner.currentTs = currentTs log.Info("schema snapshot created", - zap.Stringer("changefeed", id), + zap.String("namespace", id.Namespace), + zap.String("changefeed", id.ID), zap.Int("tables", tableCount), zap.Uint64("currentTs", currentTs), - zap.Any("duration", time.Since(start).Seconds())) + zap.Duration("duration", time.Since(start))) return snap, nil } diff --git a/cdc/kv/regionlock/region_range_lock.go b/cdc/kv/regionlock/region_range_lock.go index 51ed495d2c..5cfe01db98 100644 --- a/cdc/kv/regionlock/region_range_lock.go +++ b/cdc/kv/regionlock/region_range_lock.go @@ -19,6 +19,7 @@ import ( "encoding/hex" "fmt" "math" + "strings" "sync" "sync/atomic" "time" @@ -110,7 +111,8 @@ type RangeLock struct { id uint64 // totalSpan is the total range of the table, totalSpan = unlockedRanges + lockedRanges totalSpan tablepb.Span - // changefeed is used to identify the changefeed which the RangeLock belongs to. + // namespace and changefeed are used to identify the changefeed which the RangeLock belongs to. 
+ namespace string changefeed string mu sync.RWMutex @@ -128,10 +130,12 @@ func NewRangeLock( id uint64, startKey, endKey []byte, startTs uint64, changefeedLogInfo string, ) *RangeLock { + namespace, changefeed, _ := strings.Cut(changefeedLogInfo, "/") return &RangeLock{ id: id, totalSpan: tablepb.Span{StartKey: startKey, EndKey: endKey}, - changefeed: changefeedLogInfo, + namespace: namespace, + changefeed: changefeed, unlockedRanges: newRangeTsMap(startKey, endKey, startTs), lockedRanges: btree.NewG(16, rangeLockEntryLess), regionIDToLockedRanges: make(map[uint64]*rangeLockEntry), @@ -181,6 +185,7 @@ func (l *RangeLock) UnlockRange( entry, ok := l.lockedRanges.Get(rangeLockEntryWithKey(startKey)) if !ok { log.Panic("unlocking a not locked range", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), zap.Uint64("regionID", regionID), zap.String("startKey", hex.EncodeToString(startKey)), @@ -189,6 +194,7 @@ func (l *RangeLock) UnlockRange( } if entry.regionID != regionID { log.Panic("unlocked a range but regionID mismatch", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), zap.Uint64("expectedRegionID", regionID), zap.Uint64("foundRegionID", entry.regionID), @@ -197,6 +203,7 @@ func (l *RangeLock) UnlockRange( } if entry != l.regionIDToLockedRanges[regionID] { log.Panic("range lock and region id lock mismatch when trying to unlock", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), zap.Uint64("unlockingRegionID", regionID), zap.String("rangeLockEntry", entry.String()), @@ -207,6 +214,7 @@ func (l *RangeLock) UnlockRange( if entry.regionVersion != version || !bytes.Equal(entry.endKey, endKey) { log.Panic("unlocking region doesn't match the locked region", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), zap.Uint64("regionID", regionID), zap.String("startKey", hex.EncodeToString(startKey)), @@ -232,8 +240,10 @@ func (l *RangeLock) UnlockRange( l.unlockedRanges.set(startKey, endKey, newResolvedTs) log.Debug("unlocked range", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), - zap.Uint64("lockID", l.id), zap.Uint64("regionID", entry.regionID), + zap.Uint64("lockID", l.id), + zap.Uint64("regionID", entry.regionID), zap.Uint64("resolvedTs", newResolvedTs), zap.String("startKey", hex.EncodeToString(startKey)), zap.String("endKey", hex.EncodeToString(endKey))) @@ -432,6 +442,7 @@ func (l *RangeLock) tryLockRange(startKey, endKey []byte, regionID, regionVersio l.unlockedRanges.unset(startKey, endKey) log.Debug("range locked", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), zap.Uint64("lockID", l.id), zap.Uint64("regionID", regionID), @@ -446,11 +457,9 @@ func (l *RangeLock) tryLockRange(startKey, endKey []byte, regionID, regionVersio }, nil } - // Format overlapping ranges for printing log - var overlapStr []string + overlapRegionIDs := make([]uint64, 0, len(overlappedRangeLocks)) for _, r := range overlappedRangeLocks { - overlapStr = append(overlapStr, fmt.Sprintf("regionID: %v, ver: %v, start: %v, end: %v", - r.regionID, r.regionVersion, hex.EncodeToString(r.startKey), hex.EncodeToString(r.endKey))) // DEBUG + overlapRegionIDs = append(overlapRegionIDs, r.regionID) } // Check if the current acuqiring range is stale, @@ -469,11 +478,14 @@ func (l *RangeLock) tryLockRange(startKey, endKey []byte, regionID, regionVersio currentRangeStartKey := startKey log.Info("try lock range staled", + zap.String("namespace", 
l.namespace), zap.String("changefeed", l.changefeed), - zap.Uint64("lockID", l.id), zap.Uint64("regionID", regionID), + zap.Uint64("lockID", l.id), + zap.Uint64("regionID", regionID), zap.String("startKey", hex.EncodeToString(startKey)), zap.String("endKey", hex.EncodeToString(endKey)), - zap.Strings("allOverlapping", overlapStr)) // DEBUG + zap.Int("overlapRegionCount", len(overlapRegionIDs)), + zap.Uint64s("overlapRegionIDs", overlapRegionIDs)) for _, r := range overlappedRangeLocks { // Ignore the totally-disjointed range which may be added to the list because of @@ -510,11 +522,14 @@ func (l *RangeLock) tryLockRange(startKey, endKey []byte, regionID, regionVersio } log.Info("lock range blocked", + zap.String("namespace", l.namespace), zap.String("changefeed", l.changefeed), - zap.Uint64("lockID", l.id), zap.Uint64("regionID", regionID), + zap.Uint64("lockID", l.id), + zap.Uint64("regionID", regionID), zap.String("startKey", hex.EncodeToString(startKey)), zap.String("endKey", hex.EncodeToString(endKey)), - zap.Strings("blockedBy", overlapStr)) // DEBUG + zap.Int("blockedByRegionCount", len(overlapRegionIDs)), + zap.Uint64s("blockedByRegionIDs", overlapRegionIDs)) return LockRangeResult{ Status: LockRangeStatusWait, diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index b31a9d2821..1f1e78d2ee 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -314,7 +314,8 @@ func (s *SharedClient) Unsubscribe(subID SubscriptionID) { zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), - zap.String("span", rt.span.String())) + zap.Int64("tableID", rt.span.TableID), + zap.Stringer("startKey", rt.span.StartKey)) return } log.Warn("event feed unsubscribes table, but not found", @@ -448,7 +449,9 @@ func (s *SharedClient) handleRegions(ctx context.Context, eg *errgroup.Group) er zap.Uint64("streamID", stream.streamID), zap.Uint64("subscriptionID", uint64(region.subscribedTable.subscriptionID)), zap.Uint64("regionID", region.verID.GetID()), - zap.String("span", region.span.String()), + zap.Int64("tableID", region.span.TableID), + zap.Stringer("startKey", region.span.StartKey), + zap.Stringer("endKey", region.span.EndKey), zap.String("addr", store.storeAddr)) } } @@ -551,7 +554,8 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), - zap.Any("span", nextSpan)) + zap.Int64("tableID", nextSpan.TableID), + zap.Stringer("startKey", nextSpan.StartKey)) backoff := tikv.NewBackoffer(ctx, tikvRequestMaxBackoff) regions, err := s.regionCache.BatchLoadRegionsWithKeyRange(backoff, nextSpan.StartKey, nextSpan.EndKey, limit) @@ -560,7 +564,9 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), - zap.String("span", nextSpan.String()), + zap.Int64("tableID", nextSpan.TableID), + zap.Stringer("startKey", nextSpan.StartKey), + zap.Stringer("endKey", nextSpan.EndKey), zap.Error(err)) backoffBeforeLoad = true continue @@ -578,7 +584,9 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), - 
zap.String("span", nextSpan.String())) + zap.Int64("tableID", nextSpan.TableID), + zap.Stringer("startKey", nextSpan.StartKey), + zap.Stringer("endKey", nextSpan.EndKey)) backoffBeforeLoad = true continue } @@ -597,7 +605,9 @@ func (s *SharedClient) divideSpanAndScheduleRegionRequests( zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscribedTable.subscriptionID)), - zap.String("span", nextSpan.String())) + zap.Int64("tableID", nextSpan.TableID), + zap.Stringer("startKey", nextSpan.StartKey), + zap.Stringer("endKey", nextSpan.EndKey)) } verID := tikv.NewRegionVerID(regionMeta.Id, regionMeta.RegionEpoch.ConfVer, regionMeta.RegionEpoch.Version) @@ -844,7 +854,10 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), - zap.Any("slowRegion", attr.SlowestRegion)) + zap.Uint64("regionID", attr.SlowestRegion.RegionID), + zap.Uint64("resolvedTs", attr.SlowestRegion.ResolvedTs), + zap.Bool("initialized", attr.SlowestRegion.Initialized), + zap.Duration("since", currTime.Sub(ckptTime))) } } else if currTime.Sub(attr.SlowestRegion.Created) > 10*time.Minute { slowInitializeRegionCount += 1 @@ -853,22 +866,32 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), - zap.Any("slowRegion", attr.SlowestRegion)) + zap.Uint64("regionID", attr.SlowestRegion.RegionID), + zap.Uint64("resolvedTs", attr.SlowestRegion.ResolvedTs), + zap.Bool("initialized", attr.SlowestRegion.Initialized), + zap.Duration("since", currTime.Sub(attr.SlowestRegion.Created))) } else if currTime.Sub(ckptTime) > 10*time.Minute { log.Info("event feed finds a uninitialized slow region", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), - zap.Any("slowRegion", attr.SlowestRegion)) + zap.Uint64("regionID", attr.SlowestRegion.RegionID), + zap.Uint64("resolvedTs", attr.SlowestRegion.ResolvedTs), + zap.Bool("initialized", attr.SlowestRegion.Initialized), + zap.Duration("since", currTime.Sub(ckptTime))) } if len(attr.UnLockedRanges) > 0 { + firstHole := attr.UnLockedRanges[0] log.Info("event feed holes exist", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(subscriptionID)), zap.Int64("tableID", rt.span.TableID), - zap.Any("holes", attr.UnLockedRanges)) + zap.Int("holeCount", len(attr.UnLockedRanges)), + zap.Stringer("startKey", firstHole.Span.StartKey), + zap.Stringer("endKey", firstHole.Span.EndKey), + zap.Uint64("resolvedTs", firstHole.ResolvedTs)) } } s.totalSpans.RUnlock() @@ -912,7 +935,12 @@ func (r *subscribedTable) resolveStaleLocks(s *SharedClient, targetTs uint64) { zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(r.subscriptionID)), - zap.Any("ranges", res)) + zap.Int("lockedRegionCount", res.LockedRegionCount), + zap.Int("holeCount", len(res.UnLockedRanges)), + zap.Uint64("slowRegionID", res.SlowestRegion.RegionID), + zap.Uint64("slowResolvedTs", res.SlowestRegion.ResolvedTs), + zap.Uint64("fastRegionID", res.FastestRegion.RegionID), + zap.Uint64("fastResolvedTs", 
res.FastestRegion.ResolvedTs)) } type sharedClientMetrics struct { diff --git a/cdc/model/sink.go b/cdc/model/sink.go index 54ccf98d3e..b01b604024 100644 --- a/cdc/model/sink.go +++ b/cdc/model/sink.go @@ -1193,7 +1193,7 @@ func trySplitAndSortUpdateEvent( split := false for _, e := range events { if e == nil { - log.Warn("skip emit nil event", zap.Any("event", e)) + log.Warn("skip emit nil event") continue } @@ -1203,7 +1203,19 @@ func trySplitAndSortUpdateEvent( // begin; insert into t (id) values (1); delete from t where id=1; commit; // Just ignore these row changed events. if colLen == 0 && preColLen == 0 { - log.Warn("skip emit empty row event", zap.Any("event", e)) + if e.TableInfo != nil { + log.Warn("skip emit empty row event", + zap.String("schema", e.TableInfo.GetSchemaName()), + zap.String("table", e.TableInfo.GetTableName()), + zap.Int64("tableID", e.PhysicalTableID), + zap.Uint64("startTs", e.StartTs), + zap.Uint64("commitTs", e.CommitTs)) + } else { + log.Warn("skip emit empty row event", + zap.Int64("tableID", e.PhysicalTableID), + zap.Uint64("startTs", e.StartTs), + zap.Uint64("commitTs", e.CommitTs)) + } continue } diff --git a/cdc/owner/changefeed.go b/cdc/owner/changefeed.go index 1813243d44..cc08194744 100755 --- a/cdc/owner/changefeed.go +++ b/cdc/owner/changefeed.go @@ -307,7 +307,10 @@ func (c *changefeed) Tick(ctx context.Context, } if err != nil { - log.Error("changefeed tick failed", zap.Error(err)) + log.Error("changefeed tick failed", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), + zap.Error(err)) c.handleErr(ctx, err) } return checkpointTs, minTableBarrierTs @@ -468,6 +471,8 @@ func (c *changefeed) tick(ctx context.Context, c.lastSyncedTs = watermark.LastSyncedTs } else if c.lastSyncedTs > watermark.LastSyncedTs { log.Warn("LastSyncedTs should not be greater than newLastSyncedTs", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), zap.Uint64("c.LastSyncedTs", c.lastSyncedTs), zap.Uint64("newLastSyncedTs", watermark.LastSyncedTs)) } @@ -478,6 +483,8 @@ func (c *changefeed) tick(ctx context.Context, c.pullerResolvedTs = watermark.PullerResolvedTs } else if watermark.PullerResolvedTs < c.pullerResolvedTs { log.Warn("the newPullerResolvedTs should not be smaller than c.pullerResolvedTs", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), zap.Uint64("c.pullerResolvedTs", c.pullerResolvedTs), zap.Uint64("newPullerResolvedTs", watermark.PullerResolvedTs)) } @@ -882,7 +889,10 @@ func (c *changefeed) cleanupRedoManager(ctx context.Context, cfInfo *model.Chang if c.isRemoved { if cfInfo == nil || cfInfo.Config == nil || cfInfo.Config.Consistent == nil { - log.Warn("changefeed is removed, but state is not complete", zap.Any("info", cfInfo)) + log.Warn("changefeed is removed, but state is not complete", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), + zap.Any("info", cfInfo)) return } if !redoCfg.IsConsistentEnabled(cfInfo.Config.Consistent.Level) { @@ -894,7 +904,10 @@ func (c *changefeed) cleanupRedoManager(ctx context.Context, cfInfo *model.Chang } err := c.redoMetaMgr.Cleanup(ctx) if err != nil { - log.Error("cleanup redo logs failed", zap.String("changefeed", c.id.ID), zap.Error(err)) + log.Error("cleanup redo logs failed", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), + zap.Error(err)) } } } @@ -951,7 +964,10 @@ func (c *changefeed) handleBarrier(ctx context.Context, case finishBarrier: 
c.feedStateManager.MarkFinished() default: - log.Error("Unknown barrier type", zap.Int("barrierType", int(barrierTp))) + log.Error("Unknown barrier type", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), + zap.Int("barrierType", int(barrierTp))) return cerror.ErrUnexpected.FastGenByArgs("Unknown barrier type") } } @@ -962,6 +978,8 @@ func (c *changefeed) handleBarrier(ctx context.Context, // but we can ignore them because they will be handled in the processor. if barrier.GlobalBarrierTs > barrierTs { log.Debug("There are other barriers less than ddl barrier, wait for them", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), zap.Uint64("otherBarrierTs", barrierTs), zap.Uint64("globalBarrierTs", barrier.GlobalBarrierTs)) barrier.GlobalBarrierTs = barrierTs @@ -969,6 +987,8 @@ func (c *changefeed) handleBarrier(ctx context.Context, if barrier.MinTableBarrierTs > barrierTs { log.Debug("There are other barriers less than min table barrier, wait for them", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), zap.Uint64("otherBarrierTs", barrierTs), zap.Uint64("minTableBarrierTs", barrier.GlobalBarrierTs)) barrier.MinTableBarrierTs = barrierTs @@ -1004,6 +1024,7 @@ func (c *changefeed) Close(ctx context.Context) { costTime := time.Since(startTime) if costTime > changefeedLogsWarnDuration { log.Warn("changefeed close took too long", + zap.String("namespace", c.id.Namespace), zap.String("changefeed", c.id.ID), zap.Duration("duration", costTime)) } @@ -1026,9 +1047,9 @@ func (c *changefeed) checkUpstream() (skip bool, err error) { } if c.upstream.IsClosed() { log.Warn("upstream is closed", - zap.Uint64("upstreamID", c.upstream.ID), zap.String("namespace", c.id.Namespace), - zap.String("changefeed", c.id.ID)) + zap.String("changefeed", c.id.ID), + zap.Uint64("upstreamID", c.upstream.ID)) return true, cerror. 
WrapChangefeedUnretryableErr( cerror.ErrUpstreamClosed.GenWithStackByArgs()) @@ -1060,7 +1081,10 @@ func (c *changefeed) tickDownstreamObserver(ctx context.Context) { if strings.Contains(err.Error(), noPrometheusMsg) { return } - log.Warn("backend observer tick error", zap.Error(err)) + log.Warn("backend observer tick error", + zap.String("namespace", c.id.Namespace), + zap.String("changefeed", c.id.ID), + zap.Error(err)) } }() } diff --git a/cdc/owner/ddl_manager.go b/cdc/owner/ddl_manager.go index f32cb64a20..02ec54a167 100644 --- a/cdc/owner/ddl_manager.go +++ b/cdc/owner/ddl_manager.go @@ -214,7 +214,8 @@ func (m *ddlManager) trySendBootstrap(ctx context.Context, currentTables []*mode start := time.Now() go func() { log.Info("start to send bootstrap messages", - zap.Stringer("changefeed", m.changfeedID), + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), zap.Int("tables", len(currentTables))) for idx, table := range currentTables { if table.TableInfo.IsView() { @@ -227,7 +228,8 @@ func (m *ddlManager) trySendBootstrap(ctx context.Context, currentTables []*mode err := m.ddlSink.emitBootstrap(ctx, ddlEvent) if err != nil { log.Error("send bootstrap message failed", - zap.Stringer("changefeed", m.changfeedID), + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), zap.Int("tables", len(currentTables)), zap.Int("emitted", idx+1), zap.Duration("duration", time.Since(start)), @@ -238,7 +240,8 @@ func (m *ddlManager) trySendBootstrap(ctx context.Context, currentTables []*mode } storeBootstrapState(&m.bootstrapState, bootstrapFinished) log.Info("send bootstrap messages finished", - zap.Stringer("changefeed", m.changfeedID), + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), zap.Int("tables", len(currentTables)), zap.Duration("cost", time.Since(start))) }() @@ -365,6 +368,8 @@ func (m *ddlManager) tick( if nextDDL != nil { if m.checkpointTs > nextDDL.CommitTs { log.Panic("checkpointTs is greater than next ddl commitTs", + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), zap.Uint64("checkpointTs", m.checkpointTs), zap.Uint64("commitTs", nextDDL.CommitTs)) } @@ -414,7 +419,9 @@ func (m *ddlManager) shouldExecDDL(nextDDL *model.DDLEvent) bool { redoDDLResolvedTsExceedBarrier := true if m.redoMetaManager.Enabled() { if !m.redoDDLManager.Enabled() { - log.Panic("Redo meta manager is enabled but redo ddl manager is not enabled") + log.Panic("Redo meta manager is enabled but redo ddl manager is not enabled", + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID)) } flushed := m.redoMetaManager.GetFlushedMeta() // Use the same example as above, let say there are some events are replicated by cdc: @@ -467,7 +474,10 @@ func (m *ddlManager) executeDDL(ctx context.Context) error { failpoint.Inject("ExecuteDDLSlowly", func() { lag := time.Duration(rand.Intn(5000)) * time.Millisecond - log.Warn("execute ddl slowly", zap.Duration("lag", lag)) + log.Warn("execute ddl slowly", + zap.String("namespace", m.changfeedID.Namespace), + zap.String("changefeed", m.changfeedID.ID), + zap.Duration("lag", lag)) time.Sleep(lag) }) @@ -479,8 +489,7 @@ func (m *ddlManager) executeDDL(ctx context.Context) error { log.Info("execute a ddl event successfully", zap.String("namespace", m.changfeedID.Namespace), zap.String("changefeed", m.changfeedID.ID), - zap.Uint64("commitTs", m.executingDDL.CommitTs), - 
zap.String("query", m.executingDDL.Query)) + zap.Uint64("commitTs", m.executingDDL.CommitTs)) m.cleanCache() } return nil @@ -545,7 +554,7 @@ func (m *ddlManager) barrier() *schedulepb.BarrierWithMinTs { } } else { // barrier related physical tables - ids := getRelatedPhysicalTableIDs(ddl) + ids := getRelatedPhysicalTableIDs(m.changfeedID, ddl) for _, id := range ids { // The same physical table may have multiple related ddl events when calculating barrier. // Example cases: @@ -658,7 +667,7 @@ func (m *ddlManager) cleanCache() { // getRelatedPhysicalTableIDs get all related physical table ids of a ddl event. // It is a helper function to calculate tableBarrier. -func getRelatedPhysicalTableIDs(ddl *model.DDLEvent) []model.TableID { +func getRelatedPhysicalTableIDs(changefeedID model.ChangeFeedID, ddl *model.DDLEvent) []model.TableID { res := make([]model.TableID, 0, 1) table := ddl.TableInfo if ddl.PreTableInfo != nil { @@ -667,7 +676,10 @@ func getRelatedPhysicalTableIDs(ddl *model.DDLEvent) []model.TableID { if table == nil { // If the table is nil, it means that the ddl is a global ddl. // It should never go here. - log.Panic("tableInfo of this ddl is nil", zap.Any("ddl", ddl)) + log.Panic("tableInfo of this ddl is nil", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Any("ddl", ddl)) } res = append(res, table.ID) partitionInfo := table.TableInfo.GetPartitionInfo() diff --git a/cdc/owner/feed_state_manager.go b/cdc/owner/feed_state_manager.go index 1c2c87cd85..343ee7b39a 100644 --- a/cdc/owner/feed_state_manager.go +++ b/cdc/owner/feed_state_manager.go @@ -172,6 +172,8 @@ func (m *feedStateManager) Tick(resolvedTs model.Ts, changefeedErrorStuckDuration := util.GetOrZero(m.state.GetChangefeedInfo().Config.ChangefeedErrorStuckDuration) if m.changefeedErrorStuckDuration != changefeedErrorStuckDuration { log.Info("changefeedErrorStuckDuration update", + zap.String("namespace", m.state.GetID().Namespace), + zap.String("changefeed", m.state.GetID().ID), zap.Duration("oldChangefeedErrorStuckDuration", m.changefeedErrorStuckDuration), zap.Duration("newChangefeedErrorStuckDuration", changefeedErrorStuckDuration), ) @@ -377,7 +379,9 @@ func (m *feedStateManager) patchState(feedState model.FeedState) { adminJobType = model.AdminRemove updateEpoch = true default: - log.Panic("Unreachable") + log.Panic("Unreachable", + zap.String("namespace", m.state.GetID().Namespace), + zap.String("changefeed", m.state.GetID().ID)) } epoch := uint64(0) if updateEpoch { @@ -436,7 +440,10 @@ func (m *feedStateManager) HandleError(errs ...*model.RunningError) { // if any error is occurred in this tick, we should set the changefeed state to warning // and stop the changefeed if lastError != nil { - log.Warn("changefeed meets an error", zap.Any("error", lastError)) + log.Warn("changefeed meets an error", + zap.String("namespace", m.state.GetID().Namespace), + zap.String("changefeed", m.state.GetID().ID), + zap.Any("error", lastError)) m.shouldBeRunning = false m.patchState(model.StatePending) @@ -528,5 +535,8 @@ func (m *feedStateManager) checkAndInitLastRetryCheckpointTs(status *model.Chang } m.lastWarningReportCheckpointTs = status.CheckpointTs m.lastErrorRetryCheckpointTs = status.CheckpointTs - log.Info("init lastRetryCheckpointTs", zap.Uint64("lastRetryCheckpointTs", m.lastErrorRetryCheckpointTs)) + log.Info("init lastRetryCheckpointTs", + zap.String("namespace", m.state.GetID().Namespace), + zap.String("changefeed", m.state.GetID().ID), + 
zap.Uint64("lastRetryCheckpointTs", m.lastErrorRetryCheckpointTs)) } diff --git a/cdc/owner/owner.go b/cdc/owner/owner.go index f9982ac3e4..feef082ada 100644 --- a/cdc/owner/owner.go +++ b/cdc/owner/owner.go @@ -550,19 +550,19 @@ func (o *ownerImpl) handleDrainCaptures(ctx context.Context, query *scheduler.Qu state := changefeed.latestInfo.State if state != model.StateNormal { log.Info("skip drain changefeed", - zap.String("state", string(state)), - zap.String("target", query.CaptureID), zap.String("namespace", changefeed.id.Namespace), - zap.String("changefeed", changefeed.id.ID)) + zap.String("changefeed", changefeed.id.ID), + zap.String("state", string(state)), + zap.String("target", query.CaptureID)) continue } if changefeed.scheduler == nil { // Scheduler is created lazily, it is nil before initialization. log.Info("drain a changefeed without scheduler", - zap.String("state", string(state)), - zap.String("target", query.CaptureID), zap.String("namespace", changefeed.id.Namespace), - zap.String("changefeed", changefeed.id.ID)) + zap.String("changefeed", changefeed.id.ID), + zap.String("state", string(state)), + zap.String("target", query.CaptureID)) // To prevent a changefeed being considered drained, // we increase totalTableCount. totalTableCount++ diff --git a/cdc/processor/manager.go b/cdc/processor/manager.go index b9f76bd963..9f1e3615f3 100644 --- a/cdc/processor/manager.go +++ b/cdc/processor/manager.go @@ -217,7 +217,6 @@ func patchProcessorErr(captureInfo *model.CaptureInfo, ) { if isProcessorIgnorableError(err) { log.Info("processor exited", - zap.String("capture", captureInfo.ID), zap.String("namespace", changefeed.ID.Namespace), zap.String("changefeed", changefeed.ID.ID), zap.Error(err)) @@ -244,7 +243,6 @@ func patchProcessorErr(captureInfo *model.CaptureInfo, return position, true, nil }) log.Error("run processor failed", - zap.String("capture", captureInfo.ID), zap.String("namespace", changefeed.ID.Namespace), zap.String("changefeed", changefeed.ID.ID), zap.Error(err)) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index b3efaf9d00..77aa6a6756 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -151,10 +151,10 @@ func (p *processor) AddTableSpan( startTs := checkpoint.CheckpointTs if startTs == 0 { log.Error("table start ts must not be 0", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", startTs), zap.Bool("isPrepare", isPrepare)) return false, cerror.ErrUnexpected.FastGenByArgs("table start ts must not be 0") @@ -168,10 +168,10 @@ func (p *processor) AddTableSpan( // no matter `isPrepare` or not, just ignore it should be ok. 
case tablepb.TableStatePreparing: log.Warn("table is still preparing, ignore the request", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", startTs), zap.Bool("isPrepare", isPrepare)) return true, nil @@ -191,19 +191,19 @@ func (p *processor) AddTableSpan( return true, nil case tablepb.TableStateReplicating: log.Warn("Ignore existing table", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", startTs), zap.Bool("isPrepare", isPrepare)) return true, nil case tablepb.TableStateStopped: log.Warn("The same table exists but is stopped. Cancel it and continue.", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", startTs), zap.Bool("isPrepare", isPrepare)) p.removeTable(span) @@ -216,10 +216,10 @@ func (p *processor) AddTableSpan( globalCheckpointTs := p.latestStatus.CheckpointTs if startTs < globalCheckpointTs { log.Warn("addTable: startTs < checkpoint", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", startTs), zap.Bool("isPrepare", isPrepare)) } @@ -243,10 +243,10 @@ func (p *processor) RemoveTableSpan(span tablepb.Span) bool { _, exist := p.sinkManager.r.GetTableState(span) if !exist { log.Warn("Table which will be deleted is not found", - zap.String("capture", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) return true } return p.sinkManager.r.AsyncStopTable(span) @@ -271,10 +271,10 @@ func (p *processor) IsAddTableSpanFinished(span tablepb.Span, isPrepare bool) bo tableCheckpointTs = stats.CheckpointTs } else { log.Panic("table which was added is not found", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Bool("isPrepare", isPrepare)) } @@ -314,12 +314,12 @@ func (p *processor) IsRemoveTableSpanFinished(span tablepb.Span) (model.Ts, bool stats := p.sinkManager.r.GetTableStats(span) if state != tablepb.TableStateStopped { log.Debug("table is still not stopped", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", stats.CheckpointTs), - zap.Stringer("span", &span), - zap.Any("tableStatus", state)) + zap.Stringer("tableStatus", state)) return 0, false } @@ -329,11 +329,12 @@ func (p *processor) 
IsRemoveTableSpanFinished(span tablepb.Span) (model.Ts, bool p.sinkManager.r.RemoveTable(span) p.sourceManager.r.RemoveTable(span) log.Info("table removed", - zap.String("captureID", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Uint64("checkpointTs", stats.CheckpointTs), - zap.Stringer("span", &span)) + ) return stats.CheckpointTs, true } @@ -501,18 +502,18 @@ func (p *processor) Tick( } if p.upstream.IsClosed() { log.Error("upstream is closed", - zap.Uint64("upstreamID", p.upstream.ID), zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID)) + zap.String("changefeed", p.changefeedID.ID), + zap.Uint64("upstreamID", p.upstream.ID)) return cerror.ErrUnexpected.FastGenByArgs("upstream is closed"), nil } // skip this tick if !p.upstream.IsNormal() { log.Warn("upstream is not ready, skip", - zap.Uint64("id", p.upstream.ID), - zap.Strings("pd", p.upstream.PdEndpoints), zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID)) + zap.String("changefeed", p.changefeedID.ID), + zap.Uint64("upstreamID", p.upstream.ID), + zap.Strings("pd", p.upstream.PdEndpoints)) return nil, nil } startTime := time.Now() @@ -522,7 +523,6 @@ func (p *processor) Tick( log.Warn("processor tick took too long", zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), - zap.String("capture", p.captureInfo.ID), zap.Duration("duration", costTime)) } @@ -700,7 +700,6 @@ func (p *processor) lazyInitImpl(_ context.Context) (err error) { p.initialized.Store(true) log.Info("processor initialized", - zap.String("capture", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Uint64("changefeedEpoch", p.changefeedEpoch)) @@ -738,14 +737,12 @@ func (p *processor) handleErrorCh() (err error) { } if !isProcessorIgnorableError(err) { log.Error("error on running processor", - zap.String("capture", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID), zap.Error(err)) return err } log.Info("processor exited", - zap.String("capture", p.captureInfo.ID), zap.String("namespace", p.changefeedID.Namespace), zap.String("changefeed", p.changefeedID.ID)) return cerror.ErrReactorFinished diff --git a/cdc/processor/sinkmanager/table_sink_wrapper.go b/cdc/processor/sinkmanager/table_sink_wrapper.go index 580a0f5581..92e614b059 100644 --- a/cdc/processor/sinkmanager/table_sink_wrapper.go +++ b/cdc/processor/sinkmanager/table_sink_wrapper.go @@ -147,7 +147,8 @@ func (t *tableSinkWrapper) start(ctx context.Context, startTs model.Ts) (err err log.Panic("The table sink has already started", zap.String("namespace", t.changefeed.Namespace), zap.String("changefeed", t.changefeed.ID), - zap.Stringer("span", &t.span), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), zap.Uint64("startTs", startTs), zap.Uint64("oldReplicateTs", t.replicateTs.Load()), ) @@ -271,7 +272,8 @@ func (t *tableSinkWrapper) markAsClosing() { log.Info("Sink is closing", zap.String("namespace", t.changefeed.Namespace), zap.String("changefeed", t.changefeed.ID), - zap.Stringer("span", &t.span)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) break } } @@ -287,7 +289,8 @@ func (t *tableSinkWrapper) markAsClosed() { 
log.Info("Sink is closed", zap.String("namespace", t.changefeed.Namespace), zap.String("changefeed", t.changefeed.ID), - zap.Stringer("span", &t.span)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) return } } @@ -388,7 +391,8 @@ func (t *tableSinkWrapper) restart(ctx context.Context) (err error) { log.Info("Sink is restarted", zap.String("namespace", t.changefeed.Namespace), zap.String("changefeed", t.changefeed.ID), - zap.Stringer("span", &t.span), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), zap.Uint64("replicateTs", ts)) return nil } @@ -452,12 +456,27 @@ func handleRowChangedEvents( size := 0 rowChangedEvents := make([]*model.RowChangedEvent, 0, len(events)) for _, e := range events { - if e == nil || e.Row == nil { + if e == nil { log.Warn("skip emit nil event", zap.String("namespace", changefeed.Namespace), zap.String("changefeed", changefeed.ID), - zap.Stringer("span", &span), - zap.Any("event", e)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) + continue + } + if e.Row == nil { + regionID := uint64(0) + if e.RawKV != nil { + regionID = e.RawKV.RegionID + } + log.Warn("skip emit nil row event", + zap.String("namespace", changefeed.Namespace), + zap.String("changefeed", changefeed.ID), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Uint64("startTs", e.StartTs), + zap.Uint64("commitTs", e.CRTs), + zap.Uint64("regionID", regionID)) continue } @@ -467,10 +486,12 @@ func handleRowChangedEvents( // Just ignore these row changed events. if len(rowEvent.Columns) == 0 && len(rowEvent.PreColumns) == 0 { log.Warn("skip emit empty row event", - zap.Stringer("span", &span), zap.String("namespace", changefeed.Namespace), zap.String("changefeed", changefeed.ID), - zap.Any("event", e)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Uint64("startTs", rowEvent.StartTs), + zap.Uint64("commitTs", rowEvent.CommitTs)) continue } diff --git a/cdc/puller/multiplexing_puller.go b/cdc/puller/multiplexing_puller.go index 85ea1cc04f..0e8264f77f 100644 --- a/cdc/puller/multiplexing_puller.go +++ b/cdc/puller/multiplexing_puller.go @@ -78,11 +78,21 @@ type tableProgress struct { func (p *tableProgress) handleResolvedSpans(ctx context.Context, e *model.ResolvedSpans) (err error) { for _, resolvedSpan := range e.Spans { if !spanz.IsSubSpan(resolvedSpan.Span, p.spans...) 
{ + tableID := int64(0) + var startKey tablepb.Key + if len(p.spans) != 0 { + tableID = p.spans[0].TableID + startKey = p.spans[0].StartKey + } log.Panic("the resolved span is not in the table spans", zap.String("namespace", p.changefeed.Namespace), zap.String("changefeed", p.changefeed.ID), zap.String("tableName", p.tableName), - zap.Any("spans", p.spans)) + zap.Int64("tableID", tableID), + zap.Stringer("startKey", startKey), + zap.Int64("resolvedTableID", resolvedSpan.Span.TableID), + zap.Stringer("resolvedStartKey", resolvedSpan.Span.StartKey), + zap.Stringer("resolvedEndKey", resolvedSpan.Span.EndKey)) } p.tsTracker.Forward(resolvedSpan.Region, resolvedSpan.Span, e.ResolvedTs) if e.ResolvedTs > p.maxIngressResolvedTs.Load() { @@ -232,7 +242,8 @@ func (p *MultiplexingPuller) subscribe( log.Panic("redundant subscription", zap.String("namespace", p.changefeed.Namespace), zap.String("changefeed", p.changefeed.ID), - zap.String("span", span.String())) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) } } @@ -299,7 +310,8 @@ func (p *MultiplexingPuller) unsubscribe(spans []tablepb.Span) { log.Panic("unexist unsubscription", zap.String("namespace", p.changefeed.Namespace), zap.String("changefeed", p.changefeed.ID), - zap.String("span", span.String())) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) } } if len(progress.spans) != len(spans) { diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go index 58a9a7cbfa..8387fbcd13 100644 --- a/cdc/scheduler/internal/v3/agent/agent.go +++ b/cdc/scheduler/internal/v3/agent/agent.go @@ -113,9 +113,9 @@ func newAgent( // If we are registered in Etcd, an elected Owner will have to // contact us before it can schedule any table. log.Info("schedulerv3: no owner found. 
We will wait for an owner to contact us.", - zap.String("ownerCaptureID", ownerCaptureID), zap.String("namespace", changeFeedID.Namespace), zap.String("changefeed", changeFeedID.ID), + zap.String("ownerCaptureID", ownerCaptureID), zap.Error(err)) return result, nil } @@ -139,20 +139,18 @@ func newAgent( }) log.Info("schedulerv3: agent owner found", - zap.String("ownerCaptureID", ownerCaptureID), - zap.String("captureID", captureID), zap.String("namespace", changeFeedID.Namespace), - zap.String("changefeed", changeFeedID.ID)) + zap.String("changefeed", changeFeedID.ID), + zap.String("ownerCaptureID", ownerCaptureID)) revision, err := client.GetOwnerRevision(etcdCliCtx, ownerCaptureID) if err != nil { if errors.ErrOwnerNotFound.Equal(err) || errors.ErrNotOwner.Equal(err) { // These are expected errors when no owner has been elected log.Info("schedulerv3: no owner found when querying for the owner revision", - zap.String("ownerCaptureID", ownerCaptureID), - zap.String("captureID", captureID), zap.String("namespace", changeFeedID.Namespace), zap.String("changefeed", changeFeedID.ID), + zap.String("ownerCaptureID", ownerCaptureID), zap.Error(err)) return result, nil } @@ -256,10 +254,13 @@ func (a *agent) handleMessage(msg []*schedulepb.Message) (result []*schedulepb.M a.handleMessageDispatchTableRequest(message.DispatchTableRequest, processorEpoch) default: log.Warn("schedulerv3: unknown message received", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.Any("message", message)) + zap.String("captureID", ownerCaptureID), + zap.Stringer("type", message.GetMsgType()), + zap.Int64("ownerRevision", ownerRevision), + zap.String("processorEpoch", processorEpoch.Epoch), + zap.String("ownerVersion", ownerVersion)) } } return @@ -275,7 +276,8 @@ func (a *agent) handleMessageHeartbeat(request *schedulepb.Heartbeat) (*schedule log.Warn("schedulerv3: CheckpointTs is greater than ResolvedTs", zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.String("span", span.String())) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) } if table.task != nil && table.task.IsRemove { status.State = tablepb.TableStateStopping @@ -328,7 +330,6 @@ func (a *agent) handleMessageDispatchTableRequest( if a.Epoch != epoch { log.Info("schedulerv3: agent receive dispatch table request "+ "epoch does not match, ignore it", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), zap.String("epoch", epoch.Epoch), @@ -354,34 +355,16 @@ func (a *agent) handleMessageDispatchTableRequest( status: dispatchTableTaskReceived, } table = a.tableM.addTableSpan(span) - if req.AddTable.GetIsSecondary() { - log.Info("schedulerv3: agent received prepare table task", - zap.String("capture", a.CaptureID), - zap.String("namespace", a.ChangeFeedID.Namespace), - zap.String("changefeed", a.ChangeFeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey), - zap.Uint64("checkpointTs", req.AddTable.GetCheckpoint().CheckpointTs)) - } else { - log.Info("schedulerv3: agent received replicate table task", - zap.String("capture", a.CaptureID), - zap.String("namespace", a.ChangeFeedID.Namespace), - zap.String("changefeed", a.ChangeFeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey), - zap.Uint64("checkpointTs", 
req.AddTable.GetCheckpoint().CheckpointTs)) - } case *schedulepb.DispatchTableRequest_RemoveTable: span := req.RemoveTable.GetSpan() table, ok = a.tableM.getTableSpan(span) if !ok { log.Warn("schedulerv3: agent ignore remove table request, "+ "since the table not found", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.String("span", span.String()), - zap.Any("request", request)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) return } task = &dispatchTableTask{ @@ -390,18 +373,13 @@ func (a *agent) handleMessageDispatchTableRequest( Epoch: epoch, status: dispatchTableTaskReceived, } - log.Info("schedulerv3: agent received remove table task", - zap.String("capture", a.CaptureID), - zap.String("namespace", a.ChangeFeedID.Namespace), - zap.String("changefeed", a.ChangeFeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey)) default: log.Warn("schedulerv3: agent ignore unknown dispatch table request", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.Any("request", request)) + zap.Bool("hasAddTable", request.GetAddTable() != nil), + zap.Bool("hasRemoveTable", request.GetRemoveTable() != nil), + zap.String("epoch", epoch.Epoch)) return } table.injectDispatchTableTask(task) @@ -410,7 +388,6 @@ func (a *agent) handleMessageDispatchTableRequest( // Close implement agent interface func (a *agent) Close() error { log.Debug("schedulerv3: agent closed", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID)) return a.trans.Close() @@ -428,7 +405,6 @@ func (a *agent) handleOwnerInfo(id model.CaptureID, revision int64, version stri // with the same ownerRev but with different ownerIDs. // This should never happen unless the election via Etcd is buggy. log.Panic("schedulerv3: owner IDs do not match", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), zap.String("expected", a.ownerInfo.ID), @@ -450,27 +426,28 @@ func (a *agent) handleOwnerInfo(id model.CaptureID, revision int64, version stri id: &captureInfo, }) log.Info("schedulerv3: new owner in power", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.Any("owner", a.ownerInfo), zap.Any("agent", a)) + zap.String("ownerCaptureID", a.ownerInfo.ID), + zap.String("ownerVersion", a.ownerInfo.Version), + zap.Int64("ownerRevision", a.ownerInfo.Revision.Revision), + zap.String("captureID", a.CaptureID), + zap.String("processorEpoch", a.Epoch.Epoch)) return true } // staled owner heartbeat, just ignore it. 
log.Debug("schedulerv3: message from staled owner", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.Any("staledOwner", ownerInfo{ - CaptureInfo: model.CaptureInfo{ - ID: id, - Version: version, - }, - Revision: schedulepb.OwnerRevision{Revision: revision}, - }), - zap.Any("owner", a.ownerInfo), - zap.Any("agent", a.agentInfo)) + zap.String("staledOwnerCaptureID", id), + zap.String("staledOwnerVersion", version), + zap.Int64("staledOwnerRevision", revision), + zap.String("ownerCaptureID", a.ownerInfo.ID), + zap.String("ownerVersion", a.ownerInfo.Version), + zap.Int64("ownerRevision", a.ownerInfo.Revision.Revision), + zap.String("captureID", a.CaptureID), + zap.String("processorEpoch", a.Epoch.Epoch)) return false } @@ -503,10 +480,11 @@ func (a *agent) sendMsgs(ctx context.Context, msgs []*schedulepb.Message) error m := msgs[i] if m.MsgType == schedulepb.MsgUnknown { log.Panic("schedulerv3: invalid message no destination or unknown message type", - zap.String("capture", a.CaptureID), zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), - zap.Any("message", m)) + zap.Stringer("type", m.MsgType), + zap.String("to", m.To), + zap.String("from", m.From)) } m.Header = &schedulepb.Message_Header{ Version: a.Version, diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index 11dc369939..d35de74b5c 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -67,12 +67,16 @@ func (t *tableSpan) getTableSpanStatus(collectStat bool) tablepb.TableStatus { return t.executor.GetTableSpanStatus(t.span, collectStat) } -func newAddTableResponseMessage(status tablepb.TableStatus) *schedulepb.Message { +func newAddTableResponseMessage(changefeedID model.ChangeFeedID, status tablepb.TableStatus) *schedulepb.Message { if status.Checkpoint.ResolvedTs < status.Checkpoint.CheckpointTs { log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", - zap.Any("tableStatus", status), - zap.Any("checkpoint", status.Checkpoint.CheckpointTs), - zap.Any("resolved", status.Checkpoint.ResolvedTs)) + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Int64("tableID", status.Span.TableID), + zap.Stringer("startKey", status.Span.StartKey), + zap.Stringer("state", status.State), + zap.Uint64("checkpointTs", status.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", status.Checkpoint.ResolvedTs)) } return &schedulepb.Message{ MsgType: schedulepb.MsgDispatchTableResponse, @@ -87,7 +91,7 @@ func newAddTableResponseMessage(status tablepb.TableStatus) *schedulepb.Message } } -func newRemoveTableResponseMessage(status tablepb.TableStatus) *schedulepb.Message { +func newRemoveTableResponseMessage(changefeedID model.ChangeFeedID, status tablepb.TableStatus) *schedulepb.Message { if status.Checkpoint.ResolvedTs < status.Checkpoint.CheckpointTs { // TODO: resolvedTs should not be zero, but we have to handle it for now. 
if status.Checkpoint.ResolvedTs == 0 { @@ -95,9 +99,13 @@ func newRemoveTableResponseMessage(status tablepb.TableStatus) *schedulepb.Messa status.Checkpoint.ResolvedTs = status.Checkpoint.CheckpointTs } else { log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", - zap.Any("tableStatus", status), - zap.Any("checkpoint", status.Checkpoint.CheckpointTs), - zap.Any("resolved", status.Checkpoint.ResolvedTs)) + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Int64("tableID", status.Span.TableID), + zap.Stringer("startKey", status.Span.StartKey), + zap.Stringer("state", status.State), + zap.Uint64("checkpointTs", status.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", status.Checkpoint.ResolvedTs)) } } message := &schedulepb.Message{ @@ -124,9 +132,10 @@ func (t *tableSpan) handleRemoveTableTask() *schedulepb.Message { log.Warn("schedulerv3: remove table, but table is absent", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) t.task = nil - return newRemoveTableResponseMessage(t.getTableSpanStatus(false)) + return newRemoveTableResponseMessage(t.changefeedID, t.getTableSpanStatus(false)) case tablepb.TableStateStopping, // stopping now is useless tablepb.TableStateStopped: // release table resource, and get the latest checkpoint @@ -136,13 +145,13 @@ func (t *tableSpan) handleRemoveTableTask() *schedulepb.Message { // actually, this should never be hit, since we know that table is stopped. status := t.getTableSpanStatus(false) status.State = tablepb.TableStateStopping - return newRemoveTableResponseMessage(status) + return newRemoveTableResponseMessage(t.changefeedID, status) } t.task = nil status := t.getTableSpanStatus(false) status.State = tablepb.TableStateStopped status.Checkpoint.CheckpointTs = checkpointTs - return newRemoveTableResponseMessage(status) + return newRemoveTableResponseMessage(t.changefeedID, status) case tablepb.TableStatePreparing, tablepb.TableStatePrepared, tablepb.TableStateReplicating: @@ -150,14 +159,16 @@ func (t *tableSpan) handleRemoveTableTask() *schedulepb.Message { if !done { status := t.getTableSpanStatus(false) status.State = tablepb.TableStateStopping - return newRemoveTableResponseMessage(status) + return newRemoveTableResponseMessage(t.changefeedID, status) } state, changed = t.getAndUpdateTableSpanState() default: log.Panic("schedulerv3: unknown table state", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), zap.Stringer("state", state)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Stringer("state", state)) } } return nil @@ -174,21 +185,24 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. 
log.Warn("schedulerv3: agent add table failed", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), zap.Any("task", t.task), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("checkpointTs", t.task.Checkpoint.CheckpointTs), + zap.Bool("isPrepare", t.task.IsPrepare), zap.Error(err)) status := t.getTableSpanStatus(false) - return newAddTableResponseMessage(status), errors.Trace(err) + return newAddTableResponseMessage(t.changefeedID, status), errors.Trace(err) } state, changed = t.getAndUpdateTableSpanState() case tablepb.TableStateReplicating: t.task = nil status := t.getTableSpanStatus(false) - return newAddTableResponseMessage(status), nil + return newAddTableResponseMessage(t.changefeedID, status), nil case tablepb.TableStatePrepared: if t.task.IsPrepare { // `prepared` is a stable state, if the task was to prepare the table. t.task = nil - return newAddTableResponseMessage(t.getTableSpanStatus(false)), nil + return newAddTableResponseMessage(t.changefeedID, t.getTableSpanStatus(false)), nil } if t.task.status == dispatchTableTaskReceived { @@ -197,17 +211,20 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. log.Warn("schedulerv3: agent add table failed", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), zap.Stringer("state", state), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Stringer("state", state), + zap.Uint64("checkpointTs", t.task.Checkpoint.CheckpointTs), zap.Error(err)) status := t.getTableSpanStatus(false) - return newAddTableResponseMessage(status), errors.Trace(err) + return newAddTableResponseMessage(t.changefeedID, status), errors.Trace(err) } t.task.status = dispatchTableTaskProcessed } done := t.executor.IsAddTableSpanFinished(t.task.Span, false) if !done { - return newAddTableResponseMessage(t.getTableSpanStatus(false)), nil + return newAddTableResponseMessage(t.changefeedID, t.getTableSpanStatus(false)), nil } state, changed = t.getAndUpdateTableSpanState() case tablepb.TableStatePreparing: @@ -223,14 +240,16 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. 
log.Warn("schedulerv3: ignore add table", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) t.task = nil - return newAddTableResponseMessage(t.getTableSpanStatus(false)), nil + return newAddTableResponseMessage(t.changefeedID, t.getTableSpanStatus(false)), nil default: log.Panic("schedulerv3: unknown table state", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) } } @@ -242,11 +261,34 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { log.Panic("schedulerv3: tableID not match", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), - zap.Any("tableSpan", t.span), - zap.Stringer("task.TableID", &task.Span)) + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Int64("taskTableID", task.Span.TableID), + zap.Stringer("taskStartKey", task.Span.StartKey)) } if t.task == nil { t.task = task + if task.IsRemove { + log.Info("schedulerv3: agent accepted remove table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) + } else if task.IsPrepare { + log.Info("schedulerv3: agent accepted prepare table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("checkpointTs", task.Checkpoint.CheckpointTs)) + } else { + log.Info("schedulerv3: agent accepted replicate table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("checkpointTs", task.Checkpoint.CheckpointTs)) + } return } if t.task.Span.Eq(&task.Span) && @@ -353,7 +395,8 @@ func (tm *tableSpanManager) dropTableSpan(span tablepb.Span) { log.Warn("schedulerv3: tableManager drop table not found", zap.String("namespace", tm.changefeedID.Namespace), zap.String("changefeed", tm.changefeedID.ID), - zap.String("span", span.String())) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) return } state, _ := table.getAndUpdateTableSpanState() @@ -361,7 +404,8 @@ func (tm *tableSpanManager) dropTableSpan(span tablepb.Span) { log.Panic("schedulerv3: tableManager drop table undesired", zap.String("namespace", tm.changefeedID.Namespace), zap.String("changefeed", tm.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.Stringer("state", table.state)) } tm.tables.Delete(span) diff --git a/cdc/scheduler/internal/v3/coordinator.go b/cdc/scheduler/internal/v3/coordinator.go index 8b8eb577f7..58d033b0bb 100644 --- a/cdc/scheduler/internal/v3/coordinator.go +++ b/cdc/scheduler/internal/v3/coordinator.go @@ -260,7 +260,7 @@ func (c *coordinator) Close(ctx context.Context) { log.Info("schedulerv3: coordinator closed", zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), - zap.Any("ownerRev", c.captureM.OwnerRev)) + zap.Int64("ownerRevision", 
c.captureM.OwnerRev.Revision)) } // =========== @@ -408,12 +408,39 @@ func (c *coordinator) recvMsgs(ctx context.Context) ([]*schedulepb.Message, erro func (c *coordinator) sendMsgs(ctx context.Context, msgs []*schedulepb.Message) error { for i := range msgs { m := msgs[i] + header := m.GetHeader() + ownerRevision := int64(0) + processorEpoch := "" + if header != nil { + ownerRevision = header.OwnerRevision.Revision + processorEpoch = header.ProcessorEpoch.Epoch + } + dispatchRequest := m.GetDispatchTableRequest() + addTable := dispatchRequest.GetAddTable() + removeTable := dispatchRequest.GetRemoveTable() + tableID := model.TableID(0) + if addTable != nil { + tableID = addTable.Span.TableID + } else if removeTable != nil { + tableID = removeTable.Span.TableID + } // Correctness check. if len(m.To) == 0 || m.MsgType == schedulepb.MsgUnknown { log.Panic("invalid message no destination or unknown message type", zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), - zap.Any("message", m)) + zap.Stringer("type", m.MsgType), + zap.String("from", m.From), + zap.String("to", m.To), + zap.Int64("ownerRevision", ownerRevision), + zap.String("processorEpoch", processorEpoch), + zap.Bool("hasDispatchTableRequest", dispatchRequest != nil), + zap.Bool("hasDispatchTableResponse", m.GetDispatchTableResponse() != nil), + zap.Bool("hasHeartbeat", m.GetHeartbeat() != nil), + zap.Bool("hasHeartbeatResponse", m.GetHeartbeatResponse() != nil), + zap.Bool("isAddTable", addTable != nil), + zap.Bool("isRemoveTable", removeTable != nil), + zap.Int64("tableID", tableID)) } epoch := schedulepb.ProcessorEpoch{} diff --git a/cdc/scheduler/internal/v3/keyspan/reconciler.go b/cdc/scheduler/internal/v3/keyspan/reconciler.go index 65f735a7e6..8380ffedf5 100644 --- a/cdc/scheduler/internal/v3/keyspan/reconciler.go +++ b/cdc/scheduler/internal/v3/keyspan/reconciler.go @@ -146,8 +146,8 @@ func (m *Reconciler) Reconcile( } // 3. owner switch after some captures failed.
log.Info("schedulerv3: detect owner switch after captures fail", - zap.String("changefeed", m.changefeedID.ID), zap.String("namespace", m.changefeedID.Namespace), + zap.String("changefeed", m.changefeedID.ID), zap.Int64("tableID", tableID), zap.Int("holes", len(holes)), zap.Stringer("startKey", tableStart.StartKey), diff --git a/cdc/scheduler/internal/v3/keyspan/splitter_region_count.go b/cdc/scheduler/internal/v3/keyspan/splitter_region_count.go index 49205f928d..61e249b97a 100644 --- a/cdc/scheduler/internal/v3/keyspan/splitter_region_count.go +++ b/cdc/scheduler/internal/v3/keyspan/splitter_region_count.go @@ -53,7 +53,9 @@ func (m *regionCountSplitter) split( log.Warn("schedulerv3: list regions failed, skip split span", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey), zap.Error(err)) return []tablepb.Span{span} } @@ -61,7 +63,9 @@ func (m *regionCountSplitter) split( log.Info("schedulerv3: skip split span by region count", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey), zap.Int("totalCaptures", captureNum), zap.Int("regionCount", len(regions)), zap.Int("regionThreshold", m.regionThreshold)) @@ -82,8 +86,11 @@ func (m *regionCountSplitter) split( log.Warn("schedulerv3: list region out of order detected", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), - zap.Stringer("lastSpan", &spans[len(spans)-1]), + zap.Int64("tableID", span.TableID), + zap.Stringer("spanStartKey", span.StartKey), + zap.Stringer("spanEndKey", span.EndKey), + zap.Stringer("lastStartKey", spans[len(spans)-1].StartKey), + zap.Stringer("lastEndKey", spans[len(spans)-1].EndKey), zap.String("startKey", hex.EncodeToString(startKey)), zap.String("endKey", hex.EncodeToString(endKey))) return []tablepb.Span{span} @@ -111,7 +118,9 @@ func (m *regionCountSplitter) split( log.Info("schedulerv3: split span by region count", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey), zap.Int("spans", len(spans)), zap.Int("totalCaptures", captureNum), zap.Int("regionCount", len(regions)), @@ -145,7 +154,6 @@ func newEvenlySplitStepper(pages int, totalRegion int) evenlySplitStepper { extraRegionPerSpan: extraRegionPerSpan, remain: remain, } - log.Info("schedulerv3: evenly split stepper", zap.Any("evenlySplitStepper", res)) return res } diff --git a/cdc/scheduler/internal/v3/keyspan/splitter_write.go b/cdc/scheduler/internal/v3/keyspan/splitter_write.go index 653328ba61..43364b7d4c 100644 --- a/cdc/scheduler/internal/v3/keyspan/splitter_write.go +++ b/cdc/scheduler/internal/v3/keyspan/splitter_write.go @@ -65,7 +65,9 @@ func (m *writeSplitter) split( log.Warn("schedulerv3: scan regions failed, skip split span", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey), 
zap.Error(err)) return nil } @@ -76,7 +78,9 @@ func (m *writeSplitter) split( " the maxSpanRegionLimit, skip split span", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey), zap.Error(err)) return []tablepb.Span{span} } @@ -85,7 +89,9 @@ func (m *writeSplitter) split( log.Info("schedulerv3: split span by written keys", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey), zap.Ints("perSpanRegionCounts", splitInfo.RegionCounts), zap.Uint64s("weights", splitInfo.Weights), zap.Int("spans", len(splitInfo.Spans)), @@ -260,8 +266,12 @@ func (m *writeSplitter) splitRegionsByWrittenKeysV1( zap.Int("regionsLength", len(regions)), zap.Int("restSpans", restSpans), zap.Int64("restWeight", restWeight), - zap.Any("prevSpan", spans[len(spans)-2]), - zap.Any("lastSpan", spans[len(spans)-1]), + zap.Int64("prevTableID", spans[len(spans)-2].TableID), + zap.Stringer("prevStartKey", spans[len(spans)-2].StartKey), + zap.Stringer("prevEndKey", spans[len(spans)-2].EndKey), + zap.Int64("tableID", spans[len(spans)-1].TableID), + zap.Stringer("startKey", spans[len(spans)-1].StartKey), + zap.Stringer("endKey", spans[len(spans)-1].EndKey), ) } return &splitRegionsInfo{ diff --git a/cdc/scheduler/internal/v3/replication/replication_manager.go b/cdc/scheduler/internal/v3/replication/replication_manager.go index 654a8bf5b5..cba9d50a09 100644 --- a/cdc/scheduler/internal/v3/replication/replication_manager.go +++ b/cdc/scheduler/internal/v3/replication/replication_manager.go @@ -183,7 +183,8 @@ func (r *Manager) HandleCaptureChanges( log.Panic("schedulerv3: init again", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.Any("init", init), zap.Any("tablesCount", r.spans.Len())) + zap.Int("initCaptureCount", len(init)), + zap.Int("trackedTableCount", r.spans.Len())) } spanStatusMap := spanz.NewBtreeMap[map[model.CaptureID]*tablepb.TableStatus]() for captureID, spans := range init { @@ -256,10 +257,21 @@ func (r *Manager) HandleMessage( } sentMsgs = append(sentMsgs, msgs...) 
default: + header := msg.GetHeader() + ownerRevision := int64(0) + processorEpoch := "" + if header != nil { + ownerRevision = header.OwnerRevision.Revision + processorEpoch = header.ProcessorEpoch.Epoch + } log.Warn("schedulerv3: ignore message", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.Stringer("type", msg.MsgType), zap.Any("message", msg)) + zap.Stringer("type", msg.MsgType), + zap.String("from", msg.GetFrom()), + zap.String("to", msg.GetTo()), + zap.Int64("ownerRevision", ownerRevision), + zap.String("processorEpoch", processorEpoch)) } } return sentMsgs, nil @@ -275,8 +287,12 @@ func (r *Manager) handleMessageHeartbeatResponse( log.Info("schedulerv3: ignore table status no table found", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.Any("from", from), - zap.Any("message", status)) + zap.String("captureID", from), + zap.Int64("tableID", status.Span.TableID), + zap.Stringer("startKey", status.Span.StartKey), + zap.Stringer("state", status.State), + zap.Uint64("checkpointTs", status.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", status.Checkpoint.ResolvedTs)) continue } msgs, err := table.handleTableStatus(from, &status) @@ -304,7 +320,9 @@ func (r *Manager) handleMessageDispatchTableResponse( log.Warn("schedulerv3: ignore unknown dispatch table response", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.Any("message", msg)) + zap.String("captureID", from), + zap.Bool("hasAddTable", msg.GetAddTable() != nil), + zap.Bool("hasRemoveTable", msg.GetRemoveTable() != nil)) return nil, nil } @@ -313,7 +331,12 @@ func (r *Manager) handleMessageDispatchTableResponse( log.Info("schedulerv3: ignore table status no table found", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.Any("message", status)) + zap.String("captureID", from), + zap.Int64("tableID", status.Span.TableID), + zap.Stringer("startKey", status.Span.StartKey), + zap.Stringer("state", status.State), + zap.Uint64("checkpointTs", status.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", status.Checkpoint.ResolvedTs)) return nil, nil } msgs, err := table.handleTableStatus(from, status) @@ -386,17 +409,52 @@ func (r *Manager) HandleTasks( // Skip task if the table is already running a task, // or the table has removed. 
if _, ok := r.runningTasks.Get(span); ok { - log.Info("schedulerv3: ignore task, already exists", - zap.String("namespace", r.changefeedID.Namespace), - zap.String("changefeed", r.changefeedID.ID), - zap.Any("task", task)) + if task.AddTable != nil { + log.Info("schedulerv3: ignore task, already exists", + zap.String("namespace", r.changefeedID.Namespace), + zap.String("changefeed", r.changefeedID.ID), + zap.String("task", task.Name()), + zap.String("captureID", task.AddTable.CaptureID), + zap.Int64("tableID", task.AddTable.Span.TableID), + zap.Stringer("startKey", task.AddTable.Span.StartKey), + zap.Uint64("checkpointTs", task.AddTable.CheckpointTs)) + } else if task.RemoveTable != nil { + log.Info("schedulerv3: ignore task, already exists", + zap.String("namespace", r.changefeedID.Namespace), + zap.String("changefeed", r.changefeedID.ID), + zap.String("task", task.Name()), + zap.String("captureID", task.RemoveTable.CaptureID), + zap.Int64("tableID", task.RemoveTable.Span.TableID), + zap.Stringer("startKey", task.RemoveTable.Span.StartKey)) + } else if task.MoveTable != nil { + log.Info("schedulerv3: ignore task, already exists", + zap.String("namespace", r.changefeedID.Namespace), + zap.String("changefeed", r.changefeedID.ID), + zap.String("task", task.Name()), + zap.String("captureID", task.MoveTable.DestCapture), + zap.Int64("tableID", task.MoveTable.Span.TableID), + zap.Stringer("startKey", task.MoveTable.Span.StartKey)) + } continue } if _, ok := r.spans.Get(span); !ok && task.AddTable == nil { - log.Info("schedulerv3: ignore task, table not found", - zap.String("namespace", r.changefeedID.Namespace), - zap.String("changefeed", r.changefeedID.ID), - zap.Any("task", task)) + if task.RemoveTable != nil { + log.Info("schedulerv3: ignore task, table not found", + zap.String("namespace", r.changefeedID.Namespace), + zap.String("changefeed", r.changefeedID.ID), + zap.String("task", task.Name()), + zap.String("captureID", task.RemoveTable.CaptureID), + zap.Int64("tableID", task.RemoveTable.Span.TableID), + zap.Stringer("startKey", task.RemoveTable.Span.StartKey)) + } else if task.MoveTable != nil { + log.Info("schedulerv3: ignore task, table not found", + zap.String("namespace", r.changefeedID.Namespace), + zap.String("changefeed", r.changefeedID.ID), + zap.String("task", task.Name()), + zap.String("captureID", task.MoveTable.DestCapture), + zap.Int64("tableID", task.MoveTable.Span.TableID), + zap.Stringer("startKey", task.MoveTable.Span.StartKey)) + } continue } @@ -587,8 +645,11 @@ func (r *Manager) AdvanceCheckpoint( log.Warn("schedulerv3: span hole detected, skip advance checkpoint", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.String("lastSpan", lastSpan.String()), - zap.String("span", span.String())) + zap.Int64("tableID", span.TableID), + zap.Stringer("lastStartKey", lastSpan.StartKey), + zap.Stringer("lastEndKey", lastSpan.EndKey), + zap.Stringer("startKey", span.StartKey), + zap.Stringer("endKey", span.EndKey)) tableHasHole = true return false } @@ -675,7 +736,7 @@ func (r *Manager) AdvanceCheckpoint( zap.String("changefeed", r.changefeedID.ID), zap.Uint64("newCheckpointTs", watermark.CheckpointTs), zap.Uint64("newResolvedTs", watermark.ResolvedTs), - zap.Any("currentTables", currentTables)) + zap.Int("currentTableCount", currentTables.Len())) } watermark.ResolvedTs = barrier.GlobalBarrierTs watermark.CheckpointTs = barrier.MinTableBarrierTs diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go 
b/cdc/scheduler/internal/v3/replication/replication_set.go index d51c22f395..d3a490929f 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -232,12 +232,14 @@ func NewReplicationSet( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", table.Checkpoint), + zap.Uint64("inputCheckpointTs", table.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", table.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", table), + zap.Stringer("inputState", table.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) err := r.setCapture(captureID, RoleUndetermined) if err != nil { return nil, errors.Trace(err) @@ -252,12 +254,14 @@ func NewReplicationSet( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", table.Checkpoint), + zap.Uint64("inputCheckpointTs", table.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", table.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", table), + zap.Stringer("inputState", table.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) } } @@ -333,11 +337,13 @@ func (r *ReplicationSet) promoteSecondary(captureID model.CaptureID) error { zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil } role, ok := r.Captures[captureID] @@ -368,12 +374,14 @@ func (r *ReplicationSet) inconsistentError( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), }...) log.L().WithOptions(zap.AddCallerSkip(1)).Error(msg, fields...) 
return errors.ErrReplicationSetInconsistent.GenWithStackByArgs( @@ -388,12 +396,14 @@ func (r *ReplicationSet) multiplePrimaryError( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), }...) log.L().WithOptions(zap.AddCallerSkip(1)).Error(msg, fields...) return errors.ErrReplicationSetMultiplePrimaryError.GenWithStackByArgs( @@ -510,12 +520,14 @@ func (r *ReplicationSet) pollOnAbsent( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return false, nil } @@ -573,12 +585,14 @@ func (r *ReplicationSet) pollOnPrepare( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) r.clearPrimary() return nil, false, nil } @@ -602,12 +616,14 @@ func (r *ReplicationSet) pollOnPrepare( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, true, nil } } @@ -616,12 +632,14 @@ func (r *ReplicationSet) pollOnPrepare( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) 
return nil, false, nil } @@ -655,12 +673,14 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", captureID), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, false, nil } // No primary, promote secondary to primary. @@ -700,12 +720,14 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, true, nil } // Primary is stopped, promote secondary to primary. @@ -718,12 +740,14 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", secondary), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), zap.String("original", original), ) return &schedulepb.Message{ @@ -757,22 +781,26 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", captureID), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) } else { log.Info("schedulerv3: secondary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", captureID), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) } return nil, true, nil } else if r.isInRole(captureID, RoleUndetermined) { @@ -781,12 +809,14 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), 
zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) err := r.clearCapture(captureID, RoleUndetermined) return nil, false, errors.Trace(err) } @@ -845,12 +875,14 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, false, nil } @@ -861,12 +893,14 @@ func (r *ReplicationSet) pollOnCommit( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, false, nil } @@ -900,12 +934,14 @@ func (r *ReplicationSet) pollOnReplicating( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, true, nil } } @@ -914,12 +950,14 @@ func (r *ReplicationSet) pollOnReplicating( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, false, nil } @@ -957,12 +995,14 @@ func (r *ReplicationSet) pollOnRemoving( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", 
input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey), zap.Error(err)) } return nil, false, nil @@ -974,12 +1014,14 @@ func (r *ReplicationSet) pollOnRemoving( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", input.Checkpoint), + zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), - zap.Stringer("tableState", input), + zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, false, nil } @@ -999,11 +1041,13 @@ func (r *ReplicationSet) handleAddTable( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, nil } err := r.setCapture(captureID, RoleSecondary) @@ -1027,34 +1071,50 @@ func (r *ReplicationSet) handleMoveTable( ) ([]*schedulepb.Message, error) { if r.hasRemoved() { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: move table is ignored, since it removed already", + log.Warn("schedulerv3: move table is ignored, table already removed", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", dest), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, nil } - // Ignore move table if - // 1) it's not in Replicating state or - // 2) the dest capture is the primary. 
- if r.State != ReplicationSetStateReplicating || r.Primary == dest { + + if r.State != ReplicationSetStateReplicating { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: move table is ignored, since it's not replicating or the primary is the same as the move destination", + log.Warn("schedulerv3: move table is ignored, table is not replicating", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", dest), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) + return nil, nil + } + if r.Primary == dest { + secondary, _ := r.getRole(RoleSecondary) + log.Warn("schedulerv3: move table is ignored, target capture is already primary", + zap.String("namespace", r.Changefeed.Namespace), + zap.String("changefeed", r.Changefeed.ID), + zap.String("captureID", dest), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), + zap.Stringer("state", r.State), + zap.String("primary", r.Primary), + zap.String("secondary", secondary), + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, nil } - oldState := r.State r.State = ReplicationSetStatePrepare err := r.setCapture(dest, RoleSecondary) if err != nil { @@ -1065,12 +1125,12 @@ func (r *ReplicationSet) handleMoveTable( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", dest), - zap.Any("checkpoint", r.Checkpoint), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) status := tablepb.TableStatus{ Span: r.Span, State: tablepb.TableStateAbsent, @@ -1083,44 +1143,45 @@ func (r *ReplicationSet) handleRemoveTable() ([]*schedulepb.Message, error) { // Ignore remove table if it has been removed already. if r.hasRemoved() { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: remove table is ignored", + log.Warn("schedulerv3: remove table is ignored, table already removed", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", r.Primary), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, nil } // Ignore remove table if it's not in Replicating state. 
if r.State != ReplicationSetStateReplicating { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: remove table is ignored", + log.Warn("schedulerv3: remove table is ignored, table is not replicating", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", r.Primary), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return nil, nil } - oldState := r.State r.State = ReplicationSetStateRemoving secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, remove table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", r.Primary), - zap.Any("checkpoint", r.Checkpoint), - zap.Stringer("old", oldState), - zap.Stringer("new", r.State), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) status := tablepb.TableStatus{ Span: r.Span, State: tablepb.TableStateReplicating, @@ -1161,12 +1222,14 @@ func (r *ReplicationSet) handleCaptureShutdown( zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.String("captureID", captureID), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) return msgs, true, errors.Trace(err) } @@ -1178,11 +1241,13 @@ func (r *ReplicationSet) updateCheckpointAndStats( log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("checkpoint", checkpoint), + zap.Uint64("inputCheckpointTs", checkpoint.CheckpointTs), + zap.Uint64("inputResolvedTs", checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) // TODO: resolvedTs should not be zero, but we have to handle it for now. 
if checkpoint.ResolvedTs == 0 { @@ -1200,11 +1265,13 @@ func (r *ReplicationSet) updateCheckpointAndStats( log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.Any("checkpoint", r.Checkpoint), + zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &r.Span)) + zap.Int64("tableID", r.Span.TableID), + zap.Stringer("startKey", r.Span.StartKey)) } if r.Checkpoint.LastSyncedTs < checkpoint.LastSyncedTs { diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go index 9fb9481034..8c9d969142 100644 --- a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go +++ b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go @@ -78,13 +78,17 @@ func (b *basicScheduler) Schedule( // Build add table tasks. if len(newSpans) > 0 { captureIDs := make([]model.CaptureID, 0, len(captures)) + stoppingCaptureCount := 0 for captureID, status := range captures { if status.State == member.CaptureStateStopping { + stoppingCaptureCount++ log.Warn("schedulerv3: capture is stopping, "+ "skip the capture when add new table", zap.String("namespace", b.changefeedID.Namespace), zap.String("changefeed", b.changefeedID.ID), - zap.Any("captureStatus", status)) + zap.String("captureID", captureID), + zap.Stringer("state", status.State), + zap.Bool("isOwner", status.IsOwner)) continue } captureIDs = append(captureIDs, captureID) @@ -98,11 +102,12 @@ func (b *basicScheduler) Schedule( log.Warn("schedulerv3: cannot found capture when add new table", zap.String("namespace", b.changefeedID.Namespace), zap.String("changefeed", b.changefeedID.ID), - zap.Any("allCaptureStatus", captures)) + zap.Int("captureCount", len(captures)), + zap.Int("stoppingCaptureCount", stoppingCaptureCount)) return tasks } tasks = append( - tasks, newBurstAddTables(b.changefeedID, checkpointTs, newSpans, captureIDs)) + tasks, newBurstAddTables(checkpointTs, newSpans, captureIDs)) } // Build remove table tasks. @@ -139,12 +144,10 @@ func (b *basicScheduler) Schedule( // newBurstAddTables add each new table to captures in a round-robin way. 
func newBurstAddTables( - changefeedID model.ChangeFeedID, checkpointTs model.Ts, newSpans []tablepb.Span, captureIDs []model.CaptureID, ) *replication.ScheduleTask { idx := 0 tables := make([]replication.AddTable, 0, len(newSpans)) - tablesPerCapture := make(map[model.CaptureID][]int64, len(captureIDs)) for _, span := range newSpans { targetCapture := captureIDs[idx] tables = append(tables, replication.AddTable{ @@ -152,19 +155,11 @@ func newBurstAddTables( CaptureID: targetCapture, CheckpointTs: checkpointTs, }) - tablesPerCapture[targetCapture] = append(tablesPerCapture[targetCapture], span.TableID) idx++ if idx >= len(captureIDs) { idx = 0 } } - for captureID, tableIDs := range tablesPerCapture { - log.Info("schedulerv3: burst add tables", - zap.String("namespace", changefeedID.Namespace), - zap.String("changefeed", changefeedID.ID), - zap.String("captureID", captureID), - zap.Int64s("tableIDs", tableIDs)) - } return &replication.ScheduleTask{ BurstBalance: &replication.BurstBalance{ AddTables: tables, @@ -177,7 +172,6 @@ func newBurstRemoveTables( changefeedID model.ChangeFeedID, ) *replication.ScheduleTask { tables := make([]replication.RemoveTable, 0, len(rmSpans)) - tablesPerCapture := make(map[model.CaptureID][]int64) for _, span := range rmSpans { rep := replications.GetV(span) var captureID model.CaptureID @@ -197,32 +191,25 @@ func newBurstRemoveTables( "this may happen if the capture shutdown", zap.String("namespace", changefeedID.Namespace), zap.String("changefeed", changefeedID.ID), - zap.Any("checkpoint", rep.Checkpoint), + zap.Uint64("checkpointTs", rep.Checkpoint.CheckpointTs), + zap.Uint64("resolvedTs", rep.Checkpoint.ResolvedTs), zap.Stringer("state", rep.State), zap.String("primary", rep.Primary), zap.String("secondary", secondary), - zap.Stringer("span", &span)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) continue } tables = append(tables, replication.RemoveTable{ Span: span, CaptureID: captureID, }) - tablesPerCapture[captureID] = append(tablesPerCapture[captureID], span.TableID) } if len(tables) == 0 { return nil } - for captureID, tableIDs := range tablesPerCapture { - log.Info("schedulerv3: burst remove table", - zap.String("namespace", changefeedID.Namespace), - zap.String("changefeed", changefeedID.ID), - zap.String("captureID", captureID), - zap.Int64s("tableIDs", tableIDs)) - } - return &replication.ScheduleTask{ BurstBalance: &replication.BurstBalance{ RemoveTables: tables, diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go b/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go index 66de141b7a..9be0d8a5c6 100644 --- a/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go +++ b/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go @@ -119,7 +119,8 @@ func (d *drainCaptureScheduler) Schedule( "since cannot found destination captures", zap.String("namespace", d.changefeedID.Namespace), zap.String("changefeed", d.changefeedID.ID), - zap.String("target", d.target), zap.Any("captures", captures)) + zap.String("target", d.target), + zap.Int("captureCount", len(captures))) d.target = captureIDNotDraining return nil } @@ -136,7 +137,10 @@ func (d *drainCaptureScheduler) Schedule( zap.String("namespace", d.changefeedID.Namespace), zap.String("changefeed", d.changefeedID.ID), zap.String("target", d.target), - zap.Any("replication", rep)) + zap.String("captureID", rep.Primary), + zap.Stringer("state", rep.State), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", 
span.StartKey)) skipDrain = true return false } @@ -186,7 +190,8 @@ func (d *drainCaptureScheduler) Schedule( log.Panic("schedulerv3: drain capture meet unexpected min workload", zap.String("namespace", d.changefeedID.Namespace), zap.String("changefeed", d.changefeedID.ID), - zap.Any("workload", captureWorkload)) + zap.Int("captureCount", len(captureWorkload)), + zap.Int("victimSpanCount", len(victimSpans))) } result = append(result, &replication.ScheduleTask{ diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go b/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go index f41921fc75..84845958d5 100644 --- a/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go +++ b/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go @@ -108,7 +108,8 @@ func (sm *Manager) MoveTable(span tablepb.Span, target model.CaptureID) { "since the last triggered task not finished", zap.String("namespace", sm.changefeedID.Namespace), zap.String("changefeed", sm.changefeedID.ID), - zap.String("span", span.String()), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.String("targetCapture", target)) } } diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go b/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go index 10b44de77e..5407b92aea 100644 --- a/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go +++ b/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go @@ -101,8 +101,9 @@ func (m *moveTableScheduler) Schedule( log.Warn("schedulerv3: move table ignored, since the table cannot found", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), - zap.String("captureID", task.MoveTable.DestCapture)) + zap.String("captureID", task.MoveTable.DestCapture), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) toBeDeleted = append(toBeDeleted, span) return true } @@ -113,8 +114,9 @@ func (m *moveTableScheduler) Schedule( log.Info("schedulerv3: move table ignored, since the target capture cannot found", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), - zap.String("captureID", task.MoveTable.DestCapture)) + zap.String("captureID", task.MoveTable.DestCapture), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) toBeDeleted = append(toBeDeleted, span) return true } @@ -122,9 +124,10 @@ func (m *moveTableScheduler) Schedule( log.Warn("schedulerv3: move table ignored, target capture is not initialized", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), zap.String("captureID", task.MoveTable.DestCapture), - zap.Any("state", status.State)) + zap.Stringer("state", status.State), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) toBeDeleted = append(toBeDeleted, span) return true } @@ -134,8 +137,9 @@ func (m *moveTableScheduler) Schedule( log.Warn("schedulerv3: move table ignored, table not found in the replication set", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), - zap.String("captureID", task.MoveTable.DestCapture)) + zap.String("captureID", task.MoveTable.DestCapture), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) toBeDeleted = append(toBeDeleted, span) return true } @@ -144,9 
+148,11 @@ func (m *moveTableScheduler) Schedule( log.Info("schedulerv3: move table ignored, since the table is not replicating now", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.String("span", span.String()), zap.String("captureID", task.MoveTable.DestCapture), - zap.Any("replicationState", rep.State)) + zap.Stringer("state", rep.State), + zap.String("primary", rep.Primary), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) toBeDeleted = append(toBeDeleted, span) } return true diff --git a/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go b/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go index 39c6965f6e..02640cc01d 100644 --- a/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go +++ b/cdc/sink/ddlsink/cloudstorage/cloud_storage_ddl_sink.go @@ -175,7 +175,7 @@ func (d *DDLSink) bgCleanup(ctx context.Context) { if d.cfg.DateSeparator != config.DateSeparatorDay.String() || d.cfg.FileExpirationDays <= 0 { log.Info("skip cleanup expired files for storage sink", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.String("dateSeparator", d.cfg.DateSeparator), zap.Int("expiredFileTTL", d.cfg.FileExpirationDays)) return @@ -185,7 +185,7 @@ func (d *DDLSink) bgCleanup(ctx context.Context) { defer d.cron.Stop() log.Info("start schedule cleanup expired files for storage sink", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.String("dateSeparator", d.cfg.DateSeparator), zap.Int("expiredFileTTL", d.cfg.FileExpirationDays)) @@ -193,7 +193,7 @@ func (d *DDLSink) bgCleanup(ctx context.Context) { <-ctx.Done() log.Info("stop schedule cleanup expired files for storage sink", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.Error(ctx.Err())) } @@ -207,7 +207,7 @@ func (d *DDLSink) genCleanupJob(ctx context.Context, uri *url.URL) []func() { if !isRemoveEmptyDirsRuning.CompareAndSwap(false, true) { log.Warn("remove empty dirs is already running, skip this round", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID)) + zap.String("changefeed", d.id.ID)) return } @@ -217,7 +217,7 @@ func (d *DDLSink) genCleanupJob(ctx context.Context, uri *url.URL) []func() { if err != nil { log.Error("failed to remove empty dirs", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.Uint64("checkpointTs", checkpointTs), zap.Duration("cost", time.Since(start)), zap.Error(err), @@ -226,7 +226,7 @@ func (d *DDLSink) genCleanupJob(ctx context.Context, uri *url.URL) []func() { } log.Info("remove empty dirs", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.Uint64("checkpointTs", checkpointTs), zap.Uint64("count", cnt), zap.Duration("cost", time.Since(start))) @@ -238,7 +238,7 @@ func (d *DDLSink) genCleanupJob(ctx context.Context, uri *url.URL) []func() { if !isCleanupRunning.CompareAndSwap(false, true) { log.Warn("cleanup expired files is already running, skip this round", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID)) + zap.String("changefeed", d.id.ID)) return } @@ -249,7 +249,7 @@ func (d *DDLSink) genCleanupJob(ctx context.Context, uri *url.URL) []func() { if err != nil { log.Error("failed to remove expired files", 
zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.Uint64("checkpointTs", checkpointTs), zap.Duration("cost", time.Since(start)), zap.Error(err), @@ -258,7 +258,7 @@ func (d *DDLSink) genCleanupJob(ctx context.Context, uri *url.URL) []func() { } log.Info("remove expired files", zap.String("namespace", d.id.Namespace), - zap.String("changefeedID", d.id.ID), + zap.String("changefeed", d.id.ID), zap.Uint64("checkpointTs", checkpointTs), zap.Uint64("count", cnt), zap.Duration("cost", time.Since(start))) diff --git a/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go b/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go index ec91dd2c5a..b520924df9 100644 --- a/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go +++ b/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go @@ -63,7 +63,11 @@ func (p *pulsarProducers) SyncSendMessage(ctx context.Context, topic string, producer, err := p.GetProducerByTopic(topic) if err != nil { - log.Error("ddl SyncSendMessage GetProducerByTopic fail", zap.Error(err)) + log.Error("ddl SyncSendMessage GetProducerByTopic fail", + zap.String("namespace", p.id.Namespace), + zap.String("changefeed", p.id.ID), + zap.String("topic", topic), + zap.Error(err)) return err } @@ -73,19 +77,29 @@ func (p *pulsarProducers) SyncSendMessage(ctx context.Context, topic string, } mID, err := producer.Send(ctx, data) if err != nil { - log.Error("ddl producer send fail", zap.Error(err)) + log.Error("ddl producer send fail", + zap.String("namespace", p.id.Namespace), + zap.String("changefeed", p.id.ID), + zap.String("topic", topic), + zap.Error(err)) mq.IncPublishedDDLFail(topic, p.id.ID, message) return err } if message.Type == model.MessageTypeDDL { log.Info("pulsarProducers SyncSendMessage success", - zap.Any("mID", mID), zap.String("topic", topic), + zap.String("namespace", p.id.Namespace), + zap.String("changefeed", p.id.ID), + zap.Any("mID", mID), + zap.String("topic", topic), zap.String("ddl", string(message.Value))) } log.Debug("pulsarProducers SyncSendMessage success", - zap.Any("mID", mID), zap.String("topic", topic)) + zap.String("namespace", p.id.Namespace), + zap.String("changefeed", p.id.ID), + zap.Any("mID", mID), + zap.String("topic", topic)) mq.IncPublishedDDLSuccess(topic, p.id.ID, message) return nil diff --git a/cdc/sink/ddlsink/mq/kafka_ddl_sink.go b/cdc/sink/ddlsink/mq/kafka_ddl_sink.go index a47cd57c80..0c9af99f95 100644 --- a/cdc/sink/ddlsink/mq/kafka_ddl_sink.go +++ b/cdc/sink/ddlsink/mq/kafka_ddl_sink.go @@ -105,8 +105,6 @@ func NewKafkaDDLSink( } start := time.Now() - log.Info("Try to create a DDL sink producer", - zap.String("changefeed", changefeedID.String())) syncProducer, err := factory.SyncProducer(ctx) if err != nil { return nil, errors.Trace(err) @@ -114,6 +112,7 @@ func NewKafkaDDLSink( ddlProducer := producerCreator(ctx, changefeedID, syncProducer) s := newDDLSink(changefeedID, ddlProducer, adminClient, topicManager, eventRouter, encoderBuilder.Build(), protocol, syncProducer) - log.Info("DDL sink producer client created", zap.Duration("duration", time.Since(start))) + log.Info("kafka ddl sink created", zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), zap.Duration("duration", time.Since(start))) return s, nil } diff --git a/cdc/sink/ddlsink/mq/pulsar_ddl_sink.go b/cdc/sink/ddlsink/mq/pulsar_ddl_sink.go index b4df662b11..0503cc825f 100644 --- a/cdc/sink/ddlsink/mq/pulsar_ddl_sink.go +++ b/cdc/sink/ddlsink/mq/pulsar_ddl_sink.go 
@@ -63,7 +63,8 @@ func NewPulsarDDLSink( } log.Info("Try to create a DDL sink producer", - zap.String("changefeed", changefeedID.String())) + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID)) // NewEventRouter eventRouter, err := dispatcher.NewEventRouter(replicaConfig, protocol, defaultTopic, sinkURI.Scheme) @@ -85,12 +86,18 @@ func NewPulsarDDLSink( start := time.Now() client, err := clientCreator(pConfig, changefeedID, replicaConfig.Sink) if err != nil { - log.Error("DDL sink producer client create fail", zap.Error(err)) + log.Error("DDL sink producer client create fail", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Error(err)) return nil, cerror.WrapError(cerror.ErrPulsarNewClient, err) } p, err := producerCreator(ctx, changefeedID, pConfig, client, replicaConfig.Sink) - log.Info("DDL sink producer client created", zap.Duration("duration", time.Since(start))) + log.Info("DDL sink producer client created", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Duration("duration", time.Since(start))) if err != nil { return nil, cerror.WrapError(cerror.ErrPulsarNewProducer, err) } diff --git a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go index ffff4d1084..11077f702d 100644 --- a/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go +++ b/cdc/sink/ddlsink/mysql/mysql_ddl_sink.go @@ -107,7 +107,7 @@ func NewDDLSink( db: db, cfg: cfg, statistics: metrics.NewStatistics(changefeedID, sink.TxnSink), - needFormat: needFormatDDL(db, cfg), + needFormat: needFormatDDL(changefeedID, db, cfg), } log.Info("MySQL DDL sink is created", @@ -444,21 +444,33 @@ func needSwitchDB(ddl *model.DDLEvent) bool { } // needFormatDDL checks vector type support -func needFormatDDL(db *sql.DB, cfg *pmysql.Config) bool { +func needFormatDDL(changefeedID model.ChangeFeedID, db *sql.DB, cfg *pmysql.Config) bool { if !cfg.HasVectorType { log.Warn("please set `has-vector-type` to be true if a column is vector type when the downstream is not TiDB or TiDB version less than specify version", - zap.Any("hasVectorType", cfg.HasVectorType), zap.Any("supportVectorVersion", defaultSupportVectorVersion)) + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Bool("hasVectorType", cfg.HasVectorType), + zap.String("supportVectorVersion", defaultSupportVectorVersion)) return false } versionInfo, err := export.SelectVersion(db) if err != nil { - log.Warn("fail to get version", zap.Error(err), zap.Bool("isTiDB", cfg.IsTiDB)) + log.Warn("fail to get version", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Bool("isTiDB", cfg.IsTiDB), + zap.Error(err)) return false } serverInfo := version.ParseServerInfo(versionInfo) version := semver.New(defaultSupportVectorVersion) if !cfg.IsTiDB || serverInfo.ServerVersion.LessThan(*version) { - log.Error("downstream unsupport vector type. it will be converted to longtext", zap.String("version", serverInfo.ServerVersion.String()), zap.String("supportVectorVersion", defaultSupportVectorVersion), zap.Bool("isTiDB", cfg.IsTiDB)) + log.Error("downstream unsupport vector type. 
it will be converted to longtext", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.String("version", serverInfo.ServerVersion.String()), + zap.String("supportVectorVersion", defaultSupportVectorVersion), + zap.Bool("isTiDB", cfg.IsTiDB)) return true } return false diff --git a/cdc/sink/dmlsink/cloudstorage/dml_worker.go b/cdc/sink/dmlsink/cloudstorage/dml_worker.go index f98f2f448b..40c7fc4a34 100644 --- a/cdc/sink/dmlsink/cloudstorage/dml_worker.go +++ b/cdc/sink/dmlsink/cloudstorage/dml_worker.go @@ -139,9 +139,10 @@ func newDMLWorker( // run creates a set of background goroutines. func (d *dmlWorker) run(ctx context.Context) error { - log.Debug("dml worker started", zap.Int("workerID", d.id), + log.Debug("dml worker started", zap.String("namespace", d.changeFeedID.Namespace), - zap.String("changefeed", d.changeFeedID.ID)) + zap.String("changefeed", d.changeFeedID.ID), + zap.Int("workerID", d.id)) eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { @@ -183,9 +184,9 @@ func (d *dmlWorker) flushMessages(ctx context.Context) error { err := d.filePathGenerator.CheckOrWriteSchema(ctx, table, task.tableInfo) if err != nil { log.Error("failed to write schema file to external storage", - zap.Int("workerID", d.id), zap.String("namespace", d.changeFeedID.Namespace), zap.String("changefeed", d.changeFeedID.ID), + zap.Int("workerID", d.id), zap.Error(err)) return errors.Trace(err) } @@ -198,9 +199,9 @@ func (d *dmlWorker) flushMessages(ctx context.Context) error { dataFilePath, err := d.filePathGenerator.GenerateDataFilePath(ctx, table, date) if err != nil { log.Error("failed to generate data file path", - zap.Int("workerID", d.id), zap.String("namespace", d.changeFeedID.Namespace), zap.String("changefeed", d.changeFeedID.ID), + zap.Int("workerID", d.id), zap.Error(err)) return errors.Trace(err) } @@ -211,9 +212,9 @@ func (d *dmlWorker) flushMessages(ctx context.Context) error { err = d.writeIndexFile(ctx, indexFilePath, path.Base(dataFilePath)+"\n") if err != nil { log.Error("failed to write index file to external storage", - zap.Int("workerID", d.id), zap.String("namespace", d.changeFeedID.Namespace), zap.String("changefeed", d.changeFeedID.ID), + zap.Int("workerID", d.id), zap.String("path", indexFilePath), zap.Error(err)) } @@ -222,17 +223,18 @@ func (d *dmlWorker) flushMessages(ctx context.Context) error { err = d.writeDataFile(ctx, dataFilePath, task) if err != nil { log.Error("failed to write data file to external storage", - zap.Int("workerID", d.id), zap.String("namespace", d.changeFeedID.Namespace), zap.String("changefeed", d.changeFeedID.ID), + zap.Int("workerID", d.id), zap.String("path", dataFilePath), zap.Error(err)) return errors.Trace(err) } - log.Debug("write file to storage success", zap.Int("workerID", d.id), + log.Debug("write file to storage success", zap.String("namespace", d.changeFeedID.Namespace), zap.String("changefeed", d.changeFeedID.ID), + zap.Int("workerID", d.id), zap.String("schema", table.TableNameWithPhysicTableID.Schema), zap.String("table", table.TableNameWithPhysicTableID.Table), zap.String("path", dataFilePath), @@ -286,11 +288,12 @@ func (d *dmlWorker) writeDataFile(ctx context.Context, path string, task *single // We have to wait the writer to close to complete the upload // If failed to close writer, some DMLs may not be upload successfully if inErr = writer.Close(ctx); inErr != nil { - log.Error("failed to close writer", zap.Error(inErr), + log.Error("failed to close writer", + 
zap.String("namespace", d.changeFeedID.Namespace), + zap.String("changefeed", d.changeFeedID.ID), zap.Int("workerID", d.id), zap.Any("table", task.tableInfo.TableName), - zap.String("namespace", d.changeFeedID.Namespace), - zap.String("changefeed", d.changeFeedID.ID)) + zap.Error(inErr)) return 0, 0, inErr } diff --git a/cdc/sink/dmlsink/cloudstorage/encoding_worker.go b/cdc/sink/dmlsink/cloudstorage/encoding_worker.go index 0ba69ef395..f8024f52af 100644 --- a/cdc/sink/dmlsink/cloudstorage/encoding_worker.go +++ b/cdc/sink/dmlsink/cloudstorage/encoding_worker.go @@ -52,9 +52,10 @@ func newEncodingWorker( } func (w *encodingWorker) run(ctx context.Context) error { - log.Debug("encoding worker started", zap.Int("workerID", w.id), + log.Debug("encoding worker started", zap.String("namespace", w.changeFeedID.Namespace), - zap.String("changefeed", w.changeFeedID.ID)) + zap.String("changefeed", w.changeFeedID.ID), + zap.Int("workerID", w.id)) eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { diff --git a/cdc/sink/dmlsink/mq/dmlproducer/pulsar_dml_producer.go b/cdc/sink/dmlsink/mq/dmlproducer/pulsar_dml_producer.go index 0f2c77237c..d87ba09ef5 100644 --- a/cdc/sink/dmlsink/mq/dmlproducer/pulsar_dml_producer.go +++ b/cdc/sink/dmlsink/mq/dmlproducer/pulsar_dml_producer.go @@ -119,7 +119,9 @@ func NewPulsarDMLProducer( failpointCh: failpointCh, errChan: errCh, } - log.Info("Pulsar DML producer created", zap.Stringer("changefeed", p.id), + log.Info("Pulsar DML producer created", + zap.String("namespace", p.id.Namespace), + zap.String("changefeed", p.id.ID), zap.Duration("duration", time.Since(start))) return p, nil } @@ -181,7 +183,9 @@ func (p *pulsarDMLProducer) AsyncSendMessage( case p.errChan <- e: default: log.Warn("Error channel is full in pulsar DML producer", - zap.Stringer("changefeed", p.id), zap.Error(e)) + zap.String("namespace", p.id.Namespace), + zap.String("changefeed", p.id.ID), + zap.Error(e)) } } else if message.Callback != nil { // success @@ -215,9 +219,10 @@ func (p *pulsarDMLProducer) Close() { // We have to hold the lock to synchronize p.producers.Remove(topic) // callback func will be called topicName, _ := topic.(string) log.Info("Async client closed in pulsar DML producer", - zap.Duration("duration", time.Since(start)), zap.String("namespace", p.id.Namespace), - zap.String("changefeed", p.id.ID), zap.String("topic", topicName)) + zap.String("changefeed", p.id.ID), + zap.String("topic", topicName), + zap.Duration("duration", time.Since(start))) } p.client.Close() } diff --git a/cdc/sink/dmlsink/mq/kafka_dml_sink.go b/cdc/sink/dmlsink/mq/kafka_dml_sink.go index f399db3dfc..81f86a7b66 100644 --- a/cdc/sink/dmlsink/mq/kafka_dml_sink.go +++ b/cdc/sink/dmlsink/mq/kafka_dml_sink.go @@ -127,7 +127,7 @@ func NewKafkaDMLSink( protocol, scheme, replicaConfig.Sink.KafkaConfig.GetOutputRawChangeEvent(), errCh) log.Info("DML sink producer created", zap.String("namespace", changefeedID.Namespace), - zap.String("changefeedID", changefeedID.ID)) + zap.String("changefeed", changefeedID.ID)) return s, nil } diff --git a/cdc/sink/dmlsink/mq/mq_dml_sink.go b/cdc/sink/dmlsink/mq/mq_dml_sink.go index c11ac8682b..f38ed5066b 100644 --- a/cdc/sink/dmlsink/mq/mq_dml_sink.go +++ b/cdc/sink/dmlsink/mq/mq_dml_sink.go @@ -170,7 +170,9 @@ func (s *dmlSink) WriteEvents(txns ...*dmlsink.CallbackableEvent[*model.SingleTa topic := s.alive.eventRouter.GetTopicForRowChange(row) partitionNum, err := s.alive.topicManager.GetPartitionNum(s.ctx, topic) failpoint.Inject("MQSinkGetPartitionError", func() { - 
log.Info("failpoint MQSinkGetPartitionError injected", zap.String("changefeedID", s.id.ID)) + log.Info("failpoint MQSinkGetPartitionError injected", + zap.String("namespace", s.id.Namespace), + zap.String("changefeed", s.id.ID)) err = errors.New("MQSinkGetPartitionError") }) if err != nil { diff --git a/cdc/sink/dmlsink/mq/pulsar_dml_sink.go b/cdc/sink/dmlsink/mq/pulsar_dml_sink.go index f9934b5914..9b88c9354a 100644 --- a/cdc/sink/dmlsink/mq/pulsar_dml_sink.go +++ b/cdc/sink/dmlsink/mq/pulsar_dml_sink.go @@ -73,16 +73,22 @@ func NewPulsarDMLSink( client, err := clientCreator(pConfig, changefeedID, replicaConfig.Sink) if err != nil { - log.Error("DML sink producer client create fail", zap.Error(err)) + log.Error("DML sink producer client create fail", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Error(err)) return nil, cerror.WrapError(cerror.ErrPulsarNewClient, err) } failpointCh := make(chan error, 1) - log.Info("Try to create a DML sink producer", zap.String("changefeed", changefeedID.String())) + log.Info("Try to create a DML sink producer", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID)) start := time.Now() p, err := producerCreator(ctx, changefeedID, client, replicaConfig.Sink, errCh, failpointCh) log.Info("DML sink producer created", - zap.String("changefeed", changefeedID.String()), + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), zap.Duration("duration", time.Since(start))) if err != nil { defer func() { diff --git a/cdc/sink/dmlsink/txn/mysql/mysql.go b/cdc/sink/dmlsink/txn/mysql/mysql.go index 8331bb7aea..28314879de 100644 --- a/cdc/sink/dmlsink/txn/mysql/mysql.go +++ b/cdc/sink/dmlsink/txn/mysql/mysql.go @@ -60,11 +60,11 @@ const ( ) type mysqlBackend struct { - workerID int - changefeed string - db *sql.DB - cfg *pmysql.Config - dmlMaxRetry uint64 + workerID int + changefeedID model.ChangeFeedID + db *sql.DB + cfg *pmysql.Config + dmlMaxRetry uint64 events []*dmlsink.TxnCallbackableEvent rows int @@ -90,8 +90,6 @@ func NewMySQLBackends( dbConnFactory pmysql.IDBConnectionFactory, statistics *metrics.Statistics, ) ([]*mysqlBackend, error) { - changefeed := fmt.Sprintf("%s.%s", changefeedID.Namespace, changefeedID.ID) - cfg := pmysql.NewConfig() err := cfg.Apply(config.GetGlobalServerConfig().TZ, changefeedID, sinkURI, replicaConfig) if err != nil { @@ -168,7 +166,8 @@ func NewMySQLBackends( maxAllowedPacket, err = pmysql.QueryMaxAllowedPacket(ctx, db) if err != nil { log.Warn("failed to query max_allowed_packet, use default value", - zap.String("changefeed", changefeed), + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), zap.Error(err)) maxAllowedPacket = int64(vardef.DefMaxAllowedPacket) } @@ -176,12 +175,12 @@ func NewMySQLBackends( backends := make([]*mysqlBackend, 0, cfg.WorkerCount) for i := 0; i < cfg.WorkerCount; i++ { backends = append(backends, &mysqlBackend{ - workerID: i, - changefeed: changefeed, - db: db, - cfg: cfg, - dmlMaxRetry: defaultDMLMaxRetry, - statistics: statistics, + workerID: i, + changefeedID: changefeedID, + db: db, + cfg: cfg, + dmlMaxRetry: defaultDMLMaxRetry, + statistics: statistics, metricTxnSinkDMLBatchCommit: txn.SinkDMLBatchCommit.WithLabelValues(changefeedID.Namespace, changefeedID.ID), metricTxnSinkDMLBatchCallback: txn.SinkDMLBatchCallback.WithLabelValues(changefeedID.Namespace, changefeedID.ID), @@ -193,7 +192,8 @@ func NewMySQLBackends( } 
log.Info("MySQL backends is created", - zap.String("changefeed", changefeed), + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), zap.Int("workerCount", cfg.WorkerCount), zap.Bool("forceReplicate", cfg.ForceReplicate)) return backends, nil @@ -225,13 +225,20 @@ func (s *mysqlBackend) Flush(ctx context.Context) (err error) { } dmls := s.prepareDMLs() - log.Debug("prepare DMLs", zap.String("changefeed", s.changefeed), zap.Any("rows", s.rows), - zap.Strings("sqls", dmls.sqls), zap.Any("values", dmls.values)) + log.Debug("prepare DMLs", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Int("rows", s.rows), + zap.Strings("sqls", dmls.sqls), + zap.Any("values", dmls.values)) start := time.Now() if err := s.execDMLWithMaxRetries(ctx, dmls); err != nil { if errors.Cause(err) != context.Canceled { - log.Error("execute DMLs failed", zap.String("changefeed", s.changefeed), zap.Error(err)) + log.Error("execute DMLs failed", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Error(err)) } return errors.Trace(err) } @@ -532,7 +539,8 @@ func (s *mysqlBackend) prepareDMLs() *preparedDMLs { // replicated before, and there is no such row in downstream MySQL. translateToInsert = translateToInsert && firstRow.CommitTs > firstRow.ReplicatingTs log.Debug("translate to insert", - zap.String("changefeed", s.changefeed), + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), zap.Bool("translateToInsert", translateToInsert), zap.Uint64("firstRowCommitTs", firstRow.CommitTs), zap.Uint64("firstRowReplicatingTs", firstRow.ReplicatingTs), @@ -635,8 +643,12 @@ func (s *mysqlBackend) multiStmtExecute( } multiStmtSQL := strings.Join(dmls.sqls, ";") - log.Debug("exec row", zap.String("changefeed", s.changefeed), zap.Int("workerID", s.workerID), - zap.String("sql", multiStmtSQL), zap.Any("args", multiStmtArgs)) + log.Debug("exec row", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Int("workerID", s.workerID), + zap.String("sql", multiStmtSQL), + zap.Any("args", multiStmtArgs)) ctx, cancel := context.WithTimeout(ctx, writeTimeout) defer cancel() start := time.Now() @@ -644,10 +656,13 @@ func (s *mysqlBackend) multiStmtExecute( if execError != nil { err := logDMLTxnErr( wrapMysqlTxnError(execError), - start, s.changefeed, multiStmtSQL, dmls.rowCount, dmls.startTs) + start, s.changefeedID, multiStmtSQL, dmls.rowCount, dmls.startTs) if rbErr := tx.Rollback(); rbErr != nil { if errors.Cause(rbErr) != context.Canceled { - log.Warn("failed to rollback txn", zap.String("changefeed", s.changefeed), zap.Error(rbErr)) + log.Warn("failed to rollback txn", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Error(rbErr)) } } return err @@ -662,8 +677,12 @@ func (s *mysqlBackend) sequenceExecute( start := time.Now() for i, query := range dmls.sqls { args := dmls.values[i] - log.Debug("exec row", zap.String("changefeed", s.changefeed), zap.Int("workerID", s.workerID), - zap.String("sql", query), zap.Any("args", args)) + log.Debug("exec row", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Int("workerID", s.workerID), + zap.String("sql", query), + zap.Any("args", args)) ctx, cancelFunc := context.WithTimeout(ctx, writeTimeout) var prepStmt *sql.Stmt @@ -691,10 +710,13 @@ func 
(s *mysqlBackend) sequenceExecute( if execError != nil { err := logDMLTxnErr( wrapMysqlTxnError(execError), - start, s.changefeed, query, dmls.rowCount, dmls.startTs) + start, s.changefeedID, query, dmls.rowCount, dmls.startTs) if rbErr := tx.Rollback(); rbErr != nil { if errors.Cause(rbErr) != context.Canceled { - log.Warn("failed to rollback txn", zap.String("changefeed", s.changefeed), zap.Error(rbErr)) + log.Warn("failed to rollback txn", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Error(rbErr)) } } cancelFunc() @@ -708,7 +730,8 @@ func (s *mysqlBackend) sequenceExecute( func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *preparedDMLs) error { if len(dmls.sqls) != len(dmls.values) { log.Error("unexpected number of sqls and values", - zap.String("changefeed", s.changefeed), + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), zap.Strings("sqls", dmls.sqls), zap.Any("values", dmls.values)) return cerror.ErrUnexpected.FastGenByArgs("unexpected number of sqls and values") @@ -724,7 +747,7 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare failpoint.Inject("MySQLSinkTxnRandomError", func() { log.Warn("inject MySQLSinkTxnRandomError") - err := logDMLTxnErr(errors.Trace(driver.ErrBadConn), start, s.changefeed, "failpoint", 0, nil) + err := logDMLTxnErr(errors.Trace(driver.ErrBadConn), start, s.changefeedID, "failpoint", 0, nil) failpoint.Return(err) }) failpoint.Inject("MySQLSinkHangLongTime", func() { _ = util.Hang(pctx, time.Hour) }) @@ -733,7 +756,7 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare err := logDMLTxnErr(cerror.WrapError(cerror.ErrMySQLDuplicateEntry, &dmysql.MySQLError{ Number: uint16(mysql.ErrDupEntry), Message: "Duplicate entry", - }), start, s.changefeed, "failpoint", 0, nil) + }), start, s.changefeedID, "failpoint", 0, nil) failpoint.Return(err) }) @@ -742,7 +765,7 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare if err != nil { return 0, 0, logDMLTxnErr( wrapMysqlTxnError(err), - start, s.changefeed, "BEGIN", dmls.rowCount, dmls.startTs) + start, s.changefeedID, "BEGIN", dmls.rowCount, dmls.startTs) } // Set session variables first and then execute the transaction. 
@@ -751,13 +774,16 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare if err = pmysql.SetWriteSource(pctx, s.cfg, tx); err != nil { err := logDMLTxnErr( wrapMysqlTxnError(err), - start, s.changefeed, + start, s.changefeedID, fmt.Sprintf("SET SESSION %s = %d", "tidb_cdc_write_source", s.cfg.SourceID), dmls.rowCount, dmls.startTs) if rbErr := tx.Rollback(); rbErr != nil { if errors.Cause(rbErr) != context.Canceled { - log.Warn("failed to rollback txn", zap.String("changefeed", s.changefeed), zap.Error(rbErr)) + log.Warn("failed to rollback txn", + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), + zap.Error(rbErr)) } } return 0, 0, err @@ -784,7 +810,7 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare if err = tx.Commit(); err != nil { return 0, 0, logDMLTxnErr( wrapMysqlTxnError(err), - start, s.changefeed, "COMMIT", dmls.rowCount, dmls.startTs) + start, s.changefeedID, "COMMIT", dmls.rowCount, dmls.startTs) } return dmls.rowCount, dmls.approximateSize, nil }) @@ -792,7 +818,8 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare return errors.Trace(err) } log.Debug("Exec Rows succeeded", - zap.String("changefeed", s.changefeed), + zap.String("namespace", s.changefeedID.Namespace), + zap.String("changefeed", s.changefeedID.ID), zap.Int("workerID", s.workerID), zap.Int("numOfRows", dmls.rowCount)) return nil @@ -815,7 +842,7 @@ func wrapMysqlTxnError(err error) error { } func logDMLTxnErr( - err error, start time.Time, changefeed string, + err error, start time.Time, changefeedID model.ChangeFeedID, query string, count int, startTs []model.Ts, ) error { if len(query) > 1024 { @@ -823,15 +850,22 @@ func logDMLTxnErr( } if isRetryableDMLError(err) { log.Warn("execute DMLs with error, retry later", - zap.Error(err), zap.Duration("duration", time.Since(start)), - zap.String("query", query), zap.Int("count", count), + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Error(err), + zap.Duration("duration", time.Since(start)), + zap.String("query", query), + zap.Int("count", count), zap.Uint64s("startTs", startTs), - zap.String("changefeed", changefeed)) + ) } else { log.Error("execute DMLs with error, can not retry", - zap.Error(err), zap.Duration("duration", time.Since(start)), - zap.String("query", query), zap.Int("count", count), - zap.String("changefeed", changefeed)) + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), + zap.Error(err), + zap.Duration("duration", time.Since(start)), + zap.String("query", query), + zap.Int("count", count)) } return errors.WithMessage(err, fmt.Sprintf("Failed query info: %s; ", query)) } diff --git a/cdc/sink/dmlsink/txn/worker.go b/cdc/sink/dmlsink/txn/worker.go index 2936fb490d..18fb7b1e0f 100644 --- a/cdc/sink/dmlsink/txn/worker.go +++ b/cdc/sink/dmlsink/txn/worker.go @@ -28,9 +28,9 @@ import ( ) type worker struct { - ctx context.Context - changefeed string - workerCount int + ctx context.Context + changefeedID model.ChangeFeedID + workerCount int ID int backend backend @@ -56,9 +56,9 @@ func newWorker(ctx context.Context, changefeedID model.ChangeFeedID, wid := fmt.Sprintf("%d", ID) return &worker{ - ctx: ctx, - changefeed: fmt.Sprintf("%s.%s", changefeedID.Namespace, changefeedID.ID), - workerCount: workerCount, + ctx: ctx, + changefeedID: changefeedID, + workerCount: workerCount, ID: ID, backend: backend, @@ -82,7 +82,8 
@@ func (w *worker) runLoop(txnCh <-chan causality.TxnWithNotifier[*txnEvent]) erro defer func() { if err := w.backend.Close(); err != nil { log.Info("Transaction dmlSink backend close fail", - zap.String("changefeedID", w.changefeed), + zap.String("namespace", w.changefeedID.Namespace), + zap.String("changefeed", w.changefeedID.ID), zap.Int("workerID", w.ID), zap.Error(err)) } @@ -132,7 +133,8 @@ func (w *worker) runLoop(txnCh <-chan causality.TxnWithNotifier[*txnEvent]) erro // needFlush must be true here, so we can do flush. if err := w.doFlush(); err != nil { log.Error("Transaction dmlSink worker exits unexpectly", - zap.String("changefeedID", w.changefeed), + zap.String("namespace", w.changefeedID.Namespace), + zap.String("changefeed", w.changefeedID.ID), zap.Int("workerID", w.ID), zap.Error(err)) return err @@ -171,9 +173,10 @@ func (w *worker) onEvent(txn *txnEvent, postTxnExecuted func()) bool { // Log slow conflict detect tables every minute. if lastLog, ok := w.lastSlowConflictDetectLog[txn.Event.PhysicalTableID]; !ok || now.Sub(lastLog) > time.Minute { log.Warn("Transaction dmlSink finds a slow transaction in conflict detector", - zap.String("changefeedID", w.changefeed), + zap.String("namespace", w.changefeedID.Namespace), + zap.String("changefeed", w.changefeedID.ID), zap.Int("workerID", w.ID), - zap.Int64("TableID", txn.Event.PhysicalTableID), + zap.Int64("tableID", txn.Event.PhysicalTableID), zap.Float64("seconds", conflictDetectTime)) w.lastSlowConflictDetectLog[txn.Event.PhysicalTableID] = now } @@ -193,7 +196,8 @@ func (w *worker) doFlush() error { }() if err := w.backend.Flush(w.ctx); err != nil { log.Warn("Transaction dmlSink backend flush fail", - zap.String("changefeedID", w.changefeed), + zap.String("namespace", w.changefeedID.Namespace), + zap.String("changefeed", w.changefeedID.ID), zap.Int("workerID", w.ID), zap.Error(err)) return err diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index c88053ff19..2fec387908 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -502,7 +502,9 @@ func (c *CDCEtcdClientImpl) saveChangefeedAndUpstreamInfo( } if len(jobResp.Kvs) == 0 { // Note that status may not exist, so we don't check it here. 
- log.Debug("job status not exists", zap.Stringer("changefeed", changeFeedID)) + log.Debug("job status not exists", + zap.String("namespace", changeFeedID.Namespace), + zap.String("changefeed", changeFeedID.ID)) } else { jobModRevision = jobResp.Kvs[0].ModRevision } diff --git a/pkg/sink/cloudstorage/path.go b/pkg/sink/cloudstorage/path.go index f773de6d43..96be272abe 100644 --- a/pkg/sink/cloudstorage/path.go +++ b/pkg/sink/cloudstorage/path.go @@ -157,7 +157,7 @@ func NewFilePathGenerator( pdclock = pdutil.NewMonotonicClock(clock.New()) log.Warn("pd clock is not set in storage sink, use local clock instead", zap.String("namespace", changefeedID.Namespace), - zap.String("changefeedID", changefeedID.ID)) + zap.String("changefeed", changefeedID.ID)) } return &FilePathGenerator{ changefeedID: changefeedID, @@ -188,7 +188,7 @@ func (f *FilePathGenerator) CheckOrWriteSchema( // only check schema for table log.Error("invalid table schema", zap.String("namespace", f.changefeedID.Namespace), - zap.String("changefeedID", f.changefeedID.ID), + zap.String("changefeed", f.changefeedID.ID), zap.Any("versionedTableName", table), zap.Any("tableInfo", tableInfo)) return errors.ErrInternalCheckFailed.GenWithStackByArgs("invalid table schema in FilePathGenerator") @@ -226,7 +226,7 @@ func (f *FilePathGenerator) CheckOrWriteSchema( if parsedChecksum != checksum { log.Error("invalid schema file name", zap.String("namespace", f.changefeedID.Namespace), - zap.String("changefeedID", f.changefeedID.ID), + zap.String("changefeed", f.changefeedID.ID), zap.String("path", path), zap.Any("checksum", checksum)) errMsg := fmt.Sprintf("invalid schema filename in storage sink, "+ "expected checksum: %d, actual checksum: %d", checksum, parsedChecksum) @@ -253,7 +253,7 @@ func (f *FilePathGenerator) CheckOrWriteSchema( if schemaFileCnt != 0 && lastVersion == 0 { log.Warn("no table schema file found in an non-empty meta path", zap.String("namespace", f.changefeedID.Namespace), - zap.String("changefeedID", f.changefeedID.ID), + zap.String("changefeed", f.changefeedID.ID), zap.Any("versionedTableName", table), zap.Uint32("checksum", checksum)) } @@ -485,7 +485,7 @@ func RemoveEmptyDirs( if err == nil && len(files) == 0 { log.Debug("Deleting empty directory", zap.String("namespace", id.Namespace), - zap.String("changeFeedID", id.ID), + zap.String("changefeed", id.ID), zap.String("path", path)) os.Remove(path) cnt++ diff --git a/pkg/sink/codec/bootstraper.go b/pkg/sink/codec/bootstraper.go index 32f52e3944..36033c9a5e 100644 --- a/pkg/sink/codec/bootstraper.go +++ b/pkg/sink/codec/bootstraper.go @@ -59,7 +59,8 @@ func newBootstrapWorker( ) *bootstrapWorker { log.Info("Sending bootstrap event is enabled for simple protocol. 
"+ "Both send-bootstrap-interval-in-sec and send-bootstrap-in-msg-count are > 0.", - zap.Stringer("changefeed", changefeedID), + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), zap.Int64("sendBootstrapIntervalInSec", sendBootstrapInterval), zap.Int32("sendBootstrapInMsgCount", sendBootstrapInMsgCount)) return &bootstrapWorker{ @@ -184,9 +185,10 @@ func (b *bootstrapWorker) gcInactiveTables() { table := value.(*tableStatistic) if table.isInactive(b.maxInactiveDuration) { log.Info("A table is removed from the bootstrap worker", + zap.String("namespace", b.changefeedID.Namespace), + zap.String("changefeed", b.changefeedID.ID), zap.Int64("tableID", table.id), - zap.String("topic", table.topic), - zap.Stringer("changefeed", b.changefeedID)) + zap.String("topic", table.topic)) b.activeTables.Delete(key) } return true diff --git a/pkg/sink/kafka/sarama_factory.go b/pkg/sink/kafka/sarama_factory.go index 548d2fda6c..e0a6f4f111 100644 --- a/pkg/sink/kafka/sarama_factory.go +++ b/pkg/sink/kafka/sarama_factory.go @@ -50,7 +50,10 @@ func (f *saramaFactory) AdminClient(ctx context.Context) (ClusterAdminClient, er config, err := NewSaramaConfig(ctx, f.option) duration := time.Since(start) if duration > 2*time.Second { - log.Warn("new sarama config cost too much time", zap.Duration("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) + log.Warn("new sarama config cost too much time", + zap.String("namespace", f.changefeedID.Namespace), + zap.String("changefeed", f.changefeedID.ID), + zap.Duration("duration", duration)) } if err != nil { return nil, err @@ -60,7 +63,10 @@ func (f *saramaFactory) AdminClient(ctx context.Context) (ClusterAdminClient, er client, err := sarama.NewClient(f.option.BrokerEndpoints, config) duration = time.Since(start) if duration > 2*time.Second { - log.Warn("new sarama client cost too much time", zap.Duration("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) + log.Warn("new sarama client cost too much time", + zap.String("namespace", f.changefeedID.Namespace), + zap.String("changefeed", f.changefeedID.ID), + zap.Duration("duration", duration)) } if err != nil { return nil, errors.Trace(err) @@ -70,7 +76,10 @@ func (f *saramaFactory) AdminClient(ctx context.Context) (ClusterAdminClient, er admin, err := sarama.NewClusterAdminFromClient(client) duration = time.Since(start) if duration > 2*time.Second { - log.Warn("new sarama cluster admin cost too much time", zap.Duration("duration", duration), zap.Stringer("changefeedID", f.changefeedID)) + log.Warn("new sarama cluster admin cost too much time", + zap.String("namespace", f.changefeedID.Namespace), + zap.String("changefeed", f.changefeedID.ID), + zap.Duration("duration", duration)) } if err != nil { return nil, errors.Trace(err) diff --git a/pkg/sink/pulsar/factory.go b/pkg/sink/pulsar/factory.go index abd1a81cb1..66c16e49a5 100644 --- a/pkg/sink/pulsar/factory.go +++ b/pkg/sink/pulsar/factory.go @@ -42,7 +42,8 @@ func NewCreatorFactory(config *config.PulsarConfig, changefeedID model.ChangeFee Logger: NewPulsarLogger(log.L()), } log.Info("pulsar client factory created", - zap.Stringer("changefeedID", changefeedID), + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID), zap.Any("clientOptions", option)) var err error @@ -131,6 +132,8 @@ func setupAuthentication(config *config.PulsarConfig) (bool, pulsar.Authenticati func NewMockCreatorFactory(config *config.PulsarConfig, changefeedID model.ChangeFeedID, 
sinkConfig *config.SinkConfig, ) (pulsar.Client, error) { - log.Info("mock pulsar client factory created", zap.Any("changfeedID", changefeedID)) + log.Info("mock pulsar client factory created", + zap.String("namespace", changefeedID.Namespace), + zap.String("changefeed", changefeedID.ID)) return nil, nil } From 67ab86b64fe20d508a2ece831ac3e134ab61b4ca Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Mon, 13 Apr 2026 15:16:53 +0800 Subject: [PATCH 21/24] fix all log issues --- cdc/capture/capture.go | 8 ++++++-- cdc/model/changefeed.go | 3 ++- cdc/owner/changefeed.go | 11 +++++++++-- cdc/owner/feed_state_manager.go | 19 +++++++++---------- cdc/owner/owner.go | 14 ++++++++++++-- cdc/processor/processor.go | 2 +- cdc/processor/sinkmanager/manager.go | 4 +++- .../sinkmanager/table_sink_advancer.go | 8 ++++++-- .../cloudstorage/cloud_storage_dml_sink.go | 3 +-- 9 files changed, 49 insertions(+), 23 deletions(-) diff --git a/cdc/capture/capture.go b/cdc/capture/capture.go index 7b46150b19..dbb3c3e37b 100644 --- a/cdc/capture/capture.go +++ b/cdc/capture/capture.go @@ -224,7 +224,7 @@ func (c *captureImpl) reset(ctx context.Context) (*vars.GlobalVars, error) { if err != nil { return nil, errors.Trace(err) } - log.Info("reset session successfully", zap.Any("session", sess)) + log.Info("reset session successfully", zap.Int64("leaseID", int64(lease.ID))) c.captureMu.Lock() defer c.captureMu.Unlock() @@ -291,7 +291,11 @@ func (c *captureImpl) reset(ctx context.Context) (*vars.GlobalVars, error) { c.processorManager = c.newProcessorManager( c.info, c.upstreamManager, &c.liveness, c.config.Debug.Scheduler, globalVars) - log.Info("capture initialized", zap.Any("capture", c.info)) + log.Info("capture initialized", + zap.String("captureID", c.info.ID), + zap.String("advertiseAddr", c.info.AdvertiseAddr), + zap.String("version", c.info.Version), + zap.String("gitHash", c.info.GitHash)) return globalVars, nil } diff --git a/cdc/model/changefeed.go b/cdc/model/changefeed.go index dfd315a0ec..b7479e61cd 100644 --- a/cdc/model/changefeed.go +++ b/cdc/model/changefeed.go @@ -219,7 +219,8 @@ func (info *ChangeFeedInfo) NeedBlockGC() bool { func (info *ChangeFeedInfo) isFailedByGC() bool { if info.Error == nil { log.Panic("changefeed info is not consistent", - zap.Any("state", info.State), zap.Any("error", info.Error)) + zap.String("state", string(info.State)), + zap.Bool("errorNil", info.Error == nil)) } return cerror.IsChangefeedGCFastFailErrorCode(errors.RFCErrorCode(info.Error.Code)) } diff --git a/cdc/owner/changefeed.go b/cdc/owner/changefeed.go index cc08194744..4bc272c693 100755 --- a/cdc/owner/changefeed.go +++ b/cdc/owner/changefeed.go @@ -450,7 +450,7 @@ func (c *changefeed) tick(ctx context.Context, zap.Uint64("preResolvedTs", c.resolvedTs.Load()), zap.Uint64("globalBarrierTs", barrier.GlobalBarrierTs), zap.Uint64("minTableBarrierTs", barrier.MinTableBarrierTs), - zap.Any("tableBarrier", barrier.TableBarriers)) + zap.Int("tableBarrierCount", len(barrier.TableBarriers))) if barrier.GlobalBarrierTs < preCheckpointTs { // This condition implies that the DDL resolved-ts has not yet reached checkpointTs, @@ -889,10 +889,17 @@ func (c *changefeed) cleanupRedoManager(ctx context.Context, cfInfo *model.Chang if c.isRemoved { if cfInfo == nil || cfInfo.Config == nil || cfInfo.Config.Consistent == nil { + state := "" + if cfInfo != nil { + state = string(cfInfo.State) + } log.Warn("changefeed is removed, but state is not complete", zap.String("namespace", c.id.Namespace), zap.String("changefeed", c.id.ID), - 
zap.Any("info", cfInfo)) + zap.String("state", state), + zap.Bool("changefeedInfoNil", cfInfo == nil), + zap.Bool("configNil", cfInfo != nil && cfInfo.Config == nil), + zap.Bool("consistentNil", cfInfo != nil && cfInfo.Config != nil && cfInfo.Config.Consistent == nil)) return } if !redoCfg.IsConsistentEnabled(cfInfo.Config.Consistent.Level) { diff --git a/cdc/owner/feed_state_manager.go b/cdc/owner/feed_state_manager.go index 343ee7b39a..45dd599ac8 100644 --- a/cdc/owner/feed_state_manager.go +++ b/cdc/owner/feed_state_manager.go @@ -417,16 +417,6 @@ func (m *feedStateManager) HandleError(errs ...*model.RunningError) { } } - // Changing changefeed state from stopped to failed is allowed - // but changing changefeed state from stopped to error or normal is not allowed. - if m.state.GetChangefeedInfo() != nil && m.state.GetChangefeedInfo().State == model.StateStopped { - log.Warn("changefeed is stopped, ignore errors", - zap.String("namespace", m.state.GetID().Namespace), - zap.String("changefeed", m.state.GetID().ID), - zap.Any("errors", errs)) - return - } - var lastError *model.RunningError // find the last non nil error // BTW, there shouldn't be any nil error in errs @@ -437,6 +427,15 @@ func (m *feedStateManager) HandleError(errs ...*model.RunningError) { break } } + // Changing changefeed state from stopped to failed is allowed + // but changing changefeed state from stopped to error or normal is not allowed. + if m.state.GetChangefeedInfo() != nil && m.state.GetChangefeedInfo().State == model.StateStopped { + log.Warn("changefeed is stopped, ignore errors", + zap.String("namespace", m.state.GetID().Namespace), + zap.String("changefeed", m.state.GetID().ID), + zap.Any("errors", errs)) + return + } // if any error is occurred in this tick, we should set the changefeed state to warning // and stop the changefeed if lastError != nil { diff --git a/cdc/owner/owner.go b/cdc/owner/owner.go index feef082ada..8ec4a7c960 100644 --- a/cdc/owner/owner.go +++ b/cdc/owner/owner.go @@ -604,7 +604,17 @@ func (o *ownerImpl) handleJobs(ctx context.Context) { changefeedID := job.ChangefeedID cfReactor, exist := o.changefeeds[changefeedID] if !exist && (job.Tp != ownerJobTypeQuery && job.Tp != ownerJobTypeDrainCapture) { - log.Warn("changefeed not found when handle a job", zap.Any("job", job)) + adminJobType := "" + if job.AdminJob != nil { + adminJobType = job.AdminJob.Type.String() + } + log.Warn("changefeed not found when handle a job", + zap.String("namespace", job.ChangefeedID.Namespace), + zap.String("changefeed", job.ChangefeedID.ID), + zap.Int("jobType", int(job.Tp)), + zap.Int64("tableID", job.TableID), + zap.String("targetCaptureID", job.TargetCaptureID), + zap.String("adminJobType", adminJobType)) job.done <- cerror.ErrChangeFeedNotExists.FastGenByArgs(job.ChangefeedID) close(job.done) continue @@ -787,7 +797,7 @@ func (o *ownerImpl) isHealthy() bool { log.Warn("isHealthy: changefeed not normal", zap.String("namespace", changefeed.id.Namespace), zap.String("changefeed", changefeed.id.ID), - zap.Any("state", changefeed.latestInfo.State)) + zap.String("state", string(changefeed.latestInfo.State))) continue } diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 77aa6a6756..88b3dffdc1 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -811,7 +811,7 @@ func (p *processor) getTableName(ctx context.Context, tableID model.TableID) str log.Warn("failed to get table name for metric", zap.String("namespace", p.changefeedID.Namespace), 
zap.String("changefeed", p.changefeedID.ID), - zap.Any("tableID", tableID)) + zap.Int64("tableID", tableID)) return strconv.Itoa(int(tableID)) } diff --git a/cdc/processor/sinkmanager/manager.go b/cdc/processor/sinkmanager/manager.go index 2da347c38d..88e8841ee7 100644 --- a/cdc/processor/sinkmanager/manager.go +++ b/cdc/processor/sinkmanager/manager.go @@ -1001,7 +1001,9 @@ func (m *SinkManager) GetTableStats(span tablepb.Span) TableStats { zap.String("changefeed", m.changefeedID.ID), zap.Stringer("span", &span), zap.Uint64("upperbound", sinkUpperBound), - zap.Any("checkpointTs", checkpointTs)) + zap.Uint64("checkpointTs", checkpointTs.Ts), + zap.Int("checkpointMode", int(checkpointTs.Mode)), + zap.Uint64("checkpointBatchID", checkpointTs.BatchID)) } return TableStats{ CheckpointTs: checkpointTs.ResolvedMark(), diff --git a/cdc/processor/sinkmanager/table_sink_advancer.go b/cdc/processor/sinkmanager/table_sink_advancer.go index be70fb9ad6..781276f53c 100644 --- a/cdc/processor/sinkmanager/table_sink_advancer.go +++ b/cdc/processor/sinkmanager/table_sink_advancer.go @@ -302,7 +302,9 @@ func advanceTableSinkWithBatchID( zap.String("namespace", t.tableSink.changefeed.Namespace), zap.String("changefeed", t.tableSink.changefeed.ID), zap.Stringer("span", &t.span), - zap.Any("resolvedTs", resolvedTs), + zap.Uint64("resolvedTs", resolvedTs.Ts), + zap.Int("resolvedMode", int(resolvedTs.Mode)), + zap.Uint64("resolvedBatchID", resolvedTs.BatchID), zap.Uint64("size", size)) if size > 0 { sinkMemQuota.Record(t.span, resolvedTs, size) @@ -321,7 +323,9 @@ func advanceTableSink( zap.String("namespace", t.tableSink.changefeed.Namespace), zap.String("changefeed", t.tableSink.changefeed.ID), zap.Stringer("span", &t.span), - zap.Any("resolvedTs", resolvedTs), + zap.Uint64("resolvedTs", resolvedTs.Ts), + zap.Int("resolvedMode", int(resolvedTs.Mode)), + zap.Uint64("resolvedBatchID", resolvedTs.BatchID), zap.Uint64("size", size)) if size > 0 { sinkMemQuota.Record(t.span, resolvedTs, size) diff --git a/cdc/sink/dmlsink/cloudstorage/cloud_storage_dml_sink.go b/cdc/sink/dmlsink/cloudstorage/cloud_storage_dml_sink.go index 095ba7eb1b..90f44e1c7f 100644 --- a/cdc/sink/dmlsink/cloudstorage/cloud_storage_dml_sink.go +++ b/cdc/sink/dmlsink/cloudstorage/cloud_storage_dml_sink.go @@ -227,8 +227,7 @@ func (s *DMLSink) run(ctx context.Context) error { log.Info("dml worker started", zap.String("namespace", s.changefeedID.Namespace), zap.String("changefeed", s.changefeedID.ID), - zap.Int("workerCount", len(s.workers)), - zap.Any("config", s.workers[0].config)) + zap.Int("workerCount", len(s.workers))) return eg.Wait() } From a0951f2f5bb47a9fd6e617291767a057cd86d8ee Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Mon, 13 Apr 2026 16:29:38 +0800 Subject: [PATCH 22/24] fix all log issues --- cdc/kv/shared_client.go | 6 +-- cdc/owner/ddl_manager.go | 4 +- cdc/processor/sinkmanager/manager.go | 20 ++++--- .../sinkmanager/table_sink_wrapper.go | 4 +- cdc/scheduler/internal/v3/agent/agent.go | 2 +- cdc/scheduler/internal/v3/agent/table.go | 52 +++++++++++-------- cdc/sink/tablesink/table_sink_impl.go | 10 ++-- tests/integration_tests/bank/case.go | 5 +- 8 files changed, 57 insertions(+), 46 deletions(-) diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index 1f1e78d2ee..91e162b971 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -310,7 +310,7 @@ func (s *SharedClient) Unsubscribe(subID SubscriptionID) { s.totalSpans.Unlock() if rt != nil { s.setTableStopped(rt) - log.Info("event feed unsubscribes 
table", + log.Debug("event feed unsubscribes table", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), @@ -385,7 +385,7 @@ func (s *SharedClient) Close() { } func (s *SharedClient) setTableStopped(rt *subscribedTable) { - log.Info("event feed starts to stop table", + log.Debug("event feed starts to stop table", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), @@ -403,7 +403,7 @@ func (s *SharedClient) setTableStopped(rt *subscribedTable) { } func (s *SharedClient) onTableDrained(rt *subscribedTable) { - log.Info("event feed stop table is finished", + log.Debug("event feed stop table is finished", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), diff --git a/cdc/owner/ddl_manager.go b/cdc/owner/ddl_manager.go index 02ec54a167..0f5eb81e96 100644 --- a/cdc/owner/ddl_manager.go +++ b/cdc/owner/ddl_manager.go @@ -296,9 +296,7 @@ func (m *ddlManager) tick( continue } - // Note: do not change the key words in the log, it is used to search the - // FinishTS of the DDL job. Some integration tests and users depend on it. - log.Info("handle a ddl job", + log.Info("build ddl events from ddl job", zap.String("namespace", m.changfeedID.Namespace), zap.String("changefeed", m.changfeedID.ID), zap.Int64("tableID", job.TableID), diff --git a/cdc/processor/sinkmanager/manager.go b/cdc/processor/sinkmanager/manager.go index 88e8841ee7..1b00e78181 100644 --- a/cdc/processor/sinkmanager/manager.go +++ b/cdc/processor/sinkmanager/manager.go @@ -498,10 +498,11 @@ func (m *SinkManager) generateSinkTasks(ctx context.Context) error { value, ok := m.tableSinks.Load(span) if !ok { - log.Info("Table sink not found, probably already removed", + log.Debug("Table sink not found, probably already removed", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) // Maybe the table sink is removed by the processor.(Scheduled the table to other nodes.) // So we do **not** need add it back to the heap. continue @@ -517,10 +518,11 @@ func (m *SinkManager) generateSinkTasks(ctx context.Context) error { // We should skip it and do not push it back. // Because there is no case that stopping/stopped -> replicating. if tableState != tablepb.TableStateReplicating { - log.Info("Table sink is not replicating, skip it", + log.Debug("Table sink is not replicating, skip it", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.String("tableState", tableState.String())) continue } @@ -671,10 +673,11 @@ func (m *SinkManager) generateRedoTasks(ctx context.Context) error { value, ok := m.tableSinks.Load(span) if !ok { - log.Info("Table sink not found, probably already removed", + log.Debug("Table sink not found, probably already removed", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span)) + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey)) // Maybe the table sink is removed by the processor.(Scheduled the table to other nodes.) 
// So we do **not** need add it back to the heap. continue @@ -690,10 +693,11 @@ func (m *SinkManager) generateRedoTasks(ctx context.Context) error { // We should skip it and do not push it back. // Because there is no case that stopping/stopped -> replicating. if tableState != tablepb.TableStateReplicating { - log.Info("Table sink is not replicating, skip it", + log.Debug("Table sink is not replicating, skip it", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), + zap.Int64("tableID", span.TableID), + zap.Stringer("startKey", span.StartKey), zap.String("tableState", tableState.String())) continue } diff --git a/cdc/processor/sinkmanager/table_sink_wrapper.go b/cdc/processor/sinkmanager/table_sink_wrapper.go index 92e614b059..951b9967a5 100644 --- a/cdc/processor/sinkmanager/table_sink_wrapper.go +++ b/cdc/processor/sinkmanager/table_sink_wrapper.go @@ -269,7 +269,7 @@ func (t *tableSinkWrapper) markAsClosing() { break } if t.state.CompareAndSwap(curr, tablepb.TableStateStopping) { - log.Info("Sink is closing", + log.Debug("Sink is closing", zap.String("namespace", t.changefeed.Namespace), zap.String("changefeed", t.changefeed.ID), zap.Int64("tableID", t.span.TableID), @@ -286,7 +286,7 @@ func (t *tableSinkWrapper) markAsClosed() { return } if t.state.CompareAndSwap(curr, tablepb.TableStateStopped) { - log.Info("Sink is closed", + log.Debug("Sink is closed", zap.String("namespace", t.changefeed.Namespace), zap.String("changefeed", t.changefeed.ID), zap.Int64("tableID", t.span.TableID), diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go index 8387fbcd13..d656e1d110 100644 --- a/cdc/scheduler/internal/v3/agent/agent.go +++ b/cdc/scheduler/internal/v3/agent/agent.go @@ -359,7 +359,7 @@ func (a *agent) handleMessageDispatchTableRequest( span := req.RemoveTable.GetSpan() table, ok = a.tableM.getTableSpan(span) if !ok { - log.Warn("schedulerv3: agent ignore remove table request, "+ + log.Debug("schedulerv3: agent ignore remove table request, "+ "since the table not found", zap.String("namespace", a.ChangeFeedID.Namespace), zap.String("changefeed", a.ChangeFeedID.ID), diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go index d35de74b5c..90c74f1a38 100644 --- a/cdc/scheduler/internal/v3/agent/table.go +++ b/cdc/scheduler/internal/v3/agent/table.go @@ -155,6 +155,14 @@ func (t *tableSpan) handleRemoveTableTask() *schedulepb.Message { case tablepb.TableStatePreparing, tablepb.TableStatePrepared, tablepb.TableStateReplicating: + if t.task.status == dispatchTableTaskReceived { + log.Info("schedulerv3: agent accepted remove table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey)) + t.task.status = dispatchTableTaskProcessed + } done := t.executor.RemoveTableSpan(t.task.Span) if !done { status := t.getTableSpanStatus(false) @@ -193,6 +201,21 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. 
status := t.getTableSpanStatus(false) return newAddTableResponseMessage(t.changefeedID, status), errors.Trace(err) } + if t.task.IsPrepare { + log.Info("schedulerv3: agent accepted prepare table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("checkpointTs", t.task.Checkpoint.CheckpointTs)) + } else { + log.Info("schedulerv3: agent accepted replicate table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("checkpointTs", t.task.Checkpoint.CheckpointTs)) + } state, changed = t.getAndUpdateTableSpanState() case tablepb.TableStateReplicating: t.task = nil @@ -219,6 +242,12 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. status := t.getTableSpanStatus(false) return newAddTableResponseMessage(t.changefeedID, status), errors.Trace(err) } + log.Info("schedulerv3: agent accepted replicate table task", + zap.String("namespace", t.changefeedID.Namespace), + zap.String("changefeed", t.changefeedID.ID), + zap.Int64("tableID", t.span.TableID), + zap.Stringer("startKey", t.span.StartKey), + zap.Uint64("checkpointTs", t.task.Checkpoint.CheckpointTs)) t.task.status = dispatchTableTaskProcessed } @@ -268,27 +297,6 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { } if t.task == nil { t.task = task - if task.IsRemove { - log.Info("schedulerv3: agent accepted remove table task", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Int64("tableID", t.span.TableID), - zap.Stringer("startKey", t.span.StartKey)) - } else if task.IsPrepare { - log.Info("schedulerv3: agent accepted prepare table task", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Int64("tableID", t.span.TableID), - zap.Stringer("startKey", t.span.StartKey), - zap.Uint64("checkpointTs", task.Checkpoint.CheckpointTs)) - } else { - log.Info("schedulerv3: agent accepted replicate table task", - zap.String("namespace", t.changefeedID.Namespace), - zap.String("changefeed", t.changefeedID.ID), - zap.Int64("tableID", t.span.TableID), - zap.Stringer("startKey", t.span.StartKey), - zap.Uint64("checkpointTs", task.Checkpoint.CheckpointTs)) - } return } if t.task.Span.Eq(&task.Span) && @@ -298,7 +306,7 @@ func (t *tableSpan) injectDispatchTableTask(task *dispatchTableTask) { t.task.Epoch.Epoch == task.Epoch.Epoch { return } - log.Warn("schedulerv3: table inject dispatch table task ignored,"+ + log.Debug("schedulerv3: table inject dispatch table task ignored,"+ "since there is one not finished yet", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), diff --git a/cdc/sink/tablesink/table_sink_impl.go b/cdc/sink/tablesink/table_sink_impl.go index 7e6b7d7154..9c9a666265 100644 --- a/cdc/sink/tablesink/table_sink_impl.go +++ b/cdc/sink/tablesink/table_sink_impl.go @@ -236,10 +236,11 @@ func (e *EventTableSink[E, P]) freeze() { } if e.state.CompareAndSwap(currentState, state.TableSinkStopping) { stoppingCheckpointTs := e.GetCheckpointTs() - log.Info("Stopping table sink", + log.Debug("Stopping table sink", zap.String("namespace", e.changefeedID.Namespace), zap.String("changefeed", e.changefeedID.ID), - zap.Stringer("span", &e.span), + 
zap.Int64("tableID", e.span.TableID), + zap.Stringer("startKey", e.span.StartKey), zap.Uint64("checkpointTs", stoppingCheckpointTs.Ts)) break } @@ -254,10 +255,11 @@ func (e *EventTableSink[E, P]) markAsClosed() (modified bool) { } if e.state.CompareAndSwap(currentState, state.TableSinkStopped) { stoppedCheckpointTs := e.GetCheckpointTs() - log.Info("Table sink stopped", + log.Debug("Table sink stopped", zap.String("namespace", e.changefeedID.Namespace), zap.String("changefeed", e.changefeedID.ID), - zap.Stringer("span", &e.span), + zap.Int64("tableID", e.span.TableID), + zap.Stringer("startKey", e.span.StartKey), zap.Uint64("checkpointTs", stoppedCheckpointTs.Ts)) return true } diff --git a/tests/integration_tests/bank/case.go b/tests/integration_tests/bank/case.go index b13e391510..ee4d0c7e01 100644 --- a/tests/integration_tests/bank/case.go +++ b/tests/integration_tests/bank/case.go @@ -716,9 +716,8 @@ func tryGetEndTsFromLog(db *sql.DB, tableName string) (result uint64, ok bool) { } log.Info("total files", zap.Any("file", cdcLogFiles)) - logRegex := regexp.MustCompile(`handle a ddl job`) - tableNameRegex := regexp.MustCompile(tableName + "`") timeStampRegex := regexp.MustCompile(`finishedTs=([0-9]+)`) + jobIDLogField := fmt.Sprintf("jobID=%d", jobID) for _, f := range cdcLogFiles { file, err := os.Open(f) if err != nil { @@ -729,7 +728,7 @@ func tryGetEndTsFromLog(db *sql.DB, tableName string) (result uint64, ok bool) { scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() - if !logRegex.MatchString(line) || !tableNameRegex.MatchString(line) { + if !strings.Contains(line, jobIDLogField) { continue } From f714b5868093d052969df6392976d8c0a649e0e4 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Mon, 13 Apr 2026 16:50:57 +0800 Subject: [PATCH 23/24] remove useless debug log --- cdc/entry/schema/snapshot.go | 20 ------------- cdc/entry/schema_storage.go | 8 ----- cdc/kv/shared_client.go | 18 ----------- cdc/kv/shared_region_worker.go | 8 +---- cdc/owner/changefeed.go | 15 ---------- cdc/processor/processor.go | 5 ---- cdc/processor/sinkmanager/manager.go | 28 ----------------- .../sinkmanager/table_sink_wrapper.go | 10 ------- cdc/scheduler/internal/v3/agent/agent.go | 3 -- .../mq/ddlproducer/pulsar_ddl_producer.go | 6 ---- cdc/sink/dmlsink/cloudstorage/dml_worker.go | 18 ----------- .../dmlsink/cloudstorage/encoding_worker.go | 7 ----- cdc/sink/dmlsink/txn/mysql/mysql.go | 30 ------------------- cdc/sink/tablesink/table_sink_impl.go | 16 ---------- 14 files changed, 1 insertion(+), 191 deletions(-) diff --git a/cdc/entry/schema/snapshot.go b/cdc/entry/schema/snapshot.go index c31d9b57d0..b30a926da7 100644 --- a/cdc/entry/schema/snapshot.go +++ b/cdc/entry/schema/snapshot.go @@ -736,7 +736,6 @@ func (s *snapshot) dropSchema(id int64, currentTs uint64) error { s.doDropTable(tbInfo, currentTs) } s.currentTs = currentTs - log.Debug("drop schema success", zap.String("name", dbInfo.Name.O), zap.Int64("id", dbInfo.ID)) return nil } @@ -751,7 +750,6 @@ func (s *snapshot) createSchema(dbInfo *timodel.DBInfo, currentTs uint64) error } s.doCreateSchema(dbInfo, currentTs) s.currentTs = currentTs - log.Debug("create schema success", zap.String("name", dbInfo.Name.O), zap.Int64("id", dbInfo.ID)) return nil } @@ -768,7 +766,6 @@ func (s *snapshot) replaceSchema(dbInfo *timodel.DBInfo, currentTs uint64) error s.schemaNameToID.ReplaceOrInsert(newVersionedEntityName(-1, old.Name.O, tag)) } s.currentTs = currentTs - log.Debug("replace schema success", zap.String("name", 
dbInfo.Name.O), zap.Int64("id", dbInfo.ID)) return nil } @@ -790,10 +787,6 @@ func (s *snapshot) dropTable(id int64, currentTs uint64) error { } s.doDropTable(tbInfo, currentTs) s.currentTs = currentTs - log.Debug("drop table success", - zap.String("schema", tbInfo.TableName.Schema), - zap.String("table", tbInfo.TableName.Table), - zap.Int64("id", tbInfo.ID)) return nil } @@ -837,10 +830,6 @@ func (s *snapshot) truncateTable(id int64, tbInfo *model.TableInfo, currentTs ui s.truncatedTables.ReplaceOrInsert(newVersionedID(id, tag)) } s.currentTs = currentTs - log.Debug("truncate table success", - zap.String("schema", tbInfo.TableName.Schema), - zap.String("table", tbInfo.TableName.Table), - zap.Int64("id", tbInfo.ID)) return } @@ -854,8 +843,6 @@ func (s *snapshot) createTable(tbInfo *model.TableInfo, currentTs uint64) error } s.doCreateTable(tbInfo, currentTs) s.currentTs = currentTs - log.Debug("create table success", zap.Int64("id", tbInfo.ID), - zap.String("name", fmt.Sprintf("%s.%s", tbInfo.TableName.Schema, tbInfo.TableName.Table))) return nil } @@ -869,7 +856,6 @@ func (s *snapshot) replaceTable(tbInfo *model.TableInfo, currentTs uint64) error } s.doCreateTable(tbInfo, currentTs) s.currentTs = currentTs - log.Debug("replace table success", zap.String("name", tbInfo.Name.O), zap.Int64("id", tbInfo.ID)) return nil } @@ -951,12 +937,6 @@ func (s *snapshot) updatePartition(tbInfo *model.TableInfo, isTruncate bool, cur } } s.currentTs = currentTs - - log.Debug("adjust partition success", - zap.String("schema", tbInfo.TableName.Schema), - zap.String("table", tbInfo.TableName.Table), - zap.Any("partitions", newPi.Definitions), - ) return nil } diff --git a/cdc/entry/schema_storage.go b/cdc/entry/schema_storage.go index d7cabbb219..82751f2c6d 100644 --- a/cdc/entry/schema_storage.go +++ b/cdc/entry/schema_storage.go @@ -33,7 +33,6 @@ import ( "github.com/pingcap/tiflow/pkg/retry" "github.com/pingcap/tiflow/pkg/util" "go.uber.org/zap" - "go.uber.org/zap/zapcore" ) // SchemaStorage stores the schema information with multi-version @@ -347,13 +346,6 @@ func (s *schemaStorage) DoGC(ts uint64) (lastSchemaTs uint64) { if startIdx == 0 { return s.snaps[0].CurrentTs() } - if log.GetLevel() == zapcore.DebugLevel { - log.Debug("Do GC in schema storage") - for i := 0; i < startIdx; i++ { - s.snaps[i].PrintStatus(log.Debug) - } - } - // NOTE: Drop must be called to remove stale versions. s.snaps[startIdx-1].Drop() diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index 91e162b971..1cd55749ca 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -310,12 +310,6 @@ func (s *SharedClient) Unsubscribe(subID SubscriptionID) { s.totalSpans.Unlock() if rt != nil { s.setTableStopped(rt) - log.Debug("event feed unsubscribes table", - zap.String("namespace", s.changefeed.Namespace), - zap.String("changefeed", s.changefeed.ID), - zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), - zap.Int64("tableID", rt.span.TableID), - zap.Stringer("startKey", rt.span.StartKey)) return } log.Warn("event feed unsubscribes table, but not found", @@ -385,12 +379,6 @@ func (s *SharedClient) Close() { } func (s *SharedClient) setTableStopped(rt *subscribedTable) { - log.Debug("event feed starts to stop table", - zap.String("namespace", s.changefeed.Namespace), - zap.String("changefeed", s.changefeed.ID), - zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), - zap.Int64("tableID", rt.span.TableID)) - // Set stopped to true so we can stop handling region events from the table. 
// Then send a special singleRegionInfo to regionRouter to deregister the table // from all TiKV instances. @@ -403,12 +391,6 @@ func (s *SharedClient) setTableStopped(rt *subscribedTable) { } func (s *SharedClient) onTableDrained(rt *subscribedTable) { - log.Debug("event feed stop table is finished", - zap.String("namespace", s.changefeed.Namespace), - zap.String("changefeed", s.changefeed.ID), - zap.Uint64("subscriptionID", uint64(rt.subscriptionID)), - zap.Int64("tableID", rt.span.TableID)) - s.totalSpans.Lock() defer s.totalSpans.Unlock() delete(s.totalSpans.v, rt.subscriptionID) diff --git a/cdc/kv/shared_region_worker.go b/cdc/kv/shared_region_worker.go index 1efa0db8b8..0b0c1ab0d4 100644 --- a/cdc/kv/shared_region_worker.go +++ b/cdc/kv/shared_region_worker.go @@ -423,13 +423,7 @@ func (w *sharedRegionWorker) forwardResolvedTsToPullerFrontier(ctx context.Conte spansAndChan.spans = append(spansAndChan.spans, span) } - for subscriptionID, spansAndChan := range resolvedSpans { - log.Debug("region worker get a ResolvedTs", - zap.String("namespace", w.changefeed.Namespace), - zap.String("changefeed", w.changefeed.ID), - zap.Uint64("subscriptionID", uint64(subscriptionID)), - zap.Uint64("ResolvedTs", batch.ts), - zap.Int("spanCount", len(spansAndChan.spans))) + for _, spansAndChan := range resolvedSpans { if len(spansAndChan.spans) > 0 { revent := model.RegionFeedEvent{Resolved: &model.ResolvedSpans{ Spans: spansAndChan.spans, ResolvedTs: batch.ts, diff --git a/cdc/owner/changefeed.go b/cdc/owner/changefeed.go index 4bc272c693..30dbba7a53 100755 --- a/cdc/owner/changefeed.go +++ b/cdc/owner/changefeed.go @@ -443,15 +443,6 @@ func (c *changefeed) tick(ctx context.Context, return 0, 0, errors.Trace(err) } - log.Debug("owner handles barrier", - zap.String("namespace", c.id.Namespace), - zap.String("changefeed", c.id.ID), - zap.Uint64("preCheckpointTs", preCheckpointTs), - zap.Uint64("preResolvedTs", c.resolvedTs.Load()), - zap.Uint64("globalBarrierTs", barrier.GlobalBarrierTs), - zap.Uint64("minTableBarrierTs", barrier.MinTableBarrierTs), - zap.Int("tableBarrierCount", len(barrier.TableBarriers))) - if barrier.GlobalBarrierTs < preCheckpointTs { // This condition implies that the DDL resolved-ts has not yet reached checkpointTs, // which implies that it would be premature to schedule tables or to update status. @@ -504,12 +495,6 @@ func (c *changefeed) tick(ctx context.Context, return 0, 0, nil } - log.Debug("owner prepares to update status", - zap.Uint64("prevResolvedTs", c.resolvedTs.Load()), - zap.Uint64("newResolvedTs", watermark.ResolvedTs), - zap.Uint64("newCheckpointTs", watermark.CheckpointTs), - zap.String("namespace", c.id.Namespace), - zap.String("changefeed", c.id.ID)) // resolvedTs should never regress. 
if watermark.ResolvedTs > c.resolvedTs.Load() { c.resolvedTs.Store(watermark.ResolvedTs) diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index 88b3dffdc1..5d9de3de40 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -845,11 +845,6 @@ func (p *processor) doGCSchemaStorage() { return } p.lastSchemaTs = lastSchemaTs - - log.Debug("finished gc in schema storage", - zap.Uint64("gcTs", lastSchemaTs), - zap.String("namespace", p.changefeedID.Namespace), - zap.String("changefeed", p.changefeedID.ID)) lastSchemaPhysicalTs := oracle.ExtractPhysical(lastSchemaTs) p.metricSchemaStorageGcTsGauge.Set(float64(lastSchemaPhysicalTs)) } diff --git a/cdc/processor/sinkmanager/manager.go b/cdc/processor/sinkmanager/manager.go index 1b00e78181..68f437ff39 100644 --- a/cdc/processor/sinkmanager/manager.go +++ b/cdc/processor/sinkmanager/manager.go @@ -461,12 +461,6 @@ func (m *SinkManager) backgroundGC(errors chan<- error) { case errors <- err: case <-m.managerCtx.Done(): } - } else { - log.Debug("table stale data has been cleaned", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Stringer("span", &span), - zap.Any("upperBound", cleanPos)) } sink.lastCleanTime = time.Now() return true @@ -498,11 +492,6 @@ func (m *SinkManager) generateSinkTasks(ctx context.Context) error { value, ok := m.tableSinks.Load(span) if !ok { - log.Debug("Table sink not found, probably already removed", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey)) // Maybe the table sink is removed by the processor.(Scheduled the table to other nodes.) // So we do **not** need add it back to the heap. continue @@ -518,12 +507,6 @@ func (m *SinkManager) generateSinkTasks(ctx context.Context) error { // We should skip it and do not push it back. // Because there is no case that stopping/stopped -> replicating. if tableState != tablepb.TableStateReplicating { - log.Debug("Table sink is not replicating, skip it", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey), - zap.String("tableState", tableState.String())) continue } tables = append(tables, tableSink) @@ -673,11 +656,6 @@ func (m *SinkManager) generateRedoTasks(ctx context.Context) error { value, ok := m.tableSinks.Load(span) if !ok { - log.Debug("Table sink not found, probably already removed", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey)) // Maybe the table sink is removed by the processor.(Scheduled the table to other nodes.) // So we do **not** need add it back to the heap. continue @@ -693,12 +671,6 @@ func (m *SinkManager) generateRedoTasks(ctx context.Context) error { // We should skip it and do not push it back. // Because there is no case that stopping/stopped -> replicating. 
if tableState != tablepb.TableStateReplicating { - log.Debug("Table sink is not replicating, skip it", - zap.String("namespace", m.changefeedID.Namespace), - zap.String("changefeed", m.changefeedID.ID), - zap.Int64("tableID", span.TableID), - zap.Stringer("startKey", span.StartKey), - zap.String("tableState", tableState.String())) continue } tables = append(tables, tableSink) diff --git a/cdc/processor/sinkmanager/table_sink_wrapper.go b/cdc/processor/sinkmanager/table_sink_wrapper.go index 951b9967a5..12f7c3c496 100644 --- a/cdc/processor/sinkmanager/table_sink_wrapper.go +++ b/cdc/processor/sinkmanager/table_sink_wrapper.go @@ -269,11 +269,6 @@ func (t *tableSinkWrapper) markAsClosing() { break } if t.state.CompareAndSwap(curr, tablepb.TableStateStopping) { - log.Debug("Sink is closing", - zap.String("namespace", t.changefeed.Namespace), - zap.String("changefeed", t.changefeed.ID), - zap.Int64("tableID", t.span.TableID), - zap.Stringer("startKey", t.span.StartKey)) break } } @@ -286,11 +281,6 @@ func (t *tableSinkWrapper) markAsClosed() { return } if t.state.CompareAndSwap(curr, tablepb.TableStateStopped) { - log.Debug("Sink is closed", - zap.String("namespace", t.changefeed.Namespace), - zap.String("changefeed", t.changefeed.ID), - zap.Int64("tableID", t.span.TableID), - zap.Stringer("startKey", t.span.StartKey)) return } } diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go index d656e1d110..419652bebd 100644 --- a/cdc/scheduler/internal/v3/agent/agent.go +++ b/cdc/scheduler/internal/v3/agent/agent.go @@ -387,9 +387,6 @@ func (a *agent) handleMessageDispatchTableRequest( // Close implement agent interface func (a *agent) Close() error { - log.Debug("schedulerv3: agent closed", - zap.String("namespace", a.ChangeFeedID.Namespace), - zap.String("changefeed", a.ChangeFeedID.ID)) return a.trans.Close() } diff --git a/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go b/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go index b520924df9..e16405879e 100644 --- a/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go +++ b/cdc/sink/ddlsink/mq/ddlproducer/pulsar_ddl_producer.go @@ -95,12 +95,6 @@ func (p *pulsarProducers) SyncSendMessage(ctx context.Context, topic string, zap.String("ddl", string(message.Value))) } - log.Debug("pulsarProducers SyncSendMessage success", - zap.String("namespace", p.id.Namespace), - zap.String("changefeed", p.id.ID), - zap.Any("mID", mID), - zap.String("topic", topic)) - mq.IncPublishedDDLSuccess(topic, p.id.ID, message) return nil } diff --git a/cdc/sink/dmlsink/cloudstorage/dml_worker.go b/cdc/sink/dmlsink/cloudstorage/dml_worker.go index 40c7fc4a34..1f01f0be56 100644 --- a/cdc/sink/dmlsink/cloudstorage/dml_worker.go +++ b/cdc/sink/dmlsink/cloudstorage/dml_worker.go @@ -139,11 +139,6 @@ func newDMLWorker( // run creates a set of background goroutines. 
func (d *dmlWorker) run(ctx context.Context) error { - log.Debug("dml worker started", - zap.String("namespace", d.changeFeedID.Namespace), - zap.String("changefeed", d.changeFeedID.ID), - zap.Int("workerID", d.id)) - eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { return d.flushMessages(ctx) @@ -231,14 +226,6 @@ func (d *dmlWorker) flushMessages(ctx context.Context) error { return errors.Trace(err) } - log.Debug("write file to storage success", - zap.String("namespace", d.changeFeedID.Namespace), - zap.String("changefeed", d.changeFeedID.ID), - zap.Int("workerID", d.id), - zap.String("schema", table.TableNameWithPhysicTableID.Schema), - zap.String("table", table.TableNameWithPhysicTableID.Table), - zap.String("path", dataFilePath), - ) } flushTimeSlice += time.Since(start) } @@ -341,8 +328,6 @@ func (d *dmlWorker) genAndDispatchTask(ctx context.Context, case <-ctx.Done(): return errors.Trace(ctx.Err()) case d.toBeFlushedCh <- batchedTask: - log.Debug("flush task is emitted successfully when flush interval exceeds", - zap.Int("tablesLength", len(batchedTask.batch))) batchedTask = newBatchedTask() default: } @@ -360,9 +345,6 @@ func (d *dmlWorker) genAndDispatchTask(ctx context.Context, case <-ctx.Done(): return errors.Trace(ctx.Err()) case d.toBeFlushedCh <- task: - log.Debug("flush task is emitted successfully when file size exceeds", - zap.Any("table", table), - zap.Int("eventsLenth", len(task.batch[table].msgs))) } } } diff --git a/cdc/sink/dmlsink/cloudstorage/encoding_worker.go b/cdc/sink/dmlsink/cloudstorage/encoding_worker.go index f8024f52af..55dcdd9d25 100644 --- a/cdc/sink/dmlsink/cloudstorage/encoding_worker.go +++ b/cdc/sink/dmlsink/cloudstorage/encoding_worker.go @@ -17,10 +17,8 @@ import ( "sync/atomic" "github.com/pingcap/errors" - "github.com/pingcap/log" "github.com/pingcap/tiflow/cdc/model" "github.com/pingcap/tiflow/pkg/sink/codec" - "go.uber.org/zap" "golang.org/x/sync/errgroup" ) @@ -52,11 +50,6 @@ func newEncodingWorker( } func (w *encodingWorker) run(ctx context.Context) error { - log.Debug("encoding worker started", - zap.String("namespace", w.changeFeedID.Namespace), - zap.String("changefeed", w.changeFeedID.ID), - zap.Int("workerID", w.id)) - eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { for { diff --git a/cdc/sink/dmlsink/txn/mysql/mysql.go b/cdc/sink/dmlsink/txn/mysql/mysql.go index 28314879de..979c18cfbd 100644 --- a/cdc/sink/dmlsink/txn/mysql/mysql.go +++ b/cdc/sink/dmlsink/txn/mysql/mysql.go @@ -225,12 +225,6 @@ func (s *mysqlBackend) Flush(ctx context.Context) (err error) { } dmls := s.prepareDMLs() - log.Debug("prepare DMLs", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Int("rows", s.rows), - zap.Strings("sqls", dmls.sqls), - zap.Any("values", dmls.values)) start := time.Now() if err := s.execDMLWithMaxRetries(ctx, dmls); err != nil { @@ -538,13 +532,6 @@ func (s *mysqlBackend) prepareDMLs() *preparedDMLs { // the table it belongs to been replicating by TiCDC, which means it must not be // replicated before, and there is no such row in downstream MySQL. 
translateToInsert = translateToInsert && firstRow.CommitTs > firstRow.ReplicatingTs - log.Debug("translate to insert", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Bool("translateToInsert", translateToInsert), - zap.Uint64("firstRowCommitTs", firstRow.CommitTs), - zap.Uint64("firstRowReplicatingTs", firstRow.ReplicatingTs), - zap.Bool("safeMode", s.cfg.SafeMode)) if event.Callback != nil { callbacks = append(callbacks, event.Callback) @@ -643,12 +630,6 @@ func (s *mysqlBackend) multiStmtExecute( } multiStmtSQL := strings.Join(dmls.sqls, ";") - log.Debug("exec row", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Int("workerID", s.workerID), - zap.String("sql", multiStmtSQL), - zap.Any("args", multiStmtArgs)) ctx, cancel := context.WithTimeout(ctx, writeTimeout) defer cancel() start := time.Now() @@ -677,12 +658,6 @@ func (s *mysqlBackend) sequenceExecute( start := time.Now() for i, query := range dmls.sqls { args := dmls.values[i] - log.Debug("exec row", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Int("workerID", s.workerID), - zap.String("sql", query), - zap.Any("args", args)) ctx, cancelFunc := context.WithTimeout(ctx, writeTimeout) var prepStmt *sql.Stmt @@ -817,11 +792,6 @@ func (s *mysqlBackend) execDMLWithMaxRetries(pctx context.Context, dmls *prepare if err != nil { return errors.Trace(err) } - log.Debug("Exec Rows succeeded", - zap.String("namespace", s.changefeedID.Namespace), - zap.String("changefeed", s.changefeedID.ID), - zap.Int("workerID", s.workerID), - zap.Int("numOfRows", dmls.rowCount)) return nil }, retry.WithBackoffBaseDelay(pmysql.BackoffBaseDelay.Milliseconds()), retry.WithBackoffMaxDelay(pmysql.BackoffMaxDelay.Milliseconds()), diff --git a/cdc/sink/tablesink/table_sink_impl.go b/cdc/sink/tablesink/table_sink_impl.go index 9c9a666265..ac2c094bf4 100644 --- a/cdc/sink/tablesink/table_sink_impl.go +++ b/cdc/sink/tablesink/table_sink_impl.go @@ -17,7 +17,6 @@ import ( "sort" "sync" - "github.com/pingcap/log" "github.com/pingcap/tiflow/cdc/model" "github.com/pingcap/tiflow/cdc/processor/tablepb" "github.com/pingcap/tiflow/cdc/sink/dmlsink" @@ -25,7 +24,6 @@ import ( "github.com/pingcap/tiflow/pkg/pdutil" "github.com/prometheus/client_golang/prometheus" "github.com/tikv/client-go/v2/oracle" - "go.uber.org/zap" ) // Assert TableSink implementation @@ -235,13 +233,6 @@ func (e *EventTableSink[E, P]) freeze() { break } if e.state.CompareAndSwap(currentState, state.TableSinkStopping) { - stoppingCheckpointTs := e.GetCheckpointTs() - log.Debug("Stopping table sink", - zap.String("namespace", e.changefeedID.Namespace), - zap.String("changefeed", e.changefeedID.ID), - zap.Int64("tableID", e.span.TableID), - zap.Stringer("startKey", e.span.StartKey), - zap.Uint64("checkpointTs", stoppingCheckpointTs.Ts)) break } } @@ -254,13 +245,6 @@ func (e *EventTableSink[E, P]) markAsClosed() (modified bool) { return } if e.state.CompareAndSwap(currentState, state.TableSinkStopped) { - stoppedCheckpointTs := e.GetCheckpointTs() - log.Debug("Table sink stopped", - zap.String("namespace", e.changefeedID.Namespace), - zap.String("changefeed", e.changefeedID.ID), - zap.Int64("tableID", e.span.TableID), - zap.Stringer("startKey", e.span.StartKey), - zap.Uint64("checkpointTs", stoppedCheckpointTs.Ts)) return true } } From 1895a44ad56eb3536235f3e9ffddd6737a248ba7 Mon Sep 17 00:00:00 2001 From: 3AceShowHand Date: Mon, 13 
Apr 2026 17:58:33 +0800
Subject: [PATCH 24/24] fix scheduler logs
---
 cdc/scheduler/internal/v3/agent/agent.go | 13 +--
 cdc/scheduler/internal/v3/agent/table.go | 16 +++-
 .../internal/v3/keyspan/splitter_write.go | 7 +-
 .../internal/v3/member/capture_manager.go | 12 +--
 .../v3/replication/replication_manager.go | 19 ++--
 .../v3/replication/replication_set.go | 90 ++++++++-----------
 .../v3/scheduler/scheduler_balance.go | 2 +-
 .../internal/v3/scheduler/scheduler_basic.go | 2 +-
 .../v3/scheduler/scheduler_drain_capture.go | 20 ++---
 .../v3/scheduler/scheduler_manager.go | 2 +-
 .../v3/scheduler/scheduler_move_table.go | 14 +--
 .../v3/scheduler/scheduler_rebalance.go | 6 +-
 12 files changed, 99 insertions(+), 104 deletions(-)
diff --git a/cdc/scheduler/internal/v3/agent/agent.go b/cdc/scheduler/internal/v3/agent/agent.go
index 419652bebd..6838f66b94 100644
--- a/cdc/scheduler/internal/v3/agent/agent.go
+++ b/cdc/scheduler/internal/v3/agent/agent.go
@@ -115,12 +115,14 @@ func newAgent(
 log.Info("schedulerv3: no owner found. We will wait for an owner to contact us.",
 zap.String("namespace", changeFeedID.Namespace),
 zap.String("changefeed", changeFeedID.ID),
- zap.String("ownerCaptureID", ownerCaptureID),
- zap.Error(err))
+ zap.String("ownerCaptureID", ownerCaptureID))
 return result, nil
 }
 var ownerCaptureInfo *model.CaptureInfo
 _, captures, err := client.GetCaptures(ctx)
+ if err != nil {
+ return nil, errors.Trace(err)
+ }
 for _, captureInfo := range captures {
 if captureInfo.ID == ownerCaptureID {
 ownerCaptureInfo = captureInfo
@@ -131,7 +133,7 @@ func newAgent(
 log.Info("schedulerv3: no owner found. We will wait for an owner to contact us.",
 zap.String("namespace", changeFeedID.Namespace),
 zap.String("changefeed", changeFeedID.ID),
- zap.Error(err))
+ zap.String("ownerCaptureID", ownerCaptureID))
 return result, nil
 }
 result.compat.UpdateCaptureInfo(map[model.CaptureID]*model.CaptureInfo{
@@ -256,7 +258,7 @@ func (a *agent) handleMessage(msg []*schedulepb.Message) (result []*schedulepb.M
 log.Warn("schedulerv3: unknown message received",
 zap.String("namespace", a.ChangeFeedID.Namespace),
 zap.String("changefeed", a.ChangeFeedID.ID),
- zap.String("captureID", ownerCaptureID),
+ zap.String("ownerCaptureID", ownerCaptureID),
 zap.Stringer("type", message.GetMsgType()),
 zap.Int64("ownerRevision", ownerRevision),
 zap.String("processorEpoch", processorEpoch.Epoch),
@@ -328,8 +330,7 @@ func (a *agent) handleMessageDispatchTableRequest(
 epoch schedulepb.ProcessorEpoch,
 ) {
 if a.Epoch != epoch {
- log.Info("schedulerv3: agent receive dispatch table request "+
- "epoch does not match, ignore it",
+ log.Info("schedulerv3: agent received dispatch table request with unexpected epoch, ignore it",
 zap.String("namespace", a.ChangeFeedID.Namespace),
 zap.String("changefeed", a.ChangeFeedID.ID),
 zap.String("epoch", epoch.Epoch),
diff --git a/cdc/scheduler/internal/v3/agent/table.go b/cdc/scheduler/internal/v3/agent/table.go
index 90c74f1a38..b3cf8b11ad 100644
--- a/cdc/scheduler/internal/v3/agent/table.go
+++ b/cdc/scheduler/internal/v3/agent/table.go
@@ -69,7 +69,7 @@ func (t *tableSpan) getTableSpanStatus(collectStat bool) tablepb.TableStatus {
 func newAddTableResponseMessage(changefeedID model.ChangeFeedID, status tablepb.TableStatus) *schedulepb.Message {
 if status.Checkpoint.ResolvedTs < status.Checkpoint.CheckpointTs {
- log.Warn("schedulerv3: resolved ts should not less than checkpoint ts",
+ log.Warn("schedulerv3: resolved ts should not be less than checkpoint ts",
 zap.String("namespace",
changefeedID.Namespace), zap.String("changefeed", changefeedID.ID), zap.Int64("tableID", status.Span.TableID), @@ -98,7 +98,7 @@ func newRemoveTableResponseMessage(changefeedID model.ChangeFeedID, status table // Advance resolved ts to checkpoint ts if table is removed. status.Checkpoint.ResolvedTs = status.Checkpoint.CheckpointTs } else { - log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", + log.Warn("schedulerv3: resolved ts should not be less than checkpoint ts", zap.String("namespace", changefeedID.Namespace), zap.String("changefeed", changefeedID.ID), zap.Int64("tableID", status.Span.TableID), @@ -189,7 +189,7 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. switch state { case tablepb.TableStateAbsent: done, err := t.executor.AddTableSpan(ctx, t.task.Span, t.task.Checkpoint, t.task.IsPrepare) - if err != nil || !done { + if err != nil { log.Warn("schedulerv3: agent add table failed", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), @@ -201,6 +201,10 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. status := t.getTableSpanStatus(false) return newAddTableResponseMessage(t.changefeedID, status), errors.Trace(err) } + if !done { + status := t.getTableSpanStatus(false) + return newAddTableResponseMessage(t.changefeedID, status), nil + } if t.task.IsPrepare { log.Info("schedulerv3: agent accepted prepare table task", zap.String("namespace", t.changefeedID.Namespace), @@ -230,7 +234,7 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. if t.task.status == dispatchTableTaskReceived { done, err := t.executor.AddTableSpan(ctx, t.task.Span, t.task.Checkpoint, false) - if err != nil || !done { + if err != nil { log.Warn("schedulerv3: agent add table failed", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), @@ -242,6 +246,10 @@ func (t *tableSpan) handleAddTableTask(ctx context.Context) (result *schedulepb. 
status := t.getTableSpanStatus(false) return newAddTableResponseMessage(t.changefeedID, status), errors.Trace(err) } + if !done { + status := t.getTableSpanStatus(false) + return newAddTableResponseMessage(t.changefeedID, status), nil + } log.Info("schedulerv3: agent accepted replicate table task", zap.String("namespace", t.changefeedID.Namespace), zap.String("changefeed", t.changefeedID.ID), diff --git a/cdc/scheduler/internal/v3/keyspan/splitter_write.go b/cdc/scheduler/internal/v3/keyspan/splitter_write.go index 43364b7d4c..be76fa74b6 100644 --- a/cdc/scheduler/internal/v3/keyspan/splitter_write.go +++ b/cdc/scheduler/internal/v3/keyspan/splitter_write.go @@ -74,14 +74,15 @@ func (m *writeSplitter) split( spansNum := getSpansNumber(len(regions), captureNum) if spansNum <= 1 { - log.Warn("schedulerv3: only one capture and the regions number less than"+ - " the maxSpanRegionLimit, skip split span", + log.Info("schedulerv3: skip split span by written keys", zap.String("namespace", m.changefeedID.Namespace), zap.String("changefeed", m.changefeedID.ID), zap.Int64("tableID", span.TableID), zap.Stringer("startKey", span.StartKey), zap.Stringer("endKey", span.EndKey), - zap.Error(err)) + zap.Int("totalCaptures", captureNum), + zap.Int("regionCount", len(regions)), + zap.Int("spanRegionLimit", spanRegionLimit)) return []tablepb.Span{span} } diff --git a/cdc/scheduler/internal/v3/member/capture_manager.go b/cdc/scheduler/internal/v3/member/capture_manager.go index 651b28be71..b8d706d825 100644 --- a/cdc/scheduler/internal/v3/member/capture_manager.go +++ b/cdc/scheduler/internal/v3/member/capture_manager.go @@ -89,7 +89,7 @@ func (c *CaptureStatus) handleHeartbeatResponse( zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), zap.String("captureAddr", c.Addr), - zap.String("capture", c.ID), + zap.String("captureID", c.ID), zap.String("epoch", c.Epoch.Epoch), zap.String("respEpoch", epoch.Epoch), zap.Int64("ownerRev", c.OwnerRev.Revision)) @@ -102,7 +102,7 @@ func (c *CaptureStatus) handleHeartbeatResponse( log.Info("schedulerv3: capture initialized", zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), - zap.String("capture", c.ID), + zap.String("captureID", c.ID), zap.String("captureAddr", c.Addr)) } if resp.Liveness == model.LivenessCaptureStopping { @@ -110,7 +110,7 @@ func (c *CaptureStatus) handleHeartbeatResponse( log.Info("schedulerv3: capture stopping", zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), - zap.String("capture", c.ID), + zap.String("captureID", c.ID), zap.String("captureAddr", c.Addr)) } c.Tables = resp.Tables @@ -224,7 +224,7 @@ func (c *CaptureManager) HandleMessage( log.Warn("schedulerv3: heartbeat response from unknown capture", zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), - zap.String("capture", msg.From)) + zap.String("captureID", msg.From)) continue } captureStatus.handleHeartbeatResponse( @@ -247,7 +247,7 @@ func (c *CaptureManager) HandleAliveCaptureUpdate( zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", c.changefeedID.ID), zap.String("captureAddr", info.AdvertiseAddr), - zap.String("capture", id)) + zap.String("captureID", id)) msgs = append(msgs, &schedulepb.Message{ To: id, MsgType: schedulepb.MsgHeartbeat, @@ -263,7 +263,7 @@ func (c *CaptureManager) HandleAliveCaptureUpdate( zap.String("namespace", c.changefeedID.Namespace), zap.String("changefeed", 
c.changefeedID.ID), zap.String("captureAddr", capture.Addr), - zap.String("capture", id)) + zap.String("captureID", id)) delete(c.Captures, id) // Only update changes after initialization. diff --git a/cdc/scheduler/internal/v3/replication/replication_manager.go b/cdc/scheduler/internal/v3/replication/replication_manager.go index cba9d50a09..eaed35ec8d 100644 --- a/cdc/scheduler/internal/v3/replication/replication_manager.go +++ b/cdc/scheduler/internal/v3/replication/replication_manager.go @@ -287,7 +287,7 @@ func (r *Manager) handleMessageHeartbeatResponse( log.Info("schedulerv3: ignore table status no table found", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.String("captureID", from), + zap.String("reportingCapture", from), zap.Int64("tableID", status.Span.TableID), zap.Stringer("startKey", status.Span.StartKey), zap.Stringer("state", status.State), @@ -320,7 +320,7 @@ func (r *Manager) handleMessageDispatchTableResponse( log.Warn("schedulerv3: ignore unknown dispatch table response", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.String("captureID", from), + zap.String("reportingCapture", from), zap.Bool("hasAddTable", msg.GetAddTable() != nil), zap.Bool("hasRemoveTable", msg.GetRemoveTable() != nil)) return nil, nil @@ -331,7 +331,7 @@ func (r *Manager) handleMessageDispatchTableResponse( log.Info("schedulerv3: ignore table status no table found", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), - zap.String("captureID", from), + zap.String("reportingCapture", from), zap.Int64("tableID", status.Span.TableID), zap.Stringer("startKey", status.Span.StartKey), zap.Stringer("state", status.State), @@ -389,7 +389,7 @@ func (r *Manager) HandleTasks( // Check if accepting one more task exceeds maxTaskConcurrency. 
if r.runningTasks.Len() == r.maxTaskConcurrency { - log.Debug("schedulerv3: too many running task", + log.Debug("schedulerv3: too many running tasks", zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID)) // Does not use break, in case there is burst balance task @@ -414,7 +414,7 @@ func (r *Manager) HandleTasks( zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), zap.String("task", task.Name()), - zap.String("captureID", task.AddTable.CaptureID), + zap.String("targetCapture", task.AddTable.CaptureID), zap.Int64("tableID", task.AddTable.Span.TableID), zap.Stringer("startKey", task.AddTable.Span.StartKey), zap.Uint64("checkpointTs", task.AddTable.CheckpointTs)) @@ -423,7 +423,7 @@ func (r *Manager) HandleTasks( zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), zap.String("task", task.Name()), - zap.String("captureID", task.RemoveTable.CaptureID), + zap.String("targetCapture", task.RemoveTable.CaptureID), zap.Int64("tableID", task.RemoveTable.Span.TableID), zap.Stringer("startKey", task.RemoveTable.Span.StartKey)) } else if task.MoveTable != nil { @@ -431,7 +431,7 @@ func (r *Manager) HandleTasks( zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), zap.String("task", task.Name()), - zap.String("captureID", task.MoveTable.DestCapture), + zap.String("targetCapture", task.MoveTable.DestCapture), zap.Int64("tableID", task.MoveTable.Span.TableID), zap.Stringer("startKey", task.MoveTable.Span.StartKey)) } @@ -443,7 +443,7 @@ func (r *Manager) HandleTasks( zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), zap.String("task", task.Name()), - zap.String("captureID", task.RemoveTable.CaptureID), + zap.String("targetCapture", task.RemoveTable.CaptureID), zap.Int64("tableID", task.RemoveTable.Span.TableID), zap.Stringer("startKey", task.RemoveTable.Span.StartKey)) } else if task.MoveTable != nil { @@ -451,7 +451,7 @@ func (r *Manager) HandleTasks( zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), zap.String("task", task.Name()), - zap.String("captureID", task.MoveTable.DestCapture), + zap.String("targetCapture", task.MoveTable.DestCapture), zap.Int64("tableID", task.MoveTable.Span.TableID), zap.Stringer("startKey", task.MoveTable.Span.StartKey)) } @@ -799,6 +799,7 @@ func (r *Manager) logSlowTableInfo(currentPDTime time.Time) { zap.String("namespace", r.changefeedID.Namespace), zap.String("changefeed", r.changefeedID.ID), zap.Int64("tableID", table.Span.TableID), + zap.Stringer("startKey", table.Span.StartKey), zap.String("tableStatus", table.State.String()), zap.Uint64("checkpointTs", table.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", table.Checkpoint.ResolvedTs), diff --git a/cdc/scheduler/internal/v3/replication/replication_set.go b/cdc/scheduler/internal/v3/replication/replication_set.go index d3a490929f..7c0efaa106 100644 --- a/cdc/scheduler/internal/v3/replication/replication_set.go +++ b/cdc/scheduler/internal/v3/replication/replication_set.go @@ -228,10 +228,10 @@ func NewReplicationSet( // We need to wait its state becomes Stopped or Absent before // proceeding further scheduling. 
secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: found a stopping capture during initializing", + log.Warn("schedulerv3: found a stopping capture during initialization", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("stoppingCapture", captureID), zap.Uint64("inputCheckpointTs", table.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", table.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -253,7 +253,7 @@ func NewReplicationSet( log.Warn("schedulerv3: unknown table state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", table.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", table.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -333,10 +333,9 @@ func (r *ReplicationSet) clearCapture(captureID model.CaptureID, role Role) erro func (r *ReplicationSet) promoteSecondary(captureID model.CaptureID) error { if r.Primary == captureID { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: capture is already promoted as the primary", + log.Warn("schedulerv3: capture is already the primary", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -373,7 +372,7 @@ func (r *ReplicationSet) inconsistentError( fields = append(fields, []zap.Field{ zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -395,7 +394,7 @@ func (r *ReplicationSet) multiplePrimaryError( fields = append(fields, []zap.Field{ zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -500,7 +499,7 @@ func (r *ReplicationSet) pollOnAbsent( log.Info("schedulerv3: replication state transition, add table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("secondary", captureID), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), @@ -519,7 +518,7 @@ func (r *ReplicationSet) pollOnAbsent( log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -563,7 +562,7 @@ func (r *ReplicationSet) pollOnPrepare( log.Info("schedulerv3: replication state transition, table prepared", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", 
r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("secondary", captureID), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey), zap.Uint64("checkpointTs", input.Checkpoint.CheckpointTs), @@ -584,12 +583,11 @@ func (r *ReplicationSet) pollOnPrepare( log.Info("schedulerv3: primary is stopped during Prepare", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("stoppedPrimary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.Stringer("inputState", input.State), - zap.String("primary", r.Primary), zap.String("secondary", secondary), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) @@ -598,7 +596,6 @@ func (r *ReplicationSet) pollOnPrepare( } if r.isInRole(captureID, RoleSecondary) { oldState := r.State - secondary, _ := r.getRole(RoleSecondary) err := r.clearCapture(captureID, RoleSecondary) if err != nil { return nil, false, errors.Trace(err) @@ -612,16 +609,15 @@ func (r *ReplicationSet) pollOnPrepare( // Transit to Absent. r.State = ReplicationSetStateAbsent } - log.Info("schedulerv3: capture is stopped during Prepare", + log.Info("schedulerv3: secondary is stopped during Prepare", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("secondary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), - zap.String("secondary", secondary), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) return nil, true, nil @@ -631,7 +627,7 @@ func (r *ReplicationSet) pollOnPrepare( log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -672,13 +668,12 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: there are unknown captures during commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("secondary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.Stringer("inputState", input.State), zap.String("primary", r.Primary), - zap.String("secondary", captureID), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) return nil, false, nil @@ -719,12 +714,11 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: primary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("stoppedPrimary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), - zap.String("primary", 
r.Primary), zap.String("secondary", secondary), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) @@ -739,16 +733,12 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: replication state promote secondary", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", secondary), + zap.String("oldPrimary", original), + zap.String("newPrimary", r.Primary), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), - zap.Stringer("state", r.State), - zap.Stringer("inputState", input.State), - zap.String("primary", r.Primary), - zap.String("secondary", secondary), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey), - zap.String("original", original), ) return &schedulepb.Message{ To: r.Primary, @@ -780,35 +770,33 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: secondary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("secondary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), zap.String("primary", r.Primary), - zap.String("secondary", captureID), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) } else { log.Info("schedulerv3: secondary is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("secondary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), zap.String("primary", r.Primary), - zap.String("secondary", captureID), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) } return nil, true, nil } else if r.isInRole(captureID, RoleUndetermined) { secondary, _ := r.getRole(RoleSecondary) - log.Info("schedulerv3: capture is stopped during Commit", + log.Info("schedulerv3: undetermined capture is stopped during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("undeterminedCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -854,7 +842,7 @@ func (r *ReplicationSet) pollOnCommit( log.Info("schedulerv3: replication state transition, commit finished", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("primary", captureID), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey), zap.Uint64("checkpointTs", input.Checkpoint.CheckpointTs), @@ -871,10 +859,10 @@ func (r *ReplicationSet) pollOnCommit( return nil, false, nil } else if r.isInRole(captureID, RoleUndetermined) { secondary, _ := r.getRole(RoleSecondary) - log.Info("schedulerv3: capture is stopping during Commit", + log.Info("schedulerv3: undetermined capture is stopping during Commit", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + 
zap.String("undeterminedCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -892,7 +880,7 @@ func (r *ReplicationSet) pollOnCommit( log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -933,12 +921,11 @@ func (r *ReplicationSet) pollOnReplicating( log.Info("schedulerv3: primary is stopped during Replicating", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("stoppedPrimary", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), zap.Stringer("new", r.State), - zap.String("primary", r.Primary), zap.String("secondary", secondary), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) @@ -949,7 +936,7 @@ func (r *ReplicationSet) pollOnReplicating( log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -994,7 +981,7 @@ func (r *ReplicationSet) pollOnRemoving( log.Warn("schedulerv3: replication state remove capture with error", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("removingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -1013,7 +1000,7 @@ func (r *ReplicationSet) pollOnRemoving( log.Warn("schedulerv3: ignore input, unexpected replication set state", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("reportingCapture", captureID), zap.Uint64("inputCheckpointTs", input.Checkpoint.CheckpointTs), zap.Uint64("inputResolvedTs", input.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -1040,7 +1027,7 @@ func (r *ReplicationSet) handleAddTable( log.Warn("schedulerv3: add table is ignored", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("targetCapture", captureID), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -1074,7 +1061,7 @@ func (r *ReplicationSet) handleMoveTable( log.Warn("schedulerv3: move table is ignored, table already removed", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", dest), + zap.String("targetCapture", dest), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ 
-1090,7 +1077,7 @@ func (r *ReplicationSet) handleMoveTable( log.Warn("schedulerv3: move table is ignored, table is not replicating", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", dest), + zap.String("targetCapture", dest), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -1105,7 +1092,7 @@ func (r *ReplicationSet) handleMoveTable( log.Warn("schedulerv3: move table is ignored, target capture is already primary", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", dest), + zap.String("targetCapture", dest), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("state", r.State), @@ -1120,15 +1107,13 @@ func (r *ReplicationSet) handleMoveTable( if err != nil { return nil, errors.Trace(err) } - secondary, _ := r.getRole(RoleSecondary) log.Info("schedulerv3: replication state transition, move table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", dest), + zap.String("targetCapture", dest), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.String("primary", r.Primary), - zap.String("secondary", secondary), zap.Int64("tableID", r.Span.TableID), zap.Stringer("startKey", r.Span.StartKey)) status := tablepb.TableStatus{ @@ -1175,7 +1160,6 @@ func (r *ReplicationSet) handleRemoveTable() ([]*schedulepb.Message, error) { log.Info("schedulerv3: replication state transition, remove table", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", r.Primary), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.String("primary", r.Primary), @@ -1221,7 +1205,7 @@ func (r *ReplicationSet) handleCaptureShutdown( log.Info("schedulerv3: replication state transition, capture shutdown", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), - zap.String("captureID", captureID), + zap.String("shutdownCapture", captureID), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), zap.Uint64("resolvedTs", r.Checkpoint.ResolvedTs), zap.Stringer("old", oldState), @@ -1238,7 +1222,7 @@ func (r *ReplicationSet) updateCheckpointAndStats( ) { if checkpoint.ResolvedTs < checkpoint.CheckpointTs { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", + log.Warn("schedulerv3: resolved ts should not be less than checkpoint ts", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.Uint64("inputCheckpointTs", checkpoint.CheckpointTs), @@ -1262,7 +1246,7 @@ func (r *ReplicationSet) updateCheckpointAndStats( } if r.Checkpoint.ResolvedTs < r.Checkpoint.CheckpointTs { secondary, _ := r.getRole(RoleSecondary) - log.Warn("schedulerv3: resolved ts should not less than checkpoint ts", + log.Warn("schedulerv3: resolved ts should not be less than checkpoint ts", zap.String("namespace", r.Changefeed.Namespace), zap.String("changefeed", r.Changefeed.ID), zap.Uint64("checkpointTs", r.Checkpoint.CheckpointTs), diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_balance.go b/cdc/scheduler/internal/v3/scheduler/scheduler_balance.go index a87f4d7f8f..cc059bc01e 100644 
--- a/cdc/scheduler/internal/v3/scheduler/scheduler_balance.go
+++ b/cdc/scheduler/internal/v3/scheduler/scheduler_balance.go
@@ -75,7 +75,7 @@ func (b *balanceScheduler) Schedule(
 
 	for _, capture := range captures {
 		if capture.State == member.CaptureStateStopping {
-			log.Debug("schedulerv3: capture is stopping, premature to balance table",
+			log.Debug("schedulerv3: capture is stopping, skip balancing tables",
 				zap.String("namespace", b.changefeedID.Namespace),
 				zap.String("changefeed", b.changefeedID.ID))
 			return nil
diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go
index 8c9d969142..ade8da85ba 100644
--- a/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go
+++ b/cdc/scheduler/internal/v3/scheduler/scheduler_basic.go
@@ -99,7 +99,7 @@ func (b *basicScheduler) Schedule(
 			// the changefeed cannot make progress
 			// for a cluster with n captures, n should be at least 2
 			// only n - 1 captures can be in the `stopping` at the same time.
-			log.Warn("schedulerv3: cannot found capture when add new table",
+			log.Warn("schedulerv3: cannot find capture when adding a new table",
 				zap.String("namespace", b.changefeedID.Namespace),
 				zap.String("changefeed", b.changefeedID.ID),
 				zap.Int("captureCount", len(captures)),
diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go b/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go
index 9be0d8a5c6..da8eb4c6c0 100644
--- a/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go
+++ b/cdc/scheduler/internal/v3/scheduler/scheduler_drain_capture.go
@@ -101,7 +101,7 @@ func (d *drainCaptureScheduler) Schedule(
 		log.Info("schedulerv3: drain a stopping capture",
 			zap.String("namespace", d.changefeedID.Namespace),
 			zap.String("changefeed", d.changefeedID.ID),
-			zap.String("captureID", d.target))
+			zap.String("targetCapture", d.target))
 	}
 
 	// Currently, the workload is the number of tables in a capture.
@@ -115,11 +115,11 @@ func (d *drainCaptureScheduler) Schedule(
 	// this may happen when inject the target, there is at least 2 alive captures
 	// but when schedule the task, only owner alive.
 	if len(captureWorkload) == 0 {
-		log.Warn("schedulerv3: drain capture scheduler ignore drain target capture, "+
-			"since cannot found destination captures",
+		log.Warn("schedulerv3: drain capture scheduler ignored the drain target capture, "+
+			"since it cannot find destination captures",
 			zap.String("namespace", d.changefeedID.Namespace),
 			zap.String("changefeed", d.changefeedID.ID),
-			zap.String("target", d.target),
+			zap.String("targetCapture", d.target),
 			zap.Int("captureCount", len(captures)))
 		d.target = captureIDNotDraining
 		return nil
@@ -132,12 +132,12 @@ func (d *drainCaptureScheduler) Schedule(
 	replications.Ascend(func(span tablepb.Span, rep *replication.ReplicationSet) bool {
 		if rep.State != replication.ReplicationSetStateReplicating {
 			// only drain the target capture if all tables is replicating,
-			log.Debug("schedulerv3: drain capture scheduler skip this tick,"+
-				"not all table is replicating",
+			log.Debug("schedulerv3: drain capture scheduler skipped this tick, "+
+				"not all tables are replicating",
 				zap.String("namespace", d.changefeedID.Namespace),
 				zap.String("changefeed", d.changefeedID.ID),
-				zap.String("target", d.target),
-				zap.String("captureID", rep.Primary),
+				zap.String("targetCapture", d.target),
+				zap.String("primary", rep.Primary),
 				zap.Stringer("state", rep.State),
 				zap.Int64("tableID", span.TableID),
 				zap.Stringer("startKey", span.StartKey))
@@ -169,7 +169,7 @@ func (d *drainCaptureScheduler) Schedule(
 		log.Info("schedulerv3: drain capture scheduler finished, since no table",
 			zap.String("namespace", d.changefeedID.Namespace),
 			zap.String("changefeed", d.changefeedID.ID),
-			zap.String("target", d.target))
+			zap.String("targetCapture", d.target))
 		d.target = captureIDNotDraining
 		return nil
 	}
@@ -187,7 +187,7 @@ func (d *drainCaptureScheduler) Schedule(
 	}
 
 	if minWorkload == math.MaxInt64 {
-		log.Panic("schedulerv3: drain capture meet unexpected min workload",
+		log.Panic("schedulerv3: drain capture encountered unexpected min workload",
 			zap.String("namespace", d.changefeedID.Namespace),
 			zap.String("changefeed", d.changefeedID.ID),
 			zap.Int("captureCount", len(captureWorkload)),
diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go b/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go
index 84845958d5..0c4a8213c8 100644
--- a/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go
+++ b/cdc/scheduler/internal/v3/scheduler/scheduler_manager.go
@@ -104,7 +104,7 @@ func (sm *Manager) MoveTable(span tablepb.Span, target model.CaptureID) {
 			zap.String("changefeed", sm.changefeedID.ID))
 	}
 	if !moveTableScheduler.addTask(span, target) {
-		log.Info("schedulerv3: manual move Table task ignored, "+
+		log.Info("schedulerv3: manual move table task ignored, "+
 			"since the last triggered task not finished",
 			zap.String("namespace", sm.changefeedID.Namespace),
 			zap.String("changefeed", sm.changefeedID.ID),
diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go b/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go
index 5407b92aea..8dafa09606 100644
--- a/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go
+++ b/cdc/scheduler/internal/v3/scheduler/scheduler_move_table.go
@@ -98,10 +98,10 @@ func (m *moveTableScheduler) Schedule(
 		// table may not in the all current tables
 		// if it was removed after manual move table triggered.
 		if !allSpans.Contain(span) {
-			log.Warn("schedulerv3: move table ignored, since the table cannot found",
+			log.Warn("schedulerv3: move table ignored, the table cannot be found",
 				zap.String("namespace", m.changefeedID.Namespace),
 				zap.String("changefeed", m.changefeedID.ID),
-				zap.String("captureID", task.MoveTable.DestCapture),
+				zap.String("targetCapture", task.MoveTable.DestCapture),
 				zap.Int64("tableID", span.TableID),
 				zap.Stringer("startKey", span.StartKey))
 			toBeDeleted = append(toBeDeleted, span)
@@ -111,10 +111,10 @@ func (m *moveTableScheduler) Schedule(
 		// the target capture may offline after manual move table triggered.
 		status, ok := captures[task.MoveTable.DestCapture]
 		if !ok {
-			log.Info("schedulerv3: move table ignored, since the target capture cannot found",
+			log.Info("schedulerv3: move table ignored, the target capture cannot be found",
 				zap.String("namespace", m.changefeedID.Namespace),
 				zap.String("changefeed", m.changefeedID.ID),
-				zap.String("captureID", task.MoveTable.DestCapture),
+				zap.String("targetCapture", task.MoveTable.DestCapture),
 				zap.Int64("tableID", span.TableID),
 				zap.Stringer("startKey", span.StartKey))
 			toBeDeleted = append(toBeDeleted, span)
@@ -124,7 +124,7 @@ func (m *moveTableScheduler) Schedule(
 			log.Warn("schedulerv3: move table ignored, target capture is not initialized",
 				zap.String("namespace", m.changefeedID.Namespace),
 				zap.String("changefeed", m.changefeedID.ID),
-				zap.String("captureID", task.MoveTable.DestCapture),
+				zap.String("targetCapture", task.MoveTable.DestCapture),
 				zap.Stringer("state", status.State),
 				zap.Int64("tableID", span.TableID),
 				zap.Stringer("startKey", span.StartKey))
@@ -137,7 +137,7 @@ func (m *moveTableScheduler) Schedule(
 			log.Warn("schedulerv3: move table ignored, table not found in the replication set",
 				zap.String("namespace", m.changefeedID.Namespace),
 				zap.String("changefeed", m.changefeedID.ID),
-				zap.String("captureID", task.MoveTable.DestCapture),
+				zap.String("targetCapture", task.MoveTable.DestCapture),
 				zap.Int64("tableID", span.TableID),
 				zap.Stringer("startKey", span.StartKey))
 			toBeDeleted = append(toBeDeleted, span)
@@ -148,7 +148,7 @@ func (m *moveTableScheduler) Schedule(
 			log.Info("schedulerv3: move table ignored, since the table is not replicating now",
 				zap.String("namespace", m.changefeedID.Namespace),
 				zap.String("changefeed", m.changefeedID.ID),
-				zap.String("captureID", task.MoveTable.DestCapture),
+				zap.String("targetCapture", task.MoveTable.DestCapture),
 				zap.Stringer("state", rep.State),
 				zap.String("primary", rep.Primary),
 				zap.Int64("tableID", span.TableID),
diff --git a/cdc/scheduler/internal/v3/scheduler/scheduler_rebalance.go b/cdc/scheduler/internal/v3/scheduler/scheduler_rebalance.go
index ecc02d37c2..f67aa17bfb 100644
--- a/cdc/scheduler/internal/v3/scheduler/scheduler_rebalance.go
+++ b/cdc/scheduler/internal/v3/scheduler/scheduler_rebalance.go
@@ -83,7 +83,7 @@ func (r *rebalanceScheduler) Schedule(
 			return nil
 		}
 		if rep.State != replication.ReplicationSetStateReplicating {
-			log.Debug("schedulerv3: not all table replicating, premature to rebalance tables",
+			log.Debug("schedulerv3: not all tables are replicating, premature to rebalance tables",
 				zap.String("namespace", r.changefeedID.Namespace),
 				zap.String("changefeed", r.changefeedID.ID))
 			return nil
@@ -186,8 +186,8 @@ func newBalanceMoveTables(
 	}
 
 	if minWorkload == math.MaxInt64 {
-		log.Panic("schedulerv3: rebalance meet unexpected min workload "+
-			"when try to the the target capture",
+		log.Panic("schedulerv3: rebalance encountered unexpected min workload "+
+			"when choosing the target capture",
 			zap.String("namespace", changefeedID.Namespace),
 			zap.String("changefeed", changefeedID.ID))
 	}