diff --git a/blocksync/msgs.go b/blocksync/msgs.go index 676feea95f7..1a7fd2a663f 100644 --- a/blocksync/msgs.go +++ b/blocksync/msgs.go @@ -8,7 +8,6 @@ import ( bcproto "github.com/tendermint/tendermint/proto/tendermint/blocksync" "github.com/tendermint/tendermint/types" - "github.com/tendermint/tendermint/upgrade" ) const ( @@ -88,23 +87,13 @@ func ValidateMsg(pb proto.Message) error { return errors.New("negative Height") } case *bcproto.BlockResponse: - // V1 block format - block, err := types.BlockFromProto(msg.Block) - if err != nil { + if _, err := types.BlockFromProto(msg.Block); err != nil { return err } - if upgrade.IsUpgraded(block.Height) { - return fmt.Errorf("unexpected BlockResponse at upgraded height %d", block.Height) - } case *bcproto.BlockResponseV2: - // V2 block format (sequencer mode) - block, err := types.BlockV2FromProto(msg.Block) - if err != nil { + if _, err := types.BlockV2FromProto(msg.Block); err != nil { return err } - if !upgrade.IsUpgraded(block.GetHeight()) { - return fmt.Errorf("unexpected BlockResponseV2 before upgraded height %d", block.GetHeight()) - } case *bcproto.NoBlockResponse: if msg.Height < 0 { return errors.New("negative Height") diff --git a/blocksync/msgs_test.go b/blocksync/msgs_test.go index 8d904b97121..9c8f5160843 100644 --- a/blocksync/msgs_test.go +++ b/blocksync/msgs_test.go @@ -1,7 +1,6 @@ package blocksync_test import ( - "bytes" "encoding/hex" "math" "math/big" @@ -13,10 +12,8 @@ import ( "github.com/stretchr/testify/require" "github.com/tendermint/tendermint/blocksync" - "github.com/tendermint/tendermint/crypto" bcproto "github.com/tendermint/tendermint/proto/tendermint/blocksync" "github.com/tendermint/tendermint/types" - "github.com/tendermint/tendermint/upgrade" ) func TestBcBlockRequestMessageValidateBasic(t *testing.T) { @@ -84,65 +81,24 @@ func TestBcStatusResponseMessageValidateBasic(t *testing.T) { } } -func TestValidateMsgBlockResponseRejectsV1AtUpgradeHeight(t *testing.T) { - oldHeight := upgrade.UpgradeBlockHeight - upgrade.SetUpgradeBlockHeight(10) - defer upgrade.SetUpgradeBlockHeight(oldHeight) - - lastBlockID := types.BlockID{ - Hash: bytes.Repeat([]byte{1}, 32), - PartSetHeader: types.PartSetHeader{ - Total: 1, - Hash: bytes.Repeat([]byte{2}, 32), - }, - } - lastCommit := types.NewCommit(9, 0, lastBlockID, []types.CommitSig{{ - BlockIDFlag: types.BlockIDFlagCommit, - ValidatorAddress: bytes.Repeat([]byte{3}, crypto.AddressSize), - Signature: []byte{1}, - }}) - - block := types.MakeBlock(10, []types.Tx{types.Tx("Hello World")}, nil, nil, nil, lastCommit, nil) - block.ProposerAddress = bytes.Repeat([]byte{4}, crypto.AddressSize) - bpb, err := block.ToProto() - require.NoError(t, err) - - err = blocksync.ValidateMsg(&bcproto.BlockResponse{Block: bpb}) - require.Error(t, err) - assert.Contains(t, err.Error(), "unexpected BlockResponse") -} - -func TestValidateMsgBlockResponseV2RejectsPreUpgradeHeight(t *testing.T) { - oldHeight := upgrade.UpgradeBlockHeight - upgrade.SetUpgradeBlockHeight(10) - defer upgrade.SetUpgradeBlockHeight(oldHeight) - - blockV2 := &types.BlockV2{ - ParentHash: common.HexToHash("0x1"), - Hash: common.HexToHash("0x2"), - BaseFee: big.NewInt(1), - Number: 9, - } - - err := blocksync.ValidateMsg(&bcproto.BlockResponseV2{Block: types.BlockV2ToProto(blockV2)}) - require.Error(t, err) - assert.Contains(t, err.Error(), "unexpected BlockResponseV2") -} - -func TestValidateMsgBlockResponseV2AllowsUpgradeHeight(t *testing.T) { - oldHeight := upgrade.UpgradeBlockHeight - upgrade.SetUpgradeBlockHeight(10) - defer upgrade.SetUpgradeBlockHeight(oldHeight) - +// ValidateMsg validates block responses structurally only. The old "reject V1/V2 by upgrade +// height" checks were removed: the timestamp-driven boundary is unknown at message-validation +// time, so the blocksync reactor guards the V1/V2 type at consume time instead. A structurally +// valid block passes regardless of height; a malformed one is rejected. +func TestValidateMsgBlockResponseV2Structural(t *testing.T) { blockV2 := &types.BlockV2{ ParentHash: common.HexToHash("0x1"), Hash: common.HexToHash("0x2"), + Signature: make([]byte, 65), BaseFee: big.NewInt(1), Number: 10, } + require.NoError(t, blocksync.ValidateMsg(&bcproto.BlockResponseV2{Block: types.BlockV2ToProto(blockV2)})) - err := blocksync.ValidateMsg(&bcproto.BlockResponseV2{Block: types.BlockV2ToProto(blockV2)}) - require.NoError(t, err) + // Malformed V2 (bad hash length) -> rejected by structural validation. + bad := types.BlockV2ToProto(blockV2) + bad.Hash = []byte{0x1, 0x2} + require.Error(t, blocksync.ValidateMsg(&bcproto.BlockResponseV2{Block: bad})) } //nolint:lll // ignore line length in tests @@ -164,7 +120,7 @@ func TestBlockchainMessageVectors(t *testing.T) { BlockRequest: &bcproto.BlockRequest{Height: math.MaxInt64}}}, "0a0a08ffffffffffffffff7f"}, {"BlockResponseMessage", &bcproto.Message{Sum: &bcproto.Message_BlockResponse{ - BlockResponse: &bcproto.BlockResponse{Block: bpb}}}, "1a700a6e0a5b0a02080b1803220b088092b8c398feffffff012a0212003a20c4da88e876062aa1543400d50d0eaa0dac88096057949cfb7bca7f3a48c04bf96a20e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855120d0a0b48656c6c6f20576f726c641a00"}, + BlockResponse: &bcproto.BlockResponse{Block: bpb}}}, "1a700a6e0a5b0a02080b1803220b088092b8c398feffffff012a021a003a20fb5fddfb1922fa75ac66398d8adeb962739e0259e7b3e811a81449683a3b78cd6a20e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855120d0a0b48656c6c6f20576f726c641a00"}, {"NoBlockResponseMessage", &bcproto.Message{Sum: &bcproto.Message_NoBlockResponse{ NoBlockResponse: &bcproto.NoBlockResponse{Height: 1}}}, "12020801"}, {"NoBlockResponseMessage", &bcproto.Message{Sum: &bcproto.Message_NoBlockResponse{ diff --git a/blocksync/pool.go b/blocksync/pool.go index 4835a0727d5..fe411226e70 100644 --- a/blocksync/pool.go +++ b/blocksync/pool.go @@ -335,39 +335,47 @@ func (pool *BlockPool) GetPeerIDs() []p2p.ID { // It also enforces basic sanity checks against malicious or buggy peers // (see spec-003-blocksync-malicious-peer-fix): // -// - A peer reporting base > height is structurally impossible and is banned +// - A peer reporting base > height is structurally impossible and is rejected // (CometBFT issue #5801 hardening). // - An existing peer reporting a strictly lower height or base than what it -// reported previously is removed from the pool and banned to prevent +// reported previously is removed from the pool to prevent // poisoning of maxPeerHeight (CVE-2025-24371). +// - Non-persistent peers that report invalid ranges are also banned. +// Persistent peers are removed but not added to bannedPeers so the switch +// can reconnect them later without accepting the invalid range. // - A peer that has been banned recently is silently ignored if it tries to // re-introduce itself before its ban expires. // // In all non-malicious cases, maxPeerHeight is recomputed via // updateMaxPeerHeight, which filters peers whose base is above pool.height. -func (pool *BlockPool) SetPeerRange(peerID p2p.ID, base int64, height int64) { +func (pool *BlockPool) SetPeerRange(srcPeer p2p.Peer, base int64, height int64) { + peerID := srcPeer.ID() pool.mtx.Lock() defer pool.mtx.Unlock() if base > height { - pool.Logger.Info("Peer reporting base greater than height; banning", + pool.Logger.Info("Peer reporting base greater than height; removing", "peer", peerID, "base", base, "height", height) if _, exists := pool.peers[peerID]; exists { pool.removePeer(peerID) } - pool.banPeer(peerID) + if !srcPeer.IsPersistent() { + pool.banPeer(peerID) + } return } peer := pool.peers[peerID] if peer != nil { if height < peer.height || base < peer.base { - pool.Logger.Info("Peer reporting decreasing height or base; removing and banning", + pool.Logger.Info("Peer reporting decreasing height or base; removing", "peer", peerID, "prevHeight", peer.height, "height", height, "prevBase", peer.base, "base", base) pool.removePeer(peerID) - pool.banPeer(peerID) + if !srcPeer.IsPersistent() { + pool.banPeer(peerID) + } return } peer.base = base diff --git a/blocksync/pool_test.go b/blocksync/pool_test.go index 636fe72115c..4d4a1217670 100644 --- a/blocksync/pool_test.go +++ b/blocksync/pool_test.go @@ -2,6 +2,7 @@ package blocksync import ( "fmt" + "net" "testing" "time" @@ -11,6 +12,7 @@ import ( "github.com/tendermint/tendermint/libs/log" tmrand "github.com/tendermint/tendermint/libs/rand" "github.com/tendermint/tendermint/p2p" + "github.com/tendermint/tendermint/p2p/conn" "github.com/tendermint/tendermint/types" ) @@ -18,6 +20,45 @@ func init() { peerTimeout = 2 * time.Second } +// fakePeer is a minimal p2p.Peer implementation for SetPeerRange unit tests. +// Only ID() and IsPersistent() are read by SetPeerRange; the rest are stubs. +type fakePeer struct { + id p2p.ID + persistent bool +} + +func (p *fakePeer) ID() p2p.ID { return p.id } +func (p *fakePeer) IsPersistent() bool { return p.persistent } +func (p *fakePeer) IsOutbound() bool { return false } +func (p *fakePeer) RemoteIP() net.IP { return nil } +func (p *fakePeer) RemoteAddr() net.Addr { return nil } +func (p *fakePeer) CloseConn() error { return nil } +func (p *fakePeer) NodeInfo() p2p.NodeInfo { + return p2p.DefaultNodeInfo{DefaultNodeID: p.id} +} +func (p *fakePeer) Status() conn.ConnectionStatus { return conn.ConnectionStatus{} } +func (p *fakePeer) SocketAddr() *p2p.NetAddress { return nil } +func (p *fakePeer) Send(byte, []byte) bool { return true } +func (p *fakePeer) TrySend(byte, []byte) bool { return true } +func (p *fakePeer) Set(string, interface{}) {} +func (p *fakePeer) Get(string) interface{} { return nil } +func (p *fakePeer) FlushStop() {} +func (p *fakePeer) Start() error { return nil } +func (p *fakePeer) OnStart() error { return nil } +func (p *fakePeer) Stop() error { return nil } +func (p *fakePeer) OnStop() {} +func (p *fakePeer) Reset() error { return nil } +func (p *fakePeer) OnReset() error { return nil } +func (p *fakePeer) IsRunning() bool { return true } +func (p *fakePeer) Quit() <-chan struct{} { return nil } +func (p *fakePeer) String() string { return string(p.id) } +func (p *fakePeer) SetLogger(log.Logger) {} + +func mkPeer(id p2p.ID) p2p.Peer { return &fakePeer{id: id} } +func mkPersistentPeer(id p2p.ID) p2p.Peer { + return &fakePeer{id: id, persistent: true} +} + type testPeer struct { id p2p.ID base int64 @@ -105,7 +146,7 @@ func TestBlockPoolBasic(t *testing.T) { // the pool from spawning any requesters. go func() { for _, peer := range peers { - pool.SetPeerRange(peer.id, start, peer.height) + pool.SetPeerRange(mkPeer(peer.id), start, peer.height) } }() @@ -167,7 +208,7 @@ func TestBlockPoolTimeout(t *testing.T) { // the pool from spawning any requesters. go func() { for _, peer := range peers { - pool.SetPeerRange(peer.id, start, peer.height) + pool.SetPeerRange(mkPeer(peer.id), start, peer.height) } }() @@ -228,7 +269,7 @@ func TestBlockPoolRemovePeer(t *testing.T) { // add peers for peerID, peer := range peers { - pool.SetPeerRange(peerID, peer.base, peer.height) + pool.SetPeerRange(mkPeer(peerID), peer.base, peer.height) } assert.EqualValues(t, 10, pool.MaxPeerHeight()) @@ -265,11 +306,11 @@ func TestSetPeerRange_DecreasingHeight(t *testing.T) { pool.SetLogger(log.TestingLogger()) peerID := p2p.ID("malicious") - pool.SetPeerRange(peerID, 1, 1000) + pool.SetPeerRange(mkPeer(peerID), 1, 1000) require.EqualValues(t, 1000, pool.MaxPeerHeight()) require.NotNil(t, pool.peers[peerID]) - pool.SetPeerRange(peerID, 1, 1) + pool.SetPeerRange(mkPeer(peerID), 1, 1) require.Nil(t, pool.peers[peerID], "peer must be removed after lowering height") require.True(t, pool.isPeerBanned(peerID), "peer must be banned after lowering height") require.EqualValues(t, 0, pool.MaxPeerHeight(), @@ -287,10 +328,10 @@ func TestSetPeerRange_DecreasingBase(t *testing.T) { pool.SetLogger(log.TestingLogger()) peerID := p2p.ID("malicious") - pool.SetPeerRange(peerID, 50, 1000) + pool.SetPeerRange(mkPeer(peerID), 50, 1000) require.NotNil(t, pool.peers[peerID]) - pool.SetPeerRange(peerID, 10, 1000) + pool.SetPeerRange(mkPeer(peerID), 10, 1000) require.Nil(t, pool.peers[peerID], "peer must be removed after lowering base") require.True(t, pool.isPeerBanned(peerID), "peer must be banned after lowering base") } @@ -307,12 +348,45 @@ func TestSetPeerRange_BaseGreaterThanHeight(t *testing.T) { pool.SetLogger(log.TestingLogger()) peerID := p2p.ID("malicious") - pool.SetPeerRange(peerID, 500, 100) + pool.SetPeerRange(mkPeer(peerID), 500, 100) require.Nil(t, pool.peers[peerID], "peer with base > height must not be added") require.True(t, pool.isPeerBanned(peerID)) require.EqualValues(t, 0, pool.MaxPeerHeight()) } +func TestSetPeerRange_PersistentPeerInvalidRangeRemovedButNotBanned(t *testing.T) { + requestsCh := make(chan BlockRequest, 10) + errorsCh := make(chan peerError, 10) + + pool := NewBlockPool(1, requestsCh, errorsCh) + pool.SetLogger(log.TestingLogger()) + + peerID := p2p.ID("persistent-invalid") + pool.SetPeerRange(mkPersistentPeer(peerID), 500, 100) + require.Nil(t, pool.peers[peerID], "persistent peer with base > height must not be added") + require.False(t, pool.isPeerBanned(peerID), "persistent peer must not be written to bannedPeers") + require.EqualValues(t, 0, pool.MaxPeerHeight()) +} + +func TestSetPeerRange_PersistentPeerDecreasingHeightRemovedButNotBanned(t *testing.T) { + requestsCh := make(chan BlockRequest, 10) + errorsCh := make(chan peerError, 10) + + pool := NewBlockPool(1, requestsCh, errorsCh) + pool.SetLogger(log.TestingLogger()) + + peerID := p2p.ID("persistent-decreasing") + pool.SetPeerRange(mkPersistentPeer(peerID), 1, 1000) + require.EqualValues(t, 1000, pool.MaxPeerHeight()) + require.NotNil(t, pool.peers[peerID]) + + pool.SetPeerRange(mkPersistentPeer(peerID), 1, 1) + require.Nil(t, pool.peers[peerID], "persistent peer must be removed after lowering height") + require.False(t, pool.isPeerBanned(peerID), "persistent peer must not be written to bannedPeers") + require.EqualValues(t, 0, pool.MaxPeerHeight(), + "maxPeerHeight must drop to 0 once the invalid persistent peer is removed") +} + // TestBanPeer_PreventsReentry verifies that a banned peer cannot reintroduce // itself with a fresh — even otherwise valid — StatusResponse during the // ban window. @@ -324,10 +398,10 @@ func TestBanPeer_PreventsReentry(t *testing.T) { pool.SetLogger(log.TestingLogger()) peerID := p2p.ID("malicious") - pool.SetPeerRange(peerID, 500, 100) + pool.SetPeerRange(mkPeer(peerID), 500, 100) require.True(t, pool.isPeerBanned(peerID)) - pool.SetPeerRange(peerID, 1, 200) + pool.SetPeerRange(mkPeer(peerID), 1, 200) require.Nil(t, pool.peers[peerID], "banned peer must not be re-added during ban window") require.EqualValues(t, 0, pool.MaxPeerHeight()) } @@ -347,12 +421,12 @@ func TestBanPeer_ExpiryAllowsReentry(t *testing.T) { pool.SetLogger(log.TestingLogger()) peerID := p2p.ID("rehabilitated") - pool.SetPeerRange(peerID, 500, 100) + pool.SetPeerRange(mkPeer(peerID), 500, 100) require.True(t, pool.isPeerBanned(peerID)) time.Sleep(20 * time.Millisecond) - pool.SetPeerRange(peerID, 1, 200) + pool.SetPeerRange(mkPeer(peerID), 1, 200) require.NotNil(t, pool.peers[peerID], "peer should be re-admitted after ban expiry") require.False(t, pool.isPeerBanned(peerID)) require.EqualValues(t, 200, pool.MaxPeerHeight()) @@ -374,10 +448,10 @@ func TestBlockPoolMaxPeerHeightNotPoisonedByHighBase(t *testing.T) { pool := NewBlockPool(200, requestsCh, errorsCh) pool.SetLogger(log.TestingLogger()) - pool.SetPeerRange(p2p.ID("honest"), 1, 200) + pool.SetPeerRange(mkPeer(p2p.ID("honest")), 1, 200) require.EqualValues(t, 200, pool.MaxPeerHeight()) - pool.SetPeerRange(p2p.ID("malicious"), 1_000_000, 1_000_100) + pool.SetPeerRange(mkPeer(p2p.ID("malicious")), 1_000_000, 1_000_100) require.EqualValues(t, 200, pool.MaxPeerHeight(), "malicious peer with base above pool.height must not raise maxPeerHeight") @@ -396,8 +470,8 @@ func TestBlockPoolMaxPeerHeightRefreshesOnPopRequest(t *testing.T) { pool := NewBlockPool(10, requestsCh, errorsCh) pool.SetLogger(log.TestingLogger()) - pool.SetPeerRange(p2p.ID("A"), 1, 20) - pool.SetPeerRange(p2p.ID("B"), 15, 100) + pool.SetPeerRange(mkPeer(p2p.ID("A")), 1, 20) + pool.SetPeerRange(mkPeer(p2p.ID("B")), 15, 100) require.EqualValues(t, 20, pool.MaxPeerHeight(), "peer B (base=15) must not contribute while pool.height (10) is below its base") @@ -434,8 +508,8 @@ func TestSegmentedPeers_BoundaryDeadlockPrevention(t *testing.T) { // the maxPeerHeight branch we want to exercise. pool.startTime = time.Now().Add(-10 * time.Second) - pool.SetPeerRange(p2p.ID("A"), 1, 150) - pool.SetPeerRange(p2p.ID("B"), 151, 500) + pool.SetPeerRange(mkPeer(p2p.ID("A")), 1, 150) + pool.SetPeerRange(mkPeer(p2p.ID("B")), 151, 500) require.EqualValues(t, 150, pool.MaxPeerHeight(), "peer B must be filtered while its base (151) > pool.height (150)") diff --git a/blocksync/reactor.go b/blocksync/reactor.go index f9a03713350..6ae80f95e02 100644 --- a/blocksync/reactor.go +++ b/blocksync/reactor.go @@ -3,12 +3,14 @@ package blocksync import ( "fmt" "reflect" + "sync" "time" "github.com/tendermint/tendermint/l2node" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/p2p" bcproto "github.com/tendermint/tendermint/proto/tendermint/blocksync" + "github.com/tendermint/tendermint/sequencer" sm "github.com/tendermint/tendermint/state" "github.com/tendermint/tendermint/store" "github.com/tendermint/tendermint/types" @@ -25,8 +27,8 @@ const ( // within this much of the system time. // stopSyncingDurationMinutes = 10 - // ask for best height every 10s - statusUpdateIntervalSeconds = 10 + // ask for best height every 5s + statusUpdateIntervalSeconds = 5 // check if we should switch to consensus reactor switchToConsensusIntervalSeconds = 1 ) @@ -40,6 +42,11 @@ type consensusReactor interface { type sequencerReactor interface { // for when we switch from blockchain reactor to sequencer mode StartSequencerRoutines() error + // SetPool updates the peer-height pool reference used by the broadcast + // reactor. Called after a reorg rebuild so the broadcast reactor stops + // reading from the discarded pool. The argument is the *blocksync.BlockPool + // directly; it satisfies sequencer.BlockPool by structural typing. + SetPool(sequencer.BlockPool) } // SequencerState interface for accessing sequencer state (avoids import cycle) @@ -72,8 +79,16 @@ type Reactor struct { pool *BlockPool blockSync bool - requestsCh <-chan BlockRequest - errorsCh <-chan peerError + // Bidirectional so the same buffers can be handed to a freshly-built BlockPool + // during reorg (see SwitchToBlockSyncFromReorg), while still being read here. + requestsCh chan BlockRequest + errorsCh chan peerError + + poolWg sync.WaitGroup + + // Sequencer signature verification (set after upgrade via SetVerifier/SetSigStore) + verifier sequencer.SequencerVerifier + sigStore *sequencer.SignatureStore } // NewReactor returns new reactor instance. @@ -139,6 +154,16 @@ func (bcR *Reactor) SetStateV2(stateV2 SequencerState) { bcR.stateV2 = stateV2 } +// SetVerifier sets the sequencer verifier for block signature validation. +func (bcR *Reactor) SetVerifier(v sequencer.SequencerVerifier) { + bcR.verifier = v +} + +// SetSigStore sets the signature store for persisting/attaching block signatures. +func (bcR *Reactor) SetSigStore(s *sequencer.SignatureStore) { + bcR.sigStore = s +} + // Pool returns the block pool for broadcast reactor to check peer heights. func (bcR *Reactor) Pool() *BlockPool { return bcR.pool @@ -151,11 +176,19 @@ func (bcR *Reactor) OnStart() error { if err != nil { return err } - go bcR.poolRoutine(false) + bcR.startPoolRoutine(false) } return nil } +func (bcR *Reactor) startPoolRoutine(stateSynced bool) { + bcR.poolWg.Add(1) + go func() { + defer bcR.poolWg.Done() + bcR.poolRoutine(stateSynced) + }() +} + // SwitchToBlockSync is called by the state sync reactor when switching to block sync. func (bcR *Reactor) SwitchToBlockSync(state sm.State) error { bcR.blockSync = true @@ -166,10 +199,49 @@ func (bcR *Reactor) SwitchToBlockSync(state sm.State) error { if err != nil { return err } - go bcR.poolRoutine(true) + bcR.startPoolRoutine(true) + return nil +} + +func (bcR *Reactor) StopForReorg() error { + if bcR.IsRunning() { + if err := bcR.Stop(); err != nil { + return err + } + } + bcR.poolWg.Wait() return nil } +// SwitchToBlockSyncFromReorg restarts block sync after a reorg by rebuilding +// the BlockPool. currentHeight is the post-reorg head; the new pool will +// request from currentHeight+1. +// +// Stop -> Reset -> swap pool -> Start, on the same reactor instance. +// +// Reset() requires the reactor to be in stopped state. It can legitimately +// fail when the reactor was never started yet (e.g. early init paths, +// certain test setups). Log the error rather than abort: continuing to +// rebuild the pool and call Start() is the right recovery, and silent +// `_ = bcR.Reset()` would hide genuine lifecycle drift bugs. +func (bcR *Reactor) SwitchToBlockSyncFromReorg(currentHeight int64) error { + if err := bcR.Reset(); err != nil { + bcR.Logger.Error("reset blocksync reactor failed; continuing", "err", err) + } + + bcR.pool = NewBlockPool(currentHeight+1, bcR.requestsCh, bcR.errorsCh) + bcR.pool.SetLogger(bcR.Logger) + + // Point the sequencer broadcast reactor at the new pool so peer-height + // lookups stop reading the discarded one. Best-effort; in PBFT-only or + // test setups SEQUENCER may not be registered. + if seqR, ok := bcR.Switch.Reactor("SEQUENCER").(sequencerReactor); ok { + seqR.SetPool(bcR.pool) + } + + return bcR.Start() +} + // OnStop implements service.Service. func (bcR *Reactor) OnStop() { if bcR.blockSync { @@ -179,6 +251,11 @@ func (bcR *Reactor) OnStop() { } } +// OnReset implements service.Service. +// Pool is rebuilt explicitly in SwitchToBlockSyncFromReorg before Start, so +// nothing to do here; kept only so BaseService.Reset() doesn't panic. +func (bcR *Reactor) OnReset() error { return nil } + // GetChannels implements Reactor func (bcR *Reactor) GetChannels() []*p2p.ChannelDescriptor { return []*p2p.ChannelDescriptor{ @@ -269,6 +346,19 @@ func (bcR *Reactor) respondToPeerV2(msg *bcproto.BlockRequest, src p2p.Peer) boo return src.TrySend(BlocksyncChannel, msgBytes) } + // Attach signature from store; if unavailable, refuse to serve + sig, err := bcR.sigStore.GetSignature(blockData.Hash) + if err != nil { + bcR.Logger.Error("No signature available for block, refusing to serve", + "height", msg.Height, "err", err) + msgBytes, encErr := EncodeMsg(&bcproto.NoBlockResponse{Height: msg.Height}) + if encErr != nil { + return false + } + return src.TrySend(BlocksyncChannel, msgBytes) + } + blockData.Signature = sig + bcR.Logger.Debug("respondToPeerV2: got block from geth", "height", msg.Height, "hash", blockData.Hash.Hex()) @@ -290,24 +380,55 @@ func (bcR *Reactor) L2Node() l2node.L2Node { return bcR.l2Node } -// syncBlockV2 handles syncing a single BlockV2 in sequencer mode. -// No signature verification during sync - only broadcast channel verifies signatures. -// Returns true if sync was successful, false if there was an error (already handled). -func (bcR *Reactor) syncBlockV2(block types.SyncableBlock, blocksSynced *uint64, lastRate *float64, lastHundred *time.Time) bool { +// verifyBlockV2 asserts block is a *types.BlockV2 and verifies its sequencer signature +// (history-aware lookup via IsSequencerAt). Pure check: no pool side effects — the caller +// decides what to redo/ban. Shared by the boundary check and syncBlockV2. +func (bcR *Reactor) verifyBlockV2(block types.SyncableBlock) (*types.BlockV2, error) { blockV2, ok := block.(*types.BlockV2) if !ok { - bcR.Logger.Error("Expected BlockV2 after upgrade", "height", block.GetHeight()) - bcR.pool.RedoRequest(block.GetHeight()) + return nil, fmt.Errorf("expected BlockV2 at height %d, got V1", block.GetHeight()) + } + if err := sequencer.VerifyBlockSignature(bcR.verifier, blockV2); err != nil { + return nil, fmt.Errorf("block signature verification failed at height %d: %w", blockV2.GetHeight(), err) + } + return blockV2, nil +} + +// redoAndBan re-queues the block's height and disconnects the peer that served it. +// Single-block granularity: the PBFT path bans only the mismatched block, while the +// one-time boundary check calls it for both blocks. +func (bcR *Reactor) redoAndBan(block types.SyncableBlock, reason string) { + bcR.Logger.Error("blocksync redo+ban", "height", block.GetHeight(), "reason", reason) + peerID := bcR.pool.RedoRequest(block.GetHeight()) + if peer := bcR.Switch.Peers().Get(peerID); peer != nil { + bcR.Switch.StopPeerForError(peer, fmt.Errorf("blocksync: %s", reason)) + } +} + +// syncBlockV2 handles syncing a single BlockV2 in sequencer mode. +// Verifies block signature using the history-aware verifier, then applies. +func (bcR *Reactor) syncBlockV2(block types.SyncableBlock, blocksSynced *uint64, lastRate *float64, lastHundred *time.Time) bool { + blockV2, err := bcR.verifyBlockV2(block) + if err != nil { + // Wrong type or bad signature == the peer served bad/forged data: redo + ban. + bcR.redoAndBan(block, fmt.Sprintf("BlockV2 verification failed: %v", err)) return false } - // Apply BlockV2 via stateV2 (no signature verification during sync) + // Apply BlockV2 via stateV2. The block is already signature-verified (authentic), so an + // apply failure is almost always a local issue (geth state), not the peer's fault — redo + // without banning to avoid dropping honest peers during a local hiccup. if err := bcR.stateV2.ApplyBlock(blockV2); err != nil { bcR.Logger.Error("Failed to apply BlockV2", "height", blockV2.Number, "err", err) bcR.pool.RedoRequest(blockV2.GetHeight()) return false } + // Persist signature for historical serving + if err := bcR.sigStore.SaveSignature(blockV2.Hash, blockV2.Signature); err != nil { + panic(fmt.Sprintf("failed to save signature at height %d: %v", blockV2.Number, err)) + } + bcR.pool.PopRequest() *blocksSynced++ @@ -372,7 +493,7 @@ func (bcR *Reactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) { case *bcproto.StatusResponse: // Got a peer status. Unverified. bcR.Logger.Debug("SetPeerRange", "peer", src.ID(), "base", msg.Base, "height", msg.Height) - bcR.pool.SetPeerRange(src.ID(), msg.Base, msg.Height) + bcR.pool.SetPeerRange(src, msg.Base, msg.Height) case *bcproto.NoBlockResponse: bcR.Logger.Debug("Peer does not have requested block", "peer", src, "height", msg.Height) default: @@ -389,10 +510,6 @@ func (bcR *Reactor) poolRoutine(stateSynced bool) { trySyncTicker := time.NewTicker(trySyncIntervalMS * time.Millisecond) defer trySyncTicker.Stop() - statusUpdateTicker := time.NewTicker(statusUpdateIntervalSeconds * time.Second) - // no longer stop the ticker, reuse it for sequencer mode status updates - //defer statusUpdateTicker.Stop() - switchToConsensusTicker := time.NewTicker(switchToConsensusIntervalSeconds * time.Second) defer switchToConsensusTicker.Stop() @@ -406,7 +523,12 @@ func (bcR *Reactor) poolRoutine(stateSynced bool) { didProcessCh := make(chan struct{}, 1) + bcR.poolWg.Add(1) go func() { + defer bcR.poolWg.Done() + statusUpdateTicker := time.NewTicker(statusUpdateIntervalSeconds * time.Second) + // no longer stop the ticker, reuse it for sequencer mode status updates + defer statusUpdateTicker.Stop() for { select { case <-bcR.Quit(): @@ -468,6 +590,7 @@ FOR_LOOP: bcR.Logger.Info("Switching to sequencer mode", "height", height) seqR, ok := bcR.Switch.Reactor("SEQUENCER").(sequencerReactor) if ok { + bcR.BroadcastStatusRequest() if err := seqR.StartSequencerRoutines(); err != nil { bcR.Logger.Error("Failed to start sequencer mode", "err", err) } @@ -509,18 +632,9 @@ FOR_LOOP: didProcessCh <- struct{}{} } - if firstSync.GetHeight()+1 == upgrade.UpgradeBlockHeight { - if err := bcR.handleTheLastTMBlock(state, firstSync); err != nil { - bcR.Logger.Error("Error in apply last tendermint block", "err", err) - peerID := bcR.pool.RedoRequest(firstSync.GetHeight()) - peer := bcR.Switch.Peers().Get(peerID) - if peer != nil { - bcR.Switch.StopPeerForError(peer, fmt.Errorf("handleTheLastTMBlock error: %v", err)) - } - continue FOR_LOOP - } - bcR.pool.PopRequest() - blocksSynced++ + // Detect and process the last PBFT block at the timestamp-driven upgrade + // boundary (runs at most once; no-op after the boundary height is recorded). + if bcR.checkAndHandleTheLastTMBlock(state, firstSync, secondSync, &blocksSynced) { continue FOR_LOOP } @@ -530,8 +644,20 @@ FOR_LOOP: continue FOR_LOOP } - // PBFT mode: type assert to *types.Block - first, second := firstSync.(*types.Block), secondSync.(*types.Block) + // PBFT mode: both blocks must be V1. A peer that serves a V2 block in this + // range is malicious or on a bad fork — ban only the mismatched one(s) (this + // is the hot path; avoid false-banning the peer that served a correct block). + first, ok1 := firstSync.(*types.Block) + second, ok2 := secondSync.(*types.Block) + if !ok1 || !ok2 { + if !ok1 { + bcR.redoAndBan(firstSync, "expected V1 block in PBFT range") + } + if !ok2 { + bcR.redoAndBan(secondSync, "expected V1 block in PBFT range") + } + continue FOR_LOOP + } firstParts, err := first.MakePartSet(types.BlockPartSizeBytes) if err != nil { @@ -624,10 +750,34 @@ func (bcR *Reactor) BroadcastStatusRequest() error { return nil } -// just skip the last tendermint block commit check. -// TODO: consider add the commit check in the future or using batch derivation reorg +func (bcR *Reactor) checkAndHandleTheLastTMBlock(state sm.State, first, second types.SyncableBlock, blocksSynced *uint64) bool { + if upgrade.UpgradeBlockHeight() >= 0 { + return false // boundary already discovered; only runs once + } + if first.GetBlockVersion() != types.Version1 || !upgrade.IsUpgradedByTs(first.GetTime()) { + return false // not the boundary; fall through to the normal sync paths + } + if _, err := bcR.verifyBlockV2(second); err != nil { + bcR.redoAndBan(first, fmt.Sprintf("boundary check failed: %v", err)) + bcR.redoAndBan(second, fmt.Sprintf("boundary: second must be a valid V2 block: %v", err)) + return true + } + if err := bcR.handleTheLastTMBlock(state, first); err != nil { + bcR.redoAndBan(first, fmt.Sprintf("apply last tendermint block: %v", err)) + bcR.redoAndBan(second, fmt.Sprintf("boundary rolled back after first failed: %v", err)) + return true + } + upgrade.SetUpgradeBlockHeight(first.GetHeight()) + bcR.pool.PopRequest() + *blocksSynced++ + return true +} + func (bcR *Reactor) handleTheLastTMBlock(state sm.State, lastSyncable types.SyncableBlock) error { - last := lastSyncable.(*types.Block) + last, ok := lastSyncable.(*types.Block) + if !ok { + return fmt.Errorf("handleTheLastTMBlock: expected V1 block at height %d", lastSyncable.GetHeight()) + } lastParts, err := last.MakePartSet(types.BlockPartSizeBytes) if err != nil { bcR.Logger.Error( @@ -665,12 +815,21 @@ func (bcR *Reactor) handleTheLastTMBlock(state sm.State, lastSyncable types.Sync func (bcR *Reactor) getHeight() int64 { height := bcR.store.Height() - // In sequencer mode, get height directly from l2Node (geth) - if bcR.l2Node != nil && upgrade.IsUpgraded(height+1) { + if !upgrade.IsUpgraded(height + 1) { + return height + } + // In sequencer mode, get height from state v2 or L2 node + if bcR.stateV2 != nil { + if l2Height := bcR.stateV2.LatestHeight(); l2Height > height { + return l2Height + } + } + + if bcR.l2Node != nil { if l2Block, err := bcR.l2Node.GetLatestBlockV2(); err == nil && l2Block != nil { bcR.Logger.Debug("StatusRequest", "l2Height", l2Block.Number) if l2Height := int64(l2Block.Number); l2Height > height { - height = l2Height + return l2Height } } } diff --git a/consensus/reactor.go b/consensus/reactor.go index 51769d2f872..531139640f8 100644 --- a/consensus/reactor.go +++ b/consensus/reactor.go @@ -81,7 +81,7 @@ func (conR *Reactor) OnStart() error { if upgrade.IsUpgraded(conR.conS.Height) { conR.Logger.Info("Already upgraded to sequencer mode, consensus reactor will not start", "height", conR.conS.Height, - "upgradeHeight", upgrade.UpgradeBlockHeight) + "upgradeHeight", upgrade.UpgradeBlockHeight()) return nil } @@ -113,14 +113,6 @@ func (conR *Reactor) OnStop() { } } -// StopForUpgrade stops the consensus reactor when upgrading to sequencer mode. -// This is called when the chain reaches the upgrade height. -func (conR *Reactor) StopForUpgrade() { - conR.Logger.Info("Stopping consensus reactor for sequencer upgrade", - "height", conR.conS.Height, - "upgradeHeight", upgrade.UpgradeBlockHeight) - conR.OnStop() -} // SwitchToConsensus switches from block_sync mode to consensus mode. // It resets the state, turns off block_sync, and starts the consensus state-machine @@ -128,7 +120,7 @@ func (conR *Reactor) SwitchToConsensus(state sm.State, skipWAL bool) { conR.Logger.Info("SwitchToConsensus") // We have no votes, so reconstruct LastCommit from SeenCommit. - if state.LastBlockHeight > 0 { + if state.LastBlockHeight > 0 && !upgrade.IsUpgraded(state.LastBlockHeight+1) { conR.conS.reconstructLastCommit(state) } diff --git a/consensus/state.go b/consensus/state.go index 69819f8f673..da8562ab15c 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -1809,21 +1809,32 @@ func (cs *State) finalizeCommit(height int64) { } // Check for upgrade to sequencer mode - if upgrade.IsUpgraded(cs.Height) { - logger.Info("Upgrade height reached, switching to sequencer mode", - "height", cs.Height, - "upgradeHeight", upgrade.UpgradeBlockHeight) + if upgrade.IsUpgradedByTs(block.Time.UnixMilli()) { + // persistent the upgradeHeight, load it into upgradeHeight when startup + upgrade.SetUpgradeBlockHeight(height) - // Call upgrade callback first (allows reactor to handle transition) - if cs.onUpgrade != nil { - cs.onUpgrade() - } + logger.Info("Upgrade time reached, switching to sequencer mode", + "height", height, + "upgradeTime", upgrade.UpgradeBlockTime()) - // Stop consensus state + // Stop consensus state first. + // + // This closes the BaseService quit channel synchronously, so any + // goroutine started by onUpgrade can safely call cs.Wait() to block + // until receiveRoutine has fully exited (cs.done closed). Stopping + // before onUpgrade guarantees the happens-before relationship that + // the reactor switch relies on. if err := cs.Stop(); err != nil { logger.Error("Failed to stop consensus state", "err", err) panic(err) } + + // Notify upgrade observer (e.g. node-level reactor switch). + // The callback is expected to do its work in a goroutine and use + // cs.Wait() for synchronization rather than time-based heuristics. + if cs.onUpgrade != nil { + cs.onUpgrade() + } return } diff --git a/go.mod b/go.mod index 37550218781..ffdc0d55f8b 100644 --- a/go.mod +++ b/go.mod @@ -64,6 +64,7 @@ require ( github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5 // indirect github.com/OpenPeeDeeP/depguard v1.1.0 // indirect github.com/StackExchange/wmi v1.2.1 // indirect + github.com/VictoriaMetrics/fastcache v1.12.1 // indirect github.com/alexkohler/prealloc v1.0.0 // indirect github.com/alingse/asasalint v0.0.11 // indirect github.com/ashanbrown/forbidigo v1.3.0 // indirect @@ -216,6 +217,7 @@ require ( github.com/pointlander/jetset v1.0.1-0.20190518214125-eee7eff80bd4 // indirect github.com/polyfloyd/go-errorlint v1.0.2 // indirect github.com/prometheus/procfs v0.12.0 // indirect + github.com/prometheus/tsdb v0.7.1 // indirect github.com/quasilyte/go-ruleguard v0.3.17 // indirect github.com/quasilyte/gogrep v0.0.0-20220120141003-628d8b3623b5 // indirect github.com/quasilyte/regex/syntax v0.0.0-20200407221936-30656e2c4a95 // indirect diff --git a/go.sum b/go.sum index b3c0e4873c7..0d1de343307 100644 --- a/go.sum +++ b/go.sum @@ -112,6 +112,8 @@ github.com/alexkohler/prealloc v1.0.0 h1:Hbq0/3fJPQhNkN0dR95AVrr6R7tou91y0uHG5pO github.com/alexkohler/prealloc v1.0.0/go.mod h1:VetnK3dIgFBBKmg0YnD9F9x6Icjd+9cvfHR56wJVlKE= github.com/alingse/asasalint v0.0.11 h1:SFwnQXJ49Kx/1GghOFz1XGqHYKp21Kq1nHad/0WQRnw= github.com/alingse/asasalint v0.0.11/go.mod h1:nCaoMhw7a9kSJObvQyVzNTPBDbNpdocqrSP7t/cW5+I= +github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8= +github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= @@ -1564,6 +1566,7 @@ golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= diff --git a/l2node/l2node.go b/l2node/l2node.go index 1e1bc21365b..ba0300db22d 100644 --- a/l2node/l2node.go +++ b/l2node/l2node.go @@ -57,8 +57,9 @@ type L2Node interface { RequestBlockDataV2(parentHash []byte) (block *BlockV2, collectedL1Msgs bool, err error) // ApplyBlockV2 applies a BlockV2 to the L2 execution layer. - // Uses Engine API (NewL2Block) internally. - ApplyBlockV2(block *BlockV2) error + // Returns (applied, err): applied=true if geth state changed, + // applied=false if idempotent skip (block already on-chain with same hash). + ApplyBlockV2(block *BlockV2) (applied bool, err error) // GetBlockByNumber retrieves a BlockV2 by its number. // Can be implemented using geth's eth_getBlockByNumber JSON-RPC. diff --git a/l2node/mock.go b/l2node/mock.go index b5cbf3354d0..941eed212bb 100644 --- a/l2node/mock.go +++ b/l2node/mock.go @@ -5,6 +5,7 @@ import ( "fmt" "os" + "github.com/morph-l2/go-ethereum/common" tmjson "github.com/tendermint/tendermint/libs/json" tmos "github.com/tendermint/tendermint/libs/os" tmrand "github.com/tendermint/tendermint/libs/rand" @@ -14,7 +15,9 @@ var _ L2Node = &MockL2Node{} type MockL2Node struct { TxNumber int - ValidatorSetFile string // used to control the validator set updates + ValidatorSetFile string // used to control the validator set updates + maxAppliedHeight uint64 // tracks highest block applied (for V2 idempotent check) + maxAppliedHash common.Hash // hash at maxAppliedHeight, so same-height reorg blocks aren't skipped } func NewMockL2Node(n int, validatorSetFile string) L2Node { @@ -139,9 +142,17 @@ func (l *MockL2Node) RequestBlockDataV2(parentHash []byte) (*BlockV2, bool, erro }, false, nil } -func (l *MockL2Node) ApplyBlockV2(block *BlockV2) error { - // Mock implementation: do nothing - return nil +func (l *MockL2Node) ApplyBlockV2(block *BlockV2) (bool, error) { + // Skip only a true duplicate (same height AND same hash). A same-height + // block with a different hash is a reorg and must be applied; otherwise + // reorg unit tests get false-positive passes. + if block.Number < l.maxAppliedHeight || + (block.Number == l.maxAppliedHeight && block.Hash == l.maxAppliedHash) { + return false, nil + } + l.maxAppliedHeight = block.Number + l.maxAppliedHash = block.Hash + return true, nil } func (l *MockL2Node) GetBlockByNumber(height uint64) (*BlockV2, error) { diff --git a/node/node.go b/node/node.go index 7d2b97e8d8b..2a40c2d16b0 100644 --- a/node/node.go +++ b/node/node.go @@ -111,8 +111,10 @@ func DefaultNewNode(config *cfg.Config, logger log.Logger) (*Node, error) { DefaultDBProvider, DefaultMetricsProvider(config.Instrumentation), logger, - nil, - nil, + nil, // sequencerVerifier + nil, // l1Tracker + nil, // sequencerSigner + nil, // ha: no HA in default node ) } @@ -140,6 +142,8 @@ type Option func(*Node) // See: https://github.com/tendermint/tendermint/issues/4595 type blockSyncReactor interface { SwitchToBlockSync(sm.State) error + StopForReorg() error + SwitchToBlockSyncFromReorg(int64) error } // CustomReactors allows you to add custom reactors (name -> p2p.Reactor) to @@ -237,20 +241,42 @@ type Node struct { // Sequencer mode (after upgrade) stateV2 *sequencer.StateV2 blockBroadcastReactor *sequencer.BlockBroadcastReactor + sigStore *sequencer.SignatureStore + ha sequencer.SequencerHA } -func initDBs(config *cfg.Config, dbProvider DBProvider) (blockStore *store.BlockStore, stateDB dbm.DB, err error) { +func initDBs(config *cfg.Config, dbProvider DBProvider) (blockStore *store.BlockStore, stateDB dbm.DB, sigStore *sequencer.SignatureStore, err error) { + var opened []dbm.DB + defer func() { + if err != nil { + for _, db := range opened { + _ = db.Close() + } + blockStore, stateDB, sigStore = nil, nil, nil + } + }() + var blockStoreDB dbm.DB blockStoreDB, err = dbProvider(&DBContext{"blockstore", config}) if err != nil { return } + opened = append(opened, blockStoreDB) blockStore = store.NewBlockStore(blockStoreDB) stateDB, err = dbProvider(&DBContext{"state", config}) if err != nil { return } + opened = append(opened, stateDB) + + var sigDB dbm.DB + sigDB, err = dbProvider(&DBContext{"signatures", config}) + if err != nil { + return + } + opened = append(opened, sigDB) + sigStore = sequencer.NewSignatureStore(sigDB) return } @@ -488,30 +514,37 @@ func createConsensusReactor( func createSequencerComponents( l2Node l2node.L2Node, pool *bc.BlockPool, - waitSync bool, logger log.Logger, verifier sequencer.SequencerVerifier, + l1Tracker sequencer.L1Tracker, signer sequencer.Signer, + sigStore *sequencer.SignatureStore, + ha sequencer.SequencerHA, ) (*sequencer.StateV2, *sequencer.BlockBroadcastReactor, error) { // Create StateV2 stateV2, err := sequencer.NewStateV2( l2Node, - sequencer.DefaultBlockInterval, logger, verifier, + l1Tracker, signer, + sigStore, + ha, ) if err != nil { return nil, nil, fmt.Errorf("failed to create StateV2: %w", err) } - // Create BlockBroadcastReactor (not started yet) + // Create BlockBroadcastReactor (not started yet). + // Routines are started later via StartSequencerRoutines() — see OnStart() + // doc comment for the full explanation. broadcastReactor := sequencer.NewBlockBroadcastReactor( pool, stateV2, - waitSync, logger, verifier, + l1Tracker, + sigStore, ) broadcastReactor.SetLogger(logger.With("module", "sequencer")) @@ -766,12 +799,14 @@ func NewNode( metricsProvider MetricsProvider, logger log.Logger, sequencerVerifier sequencer.SequencerVerifier, + l1Tracker sequencer.L1Tracker, sequencerSigner sequencer.Signer, + ha sequencer.SequencerHA, options ...Option, ) ( *Node, error, ) { - blockStore, stateDB, err := initDBs(config, dbProvider) + blockStore, stateDB, sigStore, err := initDBs(config, dbProvider) if err != nil { return nil, err } @@ -781,6 +816,14 @@ func NewNode( sm.StoreOptions{DiscardABCIResponses: config.Storage.DiscardABCIResponses}, ) + // Restore the persisted sequencer-upgrade boundary (if any) and wire the store so future + // SetUpgradeBlockHeight calls (finalizeCommit / blocksync) persist it automatically. + // A DB read failure here is fatal: booting with a reset boundary would let the node + // re-run PBFT past the upgrade. + if err := upgrade.SetStore(stateDB); err != nil { + return nil, err + } + state, genDoc, err := LoadStateFromDBOrGenesisDocProvider(stateDB, genesisDocProvider) if err != nil { return nil, err @@ -982,6 +1025,8 @@ func NewNode( indexerService: indexerService, blockIndexer: blockIndexer, eventBus: eventBus, + sigStore: sigStore, + ha: ha, } node.BaseService = *service.NewBaseService(logger, "Node", node) @@ -993,16 +1038,25 @@ func NewNode( if node.stateV2, node.blockBroadcastReactor, err = createSequencerComponents( l2NodeRef, bcR.Pool(), - blockSync || stateSync, logger, sequencerVerifier, + l1Tracker, sequencerSigner, + sigStore, + ha, // HA service injected from NewNode caller; nil disables HA mode ); err != nil { return nil, err } - // Set stateV2 on blocksync reactor for post-upgrade sync + // Wire HA FSM callback: ApplyBlock handles geth apply + SaveSignature. + if ha != nil { + ha.SetOnBlockApplied(node.stateV2.ApplyBlock) + } + + // Set stateV2&verifier&sigStore on blocksync reactor for post-upgrade bcR.SetStateV2(node.stateV2) + bcR.SetVerifier(sequencerVerifier) + bcR.SetSigStore(sigStore) // Register BlockBroadcastReactor with Switch sw.AddReactor("SEQUENCER", node.blockBroadcastReactor) @@ -1144,6 +1198,11 @@ func (n *Node) OnStop() { n.Logger.Error("problem closing statestore", "err", err) } } + if n.sigStore != nil { + if err := n.sigStore.Close(); err != nil { + n.Logger.Error("problem closing sigstore", "err", err) + } + } } // ConfigureRPC makes sure RPC has all the objects it needs to operate. @@ -1593,20 +1652,101 @@ func (n *Node) startSequencerMode() { func (n *Node) switchToSequencerMode() { n.Logger.Info("Upgrade callback triggered, scheduling switch to sequencer mode") - // NOTE: Must use goroutine to avoid deadlock - onUpgrade is called from finalizeCommit, - // and consensusReactor.Stop() would try to stop the state that's currently running. + // Run in a goroutine: this callback executes inside finalizeCommit which + // runs on receiveRoutine. We must let receiveRoutine return before it can + // observe cs.Quit() and close cs.done. Calling consensusReactor.Stop() + // synchronously here would block on conS.Wait() — which waits for the very + // goroutine we are running on — and deadlock. go func() { - // Wait a moment for finalizeCommit to complete - time.Sleep(100 * time.Millisecond) - - // Stop consensus reactor - n.Logger.Info("Stopping consensus reactor...") + // Wait until receiveRoutine has fully exited. + // + // cs.Stop() has already been called by finalizeCommit before invoking + // this callback (see consensus/state.go upgrade branch), so cs.Quit() + // is closed and receiveRoutine will hit the case <-cs.Quit() branch as + // soon as finalizeCommit returns. cs.done is closed at the very end of + // receiveRoutine. Waiting on it replaces the previous 100ms sleep with + // a deterministic synchronization point. + n.Logger.Info("Waiting for consensus state to drain...") + n.consensusState.Wait() + n.Logger.Info("Consensus state drained") + + // Belt-and-suspenders: ensure last PBFT votes / NewRoundStepMessage have + // time to flush to peers before we tear down the reactor's gossip routines. + // This compensates for tightened PBFT timeouts (timeout_precommit / commit). + time.Sleep(1 * time.Second) + + // Stop consensus reactor through the standard BaseService.Stop path so + // its stopped flag is set; this prevents the P2P switch from invoking + // OnStop a second time during node shutdown. OnStop will call + // conS.Stop() (idempotent: returns ErrAlreadyStopped, logged only) and + // conS.Wait() (already satisfied since cs.done is closed). + n.Logger.Info("Stopping consensus reactor for upgrade...") if err := n.consensusReactor.Stop(); err != nil { n.Logger.Error("Failed to stop consensus reactor", "err", err) } n.Logger.Info("Consensus reactor stopped") - // Start broadcast reactor + // Start broadcast reactor for sequencer mode. n.startSequencerMode() }() } + +// StopReactorsBeforeReorg stops the broadcast reactor and the blocksync reactor +// in preparation for a derivation-driven reorg. After this returns, no goroutine +// from either reactor is running and no new BlockResponseV2 / BlockRequest will +// be processed. HA-enabled nodes (sequencers) are skipped — sequencers do not +// participate in automatic reorg per docs/reorg/reorg兼容问题与优化.md. +// +// Order: broadcast first (stop applying new blocks via P2P), then blocksync +// (stop sync requests). Each Stop is gated on IsRunning so this is idempotent +// and safe to call when a reactor is already stopped or was never started. +func (n *Node) StopReactorsBeforeReorg() error { + if n.ha != nil { + return nil + } + + if err := n.blockBroadcastReactor.StopForReorg(); err != nil { + return err + } + + bcR, ok := n.bcReactor.(blockSyncReactor) + if !ok { + return fmt.Errorf("this blockchain reactor does not support stopping for reorg") + } + if err := bcR.StopForReorg(); err != nil { + return err + } + return nil +} + +// StartReactorsAfterReorg restarts the blocksync and broadcast reactors after +// a reorg, with the post-reorg head height as currentHeight. The blocksync +// reactor rebuilds its BlockPool from currentHeight+1 and is responsible for +// catching up the local node back to the network tip. Once caught up its +// switchToConsensus tick re-invokes StartSequencerRoutines on the broadcast +// reactor, which is brought back to a clean state by the Reset+Start below. +// +// Reset on a still-running reactor would fail with "can't reset running ..." +// — Stop must precede it (handled by StopReactorsBeforeReorg). The IsRunning +// check below makes this method idempotent in the rare case of partial state +// (e.g., previous Stop returned an error). +func (n *Node) StartReactorsAfterReorg(currentHeight int64) error { + if n.ha != nil { + return nil + } + + bcR, ok := n.bcReactor.(blockSyncReactor) + if !ok { + return fmt.Errorf("this blockchain reactor does not support reorg restart") + } + + // Reset() requires the reactor to be in stopped state. It can legitimately + // fail when the reactor was never started yet (e.g. very early init paths) + // or when StopReactorsBeforeReorg silently skipped Stop because the reactor + // wasn't running. Log and continue rather than abort the reorg flow — + // surfacing the error gives observability without blocking recovery. + if err := n.blockBroadcastReactor.Reset(); err != nil { + n.Logger.Error("reset broadcast reactor failed; continuing", "err", err) + } + return bcR.SwitchToBlockSyncFromReorg(currentHeight) +} diff --git a/rpc/test/helpers.go b/rpc/test/helpers.go index 0cc19f182e3..c1dce04961f 100644 --- a/rpc/test/helpers.go +++ b/rpc/test/helpers.go @@ -178,8 +178,10 @@ func NewTendermint(app abci.Application, opts *Options) *nm.Node { nm.DefaultDBProvider, nm.DefaultMetricsProvider(config.Instrumentation), logger, - nil, - nil, + nil, // sequencerVerifier + nil, // sequencerHealthGate + nil, // sequencerSigner + nil, // ha: no HA in RPC test node ) if err != nil { panic(err) diff --git a/sequencer/block_cache.go b/sequencer/block_cache.go index ff71a1759ba..dca239b89bf 100644 --- a/sequencer/block_cache.go +++ b/sequencer/block_cache.go @@ -193,3 +193,20 @@ func (rb *BlockRingBuffer) RollbackTo(targetHeight uint64) { defer rb.mtx.Unlock() rb.rollbackTo(targetHeight) } + +// Clear drops every cached block. Used during a reactor reset (e.g. reorg +// path via OnReset) when the chain head may have moved backwards and any +// previously cached block could be off-chain. Subsequent Add() calls +// repopulate the cache from the new head. +func (rb *BlockRingBuffer) Clear() { + rb.mtx.Lock() + defer rb.mtx.Unlock() + for i := range rb.blocks { + rb.blocks[i] = nil + } + rb.byHeight = make(map[uint64]*BlockV2) + rb.byHash = make(map[common.Hash]*BlockV2) + rb.position = make(map[uint64]int) + rb.minHeight = 0 + rb.maxHeight = 0 +} diff --git a/sequencer/block_cache_test.go b/sequencer/block_cache_test.go index b8b63302ec5..d598f60de25 100644 --- a/sequencer/block_cache_test.go +++ b/sequencer/block_cache_test.go @@ -239,17 +239,12 @@ func TestBlockRingBuffer_RollbackTo_All(t *testing.T) { rb.Add(makeBlock(i, byte(i))) } - // Rollback to before first block + // Rollback to before first block — removes all blocks with h > 99 rb.RollbackTo(99) - // All blocks removed, but minHeight stays at 100 - // (rollbackTo only removes > targetHeight) - assert.Equal(t, 1, rb.Count()) // Block 100 remains (not > 99, it's == 100) - - // Actually let's re-read the logic... rollbackTo removes h > targetHeight - // So rollbackTo(99) removes 100, 101, 102, 103, 104 - // Wait, the loop is: for h := rb.maxHeight; h > targetHeight; h-- - // So it removes 104, 103, 102, 101, 100 (all > 99) + // rollbackTo loop: for h := rb.maxHeight; h > targetHeight; h-- + // removes 104, 103, 102, 101, 100 (all > 99) + assert.Equal(t, 0, rb.Count()) } func TestBlockRingBuffer_RollbackTo_All_Correct(t *testing.T) { diff --git a/sequencer/block_verify.go b/sequencer/block_verify.go new file mode 100644 index 00000000000..a0ba5598150 --- /dev/null +++ b/sequencer/block_verify.go @@ -0,0 +1,113 @@ +package sequencer + +import ( + "fmt" + "math/big" + + "github.com/morph-l2/go-ethereum/common" + "github.com/morph-l2/go-ethereum/core/types" + "github.com/morph-l2/go-ethereum/crypto" + "github.com/morph-l2/go-ethereum/trie" +) + +// computeBlockHash recomputes a BlockV2's block hash from its content fields. +// +// This MUST stay in lockstep with geth's catalyst.executableDataToBlock so that +// honest blocks always pass: the sequencer signs newBlockResult.Block.Hash(), +// which is computed from the same canonical header layout used here. +// +// Notes on field alignment: +// - Difficulty=0, Nonce=0, UncleHash=EmptyUncleHash, Extra=[] are L2 consensus +// constants (consensus/l2 sets them in Prepare and executableDataToBlock). +// - NextL1MsgIndex and BatchHash live in the Header struct but are NOT included +// in Header.Hash() (see go-ethereum/core/types/block.go headerHashing layout), +// so we deliberately do not need to set them for hash computation. +// - Coinbase is taken directly from block.Miner. Honest sequencers always send +// Miner = newBlockResult.Block.Coinbase(), which is empty for V2 blocks +// (BuildBlock does not set coinbase, and the L2 engine's Prepare overrides +// to EmptyAddress when FeeVault is enabled). We do not re-apply the FeeVault +// override here: it is unnecessary in the honest case, and rejecting any +// mismatch is the correct behavior in the attacker case. +func computeBlockHash(block *BlockV2) (common.Hash, error) { + txs := make(types.Transactions, len(block.Transactions)) + for i, raw := range block.Transactions { + var tx types.Transaction + if err := tx.UnmarshalBinary(raw); err != nil { + return common.Hash{}, fmt.Errorf("decode tx %d: %w", i, err) + } + txs[i] = &tx + } + + header := &types.Header{ + ParentHash: block.ParentHash, + UncleHash: types.EmptyUncleHash, + Coinbase: block.Miner, + Root: block.StateRoot, + TxHash: types.DeriveSha(txs, trie.NewStackTrie(nil)), + ReceiptHash: block.ReceiptRoot, + Bloom: types.BytesToBloom(block.LogsBloom), + Difficulty: new(big.Int), + Number: new(big.Int).SetUint64(block.Number), + GasLimit: block.GasLimit, + GasUsed: block.GasUsed, + Time: block.Timestamp, + Extra: []byte{}, + BaseFee: block.BaseFee, + } + return header.Hash(), nil +} + +// VerifyBlockSignature verifies a block's ECDSA signature against the +// expected sequencer at that block's height. All V2 blocks must carry a valid signature. +// +// Hash binding: before recovering the signer, this function recomputes the +// block hash from the canonical content fields and rejects the block if it does +// not match block.Hash. Without this binding, an attacker could pair a valid +// (Hash, Signature) recovered from a legitimately signed block with a tampered +// body, and other peers would accept it - causing a chain fork at the next +// height. The geth-side check in NewL2BlockV2 is the safety net; this check +// rejects tampered blocks early so we never persist orphan signatures into +// SignatureStore. +// VerifyBlockSignature returns one of two classes of errors: +// - wrapping ErrInvalidSignature: the block is provably bad — the sender +// should be banned (malformed signature, tampered body, non-sequencer +// signer, etc.). +// - wrapping ErrVerifierUnavailable: the local verifier could not decide +// (not configured, L1 backend failure). The peer may be honest; the +// caller must not ban on this error. +func VerifyBlockSignature(verifier SequencerVerifier, block *BlockV2) error { + if verifier == nil { + return fmt.Errorf("%w: verifier not configured", ErrVerifierUnavailable) + } + + if len(block.Signature) != crypto.SignatureLength { + return fmt.Errorf("%w: invalid signature length %d (want %d) at height %d", + ErrInvalidSignature, len(block.Signature), crypto.SignatureLength, block.Number) + } + + computedHash, err := computeBlockHash(block) + if err != nil { + return fmt.Errorf("%w: compute hash at height %d: %v", ErrInvalidSignature, block.Number, err) + } + if block.Hash != computedHash { + return fmt.Errorf("%w: hash mismatch at height %d: declared %s, computed %s", + ErrInvalidSignature, block.Number, block.Hash.Hex(), computedHash.Hex()) + } + + pubKey, err := crypto.SigToPub(computedHash.Bytes(), block.Signature) + if err != nil { + return fmt.Errorf("%w: recover pubkey at height %d: %v", ErrInvalidSignature, block.Number, err) + } + signer := crypto.PubkeyToAddress(*pubKey) + + ok, err := verifier.IsSequencerAt(signer, block.Number) + if err != nil { + // Backend failure (L1 RPC / IO) — local problem, not peer misbehavior. + return fmt.Errorf("%w: IsSequencerAt height %d: %v", ErrVerifierUnavailable, block.Number, err) + } + if !ok { + return fmt.Errorf("%w: signer %s is not sequencer at height %d", + ErrInvalidSignature, signer.Hex(), block.Number) + } + return nil +} diff --git a/sequencer/block_verify_test.go b/sequencer/block_verify_test.go new file mode 100644 index 00000000000..10e748c1f76 --- /dev/null +++ b/sequencer/block_verify_test.go @@ -0,0 +1,217 @@ +package sequencer + +import ( + "errors" + "math/big" + "testing" + + "github.com/morph-l2/go-ethereum/common" + "github.com/morph-l2/go-ethereum/core/types" + "github.com/morph-l2/go-ethereum/crypto" + + tmtypes "github.com/tendermint/tendermint/types" +) + +// makeTestBlock builds a BlockV2 whose Hash is computed via computeBlockHash and +// whose Signature is produced by the given private key. The block is what an +// honest sequencer would produce. +func makeTestBlock(t *testing.T, priv []byte) (*tmtypes.BlockV2, common.Address) { + t.Helper() + + key, err := crypto.ToECDSA(priv) + if err != nil { + t.Fatalf("ToECDSA: %v", err) + } + signer := crypto.PubkeyToAddress(key.PublicKey) + + tx := types.NewTransaction(0, common.HexToAddress("0xdead"), big.NewInt(1), 21000, big.NewInt(1), nil) + rawTx, err := tx.MarshalBinary() + if err != nil { + t.Fatalf("MarshalBinary: %v", err) + } + + block := &tmtypes.BlockV2{ + ParentHash: common.HexToHash("0xabc"), + Miner: common.Address{}, + Number: 42, + GasLimit: 30_000_000, + BaseFee: big.NewInt(1_000_000_000), + Timestamp: 1_700_000_000, + Transactions: [][]byte{rawTx}, + StateRoot: common.HexToHash("0xdef"), + GasUsed: 21_000, + ReceiptRoot: common.HexToHash("0x123"), + LogsBloom: make([]byte, 256), + WithdrawTrieRoot: common.HexToHash("0x456"), + NextL1MessageIndex: 7, + } + + h, err := computeBlockHash(block) + if err != nil { + t.Fatalf("computeBlockHash: %v", err) + } + block.Hash = h + + sig, err := crypto.Sign(h.Bytes(), key) + if err != nil { + t.Fatalf("Sign: %v", err) + } + block.Signature = sig + + return block, signer +} + +// testKey is a deterministic private key for tests; do NOT use elsewhere. +var testKey = []byte{ + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, + 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, + 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, +} + +func TestVerifyBlockSignature_Honest(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + verifier := &mockSequencerVerifier{isSequencer: true} + + if err := VerifyBlockSignature(verifier, block); err != nil { + t.Fatalf("expected honest block to verify, got: %v", err) + } +} + +func TestVerifyBlockSignature_TamperedMiner(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + verifier := &mockSequencerVerifier{isSequencer: true} + + // Attacker keeps the original Hash + Signature but forges the Miner field. + block.Miner = common.HexToAddress("0xbadbadbadbadbadbadbadbadbadbadbadbadbad0") + + err := VerifyBlockSignature(verifier, block) + if err == nil { + t.Fatal("expected error for tampered Miner, got nil") + } + if !errors.Is(err, ErrInvalidSignature) { + t.Errorf("expected ErrInvalidSignature, got: %v", err) + } +} + +func TestVerifyBlockSignature_TamperedStateRoot(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + verifier := &mockSequencerVerifier{isSequencer: true} + + block.StateRoot = common.HexToHash("0xdeadbeef") + + err := VerifyBlockSignature(verifier, block) + if err == nil { + t.Fatal("expected error for tampered StateRoot, got nil") + } + if !errors.Is(err, ErrInvalidSignature) { + t.Errorf("expected ErrInvalidSignature, got: %v", err) + } +} + +func TestVerifyBlockSignature_TamperedTransactions(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + verifier := &mockSequencerVerifier{isSequencer: true} + + tx := types.NewTransaction(99, common.HexToAddress("0xfeed"), big.NewInt(999), 21000, big.NewInt(1), nil) + raw, err := tx.MarshalBinary() + if err != nil { + t.Fatalf("MarshalBinary: %v", err) + } + block.Transactions = append(block.Transactions, raw) + + if verr := VerifyBlockSignature(verifier, block); verr == nil { + t.Fatal("expected error for appended transaction, got nil") + } +} + +func TestVerifyBlockSignature_NotSequencer(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + verifier := &mockSequencerVerifier{isSequencer: false} + + err := VerifyBlockSignature(verifier, block) + if err == nil { + t.Fatal("expected error when signer is not the active sequencer, got nil") + } + if !errors.Is(err, ErrInvalidSignature) { + t.Errorf("expected ErrInvalidSignature, got: %v", err) + } +} + +func TestVerifyBlockSignature_MissingSignature(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + block.Signature = nil + verifier := &mockSequencerVerifier{isSequencer: true} + + err := VerifyBlockSignature(verifier, block) + if err == nil { + t.Fatal("expected error for missing signature, got nil") + } + if !errors.Is(err, ErrInvalidSignature) { + t.Errorf("expected ErrInvalidSignature, got: %v", err) + } +} + +func TestVerifyBlockSignature_NilVerifier(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + + err := VerifyBlockSignature(nil, block) + if err == nil { + t.Fatal("expected error for nil verifier, got nil") + } + // nil verifier is a local configuration problem, not peer misbehavior. + if !errors.Is(err, ErrVerifierUnavailable) { + t.Errorf("expected ErrVerifierUnavailable, got: %v", err) + } + if errors.Is(err, ErrInvalidSignature) { + t.Errorf("nil verifier must not return ErrInvalidSignature (would cause false ban)") + } +} + +// TestVerifyBlockSignature_BackendFailure: when the verifier backend (e.g. L1 +// RPC) fails transiently, VerifyBlockSignature must surface an +// ErrVerifierUnavailable — not ErrInvalidSignature — otherwise honest peers +// get banned during every local outage. +func TestVerifyBlockSignature_BackendFailure(t *testing.T) { + block, _ := makeTestBlock(t, testKey) + verifier := &mockSequencerVerifier{err: errors.New("L1 RPC timeout")} + + err := VerifyBlockSignature(verifier, block) + if err == nil { + t.Fatal("expected error when verifier backend fails, got nil") + } + if !errors.Is(err, ErrVerifierUnavailable) { + t.Errorf("expected ErrVerifierUnavailable for backend failure, got: %v", err) + } + if errors.Is(err, ErrInvalidSignature) { + t.Errorf("backend failure must not return ErrInvalidSignature (would cause false ban)") + } +} + +func TestComputeBlockHash_DeterministicAndDistinct(t *testing.T) { + b1, _ := makeTestBlock(t, testKey) + h1, err := computeBlockHash(b1) + if err != nil { + t.Fatalf("computeBlockHash: %v", err) + } + + // Same content produces same hash. + h1again, err := computeBlockHash(b1) + if err != nil { + t.Fatalf("computeBlockHash: %v", err) + } + if h1 != h1again { + t.Fatalf("computeBlockHash not deterministic: %s vs %s", h1.Hex(), h1again.Hex()) + } + + // Different content produces different hash. + b2 := *b1 + b2.GasUsed = b1.GasUsed + 1 + h2, err := computeBlockHash(&b2) + if err != nil { + t.Fatalf("computeBlockHash: %v", err) + } + if h1 == h2 { + t.Fatal("computeBlockHash should differ when GasUsed changes") + } +} diff --git a/sequencer/broadcast_reactor.go b/sequencer/broadcast_reactor.go index 7398a3feebb..fb7509210d4 100644 --- a/sequencer/broadcast_reactor.go +++ b/sequencer/broadcast_reactor.go @@ -1,23 +1,20 @@ package sequencer import ( - "context" "errors" "fmt" - "math/big" "math/rand" "reflect" "sync" + "sync/atomic" "time" "github.com/cosmos/gogoproto/proto" "github.com/morph-l2/go-ethereum/common" - "github.com/morph-l2/go-ethereum/crypto" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/p2p" bcproto "github.com/tendermint/tendermint/proto/tendermint/blocksync" - seqproto "github.com/tendermint/tendermint/proto/tendermint/sequencer" "github.com/tendermint/tendermint/types" ) @@ -25,15 +22,43 @@ const ( BlockBroadcastChannel = byte(0x50) // For block broadcast (requires signature verification) SequencerSyncChannel = byte(0x51) // For block sync requests (no signature verification) - // TODO: make these parameters configurable - smallGapThreshold = 20 // Gap for direct block request + smallGapThreshold = 10 // Gap for direct block request recentBlocksCapacity = 1000 // Recent applied blocks cache seenBlocksCapacity = 2000 // Seen blocks for dedup - peerSentCapacity = 500 // Per-peer sent tracking - applyInterval = 10 * time.Second - syncInterval = 10 * time.Second + peerGossipedCapacity = 500 // Per-peer gossiped tracking + applyInterval = 3 * time.Second + syncInterval = 5 * time.Second + + // Sync request tracking + syncRequestTTL = 20 * time.Second + maxPendingSyncRequests = 500 + maxPendingSyncPerPeer = 200 + + // Peer ban + peerBanDuration = 10 * time.Minute + + // BlockRequest per-peer rate limit (token bucket). + // + // requestMissingBlocks sends a burst of up to maxPendingSyncPerPeer=200 + // requests to the same peer within a single syncInterval tick — it is NOT + // a paced 40 qps stream. The bucket must therefore absorb that full burst, + // otherwise a catching-up fullnode with a single viable upstream would + // trip the limiter and get banned by removeTimeoutPeers 20s later. + // + // burst=250 covers the 200-request peak with headroom; rate=50 refills the + // bucket in 5s, which is exactly one syncInterval — so by the next tick + // the bucket is full again and can absorb another burst of the same size. + blockRequestRateLimit = 50.0 // tokens/sec (bucket fully refills every 5s) + blockRequestBurst = 250 // bucket capacity (>= maxPendingSyncPerPeer) ) +// SyncReq represents a pending sync channel request. +type SyncReq struct { + Height int64 + PeerID p2p.ID + ExpireAt time.Time +} + // BlockPool interface (avoids import cycle) type BlockPool interface { MaxPeerHeight() int64 @@ -45,42 +70,87 @@ type BlockPool interface { type BlockBroadcastReactor struct { p2p.BaseReactor - stateV2 *StateV2 - pool BlockPool - waitSync bool + stateV2 *StateV2 + + // poolMu guards pool. The blocksync reactor rebuilds its BlockPool on reorg + // (see blocksync.Reactor.SwitchToBlockSyncFromReorg) and pushes the new + // instance through SetPool, while peer-height lookups here run concurrently + // from message handlers. + poolMu sync.RWMutex + pool BlockPool recentBlocks *BlockRingBuffer // Applied blocks for peer requests pendingCache *PendingBlockCache // Pending blocks // Gossip state (bounded capacity, auto-evict old entries) - seenBlocks *HashSet // Blocks we've seen (dedup) - peerSent *PeerHashSet // Blocks sent to each peer - - applyMtx sync.Mutex // Protects applyBlock to ensure sequential block application - sequencerStarted bool // True when sequencer mode is actually running (not just registered) - logger log.Logger - - verifier SequencerVerifier -} - -// NewBlockBroadcastReactor creates a new reactor. + seenBlocks *HashSet // Blocks we've seen (dedup) + peerGossiped *PeerHashSet // Blocks gossiped to / from each peer (for gossip dedup) + + // applyMtx serializes the "check parent + apply + save signature" sequence + // in the reactor's applyBlock path. StateV2.ApplyBlock has its own mutex + // for the actual apply, but applyMtx ensures the parent-hash check and + // signature persistence are also atomic with the apply. + applyMtx sync.Mutex + startMu sync.Mutex + // routinesStarted is true once StartSequencerRoutines() has successfully + // run — i.e. the node has caught up and reached the upgrade height. + // Also used as a fast gate in Receive() to drop P2P messages that arrive + // before the reactor is ready (e.g. while blocksync is still running). + routinesStarted atomic.Bool + logger log.Logger + + verifier SequencerVerifier + l1Tracker L1Tracker // required: gates peer-block sync on L1 freshness + sigStore *SignatureStore + + // syncRequests tracks pending sync channel requests, keyed by height. + // Used to reject unsolicited responses before decode/verification. + // syncPeerCounts is a secondary index for O(1) per-peer count lookup. + syncRequestsMu sync.Mutex + syncRequests map[int64]*SyncReq + syncPeerCounts map[p2p.ID]int + + // blockReqLimiter enforces a per-peer rate limit on BlockRequest messages + // to prevent a malicious peer from exhausting L2 RPC / signature store. + blockReqLimiter *PeerRateLimiter + + // bannedPeers tracks peers banned for misbehavior, keyed by peer ID. + // Value is the ban expiration time. Expired entries are lazily cleaned + // up on lookup. This is an in-memory ban list; it does not persist + // across restarts. + bannedPeersMu sync.Mutex + bannedPeers map[p2p.ID]time.Time + + firstSync atomic.Bool +} + +// NewBlockBroadcastReactor creates a new reactor. Routines are NOT started +// here; the caller must invoke StartSequencerRoutines() once the node has +// caught up and reached the upgrade height (done by blocksync's caught-up +// callback or the upgrade callback in node.go). func NewBlockBroadcastReactor( pool BlockPool, stateV2 *StateV2, - waitSync bool, logger log.Logger, verifier SequencerVerifier, + l1Tracker L1Tracker, + sigStore *SignatureStore, ) *BlockBroadcastReactor { r := &BlockBroadcastReactor{ - pool: pool, - stateV2: stateV2, - waitSync: waitSync, - recentBlocks: NewBlockRingBuffer(recentBlocksCapacity), - pendingCache: NewPendingBlockCache(), - seenBlocks: NewHashSet(seenBlocksCapacity), - peerSent: NewPeerHashSet(peerSentCapacity), - logger: logger.With("module", "broadcastReactor"), - verifier: verifier, + pool: pool, + stateV2: stateV2, + recentBlocks: NewBlockRingBuffer(recentBlocksCapacity), + pendingCache: NewPendingBlockCache(), + seenBlocks: NewHashSet(seenBlocksCapacity), + peerGossiped: NewPeerHashSet(peerGossipedCapacity), + syncRequests: make(map[int64]*SyncReq), + syncPeerCounts: make(map[p2p.ID]int), + bannedPeers: make(map[p2p.ID]time.Time), + blockReqLimiter: NewPeerRateLimiter(blockRequestRateLimit, blockRequestBurst), + logger: logger.With("module", "broadcastReactor"), + verifier: verifier, + l1Tracker: l1Tracker, + sigStore: sigStore, } r.BaseReactor = *p2p.NewBaseReactor("BlockBroadcast", r) return r @@ -91,22 +161,57 @@ func (r *BlockBroadcastReactor) SetLogger(l log.Logger) { r.logger = l.With("module", "broadcastReactor") } +// SetPool swaps the BlockPool reference used for peer-height lookups. The +// blocksync reactor calls this after rebuilding its pool (e.g. during a reorg +// restart) so that peer selection in this reactor follows the new pool's view +// of peer heights instead of the dead pool's frozen state. +func (r *BlockBroadcastReactor) SetPool(pool BlockPool) { + r.poolMu.Lock() + defer r.poolMu.Unlock() + r.pool = pool +} + +// getPool returns the current pool reference under the read lock. Callers +// hold a local copy of the interface value, which keeps the underlying pool +// reachable for the duration of their call even if SetPool swaps it. +func (r *BlockBroadcastReactor) getPool() BlockPool { + r.poolMu.RLock() + defer r.poolMu.RUnlock() + return r.pool +} + +// OnStart is intentionally a no-op. Sequencer routines are NOT started by +// the service lifecycle — they are started explicitly by +// StartSequencerRoutines() which is invoked by: +// - blocksync's caught-up callback (when local height reaches the upgrade +// height after catching up via blocksync), or +// - the upgrade callback in node.go (when a running PBFT node crosses the +// upgrade height via consensus). +// +// Until one of those fires, Receive() drops incoming messages via the +// routinesStarted gate — the reactor is not yet able to validate or apply +// blocks, and pretending otherwise would produce false "too high" / "too +// low" errors against an uninitialized StateV2. func (r *BlockBroadcastReactor) OnStart() error { - if r.waitSync { - // Don't start sequencer routines on start if we are in wait sync mode - // will be started via StartSequencerRoutines - return nil - } - return r.StartSequencerRoutines() + return nil } func (r *BlockBroadcastReactor) StartSequencerRoutines() error { - if r.sequencerStarted { - r.logger.Error("Sequencer routines already started, skipping") - return nil + // only make sure `isRunning()` is true + if !r.IsRunning() { + if err := r.Start(); err != nil { + return err + } } - r.waitSync = false + r.startMu.Lock() + defer r.startMu.Unlock() + + r.firstSync.Store(true) + + if r.routinesStarted.Load() { + return nil + } if r.stateV2 != nil && !r.stateV2.IsRunning() { if err := r.stateV2.Start(); err != nil { @@ -114,18 +219,60 @@ func (r *BlockBroadcastReactor) StartSequencerRoutines() error { } } - if r.stateV2.IsSequencerMode() { + // Open the Receive() gate: from this point on, incoming P2P messages + // will be processed instead of being silently dropped. + r.routinesStarted.Store(true) + + // Fullnode (no signer): only applyRoutine (P2P sync) + // Nodes with signer (ActiveSeq / HA-Leader / HA-Follower): only broadcastRoutine + // - roleCheckRoutine is started by StateV2.OnStart() + // - applyRoutine is NOT started (sequencer produces, HA uses Raft) + if r.stateV2.HasSigner() { go r.broadcastRoutine() } else { go r.applyRoutine() } - r.sequencerStarted = true return nil } func (r *BlockBroadcastReactor) OnStop() { r.logger.Info("Stopping BlockBroadcastReactor") + + if err := r.stateV2.Stop(); err != nil { + r.logger.Error("failed to stop StateV2", "err", err) + } + + r.startMu.Lock() + r.routinesStarted.Store(false) + r.startMu.Unlock() + + r.bannedPeersMu.Lock() + r.bannedPeers = make(map[p2p.ID]time.Time) + r.bannedPeersMu.Unlock() + + r.syncRequestsMu.Lock() + r.syncRequests = make(map[int64]*SyncReq) + r.syncPeerCounts = make(map[p2p.ID]int) + r.syncRequestsMu.Unlock() + + r.recentBlocks.Clear() + r.pendingCache.Clear() + r.seenBlocks.Clear() + r.peerGossiped.Clear() +} + +func (r *BlockBroadcastReactor) StopForReorg() error { + if r.IsRunning() { + if err := r.Stop(); err != nil { + return err + } + } + return nil +} + +func (r *BlockBroadcastReactor) OnReset() error { + return r.stateV2.Reset() } func (r *BlockBroadcastReactor) GetChannels() []*p2p.ChannelDescriptor { @@ -136,19 +283,42 @@ func (r *BlockBroadcastReactor) GetChannels() []*p2p.ChannelDescriptor { } func (r *BlockBroadcastReactor) AddPeer(peer p2p.Peer) { - r.peerSent.AddPeer(string(peer.ID())) + if r.isBanned(peer.ID()) { + r.logger.Info("Rejecting banned peer", "peer", peer.ID()) + r.Switch.StopPeerForError(peer, fmt.Errorf("peer is banned")) + return + } + r.peerGossiped.AddPeer(string(peer.ID())) + r.blockReqLimiter.AddPeer(peer.ID()) } func (r *BlockBroadcastReactor) RemovePeer(peer p2p.Peer, reason interface{}) { - r.peerSent.RemovePeer(string(peer.ID())) + r.peerGossiped.RemovePeer(string(peer.ID())) + r.removeSyncRequestsByPeer(peer.ID()) + r.blockReqLimiter.RemovePeer(peer.ID()) } func (r *BlockBroadcastReactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) { + // Drop all messages until StartSequencerRoutines() has been called. + // This guards against messages arriving while blocksync is still running + // or before the upgrade height has been reached — at that point StateV2 + // is not yet initialized and we cannot validate or apply blocks. + if !r.routinesStarted.Load() { + return + } + // L1 tracker: while L1 is stale, drop all inbound P2P messages. We may be + // blind to L1 SequencerUpdated events, so we neither accept/verify peer + // blocks (verification uses the verifier's stale L1-derived sequencer set, + // which could let a revoked-sequencer block through or mis-ban honest peers) + // nor serve sync requests. Placed before decode so nothing is processed. + if r.l1Tracker.IsHalt() { + return + } r.logger.Debug("Receive message", "chId", chID, "src", src.ID(), "len", len(msgBytes)) msg, err := decodeMsg(msgBytes) if err != nil { r.logger.Error("Error decoding message", "src", src, "chId", chID, "err", err) - r.Switch.StopPeerForError(src, err) + r.banPeer(src, "invalid message encoding") return } @@ -168,12 +338,31 @@ func (r *BlockBroadcastReactor) handleBroadcastMsg(msg interface{}, src p2p.Peer switch msg := msg.(type) { case *bcproto.BlockResponseV2: if msg.Block != nil { + // Honest peers always produce a valid BlockV2 proto. A failure + // here is a malicious signal — ban the sender. blockV2, err := types.BlockV2FromProto(msg.Block) if err != nil { - r.logger.Error("Invalid BlockV2", "err", err) + r.logger.Error("Invalid BlockV2 from broadcast channel", "peer", src.ID(), "err", err) + r.banPeer(src, "invalid BlockV2 in broadcast") + return + } + localHeight := r.stateV2.LatestHeight() + // Reject blocks too old (already applied / below pruning window). + if localHeight-int64(blockV2.Number) > MaxPendingHeightBehind { + r.logger.Error("handleBroadcastMsg err, block height too low.", + "height", blockV2.Number, "hash", blockV2.Hash.Hex(), "peer", src.ID()) + return + } + if int64(blockV2.Number)-localHeight > MaxPendingHeightAhead { + r.logger.Error("handleBroadcastMsg err, block height too high.", + "height", blockV2.Number, "localHeight", localHeight, "peer", src.ID()) + return + } + if r.markSeen(blockV2.Hash) { + r.logger.Debug("onBlockV2 broadcast dedup", "number", blockV2.Number, "hash", blockV2.Hash.Hex()) return } - r.onBlockV2(blockV2, src, true) // verify signature + r.onBlockV2(blockV2, src, true) // from broadcast channel r.logger.Debug("handleBroadcastMsg", "src", src, "height", blockV2.Number, "hash", blockV2.Hash.Hex()) } default: @@ -184,22 +373,38 @@ func (r *BlockBroadcastReactor) handleBroadcastMsg(msg interface{}, src p2p.Peer func (r *BlockBroadcastReactor) handleSyncMsg(msg interface{}, src p2p.Peer) { switch msg := msg.(type) { case *bcproto.BlockRequest: + if !r.blockReqLimiter.Allow(src.ID()) { + r.logger.Debug("BlockRequest rate limited", "peer", src.ID(), "height", msg.Height) + return + } r.onBlockRequest(msg, src) r.logger.Debug("handleSyncMsg block request", "msg", msg, "src", src, "height", msg.Height) case *bcproto.BlockResponseV2: if msg.Block != nil { + // Check request before full decode to reject unsolicited responses early + height := int64(msg.Block.Number) + if !r.checkAndTakeSyncRequest(src.ID(), height) { + r.logger.Error("Unsolicited sync response, dropping", + "peer", src.ID(), "height", height) + return + } + // Honest peers always produce a valid BlockV2 proto. A failure here + // means the peer is sending malformed data — ban it. The slot we + // just consumed will be cleaned up by removeSyncRequestsByPeer via + // the RemovePeer callback when StopPeerForError disconnects. blockV2, err := types.BlockV2FromProto(msg.Block) if err != nil { - r.logger.Error("Invalid BlockV2", "err", err) + r.logger.Error("Invalid BlockV2 from sync channel", "peer", src.ID(), "err", err) + r.banPeer(src, "invalid BlockV2 in sync response") return } - r.onBlockV2(blockV2, src, false) // no signature verification + r.onBlockV2(blockV2, src, false) // from sync channel r.logger.Debug("handleSyncMsg block response", "src", src, "height", blockV2.Number, "hash", blockV2.Hash.Hex()) } case *bcproto.NoBlockResponse: - r.logger.Debug("Peer does not have requested block", "peer", src, "height", msg.Height) + r.logger.Error("Peer does not have requested block", "peer", src, "height", msg.Height) default: - r.logger.Debug("Ignoring unknown message on sync channel", "type", reflect.TypeOf(msg)) + r.logger.Error("Ignoring unknown message on sync channel", "type", reflect.TypeOf(msg)) } } @@ -207,15 +412,31 @@ func (r *BlockBroadcastReactor) handleSyncMsg(msg interface{}, src p2p.Peer) { // Routines // ============================================================================ -// broadcastRoutine: listen to stateV2 and broadcast new blocks +// broadcastRoutine: listen to block source and broadcast to P2P. +// Data source: HA mode reads from ha.Subscribe(), non-HA from broadcastCh. +// HA Follower consumes channel to prevent buildup but does not broadcast. func (r *BlockBroadcastReactor) broadcastRoutine() { r.logger.Info("Starting block broadcast routine") + + var source <-chan *BlockV2 + if r.stateV2.IsHAMode() { + source = r.stateV2.HASubscribe() + } else { + source = r.stateV2.BroadcastCh() + } + for { select { case <-r.Quit(): return - case block := <-r.stateV2.BroadcastCh(): + case block := <-source: + // Always populate recentBlocks so the cache is warm if this follower + // is promoted to leader and starts serving block requests from fullnodes. r.recentBlocks.Add(block) + // HA Follower: do not broadcast, only drain the channel. + if r.stateV2.IsHAMode() && !r.stateV2.IsHALeader() { + continue + } r.broadcast(block) } } @@ -229,6 +450,10 @@ func (r *BlockBroadcastReactor) applyRoutine() { defer tickerSync.Stop() defer tickerApply.Stop() + // sync immediately + r.requestMissingBlocks(r.stateV2.LatestHeight()+1, r.getPool().MaxPeerHeight()) + r.tryApplyFromCache() + for { select { case <-r.Quit(): @@ -245,40 +470,51 @@ func (r *BlockBroadcastReactor) applyRoutine() { // Core Logic // ============================================================================ -// onBlockV2: receive block from peer -// verifySig: true for broadcast channel, false for sync channel -func (r *BlockBroadcastReactor) onBlockV2(block *BlockV2, src p2p.Peer, verifySig bool) { - r.logger.Debug("onBlockV2", "number", block.Number, "hash", block.Hash.Hex(), "verifySig", verifySig) - // Dedup: skip if already seen, only for broadcast channel. - // Sync channel should not check dedup. - if r.markSeen(block.Hash) && verifySig { - r.logger.Debug("onBlockV2 broadcast dedup", "number", block.Number, "hash", block.Hash.Hex(), "verifySig", verifySig) +// onBlockV2: receive block from peer. +// fromBroadcast: true for broadcast channel (triggers gossip), false for sync channel. +func (r *BlockBroadcastReactor) onBlockV2(block *BlockV2, src p2p.Peer, fromBroadcast bool) { + // Only Fullnode (no signer) processes P2P blocks: + // - ActiveSequencer: sole producer, P2P blocks are only echoes or invalid + // - HA nodes: receive blocks via Raft, not P2P + if r.stateV2.HasSigner() { + return + } + + r.logger.Debug("onBlockV2", "number", block.Number, "hash", block.Hash.Hex(), "fromBroadcast", fromBroadcast) + + // Unified signature verification for all incoming blocks. + // Ban the sender ONLY on a malicious-signal error (ErrInvalidSignature). + // ErrVerifierUnavailable means we can't decide (verifier not configured, + // L1 backend transient failure) — the peer may be honest, so we discard + // the block silently and let it retry on the next gossip/sync round. + if err := VerifyBlockSignature(r.verifier, block); err != nil { + if errors.Is(err, ErrInvalidSignature) { + r.logger.Error("Block signature invalid, banning peer", + "number", block.Number, "hash", block.Hash.Hex(), "err", err) + r.banPeer(src, "block signature verification failed") + } else { + // Verifier unavailable — local problem, not peer's fault. + r.logger.Error("Verifier unavailable, dropping block without ban", + "number", block.Number, "hash", block.Hash.Hex(), "err", err) + } return } // Mark as received from this peer (don't send back) - r.markSentToPeer(src.ID(), block.Hash) + r.markGossipedToPeer(src.ID(), block.Hash) localHeight := r.stateV2.LatestHeight() - // Try apply if it's the next block (height + parent match) if r.isNextBlock(block) { - if err := r.applyBlock(block, verifySig); err != nil { + if err := r.applyBlock(block); err != nil { r.logger.Error("Apply failed", "number", block.Number, "hash", block.Hash.Hex(), "err", err) - // Only cache blocks with valid signatures (don't cache signature failures) - if verifySig && !errors.Is(err, ErrInvalidSignature) { - r.logger.Debug("Apply failed, caching block", "number", block.Number, "hash", block.Hash.Hex()) - r.pendingCache.Add(block, uint64(localHeight)) - } + r.pendingCache.Add(block, uint64(localHeight)) return } - // Gossip the latest block to other peers - if verifySig { + if fromBroadcast { r.gossipBlock(block, src.ID()) } - } else if verifySig { - // Cache all other blocks (future or past for potential reorg) - r.logger.Debug("future block, caching", "number", block.Number, "hash", block.Hash.Hex()) + } else { r.pendingCache.Add(block, uint64(localHeight)) } } @@ -303,7 +539,7 @@ func (r *BlockBroadcastReactor) tryApplyFromCache() { break } r.logger.Debug("Trying to apply from cache", "number", block.Number, "hash", block.Hash.Hex()) - if err := r.applyBlock(block, true); err != nil { // should signature verification + if err := r.applyBlock(block); err != nil { r.logger.Error("Apply from cache failed", "number", block.Number, "err", err) break } @@ -319,11 +555,17 @@ func (r *BlockBroadcastReactor) tryApplyFromCache() { // checkSyncGap: request missing blocks via SequencerSyncChannel // All sync requests go through this method (no longer uses blocksync pool) func (r *BlockBroadcastReactor) checkSyncGap() { + // L1 tracker: while L1 is stale, do not actively pull blocks from peers. + if r.l1Tracker.IsHalt() { + return + } + localHeight := r.stateV2.LatestHeight() - maxPeerHeight := r.pool.MaxPeerHeight() + maxPeerHeight := r.getPool().MaxPeerHeight() gap := maxPeerHeight - localHeight r.logger.Debug("Checking sync goroutines", "gap", gap, "localHeight", localHeight, "maxPeerHeight", maxPeerHeight) - if gap <= smallGapThreshold { + if gap <= smallGapThreshold && + !r.firstSync.CompareAndSwap(true, false) { return } @@ -331,17 +573,30 @@ func (r *BlockBroadcastReactor) checkSyncGap() { r.requestMissingBlocks(localHeight+1, maxPeerHeight) } -// requestMissingBlocks requests blocks in range [start, end] from peers +// requestMissingBlocks requests blocks in range [start, end] from peers. +// Respects maxOutstandingTotal, maxOutstandingPerPeer, and maxNewRequestsPerTick limits. func (r *BlockBroadcastReactor) requestMissingBlocks(start, end int64) { + r.removeTimeoutPeers() + peers := r.Switch.Peers().List() if len(peers) == 0 { return } for height := start; height <= end; height++ { - // Find a peer that has this height + if r.pendingSyncCount() >= maxPendingSyncRequests { + break + } + // Skip if already have a pending request for this height + r.syncRequestsMu.Lock() + _, exists := r.syncRequests[height] + r.syncRequestsMu.Unlock() + if exists { + continue + } + + // findPeerWithHeight already skips peers exceeding per-peer limit peer := r.findPeerWithHeight(peers, height) - r.logger.Debug("Finding peer with height", "height", height, "peer", peer.ID()) if peer == nil { continue } @@ -352,14 +607,15 @@ func (r *BlockBroadcastReactor) requestMissingBlocks(start, end int64) { r.logger.Error("Failed to encode BlockRequest", "height", height, "err", err) continue } - r.logger.Info("Requesting block", "height", height, "peer", peer.ID()) if !peer.TrySend(SequencerSyncChannel, bz) { - r.logger.Error("Failed to send BlockRequest (TrySend failed)", "height", height, "peer", peer.ID()) + r.logger.Error("Failed to send BlockRequest", "height", height, "peer", peer.ID()) + continue } + r.recordSyncRequest(peer.ID(), height) } } -// findPeerWithHeight finds a random peer that has the given height. +// findPeerWithHeight finds a random peer that has the given height and is within per-peer request limit. // Uses random start index to avoid always hitting the same peer. func (r *BlockBroadcastReactor) findPeerWithHeight(peers []p2p.Peer, height int64) p2p.Peer { n := len(peers) @@ -367,11 +623,11 @@ func (r *BlockBroadcastReactor) findPeerWithHeight(peers []p2p.Peer, height int6 return nil } - // Random start, then wrap around - start := rand.Intn(n) + start := rand.Intn(n) //nolint:gosec // non-security: peer selection randomization for i := 0; i < n; i++ { peer := peers[(start+i)%n] - if r.pool.GetPeerHeight(peer.ID()) >= height { + if r.getPool().GetPeerHeight(peer.ID()) >= height && + r.pendingSyncCountByPeer(peer.ID()) < maxPendingSyncPerPeer { return peer } } @@ -389,25 +645,19 @@ func (r *BlockBroadcastReactor) isNextBlock(block *BlockV2) bool { return block.Number == currentBlock.Number+1 && block.ParentHash == currentBlock.Hash } -// applyBlock: verify and apply a block atomically -// Thread-safe: uses mutex to ensure sequential block application -// verifySig: true for broadcast channel blocks, false for sync channel blocks -func (r *BlockBroadcastReactor) applyBlock(block *BlockV2, verifySig bool) error { +// applyBlock verifies signature, applies the block atomically, and persists the signature. +// Thread-safe: uses mutex to ensure sequential block application. +func (r *BlockBroadcastReactor) applyBlock(block *BlockV2) error { r.applyMtx.Lock() defer r.applyMtx.Unlock() - // Verify signature only for broadcast channel - if verifySig && !r.verifySignature(block) { - return ErrInvalidSignature - } - // Verify parent currentBlock := r.stateV2.LatestBlock() if currentBlock != nil && block.ParentHash != currentBlock.Hash { return fmt.Errorf("parent mismatch") } - // Update state via stateV2 (unified entry point) + // ApplyBlock handles geth apply + SaveSignature in one call. if err := r.stateV2.ApplyBlock(block); err != nil { return err } @@ -415,44 +665,10 @@ func (r *BlockBroadcastReactor) applyBlock(block *BlockV2, verifySig bool) error // Add to recent blocks r.recentBlocks.Add(block) - r.logger.Info("Applied block", "number", block.Number, "verifySig", verifySig) + r.logger.Info("Applied block", "number", block.Number) return nil } -func (r *BlockBroadcastReactor) verifySignature(block *BlockV2) bool { - if len(block.Signature) == 0 { - r.logger.Error("Signature verification failed: empty signature", "block", block.Number) - return false - } - pubKey, err := crypto.SigToPub(block.Hash.Bytes(), block.Signature) - if err != nil { - r.logger.Error("Signature verification failed: SigToPub error", "block", block.Number, "err", err) - return false - } - recoveredAddr := crypto.PubkeyToAddress(*pubKey) - - if r.verifier == nil { - r.logger.Error("Sequencer verifier not set", "block", block.Number) - return false - } - - isSeq, err := r.verifier.IsSequencer(context.Background(), recoveredAddr) - if err != nil { - r.logger.Error("Signature verification failed: verifier error", - "block", block.Number, - "recovered", recoveredAddr.Hex(), - "err", err) - return false - } - if !isSeq { - r.logger.Error("Signature verification failed: not a valid sequencer", - "block", block.Number, - "recovered", recoveredAddr.Hex()) - return false - } - return true -} - // ============================================================================ // Gossip // ============================================================================ @@ -462,14 +678,14 @@ func (r *BlockBroadcastReactor) markSeen(hash common.Hash) bool { return r.seenBlocks.Add(hash) // Returns true if already existed } -// markSentToPeer marks a block as sent to a peer. -func (r *BlockBroadcastReactor) markSentToPeer(peerID p2p.ID, hash common.Hash) { - r.peerSent.Add(string(peerID), hash) +// markGossipedToPeer marks a block as gossiped to/from a peer. +func (r *BlockBroadcastReactor) markGossipedToPeer(peerID p2p.ID, hash common.Hash) { + r.peerGossiped.Add(string(peerID), hash) } -// hasSentToPeer checks if block was sent to peer. -func (r *BlockBroadcastReactor) hasSentToPeer(peerID p2p.ID, hash common.Hash) bool { - return r.peerSent.Contains(string(peerID), hash) +// hasGossipedToPeer checks if a block has already been gossiped to/from a peer. +func (r *BlockBroadcastReactor) hasGossipedToPeer(peerID p2p.ID, hash common.Hash) bool { + return r.peerGossiped.Contains(string(peerID), hash) } // gossipBlock forwards a block to all peers except the source. @@ -477,7 +693,7 @@ func (r *BlockBroadcastReactor) hasSentToPeer(peerID p2p.ID, hash common.Hash) b func (r *BlockBroadcastReactor) gossipBlock(block *BlockV2, fromPeer p2p.ID) { r.logger.Info("Gossiping block", "number", block.Number, "hash", block.Hash.Hex(), "fromPeer", fromPeer) msg := &bcproto.BlockResponseV2{ - Block: BlockV2ToProto(block), + Block: types.BlockV2ToProto(block), } bz, err := encodeMsg(msg) if err != nil { @@ -495,22 +711,167 @@ func (r *BlockBroadcastReactor) gossipBlock(block *BlockV2, fromPeer p2p.ID) { } // Skip if already sent - if r.hasSentToPeer(peerID, block.Hash) { + if r.hasGossipedToPeer(peerID, block.Hash) { continue } // Send and mark if peer.TrySend(BlockBroadcastChannel, bz) { - r.markSentToPeer(peerID, block.Hash) + r.markGossipedToPeer(peerID, block.Hash) } } } +// ============================================================================ +// Sync Request Tracking +// ============================================================================ + +// recordSyncRequest records a pending sync request keyed by height. +func (r *BlockBroadcastReactor) recordSyncRequest(peerID p2p.ID, height int64) { + r.syncRequestsMu.Lock() + defer r.syncRequestsMu.Unlock() + if old, ok := r.syncRequests[height]; ok { + r.syncPeerCounts[old.PeerID]-- + } + r.syncRequests[height] = &SyncReq{ + Height: height, + PeerID: peerID, + ExpireAt: time.Now().Add(syncRequestTTL), + } + r.syncPeerCounts[peerID]++ +} + +// checkAndTakeSyncRequest validates a sync response against pending requests. +// Only removes the entry if peerID matches and request is not expired. +func (r *BlockBroadcastReactor) checkAndTakeSyncRequest(peerID p2p.ID, height int64) bool { + r.syncRequestsMu.Lock() + defer r.syncRequestsMu.Unlock() + + req, ok := r.syncRequests[height] + if !ok || req.PeerID != peerID || time.Now().After(req.ExpireAt) { + return false + } + delete(r.syncRequests, height) + r.syncPeerCounts[peerID]-- + return true +} + +// removeTimeoutPeers scans pending sync requests and kicks any peer whose +// request has expired. Map cleanup (syncRequests / syncPeerCounts) is +// delegated to removeSyncRequestsByPeer via the RemovePeer callback, so +// this method itself does not mutate the maps. +func (r *BlockBroadcastReactor) removeTimeoutPeers() { + seen := make(map[p2p.ID]struct{}) + + r.syncRequestsMu.Lock() + now := time.Now() + for _, req := range r.syncRequests { + if now.After(req.ExpireAt) { + seen[req.PeerID] = struct{}{} + } + } + r.syncRequestsMu.Unlock() + + for peerID := range seen { + peer := r.Switch.Peers().Get(peerID) + if peer == nil { + continue + } + r.banPeer(peer, "sync request timed out") + } +} + +// pendingSyncCount returns the total number of pending sync requests. +func (r *BlockBroadcastReactor) pendingSyncCount() int { + r.syncRequestsMu.Lock() + defer r.syncRequestsMu.Unlock() + return len(r.syncRequests) +} + +// pendingSyncCountByPeer returns the number of pending sync requests for a specific peer. +func (r *BlockBroadcastReactor) pendingSyncCountByPeer(peerID p2p.ID) int { + r.syncRequestsMu.Lock() + defer r.syncRequestsMu.Unlock() + return r.syncPeerCounts[peerID] +} + +// removeSyncRequestsByPeer removes all pending requests for a disconnected peer. +func (r *BlockBroadcastReactor) removeSyncRequestsByPeer(peerID p2p.ID) { + r.syncRequestsMu.Lock() + defer r.syncRequestsMu.Unlock() + for h, req := range r.syncRequests { + if req.PeerID == peerID { + delete(r.syncRequests, h) + } + } + delete(r.syncPeerCounts, peerID) +} + +// ============================================================================ +// Peer Ban +// ============================================================================ + +// banPeer adds a peer to the ban list for peerBanDuration. +// The peer is also immediately stopped. Expired entries are lazily evicted on +// the next isBanned call for the same peer. +// +// Persistent peers (whitelisted via [p2p].persistent_peers) are exempt: they +// still get disconnected — so connection state is fully reset via RemovePeer +// callbacks — but are NOT added to bannedPeers. The Switch will automatically +// reconnect them, and on reconnect AddPeer's isBanned check will let them in. +func (r *BlockBroadcastReactor) banPeer(peer p2p.Peer, reason string) { + peerID := peer.ID() + + if peer.IsPersistent() { + r.logger.Error("[WHITELIST_ALARM] whitelisted peer misbehaved, will reconnect", + "peer", peerID, "reason", reason) + r.Switch.StopPeerForError(peer, fmt.Errorf("%s", reason)) + return + } + + r.bannedPeersMu.Lock() + r.bannedPeers[peerID] = time.Now().Add(peerBanDuration) + r.bannedPeersMu.Unlock() + + r.logger.Error("Banning peer", "peer", peerID, "reason", reason, "duration", peerBanDuration) + r.Switch.StopPeerForError(peer, fmt.Errorf("%s", reason)) +} + +// isBanned returns true if the peer is currently banned. +// Lazily removes expired entries. +func (r *BlockBroadcastReactor) isBanned(peerID p2p.ID) bool { + r.bannedPeersMu.Lock() + defer r.bannedPeersMu.Unlock() + + expireAt, ok := r.bannedPeers[peerID] + if !ok { + return false + } + if time.Now().After(expireAt) { + delete(r.bannedPeers, peerID) + return false + } + return true +} + // ============================================================================ // Message Handlers // ============================================================================ func (r *BlockBroadcastReactor) onBlockRequest(msg *bcproto.BlockRequest, src p2p.Peer) { + localHeight := r.stateV2.LatestHeight() + if msg.Height <= 0 || msg.Height > localHeight { + r.logger.Info("Rejecting out-of-range BlockRequest", + "peer", src.ID(), "requested", msg.Height, "localHeight", localHeight) + bz, err := encodeMsg(&bcproto.NoBlockResponse{Height: msg.Height}) + if err != nil { + r.logger.Error("Failed to encode NoBlockResponse", "height", msg.Height, "err", err) + return + } + src.TrySend(SequencerSyncChannel, bz) + return + } + // Try to get from recent blocks cache first block := r.recentBlocks.GetByHeight(uint64(msg.Height)) @@ -534,8 +895,19 @@ func (r *BlockBroadcastReactor) onBlockRequest(msg *bcproto.BlockRequest, src p2 return } + // Attach signature from store if block doesn't already carry one + if len(block.Signature) == 0 && r.sigStore != nil { + if sig, err := r.sigStore.GetSignature(block.Hash); err == nil { + block.Signature = sig + } else { + r.logger.Error("Signature not found in store, sending block without signature", + "height", msg.Height, "hash", block.Hash.Hex(), "err", err) + return + } + } + resp := &bcproto.BlockResponseV2{ - Block: BlockV2ToProto(block), + Block: types.BlockV2ToProto(block), } bz, err := encodeMsg(resp) if err != nil { @@ -548,7 +920,7 @@ func (r *BlockBroadcastReactor) onBlockRequest(msg *bcproto.BlockRequest, src p2 // broadcast only for sequencer func (r *BlockBroadcastReactor) broadcast(block *BlockV2) { resp := &bcproto.BlockResponseV2{ - Block: BlockV2ToProto(block), + Block: types.BlockV2ToProto(block), } bz, err := encodeMsg(resp) if err != nil { @@ -559,58 +931,6 @@ func (r *BlockBroadcastReactor) broadcast(block *BlockV2) { r.logger.Info("Broadcast block", "number", block.Number, "hash", block.Hash.Hex()) } -// ============================================================================ -// Proto Conversion -// ============================================================================ - -func BlockV2ToProto(block *BlockV2) *seqproto.BlockV2 { - var baseFee []byte - if block.BaseFee != nil { - baseFee = block.BaseFee.Bytes() - } - return &seqproto.BlockV2{ - ParentHash: block.ParentHash.Bytes(), - Miner: block.Miner.Bytes(), - Number: block.Number, - GasLimit: block.GasLimit, - BaseFee: baseFee, - Timestamp: block.Timestamp, - Transactions: block.Transactions, - StateRoot: block.StateRoot.Bytes(), - GasUsed: block.GasUsed, - ReceiptRoot: block.ReceiptRoot.Bytes(), - LogsBloom: block.LogsBloom, - WithdrawTrieRoot: block.WithdrawTrieRoot.Bytes(), - NextL1MessageIndex: block.NextL1MessageIndex, - Hash: block.Hash.Bytes(), - Signature: block.Signature, - } -} - -func ProtoToBlockV2(pb *seqproto.BlockV2) *BlockV2 { - baseFee := new(big.Int) - if len(pb.BaseFee) > 0 { - baseFee.SetBytes(pb.BaseFee) - } - return &BlockV2{ - ParentHash: common.BytesToHash(pb.ParentHash), - Miner: common.BytesToAddress(pb.Miner), - Number: pb.Number, - GasLimit: pb.GasLimit, - BaseFee: baseFee, - Timestamp: pb.Timestamp, - Transactions: pb.Transactions, - StateRoot: common.BytesToHash(pb.StateRoot), - GasUsed: pb.GasUsed, - ReceiptRoot: common.BytesToHash(pb.ReceiptRoot), - LogsBloom: pb.LogsBloom, - WithdrawTrieRoot: common.BytesToHash(pb.WithdrawTrieRoot), - NextL1MessageIndex: pb.NextL1MessageIndex, - Hash: common.BytesToHash(pb.Hash), - Signature: pb.Signature, - } -} - // ==================== Message Encoding/Decoding ==================== // Local copies to avoid import cycle with blocksync package diff --git a/sequencer/broadcast_reactor_test.go b/sequencer/broadcast_reactor_test.go new file mode 100644 index 00000000000..611ae74a03a --- /dev/null +++ b/sequencer/broadcast_reactor_test.go @@ -0,0 +1,306 @@ +package sequencer + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + "github.com/tendermint/tendermint/libs/log" + "github.com/tendermint/tendermint/p2p" + "github.com/tendermint/tendermint/p2p/mock" +) + +// newReactorForTest returns a BlockBroadcastReactor with only the fields +// needed for exercising banmap / sync-request tracking logic. It does NOT +// wire in Switch / StateV2 / verifier — callers must avoid code paths that +// dereference them. +func newReactorForTest() *BlockBroadcastReactor { + return &BlockBroadcastReactor{ + syncRequests: make(map[int64]*SyncReq), + syncPeerCounts: make(map[p2p.ID]int), + bannedPeers: make(map[p2p.ID]time.Time), + blockReqLimiter: NewPeerRateLimiter(blockRequestRateLimit, blockRequestBurst), + } +} + +// ---------------------------------------------------------------------------- +// Banmap behavior +// ---------------------------------------------------------------------------- + +// TestIsBanned_NotBanned: a fresh peer is not banned. +func TestIsBanned_NotBanned(t *testing.T) { + r := newReactorForTest() + require.False(t, r.isBanned("peer-a")) +} + +// TestIsBanned_Banned: writing directly into bannedPeers causes isBanned to +// return true before expiry. +func TestIsBanned_Banned(t *testing.T) { + r := newReactorForTest() + peer := p2p.ID("peer-b") + + r.bannedPeers[peer] = time.Now().Add(1 * time.Minute) + require.True(t, r.isBanned(peer)) +} + +// TestIsBanned_ExpiredLazyEvict: once the ban expires, isBanned returns false +// AND removes the stale entry from the map (lazy eviction). +func TestIsBanned_ExpiredLazyEvict(t *testing.T) { + r := newReactorForTest() + peer := p2p.ID("peer-c") + + r.bannedPeers[peer] = time.Now().Add(-1 * time.Second) // already expired + require.False(t, r.isBanned(peer)) + _, stillPresent := r.bannedPeers[peer] + require.False(t, stillPresent, "expired entry must be lazily evicted") +} + +// ---------------------------------------------------------------------------- +// removeTimeoutPeers: collect-only semantics +// ---------------------------------------------------------------------------- + +// TestRemoveTimeoutPeers_CollectsExpired verifies that expired entries are +// identified. We can't actually call banPeer() without a Switch, so we +// inline the collection logic and assert it catches the right peers. +// +// This mirrors the lock-scoped collection phase of removeTimeoutPeers. +func TestRemoveTimeoutPeers_CollectsExpired(t *testing.T) { + r := newReactorForTest() + now := time.Now() + + // Two expired requests from the same peer, one fresh, one from another peer. + r.syncRequests[100] = &SyncReq{Height: 100, PeerID: "peer-x", ExpireAt: now.Add(-1 * time.Second)} + r.syncRequests[101] = &SyncReq{Height: 101, PeerID: "peer-x", ExpireAt: now.Add(-2 * time.Second)} + r.syncRequests[102] = &SyncReq{Height: 102, PeerID: "peer-y", ExpireAt: now.Add(10 * time.Second)} // fresh + r.syncRequests[103] = &SyncReq{Height: 103, PeerID: "peer-z", ExpireAt: now.Add(-1 * time.Second)} + + // Replicate the collect step from removeTimeoutPeers. + seen := make(map[p2p.ID]struct{}) + r.syncRequestsMu.Lock() + for _, req := range r.syncRequests { + if time.Now().After(req.ExpireAt) { + seen[req.PeerID] = struct{}{} + } + } + r.syncRequestsMu.Unlock() + + require.Len(t, seen, 2, "peer-x and peer-z should be collected, peer-y must not") + require.Contains(t, seen, p2p.ID("peer-x")) + require.Contains(t, seen, p2p.ID("peer-z")) + require.NotContains(t, seen, p2p.ID("peer-y")) +} + +// ---------------------------------------------------------------------------- +// checkAndTakeSyncRequest: ensure slot is only consumed on exact peer+height match +// ---------------------------------------------------------------------------- + +// TestCheckAndTakeSyncRequest_PeerMismatch: a response from a different peer +// than the one we requested must not consume the slot. +func TestCheckAndTakeSyncRequest_PeerMismatch(t *testing.T) { + r := newReactorForTest() + r.syncRequests[200] = &SyncReq{ + Height: 200, + PeerID: "peer-requested", + ExpireAt: time.Now().Add(1 * time.Minute), + } + r.syncPeerCounts["peer-requested"] = 1 + + ok := r.checkAndTakeSyncRequest("peer-other", 200) + require.False(t, ok, "response from wrong peer must be rejected") + + // Slot must still exist. + _, exists := r.syncRequests[200] + require.True(t, exists) + require.Equal(t, 1, r.syncPeerCounts["peer-requested"]) +} + +// TestCheckAndTakeSyncRequest_Expired: a late response (after TTL) must not +// consume the slot. +func TestCheckAndTakeSyncRequest_Expired(t *testing.T) { + r := newReactorForTest() + r.syncRequests[201] = &SyncReq{ + Height: 201, + PeerID: "peer-slow", + ExpireAt: time.Now().Add(-1 * time.Second), + } + r.syncPeerCounts["peer-slow"] = 1 + + ok := r.checkAndTakeSyncRequest("peer-slow", 201) + require.False(t, ok, "expired slot must not be taken") +} + +// TestCheckAndTakeSyncRequest_Success: a well-formed response consumes the +// slot and decrements the per-peer count. +func TestCheckAndTakeSyncRequest_Success(t *testing.T) { + r := newReactorForTest() + r.syncRequests[202] = &SyncReq{ + Height: 202, + PeerID: "peer-good", + ExpireAt: time.Now().Add(1 * time.Minute), + } + r.syncPeerCounts["peer-good"] = 1 + + ok := r.checkAndTakeSyncRequest("peer-good", 202) + require.True(t, ok) + + _, exists := r.syncRequests[202] + require.False(t, exists, "slot must be taken") + require.Equal(t, 0, r.syncPeerCounts["peer-good"]) +} + +// ---------------------------------------------------------------------------- +// removeSyncRequestsByPeer: full cleanup on peer disconnect +// ---------------------------------------------------------------------------- + +// TestRemoveSyncRequestsByPeer_ClearsAllEntries: all slots belonging to a +// peer are deleted, and its counter entry is removed entirely. +func TestRemoveSyncRequestsByPeer_ClearsAllEntries(t *testing.T) { + r := newReactorForTest() + now := time.Now() + r.syncRequests[300] = &SyncReq{Height: 300, PeerID: "peer-gone", ExpireAt: now.Add(1 * time.Minute)} + r.syncRequests[301] = &SyncReq{Height: 301, PeerID: "peer-gone", ExpireAt: now.Add(1 * time.Minute)} + r.syncRequests[302] = &SyncReq{Height: 302, PeerID: "peer-alive", ExpireAt: now.Add(1 * time.Minute)} + r.syncPeerCounts["peer-gone"] = 2 + r.syncPeerCounts["peer-alive"] = 1 + + r.removeSyncRequestsByPeer("peer-gone") + + require.Len(t, r.syncRequests, 1) + _, stillThere := r.syncRequests[302] + require.True(t, stillThere, "peer-alive's slot must remain") + _, counted := r.syncPeerCounts["peer-gone"] + require.False(t, counted, "peer-gone's counter entry must be deleted") + require.Equal(t, 1, r.syncPeerCounts["peer-alive"]) +} + +// ---------------------------------------------------------------------------- +// recordSyncRequest: overwrite semantics +// ---------------------------------------------------------------------------- + +// TestRecordSyncRequest_OverwritesOldEntry: recording a new request for the +// same height decrements the old peer's counter and reassigns the slot. +func TestRecordSyncRequest_OverwritesOldEntry(t *testing.T) { + r := newReactorForTest() + r.recordSyncRequest("peer-1", 400) + require.Equal(t, 1, r.syncPeerCounts["peer-1"]) + + r.recordSyncRequest("peer-2", 400) // same height, new peer + require.Equal(t, 0, r.syncPeerCounts["peer-1"]) + require.Equal(t, 1, r.syncPeerCounts["peer-2"]) + + req := r.syncRequests[400] + require.Equal(t, p2p.ID("peer-2"), req.PeerID) +} + +// ---------------------------------------------------------------------------- +// Rate limiter wiring with reactor-specific constants +// ---------------------------------------------------------------------------- + +// TestReactorRateLimiter_UsesConstants verifies the reactor's rate limiter +// is configured with blockRequestRateLimit / blockRequestBurst and behaves +// end-to-end: after burst exhaustion, subsequent Allow() calls are denied. +func TestReactorRateLimiter_UsesConstants(t *testing.T) { + r := newReactorForTest() + peer := p2p.ID("peer-flood") + r.blockReqLimiter.AddPeer(peer) + + // Consume exactly burst tokens. + for i := 0; i < blockRequestBurst; i++ { + require.True(t, r.blockReqLimiter.Allow(peer), "token %d should be allowed", i+1) + } + // Next call must be denied (no time has elapsed for refill). + require.False(t, r.blockReqLimiter.Allow(peer)) +} + +// ---------------------------------------------------------------------------- +// banPeer whitelist exemption (persistent_peers) +// ---------------------------------------------------------------------------- + +// safeBanPeer drives banPeer with a nil Switch and absorbs the resulting +// panic. We only care about the ban-list write side-effect: by the time +// StopPeerForError runs (and panics on nil), the bannedPeers map has either +// been written (non-persistent path) or skipped (persistent path), which is +// exactly what we want to assert. +func safeBanPeer(r *BlockBroadcastReactor, peer p2p.Peer, reason string) { + defer func() { _ = recover() }() + r.banPeer(peer, reason) +} + +// TestBanPeer_PersistentPeerSkipsBanList: a peer with IsPersistent()==true +// must NOT be added to bannedPeers — that is the central whitelist invariant. +func TestBanPeer_PersistentPeerSkipsBanList(t *testing.T) { + r := newReactorForTest() + r.logger = log.NewNopLogger() + + mp := mock.NewPeer(nil) + mp.Persistent = true + + safeBanPeer(r, mp, "test signature failure") + + _, present := r.bannedPeers[mp.ID()] + require.False(t, present, "persistent peer must be exempt from bannedPeers") +} + +// TestBanPeer_NonPersistentPeerEntersBanList: regression — a peer with +// IsPersistent()==false must follow the original code path and be added +// to bannedPeers with a future expiry. +func TestBanPeer_NonPersistentPeerEntersBanList(t *testing.T) { + r := newReactorForTest() + r.logger = log.NewNopLogger() + + mp := mock.NewPeer(nil) + mp.Persistent = false + + before := time.Now() + safeBanPeer(r, mp, "test signature failure") + + expireAt, present := r.bannedPeers[mp.ID()] + require.True(t, present, "non-persistent peer must be added to bannedPeers") + require.True(t, expireAt.After(before.Add(peerBanDuration-time.Second)), + "ban expiry should be ~peerBanDuration in the future") +} + +// TestBanPeer_PersistentPeerAddPeerNotRejected: the natural follow-up — a +// persistent peer that has misbehaved before still passes AddPeer's +// isBanned gate, so the Switch's automatic reconnect can re-add it. +func TestBanPeer_PersistentPeerAddPeerNotRejected(t *testing.T) { + r := newReactorForTest() + r.logger = log.NewNopLogger() + + mp := mock.NewPeer(nil) + mp.Persistent = true + + safeBanPeer(r, mp, "decode failure") + require.False(t, r.isBanned(mp.ID()), + "persistent peer must not be considered banned after misbehavior") +} + +// ---------------------------------------------------------------------------- +// L1 health gate on the sync path +// ---------------------------------------------------------------------------- + +func TestReceive_L1HaltDropsMessages(t *testing.T) { + // routinesStarted=true and L1 halted: Receive must drop the message at the + // L1 gate, before decode/verify/ban — so it must NOT panic even though + // stateV2/Switch are nil and the payload is garbage. + r := newReactorForTest() + r.logger = log.NewNopLogger() + r.routinesStarted.Store(true) + r.l1Tracker = &mockL1Tracker{halt: true} + + require.NotPanics(t, func() { + r.Receive(BlockBroadcastChannel, nil, []byte("garbage")) + }, "Receive should drop messages at the L1 gate when halted, before decode") +} + +func TestCheckSyncGap_L1HaltBlocksRequests(t *testing.T) { + // With L1 halted, checkSyncGap must return before calling + // stateV2.LatestHeight()/getPool() (both nil here) — so it must NOT panic. + r := newReactorForTest() + r.logger = log.NewNopLogger() + r.l1Tracker = &mockL1Tracker{halt: true} + + require.NotPanics(t, func() { + r.checkSyncGap() + }, "checkSyncGap should early-return at the L1 gate before dereferencing nil pool/stateV2") +} diff --git a/sequencer/hash_set.go b/sequencer/hash_set.go index 5c7b903db64..64b35e9a8a5 100644 --- a/sequencer/hash_set.go +++ b/sequencer/hash_set.go @@ -6,7 +6,8 @@ import ( "github.com/morph-l2/go-ethereum/common" ) -// HashSet is a fixed-capacity set for common.Hash with LRU eviction. +// HashSet is a fixed-capacity set for common.Hash with FIFO eviction +// (insertion-order ring buffer; entries are not re-ordered on access). type HashSet struct { items []common.Hash index map[common.Hash]int @@ -59,6 +60,16 @@ func (s *HashSet) Contains(hash common.Hash) bool { return exists } +// Clear drops every entry. The backing items slice is reused; subsequent +// Add() calls overwrite it. Used during reactor reset. +func (s *HashSet) Clear() { + s.mtx.Lock() + defer s.mtx.Unlock() + s.index = make(map[common.Hash]int, s.capacity) + s.head = 0 + s.count = 0 +} + // PeerHashSet is a per-peer hash set. type PeerHashSet struct { peers map[string]*HashSet @@ -115,3 +126,10 @@ func (s *PeerHashSet) RemovePeer(peerID string) { defer s.mtx.Unlock() delete(s.peers, peerID) } + +// Clear drops all per-peer hash sets. Used during reactor reset. +func (s *PeerHashSet) Clear() { + s.mtx.Lock() + defer s.mtx.Unlock() + s.peers = make(map[string]*HashSet) +} diff --git a/sequencer/interfaces.go b/sequencer/interfaces.go index b720965cdcf..542ff4b98b6 100644 --- a/sequencer/interfaces.go +++ b/sequencer/interfaces.go @@ -1,7 +1,6 @@ package sequencer import ( - "context" "errors" "github.com/morph-l2/go-ethereum/common" @@ -9,13 +8,23 @@ import ( // Sentinel errors for block processing var ( - // ErrInvalidSignature indicates block signature verification failed + // ErrInvalidSignature indicates the block's signature is provably invalid. + // This is a malicious-peer signal: the block's content does not match a + // legitimate sequencer signature. Callers should ban the sender. ErrInvalidSignature = errors.New("invalid block signature") + + // ErrVerifierUnavailable indicates the local verifier could not determine + // whether a signature is valid (e.g. not configured, L1 backend RPC + // failure, transient IO error). This is NOT a peer-misbehavior signal — + // the peer may be honest; we just cannot decide right now. Callers must + // NOT ban the sender on this error; retry or drop silently instead. + ErrVerifierUnavailable = errors.New("verifier unavailable") ) -// SequencerVerifier verifies if an address is the current L1 sequencer +// SequencerVerifier verifies if an address is a valid L1 sequencer. type SequencerVerifier interface { - IsSequencer(ctx context.Context, addr common.Address) (bool, error) + // IsSequencerAt checks if addr was the valid sequencer at the given L2 block height. + IsSequencerAt(addr common.Address, l2Height uint64) (bool, error) } // Signer interface for sequencer block signing @@ -24,6 +33,54 @@ type Signer interface { Sign(data []byte) ([]byte, error) // Address returns the sequencer's address Address() common.Address - // IsActiveSequencer checks if this signer is the current L1 sequencer - IsActiveSequencer(ctx context.Context) (bool, error) +} + +// SequencerHA is the abstraction for Raft HA cluster. +// In single-node mode, ha == nil and all HA-related logic is skipped. +type SequencerHA interface { + // Start initializes and starts the Raft service. + // leader (bootstrap=true): starts as a single-node cluster, immediately becomes Leader. + // follower (bootstrap=false): initializes Raft, then asynchronously retries Join() in the + // background until it successfully joins the cluster or the service is stopped. + // Called by StateV2.OnStart() when the upgrade height is reached. + Start() error + + // Stop gracefully shuts down the Raft service. + // Called by StateV2.OnStop() when the upgrade height has been passed. + Stop() + + // IsLeader returns whether the current node is the Raft leader (sole block producer). + IsLeader() bool + + // Join adds this node to the Raft cluster. + // Precondition: node has synced to near chain tip via Fullnode mode. + // Fails if localHeight < raft.EarliestRetainedLogHeight. + // On success, the node is a full Raft member and P2P sync can be stopped. + Join() error + + // Commit replicates a signed block via Raft to the cluster. + // Blocks until majority of nodes have acknowledged receipt (NOT applied). + // ApplyBlock is handled separately by leader (after Commit) and followers (via Subscribe). + // On failure, the caller should abandon this block production round. + Commit(block *BlockV2) error + + // Subscribe returns a channel that delivers blocks after Raft commit. + // Both leader and follower subscribe; used by broadcastRoutine for P2P broadcast. + Subscribe() <-chan *BlockV2 + + // SetOnBlockApplied registers the callback invoked by the Raft FSM on every + // committed log entry. The callback should execute ApplyBlock + SaveSignature. + // Must be called before Start(). + SetOnBlockApplied(fn func(*BlockV2) error) +} + +// L1Tracker reports whether L1 RPC has fallen too far behind for this node to +// safely act. When L1 is stale, the node may be blind to L1 SequencerUpdated +// events, so the sequencer must stop producing and fullnodes must stop syncing +// to avoid following a revoked sequencer. Implemented by the L1 tracker in the +// node binary and required by StateV2 / the broadcast reactor. +type L1Tracker interface { + // IsHalt reports whether L1 is stale enough that this node must halt block + // production and sync. + IsHalt() bool } diff --git a/sequencer/pending_cache.go b/sequencer/pending_cache.go index 9e3af60d1bb..ea887f0a3b0 100644 --- a/sequencer/pending_cache.go +++ b/sequencer/pending_cache.go @@ -136,3 +136,14 @@ func (c *PendingBlockCache) Size() int { defer c.mtx.RUnlock() return len(c.blocks) } + +// Clear drops every pending block. Used during reactor reset (reorg path): +// after the chain head moves, previously cached future / fork blocks may +// reference parents that no longer exist, so we discard the lot rather +// than try to reconcile. +func (c *PendingBlockCache) Clear() { + c.mtx.Lock() + defer c.mtx.Unlock() + c.blocks = make(map[common.Hash]*BlockV2) + c.byParent = make(map[common.Hash][]*BlockV2) +} diff --git a/sequencer/rate_limiter.go b/sequencer/rate_limiter.go new file mode 100644 index 00000000000..3d9c9b1afd8 --- /dev/null +++ b/sequencer/rate_limiter.go @@ -0,0 +1,94 @@ +package sequencer + +import ( + "sync" + "time" + + "github.com/tendermint/tendermint/p2p" +) + +// tokenBucket is the per-peer token bucket state. +// Tokens are refilled lazily on each Allow() call based on elapsed time. +type tokenBucket struct { + tokens float64 + lastRefill time.Time +} + +// PeerRateLimiter enforces a per-peer token-bucket rate limit. +// +// Each peer has its own bucket with capacity `burst`, refilled at `rate` +// tokens per second. Allow() consumes one token and returns false when the +// bucket is empty. +// +// The limiter is safe for concurrent use. Buckets are created on AddPeer +// (initialized full, so newly connected peers are not penalized) and +// removed on RemovePeer. Allow() on an unknown peer lazily creates a full +// bucket as a fallback. +type PeerRateLimiter struct { + rate float64 // tokens per second + burst float64 // bucket capacity + + mu sync.Mutex + buckets map[p2p.ID]*tokenBucket +} + +// NewPeerRateLimiter creates a limiter with the given rate (tokens/sec) and +// burst (max tokens in a bucket). +func NewPeerRateLimiter(rate float64, burst int) *PeerRateLimiter { + return &PeerRateLimiter{ + rate: rate, + burst: float64(burst), + buckets: make(map[p2p.ID]*tokenBucket), + } +} + +// AddPeer initializes a full bucket for a newly connected peer. +// Calling AddPeer for a peer that already has a bucket is a no-op. +func (l *PeerRateLimiter) AddPeer(peerID p2p.ID) { + l.mu.Lock() + defer l.mu.Unlock() + if _, ok := l.buckets[peerID]; ok { + return + } + l.buckets[peerID] = &tokenBucket{ + tokens: l.burst, + lastRefill: time.Now(), + } +} + +// RemovePeer drops the bucket for a disconnected peer. +func (l *PeerRateLimiter) RemovePeer(peerID p2p.ID) { + l.mu.Lock() + defer l.mu.Unlock() + delete(l.buckets, peerID) +} + +// Allow returns true if the peer is allowed to perform one action, consuming +// one token. Returns false if the peer has exhausted its bucket. +// +// If the peer has no bucket (e.g. AddPeer was not called), a full bucket is +// created as a safe fallback. +func (l *PeerRateLimiter) Allow(peerID p2p.ID) bool { + l.mu.Lock() + defer l.mu.Unlock() + + bucket, ok := l.buckets[peerID] + if !ok { + bucket = &tokenBucket{tokens: l.burst, lastRefill: time.Now()} + l.buckets[peerID] = bucket + } + + now := time.Now() + elapsed := now.Sub(bucket.lastRefill).Seconds() + bucket.tokens += elapsed * l.rate + if bucket.tokens > l.burst { + bucket.tokens = l.burst + } + bucket.lastRefill = now + + if bucket.tokens < 1 { + return false + } + bucket.tokens-- + return true +} diff --git a/sequencer/rate_limiter_test.go b/sequencer/rate_limiter_test.go new file mode 100644 index 00000000000..89fec11186f --- /dev/null +++ b/sequencer/rate_limiter_test.go @@ -0,0 +1,147 @@ +package sequencer + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + "github.com/tendermint/tendermint/p2p" +) + +const ( + testRate = 10.0 // tokens/sec + testBurst = 5 // bucket capacity +) + +func newTestLimiter() *PeerRateLimiter { + return NewPeerRateLimiter(testRate, testBurst) +} + +// TestAllow_WithinBurst verifies that a peer can fire up to burst requests +// immediately after connecting (bucket starts full). +func TestAllow_WithinBurst(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-a") + l.AddPeer(peer) + + for i := 0; i < testBurst; i++ { + require.True(t, l.Allow(peer), "request %d should be allowed (within burst)", i+1) + } +} + +// TestAllow_ExceedBurst verifies that the (burst+1)-th immediate request is denied. +func TestAllow_ExceedBurst(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-b") + l.AddPeer(peer) + + for i := 0; i < testBurst; i++ { + l.Allow(peer) + } + require.False(t, l.Allow(peer), "request beyond burst should be denied") +} + +// TestAllow_RefillAfterDelay verifies tokens are replenished over time. +func TestAllow_RefillAfterDelay(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-c") + l.AddPeer(peer) + + // Drain the bucket completely. + for i := 0; i < testBurst; i++ { + l.Allow(peer) + } + require.False(t, l.Allow(peer), "bucket should be empty") + + // Wait for 1 token to refill (rate=10/s → 100ms per token). + time.Sleep(120 * time.Millisecond) + require.True(t, l.Allow(peer), "one token should have been refilled") +} + +// TestAllow_UnknownPeer verifies that a peer not added via AddPeer gets a +// full-bucket fallback and is not permanently blocked. +func TestAllow_UnknownPeer(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-unknown") + // No AddPeer call — Allow should create a full bucket as fallback. + require.True(t, l.Allow(peer), "unknown peer should get a full bucket fallback") +} + +// TestAddPeer_Idempotent verifies that calling AddPeer twice does not reset +// an existing bucket (tokens already consumed should stay consumed). +func TestAddPeer_Idempotent(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-d") + l.AddPeer(peer) + + // Drain the bucket. + for i := 0; i < testBurst; i++ { + l.Allow(peer) + } + require.False(t, l.Allow(peer)) + + // Second AddPeer must not reset the bucket. + l.AddPeer(peer) + require.False(t, l.Allow(peer), "AddPeer must not reset an existing bucket") +} + +// TestRemovePeer_CleansUp verifies that after RemovePeer the bucket is gone +// and a subsequent Allow re-creates a fresh full bucket (fallback path). +func TestRemovePeer_CleansUp(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-e") + l.AddPeer(peer) + + // Drain the bucket. + for i := 0; i < testBurst; i++ { + l.Allow(peer) + } + require.False(t, l.Allow(peer)) + + // Remove peer — bucket must be deleted. + l.RemovePeer(peer) + + // Next Allow triggers the unknown-peer fallback (full bucket). + require.True(t, l.Allow(peer), "after RemovePeer, Allow should create a fresh full bucket") +} + +// TestAllow_MultiplePeers verifies that buckets are independent per peer. +func TestAllow_MultiplePeers(t *testing.T) { + l := newTestLimiter() + peerA := p2p.ID("peer-f") + peerB := p2p.ID("peer-g") + l.AddPeer(peerA) + l.AddPeer(peerB) + + // Drain peerA entirely. + for i := 0; i < testBurst; i++ { + l.Allow(peerA) + } + require.False(t, l.Allow(peerA), "peerA should be exhausted") + + // peerB's bucket must be untouched. + require.True(t, l.Allow(peerB), "peerB should still have tokens") +} + +// TestAllow_BurstCapNotExceeded verifies that tokens never exceed burst even +// after a long idle period. +func TestAllow_BurstCapNotExceeded(t *testing.T) { + l := newTestLimiter() + peer := p2p.ID("peer-h") + l.AddPeer(peer) + + // Drain fully. + for i := 0; i < testBurst; i++ { + l.Allow(peer) + } + + // Wait long enough for many tokens to theoretically accumulate. + time.Sleep(500 * time.Millisecond) // would add 5 tokens at rate=10 + + // Can only consume up to burst, not more. + allowed := 0 + for l.Allow(peer) { + allowed++ + } + require.LessOrEqual(t, allowed, testBurst, "tokens must not exceed burst capacity") +} diff --git a/sequencer/signature_store.go b/sequencer/signature_store.go new file mode 100644 index 00000000000..0d2941db872 --- /dev/null +++ b/sequencer/signature_store.go @@ -0,0 +1,60 @@ +package sequencer + +import ( + "fmt" + + "github.com/morph-l2/go-ethereum/common" + dbm "github.com/tendermint/tm-db" +) + +var signaturePrefix = []byte("block_sig:") + +// SignatureStore persists block signatures independently from geth. +// Key = "block_sig:" + blockHash (32 bytes), Value = signature (65 bytes). +type SignatureStore struct { + db dbm.DB +} + +// NewSignatureStore creates a new SignatureStore backed by the given DB. +func NewSignatureStore(db dbm.DB) *SignatureStore { + return &SignatureStore{db: db} +} + +func signatureKey(blockHash common.Hash) []byte { + key := make([]byte, len(signaturePrefix)+common.HashLength) + copy(key, signaturePrefix) + copy(key[len(signaturePrefix):], blockHash.Bytes()) + return key +} + +// SaveSignature persists a block's signature. +func (s *SignatureStore) SaveSignature(blockHash common.Hash, sig []byte) error { + if s == nil || s.db == nil { + return nil + } + return s.db.Set(signatureKey(blockHash), sig) +} + +// GetSignature retrieves a block's signature. Returns nil, error if not found. +func (s *SignatureStore) GetSignature(blockHash common.Hash) ([]byte, error) { + if s == nil || s.db == nil { + return nil, fmt.Errorf("signature store not initialized") + } + val, err := s.db.Get(signatureKey(blockHash)) + if err != nil { + return nil, err + } + if len(val) == 0 { + return nil, fmt.Errorf("signature not found for block %s", blockHash.Hex()) + } + return val, nil +} + +// Close releases the underlying DB. Safe to call on a nil receiver or when +// the store was never initialized. +func (s *SignatureStore) Close() error { + if s == nil || s.db == nil { + return nil + } + return s.db.Close() +} diff --git a/sequencer/state_v2.go b/sequencer/state_v2.go index 14833973de1..4fb5ce139ae 100644 --- a/sequencer/state_v2.go +++ b/sequencer/state_v2.go @@ -1,7 +1,6 @@ package sequencer import ( - "context" "fmt" "sync" "time" @@ -12,61 +11,73 @@ import ( ) const ( - // DefaultBlockInterval is the default interval between blocks - // TODO: make this configurable - DefaultBlockInterval = 3000 * time.Millisecond + // BlockInterval is the fallback interval for empty blocks (no txs). + BlockInterval = 5000 * time.Millisecond + // FastBlockInterval is the txpool polling interval. + // When pending txs are found, a block is produced immediately. + FastBlockInterval = 300 * time.Millisecond ) // StateV2 manages the state for centralized sequencer mode. // It replaces the PBFT consensus state after the upgrade. +// +// Node roles: +// - Fullnode (signer==nil): only applyRoutine runs (in reactor), no block production +// - ActiveSequencer (signer!=nil, ha==nil): roleCheckRoutine + broadcastRoutine +// - HA-Leader (signer!=nil, ha!=nil, ha.IsLeader()==true): roleCheckRoutine + broadcastRoutine +// - HA-Follower (signer!=nil, ha!=nil, ha.IsLeader()==false): roleCheckRoutine (idle) type StateV2 struct { service.BaseService mtx sync.RWMutex // Core state - latestBlock *BlockV2 - sequencerMode bool // Whether the node is started in sequencer mode (has signer configured) + latestBlock *BlockV2 // Dependencies - l2Node l2node.L2Node - signer Signer - verifier SequencerVerifier - logger log.Logger + l2Node l2node.L2Node + signer Signer + verifier SequencerVerifier + l1Tracker L1Tracker // required: gates block production on L1 freshness + sigStore *SignatureStore + ha SequencerHA // nil = single-node mode + logger log.Logger // Block production - blockTicker *time.Ticker - blockInterval time.Duration + blockInterval time.Duration // empty-block fallback interval (default 3s) + fastBlockInterval time.Duration // txpool polling interval (default 300ms) - // Broadcast channel - blocks produced by this sequencer are sent here + // Broadcast channel - non-HA self-produced blocks are sent here broadcastCh chan *BlockV2 - - // Quit channel - quitCh chan struct{} } // NewStateV2 creates a new StateV2 instance. -// sequencerMode is determined by whether a signer is provided. +// Node mode is determined by whether a signer and/or ha is provided. +// verifier is required when signer is configured (sequencer/HA nodes must verify blocks). func NewStateV2( l2Node l2node.L2Node, - blockInterval time.Duration, logger log.Logger, verifier SequencerVerifier, + l1Tracker L1Tracker, signer Signer, + sigStore *SignatureStore, + ha SequencerHA, ) (*StateV2, error) { - if blockInterval <= 0 { - blockInterval = DefaultBlockInterval + if verifier == nil { + return nil, fmt.Errorf("sequencer verifier is required for V2 mode") } s := &StateV2{ - l2Node: l2Node, - signer: signer, - verifier: verifier, - sequencerMode: signer != nil, - blockInterval: blockInterval, - logger: logger.With("module", "stateV2"), - broadcastCh: make(chan *BlockV2, 100), - quitCh: make(chan struct{}), + l2Node: l2Node, + signer: signer, + verifier: verifier, + l1Tracker: l1Tracker, + sigStore: sigStore, + ha: ha, + blockInterval: BlockInterval, + fastBlockInterval: FastBlockInterval, + logger: logger.With("module", "stateV2"), + broadcastCh: make(chan *BlockV2, 100), } s.BaseService = *service.NewBaseService(logger, "StateV2", s) @@ -75,9 +86,8 @@ func NewStateV2( } // OnStart implements service.Service. -// It initializes state from geth and starts block production if this node is the active sequencer. +// Initializes state from geth. Nodes with a signer start roleCheckRoutine. func (s *StateV2) OnStart() error { - // Initialize latest block from geth latestBlock, err := s.l2Node.GetLatestBlockV2() if err != nil { return fmt.Errorf("failed to get latest block: %w", err) @@ -87,28 +97,25 @@ func (s *StateV2) OnStart() error { s.latestBlock = latestBlock s.mtx.Unlock() - var seqAddr string - var isActiveSequencer bool - if s.signer != nil { - seqAddr = s.signer.Address().Hex() - // Check if this node is the active sequencer via L1 contract - isActiveSequencer, err = s.signer.IsActiveSequencer(context.Background()) - if err != nil { - s.logger.Error("Failed to check sequencer status", "error", err) - isActiveSequencer = false + // Use local variable to avoid accessing s.latestBlock without lock in log statement. + s.logger.Info("StateV2 initialized", + "latestHeight", latestBlock.Number, + "latestHash", latestBlock.Hash.Hex(), + "hasSigner", s.signer != nil, + "isHAMode", s.ha != nil) + + // Start HA service at upgrade height. This initializes Raft and begins + // leader election (bootstrap) or cluster join (follower). + if s.ha != nil { + if err := s.ha.Start(); err != nil { + return fmt.Errorf("failed to start HA service: %w", err) } } - s.logger.Info("StateV2 initialized", - "latestHeight", s.latestBlock.Number, - "latestHash", s.latestBlock.Hash.Hex(), - "sequencerMode", s.sequencerMode, - "isActiveSequencer", isActiveSequencer, - "seqAddr", seqAddr) - - // Start block production if sequencer mode is enabled and this node is the active sequencer - if s.sequencerMode && isActiveSequencer { - go s.produceBlockRoutine() + // Fullnode (no signer) does not produce blocks; applyRoutine is managed by the reactor. + // Nodes with signer start roleCheckRoutine, which handles dynamic role detection. + if s.signer != nil { + go s.roleCheckRoutine() } return nil @@ -117,72 +124,188 @@ func (s *StateV2) OnStart() error { // OnStop implements service.Service. func (s *StateV2) OnStop() { s.logger.Info("Stopping StateV2") - close(s.quitCh) - if s.blockTicker != nil { - s.blockTicker.Stop() + if s.ha != nil { + s.ha.Stop() } } -// produceBlockRoutine is the main loop for block production. -func (s *StateV2) produceBlockRoutine() { - s.blockTicker = time.NewTicker(s.blockInterval) - defer s.blockTicker.Stop() +func (s *StateV2) OnReset() error { + return nil +} - s.logger.Info("Starting block production routine", "interval", s.blockInterval) +// roleCheckRoutine is the unified loop for role detection and block production. +// It runs for all nodes with a signer (ActiveSequencer, HA-Leader, HA-Follower). +// +// Two timers drive block production: +// - fastTicker (300ms): polls txpool via assembleBlock; produces immediately when txs found. +// - slowTimer (3s): forces a block (even empty) to maintain chain liveness. +// +// When fastTicker produces a block, slowTimer is reset to avoid redundant empty blocks. +func (s *StateV2) roleCheckRoutine() { + fastTicker := time.NewTicker(s.fastBlockInterval) + slowTimer := time.NewTimer(s.blockInterval) + defer fastTicker.Stop() + defer slowTimer.Stop() + + s.logger.Info("Starting role check routine", + "pollInterval", s.fastBlockInterval, + "emptyBlockInterval", s.blockInterval) for { select { - case <-s.quitCh: - s.logger.Info("Block production routine stopped") + case <-s.Quit(): + s.logger.Info("Role check routine stopped") return - case <-s.blockTicker.C: - s.produceBlock() + + case <-fastTicker.C: + if !s.isActiveSequencer() { + continue + } + block, collectedL1Msgs, err := s.assembleBlock() + if err != nil { + continue + } + if len(block.Transactions) == 0 && !collectedL1Msgs { + continue // empty block, discard and wait for next tick + } + s.commitBlock(block, collectedL1Msgs) + resetTimer(slowTimer, s.blockInterval) + + case <-slowTimer.C: + if s.isActiveSequencer() { + block, collectedL1Msgs, err := s.assembleBlock() + if err == nil { + s.commitBlock(block, collectedL1Msgs) + } + } + slowTimer.Reset(s.blockInterval) } } } -// produceBlock produces a new block and broadcasts it. -func (s *StateV2) produceBlock() { - s.mtx.Lock() - parentHash := s.latestBlock.Hash - s.mtx.Unlock() +// resetTimer safely stops, drains, and resets a timer. +// t.Stop() returns false when the timer has already fired but t.C has not been +// consumed; the non-blocking drain prevents a stale fire from triggering the +// next select iteration. +func resetTimer(t *time.Timer, d time.Duration) { + if !t.Stop() { + select { + case <-t.C: + default: + } + } + t.Reset(d) +} - s.logger.Debug("Producing block", "parentHash", parentHash.Hex()) +// isActiveSequencer returns true if this node should produce the next block. +// For HA mode: must be Raft leader AND L1-designated sequencer. +// For single-node mode: must be L1-designated sequencer. +func (s *StateV2) isActiveSequencer() bool { + // HA mode: must be Raft leader + if s.ha != nil && !s.ha.IsLeader() { + return false + } + + // L1 tracker: stop producing if L1 RPC is stale (we may be blind to + // SequencerUpdated events on L1 and could produce as a revoked sequencer). + if s.l1Tracker.IsHalt() { + return false + } + + s.mtx.RLock() + lb := s.latestBlock + s.mtx.RUnlock() + if lb == nil { + return false + } + nextHeight := lb.Number + 1 + + ok, err := s.verifier.IsSequencerAt(s.signer.Address(), nextHeight) + if err != nil { + s.logger.Error("Failed to check sequencer status", "height", nextHeight, "err", err) + return false + } + return ok +} + +// assembleBlock calls geth Engine API to build a candidate block from the current +// txpool and pending L1 messages. This is a read-only operation with no side +// effects — the result can be safely discarded if the block has no work. +func (s *StateV2) assembleBlock() (*BlockV2, bool, error) { + s.mtx.RLock() + parentHash := s.latestBlock.Hash + s.mtx.RUnlock() - // Request block data from geth (pass hash as bytes) block, collectedL1Msgs, err := s.l2Node.RequestBlockDataV2(parentHash.Bytes()) if err != nil { - s.logger.Error("Failed to request block data", "error", err) - return + s.logger.Error("Failed to assemble block", "error", err) + return nil, false, err } - _ = collectedL1Msgs // TODO: log or use this info + return block, collectedL1Msgs, nil +} - // Sign the block +// commitBlock signs the assembled block and either commits via Raft (HA mode) +// or applies locally and sends to broadcastCh (single-node mode). +func (s *StateV2) commitBlock(block *BlockV2, collectedL1Msgs bool) { + t0 := time.Now() + + tSign := time.Now() if err := s.signBlock(block); err != nil { s.logger.Error("Failed to sign block", "error", err) return } + signDur := time.Since(tSign) + + if s.ha != nil { + // HA mode: replicate via Raft. FSM callback handles ApplyBlock + SaveSignature + // for both leader and follower. Broadcast via ha.Subscribe() -> broadcastRoutine. + tCommit := time.Now() + if err := s.ha.Commit(block); err != nil { + s.logger.Error("Failed to commit block via HA", "number", block.Number, "err", err) + return + } + commitDur := time.Since(tCommit) + totalDur := time.Since(t0) - // ********************* RAFT HA ********************* - // TODO: add raft HA - // **************************************************** - - // Apply the block to geth and update local state - if err := s.ApplyBlock(block); err != nil { - s.logger.Error("Failed to apply block", "error", err) - return - } + s.logger.Debug("[PERF] commitBlock", + "mode", "HA", + "height", block.Number, + "txCount", len(block.Transactions), + "gasUsed", block.GasUsed, + "sign_ms", float64(signDur.Microseconds())/1000.0, + "raft_commit_ms", float64(commitDur.Microseconds())/1000.0, + "total_ms", float64(totalDur.Microseconds())/1000.0, + ) + } else { + // Non-HA mode: apply locally (includes SaveSignature), broadcast via broadcastCh. + tApply := time.Now() + if err := s.ApplyBlock(block); err != nil { + s.logger.Error("Failed to apply block", "error", err) + return + } + applyDur := time.Since(tApply) + totalDur := time.Since(t0) - // Send to broadcast channel - select { - case s.broadcastCh <- block: - s.logger.Debug("Block produced and queued for broadcast", - "number", block.Number, - "hash", block.Hash.Hex(), + s.logger.Debug("[PERF] commitBlock", + "mode", "single", + "height", block.Number, "txCount", len(block.Transactions), - "collectedL1Msgs", collectedL1Msgs) - default: - s.logger.Error("Broadcast channel full, dropping block", "number", block.Number) + "gasUsed", block.GasUsed, + "sign_ms", float64(signDur.Microseconds())/1000.0, + "apply_ms", float64(applyDur.Microseconds())/1000.0, + "total_ms", float64(totalDur.Microseconds())/1000.0, + ) + + select { + case s.broadcastCh <- block: + s.logger.Debug("Block queued for broadcast", + "number", block.Number, + "hash", block.Hash.Hex(), + "txCount", len(block.Transactions), + "collectedL1Msgs", collectedL1Msgs) + default: + s.logger.Error("Broadcast channel full, dropping block", "number", block.Number) + } } } @@ -191,19 +314,59 @@ func (s *StateV2) signBlock(block *BlockV2) error { if s.signer == nil { return fmt.Errorf("signer not set") } - - // Sign the block hash signature, err := s.signer.Sign(block.Hash.Bytes()) if err != nil { return fmt.Errorf("failed to sign block: %w", err) } - block.Signature = signature - s.logger.Debug("Block signed", "number", block.Number, "hash", block.Hash.Hex(), "signer", s.signer.Address().Hex()) return nil } +// ApplyBlock saves the block signature and delegates to l2Node.ApplyBlockV2. +// Reorg detection and idempotent checks are handled in the Executor layer. +func (s *StateV2) ApplyBlock(block *BlockV2) error { + s.mtx.Lock() + defer s.mtx.Unlock() + + if len(block.Signature) == 0 { + return fmt.Errorf("ApplyBlock: block %d missing signature", block.Number) + } + + // Save signature BEFORE applying to geth. If crash happens after Apply + // but before SaveSignature, the block is on-chain but its signature is + // lost — which can cause P2P peers to reject/disconnect when they cannot + // verify the block. Saving first at worst leaves an orphan signature if + // Apply fails, which is harmless. + tSig := time.Now() + if err := s.sigStore.SaveSignature(block.Hash, block.Signature); err != nil { + return err + } + sigDur := time.Since(tSig) + + tGeth := time.Now() + applied, err := s.l2Node.ApplyBlockV2(block) + if err != nil { + return err + } + gethDur := time.Since(tGeth) + + if applied { + s.latestBlock = block + } + + s.logger.Debug("[PERF] ApplyBlock", + "height", block.Number, + "txCount", len(block.Transactions), + "gasUsed", block.GasUsed, + "sigSave_ms", float64(sigDur.Microseconds())/1000.0, + "geth_ms", float64(gethDur.Microseconds())/1000.0, + "total_ms", float64((gethDur+sigDur).Microseconds())/1000.0, + ) + + return nil +} + // LatestHeight returns the latest block height. func (s *StateV2) LatestHeight() int64 { s.mtx.RLock() @@ -221,36 +384,38 @@ func (s *StateV2) LatestBlock() *BlockV2 { return s.latestBlock } -// BroadcastCh returns the channel for blocks to be broadcast. -// No lock needed - channel itself is thread-safe. +// BroadcastCh returns the channel for self-produced blocks (non-HA mode only). func (s *StateV2) BroadcastCh() <-chan *BlockV2 { return s.broadcastCh } -// ApplyBlock applies a block to L2 and updates local state. -// This is the unified entry point for block application. -func (s *StateV2) ApplyBlock(block *BlockV2) error { - // Apply to L2 execution layer - if err := s.l2Node.ApplyBlockV2(block); err != nil { - return err - } +// GetBlockByNumber gets a block from l2node by number. +func (s *StateV2) GetBlockByNumber(number uint64) (*BlockV2, error) { + return s.l2Node.GetBlockByNumber(number) +} - // Update local state - s.mtx.Lock() - s.latestBlock = block - s.mtx.Unlock() +// HasSigner returns whether this node has a signer configured. +// Fullnode returns false; ActiveSequencer and HA nodes return true. +func (s *StateV2) HasSigner() bool { + return s.signer != nil +} - return nil +// IsHAMode returns whether this node is in HA mode (ha != nil). +func (s *StateV2) IsHAMode() bool { + return s.ha != nil } -// GetBlockByNumber gets a block from l2node by number. -// Uses geth's eth_getBlockByNumber RPC internally. -func (s *StateV2) GetBlockByNumber(number uint64) (*BlockV2, error) { - return s.l2Node.GetBlockByNumber(number) +// IsHALeader returns whether this node is the current Raft leader. +// Returns false if not in HA mode. +func (s *StateV2) IsHALeader() bool { + return s.ha != nil && s.ha.IsLeader() } -// IsSequencerMode returns whether this node is started in sequencer mode. -// This means the node has a signer configured and can potentially produce blocks. -func (s *StateV2) IsSequencerMode() bool { - return s.sequencerMode +// HASubscribe returns the HA block delivery channel. +// Panics if not in HA mode. +func (s *StateV2) HASubscribe() <-chan *BlockV2 { + if s.ha == nil { + panic("HASubscribe called but not in HA mode") + } + return s.ha.Subscribe() } diff --git a/sequencer/state_v2_test.go b/sequencer/state_v2_test.go index d33f25293fb..f95dfab315a 100644 --- a/sequencer/state_v2_test.go +++ b/sequencer/state_v2_test.go @@ -1,9 +1,8 @@ package sequencer import ( - "context" + "errors" "testing" - "time" "github.com/morph-l2/go-ethereum/common" "github.com/tendermint/tendermint/l2node" @@ -11,11 +10,14 @@ import ( "github.com/tendermint/tendermint/types" ) -// mockSignerImpl is a mock implementation of Signer for testing +// ============================================================================ +// Mock implementations +// ============================================================================ + +// mockSignerImpl is a mock implementation of Signer for testing. type mockSignerImpl struct { address common.Address signature []byte - isActive bool } func (m *mockSignerImpl) Sign(data []byte) ([]byte, error) { @@ -29,24 +31,58 @@ func (m *mockSignerImpl) Address() common.Address { return m.address } -func (m *mockSignerImpl) IsActiveSequencer(ctx context.Context) (bool, error) { - return m.isActive, nil +// mockSequencerVerifier is a mock implementation of SequencerVerifier for testing. +type mockSequencerVerifier struct { + isSequencer bool + err error +} + +func (m *mockSequencerVerifier) IsSequencerAt(addr common.Address, l2Height uint64) (bool, error) { + if m.err != nil { + return false, m.err + } + return m.isSequencer, nil +} + +// mockSequencerHA is a mock implementation of SequencerHA for testing. +type mockSequencerHA struct { + leader bool + commitErr error + subCh chan *BlockV2 } -// newTestMockL2Node creates a mock L2Node for testing +func newMockSequencerHA(leader bool) *mockSequencerHA { + return &mockSequencerHA{ + leader: leader, + subCh: make(chan *BlockV2, 10), + } +} + +func (m *mockSequencerHA) Start() error { return nil } +func (m *mockSequencerHA) Stop() {} +func (m *mockSequencerHA) IsLeader() bool { return m.leader } +func (m *mockSequencerHA) Join() error { return nil } +func (m *mockSequencerHA) Commit(block *BlockV2) error { return m.commitErr } +func (m *mockSequencerHA) Subscribe() <-chan *BlockV2 { return m.subCh } +func (m *mockSequencerHA) SetOnBlockApplied(fn func(*BlockV2) error) {} + +// newTestMockL2Node creates a mock L2Node for testing. func newTestMockL2Node() l2node.L2Node { return l2node.NewMockL2Node(0, "") } +// ============================================================================ +// Existing tests (adapted for new signature) +// ============================================================================ + func TestStateV2_NewStateV2(t *testing.T) { mockL2Node := newTestMockL2Node() logger := log.NewNopLogger() - stateV2, err := NewStateV2(mockL2Node, time.Second, logger, nil, nil) + stateV2, err := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) if err != nil { t.Fatalf("NewStateV2 failed: %v", err) } - if stateV2 == nil { t.Fatal("StateV2 should not be nil") } @@ -56,41 +92,32 @@ func TestStateV2_LatestHeight(t *testing.T) { mockL2Node := newTestMockL2Node() logger := log.NewNopLogger() - stateV2, err := NewStateV2(mockL2Node, time.Second, logger, nil, nil) + stateV2, err := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) if err != nil { t.Fatalf("NewStateV2 failed: %v", err) } - // Before start, latestBlock should be nil height := stateV2.LatestHeight() if height != 0 { t.Errorf("LatestHeight before start = %d, want 0", height) } } -func TestStateV2_IsSequencerMode(t *testing.T) { +func TestStateV2_HasSigner_MatchesIsSequencerMode(t *testing.T) { mockL2Node := newTestMockL2Node() logger := log.NewNopLogger() + mockVerifier := &mockSequencerVerifier{} - // Without signer, sequencerMode should be false - stateV2, err := NewStateV2(mockL2Node, time.Second, logger, nil, nil) - if err != nil { - t.Fatalf("NewStateV2 failed: %v", err) - } - - if stateV2.IsSequencerMode() { - t.Error("IsSequencerMode should be false when signer is nil") - } - - // With mock signer, sequencerMode should be true - mockSigner := &mockSignerImpl{} - stateV2WithSigner, err := NewStateV2(mockL2Node, time.Second, logger, nil, mockSigner) - if err != nil { - t.Fatalf("NewStateV2 failed: %v", err) + // Without signer + s1, _ := NewStateV2(mockL2Node, logger, mockVerifier, nil, nil, nil, nil) + if s1.HasSigner() { + t.Error("should be false when signer is nil") } - if !stateV2WithSigner.IsSequencerMode() { - t.Error("IsSequencerMode should be true when signer is provided") + // With signer + s2, _ := NewStateV2(mockL2Node, logger, mockVerifier, nil, &mockSignerImpl{}, nil, nil) + if !s2.HasSigner() { + t.Error("should be true when signer is provided") } } @@ -98,33 +125,22 @@ func TestStateV2_SignBlock(t *testing.T) { mockL2Node := newTestMockL2Node() logger := log.NewNopLogger() - // Create mock signer - mockSigner := &mockSignerImpl{ - signature: make([]byte, 65), // Mock 65-byte signature - } + mockSigner := &mockSignerImpl{signature: make([]byte, 65)} + mockVerifier := &mockSequencerVerifier{} - stateV2, err := NewStateV2(mockL2Node, time.Second, logger, nil, mockSigner) + stateV2, err := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) if err != nil { t.Fatalf("NewStateV2 failed: %v", err) } - // Create a test block block := &types.BlockV2{ Number: 1, Hash: [32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, } - // Sign the block - err = stateV2.signBlock(block) - if err != nil { + if err := stateV2.signBlock(block); err != nil { t.Fatalf("signBlock failed: %v", err) } - - if len(block.Signature) == 0 { - t.Error("Block signature should not be empty") - } - - // Verify signature length (Ethereum signatures are 65 bytes) if len(block.Signature) != 65 { t.Errorf("Signature length = %d, want 65", len(block.Signature)) } @@ -134,33 +150,287 @@ func TestStateV2_SignBlockWithoutSigner(t *testing.T) { mockL2Node := newTestMockL2Node() logger := log.NewNopLogger() - // Create StateV2 without signer - stateV2, err := NewStateV2(mockL2Node, time.Second, logger, nil, nil) + stateV2, err := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) if err != nil { t.Fatalf("NewStateV2 failed: %v", err) } - block := &types.BlockV2{ - Number: 1, - Hash: [32]byte{1, 2, 3, 4}, + block := &types.BlockV2{Number: 1, Hash: [32]byte{1, 2, 3, 4}} + if err := stateV2.signBlock(block); err == nil { + t.Error("signBlock should fail without signer") } +} + +// ============================================================================ +// New tests: verifier requirement +// ============================================================================ + +func TestNewStateV2_VerifierRequired(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() - // Sign should fail without signer - err = stateV2.signBlock(block) + // verifier==nil should fail + _, err := NewStateV2(mockL2Node, logger, nil, nil, nil, nil, nil) if err == nil { - t.Error("signBlock should fail without signer") + t.Fatal("expected error when verifier is nil") + } + + // with signer, still fails without verifier + mockSigner := &mockSignerImpl{} + _, err = NewStateV2(mockL2Node, logger, nil, nil, mockSigner, nil, nil) + if err == nil { + t.Fatal("expected error when verifier is nil (with signer)") } } -// Helper to create a test StateV2 -func createTestStateV2(t *testing.T, signer Signer) *StateV2 { +func TestNewStateV2_WithHA(t *testing.T) { mockL2Node := newTestMockL2Node() logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{} + mockVerifier := &mockSequencerVerifier{} + ha := newMockSequencerHA(true) - stateV2, err := NewStateV2(mockL2Node, time.Second, logger, nil, signer) + stateV2, err := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, ha) if err != nil { t.Fatalf("NewStateV2 failed: %v", err) } + if !stateV2.IsHAMode() { + t.Error("IsHAMode should be true when ha is provided") + } +} + +// ============================================================================ +// New tests: node mode helpers +// ============================================================================ + +func TestStateV2_HasSigner(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + + fullnode, _ := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) + if fullnode.HasSigner() { + t.Error("Fullnode should not have signer") + } - return stateV2 + mockSigner := &mockSignerImpl{} + mockVerifier := &mockSequencerVerifier{} + seqNode, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) + if !seqNode.HasSigner() { + t.Error("Sequencer node should have signer") + } +} + +func TestStateV2_IsHAMode(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{} + mockVerifier := &mockSequencerVerifier{} + + // Non-HA + nonHA, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) + if nonHA.IsHAMode() { + t.Error("IsHAMode should be false without ha") + } + + // HA + ha, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, newMockSequencerHA(false)) + if !ha.IsHAMode() { + t.Error("IsHAMode should be true with ha") + } +} + +func TestStateV2_IsHALeader(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{} + mockVerifier := &mockSequencerVerifier{} + + // Non-HA: never leader + nonHA, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) + if nonHA.IsHALeader() { + t.Error("non-HA node should not be HA leader") + } + + // HA follower + follower, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, newMockSequencerHA(false)) + if follower.IsHALeader() { + t.Error("HA follower should not be leader") + } + + // HA leader + leader, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, newMockSequencerHA(true)) + if !leader.IsHALeader() { + t.Error("HA leader should be leader") + } +} + +// ============================================================================ +// New tests: isActiveSequencer +// ============================================================================ + +func TestStateV2_IsActiveSequencer_NonHA_Active(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{address: common.HexToAddress("0x1")} + mockVerifier := &mockSequencerVerifier{isSequencer: true} + + s, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) + s.latestBlock = &BlockV2{Number: 0} + + if !s.isActiveSequencer() { + t.Error("should be active sequencer when verifier returns true") + } +} + +func TestStateV2_IsActiveSequencer_NonHA_Inactive(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{address: common.HexToAddress("0x1")} + mockVerifier := &mockSequencerVerifier{isSequencer: false} + + s, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) + s.latestBlock = &BlockV2{Number: 0} + + if s.isActiveSequencer() { + t.Error("should not be active sequencer when verifier returns false") + } +} + +func TestStateV2_IsActiveSequencer_HA_Leader(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{address: common.HexToAddress("0x1")} + mockVerifier := &mockSequencerVerifier{isSequencer: true} + ha := newMockSequencerHA(true) + + s, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, ha) + s.latestBlock = &BlockV2{Number: 0} + + if !s.isActiveSequencer() { + t.Error("HA leader should be active sequencer") + } +} + +func TestStateV2_IsActiveSequencer_HA_Follower(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{address: common.HexToAddress("0x1")} + mockVerifier := &mockSequencerVerifier{isSequencer: true} // L1 says active + ha := newMockSequencerHA(false) // but not leader + + s, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, ha) + s.latestBlock = &BlockV2{Number: 0} + + if s.isActiveSequencer() { + t.Error("HA follower should not be active sequencer even if L1 says active") + } +} + +func TestStateV2_IsActiveSequencer_VerifierError(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + mockSigner := &mockSignerImpl{} + mockVerifier := &mockSequencerVerifier{err: errors.New("rpc error")} + + s, _ := NewStateV2(mockL2Node, logger, mockVerifier, &mockL1Tracker{halt: false}, mockSigner, nil, nil) + s.latestBlock = &BlockV2{Number: 0} + + if s.isActiveSequencer() { + t.Error("should return false when verifier returns error") + } +} + +// ============================================================================ +// New tests: ApplyBlock +// ============================================================================ + +func TestStateV2_ApplyBlock_Idempotent(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + + s, _ := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) + block := &types.BlockV2{Number: 1, Signature: []byte{0x01, 0x02, 0x03}} + + // Apply twice should not error + if err := s.ApplyBlock(block); err != nil { + t.Fatalf("first apply failed: %v", err) + } + if err := s.ApplyBlock(block); err != nil { + t.Fatalf("second apply (idempotent) failed: %v", err) + } + if s.LatestHeight() != 1 { + t.Errorf("LatestHeight = %d, want 1", s.LatestHeight()) + } +} + +func TestStateV2_ApplyBlock_OlderBlockSkipped(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + + s, _ := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) + + block2 := &types.BlockV2{Number: 2, Signature: []byte{0x01, 0x02, 0x03}} + block1 := &types.BlockV2{Number: 1, Signature: []byte{0x01, 0x02, 0x03}} + + if err := s.ApplyBlock(block2); err != nil { + t.Fatalf("apply block2 failed: %v", err) + } + // Apply older block should be skipped silently + if err := s.ApplyBlock(block1); err != nil { + t.Fatalf("apply older block should not error: %v", err) + } + // latestBlock should still be 2 + if s.LatestHeight() != 2 { + t.Errorf("LatestHeight = %d, want 2", s.LatestHeight()) + } +} + +func TestStateV2_ApplyBlock_Sequential(t *testing.T) { + mockL2Node := newTestMockL2Node() + logger := log.NewNopLogger() + + s, _ := NewStateV2(mockL2Node, logger, &mockSequencerVerifier{}, nil, nil, nil, nil) + + for i := uint64(1); i <= 5; i++ { + block := &types.BlockV2{Number: i, Signature: []byte{0x01, 0x02, 0x03}} + if err := s.ApplyBlock(block); err != nil { + t.Fatalf("apply block %d failed: %v", i, err) + } + } + if s.LatestHeight() != 5 { + t.Errorf("LatestHeight = %d, want 5", s.LatestHeight()) + } +} + +// mockL1Tracker is a controllable L1Tracker for tests. +type mockL1Tracker struct { + halt bool +} + +func (m *mockL1Tracker) IsHalt() bool { return m.halt } + +func TestIsActiveSequencer_L1TrackerHaltsProduction(t *testing.T) { + logger := log.NewNopLogger() + verifier := &mockSequencerVerifier{isSequencer: true} + signer := &mockSignerImpl{} + + // L1 healthy -> active (verifier says we are the sequencer). + s, err := NewStateV2(newTestMockL2Node(), logger, verifier, &mockL1Tracker{halt: false}, signer, nil, nil) + if err != nil { + t.Fatalf("NewStateV2: %v", err) + } + s.latestBlock = &BlockV2{Number: 10} + if !s.isActiveSequencer() { + t.Fatal("expected active when L1 healthy and verifier says sequencer") + } + + // L1 halted -> NOT active even though verifier says we are the sequencer. + s2, err := NewStateV2(newTestMockL2Node(), logger, verifier, &mockL1Tracker{halt: true}, signer, nil, nil) + if err != nil { + t.Fatalf("NewStateV2: %v", err) + } + s2.latestBlock = &BlockV2{Number: 10} + if s2.isActiveSequencer() { + t.Fatal("expected NOT active when L1 tracker halts production") + } } diff --git a/test/e2e/node/main.go b/test/e2e/node/main.go index 84ba5435c7b..ea3da40a916 100644 --- a/test/e2e/node/main.go +++ b/test/e2e/node/main.go @@ -133,8 +133,10 @@ func startNode(cfg *Config) error { node.DefaultDBProvider, node.DefaultMetricsProvider(tmcfg.Instrumentation), nodeLogger, - nil, - nil, + nil, // sequencerVerifier + nil, // sequencerHealthGate + nil, // sequencerSigner + nil, // ha: no HA in e2e test node ) if err != nil { return err diff --git a/types/block.go b/types/block.go index be6ae7806b2..1c8fb351f44 100644 --- a/types/block.go +++ b/types/block.go @@ -59,6 +59,10 @@ func (b *Block) GetHash() []byte { return b.Hash().Bytes() } +func (b *Block) GetTime() int64 { return b.Header.Time.UnixMilli() } + +func (b *Block) GetBlockVersion() BlockVersion { return Version1 } + // Ensure Block implements SyncableBlock var _ SyncableBlock = (*Block)(nil) diff --git a/types/block_v2.go b/types/block_v2.go index 58e65a8d3d0..4a6099880bf 100644 --- a/types/block_v2.go +++ b/types/block_v2.go @@ -51,45 +51,31 @@ func (b *BlockV2) GetHash() []byte { return b.Hash.Bytes() } +func (b *BlockV2) GetTime() int64 { return int64(b.Timestamp) * 1000 } + +func (b *BlockV2) GetBlockVersion() BlockVersion { return Version2 } + +// BlockVersion identifies the wire format of a SyncableBlock: Version1 is the +// PBFT-era tendermint block, Version2 is the post-upgrade sequencer block. +type BlockVersion uint8 + +const ( + Version1 BlockVersion = 1 + Version2 BlockVersion = 2 +) + // SyncableBlock is an interface that both old Block and new BlockV2 can implement // for compatibility in the block pool. type SyncableBlock interface { GetHeight() int64 GetHash() []byte + GetTime() int64 + GetBlockVersion() BlockVersion } // Ensure BlockV2 implements SyncableBlock var _ SyncableBlock = (*BlockV2)(nil) -// SequencerAddress is the expected sequencer address for signature verification. -// This will be set by the sequencer package at init time. -var SequencerAddress common.Address - -// SetSequencerAddress sets the expected sequencer address. -func SetSequencerAddress(addr common.Address) { - SequencerAddress = addr -} - -// IsSequencerAddress checks if the given address is the expected sequencer. -func IsSequencerAddress(addr common.Address) bool { - return addr == SequencerAddress -} - -// RecoverBlockV2Signer recovers the signer address from the block's signature. -func RecoverBlockV2Signer(block *BlockV2) (common.Address, error) { - if len(block.Signature) == 0 { - return common.Address{}, fmt.Errorf("block has no signature") - } - - // Recover the public key from the signature - pubKey, err := crypto.SigToPub(block.Hash.Bytes(), block.Signature) - if err != nil { - return common.Address{}, fmt.Errorf("failed to recover public key: %w", err) - } - - return crypto.PubkeyToAddress(*pubKey), nil -} - // BlockV2FromProto converts a proto BlockV2 to types.BlockV2. func BlockV2FromProto(pb *seqproto.BlockV2) (*BlockV2, error) { if pb == nil { @@ -104,6 +90,10 @@ func BlockV2FromProto(pb *seqproto.BlockV2) (*BlockV2, error) { return nil, errors.New("invalid block hash length") } + if len(pb.Signature) != crypto.SignatureLength { + return nil, fmt.Errorf("invalid signature length: got %d, want %d", len(pb.Signature), crypto.SignatureLength) + } + baseFee := new(big.Int) if len(pb.BaseFee) > 0 { baseFee.SetBytes(pb.BaseFee) diff --git a/upgrade/upgrade.go b/upgrade/upgrade.go index 80222b24e12..3037ef438bc 100644 --- a/upgrade/upgrade.go +++ b/upgrade/upgrade.go @@ -1,23 +1,114 @@ package upgrade -// Hardcoded upgrade parameters +import ( + "encoding/binary" + "fmt" + "sync/atomic" +) + +// Upgrade parameters. These are written at runtime by the consensus / blocksync paths and read +// concurrently by multiple reactors and the derivation loop, so all access goes through atomic +// load/store. The variables are unexported; callers use the accessor functions below rather than +// touching them directly, which keeps every read/write race-free. var ( - // UpgradeBlockHeight is the block height at which the consensus switches to sequencer mode. - // Default is -1 (upgrade disabled). Set via --consensus.switchHeight flag. - // When < 0, the upgrade never activates. - UpgradeBlockHeight int64 = -1 + // upgradeBlockHeight is the block height at which the consensus switches to sequencer mode. + // Default is -1 (upgrade disabled). It is discovered at runtime once upgradeBlockTime is + // crossed (see finalizeCommit / blocksync) and persisted via the wired store so the boundary + // survives restarts. When < 0, the height-based upgrade check is disabled. + upgradeBlockHeight int64 = -1 + + // upgradeBlockTime is the block timestamp (Unix milliseconds) at which the consensus switches + // to sequencer mode. Set once at node startup via SetUpgradeBlockTime. When <= 0, the + // timestamp-based upgrade check is disabled. + upgradeBlockTime int64 + + // store, when wired via SetStore, persists upgradeBlockHeight across restarts. Set once in + // SetStore at node startup (before the consensus/blocksync goroutines start), then only read. + store Store + + // upgradeHeightKey is the key under which upgradeBlockHeight is persisted. It is a plain key + // in the state DB and does not collide with state-store keys (stateKey, validatorsKey, ...). + upgradeHeightKey = []byte("upgrade/block_height") ) -// IsUpgraded returns true if the given height is at or after the upgrade height. -// Returns false when UpgradeBlockHeight < 0 (upgrade disabled). +// Store is the minimal key/value subset of the node's database that the upgrade package needs. +// Keeping it tiny lets this package stay dependency-light (no tm-db import) and trivially testable +// with an in-memory map. tm-db.DB satisfies it. +type Store interface { + Get([]byte) ([]byte, error) + Set([]byte, []byte) error +} + +// UpgradeBlockHeight returns the current upgrade block height (-1 when not yet discovered). +func UpgradeBlockHeight() int64 { return atomic.LoadInt64(&upgradeBlockHeight) } + +// UpgradeBlockTime returns the configured upgrade block time in Unix milliseconds (0 = disabled). +func UpgradeBlockTime() int64 { return atomic.LoadInt64(&upgradeBlockTime) } + +// SetStore wires persistence and restores any previously persisted upgrade height. +// Call once at node startup, before consensus / blocksync run. +// +// Fail-hard policy on the critical upgrade-height data: +// - a DB read error (IO) -> return error (treating it as "nothing persisted" would silently +// reset the boundary and let the node re-run PBFT past the upgrade after a restart); +// - a non-empty value of unexpected length -> corrupt -> return error rather than ignore it; +// - an absent key (len 0) is normal on first run and leaves the height at its current value. +func SetStore(s Store) error { + store = s + bz, err := s.Get(upgradeHeightKey) + if err != nil { + return fmt.Errorf("upgrade: read persisted block height: %w", err) + } + switch len(bz) { + case 0: + // absent: first run, nothing to restore. + case 8: + atomic.StoreInt64(&upgradeBlockHeight, int64(binary.BigEndian.Uint64(bz))) + default: + return fmt.Errorf("upgrade: corrupt persisted block height: got %d bytes, want 8", len(bz)) + } + return nil +} + +// IsUpgraded returns true if the given height is past the upgrade height. +// Returns false when the upgrade height is not yet set (< 0, upgrade disabled). func IsUpgraded(height int64) bool { - if UpgradeBlockHeight < 0 { + h := atomic.LoadInt64(&upgradeBlockHeight) + if h < 0 { + return false + } + return height > h +} + +// IsUpgradedByTs returns true if the given block timestamp (Unix ms) is at or after the +// upgrade time. Returns false when the upgrade time is not set (<= 0, upgrade disabled). +func IsUpgradedByTs(timestamp int64) bool { + t := atomic.LoadInt64(&upgradeBlockTime) + if t <= 0 { return false } - return height >= UpgradeBlockHeight + return timestamp >= t } -// SetUpgradeBlockHeight sets the upgrade block height. +// SetUpgradeBlockHeight sets the upgrade block height and, if a store is wired, persists it so the +// discovered boundary survives restarts. +// +// The height is critical consensus state: a node that loses it on restart re-runs PBFT past the +// upgrade or mishandles the boundary. If the write fails we therefore panic rather than continue +// with in-memory-only state that would silently diverge after a restart (same fail-hard policy as +// the signature store on a failed persist). func SetUpgradeBlockHeight(height int64) { - UpgradeBlockHeight = height + atomic.StoreInt64(&upgradeBlockHeight, height) + if store != nil { + var bz [8]byte + binary.BigEndian.PutUint64(bz[:], uint64(height)) + if err := store.Set(upgradeHeightKey, bz[:]); err != nil { + panic(fmt.Sprintf("upgrade: persist block height %d failed: %v", height, err)) + } + } +} + +// SetUpgradeBlockTime sets the upgrade block time (Unix milliseconds). +func SetUpgradeBlockTime(timestamp int64) { + atomic.StoreInt64(&upgradeBlockTime, timestamp) } diff --git a/upgrade/upgrade_test.go b/upgrade/upgrade_test.go new file mode 100644 index 00000000000..12de95f9303 --- /dev/null +++ b/upgrade/upgrade_test.go @@ -0,0 +1,162 @@ +package upgrade + +import ( + "errors" + "testing" +) + +// memStore is an in-memory Store for tests. +type memStore map[string][]byte + +func (m memStore) Get(k []byte) ([]byte, error) { return m[string(k)], nil } +func (m memStore) Set(k, v []byte) error { m[string(k)] = v; return nil } + +// errStore is a Store whose Get/Set can be made to fail, to exercise the +// fail-hard policy on the critical upgrade-height data. +type errStore struct { + data map[string][]byte + getErr error + setErr error +} + +func (s *errStore) Get(k []byte) ([]byte, error) { + if s.getErr != nil { + return nil, s.getErr + } + return s.data[string(k)], nil +} + +func (s *errStore) Set(k, v []byte) error { + if s.setErr != nil { + return s.setErr + } + s.data[string(k)] = v + return nil +} + +// reset restores package globals so tests do not leak state into each other. +func reset() { + upgradeBlockHeight = -1 + upgradeBlockTime = 0 + store = nil +} + +func TestIsUpgradedByTs(t *testing.T) { + reset() + defer reset() + + // Disabled when unset. + if IsUpgradedByTs(1) { + t.Fatal("expected disabled when upgrade time <= 0") + } + + SetUpgradeBlockTime(1000) + cases := []struct { + ts int64 + want bool + }{ + {999, false}, // before + {1000, true}, // exactly at boundary (>=) + {1001, true}, // after + } + for _, c := range cases { + if got := IsUpgradedByTs(c.ts); got != c.want { + t.Fatalf("IsUpgradedByTs(%d) = %v, want %v", c.ts, got, c.want) + } + } +} + +func TestSetUpgradeBlockHeightPersists(t *testing.T) { + reset() + defer reset() + + db := memStore{} + + // Fresh store, nothing persisted: load leaves the -1 sentinel. + if err := SetStore(db); err != nil { + t.Fatalf("SetStore: %v", err) + } + if UpgradeBlockHeight() != -1 { + t.Fatalf("expected -1 after empty load, got %d", UpgradeBlockHeight()) + } + + // Discovering the boundary persists it. + SetUpgradeBlockHeight(42) + if UpgradeBlockHeight() != 42 { + t.Fatalf("expected 42, got %d", UpgradeBlockHeight()) + } + + // Simulate a restart: clear globals, re-wire the same DB, expect the height restored. + upgradeBlockHeight = -1 + store = nil + if err := SetStore(db); err != nil { + t.Fatalf("SetStore: %v", err) + } + if UpgradeBlockHeight() != 42 { + t.Fatalf("expected 42 restored after restart, got %d", UpgradeBlockHeight()) + } + if !IsUpgraded(43) || IsUpgraded(42) { + t.Fatalf("IsUpgraded boundary wrong after restore: 42=%v 43=%v", IsUpgraded(42), IsUpgraded(43)) + } +} + +func TestSetUpgradeBlockHeightNoStore(t *testing.T) { + reset() + defer reset() + + // No store wired: must not panic, just sets the global. + SetUpgradeBlockHeight(7) + if UpgradeBlockHeight() != 7 { + t.Fatalf("expected 7, got %d", UpgradeBlockHeight()) + } +} + +// A DB read failure must propagate (not be swallowed as "nothing persisted"), +// otherwise a restart would silently reset the boundary. +func TestSetStoreReadErrorPropagates(t *testing.T) { + reset() + defer reset() + + s := &errStore{getErr: errors.New("db read failed")} + if err := SetStore(s); err == nil { + t.Fatal("expected error when DB read fails") + } + if UpgradeBlockHeight() != -1 { + t.Fatalf("expected -1 unchanged on read error, got %d", UpgradeBlockHeight()) + } +} + +// A persisted value of unexpected (non-zero, non-8) length is corruption and must error, +// not be silently ignored (which would leave the boundary at the wrong value). +func TestSetStoreCorruptLengthErrors(t *testing.T) { + reset() + defer reset() + + db := memStore{} + db[string(upgradeHeightKey)] = []byte{0x1, 0x2, 0x3, 0x4} // 4 bytes -> corrupt + if err := SetStore(db); err == nil { + t.Fatal("expected error on corrupt (non-8-byte) persisted height") + } + if UpgradeBlockHeight() != -1 { + t.Fatalf("expected -1 unchanged on corrupt value, got %d", UpgradeBlockHeight()) + } +} + +// A failed persist of the critical upgrade height must stop the node (panic), +// not continue with in-memory-only state that diverges after a restart. +func TestSetUpgradeBlockHeightPanicsOnWriteFailure(t *testing.T) { + reset() + defer reset() + + s := &errStore{data: map[string][]byte{}, setErr: errors.New("disk full")} + if err := SetStore(s); err != nil { + t.Fatalf("SetStore: %v", err) + } + + defer func() { + if r := recover(); r == nil { + t.Fatal("expected panic when persisting block height fails") + } + }() + SetUpgradeBlockHeight(99) +}