Skip to content

Commit 06c7d60

Browse files
committed
discovery: fix endless loop in gossip syncer on context cancellation
This commit fixes a critical bug where the channelGraphSyncer goroutine would enter an endless loop when context cancellation or peer disconnect errors occurred during the syncingChans or queryNewChannels states. The root cause was that state handler functions (handleSyncingChans and synchronizeChanIDs) did not return errors to the main goroutine loop. When these functions encountered fatal errors like context cancellation, they would log the error and return early without changing the syncer's state. This caused the main loop to immediately re-enter the same state handler, encounter the same error, and loop indefinitely while spamming error logs. The fix makes error handling explicit by having state handlers return errors. The main channelGraphSyncer loop now checks these errors and exits cleanly when fatal errors occur. We return any error (not just context cancellation) because fatal errors can manifest in multiple forms: context.Canceled, ErrGossipSyncerExiting from the rate limiter, lnpeer.ErrPeerExiting from Brontide, or network errors like connection closed. This approach matches the error handling pattern already used in other goroutines like replyHandler.
1 parent f938e40 commit 06c7d60

File tree

2 files changed

+45
-12
lines changed

2 files changed

+45
-12
lines changed

discovery/syncer.go

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -492,15 +492,19 @@ func (g *GossipSyncer) Stop() {
492492

493493
// handleSyncingChans handles the state syncingChans for the GossipSyncer. When
494494
// in this state, we will send a QueryChannelRange msg to our peer and advance
495-
// the syncer's state to waitingQueryRangeReply.
496-
func (g *GossipSyncer) handleSyncingChans(ctx context.Context) {
495+
// the syncer's state to waitingQueryRangeReply. Returns an error if a fatal
496+
// error occurs that should cause the goroutine to exit.
497+
func (g *GossipSyncer) handleSyncingChans(ctx context.Context) error {
497498
// Prepare the query msg.
498499
queryRangeMsg, err := g.genChanRangeQuery(
499500
ctx, g.genHistoricalChanRangeQuery,
500501
)
501502
if err != nil {
502503
log.Errorf("Unable to gen chan range query: %v", err)
503-
return
504+
505+
// Any error here is likely fatal (context cancelled, db error,
506+
// etc.), so return it to exit the goroutine cleanly.
507+
return err
504508
}
505509

506510
// Acquire a lock so the following state transition is atomic.
@@ -517,12 +521,18 @@ func (g *GossipSyncer) handleSyncingChans(ctx context.Context) {
517521
err = g.sendToPeer(ctx, queryRangeMsg)
518522
if err != nil {
519523
log.Errorf("Unable to send chan range query: %v", err)
520-
return
524+
525+
// Any send error (peer exiting, connection closed, rate
526+
// limiter signaling exit, etc.) is fatal, so return it to
527+
// exit the goroutine cleanly.
528+
return err
521529
}
522530

523531
// With the message sent successfully, we'll transition into the next
524532
// state where we wait for their reply.
525533
g.setSyncState(waitingQueryRangeReply)
534+
535+
return nil
526536
}
527537

528538
// channelGraphSyncer is the main goroutine responsible for ensuring that we
@@ -545,7 +555,14 @@ func (g *GossipSyncer) channelGraphSyncer(ctx context.Context) {
545555
// understand, as we'll as responding to any other queries by
546556
// them.
547557
case syncingChans:
548-
g.handleSyncingChans(ctx)
558+
err := g.handleSyncingChans(ctx)
559+
if err != nil {
560+
log.Debugf("GossipSyncer(%x): exiting due to "+
561+
"error in syncingChans: %v",
562+
g.cfg.peerPub[:], err)
563+
564+
return
565+
}
549566

550567
// In this state, we've sent out our initial channel range
551568
// query and are waiting for the final response from the remote
@@ -593,7 +610,14 @@ func (g *GossipSyncer) channelGraphSyncer(ctx context.Context) {
593610
// First, we'll attempt to continue our channel
594611
// synchronization by continuing to send off another
595612
// query chunk.
596-
done := g.synchronizeChanIDs(ctx)
613+
done, err := g.synchronizeChanIDs(ctx)
614+
if err != nil {
615+
log.Debugf("GossipSyncer(%x): exiting due to "+
616+
"error in queryNewChannels: %v",
617+
g.cfg.peerPub[:], err)
618+
619+
return
620+
}
597621

598622
// If this wasn't our last query, then we'll need to
599623
// transition to our waiting state.
@@ -819,16 +843,18 @@ func (g *GossipSyncer) sendGossipTimestampRange(ctx context.Context,
819843
// range. This method will be called continually until the entire range has
820844
// been queried for with a response received. We'll chunk our requests as
821845
// required to ensure they fit into a single message. We may re-renter this
822-
// state in the case that chunking is required.
823-
func (g *GossipSyncer) synchronizeChanIDs(ctx context.Context) bool {
846+
// state in the case that chunking is required. Returns true if synchronization
847+
// is complete, and an error if a fatal error occurs that should cause the
848+
// goroutine to exit.
849+
func (g *GossipSyncer) synchronizeChanIDs(ctx context.Context) (bool, error) {
824850
// If we're in this state yet there are no more new channels to query
825851
// for, then we'll transition to our final synced state and return true
826852
// to signal that we're fully synchronized.
827853
if len(g.newChansToQuery) == 0 {
828854
log.Infof("GossipSyncer(%x): no more chans to query",
829855
g.cfg.peerPub[:])
830856

831-
return true
857+
return true, nil
832858
}
833859

834860
// Otherwise, we'll issue our next chunked query to receive replies
@@ -864,9 +890,14 @@ func (g *GossipSyncer) synchronizeChanIDs(ctx context.Context) bool {
864890
})
865891
if err != nil {
866892
log.Errorf("Unable to sync chan IDs: %v", err)
893+
894+
// Any send error (peer exiting, connection closed, rate
895+
// limiter signaling exit, etc.) is fatal, so return it to
896+
// exit the goroutine cleanly.
897+
return false, err
867898
}
868899

869-
return false
900+
return false, nil
870901
}
871902

872903
// isLegacyReplyChannelRange determines where a ReplyChannelRange message is

discovery/syncer_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,7 +1518,8 @@ func TestGossipSyncerSynchronizeChanIDs(t *testing.T) {
15181518

15191519
for i := 0; i < chunkSize*2; i += 2 {
15201520
// With our set up complete, we'll request a sync of chan ID's.
1521-
done := syncer.synchronizeChanIDs(t.Context())
1521+
done, err := syncer.synchronizeChanIDs(t.Context())
1522+
require.NoError(t, err)
15221523

15231524
// At this point, we shouldn't yet be done as only 2 items
15241525
// should have been queried for.
@@ -1565,7 +1566,8 @@ func TestGossipSyncerSynchronizeChanIDs(t *testing.T) {
15651566
}
15661567

15671568
// If we issue another query, the syncer should tell us that it's done.
1568-
done := syncer.synchronizeChanIDs(t.Context())
1569+
done, err := syncer.synchronizeChanIDs(t.Context())
1570+
require.NoError(t, err)
15691571
if done {
15701572
t.Fatalf("syncer should be finished!")
15711573
}

0 commit comments

Comments
 (0)