
Commit 6f20a00

joostjager and claude committed

Include ChannelManager in flush() write batch

Extends Persist::flush() to accept channel_manager_bytes, allowing the channel manager to be written in the same write_batch() call as channel monitors. The channel manager is always written first in the batch to ensure proper ordering. This removes the separate channel manager persistence from the background processor and combines it with the monitor flush, reducing round trips.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

1 parent c8248bb

4 files changed: 113 additions & 135 deletions
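
Before the per-file diffs, a toy model of one background-processor iteration under the new API. The Toy* types below are illustrative stand-ins, not LDK types; only the flush(count, channel_manager_bytes) shape and the capture-then-flush ordering are taken from this commit.

use std::io;

// Toy stand-ins for the relevant LDK surfaces (illustrative only).
struct ToyChainMonitor {
    queued_monitor_writes: usize,
}

struct ToyChannelManager {
    needs_persistence: bool,
}

impl ToyChainMonitor {
    fn pending_write_count(&self) -> usize {
        self.queued_monitor_writes
    }

    // One call now covers both artifacts: the serialized manager rides along
    // in the same batch as up to `count` monitor writes, manager entry first.
    fn flush(&self, count: usize, manager_bytes: Vec<u8>) -> Result<(), io::Error> {
        println!("batch: 1 manager entry ({} bytes) + {} monitor writes", manager_bytes.len(), count);
        Ok(())
    }
}

impl ToyChannelManager {
    fn get_and_clear_needs_persistence(&mut self) -> bool {
        std::mem::take(&mut self.needs_persistence)
    }

    fn encode(&self) -> Vec<u8> {
        vec![0u8; 64] // placeholder serialization
    }
}

fn main() -> Result<(), io::Error> {
    let monitor = ToyChainMonitor { queued_monitor_writes: 3 };
    let mut manager = ToyChannelManager { needs_persistence: true };

    // Capture the monitor count first; monitor writes that arrive afterwards
    // stay queued and are left for the next iteration, matching the commit's
    // per-iteration flush logic.
    let pending = monitor.pending_write_count();
    let needs_persist = manager.get_and_clear_needs_persistence();
    if pending > 0 || needs_persist {
        monitor.flush(pending, manager.encode())?;
    }
    Ok(())
}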


lightning-background-processor/src/lib.rs

Lines changed: 48 additions & 98 deletions
@@ -57,11 +57,10 @@ use lightning::sign::{
 use lightning::util::async_poll::MaybeSend;
 use lightning::util::logger::Logger;
 use lightning::util::persist::{
-    KVStore, KVStoreSync, KVStoreSyncWrapper, CHANNEL_MANAGER_PERSISTENCE_KEY,
-    CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE, CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
-    NETWORK_GRAPH_PERSISTENCE_KEY, NETWORK_GRAPH_PERSISTENCE_PRIMARY_NAMESPACE,
-    NETWORK_GRAPH_PERSISTENCE_SECONDARY_NAMESPACE, SCORER_PERSISTENCE_KEY,
-    SCORER_PERSISTENCE_PRIMARY_NAMESPACE, SCORER_PERSISTENCE_SECONDARY_NAMESPACE,
+    KVStore, KVStoreSync, KVStoreSyncWrapper, NETWORK_GRAPH_PERSISTENCE_KEY,
+    NETWORK_GRAPH_PERSISTENCE_PRIMARY_NAMESPACE, NETWORK_GRAPH_PERSISTENCE_SECONDARY_NAMESPACE,
+    SCORER_PERSISTENCE_KEY, SCORER_PERSISTENCE_PRIMARY_NAMESPACE,
+    SCORER_PERSISTENCE_SECONDARY_NAMESPACE,
 };
 use lightning::util::sweep::{OutputSweeper, OutputSweeperSync};
 use lightning::util::wakers::Future;

@@ -1150,44 +1149,13 @@ where
         None => {},
     }

-    let mut futures = Joiner::new();
+    // Type A is unused but needed for inference - we use the same boxed future type as other slots
+    let mut futures: Joiner<lightning::io::Error, core::pin::Pin<Box<dyn core::future::Future<Output = Result<(), lightning::io::Error>> + Send + 'static>>, _, _, _, _> = Joiner::new();

-    // Capture the number of pending monitor writes before persisting the channel manager.
-    // We'll only flush this many writes after the manager is persisted, to avoid flushing
-    // monitor updates that arrived after the manager state was captured.
+    // Capture the number of pending monitor writes and whether manager needs persistence.
+    // We'll flush monitors and manager together in a single batch after other tasks complete.
     let pending_monitor_writes = chain_monitor.pending_write_count();
-
-    if channel_manager.get_cm().get_and_clear_needs_persistence() {
-        log_trace!(logger, "Persisting ChannelManager...");
-
-        let fut = async {
-            kv_store
-                .write(
-                    CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-                    CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
-                    CHANNEL_MANAGER_PERSISTENCE_KEY,
-                    channel_manager.get_cm().encode(),
-                )
-                .await
-        };
-        // TODO: Once our MSRV is 1.68 we should be able to drop the Box
-        let mut fut = Box::pin(fut);
-
-        // Because persisting the ChannelManager is important to avoid accidental
-        // force-closures, go ahead and poll the future once before we do slightly more
-        // CPU-intensive tasks in the form of NetworkGraph pruning or scorer time-stepping
-        // below. This will get it moving but won't block us for too long if the underlying
-        // future is actually async.
-        use core::future::Future;
-        let mut waker = dummy_waker();
-        let mut ctx = task::Context::from_waker(&mut waker);
-        match core::pin::Pin::new(&mut fut).poll(&mut ctx) {
-            task::Poll::Ready(res) => futures.set_a_res(res),
-            task::Poll::Pending => futures.set_a(fut),
-        }
-
-        log_trace!(logger, "Done persisting ChannelManager.");
-    }
+    let needs_manager_persist = channel_manager.get_cm().get_and_clear_needs_persistence();

     // Note that we want to archive stale ChannelMonitors and run a network graph prune once
     // not long after startup before falling back to their usual infrequent runs. This avoids

@@ -1354,11 +1322,13 @@ where
         res?;
     }

-    // Flush the monitor writes that were pending before we persisted the channel manager.
-    // Any writes that arrived after are left in the queue for the next iteration.
-    if pending_monitor_writes > 0 {
-        match chain_monitor.flush(pending_monitor_writes) {
-            Ok(()) => log_trace!(logger, "Flushed {} monitor writes", pending_monitor_writes),
+    // Flush monitors and manager together in a single batch.
+    // Any monitor writes that arrived after are left in the queue for the next iteration.
+    if pending_monitor_writes > 0 || needs_manager_persist {
+        log_trace!(logger, "Persisting ChannelManager and flushing {} monitor writes...", pending_monitor_writes);
+        let manager_bytes = channel_manager.get_cm().encode();
+        match chain_monitor.flush(pending_monitor_writes, manager_bytes) {
+            Ok(()) => log_trace!(logger, "Flushed ChannelManager and {} monitor writes", pending_monitor_writes),
             Err(e) => log_error!(logger, "Failed to flush chain monitor: {}", e),
         }
     }

@@ -1416,25 +1386,18 @@ where
     }
     log_trace!(logger, "Terminating background processor.");

-    // After we exit, ensure we persist the ChannelManager one final time - this avoids
-    // some races where users quit while channel updates were in-flight, with
-    // ChannelMonitor update(s) persisted without a corresponding ChannelManager update.
-    kv_store
-        .write(
-            CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-            CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
-            CHANNEL_MANAGER_PERSISTENCE_KEY,
-            channel_manager.get_cm().encode(),
-        )
-        .await?;
-
-    // Flush all pending monitor writes after final channel manager persistence.
+    // After we exit, ensure we persist the ChannelManager one final time along with any
+    // pending monitor writes - this avoids some races where users quit while channel updates
+    // were in-flight, with ChannelMonitor update(s) persisted without a corresponding
+    // ChannelManager update.
     let pending_monitor_writes = chain_monitor.pending_write_count();
-    if pending_monitor_writes > 0 {
-        match chain_monitor.flush(pending_monitor_writes) {
-            Ok(()) => log_trace!(logger, "Flushed {} monitor writes", pending_monitor_writes),
-            Err(e) => log_error!(logger, "Failed to flush chain monitor: {}", e),
-        }
+    let manager_bytes = channel_manager.get_cm().encode();
+    match chain_monitor.flush(pending_monitor_writes, manager_bytes) {
+        Ok(()) => log_trace!(logger, "Final flush: ChannelManager and {} monitor writes", pending_monitor_writes),
+        Err(e) => {
+            log_error!(logger, "Failed final flush: {}", e);
+            return Err(e);
+        },
     }

     if let Some(ref scorer) = scorer {

@@ -1746,25 +1709,17 @@ impl BackgroundProcessor {
                 channel_manager.get_cm().timer_tick_occurred();
                 last_freshness_call = Instant::now();
             }
-            // Capture the number of pending monitor writes before persisting the channel manager.
+            // Capture the number of pending monitor writes and whether manager needs persistence.
             let pending_monitor_writes = chain_monitor.pending_write_count();
+            let needs_manager_persist = channel_manager.get_cm().get_and_clear_needs_persistence();

-            if channel_manager.get_cm().get_and_clear_needs_persistence() {
-                log_trace!(logger, "Persisting ChannelManager...");
-                (kv_store.write(
-                    CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-                    CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
-                    CHANNEL_MANAGER_PERSISTENCE_KEY,
-                    channel_manager.get_cm().encode(),
-                ))?;
-                log_trace!(logger, "Done persisting ChannelManager.");
-            }
-
-            // Flush the monitor writes that were pending before we persisted the channel manager.
-            if pending_monitor_writes > 0 {
-                match chain_monitor.flush(pending_monitor_writes) {
+            // Flush monitors and manager together in a single batch.
+            if pending_monitor_writes > 0 || needs_manager_persist {
+                log_trace!(logger, "Persisting ChannelManager and flushing {} monitor writes...", pending_monitor_writes);
+                let manager_bytes = channel_manager.get_cm().encode();
+                match chain_monitor.flush(pending_monitor_writes, manager_bytes) {
                     Ok(()) => {
-                        log_trace!(logger, "Flushed {} monitor writes", pending_monitor_writes)
+                        log_trace!(logger, "Flushed ChannelManager and {} monitor writes", pending_monitor_writes)
                     },
                     Err(e) => log_error!(logger, "Failed to flush chain monitor: {}", e),
                 }

@@ -1881,25 +1836,20 @@ impl BackgroundProcessor {
             }
         }

-        // After we exit, ensure we persist the ChannelManager one final time - this avoids
-        // some races where users quit while channel updates were in-flight, with
-        // ChannelMonitor update(s) persisted without a corresponding ChannelManager update.
-        kv_store.write(
-            CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-            CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
-            CHANNEL_MANAGER_PERSISTENCE_KEY,
-            channel_manager.get_cm().encode(),
-        )?;
-
-        // Flush all pending monitor writes after final channel manager persistence.
+        // After we exit, ensure we persist the ChannelManager one final time along with any
+        // pending monitor writes - this avoids some races where users quit while channel updates
+        // were in-flight, with ChannelMonitor update(s) persisted without a corresponding
+        // ChannelManager update.
         let pending_monitor_writes = chain_monitor.pending_write_count();
-        if pending_monitor_writes > 0 {
-            match chain_monitor.flush(pending_monitor_writes) {
-                Ok(()) => {
-                    log_trace!(logger, "Flushed {} monitor writes", pending_monitor_writes)
-                },
-                Err(e) => log_error!(logger, "Failed to flush chain monitor: {}", e),
-            }
+        let manager_bytes = channel_manager.get_cm().encode();
+        match chain_monitor.flush(pending_monitor_writes, manager_bytes) {
+            Ok(()) => {
+                log_trace!(logger, "Final flush: ChannelManager and {} monitor writes", pending_monitor_writes)
+            },
+            Err(e) => {
+                log_error!(logger, "Failed final flush: {}", e);
+                return Err(e.into());
+            },
         }

         if let Some(ref scorer) = scorer {
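
On the shutdown path above, the combined flush now runs unconditionally and its error propagates instead of only being logged. A sketch of that invariant, reusing the Toy* stand-ins from the example near the top of this page:

// Toy shutdown-path sketch (reuses the Toy* types from the earlier example).
// The final flush runs even when nothing is queued, because the batch always
// carries the ChannelManager entry; a failure here aborts shutdown with an
// error instead of being swallowed.
fn shutdown_flush(
    monitor: &ToyChainMonitor, manager: &ToyChannelManager,
) -> Result<(), std::io::Error> {
    let pending = monitor.pending_write_count();
    // No needs-persistence check on shutdown: persist the manager regardless,
    // closing the race where monitors land on disk without a matching manager.
    monitor.flush(pending, manager.encode())
}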

lightning/src/chain/chainmonitor.rs

Lines changed: 15 additions & 8 deletions
@@ -209,13 +209,17 @@ pub trait Persist<ChannelSigner: EcdsaChannelSigner> {

     /// Flushes pending writes to the underlying storage.
     ///
-    /// The `count` parameter specifies how many pending writes to flush.
+    /// The `count` parameter specifies how many pending monitor writes to flush.
+    /// The `channel_manager_bytes` parameter contains the serialized channel manager to persist.
+    ///
+    /// The channel manager is always written first in the batch, before any monitor writes,
+    /// to ensure proper ordering (manager state should be at least as recent as monitors on disk).
     ///
     /// For implementations that queue writes (returning [`ChannelMonitorUpdateStatus::InProgress`]
     /// from persist methods), this method should write queued data to storage.
     ///
     /// Returns the list of completed monitor updates (channel_id, update_id) that were flushed.
-    fn flush(&self, count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error>;
+    fn flush(&self, count: usize, channel_manager_bytes: Vec<u8>) -> Result<Vec<(ChannelId, u64)>, io::Error>;
 }

 struct MonitorHolder<ChannelSigner: EcdsaChannelSigner> {

@@ -347,8 +351,8 @@ where
         self.persister.pending_write_count()
     }

-    fn flush(&self, count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error> {
-        crate::util::persist::poll_sync_future(self.persister.flush(count))
+    fn flush(&self, count: usize, channel_manager_bytes: Vec<u8>) -> Result<Vec<(ChannelId, u64)>, io::Error> {
+        crate::util::persist::poll_sync_future(self.persister.flush(count, channel_manager_bytes))
     }
 }

@@ -760,14 +764,17 @@ where

     /// Flushes pending writes to the underlying storage.
     ///
-    /// If `count` is `Some(n)`, only the first `n` pending writes are flushed.
-    /// If `count` is `None`, all pending writes are flushed.
+    /// The `count` parameter specifies how many pending monitor writes to flush.
+    /// The `channel_manager_bytes` parameter contains the serialized channel manager to persist.
+    ///
+    /// The channel manager is always written first in the batch, before any monitor writes,
+    /// to ensure proper ordering (manager state should be at least as recent as monitors on disk).
     ///
     /// For persisters that queue writes (returning [`ChannelMonitorUpdateStatus::InProgress`]
     /// from persist methods), this method writes queued data to storage and signals
     /// completion to the channel manager via [`Self::channel_monitor_updated`].
-    pub fn flush(&self, count: usize) -> Result<(), io::Error> {
-        let completed = self.persister.flush(count)?;
+    pub fn flush(&self, count: usize, channel_manager_bytes: Vec<u8>) -> Result<(), io::Error> {
+        let completed = self.persister.flush(count, channel_manager_bytes)?;
         for (channel_id, update_id) in completed {
             let _ = self.channel_monitor_updated(channel_id, update_id);
         }
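
The public ChainMonitor::flush wrapper above persists first and then reports each flushed monitor update as completed. A toy model of that completion loop follows; ToyPersister and ToyChainMonitor are illustrative stand-ins, not LDK API, and the completion callback is reduced to a print.

use std::io;

type ChannelId = [u8; 32];

struct ToyPersister;

impl ToyPersister {
    // Stand-in for Persist::flush: the manager entry would lead the batch,
    // followed by up to `count` queued monitor writes; it reports which
    // monitor updates hit disk.
    fn flush(
        &self, count: usize, _channel_manager_bytes: Vec<u8>,
    ) -> Result<Vec<(ChannelId, u64)>, io::Error> {
        // Pretend `count` queued updates for one channel were written.
        Ok((0..count as u64).map(|update_id| ([0u8; 32], update_id)).collect())
    }
}

struct ToyChainMonitor {
    persister: ToyPersister,
}

impl ToyChainMonitor {
    // Mirrors the pub fn flush above: persist, then signal completion for
    // each flushed monitor update so the channel manager can act on it.
    fn flush(&self, count: usize, channel_manager_bytes: Vec<u8>) -> Result<(), io::Error> {
        let completed = self.persister.flush(count, channel_manager_bytes)?;
        for (channel_id, update_id) in completed {
            self.channel_monitor_updated(channel_id, update_id);
        }
        Ok(())
    }

    fn channel_monitor_updated(&self, _channel_id: ChannelId, update_id: u64) {
        println!("monitor update {} is now durable", update_id);
    }
}

fn main() -> Result<(), io::Error> {
    let monitor = ToyChainMonitor { persister: ToyPersister };
    monitor.flush(2, vec![0u8; 64])
}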

lightning/src/util/persist.rs

Lines changed: 48 additions & 27 deletions
@@ -563,8 +563,15 @@ impl<ChannelSigner: EcdsaChannelSigner, K: KVStoreSync + ?Sized> Persist<Channel
         );
     }

-    fn flush(&self, _count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error> {
-        // KVStoreSync implementations persist immediately, so there's nothing to flush.
+    fn flush(&self, _count: usize, channel_manager_bytes: Vec<u8>) -> Result<Vec<(ChannelId, u64)>, io::Error> {
+        // KVStoreSync implementations persist immediately, so there's nothing to flush
+        // for monitors. However, we still need to persist the channel manager.
+        self.write(
+            CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+            CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
+            CHANNEL_MANAGER_PERSISTENCE_KEY,
+            channel_manager_bytes,
+        )?;
         Ok(Vec::new())
     }
 }

@@ -902,8 +909,8 @@ where
         self.0.pending_write_count()
     }

-    fn flush(&self, count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error> {
-        poll_sync_future(self.0.flush(count))
+    fn flush(&self, count: usize, channel_manager_bytes: Vec<u8>) -> Result<Vec<(ChannelId, u64)>, io::Error> {
+        poll_sync_future(self.0.flush(count, channel_manager_bytes))
     }
 }

@@ -1116,22 +1123,35 @@ where

     /// Flushes pending writes to the underlying [`KVStore`].
     ///
-    /// If `count` is `Some(n)`, only the first `n` pending writes are flushed.
-    /// If `count` is `None`, all pending writes are flushed.
+    /// The `count` parameter specifies how many pending monitor writes to flush.
+    /// The `channel_manager_bytes` parameter contains the serialized channel manager to persist.
+    ///
+    /// The channel manager is always written first in the batch, before any monitor writes,
+    /// to ensure proper ordering (manager state should be at least as recent as monitors on disk).
     ///
     /// This method should be called after one or more calls that queue persist operations
     /// to actually write the data to storage.
     ///
     /// Returns the list of completed monitor updates (channel_id, update_id) that were flushed.
-    pub async fn flush(&self, count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error> {
+    pub async fn flush(
+        &self, count: usize, channel_manager_bytes: Vec<u8>,
+    ) -> Result<Vec<(ChannelId, u64)>, io::Error> {
         let pending = {
             let mut queue = self.0.pending_writes.lock().unwrap();
             let n = count.min(queue.len());
             queue.drain(..n).collect::<Vec<_>>()
         };

         // Phase 1: Collect all batch entries
-        let mut batch_entries = Vec::with_capacity(pending.len());
+        // Channel manager goes FIRST to ensure it's written before monitors
+        let mut batch_entries = Vec::with_capacity(pending.len() + 1);
+        batch_entries.push(BatchWriteEntry::new(
+            CHANNEL_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+            CHANNEL_MANAGER_PERSISTENCE_SECONDARY_NAMESPACE,
+            CHANNEL_MANAGER_PERSISTENCE_KEY,
+            channel_manager_bytes,
+        ));
+
         let mut stale_cleanups = Vec::new();

         for (i, write) in pending.iter().enumerate() {

@@ -1164,29 +1184,30 @@ where
         }

         // Phase 2: Execute batch write
-        let successful_writes = if !batch_entries.is_empty() {
-            let result = self.0.kv_store.write_batch(batch_entries).await;
-            if let Some(err) = result.error {
-                // Re-queue failed and subsequent writes
-                let failed_writes =
-                    pending.into_iter().skip(result.successful_writes).collect::<Vec<_>>();
-                if !failed_writes.is_empty() {
-                    let mut queue = self.0.pending_writes.lock().unwrap();
-                    // Prepend failed writes back to the front of the queue
-                    for write in failed_writes.into_iter().rev() {
-                        queue.insert(0, write);
-                    }
+        let result = self.0.kv_store.write_batch(batch_entries).await;
+        if let Some(err) = result.error {
+            // The first entry is the channel manager, so successful_writes includes it
+            // Monitor writes start at index 1, so subtract 1 to get monitor success count
+            let successful_monitor_writes =
+                if result.successful_writes > 0 { result.successful_writes - 1 } else { 0 };
+            // Re-queue failed and subsequent monitor writes
+            let failed_writes =
+                pending.into_iter().skip(successful_monitor_writes).collect::<Vec<_>>();
+            if !failed_writes.is_empty() {
+                let mut queue = self.0.pending_writes.lock().unwrap();
+                // Prepend failed writes back to the front of the queue
+                for write in failed_writes.into_iter().rev() {
+                    queue.insert(0, write);
                 }
-                return Err(err);
             }
-            result.successful_writes
-        } else {
-            0
-        };
+            return Err(err);
+        }
+        // Subtract 1 for the channel manager entry to get monitor success count
+        let successful_monitor_writes = result.successful_writes.saturating_sub(1);

         // Phase 3: Cleanup stale updates (only for successfully written monitors)
         for (i, monitor_key, start, end) in stale_cleanups {
-            if i < successful_writes {
+            if i < successful_monitor_writes {
                 for update_id in start..end {
                     let update_name = UpdateName::from(update_id);
                     // Lazy delete - ignore errors as this is just cleanup

@@ -1207,7 +1228,7 @@ where
         // Phase 4: Return completions for successful writes only
         let completions = pending
             .into_iter()
-            .take(successful_writes)
+            .take(successful_monitor_writes)
             .map(|write| match write {
                 PendingWrite::FullMonitor { completion, .. } => completion,
                 PendingWrite::Update { completion, .. } => completion,
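
The index accounting above is easy to check in isolation: the batch holds one channel manager entry at index 0 followed by the queued monitor writes, and write_batch reports how many leading entries succeeded, so the monitor success count is that figure minus the manager slot. A standalone model (names illustrative, not LDK API):

// Standalone model of the flush success accounting (illustrative names).
// Batch layout: index 0 = serialized ChannelManager, indices 1..=n = queued
// monitor writes. write_batch() reports how many leading entries were written
// before any failure, so the monitor count excludes the manager slot.
fn successful_monitor_writes(successful_writes: usize) -> usize {
    successful_writes.saturating_sub(1)
}

fn main() {
    // The manager write itself failed: no monitor writes succeeded either.
    assert_eq!(successful_monitor_writes(0), 0);
    // Only the manager entry landed: zero monitor completions or cleanups.
    assert_eq!(successful_monitor_writes(1), 0);
    // Manager plus the first two monitors landed before the failure.
    assert_eq!(successful_monitor_writes(3), 2);
    println!("accounting model holds");
}

Note that the error path's expression, if successful_writes > 0 { successful_writes - 1 } else { 0 }, computes the same quantity as the success path's saturating_sub(1).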

lightning/src/util/test_utils.rs

Lines changed: 2 additions & 2 deletions
@@ -825,7 +825,7 @@ impl<Signer: sign::ecdsa::EcdsaChannelSigner> Persist<Signer> for WatchtowerPers
         );
     }

-    fn flush(&self, _count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error> {
+    fn flush(&self, _count: usize, _channel_manager_bytes: Vec<u8>) -> Result<Vec<(ChannelId, u64)>, io::Error> {
         Ok(Vec::new())
     }
 }

@@ -892,7 +892,7 @@ impl<Signer: sign::ecdsa::EcdsaChannelSigner> Persist<Signer> for TestPersister
         self.chain_sync_monitor_persistences.lock().unwrap().retain(|x| x != &monitor_name);
     }

-    fn flush(&self, _count: usize) -> Result<Vec<(ChannelId, u64)>, io::Error> {
+    fn flush(&self, _count: usize, _channel_manager_bytes: Vec<u8>) -> Result<Vec<(ChannelId, u64)>, io::Error> {
         Ok(Vec::new())
     }
 }
