From 8f5023a006ed0bdc08432c0c283e28d77ab3bc72 Mon Sep 17 00:00:00 2001
From: Matt Corallo <git@bluematt.me>
Date: Sun, 26 Sep 2021 00:09:17 +0000
Subject: [PATCH 1/2] Avoid disconnecting all peers if user code is slow

In the sample client (and likely other downstream users), event
processing may block on slow operations (e.g. Bitcoin Core RPCs)
and ChannelManager persistence may take some time. This should be
fine, except that we consider this a case of possible backgrounding
and disconnect all of our peers when it happens.

Instead, we here avoid considering event processing time in the
time between PeerManager events.
---
 lightning-background-processor/src/lib.rs | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs
index 2dbc8053b4e..363dfba93ce 100644
--- a/lightning-background-processor/src/lib.rs
+++ b/lightning-background-processor/src/lib.rs
@@ -219,11 +219,17 @@ impl BackgroundProcessor {
 			let mut have_pruned = false;
 
 			loop {
-				peer_manager.process_events();
+				peer_manager.process_events(); // Note that this may block on ChannelManager's locking
 				channel_manager.process_pending_events(&event_handler);
 				chain_monitor.process_pending_events(&event_handler);
+
+				// We wait up to 100ms, but track how long it takes to detect being put to sleep,
+				// see `await_start`'s use below.
+				let await_start = Instant::now();
 				let updates_available =
 					channel_manager.await_persistable_update_timeout(Duration::from_millis(100));
+				let await_time = await_start.elapsed();
+
 				if updates_available {
 					log_trace!(logger, "Persisting ChannelManager...");
 					persister.persist_manager(&*channel_manager)?;
@@ -239,15 +245,20 @@ impl BackgroundProcessor {
 					channel_manager.timer_tick_occurred();
 					last_freshness_call = Instant::now();
 				}
-				if last_ping_call.elapsed().as_secs() > PING_TIMER * 2 {
+				if await_time > Duration::from_secs(1) {
 					// On various platforms, we may be starved of CPU cycles for several reasons.
 					// E.g. on iOS, if we've been in the background, we will be entirely paused.
 					// Similarly, if we're on a desktop platform and the device has been asleep, we
 					// may not get any cycles.
-					// In any case, if we've been entirely paused for more than double our ping
-					// timer, we should have disconnected all sockets by now (and they're probably
-					// dead anyway), so disconnect them by calling `timer_tick_occurred()` twice.
-					log_trace!(logger, "Awoke after more than double our ping timer, disconnecting peers.");
+					// We detect this by checking if our max-100ms-sleep, above, ran longer than a
+					// full second, at which point we assume sockets may have been killed (they
+					// appear to be at least on some platforms, even if it has only been a second).
+					// Note that we have to take care to not get here just because user event
+					// processing was slow at the top of the loop. For example, the sample client
+					// may call Bitcoin Core RPCs during event handling, which very often takes
+					// more than a handful of seconds to complete, and shouldn't disconnect all our
+					// peers.
+					log_trace!(logger, "100ms sleep took more than a second, disconnecting peers.");
 					peer_manager.disconnect_all_peers();
 					last_ping_call = Instant::now();
 				} else if last_ping_call.elapsed().as_secs() > PING_TIMER {

From 2d3a2108979adca6b7632e2d59c10e4b131e8bf4 Mon Sep 17 00:00:00 2001
From: Matt Corallo <git@bluematt.me>
Date: Thu, 20 Jan 2022 23:42:56 +0000
Subject: [PATCH 2/2] Increase our PING_TIMER to ten seconds, from five.

Because many lightning nodes can take quite some time to respond to
pings, the five second ping timer can sometimes cause spurious
disconnects even though a peer is online. However, in part as a
response to mobile users where a connection may be lost as result
of only a short time with the app in a "paused" state, we had a
rather aggressive ping time to ensure we would disconnect quickly.

However, since we now just used a fixed time for the "went to
sleep" detection, we can somewhat increase the ping timer. We still
want to be fairly aggressive to avoid sending HTLCs to a peer that
is offline, but the tradeoff between spurious disconnections and
stuck payments is likely doesn't need to be quite as aggressive.
---
 lightning-background-processor/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightning-background-processor/src/lib.rs b/lightning-background-processor/src/lib.rs
index 363dfba93ce..c3be857a361 100644
--- a/lightning-background-processor/src/lib.rs
+++ b/lightning-background-processor/src/lib.rs
@@ -61,7 +61,7 @@ const FRESHNESS_TIMER: u64 = 60;
 const FRESHNESS_TIMER: u64 = 1;
 
 #[cfg(all(not(test), not(debug_assertions)))]
-const PING_TIMER: u64 = 5;
+const PING_TIMER: u64 = 10;
 /// Signature operations take a lot longer without compiler optimisations.
 /// Increasing the ping timer allows for this but slower devices will be disconnected if the
 /// timeout is reached.