Skip to content

Commit eabe2d1

Browse files
authored
Increase PeerFinder verbosity on persistent failure (#73128)
If a node is partitioned away from the rest of the cluster then the `ClusterFormationFailureHelper` periodically reports that it cannot discover the expected collection of nodes, but does not indicate why. To prove it's a connectivity problem, users must today restart the node with `DEBUG` logging on `org.elasticsearch.discovery.PeerFinder` to see further details. With this commit we log messages at `WARN` level if the node remains disconnected for longer than a configurable timeout, which defaults to 5 minutes. Relates #72968
1 parent 09d1b97 commit eabe2d1

File tree

3 files changed

+146
-12
lines changed

3 files changed

+146
-12
lines changed

docs/reference/modules/discovery/discovery-settings.asciidoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ handshake. Defaults to `30s`.
8585
Sets how long a node will wait after asking its peers again before considering
8686
the request to have failed. Defaults to `3s`.
8787

88+
`discovery.find_peers_warning_timeout`::
89+
(<<static-cluster-setting,Static>>)
90+
Sets how long a node will attempt to discover its peers before it starts to log
91+
verbose messages describing why the connection attempts are failing. Defaults
92+
to `5m`.
93+
8894
`discovery.seed_resolver.max_concurrent_resolvers`::
8995
(<<static-cluster-setting,Static>>)
9096
Specifies how many concurrent DNS lookups to perform when resolving the

server/src/main/java/org/elasticsearch/discovery/PeerFinder.java

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,17 @@ public abstract class PeerFinder {
5656
Setting.timeSetting("discovery.request_peers_timeout",
5757
TimeValue.timeValueMillis(3000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
5858

59+
// We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
60+
// or contains some unnecessary junk. However if the node cannot find a master for an extended period of time then it is helpful to
61+
// users to describe in more detail why we cannot connect to the remote nodes. This setting defines how long we wait without discovering
62+
// the master before we start to emit more verbose logs.
63+
public static final Setting<TimeValue> VERBOSITY_INCREASE_TIMEOUT_SETTING =
64+
Setting.timeSetting("discovery.find_peers_warning_timeout",
65+
TimeValue.timeValueMinutes(5), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
66+
5967
private final TimeValue findPeersInterval;
6068
private final TimeValue requestPeersTimeout;
69+
private final TimeValue verbosityIncreaseTimeout;
6170

6271
private final Object mutex = new Object();
6372
private final TransportService transportService;
@@ -66,6 +75,7 @@ public abstract class PeerFinder {
6675

6776
private volatile long currentTerm;
6877
private boolean active;
78+
private long activatedAtMillis;
6979
private DiscoveryNodes lastAcceptedNodes;
7080
private final Map<TransportAddress, Peer> peersByAddress = new LinkedHashMap<>();
7181
private Optional<DiscoveryNode> leader = Optional.empty();
@@ -75,6 +85,7 @@ public PeerFinder(Settings settings, TransportService transportService, Transpor
7585
ConfiguredHostsResolver configuredHostsResolver) {
7686
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
7787
requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(settings);
88+
verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING.get(settings);
7889
this.transportService = transportService;
7990
this.transportAddressConnector = transportAddressConnector;
8091
this.configuredHostsResolver = configuredHostsResolver;
@@ -90,6 +101,7 @@ public void activate(final DiscoveryNodes lastAcceptedNodes) {
90101
synchronized (mutex) {
91102
assert assertInactiveWithNoKnownPeers();
92103
active = true;
104+
activatedAtMillis = transportService.getThreadPool().relativeTimeInMillis();
93105
this.lastAcceptedNodes = lastAcceptedNodes;
94106
leader = Optional.empty();
95107
handleWakeUp(); // return value discarded: there are no known peers, so none can be disconnected
@@ -193,7 +205,7 @@ public interface TransportAddressConnector {
193205

194206
public interface ConfiguredHostsResolver {
195207
/**
196-
* Attempt to resolve the configured unicast hosts list to a list of transport addresses.
208+
* Attempt to resolve the configured hosts list to a list of transport addresses.
197209
*
198210
* @param consumer Consumer for the resolved list. May not be called if an error occurs or if another resolution attempt is in
199211
* progress.
@@ -293,7 +305,7 @@ protected void startProbe(TransportAddress transportAddress) {
293305

294306
private class Peer {
295307
private final TransportAddress transportAddress;
296-
private SetOnce<DiscoveryNode> discoveryNode = new SetOnce<>();
308+
private final SetOnce<DiscoveryNode> discoveryNode = new SetOnce<>();
297309
private volatile boolean peersRequestInFlight;
298310

299311
Peer(TransportAddress transportAddress) {
@@ -334,6 +346,9 @@ void establishConnection() {
334346
assert getDiscoveryNode() == null : "unexpectedly connected to " + getDiscoveryNode();
335347
assert active;
336348

349+
final boolean verboseFailureLogging
350+
= transportService.getThreadPool().relativeTimeInMillis() - activatedAtMillis > verbosityIncreaseTimeout.millis();
351+
337352
logger.trace("{} attempting connection", this);
338353
transportAddressConnector.connectToRemoteMasterNode(transportAddress, new ActionListener<DiscoveryNode>() {
339354
@Override
@@ -356,7 +371,25 @@ public void onResponse(DiscoveryNode remoteNode) {
356371

357372
@Override
358373
public void onFailure(Exception e) {
359-
logger.debug(() -> new ParameterizedMessage("{} connection failed", Peer.this), e);
374+
if (verboseFailureLogging) {
375+
if (logger.isDebugEnabled()) {
376+
// log message at level WARN, but since DEBUG logging is enabled we include the full stack trace
377+
logger.warn(new ParameterizedMessage("{} connection failed", Peer.this), e);
378+
} else {
379+
final StringBuilder messageBuilder = new StringBuilder();
380+
Throwable cause = e;
381+
while (cause != null && messageBuilder.length() <= 1024) {
382+
messageBuilder.append(": ").append(cause.getMessage());
383+
cause = cause.getCause();
384+
}
385+
final String message = messageBuilder.length() < 1024
386+
? messageBuilder.toString()
387+
: (messageBuilder.substring(0, 1023) + "...");
388+
logger.warn("{} connection failed{}", Peer.this, message);
389+
}
390+
} else {
391+
logger.debug(new ParameterizedMessage("{} connection failed", Peer.this), e);
392+
}
360393
synchronized (mutex) {
361394
peersByAddress.remove(transportAddress);
362395
}
@@ -413,7 +446,7 @@ public void handleResponse(PeersResponse response) {
413446
@Override
414447
public void handleException(TransportException exp) {
415448
peersRequestInFlight = false;
416-
logger.debug(new ParameterizedMessage("{} peers request failed", Peer.this), exp);
449+
logger.warn(new ParameterizedMessage("{} peers request failed", Peer.this), exp);
417450
}
418451

419452
@Override
@@ -429,11 +462,7 @@ public String executor() {
429462

430463
@Override
431464
public String toString() {
432-
return "Peer{" +
433-
"transportAddress=" + transportAddress +
434-
", discoveryNode=" + discoveryNode.get() +
435-
", peersRequestInFlight=" + peersRequestInFlight +
436-
'}';
465+
return "address [" + transportAddress + "], node [" + discoveryNode.get() + "], requesting [" + peersRequestInFlight + "]";
437466
}
438467
}
439468
}

server/src/test/java/org/elasticsearch/discovery/PeerFinderTests.java

Lines changed: 102 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
package org.elasticsearch.discovery;
1010

11+
import org.apache.logging.log4j.Level;
12+
import org.apache.logging.log4j.LogManager;
13+
import org.apache.logging.log4j.core.LogEvent;
1114
import org.elasticsearch.ElasticsearchException;
1215
import org.elasticsearch.Version;
1316
import org.elasticsearch.action.ActionListener;
@@ -17,10 +20,13 @@
1720
import org.elasticsearch.cluster.node.DiscoveryNodes;
1821
import org.elasticsearch.cluster.node.DiscoveryNodes.Builder;
1922
import org.elasticsearch.common.io.stream.StreamInput;
23+
import org.elasticsearch.common.logging.Loggers;
2024
import org.elasticsearch.common.settings.Settings;
2125
import org.elasticsearch.common.transport.TransportAddress;
2226
import org.elasticsearch.discovery.PeerFinder.TransportAddressConnector;
2327
import org.elasticsearch.test.ESTestCase;
28+
import org.elasticsearch.test.MockLogAppender;
29+
import org.elasticsearch.test.junit.annotations.TestLogging;
2430
import org.elasticsearch.test.transport.CapturingTransport;
2531
import org.elasticsearch.test.transport.CapturingTransport.CapturedRequest;
2632
import org.elasticsearch.test.transport.StubbableConnectionManager;
@@ -55,6 +61,7 @@
5561
import static java.util.Collections.emptySet;
5662
import static java.util.Collections.singletonList;
5763
import static org.elasticsearch.discovery.PeerFinder.REQUEST_PEERS_ACTION_NAME;
64+
import static org.elasticsearch.discovery.PeerFinder.VERBOSITY_INCREASE_TIMEOUT_SETTING;
5865
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
5966
import static org.hamcrest.Matchers.contains;
6067
import static org.hamcrest.Matchers.empty;
@@ -74,13 +81,13 @@ public class PeerFinderTests extends ESTestCase {
7481
private List<TransportAddress> providedAddresses;
7582
private long addressResolveDelay; // -1 means address resolution fails
7683

77-
private Set<DiscoveryNode> disconnectedNodes = new HashSet<>();
78-
private Set<DiscoveryNode> connectedNodes = new HashSet<>();
84+
private final Set<DiscoveryNode> disconnectedNodes = new HashSet<>();
85+
private final Set<DiscoveryNode> connectedNodes = new HashSet<>();
7986
private DiscoveryNodes lastAcceptedNodes;
8087
private TransportService transportService;
8188
private Iterable<DiscoveryNode> foundPeersFromNotification;
8289

83-
private static long CONNECTION_TIMEOUT_MILLIS = 30000;
90+
private static final long CONNECTION_TIMEOUT_MILLIS = 30000;
8491

8592
class MockTransportAddressConnector implements TransportAddressConnector {
8693
final Map<TransportAddress, DiscoveryNode> reachableNodes = new HashMap<>();
@@ -734,6 +741,98 @@ public void testTimesOutAndRetriesConnectionsToBlackholedNodes() {
734741
assertFoundPeers(nodeToFind, otherNode);
735742
}
736743

744+
@TestLogging(reason = "testing logging at WARN level", value = "org.elasticsearch.discovery:WARN")
745+
public void testLogsWarningsIfActiveForLongEnough() throws IllegalAccessException {
746+
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
747+
748+
providedAddresses.add(otherNode.getAddress());
749+
transportAddressConnector.unreachableAddresses.add(otherNode.getAddress());
750+
751+
peerFinder.activate(lastAcceptedNodes);
752+
final long endTime
753+
= deterministicTaskQueue.getCurrentTimeMillis() + VERBOSITY_INCREASE_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
754+
755+
MockLogAppender appender = new MockLogAppender();
756+
try {
757+
appender.start();
758+
Loggers.addAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
759+
760+
appender.addExpectation(new MockLogAppender.SeenEventExpectation(
761+
"connection failed",
762+
"org.elasticsearch.discovery.PeerFinder",
763+
Level.WARN,
764+
"address [" + otherNode.getAddress() + "]* connection failed: cannot connect to*")
765+
{
766+
@Override
767+
public boolean innerMatch(LogEvent event) {
768+
return event.getThrown() == null; // no stack trace at this log level
769+
}
770+
});
771+
while (deterministicTaskQueue.getCurrentTimeMillis() <= endTime) {
772+
deterministicTaskQueue.advanceTime();
773+
runAllRunnableTasks();
774+
}
775+
appender.assertAllExpectationsMatched();
776+
777+
} finally {
778+
Loggers.removeAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
779+
appender.stop();
780+
}
781+
}
782+
783+
@TestLogging(reason = "testing logging at DEBUG level", value = "org.elasticsearch.discovery:DEBUG")
784+
public void testLogsStackTraceInConnectionFailedMessages() throws IllegalAccessException {
785+
final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
786+
787+
providedAddresses.add(otherNode.getAddress());
788+
transportAddressConnector.unreachableAddresses.add(otherNode.getAddress());
789+
790+
peerFinder.activate(lastAcceptedNodes);
791+
final long endTime
792+
= deterministicTaskQueue.getCurrentTimeMillis() + VERBOSITY_INCREASE_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
793+
794+
MockLogAppender appender = new MockLogAppender();
795+
try {
796+
appender.start();
797+
Loggers.addAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
798+
appender.addExpectation(new MockLogAppender.SeenEventExpectation(
799+
"connection failed",
800+
"org.elasticsearch.discovery.PeerFinder",
801+
Level.DEBUG,
802+
"address [" + otherNode.getAddress() + "]* connection failed*") {
803+
@Override
804+
public boolean innerMatch(LogEvent event) {
805+
return event.getThrown() instanceof IOException && event.getThrown().getMessage().startsWith("cannot connect to");
806+
}
807+
});
808+
809+
deterministicTaskQueue.advanceTime();
810+
runAllRunnableTasks();
811+
appender.assertAllExpectationsMatched();
812+
813+
appender.addExpectation(new MockLogAppender.SeenEventExpectation(
814+
"connection failed",
815+
"org.elasticsearch.discovery.PeerFinder",
816+
Level.WARN,
817+
"address [" + otherNode.getAddress() + "]* connection failed*")
818+
{
819+
@Override
820+
public boolean innerMatch(LogEvent event) {
821+
return event.getThrown() instanceof IOException && event.getThrown().getMessage().startsWith("cannot connect to");
822+
}
823+
});
824+
while (deterministicTaskQueue.getCurrentTimeMillis() <= endTime) {
825+
deterministicTaskQueue.advanceTime();
826+
runAllRunnableTasks();
827+
}
828+
appender.assertAllExpectationsMatched();
829+
830+
} finally {
831+
Loggers.removeAppender(LogManager.getLogger("org.elasticsearch.discovery.PeerFinder"), appender);
832+
appender.stop();
833+
}
834+
}
835+
737836
public void testReconnectsToDisconnectedNodes() {
738837
final DiscoveryNode otherNode = newDiscoveryNode("original-node");
739838
providedAddresses.add(otherNode.getAddress());

0 commit comments

Comments
 (0)