@@ -71,10 +71,19 @@ public abstract class PeerFinder {
7171 Setting .timeSetting ("discovery.request_peers_timeout" ,
7272 TimeValue .timeValueMillis (3000 ), TimeValue .timeValueMillis (1 ), Setting .Property .NodeScope );
7373
74+ // We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
75+ // or contains some unnecessary junk. However, if the node cannot find a master for an extended period of time then it is helpful to
76+ // users to describe in more detail why we cannot connect to the remote nodes. This setting defines how long we wait without discovering
77+ // the master (measured from the moment this PeerFinder was activated) before we start to emit more verbose logs: 5 minutes by default, and at least 1ms.
78+ public static final Setting <TimeValue > VERBOSITY_INCREASE_TIMEOUT_SETTING =
79+ Setting .timeSetting ("discovery.find_peers_warning_timeout" ,
80+ TimeValue .timeValueMinutes (5 ), TimeValue .timeValueMillis (1 ), Setting .Property .NodeScope );
81+
7482 private final Settings settings ;
7583
7684 private final TimeValue findPeersInterval ;
7785 private final TimeValue requestPeersTimeout ;
86+ private final TimeValue verbosityIncreaseTimeout ;
7887
7988 private final Object mutex = new Object ();
8089 private final TransportService transportService ;
@@ -83,6 +92,7 @@ public abstract class PeerFinder {
8392
8493 private volatile long currentTerm ;
8594 private boolean active ;
95+ private long activatedAtMillis ;
8696 private DiscoveryNodes lastAcceptedNodes ;
8797 private final Map <TransportAddress , Peer > peersByAddress = new LinkedHashMap <>();
8898 private Optional <DiscoveryNode > leader = Optional .empty ();
@@ -93,6 +103,7 @@ public PeerFinder(Settings settings, TransportService transportService, Transpor
93103 this .settings = settings ;
94104 findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING .get (settings );
95105 requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING .get (settings );
106+ verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING .get (settings );
96107 this .transportService = transportService ;
97108 this .transportAddressConnector = transportAddressConnector ;
98109 this .configuredHostsResolver = configuredHostsResolver ;
@@ -111,6 +122,7 @@ public void activate(final DiscoveryNodes lastAcceptedNodes) {
111122 synchronized (mutex ) {
112123 assert assertInactiveWithNoKnownPeers ();
113124 active = true ;
125+ activatedAtMillis = transportService .getThreadPool ().relativeTimeInMillis ();
114126 this .lastAcceptedNodes = lastAcceptedNodes ;
115127 leader = Optional .empty ();
116128 handleWakeUp (); // return value discarded: there are no known peers, so none can be disconnected
@@ -214,7 +226,7 @@ public interface TransportAddressConnector {
214226
215227 public interface ConfiguredHostsResolver {
216228 /**
217- * Attempt to resolve the configured unicast hosts list to a list of transport addresses.
229+ * Attempt to resolve the configured hosts list to a list of transport addresses.
218230 *
219231 * @param consumer Consumer for the resolved list. May not be called if an error occurs or if another resolution attempt is in
220232 * progress.
@@ -314,7 +326,7 @@ protected void startProbe(TransportAddress transportAddress) {
314326
315327 private class Peer {
316328 private final TransportAddress transportAddress ;
317- private SetOnce <DiscoveryNode > discoveryNode = new SetOnce <>();
329+ private final SetOnce <DiscoveryNode > discoveryNode = new SetOnce <>();
318330 private volatile boolean peersRequestInFlight ;
319331
320332 Peer (TransportAddress transportAddress ) {
@@ -355,6 +367,9 @@ void establishConnection() {
355367 assert getDiscoveryNode () == null : "unexpectedly connected to " + getDiscoveryNode ();
356368 assert active ;
357369
370+ final boolean verboseFailureLogging
371+ = transportService .getThreadPool ().relativeTimeInMillis () - activatedAtMillis > verbosityIncreaseTimeout .millis ();
372+
358373 logger .trace ("{} attempting connection" , this );
359374 transportAddressConnector .connectToRemoteMasterNode (transportAddress , new ActionListener <DiscoveryNode >() {
360375 @ Override
@@ -377,7 +392,25 @@ public void onResponse(DiscoveryNode remoteNode) {
377392
378393 @ Override
379394 public void onFailure (Exception e ) {
380- logger .debug (() -> new ParameterizedMessage ("{} connection failed" , Peer .this ), e );
395+ if (verboseFailureLogging ) {
396+ if (logger .isDebugEnabled ()) {
397+ // log message at level WARN, but since DEBUG logging is enabled we include the full stack trace
398+ logger .warn (new ParameterizedMessage ("{} connection failed" , Peer .this ), e );
399+ } else {
400+ final StringBuilder messageBuilder = new StringBuilder ();
401+ Throwable cause = e ;
402+ while (cause != null && messageBuilder .length () <= 1024 ) {
403+ messageBuilder .append (": " ).append (cause .getMessage ());
404+ cause = cause .getCause ();
405+ }
406+ final String message = messageBuilder .length () < 1024
407+ ? messageBuilder .toString ()
408+ : (messageBuilder .substring (0 , 1023 ) + "..." );
409+ logger .warn ("{} connection failed{}" , Peer .this , message );
410+ }
411+ } else {
412+ logger .debug (new ParameterizedMessage ("{} connection failed" , Peer .this ), e );
413+ }
381414 synchronized (mutex ) {
382415 peersByAddress .remove (transportAddress );
383416 }
@@ -434,7 +467,7 @@ public void handleResponse(PeersResponse response) {
434467 @ Override
435468 public void handleException (TransportException exp ) {
436469 peersRequestInFlight = false ; // the peers request has finished (with a failure), so it is no longer in flight
437- logger .debug (new ParameterizedMessage ("{} peers request failed" , Peer .this ), exp );
470+ logger .warn (new ParameterizedMessage ("{} peers request failed" , Peer .this ), exp ); // always reported at WARN, with the exception attached
438471 }
439472
440473 @ Override
@@ -476,11 +509,7 @@ public String executor() {
476509
477510 @ Override
478511 public String toString () {
479- return "Peer{" +
480- "transportAddress=" + transportAddress +
481- ", discoveryNode=" + discoveryNode .get () +
482- ", peersRequestInFlight=" + peersRequestInFlight +
483- '}' ;
512+ return "address [" + transportAddress + "], node [" + discoveryNode .get () + "], requesting [" + peersRequestInFlight + "]" ; // this text is what the "{}" placeholder expands to in the log messages that pass Peer.this
484513 }
485514 }
486515
0 commit comments