@@ -56,8 +56,17 @@ public abstract class PeerFinder {
5656 Setting .timeSetting ("discovery.request_peers_timeout" ,
5757 TimeValue .timeValueMillis (3000 ), TimeValue .timeValueMillis (1 ), Setting .Property .NodeScope );
5858
59+ // We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
60+ // or contains some unnecessary junk. However if the node cannot find a master for an extended period of time then it is helpful to
61+ // users to describe in more detail why we cannot connect to the remote nodes. This setting defines how long we wait without discovering
62+ // the master before we start to emit more verbose logs.
//
// Registered under the key "discovery.find_peers_warning_timeout" with node scope. The elapsed time is measured from the most
// recent activation of the finder and is re-evaluated each time a connection attempt to a peer is started.
// NOTE(review): the two TimeValue arguments (5 minutes, 1 millisecond) are presumably the default and the minimum accepted value,
// in that order - confirm against the Setting.timeSetting overload in use.
63+ public static final Setting <TimeValue > VERBOSITY_INCREASE_TIMEOUT_SETTING =
64+ Setting .timeSetting ("discovery.find_peers_warning_timeout" ,
65+ TimeValue .timeValueMinutes (5 ), TimeValue .timeValueMillis (1 ), Setting .Property .NodeScope );
66+
// Read once from DISCOVERY_FIND_PEERS_INTERVAL_SETTING when the finder is constructed.
5967 private final TimeValue findPeersInterval ;
// Read once from DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING when the finder is constructed.
6068 private final TimeValue requestPeersTimeout ;
// Read once from VERBOSITY_INCREASE_TIMEOUT_SETTING when the finder is constructed. Once more than this much time has passed
// since activation, failed connection attempts are reported at WARN instead of DEBUG.
69+ private final TimeValue verbosityIncreaseTimeout ;
6170
6271 private final Object mutex = new Object ();
6372 private final TransportService transportService ;
@@ -66,6 +75,7 @@ public abstract class PeerFinder {
6675
6776 private volatile long currentTerm ;
// Set to true by activate() while holding mutex.
6877 private boolean active ;
// Relative time in milliseconds, taken from the transport service's thread pool when activate() runs. establishConnection()
// subtracts it from the current relative time to decide whether connection failures should be logged verbosely.
78+ private long activatedAtMillis ;
// The node set passed to the most recent activate() call.
6979 private DiscoveryNodes lastAcceptedNodes ;
// Peers keyed by transport address; an entry is removed (under mutex) when the connection attempt to that address fails.
7080 private final Map <TransportAddress , Peer > peersByAddress = new LinkedHashMap <>();
// Reset to Optional.empty() each time activate() runs.
7181 private Optional <DiscoveryNode > leader = Optional .empty ();
@@ -75,6 +85,7 @@ public PeerFinder(Settings settings, TransportService transportService, Transpor
7585 ConfiguredHostsResolver configuredHostsResolver ) {
7686 findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING .get (settings );
7787 requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING .get (settings );
88+ verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING .get (settings );
7889 this .transportService = transportService ;
7990 this .transportAddressConnector = transportAddressConnector ;
8091 this .configuredHostsResolver = configuredHostsResolver ;
@@ -90,6 +101,7 @@ public void activate(final DiscoveryNodes lastAcceptedNodes) {
90101 synchronized (mutex ) {
91102 assert assertInactiveWithNoKnownPeers ();
92103 active = true ;
104+ activatedAtMillis = transportService .getThreadPool ().relativeTimeInMillis ();
93105 this .lastAcceptedNodes = lastAcceptedNodes ;
94106 leader = Optional .empty ();
95107 handleWakeUp (); // return value discarded: there are no known peers, so none can be disconnected
@@ -193,7 +205,7 @@ public interface TransportAddressConnector {
193205
194206 public interface ConfiguredHostsResolver {
195207 /**
196- * Attempt to resolve the configured unicast hosts list to a list of transport addresses.
208+ * Attempt to resolve the configured hosts list to a list of transport addresses.
197209 *
198210 * @param consumer Consumer for the resolved list. May not be called if an error occurs or if another resolution attempt is in
199211 * progress.
@@ -293,7 +305,7 @@ protected void startProbe(TransportAddress transportAddress) {
293305
294306 private class Peer {
295307 private final TransportAddress transportAddress ;
296- private SetOnce <DiscoveryNode > discoveryNode = new SetOnce <>();
308+ private final SetOnce <DiscoveryNode > discoveryNode = new SetOnce <>();
297309 private volatile boolean peersRequestInFlight ;
298310
299311 Peer (TransportAddress transportAddress ) {
@@ -334,6 +346,9 @@ void establishConnection() {
334346 assert getDiscoveryNode () == null : "unexpectedly connected to " + getDiscoveryNode ();
335347 assert active ;
336348
349+ final boolean verboseFailureLogging
350+ = transportService .getThreadPool ().relativeTimeInMillis () - activatedAtMillis > verbosityIncreaseTimeout .millis ();
351+
337352 logger .trace ("{} attempting connection" , this );
338353 transportAddressConnector .connectToRemoteMasterNode (transportAddress , new ActionListener <DiscoveryNode >() {
339354 @ Override
@@ -356,7 +371,25 @@ public void onResponse(DiscoveryNode remoteNode) {
356371
357372 @ Override
358373 public void onFailure (Exception e ) {
359- logger .debug (() -> new ParameterizedMessage ("{} connection failed" , Peer .this ), e );
374+ if (verboseFailureLogging ) {
375+ if (logger .isDebugEnabled ()) {
376+ // log message at level WARN, but since DEBUG logging is enabled we include the full stack trace
377+ logger .warn (new ParameterizedMessage ("{} connection failed" , Peer .this ), e );
378+ } else {
379+ final StringBuilder messageBuilder = new StringBuilder ();
380+ Throwable cause = e ;
381+ while (cause != null && messageBuilder .length () <= 1024 ) {
382+ messageBuilder .append (": " ).append (cause .getMessage ());
383+ cause = cause .getCause ();
384+ }
385+ final String message = messageBuilder .length () < 1024
386+ ? messageBuilder .toString ()
387+ : (messageBuilder .substring (0 , 1023 ) + "..." );
388+ logger .warn ("{} connection failed{}" , Peer .this , message );
389+ }
390+ } else {
391+ logger .debug (new ParameterizedMessage ("{} connection failed" , Peer .this ), e );
392+ }
360393 synchronized (mutex ) {
361394 peersByAddress .remove (transportAddress );
362395 }
@@ -413,7 +446,7 @@ public void handleResponse(PeersResponse response) {
// Failure callback for the peers request: clears the in-flight flag and logs the failure together with the exception.
// NOTE(review): this hunk shows both sides of the change. The line marked "416-" is the removed DEBUG-level call and the line
// marked "449+" is its replacement, so after the change every failed peers request is reported at WARN with its stack trace,
// regardless of how long the finder has been active.
413446 @ Override
414447 public void handleException (TransportException exp ) {
415448 peersRequestInFlight = false ;
416- logger .debug (new ParameterizedMessage ("{} peers request failed" , Peer .this ), exp );
449+ logger .warn (new ParameterizedMessage ("{} peers request failed" , Peer .this ), exp );
417450 }
418451
419452 @ Override
@@ -429,11 +462,7 @@ public String executor() {
429462
// Describes this peer by its transport address, the discovered node (null until one has been set) and whether a peers request
// is currently in flight. The result is substituted for the leading "{}" of the log messages that pass Peer.this, such as
// "{} connection failed" and "{} peers request failed".
// NOTE(review): this hunk shows both sides of the change. The lines marked "432-" to "436-" are the removed "Peer{...}" form;
// the line marked "465+" is the replacement, which renders as: address [..], node [..], requesting [..]
430463 @ Override
431464 public String toString () {
432- return "Peer{" +
433- "transportAddress=" + transportAddress +
434- ", discoveryNode=" + discoveryNode .get () +
435- ", peersRequestInFlight=" + peersRequestInFlight +
436- '}' ;
465+ return "address [" + transportAddress + "], node [" + discoveryNode .get () + "], requesting [" + peersRequestInFlight + "]" ;
437466 }
438467 }
439468}
0 commit comments