Skip to content

Commit d995fc8

Browse files
authored
Integrate LeaderChecker with Coordinator (#34049)
This change ensures that follower nodes periodically check that their leader is healthy, and that they elect a new leader if not.
1 parent f886eeb commit d995fc8

File tree

7 files changed

+302
-141
lines changed

7 files changed

+302
-141
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,13 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
7878
private final UnicastConfiguredHostsResolver configuredHostsResolver;
7979
private final TimeValue publishTimeout;
8080
private final PublicationTransportHandler publicationHandler;
81+
private final LeaderChecker leaderChecker;
8182
@Nullable
8283
private Releasable electionScheduler;
8384
@Nullable
8485
private Releasable prevotingRound;
86+
@Nullable
87+
private Releasable leaderCheckScheduler;
8588
private AtomicLong maxTermSeen = new AtomicLong();
8689

8790
private Mode mode;
@@ -108,10 +111,27 @@ public Coordinator(Settings settings, TransportService transportService, Allocat
108111
this.peerFinder = new CoordinatorPeerFinder(settings, transportService,
109112
new HandshakingTransportAddressConnector(settings, transportService), configuredHostsResolver);
110113
this.publicationHandler = new PublicationTransportHandler(transportService, this::handlePublishRequest, this::handleApplyCommit);
114+
this.leaderChecker = new LeaderChecker(settings, transportService, getOnLeaderFailure());
111115

112116
masterService.setClusterStateSupplier(this::getStateForMasterService);
113117
}
114118

119+
private Runnable getOnLeaderFailure() {
120+
return new Runnable() {
121+
@Override
122+
public void run() {
123+
synchronized (mutex) {
124+
becomeCandidate("onLeaderFailure");
125+
}
126+
}
127+
128+
@Override
129+
public String toString() {
130+
return "notification of leader failure";
131+
}
132+
};
133+
}
134+
115135
private void handleApplyCommit(ApplyCommitRequest applyCommitRequest) {
116136
synchronized (mutex) {
117137
logger.trace("handleApplyCommit: applying commit {}", applyCommitRequest);
@@ -233,6 +253,12 @@ void becomeCandidate(String method) {
233253
joinAccumulator = joinHelper.new CandidateJoinAccumulator();
234254

235255
peerFinder.activate(coordinationState.get().getLastAcceptedState().nodes());
256+
leaderChecker.setCurrentNodes(DiscoveryNodes.EMPTY_NODES);
257+
258+
if (leaderCheckScheduler != null) {
259+
leaderCheckScheduler.close();
260+
leaderCheckScheduler = null;
261+
}
236262
}
237263

238264
preVoteCollector.update(getPreVoteResponse(), null);
@@ -251,23 +277,35 @@ void becomeLeader(String method) {
251277
peerFinder.deactivate(getLocalNode());
252278
closePrevotingAndElectionScheduler();
253279
preVoteCollector.update(getPreVoteResponse(), getLocalNode());
280+
281+
assert leaderCheckScheduler == null : leaderCheckScheduler;
254282
}
255283

256284
void becomeFollower(String method, DiscoveryNode leaderNode) {
257285
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
258286
logger.debug("{}: becoming FOLLOWER of [{}] (was {}, lastKnownLeader was [{}])", method, leaderNode, mode, lastKnownLeader);
259287

288+
final boolean restartLeaderChecker = (mode == Mode.FOLLOWER && Optional.of(leaderNode).equals(lastKnownLeader)) == false;
289+
260290
if (mode != Mode.FOLLOWER) {
261291
mode = Mode.FOLLOWER;
262292
joinAccumulator.close(mode);
263293
joinAccumulator = new JoinHelper.FollowerJoinAccumulator();
294+
leaderChecker.setCurrentNodes(DiscoveryNodes.EMPTY_NODES);
264295
}
265296

266297
lastKnownLeader = Optional.of(leaderNode);
267298
peerFinder.deactivate(leaderNode);
268299
closePrevotingAndElectionScheduler();
269300
cancelActivePublication();
270301
preVoteCollector.update(getPreVoteResponse(), leaderNode);
302+
303+
if (restartLeaderChecker) {
304+
if (leaderCheckScheduler != null) {
305+
leaderCheckScheduler.close();
306+
}
307+
leaderCheckScheduler = leaderChecker.startLeaderChecker(leaderNode);
308+
}
271309
}
272310

273311
private PreVoteResponse getPreVoteResponse() {
@@ -339,6 +377,7 @@ public void invariant() {
339377
assert getStateForMasterService().nodes().getMasterNodeId() != null
340378
|| getStateForMasterService().term() != getCurrentTerm() :
341379
getStateForMasterService();
380+
assert leaderCheckScheduler == null : leaderCheckScheduler;
342381
} else if (mode == Mode.FOLLOWER) {
343382
assert coordinationState.get().electionWon() == false : getLocalNode() + " is FOLLOWER so electionWon() should be false";
344383
assert lastKnownLeader.isPresent() && (lastKnownLeader.get().equals(getLocalNode()) == false);
@@ -347,12 +386,16 @@ assert getStateForMasterService().nodes().getMasterNodeId() != null
347386
assert electionScheduler == null : electionScheduler;
348387
assert prevotingRound == null : prevotingRound;
349388
assert getStateForMasterService().nodes().getMasterNodeId() == null : getStateForMasterService();
389+
assert leaderChecker.currentNodeIsMaster() == false;
390+
assert leaderCheckScheduler != null;
350391
} else {
351392
assert mode == Mode.CANDIDATE;
352393
assert joinAccumulator instanceof JoinHelper.CandidateJoinAccumulator;
353394
assert peerFinderLeader.isPresent() == false : peerFinderLeader;
354395
assert prevotingRound == null || electionScheduler != null;
355396
assert getStateForMasterService().nodes().getMasterNodeId() == null : getStateForMasterService();
397+
assert leaderChecker.currentNodeIsMaster() == false;
398+
assert leaderCheckScheduler == null : leaderCheckScheduler;
356399
}
357400
}
358401
}
@@ -577,6 +620,8 @@ public String toString() {
577620
return "scheduled timeout for " + publication;
578621
}
579622
});
623+
624+
leaderChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
580625
publication.start(Collections.emptySet()); // TODO start failure detector and put faultyNodes here
581626
}
582627
} catch (Exception e) {

server/src/main/java/org/elasticsearch/cluster/coordination/LeaderChecker.java

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ public class LeaderChecker extends AbstractComponent {
7777
private final TransportService transportService;
7878
private final Runnable onLeaderFailure;
7979

80-
private volatile DiscoveryNodes lastPublishedDiscoveryNodes;
80+
private volatile DiscoveryNodes discoveryNodes;
8181

8282
public LeaderChecker(final Settings settings, final TransportService transportService, final Runnable onLeaderFailure) {
8383
super(settings);
@@ -111,19 +111,24 @@ public Releasable startLeaderChecker(final DiscoveryNode leader) {
111111
* isLocalNodeElectedMaster() should reflect whether this node is a leader, and nodeExists()
112112
* should indicate whether nodes are known publication targets or not.
113113
*/
114-
public void setLastPublishedDiscoveryNodes(DiscoveryNodes discoveryNodes) {
115-
logger.trace("updating last-published nodes: {}", discoveryNodes);
116-
lastPublishedDiscoveryNodes = discoveryNodes;
114+
public void setCurrentNodes(DiscoveryNodes discoveryNodes) {
115+
logger.trace("setCurrentNodes: {}", discoveryNodes);
116+
this.discoveryNodes = discoveryNodes;
117+
}
118+
119+
// For assertions
120+
boolean currentNodeIsMaster() {
121+
return discoveryNodes.isLocalNodeElectedMaster();
117122
}
118123

119124
private void handleLeaderCheck(LeaderCheckRequest request, TransportChannel transportChannel, Task task) throws IOException {
120-
final DiscoveryNodes lastPublishedDiscoveryNodes = this.lastPublishedDiscoveryNodes;
121-
assert lastPublishedDiscoveryNodes != null;
125+
final DiscoveryNodes discoveryNodes = this.discoveryNodes;
126+
assert discoveryNodes != null;
122127

123-
if (lastPublishedDiscoveryNodes.isLocalNodeElectedMaster() == false) {
128+
if (discoveryNodes.isLocalNodeElectedMaster() == false) {
124129
logger.debug("non-master handling {}", request);
125130
transportChannel.sendResponse(new CoordinationStateRejectedException("non-leader rejecting leader check"));
126-
} else if (lastPublishedDiscoveryNodes.nodeExists(request.getSender()) == false) {
131+
} else if (discoveryNodes.nodeExists(request.getSender()) == false) {
127132
logger.debug("leader check from unknown node: {}", request);
128133
transportChannel.sendResponse(new CoordinationStateRejectedException("leader check from unknown node"));
129134
} else {

server/src/test/java/org/elasticsearch/action/support/replication/TransportReplicationActionTests.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,9 @@ public String toString() {
11051105
}
11061106

11071107
static class TestResponse extends ReplicationResponse {
1108+
TestResponse() {
1109+
setShardInfo(new ShardInfo());
1110+
}
11081111
}
11091112

11101113
private class TestAction extends TransportReplicationAction<Request, Request, TestResponse> {

server/src/test/java/org/elasticsearch/cluster/coordination/CoordinatorTests.java

Lines changed: 96 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.elasticsearch.indices.cluster.FakeThreadPoolMasterService;
4242
import org.elasticsearch.test.ESTestCase;
4343
import org.elasticsearch.test.disruption.DisruptableMockTransport;
44+
import org.elasticsearch.test.disruption.DisruptableMockTransport.ConnectionStatus;
4445
import org.elasticsearch.test.junit.annotations.TestLogging;
4546
import org.elasticsearch.transport.TransportService;
4647
import org.hamcrest.Matcher;
@@ -59,7 +60,11 @@
5960
import static org.elasticsearch.cluster.coordination.CoordinationStateTests.clusterState;
6061
import static org.elasticsearch.cluster.coordination.CoordinationStateTests.setValue;
6162
import static org.elasticsearch.cluster.coordination.CoordinationStateTests.value;
63+
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.CANDIDATE;
6264
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.FOLLOWER;
65+
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_INTERVAL_SETTING;
66+
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING;
67+
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING;
6368
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
6469
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
6570
import static org.elasticsearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR;
@@ -68,7 +73,7 @@
6873
import static org.hamcrest.Matchers.is;
6974
import static org.hamcrest.Matchers.not;
7075

71-
@TestLogging("org.elasticsearch.cluster.coordination:TRACE,org.elasticsearch.cluster.discovery:TRACE")
76+
@TestLogging("org.elasticsearch.cluster.coordination:TRACE,org.elasticsearch.discovery:TRACE")
7277
public class CoordinatorTests extends ESTestCase {
7378

7479
public void testCanUpdateClusterStateAfterStabilisation() {
@@ -101,6 +106,40 @@ public void testNodesJoinAfterStableCluster() {
101106
assertEquals(currentTerm, newTerm);
102107
}
103108

109+
public void testLeaderDisconnectionDetectedQuickly() {
110+
final Cluster cluster = new Cluster(randomIntBetween(3, 5));
111+
cluster.stabilise();
112+
113+
final ClusterNode originalLeader = cluster.getAnyLeader();
114+
logger.info("--> disconnecting leader {}", originalLeader);
115+
originalLeader.disconnect();
116+
117+
synchronized (originalLeader.coordinator.mutex) {
118+
originalLeader.coordinator.becomeCandidate("simulated failure detection"); // TODO remove once follower checker is integrated
119+
}
120+
121+
cluster.stabilise();
122+
assertThat(cluster.getAnyLeader().getId(), not(equalTo(originalLeader.getId())));
123+
}
124+
125+
public void testUnresponsiveLeaderDetectedEventually() {
126+
final Cluster cluster = new Cluster(randomIntBetween(3, 5));
127+
cluster.stabilise();
128+
129+
final ClusterNode originalLeader = cluster.getAnyLeader();
130+
logger.info("--> partitioning leader {}", originalLeader);
131+
originalLeader.partition();
132+
133+
synchronized (originalLeader.coordinator.mutex) {
134+
originalLeader.coordinator.becomeCandidate("simulated failure detection"); // TODO remove once follower checker is integrated
135+
}
136+
137+
cluster.stabilise(Cluster.DEFAULT_STABILISATION_TIME
138+
+ (LEADER_CHECK_INTERVAL_SETTING.get(Settings.EMPTY).millis() + LEADER_CHECK_TIMEOUT_SETTING.get(Settings.EMPTY).millis())
139+
* LEADER_CHECK_RETRY_COUNT_SETTING.get(Settings.EMPTY));
140+
assertThat(cluster.getAnyLeader().getId(), not(equalTo(originalLeader.getId())));
141+
}
142+
104143
private static String nodeIdFromIndex(int nodeIndex) {
105144
return "node" + nodeIndex;
106145
}
@@ -115,6 +154,9 @@ class Cluster {
115154
Settings.builder().put(NODE_NAME_SETTING.getKey(), "deterministic-task-queue").build());
116155
private final VotingConfiguration initialConfiguration;
117156

157+
private final Set<String> disconnectedNodes = new HashSet<>();
158+
private final Set<String> blackholedNodes = new HashSet<>();
159+
118160
Cluster(int initialNodeCount) {
119161
logger.info("--> creating cluster of {} nodes", initialNodeCount);
120162

@@ -142,8 +184,12 @@ void addNodes(int newNodesCount) {
142184
}
143185

144186
void stabilise() {
187+
stabilise(DEFAULT_STABILISATION_TIME);
188+
}
189+
190+
void stabilise(long stabilisationTime) {
145191
final long stabilisationStartTime = deterministicTaskQueue.getCurrentTimeMillis();
146-
while (deterministicTaskQueue.getCurrentTimeMillis() < stabilisationStartTime + DEFAULT_STABILISATION_TIME) {
192+
while (deterministicTaskQueue.getCurrentTimeMillis() < stabilisationStartTime + stabilisationTime) {
147193

148194
while (deterministicTaskQueue.hasRunnableTasks()) {
149195
try {
@@ -182,16 +228,21 @@ private void assertUniqueLeaderAndExpectedModes() {
182228
}
183229

184230
final String nodeId = clusterNode.getId();
185-
assertThat(nodeId + " has the same term as the leader", clusterNode.coordinator.getCurrentTerm(), is(leaderTerm));
186-
// TODO assert that all nodes have actually voted for the leader in this term
187-
188-
assertThat(nodeId + " is a follower", clusterNode.coordinator.getMode(), is(FOLLOWER));
189-
assertThat(nodeId + " is at the same accepted version as the leader",
190-
Optional.of(clusterNode.coordinator.getLastAcceptedState().getVersion()), isPresentAndEqualToLeaderVersion);
191-
assertThat(nodeId + " is at the same committed version as the leader",
192-
clusterNode.coordinator.getLastCommittedState().map(ClusterState::getVersion), isPresentAndEqualToLeaderVersion);
193-
assertThat(clusterNode.coordinator.getLastCommittedState().map(ClusterState::getNodes).map(dn -> dn.nodeExists(nodeId)),
194-
equalTo(Optional.of(true)));
231+
232+
if (disconnectedNodes.contains(nodeId) || blackholedNodes.contains(nodeId)) {
233+
assertThat(nodeId + " is a candidate", clusterNode.coordinator.getMode(), is(CANDIDATE));
234+
} else {
235+
assertThat(nodeId + " has the same term as the leader", clusterNode.coordinator.getCurrentTerm(), is(leaderTerm));
236+
// TODO assert that all nodes have actually voted for the leader in this term
237+
238+
assertThat(nodeId + " is a follower", clusterNode.coordinator.getMode(), is(FOLLOWER));
239+
assertThat(nodeId + " is at the same accepted version as the leader",
240+
Optional.of(clusterNode.coordinator.getLastAcceptedState().getVersion()), isPresentAndEqualToLeaderVersion);
241+
assertThat(nodeId + " is at the same committed version as the leader",
242+
clusterNode.coordinator.getLastCommittedState().map(ClusterState::getVersion), isPresentAndEqualToLeaderVersion);
243+
assertThat(clusterNode.coordinator.getLastCommittedState().map(ClusterState::getNodes).map(dn -> dn.nodeExists(nodeId)),
244+
equalTo(Optional.of(true)));
245+
}
195246
}
196247

197248
assertThat(leader.coordinator.getLastCommittedState().map(ClusterState::getNodes).map(DiscoveryNodes::getSize),
@@ -204,6 +255,18 @@ ClusterNode getAnyLeader() {
204255
return randomFrom(allLeaders);
205256
}
206257

258+
private ConnectionStatus getConnectionStatus(DiscoveryNode sender, DiscoveryNode destination) {
259+
ConnectionStatus connectionStatus;
260+
if (blackholedNodes.contains(sender.getId()) || blackholedNodes.contains(destination.getId())) {
261+
connectionStatus = ConnectionStatus.BLACK_HOLE;
262+
} else if (disconnectedNodes.contains(sender.getId()) || disconnectedNodes.contains(destination.getId())) {
263+
connectionStatus = ConnectionStatus.DISCONNECTED;
264+
} else {
265+
connectionStatus = ConnectionStatus.CONNECTED;
266+
}
267+
return connectionStatus;
268+
}
269+
207270
class ClusterNode extends AbstractComponent {
208271
private final int nodeIndex;
209272
private Coordinator coordinator;
@@ -241,7 +304,7 @@ protected DiscoveryNode getLocalNode() {
241304

242305
@Override
243306
protected ConnectionStatus getConnectionStatus(DiscoveryNode sender, DiscoveryNode destination) {
244-
return ConnectionStatus.CONNECTED;
307+
return Cluster.this.getConnectionStatus(sender, destination);
245308
}
246309

247310
@Override
@@ -264,6 +327,17 @@ protected void handle(DiscoveryNode sender, DiscoveryNode destination, String ac
264327
deterministicTaskQueue.scheduleNow(onNode(destination, doDelivery));
265328
}
266329
}
330+
331+
@Override
332+
protected void onBlackholedDuringSend(long requestId, String action, DiscoveryNode destination) {
333+
if (action.equals(HANDSHAKE_ACTION_NAME)) {
334+
logger.trace("ignoring blackhole and delivering {}", getRequestDescription(requestId, action, destination));
335+
// handshakes always have a timeout, and are sent in a blocking fashion, so we must respond with an exception.
336+
sendFromTo(destination, getLocalNode(), action, getDisconnectException(requestId, action, destination));
337+
} else {
338+
super.onBlackholedDuringSend(requestId, action, destination);
339+
}
340+
}
267341
};
268342

269343
masterService = new FakeThreadPoolMasterService("test",
@@ -290,7 +364,7 @@ String getId() {
290364
return localNode.getId();
291365
}
292366

293-
public DiscoveryNode getLocalNode() {
367+
DiscoveryNode getLocalNode() {
294368
return localNode;
295369
}
296370

@@ -316,6 +390,14 @@ public void onFailure(String source, Exception e) {
316390
public String toString() {
317391
return localNode.toString();
318392
}
393+
394+
void disconnect() {
395+
disconnectedNodes.add(localNode.getId());
396+
}
397+
398+
void partition() {
399+
blackholedNodes.add(localNode.getId());
400+
}
319401
}
320402

321403
private List<TransportAddress> provideUnicastHosts(HostsResolver ignored) {

0 commit comments

Comments
 (0)