Skip to content

Commit 02e34cb

Browse files
bleskes authored and martijnvg committed
Added ServiceDisruptionScheme(s) and testAckedIndexing
This commit adds the notion of ServiceDisruptionScheme, which allows disruptions to be introduced in our test cluster. This abstraction is used in a couple of wrappers around the functionality offered by MockTransportService to simulate various network partitions. There is also one implementation for causing a node to be slow in processing cluster state updates. This new mechanism is integrated into the existing DiscoveryWithNetworkFailuresTests. A new test called testAckedIndexing is added to verify retrieval of documents whose indexing was acked during various disruptions. Closes #6505
1 parent 773e87d commit 02e34cb

17 files changed

+1144
-152
lines changed

src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public ClusterState execute(ClusterState currentState) {
342342

343343
@Override
344344
public void onFailure(String source, Throwable t) {
345-
logger.error("unexpected failure during [{}]", t, source);
345+
logger.error("unexpected failure during [{}]", t, source);
346346
}
347347

348348
@Override
@@ -408,8 +408,7 @@ public ClusterState execute(ClusterState currentState) {
408408
public void onFailure(String source, Throwable t) {
409409
if (t instanceof ClusterService.NoLongerMasterException) {
410410
logger.debug("not processing {} leave request as we are no longer master", node);
411-
}
412-
else {
411+
} else {
413412
logger.error("unexpected failure during [{}]", t, source);
414413
}
415414
}
@@ -448,8 +447,7 @@ public ClusterState execute(ClusterState currentState) {
448447
public void onFailure(String source, Throwable t) {
449448
if (t instanceof ClusterService.NoLongerMasterException) {
450449
logger.debug("not processing [{}] as we are no longer master", source);
451-
}
452-
else {
450+
} else {
453451
logger.error("unexpected failure during [{}]", t, source);
454452
}
455453
}
@@ -486,8 +484,7 @@ public ClusterState execute(ClusterState currentState) {
486484
public void onFailure(String source, Throwable t) {
487485
if (t instanceof ClusterService.NoLongerMasterException) {
488486
logger.debug("not processing [{}] as we are no longer master", source);
489-
}
490-
else {
487+
} else {
491488
logger.error("unexpected failure during [{}]", t, source);
492489
}
493490
}

src/main/java/org/elasticsearch/transport/TransportService.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,10 @@ public void removeHandler(String action) {
245245
}
246246
}
247247

248+
/**
 * Looks up the request handler stored in {@code serverHandlers} under the given action name.
 * Declared {@code protected} so subclasses of this transport service can reach the handler.
 *
 * @param action the action name used as the lookup key
 * @return whatever {@code serverHandlers.get(action)} yields; presumably {@code null} when no
 *         handler is registered for the action (TODO confirm against the map implementation)
 */
protected TransportRequestHandler getHandler(String action) {
249+
return serverHandlers.get(action);
250+
}
251+
248252
class Adapter implements TransportServiceAdapter {
249253

250254
final MeanMetric rxMetric = new MeanMetric();

src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java

Lines changed: 257 additions & 140 deletions
Large diffs are not rendered by default.

src/test/java/org/elasticsearch/recovery/RecoveryWhileUnderLoadTests.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
4444
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
4545
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
46-
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout;
4746
import static org.hamcrest.Matchers.equalTo;
4847

4948
public class RecoveryWhileUnderLoadTests extends ElasticsearchIntegrationTest {

src/test/java/org/elasticsearch/test/BackgroundIndexer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ public void continueIndexing(int numOfDocs) {
217217
setBudget(numOfDocs);
218218
}
219219

220-
/** Stop all background threads **/
220+
/** Stop all background threads * */
221221
public void stop() throws InterruptedException {
222222
if (stop.get()) {
223223
return;

src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
import org.elasticsearch.search.SearchService;
9797
import org.elasticsearch.test.client.RandomizingClient;
9898
import org.hamcrest.Matchers;
99+
import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
99100
import org.junit.*;
100101

101102
import java.io.IOException;
@@ -530,6 +531,7 @@ protected final void afterInternal() throws IOException {
530531
boolean success = false;
531532
try {
532533
logger.info("[{}#{}]: cleaning up after test", getTestClass().getSimpleName(), getTestName());
534+
clearDisruptionScheme();
533535
final Scope currentClusterScope = getCurrentClusterScope();
534536
try {
535537
if (currentClusterScope != Scope.TEST) {
@@ -643,6 +645,15 @@ protected int numberOfReplicas() {
643645
return between(minimumNumberOfReplicas(), maximumNumberOfReplicas());
644646
}
645647

648+
649+
/**
 * Installs the given disruption scheme on the cluster returned by {@code internalCluster()}.
 * This is a plain delegation; all bookkeeping is done by the cluster object.
 *
 * @param scheme the disruption scheme to hand to the internal test cluster
 */
public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
650+
internalCluster().setDisruptionScheme(scheme);
651+
}
652+
653+
/**
 * Asks the cluster returned by {@code internalCluster()} to clear its disruption scheme.
 * This is a plain delegation; all bookkeeping is done by the cluster object.
 */
public void clearDisruptionScheme() {
654+
internalCluster().clearDisruptionScheme();
655+
}
656+
646657
/**
647658
* Returns a settings object used in {@link #createIndex(String...)} and {@link #prepareCreate(String)} and friends.
648659
* This method can be overwritten by subclasses to set defaults for the indices that are created by the test.

src/test/java/org/elasticsearch/test/InternalTestCluster.java

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
import org.elasticsearch.search.SearchService;
7878
import org.elasticsearch.test.cache.recycler.MockBigArraysModule;
7979
import org.elasticsearch.test.cache.recycler.MockPageCacheRecyclerModule;
80+
import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
8081
import org.elasticsearch.test.engine.MockEngineModule;
8182
import org.elasticsearch.test.store.MockFSIndexStoreModule;
8283
import org.elasticsearch.test.transport.AssertingLocalTransportModule;
@@ -185,6 +186,8 @@ public final class InternalTestCluster extends TestCluster {
185186

186187
private final boolean hasFilterCache;
187188

189+
private ServiceDisruptionScheme activeDisruptionScheme;
190+
188191
public InternalTestCluster(long clusterSeed, String clusterName) {
189192
this(clusterSeed, DEFAULT_MIN_NUM_DATA_NODES, DEFAULT_MAX_NUM_DATA_NODES, clusterName, SettingsSource.EMPTY, DEFAULT_NUM_CLIENT_NODES, DEFAULT_ENABLE_RANDOM_BENCH_NODES);
190193
}
@@ -288,6 +291,10 @@ public String getClusterName() {
288291
return clusterName;
289292
}
290293

294+
/**
 * Returns the keys of the {@code nodes} map as a newly filled array.
 *
 * @return the names under which nodes are currently registered in {@code nodes}
 */
// NOTE(review): this reads `nodes` without `synchronized`, unlike several other accessors in
// this class - confirm whether callers may race with node start/stop.
public String[] getNodeNames() {
295+
return nodes.keySet().toArray(Strings.EMPTY_ARRAY);
296+
}
297+
291298
private static boolean isLocalTransportConfigured() {
292299
if ("local".equals(System.getProperty("es.node.mode", "network"))) {
293300
return true;
@@ -487,6 +494,7 @@ public synchronized void ensureAtMostNumDataNodes(int n) throws IOException {
487494
while (limit.hasNext()) {
488495
NodeAndClient next = limit.next();
489496
nodesToRemove.add(next);
497+
removeDistruptionSchemeFromNode(next);
490498
next.close();
491499
}
492500
for (NodeAndClient toRemove : nodesToRemove) {
@@ -661,6 +669,10 @@ public boolean apply(NodeAndClient nodeAndClient) {
661669
@Override
662670
public void close() {
663671
if (this.open.compareAndSet(true, false)) {
672+
if (activeDisruptionScheme != null) {
673+
activeDisruptionScheme.testClusterClosed();
674+
activeDisruptionScheme = null;
675+
}
664676
IOUtils.closeWhileHandlingException(nodes.values());
665677
nodes.clear();
666678
executor.shutdownNow();
@@ -858,6 +870,7 @@ public synchronized void beforeTest(Random random, double transportClientRatio)
858870
}
859871

860872
private synchronized void reset(boolean wipeData) throws IOException {
873+
clearDisruptionScheme();
861874
resetClients(); /* reset all clients - each test gets its own client based on the Random instance created above. */
862875
if (wipeData) {
863876
wipeDataDirectories();
@@ -1054,6 +1067,7 @@ public synchronized void stopRandomDataNode() throws IOException {
10541067
NodeAndClient nodeAndClient = getRandomNodeAndClient(new DataNodePredicate());
10551068
if (nodeAndClient != null) {
10561069
logger.info("Closing random node [{}] ", nodeAndClient.name);
1070+
removeDistruptionSchemeFromNode(nodeAndClient);
10571071
nodes.remove(nodeAndClient.name);
10581072
nodeAndClient.close();
10591073
}
@@ -1073,6 +1087,7 @@ public boolean apply(NodeAndClient nodeAndClient) {
10731087
});
10741088
if (nodeAndClient != null) {
10751089
logger.info("Closing filtered random node [{}] ", nodeAndClient.name);
1090+
removeDistruptionSchemeFromNode(nodeAndClient);
10761091
nodes.remove(nodeAndClient.name);
10771092
nodeAndClient.close();
10781093
}
@@ -1087,6 +1102,7 @@ public synchronized void stopCurrentMasterNode() throws IOException {
10871102
String masterNodeName = getMasterName();
10881103
assert nodes.containsKey(masterNodeName);
10891104
logger.info("Closing master node [{}] ", masterNodeName);
1105+
removeDistruptionSchemeFromNode(nodes.get(masterNodeName));
10901106
NodeAndClient remove = nodes.remove(masterNodeName);
10911107
remove.close();
10921108
}
@@ -1098,6 +1114,7 @@ public void stopRandomNonMasterNode() throws IOException {
10981114
NodeAndClient nodeAndClient = getRandomNodeAndClient(Predicates.not(new MasterNodePredicate(getMasterName())));
10991115
if (nodeAndClient != null) {
11001116
logger.info("Closing random non master node [{}] current master [{}] ", nodeAndClient.name, getMasterName());
1117+
removeDistruptionSchemeFromNode(nodeAndClient);
11011118
nodes.remove(nodeAndClient.name);
11021119
nodeAndClient.close();
11031120
}
@@ -1151,6 +1168,9 @@ private void restartAllNodes(boolean rollingRestart, RestartCallback callback) t
11511168
if (!callback.doRestart(nodeAndClient.name)) {
11521169
logger.info("Closing node [{}] during restart", nodeAndClient.name);
11531170
toRemove.add(nodeAndClient);
1171+
if (activeDisruptionScheme != null) {
1172+
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
1173+
}
11541174
nodeAndClient.close();
11551175
}
11561176
}
@@ -1165,18 +1185,33 @@ private void restartAllNodes(boolean rollingRestart, RestartCallback callback) t
11651185
for (NodeAndClient nodeAndClient : nodes.values()) {
11661186
callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
11671187
logger.info("Restarting node [{}] ", nodeAndClient.name);
1188+
if (activeDisruptionScheme != null) {
1189+
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
1190+
}
11681191
nodeAndClient.restart(callback);
1192+
if (activeDisruptionScheme != null) {
1193+
activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
1194+
}
11691195
}
11701196
} else {
11711197
int numNodesRestarted = 0;
11721198
for (NodeAndClient nodeAndClient : nodes.values()) {
11731199
callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
11741200
logger.info("Stopping node [{}] ", nodeAndClient.name);
1201+
if (activeDisruptionScheme != null) {
1202+
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
1203+
}
11751204
nodeAndClient.node.close();
11761205
}
11771206
for (NodeAndClient nodeAndClient : nodes.values()) {
11781207
logger.info("Starting node [{}] ", nodeAndClient.name);
1208+
if (activeDisruptionScheme != null) {
1209+
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
1210+
}
11791211
nodeAndClient.restart(callback);
1212+
if (activeDisruptionScheme != null) {
1213+
activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
1214+
}
11801215
}
11811216
}
11821217
}
@@ -1374,6 +1409,7 @@ private synchronized void publishNode(NodeAndClient nodeAndClient) {
13741409
dataDirToClean.addAll(Arrays.asList(nodeEnv.nodeDataLocations()));
13751410
}
13761411
nodes.put(nodeAndClient.name, nodeAndClient);
1412+
applyDisruptionSchemeToNode(nodeAndClient);
13771413
}
13781414

13791415
public void closeNonSharedNodes(boolean wipeData) throws IOException {
@@ -1395,6 +1431,33 @@ public boolean hasFilterCache() {
13951431
return hasFilterCache;
13961432
}
13971433

1434+
/**
 * Makes {@code scheme} the active disruption scheme of this cluster.
 * Any previously active scheme is cleared first, then the new scheme is applied to this
 * cluster, and only after that is it recorded in {@code activeDisruptionScheme}.
 *
 * @param scheme the scheme to activate; it is dereferenced unconditionally, so passing
 *               {@code null} fails after the previous scheme has already been cleared
 */
public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
1435+
clearDisruptionScheme();
1436+
scheme.applyToCluster(this);
1437+
activeDisruptionScheme = scheme;
1438+
}
1439+
1440+
/**
 * Detaches the active disruption scheme, if there is one, from this cluster and forgets it.
 * Calling this when no scheme is active is a no-op apart from re-assigning {@code null}.
 */
public void clearDisruptionScheme() {
1441+
if (activeDisruptionScheme != null) {
1442+
activeDisruptionScheme.removeFromCluster(this);
1443+
}
1444+
activeDisruptionScheme = null;
1445+
}
1446+
1447+
/**
 * Applies the active disruption scheme, if any, to a single node identified by its name.
 * Does nothing when no scheme is active.
 *
 * @param nodeAndClient the node to disrupt; asserted to be registered in {@code nodes}
 */
private void applyDisruptionSchemeToNode(NodeAndClient nodeAndClient) {
1448+
if (activeDisruptionScheme != null) {
1449+
assert nodes.containsKey(nodeAndClient.name);
1450+
activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
1451+
}
1452+
}
1453+
1454+
/**
 * Removes the active disruption scheme, if any, from a single node identified by its name.
 * Does nothing when no scheme is active.
 * The method name keeps its existing spelling ("Distruption") because other methods in this
 * class call it under that name.
 *
 * @param nodeAndClient the node to release; asserted to still be registered in {@code nodes},
 *                      so it has to be called before the node is removed from that map
 */
private void removeDistruptionSchemeFromNode(NodeAndClient nodeAndClient) {
1455+
if (activeDisruptionScheme != null) {
1456+
assert nodes.containsKey(nodeAndClient.name);
1457+
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
1458+
}
1459+
}
1460+
13981461
private synchronized Collection<NodeAndClient> dataNodeAndClients() {
13991462
return Collections2.filter(nodes.values(), new DataNodePredicate());
14001463
}

src/test/java/org/elasticsearch/test/TestCluster.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
2525
import org.elasticsearch.client.Client;
2626
import org.elasticsearch.cluster.metadata.IndexMetaData;
27+
import org.elasticsearch.common.Strings;
2728
import org.elasticsearch.common.logging.ESLogger;
2829
import org.elasticsearch.common.logging.Loggers;
2930
import org.elasticsearch.indices.IndexMissingException;
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.elasticsearch.test.disruption;
20+
21+
import org.elasticsearch.cluster.node.DiscoveryNode;
22+
import org.elasticsearch.common.unit.TimeValue;
23+
import org.elasticsearch.test.transport.MockTransportService;
24+
25+
import java.util.Random;
26+
import java.util.Set;
27+
28+
public class NetworkDelaysPartition extends NetworkPartition {
29+
30+
static long DEFAULT_DELAY_MIN = 10000;
31+
static long DEFAULT_DELAY_MAX = 90000;
32+
33+
34+
final long delayMin;
35+
final long delayMax;
36+
37+
TimeValue duration;
38+
39+
public NetworkDelaysPartition(Random random) {
40+
this(random, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX);
41+
}
42+
43+
public NetworkDelaysPartition(Random random, long delayMin, long delayMax) {
44+
super(random);
45+
this.delayMin = delayMin;
46+
this.delayMax = delayMax;
47+
}
48+
49+
public NetworkDelaysPartition(String node1, String node2, Random random) {
50+
this(node1, node2, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random);
51+
}
52+
53+
public NetworkDelaysPartition(String node1, String node2, long delayMin, long delayMax, Random random) {
54+
super(node1, node2, random);
55+
this.delayMin = delayMin;
56+
this.delayMax = delayMax;
57+
}
58+
59+
public NetworkDelaysPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
60+
this(nodesSideOne, nodesSideTwo, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random);
61+
}
62+
63+
public NetworkDelaysPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, long delayMin, long delayMax, Random random) {
64+
super(nodesSideOne, nodesSideTwo, random);
65+
this.delayMin = delayMin;
66+
this.delayMax = delayMax;
67+
68+
}
69+
70+
@Override
71+
public synchronized void startDisrupting() {
72+
duration = new TimeValue(delayMin + random.nextInt((int) (delayMax - delayMin)));
73+
super.startDisrupting();
74+
}
75+
76+
@Override
77+
void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
78+
DiscoveryNode node2, MockTransportService transportService2) {
79+
transportService1.addUnresponsiveRule(node1, duration);
80+
transportService1.addUnresponsiveRule(node2, duration);
81+
}
82+
83+
@Override
84+
protected String getPartitionDescription() {
85+
return "network delays for [" + duration + "]";
86+
}
87+
88+
}

0 commit comments

Comments
 (0)