|
23 | 23 | import org.apache.logging.log4j.util.Supplier; |
24 | 24 | import org.apache.lucene.index.CorruptIndexException; |
25 | 25 | import org.elasticsearch.ElasticsearchException; |
| 26 | +import org.elasticsearch.action.ActionFuture; |
26 | 27 | import org.elasticsearch.action.DocWriteResponse; |
| 28 | +import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse; |
| 29 | +import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse; |
27 | 30 | import org.elasticsearch.action.get.GetResponse; |
28 | 31 | import org.elasticsearch.action.index.IndexRequestBuilder; |
29 | 32 | import org.elasticsearch.action.index.IndexResponse; |
30 | 33 | import org.elasticsearch.client.Client; |
| 34 | +import org.elasticsearch.cluster.ClusterChangedEvent; |
31 | 35 | import org.elasticsearch.cluster.ClusterState; |
| 36 | +import org.elasticsearch.cluster.ClusterStateListener; |
32 | 37 | import org.elasticsearch.cluster.ClusterStateUpdateTask; |
| 38 | +import org.elasticsearch.cluster.SnapshotsInProgress; |
33 | 39 | import org.elasticsearch.cluster.action.shard.ShardStateAction; |
34 | 40 | import org.elasticsearch.cluster.block.ClusterBlock; |
35 | 41 | import org.elasticsearch.cluster.block.ClusterBlockLevel; |
|
45 | 51 | import org.elasticsearch.common.Strings; |
46 | 52 | import org.elasticsearch.common.collect.Tuple; |
47 | 53 | import org.elasticsearch.common.settings.Settings; |
| 54 | +import org.elasticsearch.common.unit.ByteSizeUnit; |
48 | 55 | import org.elasticsearch.common.unit.TimeValue; |
49 | 56 | import org.elasticsearch.common.xcontent.XContentType; |
50 | 57 | import org.elasticsearch.discovery.zen.ElectMasterService; |
|
58 | 65 | import org.elasticsearch.indices.store.IndicesStoreIntegrationIT; |
59 | 66 | import org.elasticsearch.monitor.jvm.HotThreads; |
60 | 67 | import org.elasticsearch.plugins.Plugin; |
| 68 | +import org.elasticsearch.snapshots.SnapshotInfo; |
| 69 | +import org.elasticsearch.snapshots.SnapshotMissingException; |
| 70 | +import org.elasticsearch.snapshots.SnapshotState; |
61 | 71 | import org.elasticsearch.test.ESIntegTestCase; |
62 | 72 | import org.elasticsearch.test.ESIntegTestCase.ClusterScope; |
63 | 73 | import org.elasticsearch.test.ESIntegTestCase.Scope; |
|
113 | 123 | import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; |
114 | 124 | import static org.hamcrest.Matchers.equalTo; |
115 | 125 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; |
| 126 | +import static org.hamcrest.Matchers.instanceOf; |
116 | 127 | import static org.hamcrest.Matchers.is; |
117 | 128 | import static org.hamcrest.Matchers.not; |
118 | 129 | import static org.hamcrest.Matchers.nullValue; |
@@ -1256,6 +1267,124 @@ public void testElectMasterWithLatestVersion() throws Exception { |
1256 | 1267 |
|
1257 | 1268 | } |
1258 | 1269 |
|
    /**
     * Verifies that a snapshot survives (or is cleanly removed) when the master node that
     * initialized it is partitioned away from the rest of the cluster while the snapshot is
     * still in the {@code INIT} state, forcing a master failover mid-initialization.
     * <p>
     * Expected outcome: the remaining nodes elect a new master, the snapshot either completes
     * successfully or disappears from the repository, and the request on the isolated master
     * fails with {@code MasterNotDiscoveredException} caused by a failed cluster-state commit.
     */
    public void testDisruptionOnSnapshotInitialization() throws Exception {
        final Settings settings = Settings.builder()
            .put(DEFAULT_SETTINGS)
            .put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "30s") // wait till cluster state is committed
            .build();
        final String idxName = "test";
        // 4 nodes total (3 master-eligible + 1 data), minimum_master_nodes = 2 so the
        // majority side can elect a new master once the old one is isolated.
        configureCluster(settings, 4, null, 2);
        final List<String> allMasterEligibleNodes = internalCluster().startMasterOnlyNodes(3);
        final String dataNode = internalCluster().startDataOnlyNode();
        ensureStableCluster(4);

        createRandomIndex(idxName);

        logger.info("--> creating repository");
        assertAcked(client().admin().cluster().preparePutRepository("test-repo")
            .setType("fs").setSettings(Settings.builder()
                .put("location", randomRepoPath())
                .put("compress", randomBoolean())
                .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));

        // Writing an incompatible snapshot can cause this test to fail due to a race condition in repo
        // initialization by the current master and the former master. It does not cause any issues in a
        // real-life scenario, but it might make this test fail. Complete an initial snapshot up front to
        // prevent these failures.
        logger.info("--> initializing the repository");
        assertEquals(SnapshotState.SUCCESS, client().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap-1")
            .setWaitForCompletion(true).setIncludeGlobalState(true).setIndices().get().getSnapshotInfo().state());

        // Partition layout: the current master alone on one side, everything else
        // (the other two master-eligible nodes plus the data node) on the other.
        final String masterNode1 = internalCluster().getMasterName();
        Set<String> otherNodes = new HashSet<>();
        otherNodes.addAll(allMasterEligibleNodes);
        otherNodes.remove(masterNode1);
        otherNodes.add(dataNode);

        NetworkDisruption networkDisruption =
            new NetworkDisruption(new NetworkDisruption.TwoPartitions(Collections.singleton(masterNode1), otherNodes),
                new NetworkDisruption.NetworkUnresponsive());
        internalCluster().setDisruptionScheme(networkDisruption);

        // Register the listener BEFORE starting the snapshot so the INIT state cannot be missed.
        ClusterService clusterService = internalCluster().clusterService(masterNode1);
        CountDownLatch disruptionStarted = new CountDownLatch(1);
        clusterService.addListener(new ClusterStateListener() {
            @Override
            public void clusterChanged(ClusterChangedEvent event) {
                SnapshotsInProgress snapshots = event.state().custom(SnapshotsInProgress.TYPE);
                if (snapshots != null && snapshots.entries().size() > 0) {
                    if (snapshots.entries().get(0).state() == SnapshotsInProgress.State.INIT) {
                        // The snapshot started; we can begin the disruption so the INIT state
                        // arrives at another master node but never fully commits from this one.
                        logger.info("--> starting disruption");
                        networkDisruption.startDisrupting();
                        clusterService.removeListener(this);
                        disruptionStarted.countDown();
                    }
                }
            }
        });

        logger.info("--> starting snapshot");
        // Fire the snapshot request at the soon-to-be-isolated master; do not wait for completion,
        // the future is checked at the end after the partition heals.
        ActionFuture<CreateSnapshotResponse> future = client(masterNode1).admin().cluster()
            .prepareCreateSnapshot("test-repo", "test-snap-2").setWaitForCompletion(false).setIndices(idxName).execute();

        logger.info("--> waiting for disruption to start");
        assertTrue(disruptionStarted.await(1, TimeUnit.MINUTES));

        // Observe progress from the majority side (data node, local cluster state) since the
        // old master is unreachable; the snapshot entry must eventually leave the cluster state.
        logger.info("--> wait until the snapshot is done");
        assertBusy(() -> {
            SnapshotsInProgress snapshots = dataNodeClient().admin().cluster().prepareState().setLocal(true).get().getState()
                .custom(SnapshotsInProgress.TYPE);
            if (snapshots != null && snapshots.entries().size() > 0) {
                logger.info("Current snapshot state [{}]", snapshots.entries().get(0).state());
                fail("Snapshot is still running");
            } else {
                logger.info("Snapshot is no longer in the cluster state");
            }
        }, 1, TimeUnit.MINUTES);

        // Either outcome is acceptable: the new master completed the snapshot successfully,
        // or it cleaned it up entirely (SnapshotMissingException). A partial snapshot is a failure.
        logger.info("--> verify that snapshot was successful or no longer exist");
        assertBusy(() -> {
            try {
                GetSnapshotsResponse snapshotsStatusResponse = dataNodeClient().admin().cluster().prepareGetSnapshots("test-repo")
                    .setSnapshots("test-snap-2").get();
                SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
                assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
                assertEquals(snapshotInfo.totalShards(), snapshotInfo.successfulShards());
                assertEquals(0, snapshotInfo.failedShards());
                logger.info("--> done verifying");
            } catch (SnapshotMissingException exception) {
                logger.info("--> snapshot doesn't exist");
            }
        }, 1, TimeUnit.MINUTES);

        logger.info("--> stopping disrupting");
        networkDisruption.stopDisrupting();
        ensureStableCluster(4, masterNode1);
        logger.info("--> done");

        // The isolated master's request may have failed because it could not commit the
        // snapshot cluster state; if so, verify the exact failure chain.
        try {
            future.get();
        } catch (Exception ex) {
            logger.info("--> got exception from hanged master", ex);
            Throwable cause = ex.getCause();
            assertThat(cause, instanceOf(MasterNotDiscoveredException.class));
            cause = cause.getCause();
            assertThat(cause, instanceOf(Discovery.FailedToCommitClusterStateException.class));
        }
    }
| 1375 | + |
| 1376 | + private void createRandomIndex(String idxName) throws ExecutionException, InterruptedException { |
| 1377 | + assertAcked(prepareCreate(idxName, 0, Settings.builder().put("number_of_shards", between(1, 20)) |
| 1378 | + .put("number_of_replicas", 0))); |
| 1379 | + logger.info("--> indexing some data"); |
| 1380 | + final int numdocs = randomIntBetween(10, 100); |
| 1381 | + IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs]; |
| 1382 | + for (int i = 0; i < builders.length; i++) { |
| 1383 | + builders[i] = client().prepareIndex(idxName, "type1", Integer.toString(i)).setSource("field1", "bar " + i); |
| 1384 | + } |
| 1385 | + indexRandom(true, builders); |
| 1386 | + } |
| 1387 | + |
1259 | 1388 | protected NetworkDisruption addRandomDisruptionType(TwoPartitions partitions) { |
1260 | 1389 | final NetworkLinkDisruptionType disruptionType; |
1261 | 1390 | if (randomBoolean()) { |
|
0 commit comments