@@ -1237,6 +1237,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
12371237 }, 60L , TimeUnit .SECONDS );
12381238 }
12391239
1240+ public void testDataNodeRestartAfterShardSnapshotFailure () throws Exception {
1241+ logger .info ("--> starting a master node and two data nodes" );
1242+ internalCluster ().startMasterOnlyNode ();
1243+ final List <String > dataNodes = internalCluster ().startDataOnlyNodes (2 );
1244+ logger .info ("--> creating repository" );
1245+ assertAcked (client ().admin ().cluster ().preparePutRepository ("test-repo" )
1246+ .setType ("mock" ).setSettings (Settings .builder ()
1247+ .put ("location" , randomRepoPath ())
1248+ .put ("compress" , randomBoolean ())
1249+ .put ("chunk_size" , randomIntBetween (100 , 1000 ), ByteSizeUnit .BYTES )));
1250+ assertAcked (prepareCreate ("test-idx" , 0 , Settings .builder ()
1251+ .put ("number_of_shards" , 2 ).put ("number_of_replicas" , 0 )));
1252+ ensureGreen ();
1253+ logger .info ("--> indexing some data" );
1254+ final int numdocs = randomIntBetween (50 , 100 );
1255+ IndexRequestBuilder [] builders = new IndexRequestBuilder [numdocs ];
1256+ for (int i = 0 ; i < builders .length ; i ++) {
1257+ builders [i ] = client ().prepareIndex ("test-idx" , "type1" ,
1258+ Integer .toString (i )).setSource ("field1" , "bar " + i );
1259+ }
1260+ indexRandom (true , builders );
1261+ flushAndRefresh ();
1262+ blockAllDataNodes ("test-repo" );
1263+ logger .info ("--> snapshot" );
1264+ client (internalCluster ().getMasterName ()).admin ().cluster ()
1265+ .prepareCreateSnapshot ("test-repo" , "test-snap" ).setWaitForCompletion (false ).setIndices ("test-idx" ).get ();
1266+ logger .info ("--> restarting first data node, which should cause the primary shard on it to be failed" );
1267+ internalCluster ().restartNode (dataNodes .get (0 ), InternalTestCluster .EMPTY_CALLBACK );
1268+
1269+ logger .info ("--> wait for shard snapshot of first primary to show as failed" );
1270+ assertBusy (() -> assertThat (
1271+ client ().admin ().cluster ().prepareSnapshotStatus ("test-repo" ).setSnapshots ("test-snap" ).get ().getSnapshots ()
1272+ .get (0 ).getShardsStats ().getFailedShards (), is (1 )), 60L , TimeUnit .SECONDS );
1273+
1274+ logger .info ("--> restarting second data node, which should cause the primary shard on it to be failed" );
1275+ internalCluster ().restartNode (dataNodes .get (1 ), InternalTestCluster .EMPTY_CALLBACK );
1276+
1277+ // check that snapshot completes with both failed shards being accounted for in the snapshot result
1278+ assertBusy (() -> {
1279+ GetSnapshotsResponse snapshotsStatusResponse = client ().admin ().cluster ()
1280+ .prepareGetSnapshots ("test-repo" ).setSnapshots ("test-snap" ).setIgnoreUnavailable (true ).get ();
1281+ assertEquals (1 , snapshotsStatusResponse .getSnapshots ().size ());
1282+ SnapshotInfo snapshotInfo = snapshotsStatusResponse .getSnapshots ().get (0 );
1283+ assertTrue (snapshotInfo .state ().toString (), snapshotInfo .state ().completed ());
1284+ assertThat (snapshotInfo .totalShards (), is (2 ));
1285+ assertThat (snapshotInfo .shardFailures (), hasSize (2 ));
1286+ }, 60L , TimeUnit .SECONDS );
1287+ }
1288+
12401289 public void testRetentionLeasesClearedOnRestore () throws Exception {
12411290 final String repoName = "test-repo-retention-leases" ;
12421291 assertAcked (client ().admin ().cluster ().preparePutRepository (repoName )
0 commit comments