@@ -1234,6 +1234,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
12341234 }, 60L , TimeUnit .SECONDS );
12351235 }
12361236
1237+ public void testDataNodeRestartAfterShardSnapshotFailure () throws Exception {
1238+ logger .info ("--> starting a master node and two data nodes" );
1239+ internalCluster ().startMasterOnlyNode ();
1240+ final List <String > dataNodes = internalCluster ().startDataOnlyNodes (2 );
1241+ logger .info ("--> creating repository" );
1242+ assertAcked (client ().admin ().cluster ().preparePutRepository ("test-repo" )
1243+ .setType ("mock" ).setSettings (Settings .builder ()
1244+ .put ("location" , randomRepoPath ())
1245+ .put ("compress" , randomBoolean ())
1246+ .put ("chunk_size" , randomIntBetween (100 , 1000 ), ByteSizeUnit .BYTES )));
1247+ assertAcked (prepareCreate ("test-idx" , 0 , Settings .builder ()
1248+ .put ("number_of_shards" , 2 ).put ("number_of_replicas" , 0 )));
1249+ ensureGreen ();
1250+ logger .info ("--> indexing some data" );
1251+ final int numdocs = randomIntBetween (50 , 100 );
1252+ IndexRequestBuilder [] builders = new IndexRequestBuilder [numdocs ];
1253+ for (int i = 0 ; i < builders .length ; i ++) {
1254+ builders [i ] = client ().prepareIndex ("test-idx" , "type1" ,
1255+ Integer .toString (i )).setSource ("field1" , "bar " + i );
1256+ }
1257+ indexRandom (true , builders );
1258+ flushAndRefresh ();
1259+ blockAllDataNodes ("test-repo" );
1260+ logger .info ("--> snapshot" );
1261+ client (internalCluster ().getMasterName ()).admin ().cluster ()
1262+ .prepareCreateSnapshot ("test-repo" , "test-snap" ).setWaitForCompletion (false ).setIndices ("test-idx" ).get ();
1263+ logger .info ("--> restarting first data node, which should cause the primary shard on it to be failed" );
1264+ internalCluster ().restartNode (dataNodes .get (0 ), InternalTestCluster .EMPTY_CALLBACK );
1265+
1266+ logger .info ("--> wait for shard snapshot of first primary to show as failed" );
1267+ assertBusy (() -> assertThat (
1268+ client ().admin ().cluster ().prepareSnapshotStatus ("test-repo" ).setSnapshots ("test-snap" ).get ().getSnapshots ()
1269+ .get (0 ).getShardsStats ().getFailedShards (), is (1 )), 60L , TimeUnit .SECONDS );
1270+
1271+ logger .info ("--> restarting second data node, which should cause the primary shard on it to be failed" );
1272+ internalCluster ().restartNode (dataNodes .get (1 ), InternalTestCluster .EMPTY_CALLBACK );
1273+
1274+ // check that snapshot completes with both failed shards being accounted for in the snapshot result
1275+ assertBusy (() -> {
1276+ GetSnapshotsResponse snapshotsStatusResponse = client ().admin ().cluster ()
1277+ .prepareGetSnapshots ("test-repo" ).setSnapshots ("test-snap" ).setIgnoreUnavailable (true ).get ();
1278+ assertEquals (1 , snapshotsStatusResponse .getSnapshots ("test-repo" ).size ());
1279+ SnapshotInfo snapshotInfo = snapshotsStatusResponse .getSnapshots ("test-repo" ).get (0 );
1280+ assertTrue (snapshotInfo .state ().toString (), snapshotInfo .state ().completed ());
1281+ assertThat (snapshotInfo .totalShards (), is (2 ));
1282+ assertThat (snapshotInfo .shardFailures (), hasSize (2 ));
1283+ }, 60L , TimeUnit .SECONDS );
1284+ }
1285+
12371286 public void testRetentionLeasesClearedOnRestore () throws Exception {
12381287 final String repoName = "test-repo-retention-leases" ;
12391288 assertAcked (client ().admin ().cluster ().preparePutRepository (repoName )
0 commit comments