From b0c8c7e9cfdb1b2326db5293d02ef5e8f9139055 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 24 Jan 2020 14:19:27 +0100 Subject: [PATCH 1/2] Fix Inconsistent Shard Failure Count in Failed Snapshots This fix was necessary to allow for the below test enhancement: We were not adding shard failure entries to a failed snapshot for those snapshot entries that were never attempted because the snapshot failed during the init stage and wasn't partial. This caused the never attempted snapshots to be counted towards the successful shard count which seems wrong and broke repository consistency tests. Also, this change adjusts snapshot resiliency tests to run another snapshot at the end of each test run to guarantee a correct `index.latest` blob exists after each run. Closes #47550 --- .../snapshots/SnapshotsService.java | 7 ++++++- .../snapshots/SnapshotResiliencyTests.java | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 9762beac4eff7..1403b4c58ed1d 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -1076,8 +1076,13 @@ protected void doRun() { for (ObjectObjectCursor shardStatus : entry.shards()) { ShardId shardId = shardStatus.key; ShardSnapshotStatus status = shardStatus.value; - if (status.state().failed()) { + final ShardState state = status.state(); + if (state.failed()) { shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason())); + } else if (state.completed() == false) { + shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "Not attempted [" + state + "]")); + } else { + assert state == ShardState.SUCCESS; } } final ShardGenerations shardGenerations = buildGenerations(entry, metaData); diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java index d36a31a30f59b..0f6189e9db1a7 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java @@ -213,6 +213,7 @@ import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.iterableWithSize; import static org.hamcrest.Matchers.lessThanOrEqualTo; import static org.mockito.Mockito.mock; @@ -246,8 +247,19 @@ public void verifyReposThenStopServices() { clearDisruptionsAndAwaitSync(); final StepListener cleanupResponse = new StepListener<>(); - client().admin().cluster().cleanupRepository( - new CleanupRepositoryRequest("repo"), cleanupResponse); + final StepListener createSnapshotResponse = new StepListener<>(); + // Create another snapshot and then clean up the repository to verify that the repository works correctly no matter the + // failures seen during the previous test. + client().admin().cluster().prepareCreateSnapshot("repo", "last-snapshot") + .setWaitForCompletion(true).execute(createSnapshotResponse); + continueOrDie(createSnapshotResponse, r -> { + final SnapshotInfo snapshotInfo = r.getSnapshotInfo(); + // Snapshot can fail because some tests leave indices in a red state because data nodes were stopped + assertThat(snapshotInfo.state(), either(is(SnapshotState.SUCCESS)).or(is(SnapshotState.FAILED))); + assertThat(snapshotInfo.shardFailures(), iterableWithSize(snapshotInfo.failedShards())); + assertThat(snapshotInfo.successfulShards(), is(snapshotInfo.totalShards() - snapshotInfo.failedShards())); + client().admin().cluster().cleanupRepository(new CleanupRepositoryRequest("repo"), cleanupResponse); + }); final AtomicBoolean cleanedUp = new AtomicBoolean(false); continueOrDie(cleanupResponse, r -> cleanedUp.set(true)); From 90f5cef043d582ec35c1f4b1de65d61c694c4868 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 24 Jan 2020 14:40:58 +0100 Subject: [PATCH 2/2] correct failure message --- .../main/java/org/elasticsearch/snapshots/SnapshotsService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 1403b4c58ed1d..4a1b6e84b639a 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -1080,7 +1080,7 @@ protected void doRun() { if (state.failed()) { shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason())); } else if (state.completed() == false) { - shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "Not attempted [" + state + "]")); + shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "skipped")); } else { assert state == ShardState.SUCCESS; }