From c20b138349f5438047e8f63aa552344b185c2190 Mon Sep 17 00:00:00 2001
From: Armin Braun
Date: Fri, 24 Jan 2020 16:29:03 +0100
Subject: [PATCH] Fix Inconsistent Shard Failure Count in Failed Snapshots
 (#51416)

* Fix Inconsistent Shard Failure Count in Failed Snapshots

This fix was necessary to allow for the below test enhancement:
We were not adding shard failure entries to a failed snapshot for those
shard snapshot entries that were never attempted because the snapshot
failed during the init stage and wasn't partial. This caused shards that
were never attempted to be counted towards the successful shard count,
which seems wrong and broke repository consistency tests.

Also, this change adjusts snapshot resiliency tests to run another
snapshot at the end of each test run to guarantee that a correct
`index.latest` blob exists after each run.

Closes #47550
---
 .../snapshots/SnapshotsService.java           |  7 ++++++-
 .../snapshots/SnapshotResiliencyTests.java    | 19 ++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)
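Reviewer note, not part of the commit: below is a minimal, self-contained
sketch of the accounting rule the SnapshotsService hunk enforces, using a
simplified stand-in for the real ShardState enum and failure list (all names
here are hypothetical). Every shard entry of a finished snapshot must land in
exactly one bucket, so that successfulShards == totalShards - failedShards:

    import java.util.ArrayList;
    import java.util.List;

    public class ShardAccountingSketch {

        // Simplified stand-in for SnapshotsInProgress.ShardState
        enum State {
            INIT, STARTED, SUCCESS, FAILED, ABORTED, MISSING;

            boolean failed() {
                return this == FAILED || this == ABORTED || this == MISSING;
            }

            boolean completed() {
                return this == SUCCESS || failed();
            }
        }

        static List<String> shardFailures(List<State> shards) {
            final List<String> failures = new ArrayList<>();
            for (State state : shards) {
                if (state.failed()) {
                    failures.add("failed");   // attempted and failed: counted before this fix too
                } else if (state.completed() == false) {
                    failures.add("skipped");  // never attempted: the fix counts these as failures
                }
                // else: SUCCESS, contributes to the successful shard count
            }
            return failures;
        }

        public static void main(String[] args) {
            // One shard stuck in INIT because the snapshot failed before attempting it
            List<State> shards = List.of(State.SUCCESS, State.FAILED, State.INIT);
            List<String> failures = shardFailures(shards);
            // 3 total - 2 failures = 1 successful; holds only because INIT became "skipped"
            System.out.println(shards.size() - failures.size()); // prints 1
        }
    }

Without the "skipped" branch, the INIT shard would fall through silently and
be counted as successful, which is exactly the inconsistency described above.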
diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
index 8cc7aa42347ab..4a66330316a4a 100644
--- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
+++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -1084,8 +1084,13 @@ protected void doRun() {
                 for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
                     ShardId shardId = shardStatus.key;
                     ShardSnapshotStatus status = shardStatus.value;
-                    if (status.state().failed()) {
+                    final ShardState state = status.state();
+                    if (state.failed()) {
                         shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason()));
+                    } else if (state.completed() == false) {
+                        shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "skipped"));
+                    } else {
+                        assert state == ShardState.SUCCESS;
                     }
                 }
                 final ShardGenerations shardGenerations = buildGenerations(entry, metaData);
diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java
index 575cc8b44d3c8..835548bc694a6 100644
--- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java
+++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java
@@ -148,8 +148,6 @@
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.seqno.GlobalCheckpointSyncAction;
 import org.elasticsearch.index.seqno.RetentionLeaseSyncer;
-import org.elasticsearch.index.seqno.RetentionLeaseSyncer;
-import org.elasticsearch.index.seqno.RetentionLeaseSyncer;
 import org.elasticsearch.index.shard.PrimaryReplicaSyncer;
 import org.elasticsearch.indices.IndicesModule;
 import org.elasticsearch.indices.IndicesService;
@@ -216,6 +214,7 @@
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.iterableWithSize;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;

@@ -249,8 +248,19 @@ public void verifyReposThenStopServices() {
         clearDisruptionsAndAwaitSync();

         final StepListener<CleanupRepositoryResponse> cleanupResponse = new StepListener<>();
-        client().admin().cluster().cleanupRepository(
-            new CleanupRepositoryRequest("repo"), cleanupResponse);
+        final StepListener<CreateSnapshotResponse> createSnapshotResponse = new StepListener<>();
+        // Create another snapshot and then clean up the repository to verify that the repository works correctly no matter the
+        // failures seen during the previous test.
+        client().admin().cluster().prepareCreateSnapshot("repo", "last-snapshot")
+            .setWaitForCompletion(true).execute(createSnapshotResponse);
+        continueOrDie(createSnapshotResponse, r -> {
+            final SnapshotInfo snapshotInfo = r.getSnapshotInfo();
+            // Snapshot can fail because some tests leave indices in a red state after data nodes were stopped
+            assertThat(snapshotInfo.state(), either(is(SnapshotState.SUCCESS)).or(is(SnapshotState.FAILED)));
+            assertThat(snapshotInfo.shardFailures(), iterableWithSize(snapshotInfo.failedShards()));
+            assertThat(snapshotInfo.successfulShards(), is(snapshotInfo.totalShards() - snapshotInfo.failedShards()));
+            client().admin().cluster().cleanupRepository(new CleanupRepositoryRequest("repo"), cleanupResponse);
+        });
         final AtomicBoolean cleanedUp = new AtomicBoolean(false);
         continueOrDie(cleanupResponse, r -> cleanedUp.set(true));

@@ -340,7 +350,6 @@ public void testSuccessfulSnapshotAndRestore() {
         assertEquals(0, snapshotInfo.failedShards());
     }

-    @AwaitsFix(bugUrl="https://github.com/elastic/elasticsearch/issues/54459")
     public void testSnapshotWithNodeDisconnects() {
         final int dataNodes = randomIntBetween(2, 10);
         final int masterNodes = randomFrom(1, 3, 5);
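Reviewer note, not part of the commit: the reason one final "last-snapshot"
guarantees a usable `index.latest` blob is that a blobstore repository
rewrites that blob on every snapshot finalization (a FAILED snapshot is still
finalized into the repository); by convention the blob holds a single
big-endian long naming the current RepositoryData generation N, i.e. the
`index-N` blob. A hedged sketch of resolving it, assuming that layout
(hypothetical helper, not an Elasticsearch API):

    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.ByteBuffer;

    public class IndexLatestSketch {

        // Reads the repository generation N out of an index.latest blob,
        // assuming the blob body is exactly one big-endian 64-bit value.
        static long latestGeneration(InputStream indexLatestBlob) throws IOException {
            return ByteBuffer.wrap(indexLatestBlob.readAllBytes()).getLong();
        }
    }

Some resiliency tests deliberately interrupt snapshot finalization, which can
leave this pointer behind the latest generation; ending every run with one
more snapshot is what makes the repository consistency checks above reliable.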