Skip to content

Commit 9cd69e7

Browse files
authored
recovery from snapshot should fill gaps (#27850)
When snapshotting the primary, we capture a Lucene commit at an arbitrary moment from a sequence-number perspective. This means that the commit may miss operations, leaving a gap between the local checkpoint in the commit and the maximum sequence number. When we restore, this creates a primary that "misses" operations, which currently means that the sequence number system is stuck (i.e., the local checkpoint will be stuck). To fix this, we should fill in the gaps when we restore, in a similar fashion to normal store recovery.
1 parent 26fc717 commit 9cd69e7

File tree

3 files changed

+80
-0
lines changed

3 files changed

+80
-0
lines changed

core/src/main/java/org/elasticsearch/index/shard/StoreRecovery.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,8 @@ private void restore(final IndexShard indexShard, final Repository repository, f
436436
final IndexId indexId = repository.getRepositoryData().resolveIndexId(indexName);
437437
repository.restoreShard(indexShard, restoreSource.snapshot().getSnapshotId(), restoreSource.version(), indexId, snapshotShardId, indexShard.recoveryState());
438438
indexShard.skipTranslogRecovery();
439+
assert indexShard.shardRouting.primary() : "only primary shards can recover from store";
440+
indexShard.getEngine().fillSeqNoGaps(indexShard.getPrimaryTerm());
439441
indexShard.finalizeRecovery();
440442
indexShard.postRecovery("restore done");
441443
} catch (Exception e) {

core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.elasticsearch.action.admin.cluster.storedscripts.GetStoredScriptResponse;
3737
import org.elasticsearch.action.admin.indices.flush.FlushResponse;
3838
import org.elasticsearch.action.admin.indices.settings.get.GetSettingsResponse;
39+
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
3940
import org.elasticsearch.action.admin.indices.stats.ShardStats;
4041
import org.elasticsearch.action.admin.indices.template.get.GetIndexTemplatesResponse;
4142
import org.elasticsearch.action.index.IndexRequestBuilder;
@@ -71,8 +72,10 @@
7172
import org.elasticsearch.common.unit.TimeValue;
7273
import org.elasticsearch.common.xcontent.XContentFactory;
7374
import org.elasticsearch.common.xcontent.XContentType;
75+
import org.elasticsearch.index.Index;
7476
import org.elasticsearch.index.IndexService;
7577
import org.elasticsearch.index.engine.Engine;
78+
import org.elasticsearch.index.shard.IndexShard;
7679
import org.elasticsearch.index.shard.ShardId;
7780
import org.elasticsearch.indices.IndicesService;
7881
import org.elasticsearch.indices.InvalidIndexNameException;
@@ -112,6 +115,7 @@
112115
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
113116
import static org.elasticsearch.index.IndexSettings.INDEX_REFRESH_INTERVAL_SETTING;
114117
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
118+
import static org.elasticsearch.index.shard.IndexShardTests.getEngineFromShard;
115119
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
116120
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAliasesExist;
117121
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAliasesMissing;
@@ -3072,6 +3076,73 @@ public void testGetSnapshotsFromIndexBlobOnly() throws Exception {
30723076
}
30733077
}
30743078

3079+
/**
 * Verifies that an index whose primary has a gap in its sequence numbers can be snapshotted and restored,
 * and that the sequence number statistics reported after the restore are all caught up.
 *
 * Steps: register an "fs" repository, create a one-shard/zero-replica index pinned to a single data node,
 * index 5 docs, generate one sequence number without indexing anything (the gap), index 5 more docs,
 * snapshot, delete the index, restore it, index 5 more docs, then assert that local checkpoint,
 * global checkpoint and max sequence number all equal 15.
 */
public void testSnapshottingWithMissingSequenceNumbers() {
3080+
final String repositoryName = "test-repo";
3081+
final String snapshotName = "test-snap";
3082+
final String indexName = "test-idx";
3083+
final Client client = client();
3084+
final Path repo = randomRepoPath();
3085+
3086+
logger.info("--> creating repository at {}", repo.toAbsolutePath());
3087+
assertAcked(client.admin().cluster().preparePutRepository(repositoryName)
3088+
.setType("fs").setSettings(Settings.builder()
3089+
.put("location", repo)
3090+
.put("compress", false)
3091+
.put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
3092+
logger.info("--> creating an index and indexing documents");
3093+
// pick one data node and pin the index to it so the primary shard can be looked up on that node below
final String dataNode = internalCluster().getDataNodeInstance(ClusterService.class).localNode().getName();
3094+
final Settings settings =
3095+
Settings
3096+
.builder()
3097+
.put("index.number_of_shards", 1)
3098+
.put("index.number_of_replicas", 0)
3099+
.put("index.routing.allocation.include._name", dataNode)
3100+
.build();
3101+
createIndex(indexName, settings);
3102+
ensureGreen();
3103+
// first batch: documents 0..4
for (int i = 0; i < 5; i++) {
3104+
index(indexName, "_doc", Integer.toString(i), "foo", "bar" + i);
3105+
}
3106+
3107+
final Index index = resolveIndex(indexName);
3108+
final IndexShard primary = internalCluster().getInstance(IndicesService.class, dataNode).getShardOrNull(new ShardId(index, 0));
3109+
// create a gap in the sequence numbers: a sequence number is generated here but no operation is indexed for it
3110+
getEngineFromShard(primary).seqNoService().generateSeqNo();
3111+
3112+
// second batch: documents 5..9, indexed after the gap
for (int i = 5; i < 10; i++) {
3113+
index(indexName, "_doc", Integer.toString(i), "foo", "bar" + i);
3114+
}
3115+
3116+
refresh();
3117+
3118+
logger.info("--> snapshot");
3119+
CreateSnapshotResponse createSnapshotResponse = client.admin().cluster().prepareCreateSnapshot(repositoryName, snapshotName)
3120+
.setWaitForCompletion(true).setIndices(indexName).get();
3121+
// every shard of the snapshot must have succeeded
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(), greaterThan(0));
3122+
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(),
3123+
equalTo(createSnapshotResponse.getSnapshotInfo().totalShards()));
3124+
3125+
logger.info("--> delete indices");
3126+
assertAcked(client.admin().indices().prepareDelete(indexName));
3127+
3128+
logger.info("--> restore all indices from the snapshot");
3129+
// NOTE(review): the literals below duplicate the values of repositoryName and snapshotName declared above;
// consider using the locals for consistency with the create-snapshot call.
RestoreSnapshotResponse restoreSnapshotResponse = client.admin().cluster().prepareRestoreSnapshot("test-repo", "test-snap")
3130+
.setWaitForCompletion(true).execute().actionGet();
3131+
assertThat(restoreSnapshotResponse.getRestoreInfo().totalShards(), greaterThan(0));
3132+
3133+
logger.info("--> indexing some more");
3134+
// third batch: documents 10..14, indexed into the restored index
for (int i = 10; i < 15; i++) {
3135+
index(indexName, "_doc", Integer.toString(i), "foo", "bar" + i);
3136+
}
3137+
3138+
IndicesStatsResponse stats = client().admin().indices().prepareStats(indexName).clear().get();
3139+
// the index has a single shard and no replicas, so the first entry is expected to be the primary
ShardStats shardStats = stats.getShards()[0];
3140+
assertTrue(shardStats.getShardRouting().primary());
3141+
// 15 indexed docs and one "missing" op make 16 operations; presumably sequence numbers are zero-based,
// so the last one is 15 -- TODO confirm
assertThat(shardStats.getSeqNoStats().getLocalCheckpoint(), equalTo(15L)); // 15 indexed docs and one "missing" op.
3142+
assertThat(shardStats.getSeqNoStats().getGlobalCheckpoint(), equalTo(15L));
3143+
assertThat(shardStats.getSeqNoStats().getMaxSeqNo(), equalTo(15L));
3144+
}
3145+
30753146
private void verifySnapshotInfo(final GetSnapshotsResponse response, final Map<String, List<String>> indicesPerSnapshot) {
30763147
for (SnapshotInfo snapshotInfo : response.getSnapshots()) {
30773148
final List<String> expected = snapshotInfo.indices();

test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,13 @@ public final void createIndex(String... names) {
738738
}
739739
}
740740

741+
/**
742+
* Creates an index with the given name and index settings, asserting that the create request is acknowledged.
743+
*/
744+
public final void createIndex(String name, Settings indexSettings) {
745+
assertAcked(prepareCreate(name).setSettings(indexSettings));
746+
}
747+
741748
/**
742749
* Creates a new {@link CreateIndexRequestBuilder} with the settings obtained from {@link #indexSettings()}.
743750
*/

0 commit comments

Comments
 (0)