Skip to content

Commit 7e4dfd8

Browse files
committed
recovery from snapshot should fill gaps (#27850)
When snapshotting the primary we capture a Lucene commit at an arbitrary moment from a sequence number perspective. This means that the commit may miss operations, leaving a gap between the local checkpoint in the commit and the maximum sequence number. When we restore, this creates a primary that "misses" operations, which currently leaves the sequence number system stuck (i.e., the local checkpoint stays stuck). To fix this we should fill in the gaps when we restore, in a similar fashion to normal store recovery.
1 parent fd96434 commit 7e4dfd8

File tree

3 files changed

+80
-1
lines changed

3 files changed

+80
-1
lines changed

core/src/main/java/org/elasticsearch/index/shard/StoreRecovery.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,8 @@ private void restore(final IndexShard indexShard, final Repository repository, f
436436
final IndexId indexId = repository.getRepositoryData().resolveIndexId(indexName);
437437
repository.restoreShard(indexShard, restoreSource.snapshot().getSnapshotId(), restoreSource.version(), indexId, snapshotShardId, indexShard.recoveryState());
438438
indexShard.skipTranslogRecovery();
439+
assert indexShard.shardRouting.primary() : "only primary shards can recover from store";
440+
indexShard.getEngine().fillSeqNoGaps(indexShard.getPrimaryTerm());
439441
indexShard.finalizeRecovery();
440442
indexShard.postRecovery("restore done");
441443
} catch (Exception e) {

core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.elasticsearch.action.admin.cluster.storedscripts.GetStoredScriptResponse;
3737
import org.elasticsearch.action.admin.indices.flush.FlushResponse;
3838
import org.elasticsearch.action.admin.indices.settings.get.GetSettingsResponse;
39+
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
3940
import org.elasticsearch.action.admin.indices.stats.ShardStats;
4041
import org.elasticsearch.action.admin.indices.template.get.GetIndexTemplatesResponse;
4142
import org.elasticsearch.action.index.IndexRequestBuilder;
@@ -66,8 +67,10 @@
6667
import org.elasticsearch.common.unit.TimeValue;
6768
import org.elasticsearch.common.xcontent.XContentFactory;
6869
import org.elasticsearch.common.xcontent.XContentType;
70+
import org.elasticsearch.index.Index;
6971
import org.elasticsearch.index.IndexService;
7072
import org.elasticsearch.index.engine.Engine;
73+
import org.elasticsearch.index.shard.IndexShard;
7174
import org.elasticsearch.index.shard.ShardId;
7275
import org.elasticsearch.indices.IndicesService;
7376
import org.elasticsearch.indices.InvalidIndexNameException;
@@ -104,8 +107,8 @@
104107
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
105108
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
106109
import static org.elasticsearch.index.IndexSettings.INDEX_REFRESH_INTERVAL_SETTING;
107-
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
108110
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
111+
import static org.elasticsearch.index.shard.IndexShardTests.getEngineFromShard;
109112
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
110113
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAliasesExist;
111114
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAliasesMissing;
@@ -2881,6 +2884,73 @@ public void testGetSnapshotsFromIndexBlobOnly() throws Exception {
28812884
}
28822885
}
28832886

2887+
/**
* Checks that a shard restored from a snapshot taken while its sequence numbers contained a gap does not end up
* with a stuck checkpoint: after the restore and some further indexing, the local checkpoint, the global
* checkpoint and the maximum sequence number reported by the shard stats must all equal 15.
*/
public void testSnapshottingWithMissingSequenceNumbers() {
2888+
final String repositoryName = "test-repo";
2889+
final String snapshotName = "test-snap";
2890+
final String indexName = "test-idx";
2891+
final Client client = client();
2892+
final Path repo = randomRepoPath();
2893+
2894+
logger.info("--> creating repository at {}", repo.toAbsolutePath());
2895+
assertAcked(client.admin().cluster().preparePutRepository(repositoryName)
2896+
.setType("fs").setSettings(Settings.builder()
2897+
.put("location", repo)
2898+
.put("compress", false)
2899+
.put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
2900+
logger.info("--> creating an index and indexing documents");
2901+
// one shard, no replicas, pinned to a single data node so that the primary can be looked up on that node below
final String dataNode = internalCluster().getDataNodeInstance(ClusterService.class).localNode().getName();
2902+
final Settings settings =
2903+
Settings
2904+
.builder()
2905+
.put("index.number_of_shards", 1)
2906+
.put("index.number_of_replicas", 0)
2907+
.put("index.routing.allocation.include._name", dataNode)
2908+
.build();
2909+
createIndex(indexName, settings);
2910+
ensureGreen();
2911+
for (int i = 0; i < 5; i++) {
2912+
index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
2913+
}
2914+
2915+
final Index index = resolveIndex(indexName);
2916+
final IndexShard primary = internalCluster().getInstance(IndicesService.class, dataNode).getShardOrNull(new ShardId(index, 0));
2917+
// create a gap in the sequence numbers: a sequence number is generated here but no operation is indexed for it
2918+
getEngineFromShard(primary).seqNoService().generateSeqNo();
2919+
2920+
for (int i = 5; i < 10; i++) {
2921+
index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
2922+
}
2923+
2924+
refresh();
2925+
2926+
logger.info("--> snapshot");
2927+
CreateSnapshotResponse createSnapshotResponse = client.admin().cluster().prepareCreateSnapshot(repositoryName, snapshotName)
2928+
.setWaitForCompletion(true).setIndices(indexName).get();
2929+
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(), greaterThan(0));
2930+
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(),
2931+
equalTo(createSnapshotResponse.getSnapshotInfo().totalShards()));
2932+
2933+
logger.info("--> delete indices");
2934+
assertAcked(client.admin().indices().prepareDelete(indexName));
2935+
2936+
logger.info("--> restore all indices from the snapshot");
2937+
RestoreSnapshotResponse restoreSnapshotResponse = client.admin().cluster().prepareRestoreSnapshot("test-repo", "test-snap")
2938+
.setWaitForCompletion(true).execute().actionGet();
2939+
assertThat(restoreSnapshotResponse.getRestoreInfo().totalShards(), greaterThan(0));
2940+
2941+
// index after the restore so that the checks below observe the checkpoints of the restored primary moving forward
logger.info("--> indexing some more");
2942+
for (int i = 10; i < 15; i++) {
2943+
index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
2944+
}
2945+
2946+
IndicesStatsResponse stats = client().admin().indices().prepareStats(indexName).clear().get();
2947+
ShardStats shardStats = stats.getShards()[0];
2948+
assertTrue(shardStats.getShardRouting().primary());
2949+
assertThat(shardStats.getSeqNoStats().getLocalCheckpoint(), equalTo(15L)); // 15 indexed docs plus the one "missing" op make 16 operations (presumably numbered from 0, hence 15)
2950+
assertThat(shardStats.getSeqNoStats().getGlobalCheckpoint(), equalTo(15L));
2951+
assertThat(shardStats.getSeqNoStats().getMaxSeqNo(), equalTo(15L));
2952+
}
2953+
28842954
private void verifySnapshotInfo(final GetSnapshotsResponse response, final Map<String, List<String>> indicesPerSnapshot) {
28852955
for (SnapshotInfo snapshotInfo : response.getSnapshots()) {
28862956
final List<String> expected = snapshotInfo.indices();

test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,13 @@ public final void createIndex(String... names) {
737737
}
738738
}
739739

740+
/**
741+
* Creates an index with the given name and settings, asserting that the create-index request is acknowledged.
742+
*
* @param name          the name of the index to create
* @param indexSettings the settings passed to the create-index request
*/
743+
public final void createIndex(String name, Settings indexSettings) {
744+
assertAcked(prepareCreate(name).setSettings(indexSettings));
745+
}
746+
740747
/**
741748
* Creates a new {@link CreateIndexRequestBuilder} with the settings obtained from {@link #indexSettings()}.
742749
*/

0 commit comments

Comments
 (0)