Skip to content

Commit 2a08dd1

Browse files
authored
Introduce Lucene-based metadata persistence (#48733)
This commit introduces `LucenePersistedState` which master-eligible nodes can use to persist the cluster metadata in a Lucene index rather than in many separate files. Relates #48701
1 parent 01030ca commit 2a08dd1

File tree

16 files changed

+1644
-212
lines changed

16 files changed

+1644
-212
lines changed

build.gradle

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,8 @@ task verifyVersions {
206206
* after the backport of the backcompat code is complete.
207207
*/
208208

209-
boolean bwc_tests_enabled = true
210-
final String bwc_tests_disabled_issue = "" /* place a PR link here when committing bwc changes */
209+
boolean bwc_tests_enabled = false
210+
final String bwc_tests_disabled_issue = "https://github.com/elastic/elasticsearch/issues/48701" /* place a PR link here when committing bwc changes */
211211
if (bwc_tests_enabled == false) {
212212
if (bwc_tests_disabled_issue.isEmpty()) {
213213
throw new GradleException("bwc_tests_disabled_issue must be set when bwc_tests_enabled == false")

server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationState.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import org.elasticsearch.cluster.metadata.MetaData;
2626
import org.elasticsearch.cluster.node.DiscoveryNode;
2727

28+
import java.io.Closeable;
29+
import java.io.IOException;
2830
import java.util.Collection;
2931
import java.util.Collections;
3032
import java.util.HashMap;
@@ -432,15 +434,14 @@ public void invariant() {
432434
assert publishVotes.isEmpty() || electionWon();
433435
}
434436

435-
public void close() {
437+
public void close() throws IOException {
436438
persistedState.close();
437439
}
438440

439441
/**
440442
* Pluggable persistence layer for {@link CoordinationState}.
441-
*
442443
*/
443-
public interface PersistedState {
444+
public interface PersistedState extends Closeable {
444445

445446
/**
446447
* Returns the current term
@@ -497,7 +498,8 @@ default void markLastAcceptedStateAsCommitted() {
497498
}
498499
}
499500

500-
default void close() {}
501+
default void close() throws IOException {
502+
}
501503
}
502504

503505
/**

server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
import org.elasticsearch.transport.TransportResponse.Empty;
7373
import org.elasticsearch.transport.TransportService;
7474

75+
import java.io.IOException;
7576
import java.util.ArrayList;
7677
import java.util.Collection;
7778
import java.util.Collections;
@@ -702,7 +703,7 @@ protected void doStop() {
702703
}
703704

704705
@Override
705-
protected void doClose() {
706+
protected void doClose() throws IOException {
706707
final CoordinationState coordinationState = this.coordinationState.get();
707708
if (coordinationState != null) {
708709
// This looks like a race that might leak an unclosed CoordinationState if it's created while execution is here, but this method

server/src/main/java/org/elasticsearch/env/NodeEnvironment.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.elasticsearch.common.util.set.Sets;
5050
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
5151
import org.elasticsearch.core.internal.io.IOUtils;
52+
import org.elasticsearch.gateway.LucenePersistedStateFactory;
5253
import org.elasticsearch.gateway.MetaDataStateFormat;
5354
import org.elasticsearch.index.Index;
5455
import org.elasticsearch.index.IndexSettings;
@@ -380,15 +381,24 @@ private static boolean upgradeLegacyNodeFolders(Logger logger, Settings settings
380381

381382
// determine folders to move and check that there are no extra files/folders
382383
final Set<String> folderNames = new HashSet<>();
384+
final Set<String> expectedFolderNames = new HashSet<>(Arrays.asList(
385+
386+
// node state directory, also containing MetaDataStateFormat-based global metadata
387+
MetaDataStateFormat.STATE_DIR_NAME,
388+
389+
// Lucene-based metadata folder
390+
LucenePersistedStateFactory.METADATA_DIRECTORY_NAME,
391+
392+
// indices
393+
INDICES_FOLDER));
383394

384395
try (DirectoryStream<Path> stream = Files.newDirectoryStream(legacyNodePath.path)) {
385396
for (Path subFolderPath : stream) {
386397
final String fileName = subFolderPath.getFileName().toString();
387398
if (FileSystemUtils.isDesktopServicesStore(subFolderPath)) {
388399
// ignore
389400
} else if (FileSystemUtils.isAccessibleDirectory(subFolderPath, logger)) {
390-
if (fileName.equals(INDICES_FOLDER) == false && // indices folder
391-
fileName.equals(MetaDataStateFormat.STATE_DIR_NAME) == false) { // global metadata & node state folder
401+
if (expectedFolderNames.contains(fileName) == false) {
392402
throw new IllegalStateException("unexpected folder encountered during data folder upgrade: " +
393403
subFolderPath);
394404
}
@@ -406,7 +416,7 @@ private static boolean upgradeLegacyNodeFolders(Logger logger, Settings settings
406416
}
407417
}
408418

409-
assert Sets.difference(folderNames, Sets.newHashSet(INDICES_FOLDER, MetaDataStateFormat.STATE_DIR_NAME)).isEmpty() :
419+
assert Sets.difference(folderNames, expectedFolderNames).isEmpty() :
410420
"expected indices and/or state dir folder but was " + folderNames;
411421

412422
upgradeActions.add(() -> {

server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java

Lines changed: 46 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
2323
import org.apache.logging.log4j.LogManager;
2424
import org.apache.logging.log4j.Logger;
25-
import org.apache.logging.log4j.message.ParameterizedMessage;
2625
import org.apache.lucene.util.SetOnce;
2726
import org.elasticsearch.ElasticsearchException;
2827
import org.elasticsearch.Version;
@@ -43,10 +42,12 @@
4342
import org.elasticsearch.common.collect.Tuple;
4443
import org.elasticsearch.common.settings.Settings;
4544
import org.elasticsearch.common.unit.TimeValue;
45+
import org.elasticsearch.core.internal.io.IOUtils;
4646
import org.elasticsearch.index.Index;
4747
import org.elasticsearch.plugins.MetaDataUpgrader;
4848
import org.elasticsearch.transport.TransportService;
4949

50+
import java.io.Closeable;
5051
import java.io.IOException;
5152
import java.util.HashMap;
5253
import java.util.Map;
@@ -63,7 +64,7 @@
6364
* ClusterState#metaData()} because it might be stale or incomplete. Master-eligible nodes must perform an election to find a complete and
6465
* non-stale state, and master-ineligible nodes receive the real cluster state from the elected master after joining the cluster.
6566
*/
66-
public class GatewayMetaState {
67+
public class GatewayMetaState implements Closeable {
6768
private static final Logger logger = LogManager.getLogger(GatewayMetaState.class);
6869

6970
// Set by calling start()
@@ -81,49 +82,46 @@ public MetaData getMetaData() {
8182

8283
public void start(Settings settings, TransportService transportService, ClusterService clusterService,
8384
MetaStateService metaStateService, MetaDataIndexUpgradeService metaDataIndexUpgradeService,
84-
MetaDataUpgrader metaDataUpgrader) {
85+
MetaDataUpgrader metaDataUpgrader, LucenePersistedStateFactory lucenePersistedStateFactory) {
8586
assert persistedState.get() == null : "should only start once, but already have " + persistedState.get();
8687

87-
final Tuple<Manifest, ClusterState> manifestClusterStateTuple;
88-
try {
89-
upgradeMetaData(settings, metaStateService, metaDataIndexUpgradeService, metaDataUpgrader);
90-
manifestClusterStateTuple = loadStateAndManifest(ClusterName.CLUSTER_NAME_SETTING.get(settings), metaStateService);
91-
} catch (IOException e) {
92-
throw new ElasticsearchException("failed to load metadata", e);
88+
if (DiscoveryNode.isMasterNode(settings)) {
89+
try {
90+
persistedState.set(lucenePersistedStateFactory.loadPersistedState((version, metadata) ->
91+
prepareInitialClusterState(transportService, clusterService,
92+
ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(settings))
93+
.version(version)
94+
.metaData(upgradeMetaDataForMasterEligibleNode(metadata, metaDataIndexUpgradeService, metaDataUpgrader))
95+
.build())));
96+
} catch (IOException e) {
97+
throw new ElasticsearchException("failed to load metadata", e);
98+
}
9399
}
94100

95-
final IncrementalClusterStateWriter incrementalClusterStateWriter
96-
= new IncrementalClusterStateWriter(settings, clusterService.getClusterSettings(), metaStateService,
101+
if (DiscoveryNode.isDataNode(settings)) {
102+
final Tuple<Manifest, ClusterState> manifestClusterStateTuple;
103+
try {
104+
upgradeMetaData(settings, metaStateService, metaDataIndexUpgradeService, metaDataUpgrader);
105+
manifestClusterStateTuple = loadStateAndManifest(ClusterName.CLUSTER_NAME_SETTING.get(settings), metaStateService);
106+
} catch (IOException e) {
107+
throw new ElasticsearchException("failed to load metadata", e);
108+
}
109+
110+
final IncrementalClusterStateWriter incrementalClusterStateWriter
111+
= new IncrementalClusterStateWriter(settings, clusterService.getClusterSettings(), metaStateService,
97112
manifestClusterStateTuple.v1(),
98113
prepareInitialClusterState(transportService, clusterService, manifestClusterStateTuple.v2()),
99114
transportService.getThreadPool()::relativeTimeInMillis);
100-
if (DiscoveryNode.isMasterNode(settings) == false) {
101-
if (DiscoveryNode.isDataNode(settings)) {
102-
// Master-eligible nodes persist index metadata for all indices regardless of whether they hold any shards or not. It's
103-
// vitally important to the safety of the cluster coordination system that master-eligible nodes persist this metadata when
104-
// _accepting_ the cluster state (i.e. before it is committed). This persistence happens on the generic threadpool.
105-
//
106-
// In contrast, master-ineligible data nodes only persist the index metadata for shards that they hold. When all shards of
107-
// an index are moved off such a node the IndicesStore is responsible for removing the corresponding index directory,
108-
// including the metadata, and does so on the cluster applier thread.
109-
//
110-
// This presents a problem: if a shard is unassigned from a node and then reassigned back to it again then there is a race
111-
// between the IndicesStore deleting the index folder and the CoordinationState concurrently trying to write the updated
112-
// metadata into it. We could probably solve this with careful synchronization, but in fact there is no need. The persisted
113-
// state on master-ineligible data nodes is mostly ignored - it's only there to support dangling index imports, which is
114-
// inherently unsafe anyway. Thus we can safely delay metadata writes on master-ineligible data nodes until applying the
115-
// cluster state, which is what this does:
116-
clusterService.addLowPriorityApplier(new GatewayClusterApplier(incrementalClusterStateWriter));
117-
}
118115

119-
// Master-ineligible nodes do not need to persist the cluster state when accepting it because they are not in the voting
120-
// configuration, so it's ok if they have a stale or incomplete cluster state when restarted. We track the latest cluster state
121-
// in memory instead.
122-
persistedState.set(new InMemoryPersistedState(manifestClusterStateTuple.v1().getCurrentTerm(), manifestClusterStateTuple.v2()));
123-
} else {
124-
// Master-ineligible nodes must persist the cluster state when accepting it because they must reload the (complete, fresh)
125-
// last-accepted cluster state when restarted.
126-
persistedState.set(new GatewayPersistedState(incrementalClusterStateWriter));
116+
clusterService.addLowPriorityApplier(new GatewayClusterApplier(incrementalClusterStateWriter));
117+
118+
if (DiscoveryNode.isMasterNode(settings) == false) {
119+
persistedState.set(
120+
new InMemoryPersistedState(manifestClusterStateTuple.v1().getCurrentTerm(), manifestClusterStateTuple.v2()));
121+
}
122+
} else if (DiscoveryNode.isMasterNode(settings) == false) {
123+
persistedState.set(
124+
new InMemoryPersistedState(0L, ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(settings)).build()));
127125
}
128126
}
129127

@@ -139,6 +137,13 @@ ClusterState prepareInitialClusterState(TransportService transportService, Clust
139137
.apply(clusterState);
140138
}
141139

140+
// exposed so it can be overridden by tests
141+
MetaData upgradeMetaDataForMasterEligibleNode(MetaData metaData,
142+
MetaDataIndexUpgradeService metaDataIndexUpgradeService,
143+
MetaDataUpgrader metaDataUpgrader) {
144+
return upgradeMetaData(metaData, metaDataIndexUpgradeService, metaDataUpgrader);
145+
}
146+
142147
// exposed so it can be overridden by tests
143148
void upgradeMetaData(Settings settings, MetaStateService metaStateService, MetaDataIndexUpgradeService metaDataIndexUpgradeService,
144149
MetaDataUpgrader metaDataUpgrader) throws IOException {
@@ -252,6 +257,10 @@ private static boolean applyPluginUpgraders(ImmutableOpenMap<String, IndexTempla
252257
return false;
253258
}
254259

260+
@Override
261+
public void close() throws IOException {
262+
IOUtils.close(persistedState.get());
263+
}
255264

256265
private static class GatewayClusterApplier implements ClusterStateApplier {
257266

@@ -285,48 +294,4 @@ public void applyClusterState(ClusterChangedEvent event) {
285294

286295
}
287296

288-
private static class GatewayPersistedState implements PersistedState {
289-
290-
private final IncrementalClusterStateWriter incrementalClusterStateWriter;
291-
292-
GatewayPersistedState(IncrementalClusterStateWriter incrementalClusterStateWriter) {
293-
this.incrementalClusterStateWriter = incrementalClusterStateWriter;
294-
}
295-
296-
@Override
297-
public long getCurrentTerm() {
298-
return incrementalClusterStateWriter.getPreviousManifest().getCurrentTerm();
299-
}
300-
301-
@Override
302-
public ClusterState getLastAcceptedState() {
303-
final ClusterState previousClusterState = incrementalClusterStateWriter.getPreviousClusterState();
304-
assert previousClusterState.nodes().getLocalNode() != null : "Cluster state is not fully built yet";
305-
return previousClusterState;
306-
}
307-
308-
@Override
309-
public void setCurrentTerm(long currentTerm) {
310-
try {
311-
incrementalClusterStateWriter.setCurrentTerm(currentTerm);
312-
} catch (WriteStateException e) {
313-
logger.error(new ParameterizedMessage("Failed to set current term to {}", currentTerm), e);
314-
e.rethrowAsErrorOrUncheckedException();
315-
}
316-
}
317-
318-
@Override
319-
public void setLastAcceptedState(ClusterState clusterState) {
320-
try {
321-
incrementalClusterStateWriter.setIncrementalWrite(
322-
incrementalClusterStateWriter.getPreviousClusterState().term() == clusterState.term());
323-
incrementalClusterStateWriter.updateClusterState(clusterState);
324-
} catch (WriteStateException e) {
325-
logger.error(new ParameterizedMessage("Failed to set last accepted state with version {}", clusterState.version()), e);
326-
e.rethrowAsErrorOrUncheckedException();
327-
}
328-
}
329-
330-
}
331-
332297
}

0 commit comments

Comments
 (0)