Skip to content

Commit 65366e0

Browse files
authored
Write CS asynchronously on data-only nodes (#50782)
Writes cluster states out asynchronously on data-only nodes. The main reasons for writing out the cluster state at all are so that data-only nodes can snap into a cluster, so that they can do a bit of bootstrap validation, and so that the shard recovery tools work. Cluster states that are written asynchronously have their voting configuration replaced with a non-existent configuration so that these nodes cannot mistakenly become master even if their node role is changed back and forth. Relates #48701
1 parent df40aec commit 65366e0

File tree

4 files changed

+269
-4
lines changed

4 files changed

+269
-4
lines changed

server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java

Lines changed: 162 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,14 @@
2020
package org.elasticsearch.gateway;
2121

2222
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
23+
import org.apache.logging.log4j.LogManager;
24+
import org.apache.logging.log4j.Logger;
2325
import org.apache.lucene.util.SetOnce;
2426
import org.elasticsearch.ElasticsearchException;
2527
import org.elasticsearch.Version;
2628
import org.elasticsearch.cluster.ClusterName;
2729
import org.elasticsearch.cluster.ClusterState;
30+
import org.elasticsearch.cluster.coordination.CoordinationMetaData;
2831
import org.elasticsearch.cluster.coordination.CoordinationState.PersistedState;
2932
import org.elasticsearch.cluster.coordination.InMemoryPersistedState;
3033
import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -37,21 +40,32 @@
3740
import org.elasticsearch.common.collect.ImmutableOpenMap;
3841
import org.elasticsearch.common.collect.Tuple;
3942
import org.elasticsearch.common.settings.Settings;
43+
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
44+
import org.elasticsearch.common.util.concurrent.EsExecutors;
45+
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
46+
import org.elasticsearch.common.util.concurrent.EsThreadPoolExecutor;
4047
import org.elasticsearch.core.internal.io.IOUtils;
4148
import org.elasticsearch.env.NodeMetaData;
49+
import org.elasticsearch.node.Node;
4250
import org.elasticsearch.plugins.MetaDataUpgrader;
51+
import org.elasticsearch.threadpool.ThreadPool;
4352
import org.elasticsearch.transport.TransportService;
4453

4554
import java.io.Closeable;
4655
import java.io.IOException;
4756
import java.io.UncheckedIOException;
57+
import java.util.Collections;
4858
import java.util.HashMap;
4959
import java.util.Map;
60+
import java.util.Objects;
61+
import java.util.concurrent.TimeUnit;
5062
import java.util.function.BiConsumer;
5163
import java.util.function.Consumer;
5264
import java.util.function.Function;
5365
import java.util.function.UnaryOperator;
5466

67+
import static org.elasticsearch.common.util.concurrent.EsExecutors.daemonThreadFactory;
68+
5569
/**
5670
* Loads (and maybe upgrades) cluster metadata at startup, and persistently stores cluster metadata for future restarts.
5771
*
@@ -100,16 +114,20 @@ public void start(Settings settings, TransportService transportService, ClusterS
100114
}
101115

102116
final PersistedClusterStateService.Writer persistenceWriter = persistedClusterStateService.createWriter();
103-
final LucenePersistedState lucenePersistedState;
117+
final PersistedState persistedState;
104118
boolean success = false;
105119
try {
106120
final ClusterState clusterState = prepareInitialClusterState(transportService, clusterService,
107121
ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(settings))
108122
.version(lastAcceptedVersion)
109123
.metaData(upgradeMetaDataForNode(metaData, metaDataIndexUpgradeService, metaDataUpgrader))
110124
.build());
111-
lucenePersistedState = new LucenePersistedState(
112-
persistenceWriter, currentTerm, clusterState);
125+
if (DiscoveryNode.isMasterNode(settings)) {
126+
persistedState = new LucenePersistedState(persistenceWriter, currentTerm, clusterState);
127+
} else {
128+
persistedState = new AsyncLucenePersistedState(settings, transportService.getThreadPool(),
129+
new LucenePersistedState(persistenceWriter, currentTerm, clusterState));
130+
}
113131
if (DiscoveryNode.isDataNode(settings)) {
114132
metaStateService.unreferenceAll(); // unreference legacy files (only keep them for dangling indices functionality)
115133
} else {
@@ -125,7 +143,7 @@ public void start(Settings settings, TransportService transportService, ClusterS
125143
}
126144
}
127145

128-
persistedState.set(lucenePersistedState);
146+
this.persistedState.set(persistedState);
129147
} catch (IOException e) {
130148
throw new ElasticsearchException("failed to load metadata", e);
131149
}
@@ -227,6 +245,146 @@ public void close() throws IOException {
227245
IOUtils.close(persistedState.get());
228246
}
229247

248+
// visible for testing
249+
public boolean allPendingAsyncStatesWritten() {
250+
final PersistedState ps = persistedState.get();
251+
if (ps instanceof AsyncLucenePersistedState) {
252+
return ((AsyncLucenePersistedState) ps).allPendingAsyncStatesWritten();
253+
} else {
254+
return true;
255+
}
256+
}
257+
258+
static class AsyncLucenePersistedState extends InMemoryPersistedState {
259+
260+
private static final Logger logger = LogManager.getLogger(AsyncLucenePersistedState.class);
261+
262+
static final String THREAD_NAME = "AsyncLucenePersistedState#updateTask";
263+
264+
private final EsThreadPoolExecutor threadPoolExecutor;
265+
private final PersistedState persistedState;
266+
267+
boolean newCurrentTermQueued = false;
268+
boolean newStateQueued = false;
269+
270+
private final Object mutex = new Object();
271+
272+
AsyncLucenePersistedState(Settings settings, ThreadPool threadPool, PersistedState persistedState) {
273+
super(persistedState.getCurrentTerm(), persistedState.getLastAcceptedState());
274+
final String nodeName = Objects.requireNonNull(Node.NODE_NAME_SETTING.get(settings));
275+
threadPoolExecutor = EsExecutors.newFixed(
276+
nodeName + "/" + THREAD_NAME,
277+
1, 1,
278+
daemonThreadFactory(nodeName, THREAD_NAME),
279+
threadPool.getThreadContext());
280+
this.persistedState = persistedState;
281+
}
282+
283+
@Override
284+
public void setCurrentTerm(long currentTerm) {
285+
synchronized (mutex) {
286+
super.setCurrentTerm(currentTerm);
287+
if (newCurrentTermQueued) {
288+
logger.trace("term update already queued (setting term to {})", currentTerm);
289+
} else {
290+
logger.trace("queuing term update (setting term to {})", currentTerm);
291+
newCurrentTermQueued = true;
292+
scheduleUpdate();
293+
}
294+
}
295+
}
296+
297+
@Override
298+
public void setLastAcceptedState(ClusterState clusterState) {
299+
synchronized (mutex) {
300+
super.setLastAcceptedState(clusterState);
301+
if (newStateQueued) {
302+
logger.trace("cluster state update already queued (setting cluster state to {})", clusterState.version());
303+
} else {
304+
logger.trace("queuing cluster state update (setting cluster state to {})", clusterState.version());
305+
newStateQueued = true;
306+
scheduleUpdate();
307+
}
308+
}
309+
}
310+
311+
private void scheduleUpdate() {
312+
assert Thread.holdsLock(mutex);
313+
try {
314+
threadPoolExecutor.execute(new AbstractRunnable() {
315+
316+
@Override
317+
public void onFailure(Exception e) {
318+
logger.error("Exception occurred when storing new meta data", e);
319+
}
320+
321+
@Override
322+
protected void doRun() {
323+
final Long term;
324+
final ClusterState clusterState;
325+
synchronized (mutex) {
326+
if (newCurrentTermQueued) {
327+
term = getCurrentTerm();
328+
newCurrentTermQueued = false;
329+
} else {
330+
term = null;
331+
}
332+
if (newStateQueued) {
333+
clusterState = getLastAcceptedState();
334+
newStateQueued = false;
335+
} else {
336+
clusterState = null;
337+
}
338+
}
339+
// write current term before last accepted state so that it is never below term in last accepted state
340+
if (term != null) {
341+
persistedState.setCurrentTerm(term);
342+
}
343+
if (clusterState != null) {
344+
persistedState.setLastAcceptedState(resetVotingConfiguration(clusterState));
345+
}
346+
}
347+
});
348+
} catch (EsRejectedExecutionException e) {
349+
// ignore cases where we are shutting down..., there is really nothing interesting to be done here...
350+
if (threadPoolExecutor.isShutdown() == false) {
351+
assert false : "only expect rejections when shutting down";
352+
throw e;
353+
}
354+
}
355+
}
356+
357+
static final CoordinationMetaData.VotingConfiguration staleStateConfiguration =
358+
new CoordinationMetaData.VotingConfiguration(Collections.singleton("STALE_STATE_CONFIG"));
359+
360+
static ClusterState resetVotingConfiguration(ClusterState clusterState) {
361+
CoordinationMetaData newCoordinationMetaData = CoordinationMetaData.builder(clusterState.coordinationMetaData())
362+
.lastAcceptedConfiguration(staleStateConfiguration)
363+
.lastCommittedConfiguration(staleStateConfiguration)
364+
.build();
365+
return ClusterState.builder(clusterState).metaData(MetaData.builder(clusterState.metaData())
366+
.coordinationMetaData(newCoordinationMetaData).build()).build();
367+
}
368+
369+
@Override
370+
public void close() throws IOException {
371+
try {
372+
ThreadPool.terminate(threadPoolExecutor, 10, TimeUnit.SECONDS);
373+
} finally {
374+
persistedState.close();
375+
}
376+
}
377+
378+
boolean allPendingAsyncStatesWritten() {
379+
synchronized (mutex) {
380+
if (newCurrentTermQueued || newStateQueued) {
381+
return false;
382+
}
383+
return threadPoolExecutor.getActiveCount() == 0;
384+
}
385+
}
386+
}
387+
230388
/**
231389
* Encapsulates the incremental writing of metadata to a {@link PersistedClusterStateService.Writer}.
232390
*/

server/src/test/java/org/elasticsearch/cluster/coordination/UnsafeBootstrapAndDetachCommandIT.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.elasticsearch.env.Environment;
2929
import org.elasticsearch.env.NodeEnvironment;
3030
import org.elasticsearch.env.TestEnvironment;
31+
import org.elasticsearch.gateway.GatewayMetaState;
3132
import org.elasticsearch.gateway.PersistedClusterStateService;
3233
import org.elasticsearch.indices.IndicesService;
3334
import org.elasticsearch.node.Node;
@@ -259,6 +260,7 @@ public void test3MasterNodes2Failed() throws Exception {
259260
logger.info("--> stop 1st master-eligible node and data-only node");
260261
NodeEnvironment nodeEnvironment = internalCluster().getMasterNodeInstance(NodeEnvironment.class);
261262
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(masterNodes.get(0)));
263+
assertBusy(() -> internalCluster().getInstance(GatewayMetaState.class, dataNode).allPendingAsyncStatesWritten());
262264
internalCluster().stopRandomDataNode();
263265

264266
logger.info("--> unsafely-bootstrap 1st master-eligible node");
@@ -327,6 +329,7 @@ public void testAllMasterEligibleNodesFailedDanglingIndexImport() throws Excepti
327329

328330
logger.info("--> stop data-only node and detach it from the old cluster");
329331
Settings dataNodeDataPathSettings = internalCluster().dataPathSettings(dataNode);
332+
assertBusy(() -> internalCluster().getInstance(GatewayMetaState.class, dataNode).allPendingAsyncStatesWritten());
330333
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode));
331334
final Environment environment = TestEnvironment.newEnvironment(
332335
Settings.builder().put(internalCluster().getDefaultSettings()).put(dataNodeDataPathSettings).build());

server/src/test/java/org/elasticsearch/gateway/GatewayMetaStatePersistedStateTests.java

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,31 @@
3131
import org.elasticsearch.cluster.node.DiscoveryNode;
3232
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
3333
import org.elasticsearch.cluster.node.DiscoveryNodes;
34+
import org.elasticsearch.cluster.service.ClusterService;
35+
import org.elasticsearch.common.settings.ClusterSettings;
3436
import org.elasticsearch.common.settings.Settings;
3537
import org.elasticsearch.common.util.BigArrays;
3638
import org.elasticsearch.common.util.set.Sets;
3739
import org.elasticsearch.core.internal.io.IOUtils;
3840
import org.elasticsearch.env.Environment;
3941
import org.elasticsearch.env.NodeEnvironment;
4042
import org.elasticsearch.env.TestEnvironment;
43+
import org.elasticsearch.node.Node;
4144
import org.elasticsearch.test.ESTestCase;
45+
import org.elasticsearch.threadpool.TestThreadPool;
46+
import org.elasticsearch.threadpool.ThreadPool;
47+
import org.elasticsearch.transport.TransportService;
4248

4349
import java.io.IOException;
4450
import java.nio.file.Path;
4551
import java.util.Collections;
52+
import java.util.concurrent.TimeUnit;
4653

4754
import static org.hamcrest.Matchers.equalTo;
4855
import static org.hamcrest.Matchers.instanceOf;
4956
import static org.hamcrest.Matchers.not;
57+
import static org.mockito.Mockito.mock;
58+
import static org.mockito.Mockito.when;
5059

5160
public class GatewayMetaStatePersistedStateTests extends ESTestCase {
5261
private NodeEnvironment nodeEnvironment;
@@ -309,4 +318,95 @@ public void testStatePersistedOnLoad() throws IOException {
309318
}
310319
}
311320

321+
public void testDataOnlyNodePersistence() throws Exception {
322+
DiscoveryNode localNode = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Collections.emptyMap(),
323+
Sets.newHashSet(DiscoveryNodeRole.DATA_ROLE), Version.CURRENT);
324+
Settings settings = Settings.builder().put(ClusterName.CLUSTER_NAME_SETTING.getKey(), clusterName.value()).put(
325+
Node.NODE_MASTER_SETTING.getKey(), false).put(Node.NODE_NAME_SETTING.getKey(), "test").build();
326+
final MockGatewayMetaState gateway = new MockGatewayMetaState(localNode);
327+
final TransportService transportService = mock(TransportService.class);
328+
TestThreadPool threadPool = new TestThreadPool("testMarkAcceptedConfigAsCommittedOnDataOnlyNode");
329+
when(transportService.getThreadPool()).thenReturn(threadPool);
330+
ClusterService clusterService = mock(ClusterService.class);
331+
when(clusterService.getClusterSettings()).thenReturn(
332+
new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));
333+
final PersistedClusterStateService persistedClusterStateService =
334+
new PersistedClusterStateService(nodeEnvironment, xContentRegistry(), BigArrays.NON_RECYCLING_INSTANCE);
335+
gateway.start(settings, transportService, clusterService,
336+
new MetaStateService(nodeEnvironment, xContentRegistry()), null, null, persistedClusterStateService);
337+
final CoordinationState.PersistedState persistedState = gateway.getPersistedState();
338+
assertThat(persistedState, instanceOf(GatewayMetaState.AsyncLucenePersistedState.class));
339+
340+
//generate random coordinationMetaData with different lastAcceptedConfiguration and lastCommittedConfiguration
341+
CoordinationMetaData coordinationMetaData;
342+
do {
343+
coordinationMetaData = createCoordinationMetaData(randomNonNegativeLong());
344+
} while (coordinationMetaData.getLastAcceptedConfiguration().equals(coordinationMetaData.getLastCommittedConfiguration()));
345+
346+
ClusterState state = createClusterState(randomNonNegativeLong(),
347+
MetaData.builder().coordinationMetaData(coordinationMetaData)
348+
.clusterUUID(randomAlphaOfLength(10)).build());
349+
persistedState.setLastAcceptedState(state);
350+
assertBusy(() -> assertTrue(gateway.allPendingAsyncStatesWritten()));
351+
352+
assertThat(persistedState.getLastAcceptedState().getLastAcceptedConfiguration(),
353+
not(equalTo(persistedState.getLastAcceptedState().getLastCommittedConfiguration())));
354+
CoordinationMetaData persistedCoordinationMetaData =
355+
persistedClusterStateService.loadBestOnDiskState().metaData.coordinationMetaData();
356+
assertThat(persistedCoordinationMetaData.getLastAcceptedConfiguration(),
357+
equalTo(GatewayMetaState.AsyncLucenePersistedState.staleStateConfiguration));
358+
assertThat(persistedCoordinationMetaData.getLastCommittedConfiguration(),
359+
equalTo(GatewayMetaState.AsyncLucenePersistedState.staleStateConfiguration));
360+
361+
persistedState.markLastAcceptedStateAsCommitted();
362+
assertBusy(() -> assertTrue(gateway.allPendingAsyncStatesWritten()));
363+
364+
CoordinationMetaData expectedCoordinationMetaData = CoordinationMetaData.builder(coordinationMetaData)
365+
.lastCommittedConfiguration(coordinationMetaData.getLastAcceptedConfiguration()).build();
366+
ClusterState expectedClusterState =
367+
ClusterState.builder(state).metaData(MetaData.builder().coordinationMetaData(expectedCoordinationMetaData)
368+
.clusterUUID(state.metaData().clusterUUID()).clusterUUIDCommitted(true).build()).build();
369+
370+
assertClusterStateEqual(expectedClusterState, persistedState.getLastAcceptedState());
371+
persistedCoordinationMetaData = persistedClusterStateService.loadBestOnDiskState().metaData.coordinationMetaData();
372+
assertThat(persistedCoordinationMetaData.getLastAcceptedConfiguration(),
373+
equalTo(GatewayMetaState.AsyncLucenePersistedState.staleStateConfiguration));
374+
assertThat(persistedCoordinationMetaData.getLastCommittedConfiguration(),
375+
equalTo(GatewayMetaState.AsyncLucenePersistedState.staleStateConfiguration));
376+
assertTrue(persistedClusterStateService.loadBestOnDiskState().metaData.clusterUUIDCommitted());
377+
378+
// generate a series of updates and check if batching works
379+
final String indexName = randomAlphaOfLength(10);
380+
long currentTerm = state.term();
381+
for (int i = 0; i < 1000; i++) {
382+
if (rarely()) {
383+
// bump term
384+
currentTerm = currentTerm + (rarely() ? randomIntBetween(1, 5) : 0L);
385+
persistedState.setCurrentTerm(currentTerm);
386+
} else {
387+
// update cluster state
388+
final int numberOfShards = randomIntBetween(1, 5);
389+
final long term = Math.min(state.term() + (rarely() ? randomIntBetween(1, 5) : 0L), currentTerm);
390+
final IndexMetaData indexMetaData = createIndexMetaData(indexName, numberOfShards, i);
391+
state = createClusterState(state.version() + 1,
392+
MetaData.builder().coordinationMetaData(createCoordinationMetaData(term)).put(indexMetaData, false).build());
393+
persistedState.setLastAcceptedState(state);
394+
}
395+
}
396+
assertEquals(currentTerm, persistedState.getCurrentTerm());
397+
assertClusterStateEqual(state, persistedState.getLastAcceptedState());
398+
assertBusy(() -> assertTrue(gateway.allPendingAsyncStatesWritten()));
399+
400+
gateway.close();
401+
402+
try (CoordinationState.PersistedState reloadedPersistedState = newGatewayPersistedState()) {
403+
assertEquals(currentTerm, reloadedPersistedState.getCurrentTerm());
404+
assertClusterStateEqual(GatewayMetaState.AsyncLucenePersistedState.resetVotingConfiguration(state),
405+
reloadedPersistedState.getLastAcceptedState());
406+
assertNotNull(reloadedPersistedState.getLastAcceptedState().metaData().index(indexName));
407+
}
408+
409+
ThreadPool.terminate(threadPool, 10, TimeUnit.SECONDS);
410+
}
411+
312412
}

0 commit comments

Comments
 (0)