Skip to content

Commit c32e4fb

Browse files
authored
[Zen2] Best-effort cluster formation if unconfigured (#36215)
In real deployments it is important that clusters are properly configured to avoid accidentally forming multiple independent clusters at cluster bootstrapping time. However, we also expect to be able to unpack Elasticsearch and start up one or more nodes without any up-front configuration, and have them do their best to find each other and form a cluster after a few seconds. This change adds a delayed automatic bootstrapping process to nodes that start up with no relevant settings set to support the desired out-of-the-box experience without compromising safety in properly-configured deployments.
1 parent 879397d commit c32e4fb

File tree

9 files changed

+211
-8
lines changed

9 files changed

+211
-8
lines changed

docs/reference/migration/migrate_7_0/cluster.asciidoc

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,13 @@ These shard preferences are removed in favour of the `_prefer_nodes` and `_only_
2424
Clusters now have soft limits on the total number of open shards in the cluster
2525
based on the number of nodes and the `cluster.max_shards_per_node` cluster
2626
setting, to prevent accidental operations that would destabilize the cluster.
27-
More information can be found in the <<misc-cluster,documentation for that setting>>.
27+
More information can be found in the <<misc-cluster,documentation for that setting>>.
28+
29+
[float]
30+
==== Discovery configuration is required in production
31+
Production deployments of Elasticsearch now require at least one of the following settings
32+
to be specified in the `elasticsearch.yml` configuration file:
33+
34+
- `discovery.zen.ping.unicast.hosts`
35+
- `discovery.zen.hosts_provider`
36+
- `cluster.initial_master_nodes`

docs/reference/setup/bootstrap-checks.asciidoc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,21 @@ versions of the HotSpot JVM.
236236
The all permission check ensures that the security policy used during bootstrap
237237
does not grant the `java.security.AllPermission` to Elasticsearch. Running with
238238
the all permission granted is equivalent to disabling the security manager.
239+
240+
=== Discovery configuration check
241+
242+
By default, when Elasticsearch first starts up it will try to discover other
243+
nodes running on the same host. If no elected master can be discovered within a
244+
few seconds then Elasticsearch will form a cluster that includes any other
245+
nodes that were discovered. It is useful to be able to form this cluster
246+
without any extra configuration in development mode, but this is unsuitable for
247+
production because it's possible to form multiple clusters and lose data as a
248+
result.
249+
250+
This bootstrap check ensures that discovery is not running with the default
251+
configuration. It can be satisfied by setting at least one of the following
252+
properties:
253+
254+
- `discovery.zen.ping.unicast.hosts`
255+
- `discovery.zen.hosts_provider`
256+
- `cluster.initial_master_nodes`

qa/unconfigured-node-name/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ apply plugin: 'elasticsearch.rest-test'
2222

2323
integTestCluster {
2424
setting 'node.name', null
25-
// TODO: Run this using zen2
25+
// TODO: Run this using zen2, with no discovery configuration at all, demonstrating that the node forms a cluster on its own without help
2626
setting 'discovery.type', 'zen'
2727
}
2828

server/src/main/java/org/elasticsearch/bootstrap/BootstrapChecks.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
import org.apache.logging.log4j.Logger;
2424
import org.apache.logging.log4j.message.ParameterizedMessage;
2525
import org.apache.lucene.util.Constants;
26+
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
2627
import org.elasticsearch.common.SuppressForbidden;
2728
import org.elasticsearch.common.io.PathUtils;
29+
import org.elasticsearch.common.settings.Setting;
2830
import org.elasticsearch.common.transport.BoundTransportAddress;
2931
import org.elasticsearch.common.transport.TransportAddress;
3032
import org.elasticsearch.discovery.DiscoveryModule;
@@ -46,6 +48,12 @@
4648
import java.util.function.Predicate;
4749
import java.util.regex.Matcher;
4850
import java.util.regex.Pattern;
51+
import java.util.stream.Collectors;
52+
import java.util.stream.Stream;
53+
54+
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
55+
import static org.elasticsearch.discovery.DiscoveryModule.DISCOVERY_HOSTS_PROVIDER_SETTING;
56+
import static org.elasticsearch.discovery.zen.SettingsBasedHostsProvider.DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING;
4957

5058
/**
5159
* We enforce bootstrap checks once a node has the transport protocol bound to a non-loopback interface or if the system property {@code
@@ -207,6 +215,7 @@ static List<BootstrapCheck> checks() {
207215
checks.add(new EarlyAccessCheck());
208216
checks.add(new G1GCCheck());
209217
checks.add(new AllPermissionCheck());
218+
checks.add(new DiscoveryConfiguredCheck());
210219
return Collections.unmodifiableList(checks);
211220
}
212221

@@ -713,4 +722,21 @@ boolean isAllPermissionGranted() {
713722

714723
}
715724

725+
static class DiscoveryConfiguredCheck implements BootstrapCheck {
726+
@Override
727+
public BootstrapCheckResult check(BootstrapContext context) {
728+
if (DiscoveryModule.ZEN2_DISCOVERY_TYPE.equals(DiscoveryModule.DISCOVERY_TYPE_SETTING.get(context.settings)) == false) {
729+
return BootstrapCheckResult.success();
730+
}
731+
if (ClusterBootstrapService.discoveryIsConfigured(context.settings)) {
732+
return BootstrapCheckResult.success();
733+
}
734+
735+
return BootstrapCheckResult.failure(String.format(
736+
Locale.ROOT,
737+
"the default discovery settings are unsuitable for production use; at least one of [%s] must be configured",
738+
Stream.of(DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING, DISCOVERY_HOSTS_PROVIDER_SETTING, INITIAL_MASTER_NODES_SETTING)
739+
.map(Setting::getKey).collect(Collectors.joining(", "))));
740+
}
741+
}
716742
}

server/src/main/java/org/elasticsearch/cluster/coordination/ClusterBootstrapService.java

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.elasticsearch.action.admin.cluster.bootstrap.GetDiscoveredNodesRequest;
3030
import org.elasticsearch.action.admin.cluster.bootstrap.GetDiscoveredNodesResponse;
3131
import org.elasticsearch.cluster.node.DiscoveryNode;
32+
import org.elasticsearch.common.Nullable;
3233
import org.elasticsearch.common.io.stream.StreamInput;
3334
import org.elasticsearch.common.settings.Setting;
3435
import org.elasticsearch.common.settings.Setting.Property;
@@ -44,6 +45,10 @@
4445
import java.util.Collections;
4546
import java.util.List;
4647
import java.util.function.Function;
48+
import java.util.stream.Stream;
49+
50+
import static org.elasticsearch.discovery.DiscoveryModule.DISCOVERY_HOSTS_PROVIDER_SETTING;
51+
import static org.elasticsearch.discovery.zen.SettingsBasedHostsProvider.DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING;
4752

4853
public class ClusterBootstrapService {
4954

@@ -57,22 +62,82 @@ public class ClusterBootstrapService {
5762
public static final Setting<List<String>> INITIAL_MASTER_NODES_SETTING =
5863
Setting.listSetting("cluster.initial_master_nodes", Collections.emptyList(), Function.identity(), Property.NodeScope);
5964

65+
public static final Setting<TimeValue> UNCONFIGURED_BOOTSTRAP_TIMEOUT_SETTING =
66+
Setting.timeSetting("discovery.unconfigured_bootstrap_timeout",
67+
TimeValue.timeValueSeconds(3), TimeValue.timeValueMillis(1), Property.NodeScope);
68+
6069
private final int initialMasterNodeCount;
6170
private final List<String> initialMasterNodes;
71+
@Nullable
72+
private final TimeValue unconfiguredBootstrapTimeout;
6273
private final TransportService transportService;
6374
private volatile boolean running;
6475

6576
public ClusterBootstrapService(Settings settings, TransportService transportService) {
6677
initialMasterNodeCount = INITIAL_MASTER_NODE_COUNT_SETTING.get(settings);
6778
initialMasterNodes = INITIAL_MASTER_NODES_SETTING.get(settings);
79+
unconfiguredBootstrapTimeout = discoveryIsConfigured(settings) ? null : UNCONFIGURED_BOOTSTRAP_TIMEOUT_SETTING.get(settings);
6880
this.transportService = transportService;
6981
}
7082

83+
public static boolean discoveryIsConfigured(Settings settings) {
84+
return Stream.of(DISCOVERY_HOSTS_PROVIDER_SETTING, DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING,
85+
INITIAL_MASTER_NODE_COUNT_SETTING, INITIAL_MASTER_NODES_SETTING).anyMatch(s -> s.exists(settings));
86+
}
87+
7188
public void start() {
7289
assert running == false;
7390
running = true;
7491

75-
if ((initialMasterNodeCount > 0 || initialMasterNodes.isEmpty() == false) && transportService.getLocalNode().isMasterNode()) {
92+
if (transportService.getLocalNode().isMasterNode() == false) {
93+
return;
94+
}
95+
96+
if (unconfiguredBootstrapTimeout != null) {
97+
logger.info("no discovery configuration found, will perform best-effort cluster bootstrapping after [{}] " +
98+
"unless existing master is discovered", unconfiguredBootstrapTimeout);
99+
final ThreadContext threadContext = transportService.getThreadPool().getThreadContext();
100+
try (ThreadContext.StoredContext ignore = threadContext.stashContext()) {
101+
threadContext.markAsSystemContext();
102+
103+
transportService.getThreadPool().scheduleUnlessShuttingDown(unconfiguredBootstrapTimeout, Names.SAME, new Runnable() {
104+
@Override
105+
public void run() {
106+
final GetDiscoveredNodesRequest request = new GetDiscoveredNodesRequest();
107+
logger.trace("sending {}", request);
108+
transportService.sendRequest(transportService.getLocalNode(), GetDiscoveredNodesAction.NAME, request,
109+
new TransportResponseHandler<GetDiscoveredNodesResponse>() {
110+
@Override
111+
public void handleResponse(GetDiscoveredNodesResponse response) {
112+
logger.debug("discovered {}, starting to bootstrap", response.getNodes());
113+
awaitBootstrap(response.getBootstrapConfiguration());
114+
}
115+
116+
@Override
117+
public void handleException(TransportException exp) {
118+
logger.warn("discovery attempt failed", exp);
119+
}
120+
121+
@Override
122+
public String executor() {
123+
return Names.SAME;
124+
}
125+
126+
@Override
127+
public GetDiscoveredNodesResponse read(StreamInput in) throws IOException {
128+
return new GetDiscoveredNodesResponse(in);
129+
}
130+
});
131+
}
132+
133+
@Override
134+
public String toString() {
135+
return "unconfigured-discovery delayed bootstrap";
136+
}
137+
});
138+
139+
}
140+
} else if (initialMasterNodeCount > 0 || initialMasterNodes.isEmpty() == false) {
76141
logger.debug("unsafely waiting for discovery of [{}] master-eligible nodes", initialMasterNodeCount);
77142

78143
final ThreadContext threadContext = transportService.getThreadPool().getThreadContext();
@@ -116,7 +181,6 @@ public GetDiscoveredNodesResponse read(StreamInput in) throws IOException {
116181
}
117182

118183
public void stop() {
119-
assert running == true;
120184
running = false;
121185
}
122186

server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,9 @@ public void startInitialJoin() {
501501
becomeCandidate("startInitialJoin");
502502
}
503503

504-
clusterBootstrapService.start();
504+
if (isInitialConfigurationSet() == false) {
505+
clusterBootstrapService.start();
506+
}
505507
}
506508

507509
@Override

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,7 @@ public void apply(Settings value, Settings current, Settings previous) {
473473
TransportAddVotingConfigExclusionsAction.MAXIMUM_VOTING_CONFIG_EXCLUSIONS_SETTING,
474474
ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING,
475475
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING,
476+
ClusterBootstrapService.UNCONFIGURED_BOOTSTRAP_TIMEOUT_SETTING,
476477
LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING
477478
)));
478479

server/src/test/java/org/elasticsearch/bootstrap/BootstrapChecksTests.java

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121

2222
import org.apache.logging.log4j.Logger;
2323
import org.apache.lucene.util.Constants;
24+
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
2425
import org.elasticsearch.cluster.metadata.MetaData;
26+
import org.elasticsearch.common.CheckedConsumer;
2527
import org.elasticsearch.common.settings.Settings;
2628
import org.elasticsearch.common.transport.BoundTransportAddress;
2729
import org.elasticsearch.common.transport.TransportAddress;
30+
import org.elasticsearch.discovery.DiscoveryModule;
31+
import org.elasticsearch.discovery.zen.SettingsBasedHostsProvider;
2832
import org.elasticsearch.monitor.jvm.JvmInfo;
2933
import org.elasticsearch.node.NodeValidationException;
3034
import org.elasticsearch.test.ESTestCase;
@@ -700,4 +704,34 @@ public boolean alwaysEnforce() {
700704
assertThat(alwaysEnforced, hasToString(containsString("error")));
701705
}
702706

707+
public void testDiscoveryConfiguredCheck() throws NodeValidationException {
708+
final List<BootstrapCheck> checks = Collections.singletonList(new BootstrapChecks.DiscoveryConfiguredCheck());
709+
710+
final BootstrapContext zen2Context = new BootstrapContext(Settings.builder()
711+
.put(DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(), ZEN2_DISCOVERY_TYPE).build(), MetaData.EMPTY_META_DATA);
712+
713+
// not always enforced
714+
BootstrapChecks.check(zen2Context, false, checks);
715+
716+
// not enforced for non-zen2 discovery
717+
BootstrapChecks.check(new BootstrapContext(Settings.builder().put(DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(),
718+
randomFrom(ZEN_DISCOVERY_TYPE, "single-node", randomAlphaOfLength(5))).build(), MetaData.EMPTY_META_DATA), true, checks);
719+
720+
final NodeValidationException e = expectThrows(NodeValidationException.class,
721+
() -> BootstrapChecks.check(zen2Context, true, checks));
722+
assertThat(e, hasToString(containsString("the default discovery settings are unsuitable for production use; at least one " +
723+
"of [discovery.zen.ping.unicast.hosts, discovery.zen.hosts_provider, cluster.initial_master_nodes] must be configured")));
724+
725+
CheckedConsumer<Settings.Builder, NodeValidationException> ensureChecksPass = b ->
726+
{
727+
final BootstrapContext context = new BootstrapContext(b
728+
.put(DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(), ZEN2_DISCOVERY_TYPE).build(), MetaData.EMPTY_META_DATA);
729+
BootstrapChecks.check(context, true, checks);
730+
};
731+
732+
ensureChecksPass.accept(Settings.builder().putList(DiscoveryModule.DISCOVERY_HOSTS_PROVIDER_SETTING.getKey()));
733+
ensureChecksPass.accept(Settings.builder().putList(SettingsBasedHostsProvider.DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING.getKey()));
734+
ensureChecksPass.accept(Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING.getKey(), 0));
735+
ensureChecksPass.accept(Settings.builder().putList(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey()));
736+
}
703737
}

server/src/test/java/org/elasticsearch/cluster/coordination/ClusterBootstrapServiceTests.java

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@
2323
import org.elasticsearch.action.admin.cluster.bootstrap.BootstrapClusterAction;
2424
import org.elasticsearch.action.admin.cluster.bootstrap.BootstrapClusterRequest;
2525
import org.elasticsearch.action.admin.cluster.bootstrap.BootstrapClusterResponse;
26+
import org.elasticsearch.action.admin.cluster.bootstrap.BootstrapConfiguration.NodeDescription;
2627
import org.elasticsearch.action.admin.cluster.bootstrap.GetDiscoveredNodesAction;
2728
import org.elasticsearch.action.admin.cluster.bootstrap.GetDiscoveredNodesRequest;
2829
import org.elasticsearch.action.admin.cluster.bootstrap.GetDiscoveredNodesResponse;
2930
import org.elasticsearch.cluster.node.DiscoveryNode;
3031
import org.elasticsearch.cluster.node.DiscoveryNode.Role;
3132
import org.elasticsearch.common.settings.Settings;
33+
import org.elasticsearch.common.settings.Settings.Builder;
3234
import org.elasticsearch.tasks.Task;
3335
import org.elasticsearch.test.ESTestCase;
3436
import org.elasticsearch.test.transport.MockTransport;
@@ -50,7 +52,11 @@
5052
import static java.util.Collections.singleton;
5153
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
5254
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING;
55+
import static org.elasticsearch.common.settings.Settings.builder;
56+
import static org.elasticsearch.discovery.DiscoveryModule.DISCOVERY_HOSTS_PROVIDER_SETTING;
57+
import static org.elasticsearch.discovery.zen.SettingsBasedHostsProvider.DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING;
5358
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
59+
import static org.hamcrest.Matchers.equalTo;
5460

5561
public class ClusterBootstrapServiceTests extends ESTestCase {
5662

@@ -65,7 +71,7 @@ public void createServices() {
6571
otherNode1 = newDiscoveryNode("other1");
6672
otherNode2 = newDiscoveryNode("other2");
6773

68-
deterministicTaskQueue = new DeterministicTaskQueue(Settings.builder().put(NODE_NAME_SETTING.getKey(), "node").build(), random());
74+
deterministicTaskQueue = new DeterministicTaskQueue(builder().put(NODE_NAME_SETTING.getKey(), "node").build(), random());
6975

7076
final MockTransport transport = new MockTransport() {
7177
@Override
@@ -77,6 +83,9 @@ protected void onSendRequest(long requestId, String action, TransportRequest req
7783
transportService = transport.createTransportService(Settings.EMPTY, deterministicTaskQueue.getThreadPool(),
7884
TransportService.NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> localNode, null, emptySet());
7985

86+
clusterBootstrapService = new ClusterBootstrapService(builder().put(INITIAL_MASTER_NODE_COUNT_SETTING.getKey(), 3).build(),
87+
transportService);
88+
8089
final Settings settings;
8190
if (randomBoolean()) {
8291
settings = Settings.builder().put(INITIAL_MASTER_NODE_COUNT_SETTING.getKey(), 3).build();
@@ -109,8 +118,24 @@ public void testDoesNothingOnNonMasterNodes() {
109118
deterministicTaskQueue.runAllTasks();
110119
}
111120

112-
public void testDoesNothingIfSettingIsUnset() {
113-
clusterBootstrapService = new ClusterBootstrapService(Settings.EMPTY, transportService);
121+
public void testDoesNothingByDefaultIfHostsProviderConfigured() {
122+
testConfiguredIfSettingSet(builder().putList(DISCOVERY_HOSTS_PROVIDER_SETTING.getKey()));
123+
}
124+
125+
public void testDoesNothingByDefaultIfUnicastHostsConfigured() {
126+
testConfiguredIfSettingSet(builder().putList(DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING.getKey()));
127+
}
128+
129+
public void testDoesNothingByDefaultIfMasterNodeCountConfigured() {
130+
testConfiguredIfSettingSet(builder().put(INITIAL_MASTER_NODE_COUNT_SETTING.getKey(), 0));
131+
}
132+
133+
public void testDoesNothingByDefaultIfMasterNodesConfigured() {
134+
testConfiguredIfSettingSet(builder().putList(INITIAL_MASTER_NODES_SETTING.getKey()));
135+
}
136+
137+
private void testConfiguredIfSettingSet(Builder builder) {
138+
clusterBootstrapService = new ClusterBootstrapService(builder.build(), transportService);
114139
transportService.registerRequestHandler(GetDiscoveredNodesAction.NAME, Names.SAME, GetDiscoveredNodesRequest::new,
115140
(request, channel, task) -> {
116141
throw new AssertionError("should not make a discovery request");
@@ -119,6 +144,30 @@ public void testDoesNothingIfSettingIsUnset() {
119144
deterministicTaskQueue.runAllTasks();
120145
}
121146

147+
public void testBootstrapsAutomaticallyWithDefaultConfiguration() {
148+
clusterBootstrapService = new ClusterBootstrapService(Settings.EMPTY, transportService);
149+
150+
final Set<DiscoveryNode> discoveredNodes = Stream.of(localNode, otherNode1, otherNode2).collect(Collectors.toSet());
151+
transportService.registerRequestHandler(GetDiscoveredNodesAction.NAME, Names.SAME, GetDiscoveredNodesRequest::new,
152+
(request, channel, task) -> channel.sendResponse(new GetDiscoveredNodesResponse(discoveredNodes)));
153+
154+
final AtomicBoolean bootstrapped = new AtomicBoolean();
155+
transportService.registerRequestHandler(BootstrapClusterAction.NAME, Names.SAME, BootstrapClusterRequest::new,
156+
(request, channel, task) -> {
157+
assertThat(request.getBootstrapConfiguration().getNodeDescriptions().stream()
158+
.map(NodeDescription::getId).collect(Collectors.toSet()),
159+
equalTo(discoveredNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toSet())));
160+
161+
channel.sendResponse(new BootstrapClusterResponse(randomBoolean()));
162+
assertTrue(bootstrapped.compareAndSet(false, true));
163+
});
164+
165+
startServices();
166+
deterministicTaskQueue.runAllTasks();
167+
168+
assertTrue(bootstrapped.get());
169+
}
170+
122171
public void testDoesNotRetryOnDiscoveryFailure() {
123172
transportService.registerRequestHandler(GetDiscoveredNodesAction.NAME, Names.SAME, GetDiscoveredNodesRequest::new,
124173
new TransportRequestHandler<GetDiscoveredNodesRequest>() {

0 commit comments

Comments
 (0)