Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ public void run() {
node = new Node(environment) {
@Override
protected void validateNodeBeforeAcceptingRequests(Settings settings, BoundTransportAddress boundTransportAddress) {
BootstrapCheck.check(settings, boundTransportAddress);
BootstrapCheck.check(settings, getNodeEnvironment().getNodeLockId(), boundTransportAddress);
}
};
}
Expand Down
42 changes: 39 additions & 3 deletions core/src/main/java/org/elasticsearch/bootstrap/BootstrapCheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.common.transport.BoundTransportAddress;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.monitor.jvm.JvmInfo;
import org.elasticsearch.monitor.process.ProcessProbe;
import org.elasticsearch.node.Node;
Expand Down Expand Up @@ -59,13 +60,14 @@ private BootstrapCheck() {
* checks
*
* @param settings the current node settings
* @param dataDirectoryIndex the index of the directories who's locks under the data directories we were able to take
* @param boundTransportAddress the node network bindings
*/
static void check(final Settings settings, final BoundTransportAddress boundTransportAddress) {
static void check(final Settings settings, final int dataDirectoryIndex, final BoundTransportAddress boundTransportAddress) {
check(
enforceLimits(boundTransportAddress),
BootstrapSettings.IGNORE_SYSTEM_BOOTSTRAP_CHECKS.get(settings),
checks(settings),
checks(settings, dataDirectoryIndex),
Node.NODE_NAME_SETTING.get(settings));
}

Expand Down Expand Up @@ -153,7 +155,7 @@ static boolean enforceLimits(BoundTransportAddress boundTransportAddress) {
}

// the list of checks to execute
static List<Check> checks(final Settings settings) {
static List<Check> checks(final Settings settings, final int dataDirectoryIndex) {
final List<Check> checks = new ArrayList<>();
checks.add(new HeapSizeCheck());
final FileDescriptorCheck fileDescriptorCheck
Expand All @@ -173,6 +175,12 @@ static List<Check> checks(final Settings settings) {
checks.add(new ClientJvmCheck());
checks.add(new OnErrorCheck());
checks.add(new OnOutOfMemoryErrorCheck());
/*
* true in the line below means that we always default to the number of nodes for production configurations. This means that the
* check will produce consistent results even if Elasticsearch isn't being run in production mode. We rely on the production mode
* detection that BootstrapCheck already does decide whether or not to throw an error.
*/
checks.add(new DataDirectoryLockNumberCheck(dataDirectoryIndex, NodeEnvironment.getMaxLocalStorageNodes(settings, true)));
return Collections.unmodifiableList(checks);
}

Expand Down Expand Up @@ -592,4 +600,32 @@ public String errorMessage() {

}

/**
* Checks that the data directory that the node is using is within the production limit defined by
* {@link NodeEnvironment#MAX_LOCAL_STORAGE_NODES_SETTING}.
*/
static class DataDirectoryLockNumberCheck implements BootstrapCheck.Check {
private final int dataDirectoryIndex;
private final int maxLocalStorageNodes;

DataDirectoryLockNumberCheck(int dataDirectoryIndex, int maxLocalStorageNodes) {
this.dataDirectoryIndex = dataDirectoryIndex;
this.maxLocalStorageNodes = maxLocalStorageNodes;
}

@Override
public boolean check() {
return dataDirectoryIndex < maxLocalStorageNodes;
}

@Override
public String errorMessage() {
return "there are already [" + dataDirectoryIndex + "] Elasticsearch processes running with this data directory";
}

@Override
public boolean isSystemCheck() {
return false;
}
}
}
71 changes: 54 additions & 17 deletions core/src/main/java/org/elasticsearch/env/NodeEnvironment.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,6 @@
* A component that holds all data paths for a single node.
*/
public final class NodeEnvironment implements Closeable {

private final ESLogger logger;

public static class NodePath {
/* ${data.paths}/nodes/{node.id} */
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved these down below the constants. I thought I was going to add more members so I wanted to fix the order. I didn't end up adding more members but I still like having the members in a sensible order.

public final Path path;
Expand Down Expand Up @@ -138,21 +135,20 @@ public String toString() {
}
}

private final NodePath[] nodePaths;
private final Path sharedDataPath;
private final Lock[] locks;

private final int nodeLockId;
private final AtomicBoolean closed = new AtomicBoolean(false);
private final Map<ShardId, InternalShardLock> shardLocks = new HashMap<>();

private final NodeMetaData nodeMetaData;

/**
* Maximum number of data nodes that should run in an environment.
* Maximum number of data nodes that can run in an environment. {@code 0} is a special value for this setting meaning "use the default
* based on the number of allowed local storage nodes".
*/
public static final Setting<Integer> MAX_LOCAL_STORAGE_NODES_SETTING = Setting.intSetting("node.max_local_storage_nodes", 0, 0,
Property.NodeScope);
/**
* Default for {@link MAX_LOCAL_STORAGE_NODES_SETTING} to use when not in production mode.
*/
public static final Setting<Integer> MAX_LOCAL_STORAGE_NODES_SETTING = Setting.intSetting("node.max_local_storage_nodes", 50, 1,
Property.NodeScope);
public static final int MAX_LOCAL_STORAGE_NODES_NON_PRODUCTION_DEFAULT = 50;
/**
* Default for {@link MAX_LOCAL_STORAGE_NODES_SETTING} to use when in production mode.
*/
public static final int MAX_LOCAL_STORAGE_NODES_PRODUCTION_DEFAULT = 1;

/**
* If true automatically append node lock id to custom data paths.
Expand All @@ -179,6 +175,21 @@ public String toString() {
public static final String INDICES_FOLDER = "indices";
public static final String NODE_LOCK_FILENAME = "node.lock";

private final ESLogger logger;
private final NodePath[] nodePaths;
private final Path sharedDataPath;
private final Lock[] locks;

/**
* The lock number we were able to take. This is <strong>usually</strong> the number of Elasticsearch nodes sharing data directories
* with this node.
*/
private final int nodeLockId;
private final AtomicBoolean closed = new AtomicBoolean(false);
private final Map<ShardId, InternalShardLock> shardLocks = new HashMap<>();

private final NodeMetaData nodeMetaData;

public NodeEnvironment(Settings settings, Environment environment) throws IOException {

if (!DiscoveryNode.nodeRequiresLocalStorage(settings)) {
Expand All @@ -201,7 +212,11 @@ public NodeEnvironment(Settings settings, Environment environment) throws IOExce
sharedDataPath = environment.sharedDataFile();
int nodeLockId = -1;
IOException lastException = null;
int maxLocalStorageNodes = MAX_LOCAL_STORAGE_NODES_SETTING.get(settings);
/*
* Try and acquire a lock as though we were in non-production mode regardless of what we will bind. If we find out later we are
* in production mode we'll fail to startup fully and if we aren't in production mode we'll log a nice little warning.
*/
int maxLocalStorageNodes = getMaxLocalStorageNodes(settings, false);
for (int possibleLockId = 0; possibleLockId < maxLocalStorageNodes; possibleLockId++) {
for (int dirIndex = 0; dirIndex < environment.dataFiles().length; dirIndex++) {
Path dataDirWithClusterName = environment.dataWithClusterFiles()[dirIndex];
Expand Down Expand Up @@ -419,6 +434,28 @@ private static String toString(Collection<String> items) {
return b.toString();
}

/**
* The lock number we were able to take. This is <strong>usually</strong> the number of Elasticsearch nodes sharing data directories
* with this node.
*/
public int getNodeLockId() {
return nodeLockId;
}

/**
* Get the maximum number of local storage nodes allowed.
* @param settings the settings object from which to read the settings
* @param productionMode whether or not we are running in production mode
*/
public static int getMaxLocalStorageNodes(Settings settings, boolean productionMode) {
int maxLocalStorageNodes = MAX_LOCAL_STORAGE_NODES_SETTING.get(settings);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's very confusing to shadow the class variable maxLocalStorageNodes, maybe this can be named differently?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also opened #19752 for future work on this

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there is a class variable named that though?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, I misread this:

screenshot from 2016-08-02 13-54-16

as being a class variable instead of another local variable, so ignore me!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The confusing part is that I had it as a class variable at some point when I was working on it!

if (maxLocalStorageNodes == 0) {
maxLocalStorageNodes = productionMode ? MAX_LOCAL_STORAGE_NODES_PRODUCTION_DEFAULT
: MAX_LOCAL_STORAGE_NODES_NON_PRODUCTION_DEFAULT;
}
return maxLocalStorageNodes;
}

/**
* Deletes a shard data directory iff the shards locks were successfully acquired.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@
package org.elasticsearch.bootstrap;

import org.apache.lucene.util.Constants;
import org.elasticsearch.bootstrap.BootstrapCheck.DataDirectoryLockNumberCheck;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.BoundTransportAddress;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.test.ESTestCase;

import java.util.ArrayList;
Expand Down Expand Up @@ -63,7 +65,7 @@ public void testNonProductionMode() {
BoundTransportAddress boundTransportAddress = mock(BoundTransportAddress.class);
when(boundTransportAddress.boundAddresses()).thenReturn(transportAddresses.toArray(new TransportAddress[0]));
when(boundTransportAddress.publishAddress()).thenReturn(publishAddress);
BootstrapCheck.check(Settings.EMPTY, boundTransportAddress);
BootstrapCheck.check(Settings.EMPTY, between(0, Integer.MAX_VALUE), boundTransportAddress);
}

public void testNoLogMessageInNonProductionMode() {
Expand Down Expand Up @@ -389,7 +391,7 @@ public void testMinMasterNodes() {
boolean isSet = randomBoolean();
BootstrapCheck.Check check = new BootstrapCheck.MinMasterNodesCheck(isSet);
assertThat(check.check(), not(equalTo(isSet)));
List<BootstrapCheck.Check> defaultChecks = BootstrapCheck.checks(Settings.EMPTY);
List<BootstrapCheck.Check> defaultChecks = BootstrapCheck.checks(Settings.EMPTY, 0);

expectThrows(RuntimeException.class, () -> BootstrapCheck.check(true, false, defaultChecks, "testMinMasterNodes"));
}
Expand Down Expand Up @@ -604,4 +606,21 @@ public boolean alwaysEnforce() {
assertThat(alwaysEnforced, hasToString(containsString("error")));
}

public void testDataDirectoryNumberCheck() {
// In production mode this defaults to 1:
DataDirectoryLockNumberCheck check = new DataDirectoryLockNumberCheck(0, 1);
assertTrue(check.check());
check = new DataDirectoryLockNumberCheck(1, 1);
assertFalse(check.check());
assertEquals("there are already [1] Elasticsearch processes running with this data directory", check.errorMessage());

// But in development mode it defaults to a whopping 50
check = new DataDirectoryLockNumberCheck(0, 50);
assertTrue(check.check());
check = new DataDirectoryLockNumberCheck(49, 50);
assertTrue(check.check());
check = new DataDirectoryLockNumberCheck(50, 50);
assertFalse(check.check());
assertEquals("there are already [50] Elasticsearch processes running with this data directory", check.errorMessage());
}
}
29 changes: 17 additions & 12 deletions core/src/test/java/org/elasticsearch/env/NodeEnvironmentTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,25 @@
public class NodeEnvironmentTests extends ESTestCase {
private final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("foo", Settings.EMPTY);

public void testNodeLockSillySettings() {
try {
NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.get(Settings.builder()
.put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), between(Integer.MIN_VALUE, 0)).build());
fail("expected failure");
} catch (IllegalArgumentException e) {
assertThat(e.getMessage(), containsString("must be >= 1"));
public void testLocalStorageNodesSetting() {
{
final Settings settings = Settings.builder()
.put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), between(Integer.MIN_VALUE, 0)).build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> NodeEnvironment.getMaxLocalStorageNodes(settings, randomBoolean()));
assertThat(e.getMessage(), containsString("must be >= 0"));
}
{
int value = between(1, Integer.MAX_VALUE); // MAXINT is silly but ok.
Settings settings = Settings.builder().put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), value).build();
assertEquals(value,NodeEnvironment.getMaxLocalStorageNodes(settings, randomBoolean()));
}

// Even though its silly MAXINT nodes is a-ok!
int value = between(1, Integer.MAX_VALUE);
int max = NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.get(
Settings.builder().put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), value).build());
assertEquals(value, max);
// Defaults for production and non-production mode
assertEquals(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_NON_PRODUCTION_DEFAULT,
NodeEnvironment.getMaxLocalStorageNodes(Settings.EMPTY, false));
assertEquals(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_PRODUCTION_DEFAULT,
NodeEnvironment.getMaxLocalStorageNodes(Settings.EMPTY, true));
}

public void testNodeLockSingleEnvironment() throws IOException {
Expand Down
11 changes: 10 additions & 1 deletion docs/reference/migration/migrate_5_0/settings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ scope (e.g. `_local_,_site_` for all loopback and private network addresses)
or by explicit interface names, hostnames, or addresses.

The `netty.epollBugWorkaround` settings is removed. This settings allow people to enable
a netty work around for https://github.com/netty/netty/issues/327[a high CPU usage issue] with early JVM versions.
a netty work around for https://github.com/netty/netty/issues/327[a high CPU usage issue] with early JVM versions.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Atom ate this trailing space. I regret nothing!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

This bug was http://bugs.java.com/view_bug.do?bug_id=6403933[fixed in Java 7]. Since Elasticsearch 5.0 requires Java 8 the settings is removed. Note that if the workaround needs to be reintroduced you can still set the `org.jboss.netty.epollBugWorkaround` system property to control Netty directly.

==== Forbid changing of thread pool types
Expand Down Expand Up @@ -259,6 +259,15 @@ a fallback realtime setting for the get and mget APIs when realtime
wasn't specified. Now if the parameter isn't specified we always
default to true.

==== `node.max_local_storage_nodes` defaults to 1 in production mode

If you don't explicitly set `node.max_local_storage_nodes` and you start
multiple Elasticsearch processes in the same source directories then
Elasticsearch will log warning if it hasn't bound to any non-local addresses.
If it *has* bound to any non-local addresses then it'll refuse to start,
with the message
`there are already [1] Elasticsearch processes running with this data directory`.

=== Script settings

==== Indexed script settings
Expand Down
15 changes: 10 additions & 5 deletions docs/reference/modules/node.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ cluster health to have a stable master node.
Any master-eligible node (all nodes by default) may be elected to become the
master node by the <<modules-discovery-zen,master election process>>.

IMPORTANT: Master nodes must have access to the `data/` directory (just like
IMPORTANT: Master nodes must have access to the `data/` directory (just like
`data` nodes) as this is where the cluster state is persisted between node restarts.

Indexing and searching your data is CPU-, memory-, and I/O-intensive work
Expand Down Expand Up @@ -284,14 +284,19 @@ your data! The RPM and Debian distributions do this for you already.
The <<data-path,data path>> can be shared by multiple nodes, even by nodes
from different clusters. This is very useful for testing failover and
different configurations on your development machine. In production, however,
it is recommended to run only one node of Elasticsearch per server.
it is recommended to run only one node of Elasticsearch per server. If you
don't explicitly set `node.max_local_storage_nodes` then Elasticsearch will
log a warning if multiple nodes share the same data directory and it hasn't
bound to any non-local addresses. If it has bound to any non-local addresses
then it'll refuse to start.

To prevent more than one node from sharing the same data path, add this
setting to the `elasticsearch.yml` config file:
You can always override the setting to *allow* more than one node to share the
same data directory in production mode by setting it in the `elasticsearch.yml`
config file:

[source,yaml]
------------------------------
node.max_local_storage_nodes: 1
node.max_local_storage_nodes: 2
------------------------------

WARNING: Never run different node types (i.e. master, data) from the
Expand Down
30 changes: 0 additions & 30 deletions docs/reference/setup/important-settings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ configured before going into production.
* <<network.host,`network.host`>>
* <<unicast.hosts,`discovery.zen.ping.unicast.hosts`>>
* <<minimum_master_nodes,`discovery.zen.minimum_master_nodes`>>
* <<node.max_local_storage_nodes,`node.max_local_storage_nodes`>>

[float]
[[path-settings]]
Expand Down Expand Up @@ -185,32 +184,3 @@ discovery.zen.minimum_master_nodes: 2
IMPORTANT: If `discovery.zen.minimum_master_nodes` is not set when
Elasticsearch is running in <<dev-vs-prod,production mode>>, an exception will
be thrown which will prevent the node from starting.

[float]
[[node.max_local_storage_nodes]]
=== `node.max_local_storage_nodes`

It is possible to start more than one node on the same server from the same
`$ES_HOME`, just by doing the following:

[source,sh]
--------------------------------------------------
./bin/elasticsearch -d
./bin/elasticsearch -d
--------------------------------------------------

This works just fine: the data directory structure is designed to let multiple
nodes coexist. However, a single instance of Elasticsearch is able to use all
of the resources of a single server and it seldom makes sense to run multiple
nodes on the same server in production.

It is, however, possible to start more than one node on the same server by
mistake and to be completely unaware that this problem exists. To prevent more
than one node from sharing the same data directory, it is advisable to add the
following setting:

[source,yaml]
--------------------------------------------------
node.max_local_storage_nodes: 1
--------------------------------------------------