Skip to content

Commit 4fe98b1

Browse files
committed
Die with dignity
Today, when a thread encounters a fatal, unrecoverable error that threatens the stability of the JVM, Elasticsearch marches on. This includes out-of-memory errors, stack overflow errors, and other errors that leave the JVM in a questionable state. Instead, the Elasticsearch JVM should die when these errors are encountered. This commit makes that the case.
1 parent 4b0d317 commit 4fe98b1

File tree

10 files changed

+265
-5
lines changed

10 files changed

+265
-5
lines changed

core/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ dependencies {
5656
compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
5757
compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
5858

59-
compile 'org.elasticsearch:securesm:1.0'
59+
compile 'org.elasticsearch:securesm:1.1'
6060

6161
// utilities
6262
compile 'net.sf.jopt-simple:jopt-simple:4.9'

core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,12 @@ static void init(
247247
// fail if somebody replaced the lucene jars
248248
checkLucene();
249249

250+
// install the default uncaught exception handler; must be done before security is
251+
// initialized as we do not want to grant the runtime permission
252+
// setDefaultUncaughtExceptionHandler
253+
Thread.setDefaultUncaughtExceptionHandler(
254+
new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings)));
255+
250256
INSTANCE.setup(true, settings, environment);
251257

252258
INSTANCE.start();
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.bootstrap;
21+
22+
import org.apache.lucene.index.MergePolicy;
23+
import org.elasticsearch.common.SuppressForbidden;
24+
import org.elasticsearch.common.logging.ESLogger;
25+
import org.elasticsearch.common.logging.Loggers;
26+
27+
import java.io.IOError;
28+
import java.util.Objects;
29+
import java.util.function.Supplier;
30+
31+
class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {
32+
33+
private final Supplier<String> loggingPrefixSupplier;
34+
35+
ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) {
36+
this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier);
37+
}
38+
39+
@Override
40+
public void uncaughtException(Thread t, Throwable e) {
41+
if (isFatalUncaught(e)) {
42+
try {
43+
onFatalUncaught(t.getName(), e);
44+
} finally {
45+
// we use specific error codes in case the above notification failed, at least we
46+
// will have some indication of the error bringing us down
47+
if (e instanceof InternalError) {
48+
halt(128);
49+
} else if (e instanceof OutOfMemoryError) {
50+
halt(127);
51+
} else if (e instanceof StackOverflowError) {
52+
halt(126);
53+
} else if (e instanceof UnknownError) {
54+
halt(125);
55+
} else if (e instanceof IOError) {
56+
halt(124);
57+
} else {
58+
halt(1);
59+
}
60+
}
61+
} else {
62+
onNonFatalUncaught(t.getName(), e);
63+
}
64+
}
65+
66+
// visible for testing
67+
static boolean isFatalUncaught(Throwable e) {
68+
return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
69+
}
70+
71+
private static boolean isFatalCause(Throwable cause) {
72+
return cause instanceof Error;
73+
}
74+
75+
// visible for testing
76+
void onFatalUncaught(final String threadName, final Throwable t) {
77+
final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
78+
logger.error("fatal error in thread [{}], exiting", t, threadName);
79+
}
80+
81+
// visible for testing
82+
void onNonFatalUncaught(final String threadName, final Throwable t) {
83+
final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
84+
logger.warn("uncaught exception in thread [{}]", t, threadName);
85+
}
86+
87+
// visible for testing
88+
@SuppressForbidden(reason = "halt")
89+
void halt(int status) {
90+
// we halt to prevent shutdown hooks from running
91+
Runtime.getRuntime().halt(status);
92+
}
93+
94+
}

core/src/main/java/org/elasticsearch/bootstrap/Security.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ static void configure(Environment environment, boolean filterBadDefaults) throws
120120
Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
121121

122122
// enable security manager
123-
System.setSecurityManager(new SecureSM());
123+
System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." }));
124124

125125
// do some basic tests
126126
selfTest();

core/src/main/resources/org/elasticsearch/bootstrap/security.policy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
//// SecurityManager impl:
2525
//// Must have all permissions to properly perform access checks
2626

27-
grant codeBase "${codebase.securesm-1.0.jar}" {
27+
grant codeBase "${codebase.securesm-1.1.jar}" {
2828
permission java.security.AllPermission;
2929
};
3030

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.bootstrap;
21+
22+
import org.apache.lucene.index.MergePolicy;
23+
import org.elasticsearch.test.ESTestCase;
24+
import org.junit.Before;
25+
26+
import java.io.IOError;
27+
import java.io.IOException;
28+
import java.io.UncheckedIOException;
29+
import java.util.Collections;
30+
import java.util.HashMap;
31+
import java.util.Map;
32+
import java.util.concurrent.atomic.AtomicBoolean;
33+
import java.util.concurrent.atomic.AtomicInteger;
34+
import java.util.concurrent.atomic.AtomicReference;
35+
36+
import static org.hamcrest.CoreMatchers.equalTo;
37+
38+
public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase {
39+
40+
private Map<Class<? extends Error>, Integer> expectedStatus;
41+
42+
@Before
43+
public void setUp() throws Exception {
44+
super.setUp();
45+
Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>();
46+
expectedStatus.put(InternalError.class, 128);
47+
expectedStatus.put(OutOfMemoryError.class, 127);
48+
expectedStatus.put(StackOverflowError.class, 126);
49+
expectedStatus.put(UnknownError.class, 125);
50+
expectedStatus.put(IOError.class, 124);
51+
this.expectedStatus = Collections.unmodifiableMap(expectedStatus);
52+
}
53+
54+
public void testUncaughtError() throws InterruptedException {
55+
final Error error = randomFrom(
56+
new InternalError(),
57+
new OutOfMemoryError(),
58+
new StackOverflowError(),
59+
new UnknownError(),
60+
new IOError(new IOException("fatal")),
61+
new Error() {});
62+
final Thread thread = new Thread(() -> { throw error; });
63+
final String name = randomAsciiOfLength(10);
64+
thread.setName(name);
65+
final AtomicBoolean halt = new AtomicBoolean();
66+
final AtomicInteger observedStatus = new AtomicInteger();
67+
final AtomicReference<String> threadNameReference = new AtomicReference<>();
68+
final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
69+
thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") {
70+
71+
@Override
72+
void halt(int status) {
73+
halt.set(true);
74+
observedStatus.set(status);
75+
}
76+
77+
@Override
78+
void onFatalUncaught(String threadName, Throwable t) {
79+
threadNameReference.set(threadName);
80+
throwableReference.set(t);
81+
}
82+
83+
@Override
84+
void onNonFatalUncaught(String threadName, Throwable t) {
85+
fail();
86+
}
87+
88+
});
89+
thread.start();
90+
thread.join();
91+
assertTrue(halt.get());
92+
final int status;
93+
if (expectedStatus.containsKey(error.getClass())) {
94+
status = expectedStatus.get(error.getClass());
95+
} else {
96+
status = 1;
97+
}
98+
assertThat(observedStatus.get(), equalTo(status));
99+
assertThat(threadNameReference.get(), equalTo(name));
100+
assertThat(throwableReference.get(), equalTo(error));
101+
}
102+
103+
public void testUncaughtException() throws InterruptedException {
104+
final RuntimeException e = new RuntimeException("boom");
105+
final Thread thread = new Thread(() -> { throw e; });
106+
final String name = randomAsciiOfLength(10);
107+
thread.setName(name);
108+
final AtomicReference<String> threadNameReference = new AtomicReference<>();
109+
final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
110+
thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") {
111+
@Override
112+
void halt(int status) {
113+
fail();
114+
}
115+
116+
@Override
117+
void onFatalUncaught(String threadName, Throwable t) {
118+
fail();
119+
}
120+
121+
@Override
122+
void onNonFatalUncaught(String threadName, Throwable t) {
123+
threadNameReference.set(threadName);
124+
throwableReference.set(t);
125+
}
126+
});
127+
thread.start();
128+
thread.join();
129+
assertThat(threadNameReference.get(), equalTo(name));
130+
assertThat(throwableReference.get(), equalTo(e));
131+
}
132+
133+
public void testIsFatalCause() {
134+
assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
135+
assertFatal(new OutOfMemoryError());
136+
assertFatal(new StackOverflowError());
137+
assertFatal(new InternalError());
138+
assertFatal(new UnknownError());
139+
assertFatal(new IOError(new IOException()));
140+
assertNonFatal(new RuntimeException());
141+
assertNonFatal(new UncheckedIOException(new IOException()));
142+
}
143+
144+
private void assertFatal(Throwable cause) {
145+
assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
146+
}
147+
148+
private void assertNonFatal(Throwable cause) {
149+
assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
150+
}
151+
152+
}

distribution/licenses/securesm-1.0.jar.sha1

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1e423447d020041534be94c0f31a49fbdc1f2950

docs/reference/migration/migrate_5_0/packaging.asciidoc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,11 @@ from Elasticsearch.
5555
Additionally, it was previously possible to set any setting in
5656
Elasticsearch via JVM system properties. This has been removed from
5757
Elasticsearch.
58+
59+
==== Dying on fatal errors
60+
61+
Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal
62+
errors were encountered during the life of the Elasticsearch instance. Because such errors leave
63+
the JVM in a questionable state, the best course of action is to halt the JVM when this occurs.
64+
Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their
65+
Elasticsearch services so that they respawn automatically in the case of such a fatal crash.

test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ public boolean implies(ProtectionDomain domain, Permission permission) {
150150
return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
151151
}
152152
});
153-
System.setSecurityManager(new SecureSM(true));
153+
System.setSecurityManager(SecureSM.createTestSecureSM());
154154
Security.selfTest();
155155

156156
// guarantee plugin classes are initialized first, in case they have one-time hacks.

0 commit comments

Comments
 (0)