-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-4012] stop SparkContext when the exception is thrown from an infinite loop #5004
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6322959
6ad3eb0
6087864
3c72cd8
589276a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,7 +145,7 @@ private[spark] class TaskSchedulerImpl( | |
| import sc.env.actorSystem.dispatcher | ||
| sc.env.actorSystem.scheduler.schedule(SPECULATION_INTERVAL milliseconds, | ||
| SPECULATION_INTERVAL milliseconds) { | ||
| Utils.tryOrExit { checkSpeculatableTasks() } | ||
| Utils.tryOrStopSparkContext(sc) { checkSpeculatableTasks() } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for updating this one as well! |
||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import java.util.concurrent._ | |
| import java.util.concurrent.atomic.AtomicBoolean | ||
|
|
||
| import com.google.common.annotations.VisibleForTesting | ||
| import org.apache.spark.SparkContext | ||
|
|
||
| /** | ||
| * Asynchronously passes events to registered listeners. | ||
|
|
@@ -38,6 +39,8 @@ private[spark] abstract class AsynchronousListenerBus[L <: AnyRef, E](name: Stri | |
|
|
||
| self => | ||
|
|
||
| private var sparkContext: SparkContext = null | ||
|
|
||
| /* Cap the capacity of the event queue so we get an explicit error (rather than | ||
| * an OOM exception) if it's perpetually being added to more quickly than it's being drained. */ | ||
| private val EVENT_QUEUE_CAPACITY = 10000 | ||
|
|
@@ -57,7 +60,7 @@ private[spark] abstract class AsynchronousListenerBus[L <: AnyRef, E](name: Stri | |
|
|
||
| private val listenerThread = new Thread(name) { | ||
| setDaemon(true) | ||
| override def run(): Unit = Utils.logUncaughtExceptions { | ||
| override def run(): Unit = Utils.tryOrStopSparkContext(sparkContext) { | ||
| while (true) { | ||
| eventLock.acquire() | ||
| self.synchronized { | ||
|
|
@@ -89,9 +92,12 @@ private[spark] abstract class AsynchronousListenerBus[L <: AnyRef, E](name: Stri | |
| * This first sends out all buffered events posted before this listener bus has started, then | ||
| * listens for any additional events asynchronously while the listener bus is still running. | ||
| * This should only be called once. | ||
| * | ||
| * @param sc Used to stop the SparkContext in case the listener thread dies. | ||
| */ | ||
| def start() { | ||
| def start(sc: SparkContext) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add an |
||
| if (started.compareAndSet(false, true)) { | ||
| sparkContext = sc | ||
| listenerThread.start() | ||
| } else { | ||
| throw new IllegalStateException(s"$name already started!") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1145,6 +1145,8 @@ private[spark] object Utils extends Logging { | |
| /** | ||
| * Execute a block of code that evaluates to Unit, forwarding any uncaught exceptions to the | ||
| * default UncaughtExceptionHandler | ||
| * | ||
| * NOTE: This method is to be called by the spark-started JVM process. | ||
| */ | ||
| def tryOrExit(block: => Unit) { | ||
| try { | ||
|
|
@@ -1155,6 +1157,32 @@ private[spark] object Utils extends Logging { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Execute a block of code that evaluates to Unit, stop SparkContext if there is any uncaught | ||
| * exception | ||
| * | ||
| * NOTE: This method is to be called by the driver-side components to avoid stopping the | ||
| * user-started JVM process completely; in contrast, tryOrExit is to be called in the | ||
| * spark-started JVM process. | ||
| */ | ||
| def tryOrStopSparkContext(sc: SparkContext)(block: => Unit) { | ||
| try { | ||
| block | ||
| } catch { | ||
| case e: ControlThrowable => throw e | ||
| case t: Throwable => | ||
| val currentThreadName = Thread.currentThread().getName | ||
| if (sc != null) { | ||
| logError(s"uncaught error in thread $currentThreadName, stopping SparkContext", t) | ||
| sc.stop() | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about throwing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, @zsxwing thanks for the comments I personally prefer a more conservative way here (the current approach) Because the throwable thrown from here can be varying in terms of types, and I'm concerning that the Throwable from here, like So I prefer to letting the user call SparkContext.runJob to get a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if we catch NonFatal(e) and re-throw other Throwables? Basically saying that fatal errors should be re-thrown, but lesser ones can just stop here; they should only be application-level exceptions, which are our code's concern. |
||
| if (!NonFatal(t)) { | ||
| logError(s"throw uncaught fatal error in thread $currentThreadName", t) | ||
| throw t | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Execute a block of code that evaluates to Unit, re-throwing any non-fatal uncaught | ||
| * exceptions as IOException. This is used when implementing Externalizable and Serializable's | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure what the FsHistoryProvider is, what should the failure semantics be around it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FsHistoryProvider is the module used by history server process to render the application logs....
this getRunner is called for periodically check updated logs and clean logs...
the point here is if anything uncaught is thrown, like OOM, HistoryServer process should be restarted, since it has become non-functional...