@@ -1091,7 +1091,7 @@ class DAGScheduler(
10911091
10921092 // TODO: mark the executor as failed only if there were lots of fetch failures on it
10931093 if (bmAddress != null ) {
1094- handleExecutorLost(bmAddress.executorId, Some (task.epoch))
1094+ handleExecutorLost(bmAddress.executorId, fetchFailed = true , Some (task.epoch))
10951095 }
10961096
10971097 case ExceptionFailure (className, description, stackTrace, metrics) =>
@@ -1111,25 +1111,35 @@ class DAGScheduler(
11111111 * Responds to an executor being lost. This is called inside the event loop, so it assumes it can
11121112 * modify the scheduler's internal state. Use executorLost() to post a loss event from outside.
11131113 *
1114+ * We will also assume that we've lost all shuffle blocks associated with the executor if the
1115+ * executor serves its own blocks (i.e., the external shuffle service is not enabled) OR a
1116+ * FetchFailed occurred (i.e., the caller passed fetchFailed = true).
1117+ *
11141118 * Optionally the epoch during which the failure was caught can be passed to avoid allowing
11151119 * stray fetch failures from possibly retriggering the detection of a node as lost.
11161120 */
1117- private [scheduler] def handleExecutorLost (execId : String , maybeEpoch : Option [Long ] = None ) {
1121+ private [scheduler] def handleExecutorLost (
1122+ execId : String ,
1123+ fetchFailed : Boolean ,
1124+ maybeEpoch : Option [Long ] = None ) {
11181125 val currentEpoch = maybeEpoch.getOrElse(mapOutputTracker.getEpoch)
11191126 if (! failedEpoch.contains(execId) || failedEpoch(execId) < currentEpoch) {
11201127 failedEpoch(execId) = currentEpoch
11211128 logInfo(" Executor lost: %s (epoch %d)" .format(execId, currentEpoch))
11221129 blockManagerMaster.removeExecutor(execId)
1123- // TODO: This will be really slow if we keep accumulating shuffle map stages
1124- for ((shuffleId, stage) <- shuffleToMapStage) {
1125- stage.removeOutputsOnExecutor(execId)
1126- val locs = stage.outputLocs.map(list => if (list.isEmpty) null else list.head).toArray
1127- mapOutputTracker.registerMapOutputs(shuffleId, locs, changeEpoch = true )
1128- }
1129- if (shuffleToMapStage.isEmpty) {
1130- mapOutputTracker.incrementEpoch()
1130+
1131+ if (! env.blockManager.externalShuffleServiceEnabled || fetchFailed) {
1132+ // TODO: This will be really slow if we keep accumulating shuffle map stages
1133+ for ((shuffleId, stage) <- shuffleToMapStage) {
1134+ stage.removeOutputsOnExecutor(execId)
1135+ val locs = stage.outputLocs.map(list => if (list.isEmpty) null else list.head).toArray
1136+ mapOutputTracker.registerMapOutputs(shuffleId, locs, changeEpoch = true )
1137+ }
1138+ if (shuffleToMapStage.isEmpty) {
1139+ mapOutputTracker.incrementEpoch()
1140+ }
1141+ clearCacheLocs()
11311142 }
1132- clearCacheLocs()
11331143 } else {
11341144 logDebug(" Additional executor lost message for " + execId +
11351145 " (epoch " + currentEpoch + " )" )
@@ -1387,7 +1397,7 @@ private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGSchedule
13871397 dagScheduler.handleExecutorAdded(execId, host)
13881398
13891399 case ExecutorLost (execId) =>
1390- dagScheduler.handleExecutorLost(execId)
1400+ dagScheduler.handleExecutorLost(execId, fetchFailed = false )
13911401
13921402 case BeginEvent (task, taskInfo) =>
13931403 dagScheduler.handleBeginEvent(task, taskInfo)
0 commit comments