
Commit c08bc37

adrian-wang authored and cloud-fan committed
[SPARK-29177][CORE] fix zombie tasks after stage abort
### What changes were proposed in this pull request?

Do task handling even when the task result exceeds the configured maxResultSize. More details are in the JIRA description: https://issues.apache.org/jira/browse/SPARK-29177

### Why are the changes needed?

Without this patch, zombie tasks prevent YARN from recycling the containers running them, which affects other applications.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Unit test, plus a production test with a very large `SELECT` in the Spark Thrift Server.

Closes #25850 from adrian-wang/zombie.

Authored-by: Daoyuan Wang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 655356e commit c08bc37
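
The fix concerns results that trip `spark.driver.maxResultSize`. As a minimal sketch of the condition being handled: the snippet below collects more result bytes than a deliberately tiny cap allows. The object name, master, and sizes are illustrative; only `spark.driver.maxResultSize` is a real Spark configuration key. Before this patch the aborted stage could leave its still-running tasks as zombies; with it they are reported back as killed.

import org.apache.spark.{SparkConf, SparkContext, SparkException}

object MaxResultSizeDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("max-result-size-demo")
      .set("spark.driver.maxResultSize", "1m") // deliberately tiny cap

    val sc = new SparkContext(conf)
    try {
      // Each of the 8 partitions returns ~1 MiB, so the accumulated result
      // size crosses the 1 MiB cap on the driver side.
      val oversized = sc.parallelize(1 to 8, 8)
        .map(_ => Array.fill(1024 * 1024)(0.toByte))

      // collect() fails once the total fetched result size exceeds the cap;
      // this patch additionally kills the tasks that are still running.
      oversized.collect()
    } catch {
      case e: SparkException =>
        println(s"Job failed as expected: ${e.getMessage}")
    } finally {
      sc.stop()
    }
  }
}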

File tree

2 files changed, +42 -0 lines changed


core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala

Lines changed: 6 additions & 0 deletions
@@ -64,6 +64,9 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul
         val (result, size) = serializer.get().deserialize[TaskResult[_]](serializedData) match {
           case directResult: DirectTaskResult[_] =>
             if (!taskSetManager.canFetchMoreResults(serializedData.limit())) {
+              // kill the task so that it will not become zombie task
+              scheduler.handleFailedTask(taskSetManager, tid, TaskState.KILLED, TaskKilled(
+                "Tasks result size has exceeded maxResultSize"))
               return
             }
             // deserialize "value" without holding any lock so that it won't block other threads.
@@ -75,6 +78,9 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul
             if (!taskSetManager.canFetchMoreResults(size)) {
               // dropped by executor if size is larger than maxResultSize
               sparkEnv.blockManager.master.removeBlock(blockId)
+              // kill the task so that it will not become zombie task
+              scheduler.handleFailedTask(taskSetManager, tid, TaskState.KILLED, TaskKilled(
+                "Tasks result size has exceeded maxResultSize"))
               return
             }
             logDebug("Fetching indirect task result for TID %s".format(tid))

core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala

Lines changed: 36 additions & 0 deletions
@@ -33,6 +33,7 @@ import org.scalatest.BeforeAndAfter
 import org.scalatest.concurrent.Eventually._
 
 import org.apache.spark._
+import org.apache.spark.TaskState.TaskState
 import org.apache.spark.TestUtils.JavaSourceFromString
 import org.apache.spark.internal.config.Network.RPC_MESSAGE_MAX_SIZE
 import org.apache.spark.storage.TaskResultBlockId
@@ -78,6 +79,16 @@ private class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: Task
   }
 }
 
+private class DummyTaskSchedulerImpl(sc: SparkContext)
+  extends TaskSchedulerImpl(sc, 1, true) {
+  override def handleFailedTask(
+      taskSetManager: TaskSetManager,
+      tid: Long,
+      taskState: TaskState,
+      reason: TaskFailedReason): Unit = {
+    // do nothing
+  }
+}
 
 /**
  * A [[TaskResultGetter]] that stores the [[DirectTaskResult]]s it receives from executors
@@ -130,6 +141,31 @@ class TaskResultGetterSuite extends SparkFunSuite with BeforeAndAfter with Local
       "Expect result to be removed from the block manager.")
   }
 
+  test("handling total size of results larger than maxResultSize") {
+    sc = new SparkContext("local", "test", conf)
+    val scheduler = new DummyTaskSchedulerImpl(sc)
+    val spyScheduler = spy(scheduler)
+    val resultGetter = new TaskResultGetter(sc.env, spyScheduler)
+    scheduler.taskResultGetter = resultGetter
+    val myTsm = new TaskSetManager(spyScheduler, FakeTask.createTaskSet(2), 1) {
+      // always returns false
+      override def canFetchMoreResults(size: Long): Boolean = false
+    }
+    val indirectTaskResult = IndirectTaskResult(TaskResultBlockId(0), 0)
+    val directTaskResult = new DirectTaskResult(ByteBuffer.allocate(0), Nil, Array())
+    val ser = sc.env.closureSerializer.newInstance()
+    val serializedIndirect = ser.serialize(indirectTaskResult)
+    val serializedDirect = ser.serialize(directTaskResult)
+    resultGetter.enqueueSuccessfulTask(myTsm, 0, serializedDirect)
+    resultGetter.enqueueSuccessfulTask(myTsm, 1, serializedIndirect)
+    eventually(timeout(1.second)) {
+      verify(spyScheduler, times(1)).handleFailedTask(
+        myTsm, 0, TaskState.KILLED, TaskKilled("Tasks result size has exceeded maxResultSize"))
+      verify(spyScheduler, times(1)).handleFailedTask(
+        myTsm, 1, TaskState.KILLED, TaskKilled("Tasks result size has exceeded maxResultSize"))
+    }
+  }
+
   test("task retried if result missing from block manager") {
     // Set the maximum number of task failures to > 0, so that the task set isn't aborted
     // after the result is missing.
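
A note on the test's design: `enqueueSuccessfulTask` hands deserialization off to a background thread pool, so the assertions sit inside ScalaTest's `eventually` block rather than running inline. The Mockito `spy` wraps the real scheduler and records invocations, while `DummyTaskSchedulerImpl` overrides `handleFailedTask` as a no-op so that verifying the call does not trigger the full failure-handling machinery.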
