Commit 343bf8b

Switch to using CacheBuilder instead of home grown expiration

1 parent 5b7c482
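For context, the pattern this commit adopts looks like the sketch below. This is not the commit's own code: the object name, key/value types, and the AtomicLong time source (standing in for Spark's Clock) are illustrative. The point is that Guava's CacheBuilder consults a caller-supplied Ticker for expireAfterWrite, so expiry can be driven deterministically instead of by wall-clock time.

import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong

import com.google.common.base.Ticker
import com.google.common.cache.CacheBuilder

object TickerExpirySketch {
  def main(args: Array[String]): Unit = {
    // Manually advanced time source; a hypothetical stand-in for Spark's Clock.
    val nowMillis = new AtomicLong(0L)

    // Guava reads the Ticker (nanosecond resolution) to decide when entries
    // written more than 60 seconds ago should expire.
    val cache = CacheBuilder.newBuilder()
      .expireAfterWrite(60L, TimeUnit.SECONDS)
      .ticker(new Ticker {
        override def read(): Long = TimeUnit.MILLISECONDS.toNanos(nowMillis.get())
      })
      .build[String, String]()

    cache.put("executor1", "decommission info")
    assert(cache.getIfPresent("executor1") != null)

    nowMillis.addAndGet(61 * 1000L) // advance past the 60-second window
    assert(cache.getIfPresent("executor1") == null)
  }
}

This is why the home-grown TreeMap-of-expiry-seconds bookkeeping below can be deleted: the cache owns the expiration logic, and tests control it through the clock.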

File tree

  core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
  core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala

2 files changed: +26, -24 lines

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 22 additions & 18 deletions
@@ -18,7 +18,6 @@
 package org.apache.spark.scheduler
 
 import java.nio.ByteBuffer
-import java.util
 import java.util.{Timer, TimerTask}
 import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
 import java.util.concurrent.atomic.AtomicLong
@@ -27,6 +26,9 @@ import scala.collection.mutable
 import scala.collection.mutable.{ArrayBuffer, Buffer, HashMap, HashSet}
 import scala.util.Random
 
+import com.google.common.base.Ticker
+import com.google.common.cache.CacheBuilder
+
 import org.apache.spark._
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.executor.ExecutorMetrics
@@ -137,9 +139,21 @@ private[spark] class TaskSchedulerImpl(
   // IDs of the tasks running on each executor
   private val executorIdToRunningTaskIds = new HashMap[String, HashSet[Long]]
 
+  // We add executors here when we first get decommission notification for them. Executors can
+  // continue to run even after being asked to decommission, but they will eventually exit.
   val executorsPendingDecommission = new HashMap[String, ExecutorDecommissionInfo]
-  // map of second to list of executors to clear form the above map
-  val decommissioningExecutorsToGc = new util.TreeMap[Long, mutable.ArrayBuffer[String]]()
+
+  // When they exit and we know of that via heartbeat failure, we will add them to this cache.
+  // This cache is consulted to know if a fetch failure is because a source executor was
+  // decommissioned.
+  lazy val decommissionedExecutorsRemoved = CacheBuilder.newBuilder()
+    .expireAfterWrite(
+      conf.getLong("spark.decommissioningRememberAfterRemoval.seconds", 60L), TimeUnit.SECONDS)
+    .ticker(new Ticker{
+      override def read(): Long = TimeUnit.MILLISECONDS.toNanos(clock.getTimeMillis())
+    })
+    .build[String, ExecutorDecommissionInfo]()
+    .asMap()
 
   def runningTasksByExecutors: Map[String, Int] = synchronized {
     executorIdToRunningTaskIds.toMap.mapValues(_.size).toMap
@@ -924,13 +938,9 @@
 
   override def getExecutorDecommissionInfo(executorId: String)
     : Option[ExecutorDecommissionInfo] = synchronized {
-    import scala.collection.JavaConverters._
-    // Garbage collect old decommissioning entries
-    val secondsToGcUptil = TimeUnit.MILLISECONDS.toSeconds(clock.getTimeMillis())
-    val headMap = decommissioningExecutorsToGc.headMap(secondsToGcUptil)
-    headMap.values().asScala.flatten.foreach(executorsPendingDecommission -= _)
-    headMap.clear()
-    executorsPendingDecommission.get(executorId)
+    executorsPendingDecommission
+      .get(executorId)
+      .orElse(Option(decommissionedExecutorsRemoved.get(executorId)))
   }
 
   override def executorLost(executorId: String, givenReason: ExecutorLossReason): Unit = {
@@ -1037,14 +1047,8 @@
     }
 
 
-    val decomInfo = executorsPendingDecommission.get(executorId)
-    if (decomInfo.isDefined) {
-      val rememberSeconds =
-        conf.getInt("spark.decommissioningRememberAfterRemoval.seconds", 60)
-      val gcSecond = TimeUnit.MILLISECONDS.toSeconds(clock.getTimeMillis()) + rememberSeconds
-      decommissioningExecutorsToGc.computeIfAbsent(gcSecond, _ => mutable.ArrayBuffer.empty) +=
-        executorId
-    }
+    val decomInfo = executorsPendingDecommission.remove(executorId)
+    decomInfo.foreach(decommissionedExecutorsRemoved.put(executorId, _))
 
     if (reason != LossReasonPending) {
       executorIdToHost -= executorId

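One detail worth noting in the new getExecutorDecommissionInfo: asMap() exposes the cache as a live java.util.concurrent.ConcurrentMap view, whose get returns null on a miss rather than an Option. A rough sketch of why the Option(...) wrapping is needed (the names here are illustrative, not the commit's code):

import java.util.concurrent.TimeUnit

import com.google.common.cache.CacheBuilder

object AsMapLookupSketch {
  def main(args: Array[String]): Unit = {
    // asMap() is a live ConcurrentMap view of the cache; puts and
    // removals on the view write through to the cache itself.
    val view = CacheBuilder.newBuilder()
      .expireAfterWrite(60L, TimeUnit.SECONDS)
      .build[String, String]()
      .asMap()

    view.put("executor1", "info")

    // ConcurrentMap.get returns null on a miss, so the lookup is wrapped
    // in Option(...) to recover None instead of letting a null leak out.
    assert(Option(view.get("executor1")).contains("info"))
    assert(Option(view.get("executor2")).isEmpty)
  }
}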
core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala

Lines changed: 4 additions & 6 deletions
@@ -1855,20 +1855,18 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B
     assert(scheduler.getExecutorDecommissionInfo("executor1").isDefined)
     clock.advance(2000)
     scheduler.executorLost("executor1", ExecutorExited(0, false, "normal"))
-    assert(scheduler.decommissioningExecutorsToGc.size === 1)
-    assert(scheduler.executorsPendingDecommission.size === 1)
+    assert(scheduler.decommissionedExecutorsRemoved.size === 1)
+    assert(scheduler.executorsPendingDecommission.isEmpty)
     clock.advance(2000)
     // It hasn't been 60 seconds yet before removal
     assert(scheduler.getExecutorDecommissionInfo("executor1").isDefined)
     scheduler.executorDecommission("executor1", ExecutorDecommissionInfo("", false))
     clock.advance(2000)
-    assert(scheduler.decommissioningExecutorsToGc.size === 1)
-    assert(scheduler.executorsPendingDecommission.size === 1)
+    assert(scheduler.decommissionedExecutorsRemoved.size === 1)
     assert(scheduler.getExecutorDecommissionInfo("executor1").isDefined)
     clock.advance(61000)
     assert(scheduler.getExecutorDecommissionInfo("executor1").isEmpty)
-    assert(scheduler.decommissioningExecutorsToGc.isEmpty)
-    assert(scheduler.executorsPendingDecommission.isEmpty)
+    assert(scheduler.decommissionedExecutorsRemoved.isEmpty)
   }
 
   /**
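One caveat behind these assertions: Guava evicts expired entries lazily, during reads and writes, rather than on a background timer. Reads see expired entries as absent immediately, but size() can lag until some cache activity (or an explicit cleanUp()) performs the deferred bookkeeping. A sketch of that behavior, again with an illustrative manually driven Ticker rather than the commit's code:

import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong

import com.google.common.base.Ticker
import com.google.common.cache.CacheBuilder

object LazyEvictionSketch {
  def main(args: Array[String]): Unit = {
    val nowMillis = new AtomicLong(0L)
    val cache = CacheBuilder.newBuilder()
      .expireAfterWrite(60L, TimeUnit.SECONDS)
      .ticker(new Ticker {
        override def read(): Long = TimeUnit.MILLISECONDS.toNanos(nowMillis.get())
      })
      .build[String, String]()

    cache.put("executor1", "info")
    nowMillis.addAndGet(61 * 1000L)

    // Expired entries are invisible to reads right away...
    assert(cache.getIfPresent("executor1") == null)

    // ...and cleanUp() forces the deferred eviction so size() agrees.
    cache.cleanUp()
    assert(cache.size() == 0L)
  }
}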
