
Commit 408dada

Author: Marcelo Vanzin (committed)

Merge branch 'master' into SPARK-4194

Conflicts:
    core/src/main/scala/org/apache/spark/SparkContext.scala
    core/src/main/scala/org/apache/spark/executor/Executor.scala
    core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
2 parents 2621609 + 30363ed commit 408dada

File tree: 47 files changed, +989 / -789 lines


bin/load-spark-env.cmd

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+@echo off
+
+rem
+rem Licensed to the Apache Software Foundation (ASF) under one or more
+rem contributor license agreements.  See the NOTICE file distributed with
+rem this work for additional information regarding copyright ownership.
+rem The ASF licenses this file to You under the Apache License, Version 2.0
+rem (the "License"); you may not use this file except in compliance with
+rem the License.  You may obtain a copy of the License at
+rem
+rem    http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+rem
+
+rem This script loads spark-env.cmd if it exists, and ensures it is only loaded once.
+rem spark-env.cmd is loaded from SPARK_CONF_DIR if set, or within the current directory's
+rem conf/ subdirectory.
+
+if [%SPARK_ENV_LOADED%] == [] (
+  set SPARK_ENV_LOADED=1
+
+  if not [%SPARK_CONF_DIR%] == [] (
+    set user_conf_dir=%SPARK_CONF_DIR%
+  ) else (
+    set user_conf_dir=%~dp0..\..\conf
+  )
+
+  call :LoadSparkEnv
+)
+
+rem Setting SPARK_SCALA_VERSION if not already set.
+
+set ASSEMBLY_DIR2=%SPARK_HOME%/assembly/target/scala-2.11
+set ASSEMBLY_DIR1=%SPARK_HOME%/assembly/target/scala-2.10
+
+if [%SPARK_SCALA_VERSION%] == [] (
+
+  if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
+    echo "Presence of build for both scala versions(SCALA 2.10 and SCALA 2.11) detected."
+    echo "Either clean one of them or, set SPARK_SCALA_VERSION=2.11 in spark-env.cmd."
+    exit 1
+  )
+  if exist %ASSEMBLY_DIR2% (
+    set SPARK_SCALA_VERSION=2.11
+  ) else (
+    set SPARK_SCALA_VERSION=2.10
+  )
+)
+exit /b 0
+
+:LoadSparkEnv
+if exist "%user_conf_dir%\spark-env.cmd" (
+  call "%user_conf_dir%\spark-env.cmd"
+)

bin/pyspark2.cmd

Lines changed: 1 addition & 2 deletions
@@ -20,8 +20,7 @@ rem
 rem Figure out where the Spark framework is installed
 set SPARK_HOME=%~dp0..

-rem Load environment variables from conf\spark-env.cmd, if it exists
-if exist "%SPARK_HOME%\conf\spark-env.cmd" call "%SPARK_HOME%\conf\spark-env.cmd"
+call %SPARK_HOME%\bin\load-spark-env.cmd

 rem Figure out which Python to use.
 if "x%PYSPARK_DRIVER_PYTHON%"=="x" (

bin/run-example2.cmd

Lines changed: 1 addition & 2 deletions
@@ -25,8 +25,7 @@ set FWDIR=%~dp0..\
 rem Export this as SPARK_HOME
 set SPARK_HOME=%FWDIR%

-rem Load environment variables from conf\spark-env.cmd, if it exists
-if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"
+call %SPARK_HOME%\bin\load-spark-env.cmd

 rem Test that an argument was given
 if not "x%1"=="x" goto arg_given

bin/spark-class2.cmd

Lines changed: 1 addition & 2 deletions
@@ -20,8 +20,7 @@ rem
 rem Figure out where the Spark framework is installed
 set SPARK_HOME=%~dp0..

-rem Load environment variables from conf\spark-env.cmd, if it exists
-if exist "%SPARK_HOME%\conf\spark-env.cmd" call "%SPARK_HOME%\conf\spark-env.cmd"
+call %SPARK_HOME%\bin\load-spark-env.cmd

 rem Test that an argument was given
 if "x%1"=="x" (

core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala

Lines changed: 43 additions & 23 deletions
@@ -17,15 +17,15 @@

 package org.apache.spark

-import scala.concurrent.duration._
-import scala.collection.mutable
+import java.util.concurrent.{ScheduledFuture, TimeUnit, Executors}

-import akka.actor.{Actor, Cancellable}
+import scala.collection.mutable

 import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.rpc.{ThreadSafeRpcEndpoint, RpcEnv, RpcCallContext}
 import org.apache.spark.storage.BlockManagerId
 import org.apache.spark.scheduler.{SlaveLost, TaskScheduler}
-import org.apache.spark.util.ActorLogReceive
+import org.apache.spark.util.Utils

 /**
  * A heartbeat from executors to the driver. This is a shared message used by several internal
@@ -51,9 +51,11 @@ private[spark] case class HeartbeatResponse(reregisterBlockManager: Boolean)
  * Lives in the driver to receive heartbeats from executors..
  */
 private[spark] class HeartbeatReceiver(sc: SparkContext)
-  extends Actor with ActorLogReceive with Logging {
+  extends ThreadSafeRpcEndpoint with Logging {
+
+  override val rpcEnv: RpcEnv = sc.env.rpcEnv

-  private var scheduler: TaskScheduler = null
+  private[spark] var scheduler: TaskScheduler = null

   // executor ID -> timestamp of when the last heartbeat from this executor was received
   private val executorLastSeen = new mutable.HashMap[String, Long]
@@ -69,34 +71,44 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
     sc.conf.getOption("spark.network.timeoutInterval").map(_.toLong * 1000).
       getOrElse(sc.conf.getLong("spark.storage.blockManagerTimeoutIntervalMs", 60000))

-  private var timeoutCheckingTask: Cancellable = null
-
-  override def preStart(): Unit = {
-    import context.dispatcher
-    timeoutCheckingTask = context.system.scheduler.schedule(0.seconds,
-      checkTimeoutIntervalMs.milliseconds, self, ExpireDeadHosts)
-    super.preStart()
+  private var timeoutCheckingTask: ScheduledFuture[_] = null
+
+  private val timeoutCheckingThread = Executors.newSingleThreadScheduledExecutor(
+    Utils.namedThreadFactory("heartbeat-timeout-checking-thread"))
+
+  private val killExecutorThread = Executors.newSingleThreadExecutor(
+    Utils.namedThreadFactory("kill-executor-thread"))
+
+  override def onStart(): Unit = {
+    timeoutCheckingTask = timeoutCheckingThread.scheduleAtFixedRate(new Runnable {
+      override def run(): Unit = Utils.tryLogNonFatalError {
+        Option(self).foreach(_.send(ExpireDeadHosts))
+      }
+    }, 0, checkTimeoutIntervalMs, TimeUnit.MILLISECONDS)
   }
-
-  override def receiveWithLogging: PartialFunction[Any, Unit] = {
+
+  override def receive: PartialFunction[Any, Unit] = {
+    case ExpireDeadHosts =>
+      expireDeadHosts()
     case TaskSchedulerIsSet =>
       scheduler = sc.taskScheduler
+  }
+
+  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
     case heartbeat @ Heartbeat(executorId, taskMetrics, blockManagerId) =>
       if (scheduler != null) {
         val unknownExecutor = !scheduler.executorHeartbeatReceived(
           executorId, taskMetrics, blockManagerId)
         val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
         executorLastSeen(executorId) = System.currentTimeMillis()
-        sender ! response
+        context.reply(response)
       } else {
         // Because Executor will sleep several seconds before sending the first "Heartbeat", this
         // case rarely happens. However, if it really happens, log it and ask the executor to
         // register itself again.
         logWarning(s"Dropping $heartbeat because TaskScheduler is not ready yet")
-        sender ! HeartbeatResponse(reregisterBlockManager = true)
+        context.reply(HeartbeatResponse(reregisterBlockManager = true))
       }
-    case ExpireDeadHosts =>
-      expireDeadHosts()
   }

   private def expireDeadHosts(): Unit = {
@@ -109,17 +121,25 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
         scheduler.executorLost(executorId, SlaveLost("Executor heartbeat " +
           s"timed out after ${now - lastSeenMs} ms"))
         if (sc.supportDynamicAllocation) {
-          sc.killExecutor(executorId)
+          // Asynchronously kill the executor to avoid blocking the current thread
+          killExecutorThread.submit(new Runnable {
+            override def run(): Unit = sc.killExecutor(executorId)
+          })
         }
         executorLastSeen.remove(executorId)
       }
     }
   }

-  override def postStop(): Unit = {
+  override def onStop(): Unit = {
     if (timeoutCheckingTask != null) {
-      timeoutCheckingTask.cancel()
+      timeoutCheckingTask.cancel(true)
     }
-    super.postStop()
+    timeoutCheckingThread.shutdownNow()
+    killExecutorThread.shutdownNow()
   }
 }
+
+object HeartbeatReceiver {
+  val ENDPOINT_NAME = "HeartbeatReceiver"
+}
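
Note (for orientation only): a minimal sketch of how the converted endpoint might be wired up and queried, assuming the setupEndpoint/askWithReply API of the new org.apache.spark.rpc package used elsewhere in this commit. The vals sc, executorId, taskMetrics and blockManagerId are placeholders; the real SparkContext and Executor changes live in other files of this commit.

// Hypothetical driver-side registration under the new ENDPOINT_NAME constant.
val heartbeatReceiverRef = sc.env.rpcEnv.setupEndpoint(
  HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(sc))

// Hypothetical executor-side heartbeat: what used to be an Akka "ask" answered
// with "sender ! response" is now a typed askWithReply against the
// RpcEndpointRef, answered via context.reply(...).
val response = heartbeatReceiverRef.askWithReply[HeartbeatResponse](
  Heartbeat(executorId, taskMetrics, blockManagerId))
if (response.reregisterBlockManager) {
  // the executor re-registers its BlockManager, as before
}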

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 29 additions & 32 deletions
@@ -21,13 +21,11 @@ import java.io._
 import java.util.concurrent.ConcurrentHashMap
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}

-import scala.collection.mutable.{HashSet, HashMap, Map}
-import scala.concurrent.Await
+import scala.collection.mutable.{HashSet, Map}
 import scala.collection.JavaConversions._
+import scala.reflect.ClassTag

-import akka.actor._
-import akka.pattern.ask
-
+import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv, RpcCallContext, RpcEndpoint}
 import org.apache.spark.scheduler.MapStatus
 import org.apache.spark.shuffle.MetadataFetchFailedException
 import org.apache.spark.storage.BlockManagerId
@@ -38,34 +36,35 @@ private[spark] case class GetMapOutputStatuses(shuffleId: Int)
   extends MapOutputTrackerMessage
 private[spark] case object StopMapOutputTracker extends MapOutputTrackerMessage

-/** Actor class for MapOutputTrackerMaster */
-private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster, conf: SparkConf)
-  extends Actor with ActorLogReceive with Logging {
+/** RpcEndpoint class for MapOutputTrackerMaster */
+private[spark] class MapOutputTrackerMasterEndpoint(
+    override val rpcEnv: RpcEnv, tracker: MapOutputTrackerMaster, conf: SparkConf)
+  extends RpcEndpoint with Logging {
   val maxAkkaFrameSize = AkkaUtils.maxFrameSizeBytes(conf)

-  override def receiveWithLogging: PartialFunction[Any, Unit] = {
+  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
     case GetMapOutputStatuses(shuffleId: Int) =>
-      val hostPort = sender.path.address.hostPort
+      val hostPort = context.sender.address.hostPort
       logInfo("Asked to send map output locations for shuffle " + shuffleId + " to " + hostPort)
       val mapOutputStatuses = tracker.getSerializedMapOutputStatuses(shuffleId)
       val serializedSize = mapOutputStatuses.size
       if (serializedSize > maxAkkaFrameSize) {
         val msg = s"Map output statuses were $serializedSize bytes which " +
           s"exceeds spark.akka.frameSize ($maxAkkaFrameSize bytes)."

-        /* For SPARK-1244 we'll opt for just logging an error and then throwing an exception.
-         * Note that on exception the actor will just restart. A bigger refactoring (SPARK-1239)
-         * will ultimately remove this entire code path. */
+        /* For SPARK-1244 we'll opt for just logging an error and then sending it to the sender.
+         * A bigger refactoring (SPARK-1239) will ultimately remove this entire code path. */
         val exception = new SparkException(msg)
         logError(msg, exception)
-        throw exception
+        context.sendFailure(exception)
+      } else {
+        context.reply(mapOutputStatuses)
       }
-      sender ! mapOutputStatuses

     case StopMapOutputTracker =>
-      logInfo("MapOutputTrackerActor stopped!")
-      sender ! true
-      context.stop(self)
+      logInfo("MapOutputTrackerMasterEndpoint stopped!")
+      context.reply(true)
+      stop()
   }
 }

@@ -75,12 +74,9 @@ private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster
  * (driver and executor) use different HashMap to store its metadata.
  */
 private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging {
-  private val timeout = AkkaUtils.askTimeout(conf)
-  private val retryAttempts = AkkaUtils.numRetries(conf)
-  private val retryIntervalMs = AkkaUtils.retryWaitMs(conf)

-  /** Set to the MapOutputTrackerActor living on the driver. */
-  var trackerActor: ActorRef = _
+  /** Set to the MapOutputTrackerMasterEndpoint living on the driver. */
+  var trackerEndpoint: RpcEndpointRef = _

   /**
    * This HashMap has different behavior for the driver and the executors.
@@ -105,22 +101,22 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
   private val fetching = new HashSet[Int]

   /**
-   * Send a message to the trackerActor and get its result within a default timeout, or
+   * Send a message to the trackerEndpoint and get its result within a default timeout, or
    * throw a SparkException if this fails.
    */
-  protected def askTracker(message: Any): Any = {
+  protected def askTracker[T: ClassTag](message: Any): T = {
     try {
-      AkkaUtils.askWithReply(message, trackerActor, retryAttempts, retryIntervalMs, timeout)
+      trackerEndpoint.askWithReply[T](message)
     } catch {
       case e: Exception =>
         logError("Error communicating with MapOutputTracker", e)
         throw new SparkException("Error communicating with MapOutputTracker", e)
     }
   }

-  /** Send a one-way message to the trackerActor, to which we expect it to reply with true. */
+  /** Send a one-way message to the trackerEndpoint, to which we expect it to reply with true. */
   protected def sendTracker(message: Any) {
-    val response = askTracker(message)
+    val response = askTracker[Boolean](message)
     if (response != true) {
       throw new SparkException(
         "Error reply received from MapOutputTracker. Expecting true, got " + response.toString)
@@ -157,11 +153,10 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging

       if (fetchedStatuses == null) {
         // We won the race to fetch the output locs; do so
-        logInfo("Doing the fetch; tracker actor = " + trackerActor)
+        logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint)
         // This try-finally prevents hangs due to timeouts:
         try {
-          val fetchedBytes =
-            askTracker(GetMapOutputStatuses(shuffleId)).asInstanceOf[Array[Byte]]
+          val fetchedBytes = askTracker[Array[Byte]](GetMapOutputStatuses(shuffleId))
           fetchedStatuses = MapOutputTracker.deserializeMapStatuses(fetchedBytes)
           logInfo("Got the output locations")
           mapStatuses.put(shuffleId, fetchedStatuses)
@@ -328,7 +323,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
   override def stop() {
     sendTracker(StopMapOutputTracker)
     mapStatuses.clear()
-    trackerActor = null
+    trackerEndpoint = null
     metadataCleaner.cancel()
     cachedSerializedStatuses.clear()
   }
@@ -350,6 +345,8 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr

 private[spark] object MapOutputTracker extends Logging {

+  val ENDPOINT_NAME = "MapOutputTracker"
+
   // Serialize an array of map output locations into an efficient byte format so that we can send
   // it to reduce tasks. We do this by compressing the serialized bytes using GZIP. They will
   // generally be pretty compressible because many map outputs will be on the same hostname.
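
Note: a hypothetical wiring sketch, not taken from this diff, assuming the same RpcEnv.setupEndpoint registration used for other endpoints in this commit; the actual registration of MapOutputTrackerMasterEndpoint presumably happens in a file not shown in this excerpt, so the surrounding vals (rpcEnv, conf) are illustrative placeholders.

// Illustrative only: register the master-side endpoint under the new
// ENDPOINT_NAME constant and hand the resulting RpcEndpointRef to the
// tracker, replacing the old ActorRef field.
val tracker = new MapOutputTrackerMaster(conf)
tracker.trackerEndpoint = rpcEnv.setupEndpoint(
  MapOutputTracker.ENDPOINT_NAME,
  new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, conf))

// Inside MapOutputTracker, fetches now go through the ClassTag-typed helper,
// so the old asInstanceOf[Array[Byte]] cast disappears:
//   val bytes = askTracker[Array[Byte]](GetMapOutputStatuses(shuffleId))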
