Commits (40)
39ba441
spark-9104 first draft version
liyezhang556520 Aug 17, 2015
ecc1044
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Aug 17, 2015
2101538
show N/A for nio
liyezhang556520 Aug 17, 2015
9ccaf88
handle executor add and remove events for memoryTab
liyezhang556520 Aug 18, 2015
13c17fb
show removed executors info on page
liyezhang556520 Aug 19, 2015
c9b44b1
add stage memory trace
liyezhang556520 Aug 19, 2015
984feaf
add history support for heartbeat event
liyezhang556520 Aug 20, 2015
2501c82
limit history event log frequency
liyezhang556520 Aug 20, 2015
e0ae855
add some comments for EventLoggingListener
liyezhang556520 Aug 20, 2015
7491279
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Aug 20, 2015
424c172
scala style fix
liyezhang556520 Aug 20, 2015
f21a804
remove executor port and fix test failure
liyezhang556520 Aug 21, 2015
2f3d30b
merge apache/master after master updated
liyezhang556520 Sep 25, 2015
7b846a2
work with JavaConverters
liyezhang556520 Oct 9, 2015
41874aa
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Oct 9, 2015
0531d0f
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Oct 29, 2015
27b7da1
refine the code according to Imran's comments and the design doc
liyezhang556520 Nov 2, 2015
a8fcf74
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Nov 2, 2015
f2f0e64
fix scala style test
liyezhang556520 Nov 3, 2015
5f7a999
capitalize class name
liyezhang556520 Nov 3, 2015
5ad7a6a
change task metrics json format back to origin
liyezhang556520 Nov 3, 2015
c836fb9
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Nov 3, 2015
b5aa4da
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Nov 5, 2015
e8e2bdd
Merge remote-tracking branch 'apache/master' into netMem-9104
liyezhang556520 Nov 6, 2015
1dffa29
according to Imran's comment, refine the code
liyezhang556520 Nov 17, 2015
75e63c3
add first test case
liyezhang556520 Nov 17, 2015
0c1241c
fix scala style
liyezhang556520 Nov 17, 2015
c78628e
add more test cases, with event logging test left
liyezhang556520 Nov 19, 2015
a93bd96
scala style fix
liyezhang556520 Nov 19, 2015
89214f3
fix test fail and add event logging unit test
liyezhang556520 Nov 23, 2015
1ed48c1
scala style
liyezhang556520 Nov 23, 2015
cb307aa
merge to apache/master branch, fix merge conflict
liyezhang556520 Nov 24, 2015
b438077
roll back useless change
liyezhang556520 Nov 24, 2015
4123ac7
modify the code according to Imran's comments, mainly with unit test
liyezhang556520 Dec 8, 2015
2ce9fd9
fix scala style
liyezhang556520 Dec 8, 2015
17d094e
merge to master branch with tests update
liyezhang556520 Dec 8, 2015
4b3dbe4
change port to option and some bug fixes
liyezhang556520 Dec 9, 2015
0ea7cab
address comments of code refinement
liyezhang556520 Jan 12, 2016
5e031ce
merge to latest master branch from spark-9104-draft
liyezhang556520 Jan 12, 2016
87f8172
fix import ordering error
liyezhang556520 Jan 12, 2016
7 changes: 4 additions & 3 deletions core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
@@ -22,7 +22,7 @@ import java.util.concurrent.{ScheduledFuture, TimeUnit}
import scala.collection.mutable
import scala.concurrent.Future

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics}
import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
import org.apache.spark.scheduler._
import org.apache.spark.storage.BlockManagerId
@@ -35,6 +35,7 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}
*/
private[spark] case class Heartbeat(
executorId: String,
executorMetrics: ExecutorMetrics,
taskMetrics: Array[(Long, TaskMetrics)], // taskId -> TaskMetrics
blockManagerId: BlockManagerId)

@@ -119,14 +120,14 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)
context.reply(true)

// Messages received from executors
case heartbeat @ Heartbeat(executorId, taskMetrics, blockManagerId) =>
case heartbeat @ Heartbeat(executorId, executorMetrics, taskMetrics, blockManagerId) =>
if (scheduler != null) {
if (executorLastSeen.contains(executorId)) {
executorLastSeen(executorId) = clock.getTimeMillis()
eventLoopThread.submit(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
val unknownExecutor = !scheduler.executorHeartbeatReceived(
executorId, taskMetrics, blockManagerId)
executorId, executorMetrics, taskMetrics, blockManagerId)
val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
context.reply(response)
}
22 changes: 20 additions & 2 deletions core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -86,6 +86,12 @@ private[spark] class Executor(
env.blockManager.initialize(conf.getAppId)
}

private val executorMetrics: ExecutorMetrics = new ExecutorMetrics
executorMetrics.setHostname(Utils.localHostName)
Contributor:
Is the hostname really enough? What about multiple executors on the same host? The Heartbeat already has the executorId; maybe that is all we need?

Contributor (author):
It would be better to use HOST:PORT, but the executor port cannot be obtained here; it would have to come from RpcEnv.address.port. We can get the executorId on the driver side when receiving the message, which might be enough to tell executors apart, but since we will show removed executors on the page, the executorId alone does not tell us where an executor was located, because the Executors tab only shows active executors. We could drop the hostname here if we supported showing removed executors on the Executors tab.

Contributor:
Sorry, I forgot about this in my last round. I don't understand the first sentence of your reply: why can't you get the hostname and port? It looks like you can do exactly what you suggested and get them from the rpc env with executorMetrics.setHostPort(env.rpcEnv.address.hostPort).

Contributor (author):
Sorry, I had forgotten why I originally didn't take the port from the rpcEnv, but it seems we can get it from there directly. Let me try to add it back.

if (env.rpcEnv.address != null) {
executorMetrics.setPort(Some(env.rpcEnv.address.port))
}
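
As an aside on the resolution above, a minimal sketch of an equivalent Option-based guard (hypothetical; the PR itself uses the explicit null check shown):

  // Wrap the possibly-unbound RpcEnv address in an Option instead of a null check.
  // `env` is the executor's SparkEnv, as in the surrounding code.
  executorMetrics.setPort(Option(env.rpcEnv.address).map(_.port))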

// Whether to load classes in user jars before those in Spark jars
private val userClassPathFirst = conf.getBoolean("spark.executor.userClassPathFirst", false)

@@ -431,7 +437,7 @@ private[spark] class Executor(
metrics.updateAccumulators()

if (isLocal) {
// JobProgressListener will hold an reference of it during
// JobProgressListener will hold a reference of it during
// onExecutorMetricsUpdate(), then JobProgressListener can not see
// the changes of metrics any more, so make a deep copy of it
val copiedMetrics = Utils.deserialize[TaskMetrics](Utils.serialize(metrics))
@@ -444,7 +450,19 @@ }
}
}

val message = Heartbeat(executorId, tasksMetrics.toArray, env.blockManager.blockManagerId)
env.blockTransferService.getMemMetrics(this.executorMetrics)
val executorMetrics = if (isLocal) {
// JobProgressListener might hold a reference to it during onExecutorMetricsUpdate()
// in the future; it would then no longer see changes to the metrics, so make a deep
// copy here to guard against that.
Utils.deserialize[ExecutorMetrics](Utils.serialize(this.executorMetrics))
Contributor:
What's the point of this? Sorry if we discussed it earlier. If it's just to test that ExecutorMetrics really is serializable, that would be better in a test case.

Contributor (author):
This is due to SPARK-3465. Currently we do not have any aggregation operations for ExecutorMetrics, so we can remove this and add it back when we do some aggregation.

Contributor:
Ah, this is a great point. In that case, I think you can leave it in for now, but add a comment that the serialization and deserialization is just to make a copy, due to SPARK-3465.

} else {
this.executorMetrics
}
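
A minimal sketch of the copy-by-serialization idiom discussed above, assuming the Utils.serialize/Utils.deserialize helpers from org.apache.spark.util.Utils:

  // Round-tripping through the serializer yields a deep copy, so a listener that
  // retains the reference (see SPARK-3465) never observes later in-place updates.
  val snapshot = Utils.deserialize[ExecutorMetrics](Utils.serialize(executorMetrics))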

val message = Heartbeat(
executorId, executorMetrics, tasksMetrics.toArray, env.blockManager.blockManagerId)

try {
val response = heartbeatReceiverRef.askWithRetry[HeartbeatResponse](
message, RpcTimeout(conf, "spark.executor.heartbeatInterval", "10s"))
91 changes: 91 additions & 0 deletions core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala
@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi

/**
* :: DeveloperApi ::
* Metrics tracked during the execution of an executor.
*
* When adding new fields, take into consideration that the whole object can be serialized for
* shipping off at any time to consumers of the SparkListener interface.
*/
@DeveloperApi
class ExecutorMetrics extends Serializable {

/**
* Name of the host the executor runs on
*/
private var _hostname: String = ""
def hostname: String = _hostname
private[spark] def setHostname(value: String) = _hostname = value

/**
* Port the executor runs on, if known
*/
private var _port: Option[Int] = None
def port: Option[Int] = _port
private[spark] def setPort(value: Option[Int]) = _port = value

private[spark] def hostPort: String = port match {
  case Some(p) => hostname + ":" + p
  case None => hostname
}

private var _transportMetrics: TransportMetrics = new TransportMetrics
def transportMetrics: TransportMetrics = _transportMetrics
private[spark] def setTransportMetrics(value: TransportMetrics) = {
_transportMetrics = value
}
}

object ExecutorMetrics extends Serializable {
def apply(
hostName: String,
port: Option[Int],
transportMetrics: TransportMetrics): ExecutorMetrics = {
val execMetrics = new ExecutorMetrics
execMetrics.setHostname(hostName)
execMetrics.setPort(port)
execMetrics.setTransportMetrics(transportMetrics)
execMetrics
}
}

/**
* :: DeveloperApi ::
* Metrics for network layer
*/
@DeveloperApi
class TransportMetrics (
Contributor:
I think this needs to be Serializable (in fact, there should probably be some unit test which makes sure that ExecutorHeartbeats are serializable ...)

Contributor (author):
Yes, that's true, thank you for pointing it out.

val timeStamp: Long = System.currentTimeMillis,
val onHeapSize: Long = 0L,
val offHeapSize: Long = 0L) extends Serializable

object TransportMetrics extends Serializable {
def apply(
timeStamp: Long,
onHeapSize: Long,
offHeapSize: Long): TransportMetrics = {
new TransportMetrics(timeStamp, onHeapSize, offHeapSize)
}
}
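
Following up on the serializability concern above, a sketch of the kind of unit test suggested (names are illustrative, not from the PR; assumes a SparkFunSuite-style suite and the Utils serialization helpers):

  import org.apache.spark.util.Utils

  test("ExecutorMetrics and TransportMetrics survive serialization") {
    val metrics = ExecutorMetrics("host1", Some(7337), TransportMetrics(0L, 1024L, 2048L))
    val copy = Utils.deserialize[ExecutorMetrics](Utils.serialize(metrics))
    assert(copy.hostPort === "host1:7337")
    assert(copy.transportMetrics.onHeapSize === 1024L)
    assert(copy.transportMetrics.offHeapSize === 2048L)
  }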
core/src/main/scala/org/apache/spark/network/BlockTransferService.scala
@@ -23,6 +23,7 @@ import java.nio.ByteBuffer
import scala.concurrent.{Await, Future, Promise}
import scala.concurrent.duration.Duration

import org.apache.spark.executor.ExecutorMetrics
import org.apache.spark.Logging
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient}
@@ -37,6 +38,11 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Logging
*/
def init(blockDataManager: BlockDataManager)

/**
* Collect the transfer service's current memory metrics into the given ExecutorMetrics.
*/
private[spark] def getMemMetrics(executorMetrics: ExecutorMetrics): Unit

/**
* Tear down the transfer service.
*/
core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
@@ -22,7 +22,10 @@ import java.nio.ByteBuffer
import scala.collection.JavaConverters._
import scala.concurrent.{Future, Promise}

import org.apache.spark.{SecurityManager, SparkConf}
import io.netty.buffer._

import org.apache.spark.{SecurityManager, SparkConf, SparkEnv}
import org.apache.spark.executor.{ExecutorMetrics, TransportMetrics}
import org.apache.spark.network._
import org.apache.spark.network.buffer.ManagedBuffer
import org.apache.spark.network.client.{RpcResponseCallback, TransportClientBootstrap, TransportClientFactory}
@@ -33,7 +36,7 @@ import org.apache.spark.network.shuffle.protocol.UploadBlock
import org.apache.spark.network.util.JavaUtils
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.util.Utils
import org.apache.spark.util.{Clock, SystemClock, Utils}

/**
* A BlockTransferService that uses Netty to fetch a set of blocks at a time.
@@ -50,6 +53,40 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManager
private[this] var server: TransportServer = _
private[this] var clientFactory: TransportClientFactory = _
private[this] var appId: String = _
private[this] var clock: Clock = new SystemClock()

/**
* Use a different clock for this transfer service. This is mainly used for testing.
*/
def setClock(newClock: Clock): Unit = {
clock = newClock
}

private[spark] override def getMemMetrics(executorMetrics: ExecutorMetrics): Unit = {
val currentTime = clock.getTimeMillis()
val clientPooledAllocator = clientFactory.getPooledAllocator()
val serverAllocator = server.getAllocator()
val clientOffHeapSize: Long = sumOfMetrics(convToScala(clientPooledAllocator.directArenas()))
val clientOnHeapSize: Long = sumOfMetrics(convToScala(clientPooledAllocator.heapArenas()))
val serverOffHeapSize: Long = sumOfMetrics(convToScala(serverAllocator.directArenas()))
val serverOnHeapSize: Long = sumOfMetrics(convToScala(serverAllocator.heapArenas()))
logDebug(s"Current Netty Client offheap size is $clientOffHeapSize, " +
s"Client heap size is $clientOnHeapSize, Server offheap size is $serverOffHeapSize, " +
s"server heap size is $serverOnHeapSize, executor id is " +
s"${SparkEnv.get.blockManager.blockManagerId.executorId}")
executorMetrics.setTransportMetrics(TransportMetrics(currentTime,
clientOnHeapSize + serverOnHeapSize, clientOffHeapSize + serverOffHeapSize))
}

private def convToScala = (x: java.util.List[PoolArenaMetric]) => x.asScala

private def sumOfMetrics(arenaMetricList: Seq[PoolArenaMetric]): Long = {
arenaMetricList.map { arena =>
  arena.chunkLists().asScala.map { chunkList =>
    chunkList.iterator().asScala.map(_.chunkSize()).sum
  }.sum
}.sum
}
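
To make what these sums measure concrete, a hedged standalone sketch (not from the PR) against the same Netty metrics API; chunkSize() reports each pooled chunk's capacity, so the totals track the pool's reserved footprint rather than bytes currently handed out:

  import scala.collection.JavaConverters._
  import io.netty.buffer.PooledByteBufAllocator

  val alloc = new PooledByteBufAllocator(true /* preferDirect */)
  val buf = alloc.directBuffer(1024 * 1024) // force the pool to reserve a chunk
  val reservedDirect = alloc.directArenas().asScala.map { arena =>
    arena.chunkLists().asScala.map(_.iterator().asScala.map(_.chunkSize()).sum).sum
  }.sum
  println(s"direct pool footprint: $reservedDirect bytes")
  buf.release()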

override def init(blockDataManager: BlockDataManager): Unit = {
val rpcHandler = new NettyBlockRpcServer(conf.getAppId, serializer, blockDataManager)
core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -34,7 +34,7 @@ import org.apache.commons.lang3.SerializationUtils

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics}
import org.apache.spark.network.util.JavaUtils
import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
@@ -216,15 +216,16 @@ }
}

/**
* Update metrics for in-progress tasks and let the master know that the BlockManager is still
* alive. Return true if the driver knows about the given block manager. Otherwise, return false,
* indicating that the block manager should re-register.
* Update metrics for the live executor and in-progress tasks and let the master know that the
* BlockManager is still alive. Return true if the driver knows about the given block manager.
* Otherwise, return false, indicating that the block manager should re-register.
*/
def executorHeartbeatReceived(
execId: String,
executorMetrics: ExecutorMetrics,
taskMetrics: Array[(Long, Int, Int, TaskMetrics)], // (taskId, stageId, stageAttemptId, metrics)
blockManagerId: BlockManagerId): Boolean = {
listenerBus.post(SparkListenerExecutorMetricsUpdate(execId, taskMetrics))
listenerBus.post(SparkListenerExecutorMetricsUpdate(execId, executorMetrics, taskMetrics))
blockManagerMaster.driverEndpoint.askWithRetry[Boolean](
BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, "BlockManagerHeartbeat"))
}
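
Because the update is posted on the listener bus, a hypothetical consumer can read the new field. A sketch against this PR's event shape (the executorMetrics field name is taken from the post above; the listener class itself is illustrative):

  import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorMetricsUpdate}

  class TransportMemoryListener extends SparkListener {
    override def onExecutorMetricsUpdate(
        update: SparkListenerExecutorMetricsUpdate): Unit = {
      val transport = update.executorMetrics.transportMetrics
      println(s"executor ${update.execId} on ${update.executorMetrics.hostname}: " +
        s"netty onHeap=${transport.onHeapSize} offHeap=${transport.offHeapSize}")
    }
  }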