-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-18236] Reduce duplicate objects in Spark UI and HistoryServer #15743
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6441f06
ade86db
7e05630
738cb5a
f8aee5d
4c867f1
3838243
4c7067e
9662163
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,7 @@ import scala.collection.mutable | |
| import scala.collection.mutable.{HashMap, LinkedHashMap} | ||
|
|
||
| import org.apache.spark.JobExecutionStatus | ||
| import org.apache.spark.executor.{ShuffleReadMetrics, ShuffleWriteMetrics, TaskMetrics} | ||
| import org.apache.spark.executor._ | ||
| import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo} | ||
| import org.apache.spark.util.AccumulatorContext | ||
| import org.apache.spark.util.collection.OpenHashSet | ||
|
|
@@ -147,9 +147,8 @@ private[spark] object UIData { | |
| memoryBytesSpilled = m.memoryBytesSpilled, | ||
| diskBytesSpilled = m.diskBytesSpilled, | ||
| peakExecutionMemory = m.peakExecutionMemory, | ||
| inputMetrics = InputMetricsUIData(m.inputMetrics.bytesRead, m.inputMetrics.recordsRead), | ||
| outputMetrics = | ||
| OutputMetricsUIData(m.outputMetrics.bytesWritten, m.outputMetrics.recordsWritten), | ||
| inputMetrics = InputMetricsUIData(m.inputMetrics), | ||
| outputMetrics = OutputMetricsUIData(m.outputMetrics), | ||
| shuffleReadMetrics = ShuffleReadMetricsUIData(m.shuffleReadMetrics), | ||
| shuffleWriteMetrics = ShuffleWriteMetricsUIData(m.shuffleWriteMetrics)) | ||
| } | ||
|
|
@@ -171,9 +170,9 @@ private[spark] object UIData { | |
| speculative = taskInfo.speculative | ||
| ) | ||
| newTaskInfo.gettingResultTime = taskInfo.gettingResultTime | ||
| newTaskInfo.accumulables ++= taskInfo.accumulables.filter { | ||
| newTaskInfo.setAccumulables(taskInfo.accumulables.filter { | ||
| accum => !accum.internal && accum.metadata != Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER) | ||
| } | ||
| }) | ||
| newTaskInfo.finishTime = taskInfo.finishTime | ||
| newTaskInfo.failed = taskInfo.failed | ||
| newTaskInfo | ||
|
|
@@ -197,8 +196,32 @@ private[spark] object UIData { | |
| shuffleWriteMetrics: ShuffleWriteMetricsUIData) | ||
|
|
||
| case class InputMetricsUIData(bytesRead: Long, recordsRead: Long) | ||
| object InputMetricsUIData { | ||
| def apply(metrics: InputMetrics): InputMetricsUIData = { | ||
| if (metrics.bytesRead == 0 && metrics.recordsRead == 0) { | ||
| EMPTY | ||
| } else { | ||
| new InputMetricsUIData( | ||
| bytesRead = metrics.bytesRead, | ||
| recordsRead = metrics.recordsRead) | ||
| } | ||
| } | ||
| private val EMPTY = InputMetricsUIData(0, 0) | ||
| } | ||
|
|
||
| case class OutputMetricsUIData(bytesWritten: Long, recordsWritten: Long) | ||
| object OutputMetricsUIData { | ||
| def apply(metrics: OutputMetrics): OutputMetricsUIData = { | ||
| if (metrics.bytesWritten == 0 && metrics.recordsWritten == 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume the else block is more common?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For OutputMetrics, I'd actually assume the opposite: these metrics are referring to bytes written to an external system, not bytes written to shuffle, so the majority of tasks won't have non-zero values for this metric (all but the last stage in a multi-stage job, for example).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sounds good. |
||
| EMPTY | ||
| } else { | ||
| new OutputMetricsUIData( | ||
| bytesWritten = metrics.bytesWritten, | ||
| recordsWritten = metrics.recordsWritten) | ||
| } | ||
| } | ||
| private val EMPTY = OutputMetricsUIData(0, 0) | ||
| } | ||
|
|
||
| case class ShuffleReadMetricsUIData( | ||
| remoteBlocksFetched: Long, | ||
|
|
@@ -212,17 +235,30 @@ private[spark] object UIData { | |
|
|
||
| object ShuffleReadMetricsUIData { | ||
| def apply(metrics: ShuffleReadMetrics): ShuffleReadMetricsUIData = { | ||
| new ShuffleReadMetricsUIData( | ||
| remoteBlocksFetched = metrics.remoteBlocksFetched, | ||
| localBlocksFetched = metrics.localBlocksFetched, | ||
| remoteBytesRead = metrics.remoteBytesRead, | ||
| localBytesRead = metrics.localBytesRead, | ||
| fetchWaitTime = metrics.fetchWaitTime, | ||
| recordsRead = metrics.recordsRead, | ||
| totalBytesRead = metrics.totalBytesRead, | ||
| totalBlocksFetched = metrics.totalBlocksFetched | ||
| ) | ||
| if ( | ||
| metrics.remoteBlocksFetched == 0 && | ||
| metrics.localBlocksFetched == 0 && | ||
| metrics.remoteBytesRead == 0 && | ||
| metrics.localBytesRead == 0 && | ||
| metrics.fetchWaitTime == 0 && | ||
| metrics.recordsRead == 0 && | ||
| metrics.totalBytesRead == 0 && | ||
| metrics.totalBlocksFetched == 0) { | ||
| EMPTY | ||
| } else { | ||
| new ShuffleReadMetricsUIData( | ||
| remoteBlocksFetched = metrics.remoteBlocksFetched, | ||
| localBlocksFetched = metrics.localBlocksFetched, | ||
| remoteBytesRead = metrics.remoteBytesRead, | ||
| localBytesRead = metrics.localBytesRead, | ||
| fetchWaitTime = metrics.fetchWaitTime, | ||
| recordsRead = metrics.recordsRead, | ||
| totalBytesRead = metrics.totalBytesRead, | ||
| totalBlocksFetched = metrics.totalBlocksFetched | ||
| ) | ||
| } | ||
| } | ||
| private val EMPTY = ShuffleReadMetricsUIData(0, 0, 0, 0, 0, 0, 0, 0) | ||
| } | ||
|
|
||
| case class ShuffleWriteMetricsUIData( | ||
|
|
@@ -232,12 +268,17 @@ private[spark] object UIData { | |
|
|
||
| object ShuffleWriteMetricsUIData { | ||
| def apply(metrics: ShuffleWriteMetrics): ShuffleWriteMetricsUIData = { | ||
| new ShuffleWriteMetricsUIData( | ||
| bytesWritten = metrics.bytesWritten, | ||
| recordsWritten = metrics.recordsWritten, | ||
| writeTime = metrics.writeTime | ||
| ) | ||
| if (metrics.bytesWritten == 0 && metrics.recordsWritten == 0 && metrics.writeTime == 0) { | ||
| EMPTY | ||
| } else { | ||
| new ShuffleWriteMetricsUIData( | ||
| bytesWritten = metrics.bytesWritten, | ||
| recordsWritten = metrics.recordsWritten, | ||
| writeTime = metrics.writeTime | ||
| ) | ||
| } | ||
| } | ||
| private val EMPTY = ShuffleWriteMetricsUIData(0, 0, 0) | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -694,8 +694,8 @@ private[spark] object JsonProtocol { | |
| val index = (json \ "Index").extract[Int] | ||
| val attempt = (json \ "Attempt").extractOpt[Int].getOrElse(1) | ||
| val launchTime = (json \ "Launch Time").extract[Long] | ||
| val executorId = (json \ "Executor ID").extract[String] | ||
| val host = (json \ "Host").extract[String] | ||
| val executorId = (json \ "Executor ID").extract[String].intern() | ||
| val host = (json \ "Host").extract[String].intern() | ||
| val taskLocality = TaskLocality.withName((json \ "Locality").extract[String]) | ||
| val speculative = (json \ "Speculative").extractOpt[Boolean].getOrElse(false) | ||
| val gettingResultTime = (json \ "Getting Result Time").extract[Long] | ||
|
|
@@ -713,7 +713,7 @@ private[spark] object JsonProtocol { | |
| taskInfo.finishTime = finishTime | ||
| taskInfo.failed = failed | ||
| taskInfo.killed = killed | ||
| accumulables.foreach { taskInfo.accumulables += _ } | ||
| taskInfo.setAccumulables(accumulables) | ||
| taskInfo | ||
| } | ||
|
|
||
|
|
@@ -885,8 +885,8 @@ private[spark] object JsonProtocol { | |
| if (json == JNothing) { | ||
| return null | ||
| } | ||
| val executorId = (json \ "Executor ID").extract[String] | ||
| val host = (json \ "Host").extract[String] | ||
| val executorId = (json \ "Executor ID").extract[String].intern() | ||
| val host = (json \ "Host").extract[String].intern() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In general, intern'ing can be dangerous.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Scratch that — JDK 7 improvements help.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. From reading http://java-performance.info/string-intern-in-java-6-7-8/ it seems significantly safer in Java 7 (I also had the impression from tribal lore that |
||
| val port = (json \ "Port").extract[Int] | ||
| BlockManagerId(executorId, host, port) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,7 +86,10 @@ object MimaExcludes { | |
| // [SPARK-18034] Upgrade to MiMa 0.1.11 to fix flakiness. | ||
| ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.aggregationDepth"), | ||
| ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.getAggregationDepth"), | ||
| ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.org$apache$spark$ml$param$shared$HasAggregationDepth$_setter_$aggregationDepth_=") | ||
| ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasAggregationDepth.org$apache$spark$ml$param$shared$HasAggregationDepth$_setter_$aggregationDepth_="), | ||
|
|
||
| // [SPARK-18236] Reduce duplicate objects in Spark UI and HistoryServer | ||
| ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.scheduler.TaskInfo.accumulables") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, I guess MiMa still fails to ignore
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, that's intentional; we removed the logic for excluding that annotation in #11751. The rationale is discussed in https://issues.apache.org/jira/browse/SPARK-13920:
|
||
| ) | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In order to maintain binary compatibility, I could rewrite this to be a
`lazy val` that returns a `ListBuffer` formed from the "real" `accumulables`, which can remain private. I might go ahead and do that just to avoid any chance of incompatibility-related problems, although I don't anticipate this being an issue in practice.