
Commit 0140d6e

Merge branch 'master' into netty-blockTransferService

2 parents: a3a09f6 + de700d3
30 files changed: +719 -449 lines

core/src/main/scala/org/apache/spark/MapOutputTracker.scala (1 addition, 28 deletions)

@@ -349,7 +349,6 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr
 }
 
 private[spark] object MapOutputTracker {
-  private val LOG_BASE = 1.1
 
   // Serialize an array of map output locations into an efficient byte format so that we can send
   // it to reduce tasks. We do this by compressing the serialized bytes using GZIP. They will
@@ -385,34 +384,8 @@ private[spark] object MapOutputTracker {
           throw new MetadataFetchFailedException(
             shuffleId, reduceId, "Missing an output location for shuffle " + shuffleId)
         } else {
-          (status.location, decompressSize(status.compressedSizes(reduceId)))
+          (status.location, status.getSizeForBlock(reduceId))
         }
     }
   }
-
-  /**
-   * Compress a size in bytes to 8 bits for efficient reporting of map output sizes.
-   * We do this by encoding the log base 1.1 of the size as an integer, which can support
-   * sizes up to 35 GB with at most 10% error.
-   */
-  def compressSize(size: Long): Byte = {
-    if (size == 0) {
-      0
-    } else if (size <= 1L) {
-      1
-    } else {
-      math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
-    }
-  }
-
-  /**
-   * Decompress an 8-bit encoded block size, using the reverse operation of compressSize.
-   */
-  def decompressSize(compressedSize: Byte): Long = {
-    if (compressedSize == 0) {
-      0
-    } else {
-      math.pow(LOG_BASE, compressedSize & 0xFF).toLong
-    }
-  }
 }
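
Aside: the 8-bit size codec removed here (and reintroduced in MapStatus below) is simple enough to exercise outside Spark. The following is a minimal, self-contained sketch that restates compressSize/decompressSize exactly as they appear in the removed lines and checks the "at most 10% error" claim for one sample size; the object name SizeCodecDemo and the sample value are illustrative only.

    // Standalone restatement of the log-base-1.1 size codec shown in the removed lines above.
    object SizeCodecDemo {
      private val LOG_BASE = 1.1

      def compressSize(size: Long): Byte = {
        if (size == 0) {
          0
        } else if (size <= 1L) {
          1
        } else {
          math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
        }
      }

      def decompressSize(compressedSize: Byte): Long = {
        if (compressedSize == 0) 0 else math.pow(LOG_BASE, compressedSize & 0xFF).toLong
      }

      def main(args: Array[String]): Unit = {
        val size = 123456789L  // an arbitrary ~118 MB block
        val decoded = decompressSize(compressSize(size))
        val error = math.abs(decoded - size).toDouble / size
        println(s"original=$size decoded=$decoded relativeError=${error * 100}%")
        // ceil() rounds the exponent up, so below the 255-exponent cap the estimate
        // lands in [size, size * 1.1) -- the "at most 10% error" from the comment.
      }
    }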

core/src/main/scala/org/apache/spark/network/nio/NioBlockTransferService.scala (1 addition, 1 deletion)

@@ -205,6 +205,6 @@ final class NioBlockTransferService(conf: SparkConf, securityManager: SecurityMa
     val buffer = blockDataManager.getBlockData(blockId)
     logDebug("GetBlock " + blockId + " used " + Utils.getUsedTimeMs(startTimeMs)
       + " and got buffer " + buffer)
-    buffer.nioByteBuffer()
+    if (buffer == null) null else buffer.nioByteBuffer()
   }
 }

core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala (3 additions, 14 deletions)

@@ -20,15 +20,12 @@ package org.apache.spark.scheduler
 import java.io.{File, FileNotFoundException, IOException, PrintWriter}
 import java.text.SimpleDateFormat
 import java.util.{Date, Properties}
-import java.util.concurrent.LinkedBlockingQueue
 
 import scala.collection.mutable.HashMap
 
 import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.executor.{DataReadMethod, TaskMetrics}
-import org.apache.spark.rdd.RDD
-import org.apache.spark.storage.StorageLevel
+import org.apache.spark.executor.TaskMetrics
 
 /**
  * :: DeveloperApi ::
@@ -62,24 +59,16 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener
   private val dateFormat = new ThreadLocal[SimpleDateFormat]() {
     override def initialValue() = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss")
   }
-  private val eventQueue = new LinkedBlockingQueue[SparkListenerEvent]
 
   createLogDir()
 
-  // The following 5 functions are used only in testing.
-  private[scheduler] def getLogDir = logDir
-  private[scheduler] def getJobIdToPrintWriter = jobIdToPrintWriter
-  private[scheduler] def getStageIdToJobId = stageIdToJobId
-  private[scheduler] def getJobIdToStageIds = jobIdToStageIds
-  private[scheduler] def getEventQueue = eventQueue
-
   /** Create a folder for log files, the folder's name is the creation time of jobLogger */
   protected def createLogDir() {
     val dir = new File(logDir + "/" + logDirName + "/")
     if (dir.exists()) {
       return
     }
-    if (dir.mkdirs() == false) {
+    if (!dir.mkdirs()) {
       // JobLogger should throw a exception rather than continue to construct this object.
       throw new IOException("create log directory error:" + logDir + "/" + logDirName + "/")
     }
@@ -261,7 +250,7 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener
   protected def recordJobProperties(jobId: Int, properties: Properties) {
     if (properties != null) {
       val description = properties.getProperty(SparkContext.SPARK_JOB_DESCRIPTION, "")
-      jobLogInfo(jobId, description, false)
+      jobLogInfo(jobId, description, withTime = false)
     }
   }
 

core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala (110 additions, 9 deletions)

@@ -24,22 +24,123 @@ import org.apache.spark.storage.BlockManagerId
 /**
  * Result returned by a ShuffleMapTask to a scheduler. Includes the block manager address that the
  * task ran on as well as the sizes of outputs for each reducer, for passing on to the reduce tasks.
- * The map output sizes are compressed using MapOutputTracker.compressSize.
  */
-private[spark] class MapStatus(var location: BlockManagerId, var compressedSizes: Array[Byte])
-  extends Externalizable {
+private[spark] sealed trait MapStatus {
+  /** Location where this task was run. */
+  def location: BlockManagerId
 
-  def this() = this(null, null)  // For deserialization only
+  /** Estimated size for the reduce block, in bytes. */
+  def getSizeForBlock(reduceId: Int): Long
+}
+
+
+private[spark] object MapStatus {
+
+  def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = {
+    if (uncompressedSizes.length > 2000) {
+      new HighlyCompressedMapStatus(loc, uncompressedSizes)
+    } else {
+      new CompressedMapStatus(loc, uncompressedSizes)
+    }
+  }
+
+  private[this] val LOG_BASE = 1.1
+
+  /**
+   * Compress a size in bytes to 8 bits for efficient reporting of map output sizes.
+   * We do this by encoding the log base 1.1 of the size as an integer, which can support
+   * sizes up to 35 GB with at most 10% error.
+   */
+  def compressSize(size: Long): Byte = {
+    if (size == 0) {
+      0
+    } else if (size <= 1L) {
+      1
+    } else {
+      math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
+    }
+  }
+
+  /**
+   * Decompress an 8-bit encoded block size, using the reverse operation of compressSize.
+   */
+  def decompressSize(compressedSize: Byte): Long = {
+    if (compressedSize == 0) {
+      0
+    } else {
+      math.pow(LOG_BASE, compressedSize & 0xFF).toLong
+    }
+  }
+}
+
+
+/**
+ * A [[MapStatus]] implementation that tracks the size of each block. Size for each block is
+ * represented using a single byte.
+ *
+ * @param loc location where the task is being executed.
+ * @param compressedSizes size of the blocks, indexed by reduce partition id.
+ */
+private[spark] class CompressedMapStatus(
+    private[this] var loc: BlockManagerId,
+    private[this] var compressedSizes: Array[Byte])
+  extends MapStatus with Externalizable {
+
+  protected def this() = this(null, null.asInstanceOf[Array[Byte]])  // For deserialization only
+
+  def this(loc: BlockManagerId, uncompressedSizes: Array[Long]) {
+    this(loc, uncompressedSizes.map(MapStatus.compressSize))
+  }
 
-  def writeExternal(out: ObjectOutput) {
-    location.writeExternal(out)
+  override def location: BlockManagerId = loc
+
+  override def getSizeForBlock(reduceId: Int): Long = {
+    MapStatus.decompressSize(compressedSizes(reduceId))
+  }
+
+  override def writeExternal(out: ObjectOutput): Unit = {
+    loc.writeExternal(out)
     out.writeInt(compressedSizes.length)
     out.write(compressedSizes)
   }
 
-  def readExternal(in: ObjectInput) {
-    location = BlockManagerId(in)
-    compressedSizes = new Array[Byte](in.readInt())
+  override def readExternal(in: ObjectInput): Unit = {
+    loc = BlockManagerId(in)
+    val len = in.readInt()
+    compressedSizes = new Array[Byte](len)
     in.readFully(compressedSizes)
   }
 }
+
+
+/**
+ * A [[MapStatus]] implementation that only stores the average size of the blocks.
+ *
+ * @param loc location where the task is being executed.
+ * @param avgSize average size of all the blocks
+ */
+private[spark] class HighlyCompressedMapStatus(
+    private[this] var loc: BlockManagerId,
+    private[this] var avgSize: Long)
+  extends MapStatus with Externalizable {
+
+  def this(loc: BlockManagerId, uncompressedSizes: Array[Long]) {
+    this(loc, uncompressedSizes.sum / uncompressedSizes.length)
+  }
+
+  protected def this() = this(null, 0L)  // For deserialization only
+
+  override def location: BlockManagerId = loc
+
+  override def getSizeForBlock(reduceId: Int): Long = avgSize
+
+  override def writeExternal(out: ObjectOutput): Unit = {
+    loc.writeExternal(out)
+    out.writeLong(avgSize)
+  }
+
+  override def readExternal(in: ObjectInput): Unit = {
+    loc = BlockManagerId(in)
+    avgSize = in.readLong()
+  }
+}
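
For a sense of why the 2000-partition cutoff in MapStatus.apply exists, the sketch below does the arithmetic on the size-related payload each implementation writes in the writeExternal bodies above. It ignores the serialized BlockManagerId, and the object name and reducer count are made up for illustration.

    // Back-of-the-envelope payload comparison for the two MapStatus implementations,
    // counting only what the writeExternal bodies above emit for size information.
    object MapStatusPayloadDemo {
      def main(args: Array[String]): Unit = {
        val numReducers = 5000  // hypothetical reducer count, above the 2000 cutoff

        // CompressedMapStatus: writeInt(compressedSizes.length) + write(compressedSizes)
        val perBlockPayload = 4 + numReducers  // bytes: one Int plus one byte per reducer

        // HighlyCompressedMapStatus: writeLong(avgSize)
        val averageOnlyPayload = 8  // bytes: a single Long

        println(s"CompressedMapStatus size payload:       $perBlockPayload bytes")
        println(s"HighlyCompressedMapStatus size payload: $averageOnlyPayload bytes")
        // With 5000 reduce partitions, MapStatus.apply (length > 2000) would pick
        // HighlyCompressedMapStatus, trading per-block accuracy for a constant-size status.
      }
    }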

core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala (3 additions, 5 deletions)

@@ -103,13 +103,11 @@ private[spark] class HashShuffleWriter[K, V](
 
   private def commitWritesAndBuildStatus(): MapStatus = {
     // Commit the writes. Get the size of each bucket block (total block size).
-    val compressedSizes = shuffle.writers.map { writer: BlockObjectWriter =>
+    val sizes: Array[Long] = shuffle.writers.map { writer: BlockObjectWriter =>
       writer.commitAndClose()
-      val size = writer.fileSegment().length
-      MapOutputTracker.compressSize(size)
+      writer.fileSegment().length
     }
-
-    new MapStatus(blockManager.blockManagerId, compressedSizes)
+    MapStatus(blockManager.blockManagerId, sizes)
   }
 
   private def revertWrites(): Unit = {

core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala (1 addition, 2 deletions)

@@ -70,8 +70,7 @@ private[spark] class SortShuffleWriter[K, V, C](
     val partitionLengths = sorter.writePartitionedFile(blockId, context, outputFile)
     shuffleBlockManager.writeIndexFile(dep.shuffleId, mapId, partitionLengths)
 
-    mapStatus = new MapStatus(blockManager.blockManagerId,
-      partitionLengths.map(MapOutputTracker.compressSize))
+    mapStatus = MapStatus(blockManager.blockManagerId, partitionLengths)
   }
 
   /** Close this writer, passing along whether the map completed */
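
Both shuffle writers now hand the raw per-partition byte counts to the MapStatus factory and let it choose a representation. The plain-Scala sketch below mirrors the `uncompressedSizes.sum / uncompressedSizes.length` constructor of HighlyCompressedMapStatus to show that, once the average-only status is chosen, every reducer sees the same estimate. The skewed partitionLengths values are made up, and a real status would only take this form above 2000 partitions.

    // Plain-Scala illustration of the averaging done by HighlyCompressedMapStatus.
    object AverageEstimateDemo {
      def main(args: Array[String]): Unit = {
        // Hypothetical, skewed partition lengths such as a shuffle writer might report.
        val partitionLengths: Array[Long] = Array(0L, 10L, 100L, 1000000L)
        val avgSize = partitionLengths.sum / partitionLengths.length
        partitionLengths.indices.foreach { reduceId =>
          println(s"reduce $reduceId: actual=${partitionLengths(reduceId)} bytes, estimate=$avgSize bytes")
        }
        // Every reducer receives the same estimate (here 250027 bytes), so block-level
        // skew is no longer visible once the average-only representation is in use.
      }
    }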

core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveActor.scala (2 additions, 2 deletions)

@@ -58,9 +58,9 @@ class BlockManagerSlaveActor(
         SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
       }
 
-    case RemoveBroadcast(broadcastId, tellMaster) =>
+    case RemoveBroadcast(broadcastId, _) =>
       doAsync[Int]("removing broadcast " + broadcastId, sender) {
-        blockManager.removeBroadcast(broadcastId, tellMaster)
+        blockManager.removeBroadcast(broadcastId, tellMaster = true)
       }
 
     case GetBlockStatus(blockId, _) =>

core/src/main/scala/org/apache/spark/util/JsonProtocol.scala (0 additions, 1 deletion)

@@ -25,7 +25,6 @@ import scala.collection.Map
 import org.json4s.DefaultFormats
 import org.json4s.JsonDSL._
 import org.json4s.JsonAST._
-import org.json4s.jackson.JsonMethods._
 
 
 import org.apache.spark.executor.{DataReadMethod, InputMetrics, ShuffleReadMetrics,
