
Commit 53af15d

Refactor StorageStatus + add a bunch of tests
This commit refactors StorageStatus to keep a set of the RDD IDs that have blocks stored in the status' block manager. The point is to avoid linearly scanning through every single storage status' blocks when a status holds no blocks for the RDD of interest in the first place. This commit also adds a bunch of tests for the StorageStatus and StorageUtils methods. A few minor bugs previously lurked in StorageUtils.blockLocationsFromStorageStatus and StorageUtils.filterStorageStatusByRDD; these are now fixed and tested.

Going forward, we first need to clean up the method signatures to reflect what they actually do. Then we can make things more efficient, now that the stage is set.
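The gist, as a minimal standalone sketch (illustrative names only — Block, Status, and filterByRDD are not the actual Spark classes): each status tracks the RDD IDs it holds blocks for, so filtering by RDD starts with a constant-time set lookup rather than a scan over every block.

    import scala.collection.mutable

    // Illustrative sketch of the pattern this commit introduces: maintain a
    // set of RDD IDs alongside the block map, so that filtering by RDD begins
    // with a cheap membership check instead of a per-block scan.
    object RddIdSetSketch {
      case class Block(rddId: Int, size: Long)

      class Status {
        private val blocks = new mutable.HashMap[Long, Block]  // block ID -> block
        private val rddIds = new mutable.HashSet[Int]          // RDDs with blocks here

        def addBlock(blockId: Long, block: Block): Unit = {
          rddIds += block.rddId
          blocks(blockId) = block
        }

        // O(1) membership check; no scan over this status' blocks
        def containsRDD(rddId: Int): Boolean = rddIds.contains(rddId)

        def blocksForRDD(rddId: Int): Iterable[Block] =
          blocks.values.filter(_.rddId == rddId)
      }

      // Only statuses that actually hold blocks for rddId are scanned further.
      def filterByRDD(statuses: Seq[Status], rddId: Int): Seq[Status] =
        statuses.filter(_.containsRDD(rddId))
    }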
1 parent e3d85b7 commit 53af15d

File tree

7 files changed: +380 −53 lines


core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala

Lines changed: 3 additions & 2 deletions

@@ -265,8 +265,9 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus: LiveListenerBus)
 
   private def storageStatus: Array[StorageStatus] = {
     blockManagerInfo.map { case(blockManagerId, info) =>
-      val blockMap = mutable.Map[BlockId, BlockStatus](info.blocks.toSeq: _*)
-      new StorageStatus(blockManagerId, info.maxMem, blockMap)
+      val storageStatus = new StorageStatus(blockManagerId, info.maxMem)
+      info.blocks.foreach { case (id, status) => storageStatus.addBlock(id, status) }
+      storageStatus
     }.toArray
   }

core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala

Lines changed: 1 addition & 5 deletions

@@ -55,11 +55,7 @@ private[spark] class BlockManagerSource(val blockManager: BlockManager, sc: SparkContext)
   metricRegistry.register(MetricRegistry.name("disk", "diskSpaceUsed_MB"), new Gauge[Long] {
     override def getValue: Long = {
       val storageStatusList = blockManager.master.getStorageStatus
-      val diskSpaceUsed = storageStatusList
-        .flatMap(_.blocks.values.map(_.diskSize))
-        .reduceOption(_ + _)
-        .getOrElse(0L)
-
+      val diskSpaceUsed = storageStatusList.map(_.diskUsed).reduceOption(_ + _).getOrElse(0L)
       diskSpaceUsed / 1024 / 1024
     }
   })

core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala

Lines changed: 4 additions & 5 deletions

@@ -39,9 +39,9 @@ class StorageStatusListener extends SparkListener
     filteredStatus.foreach { storageStatus =>
       updatedBlocks.foreach { case (blockId, updatedStatus) =>
         if (updatedStatus.storageLevel == StorageLevel.NONE) {
-          storageStatus.blocks.remove(blockId)
+          storageStatus.removeBlock(blockId)
         } else {
-          storageStatus.blocks(blockId) = updatedStatus
+          storageStatus.updateBlock(blockId, updatedStatus)
         }
       }
     }

@@ -50,9 +50,8 @@ class StorageStatusListener extends SparkListener
   /** Update storage status list to reflect the removal of an RDD from the cache */
   private def updateStorageStatus(unpersistedRDDId: Int) {
     storageStatusList.foreach { storageStatus =>
-      val unpersistedBlocksIds = storageStatus.rddBlocks.keys.filter(_.rddId == unpersistedRDDId)
-      unpersistedBlocksIds.foreach { blockId =>
-        storageStatus.blocks.remove(blockId)
+      storageStatus.rddBlocks(unpersistedRDDId).foreach { case (blockId, _) =>
+        storageStatus.removeBlock(blockId)
       }
     }
   }

core/src/main/scala/org/apache/spark/storage/StorageUtils.scala

Lines changed: 78 additions & 37 deletions

@@ -28,55 +28,96 @@ import org.apache.spark.annotation.DeveloperApi
  * Storage information for each BlockManager.
  */
 @DeveloperApi
-class StorageStatus(
-    val blockManagerId: BlockManagerId,
-    val maxMem: Long,
-    val blocks: mutable.Map[BlockId, BlockStatus] = mutable.Map.empty) {
+class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) {
+  private val _blocks = new mutable.HashMap[BlockId, BlockStatus]
+  private val _rddIds = new mutable.HashSet[Int]
+
+  /** Return the blocks stored in this block manager as a mapping from ID to status. */
+  def blocks: Map[BlockId, BlockStatus] = _blocks
+
+  /** Add the given block, keeping track of the RDD ID if this is an RDD block. */
+  def addBlock(blockId: BlockId, blockStatus: BlockStatus): Unit = {
+    blockId match {
+      case RDDBlockId(rddId, _) => _rddIds.add(rddId)
+      case _ =>
+    }
+    _blocks(blockId) = blockStatus
+  }
 
-  def memUsed = blocks.values.map(_.memSize).reduceOption(_ + _).getOrElse(0L)
+  /** Update the given block, keeping track of the RDD ID if this is an RDD block. */
+  def updateBlock(blockId: BlockId, blockStatus: BlockStatus): Unit = addBlock(blockId, blockStatus)
+
+  /** Remove the given block, keeping track of the RDD ID if this is an RDD block. */
+  def removeBlock(blockId: BlockId): Option[BlockStatus] = {
+    val removed = _blocks.remove(blockId)
+    blockId match {
+      case RDDBlockId(rddId, _) =>
+        if (rddBlocks(rddId).isEmpty) {
+          _rddIds.remove(rddId)
+        }
+      case _ =>
+    }
+    removed
+  }
 
-  def memUsedByRDD(rddId: Int) =
-    rddBlocks.filterKeys(_.rddId == rddId).values.map(_.memSize).reduceOption(_ + _).getOrElse(0L)
+  /** Return the IDs of the RDDs which have blocks stored in this block manager. */
+  def rddIds: Seq[Int] = _rddIds.toSeq
 
-  def diskUsed = blocks.values.map(_.diskSize).reduceOption(_ + _).getOrElse(0L)
+  /** Return the RDD blocks stored in this block manager as a mapping from ID to status. */
+  def rddBlocks: Map[RDDBlockId, BlockStatus] =
+    blocks.filterKeys(_.isInstanceOf[RDDBlockId]).asInstanceOf[Map[RDDBlockId, BlockStatus]]
+
+  /**
+   * Return the RDD blocks with the given RDD ID stored in this block manager as a mapping
+   * from ID to status.
+   */
+  def rddBlocks(rddId: Int): Map[RDDBlockId, BlockStatus] = rddBlocks.filterKeys(_.rddId == rddId)
 
-  def diskUsedByRDD(rddId: Int) =
-    rddBlocks.filterKeys(_.rddId == rddId).values.map(_.diskSize).reduceOption(_ + _).getOrElse(0L)
+  /** Return the memory used by this block manager. */
+  def memUsed: Long = memUsed(blocks.values)
 
+  /** Return the memory used by the given RDD in this block manager. */
+  def memUsedByRDD(rddId: Int): Long = memUsed(rddBlocks(rddId).values)
+
+  /** Return the memory remaining in this block manager. */
   def memRemaining: Long = maxMem - memUsed
 
-  def rddBlocks = blocks.collect { case (rdd: RDDBlockId, status) => (rdd, status) }
+  /** Return the disk space used by this block manager. */
+  def diskUsed: Long = diskUsed(blocks.values)
+
+  /** Return the disk space used by the given RDD in this block manager. */
+  def diskUsedByRDD(rddId: Int): Long = diskUsed(rddBlocks(rddId).values)
+
+  // Helper methods for computing memory and disk usages
+  private def memUsed(statuses: Iterable[BlockStatus]): Long =
+    statuses.map(_.memSize).reduceOption(_ + _).getOrElse(0L)
+  private def diskUsed(statuses: Iterable[BlockStatus]): Long =
+    statuses.map(_.diskSize).reduceOption(_ + _).getOrElse(0L)
 }
 
 /** Helper methods for storage-related objects. */
 private[spark] object StorageUtils {
 
-  /**
-   * Returns basic information of all RDDs persisted in the given SparkContext. This does not
-   * include storage information.
-   */
-  def rddInfoFromSparkContext(sc: SparkContext): Array[RDDInfo] = {
-    sc.persistentRdds.values.map { rdd =>
+  /** Returns storage information of all RDDs persisted in the given SparkContext. */
+  def rddInfoFromStorageStatus(
+      storageStatuses: Seq[StorageStatus],
+      sc: SparkContext): Array[RDDInfo] = {
+    val rddInfos = sc.persistentRdds.values.map { rdd =>
       val rddName = Option(rdd.name).getOrElse(rdd.id.toString)
       val rddNumPartitions = rdd.partitions.size
       val rddStorageLevel = rdd.getStorageLevel
       val rddInfo = new RDDInfo(rdd.id, rddName, rddNumPartitions, rddStorageLevel)
       rddInfo
     }.toArray
-  }
-
-  /** Returns storage information of all RDDs persisted in the given SparkContext. */
-  def rddInfoFromStorageStatus(
-      storageStatuses: Seq[StorageStatus],
-      sc: SparkContext): Array[RDDInfo] = {
-    rddInfoFromStorageStatus(storageStatuses, rddInfoFromSparkContext(sc))
+    rddInfoFromStorageStatus(storageStatuses, rddInfos)
+    rddInfos
   }
 
   /** Returns storage information of all RDDs in the given list. */
   def rddInfoFromStorageStatus(
       storageStatuses: Seq[StorageStatus],
       rddInfos: Seq[RDDInfo],
-      updatedBlocks: Seq[(BlockId, BlockStatus)] = Seq.empty): Array[RDDInfo] = {
+      updatedBlocks: Seq[(BlockId, BlockStatus)] = Seq.empty): Unit = {
 
     // Mapping from a block ID -> its status
     val blockMap = mutable.Map(storageStatuses.flatMap(_.rddBlocks): _*)

@@ -94,7 +135,7 @@ private[spark] object StorageUtils {
     // Mapping from RDD ID -> the associated RDDInfo (with potentially outdated storage information)
     val rddInfoMap = rddInfos.map { info => (info.id, info) }.toMap
 
-    val rddStorageInfos = rddBlockMap.flatMap { case (rddId, blocks) =>
+    rddBlockMap.foreach { case (rddId, blocks) =>
       // Add up memory, disk and Tachyon sizes
       val persistedBlocks =
        blocks.filter { status => status.memSize + status.diskSize + status.tachyonSize > 0 }

@@ -111,31 +152,31 @@ private[spark] object StorageUtils {
       rddInfo.tachyonSize = tachyonSize
       rddInfo
     }
-    }.toArray
-
-    scala.util.Sorting.quickSort(rddStorageInfos)
-    rddStorageInfos
+    }
   }
 
   /** Returns a mapping from BlockId to the locations of the associated block. */
   def blockLocationsFromStorageStatus(
       storageStatuses: Seq[StorageStatus]): Map[BlockId, Seq[String]] = {
+    // An ungrouped list of (blockId, location) pairs
     val blockLocationPairs = storageStatuses.flatMap { storageStatus =>
      storageStatus.blocks.map { case (bid, _) => (bid, storageStatus.blockManagerId.hostPort) }
     }
-    blockLocationPairs.toMap
+    blockLocationPairs
       .groupBy { case (blockId, _) => blockId }
-      .mapValues(_.values.toSeq)
+      .mapValues { rddLocations => rddLocations.map { case (_, location) => location } }
   }
 
   /** Filters the given list of StorageStatus by the given RDD ID. */
   def filterStorageStatusByRDD(
       storageStatuses: Seq[StorageStatus],
       rddId: Int): Array[StorageStatus] = {
-    storageStatuses.map { status =>
-      val filteredBlocks = status.rddBlocks.filterKeys(_.rddId == rddId).toSeq
-      val filteredBlockMap = mutable.Map[BlockId, BlockStatus](filteredBlocks: _*)
-      new StorageStatus(status.blockManagerId, status.maxMem, filteredBlockMap)
-    }.toArray
+    storageStatuses
+      .filter(_.rddIds.contains(rddId))
+      .map { status =>
+        val newStatus = new StorageStatus(status.blockManagerId, status.maxMem)
+        status.rddBlocks(rddId).foreach { case (bid, bstatus) => newStatus.addBlock(bid, bstatus) }
+        newStatus
+      }.toArray
   }
 }
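For reference, a hedged sketch of how the refactored StorageStatus API fits together, roughly what the new tests would exercise. Assumptions: bmId stands in for a BlockManagerId value, and the BlockStatus constructor is taken to be (storageLevel, memSize, diskSize, tachyonSize), matching the fields referenced in the diff above.

    // Hedged usage sketch; bmId is an assumed BlockManagerId value.
    val status = new StorageStatus(bmId, maxMem = 1000L)
    status.addBlock(RDDBlockId(0, 0), BlockStatus(StorageLevel.MEMORY_ONLY, 100L, 0L, 0L))
    status.addBlock(RDDBlockId(1, 0), BlockStatus(StorageLevel.DISK_ONLY, 0L, 200L, 0L))

    assert(status.rddIds.toSet == Set(0, 1))
    assert(status.memUsedByRDD(0) == 100L)
    assert(status.diskUsedByRDD(1) == 200L)
    assert(status.memRemaining == 900L)  // maxMem - memUsed

    // Removing an RDD's last block drops its ID from the tracked set, which
    // is what lets filterStorageStatusByRDD skip this status for RDD 1.
    status.removeBlock(RDDBlockId(1, 0))
    assert(!status.rddIds.contains(1))

Note also the blockLocationsFromStorageStatus fix: the old code called .toMap on the (blockId, location) pairs before grouping, which collapsed a block replicated on multiple block managers down to a single location; grouping the ungrouped pairs first preserves every location.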

core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ private[ui] class ExecutorsPage(parent: ExecutorsTab) extends WebUIPage("") {
     val storageStatusList = listener.storageStatusList
     val maxMem = storageStatusList.map(_.maxMem).fold(0L)(_ + _)
     val memUsed = storageStatusList.map(_.memUsed).fold(0L)(_ + _)
-    val diskSpaceUsed = storageStatusList.flatMap(_.blocks.values.map(_.diskSize)).fold(0L)(_ + _)
+    val diskSpaceUsed = storageStatusList.map(_.diskUsed).reduceOption(_ + _).getOrElse(0L)
     val execInfo = for (statusId <- 0 until storageStatusList.size) yield getExecInfo(statusId)
     val execInfoSorted = execInfo.sortBy(_.id)

core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala

Lines changed: 1 addition & 3 deletions

@@ -51,9 +51,7 @@ class StorageListener(storageStatusListener: StorageStatusListener) extends SparkListener
   /** Update each RDD's info to reflect any updates to the RDD's storage status */
   private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)] = Seq.empty) {
     val rddInfos = _rddInfoMap.values.toSeq
-    val updatedRddInfos =
-      StorageUtils.rddInfoFromStorageStatus(storageStatusList, rddInfos, updatedBlocks)
-    updatedRddInfos.foreach { info => _rddInfoMap(info.id) = info }
+    StorageUtils.rddInfoFromStorageStatus(storageStatusList, rddInfos, updatedBlocks)
   }
 
   /**
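The dropped re-assignment works because rddInfoFromStorageStatus now returns Unit and updates the passed RDDInfo objects in place (their size fields are mutable, as the rddInfo.tachyonSize assignment in the StorageUtils diff shows). The new call pattern, sketched:

    // The RDDInfo instances in _rddInfoMap are mutated in place; nothing to re-assign.
    val rddInfos = _rddInfoMap.values.toSeq
    StorageUtils.rddInfoFromStorageStatus(storageStatusList, rddInfos, updatedBlocks)
    // Each info's memSize / diskSize / tachyonSize now reflects current storage.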
