@@ -20,7 +20,6 @@ package org.apache.spark.storage
 import scala.collection.Map
 import scala.collection.mutable
 
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.DeveloperApi
 
 /**
@@ -31,9 +30,24 @@ import org.apache.spark.annotation.DeveloperApi
 @DeveloperApi
 class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) {
 
-  // This should not be mutated directly, but through the add/update/removeBlock methods
-  private val _blocks = new mutable.HashMap[BlockId, BlockStatus]
-  private val _rddIds = new mutable.HashSet[Int]
+  /**
+   * Internal representation of the blocks stored in this block manager.
+   *
+   * Common consumption patterns of these blocks include
+   * (1) selecting all blocks,
+   * (2) selecting only RDD blocks, or
+   * (3) selecting only the blocks that belong to a specific RDD.
+   *
+   * If we are only interested in a fraction of the blocks, as in (2) and (3), we should avoid
+   * linearly scanning through all the blocks, which could be expensive if there are thousands
+   * of blocks on each block manager. We achieve this by storing RDD blocks and non-RDD blocks
+   * separately. In particular, RDD blocks are stored in a map indexed by RDD IDs, so we can
+   * filter out the blocks of interest quickly.
+   *
+   * These collections should only be mutated through the add/update/removeBlock methods.
+   */
+  private val _rddBlocks = new mutable.HashMap[Int, mutable.Map[BlockId, BlockStatus]]
+  private val _nonRddBlocks = new mutable.HashMap[BlockId, BlockStatus]
 
   /**
    * Instantiate a StorageStatus with the given initial blocks. This essentially makes a copy of
@@ -44,67 +58,94 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) {
     initialBlocks.foreach { case (blockId, blockStatus) => addBlock(blockId, blockStatus) }
   }
 
-  /** Return the blocks stored in this block manager as a mapping from ID to status. */
-  def blocks: Map[BlockId, BlockStatus] = _blocks
+  /** Return the blocks stored in this block manager. */
+  def blocks: Seq[(BlockId, BlockStatus)] = {
+    _nonRddBlocks.toSeq ++ rddBlocks.toSeq
+  }
+
+  /** Return the RDD blocks stored in this block manager. */
+  def rddBlocks: Seq[(BlockId, BlockStatus)] = {
+    _rddBlocks.flatMap { case (_, blocks) => blocks }.toSeq
+  }
+
+  /** Return the blocks that belong to the given RDD stored in this block manager. */
+  def rddBlocksById(rddId: Int): Seq[(BlockId, BlockStatus)] = {
+    _rddBlocks.get(rddId).map(_.toSeq).getOrElse(Seq.empty)
+  }
 
-  /** Add the given block, keeping track of the RDD ID if this is an RDD block. */
+  /** Add the given block to this storage status. */
   def addBlock(blockId: BlockId, blockStatus: BlockStatus): Unit = {
     blockId match {
-      case RDDBlockId(rddId, _) => _rddIds.add(rddId)
+      case RDDBlockId(rddId, _) =>
+        _rddBlocks.getOrElseUpdate(rddId, new mutable.HashMap)(blockId) = blockStatus
       case _ =>
+        _nonRddBlocks(blockId) = blockStatus
     }
-    _blocks(blockId) = blockStatus
   }
 
-  /** Update the given block, keeping track of the RDD ID if this is an RDD block. */
+  /** Update the given block in this storage status. If it doesn't already exist, add it. */
   def updateBlock(blockId: BlockId, blockStatus: BlockStatus): Unit = addBlock(blockId, blockStatus)
 
-  /** Remove the given block, keeping track of the RDD ID if this is an RDD block. */
+  /** Remove the given block from this storage status. */
   def removeBlock(blockId: BlockId): Option[BlockStatus] = {
-    val removed = _blocks.remove(blockId)
     blockId match {
       case RDDBlockId(rddId, _) =>
-        if (rddBlocks(rddId).isEmpty) {
-          _rddIds.remove(rddId)
+        if (_rddBlocks.contains(rddId)) {
+          val removed = _rddBlocks(rddId).remove(blockId)
+          // If the given RDD has no more blocks left, remove the RDD
+          if (_rddBlocks(rddId).isEmpty) {
+            _rddBlocks.remove(rddId)
+          }
+          removed
+        } else {
+          None
         }
       case _ =>
+        _nonRddBlocks.remove(blockId)
     }
-    removed
   }
 
-  /** Return the IDs of the RDDs which have blocks stored in this block manager. */
-  def rddIds: Seq[Int] = _rddIds.toSeq
-
-  /** Return the RDD blocks stored in this block manager as a mapping from ID to status. */
-  def rddBlocks: Map[RDDBlockId, BlockStatus] =
-    blocks.filterKeys(_.isInstanceOf[RDDBlockId]).asInstanceOf[Map[RDDBlockId, BlockStatus]]
+  /**
+   * Return whether the given block is stored in this block manager in O(1) time.
+   * Note that the alternative of doing this through `blocks` is O(blocks), which is much slower.
+   */
+  def containsBlock(blockId: BlockId): Boolean = {
+    blockId match {
+      case RDDBlockId(rddId, _) =>
+        _rddBlocks.get(rddId).exists(_.contains(blockId))
+      case _ =>
+        _nonRddBlocks.contains(blockId)
+    }
+  }
 
   /**
-   * Return the RDD blocks with the given RDD ID stored in this block manager as a mapping
-   * from ID to status.
+   * Return the number of blocks in O(R) time, where R is the number of distinct RDD IDs.
+   * Note that the alternative of doing this through `blocks` is O(blocks), which is much slower.
    */
-  def rddBlocks(rddId: Int): Map[RDDBlockId, BlockStatus] = rddBlocks.filterKeys(_.rddId == rddId)
+  def numBlocks: Int = {
+    _nonRddBlocks.size + _rddBlocks.values.map(_.size).sum
+  }
 
   /** Return the memory used by this block manager. */
-  def memUsed: Long = memUsed(blocks.values)
+  def memUsed: Long = memUsed(blocks)
 
   /** Return the memory used by the given RDD in this block manager. */
-  def memUsedByRDD(rddId: Int): Long = memUsed(rddBlocks(rddId).values)
+  def memUsedByRDD(rddId: Int): Long = memUsed(rddBlocksById(rddId))
 
   /** Return the memory remaining in this block manager. */
   def memRemaining: Long = maxMem - memUsed
 
   /** Return the disk space used by this block manager. */
-  def diskUsed: Long = diskUsed(blocks.values)
+  def diskUsed: Long = diskUsed(blocks)
 
   /** Return the disk space used by the given RDD in this block manager. */
-  def diskUsedByRDD(rddId: Int): Long = diskUsed(rddBlocks(rddId).values)
+  def diskUsedByRDD(rddId: Int): Long = diskUsed(rddBlocksById(rddId))
 
   // Helper methods for computing memory and disk usages
-  private def memUsed(statuses: Iterable[BlockStatus]): Long =
-    statuses.map(_.memSize).reduceOption(_ + _).getOrElse(0L)
-  private def diskUsed(statuses: Iterable[BlockStatus]): Long =
-    statuses.map(_.diskSize).reduceOption(_ + _).getOrElse(0L)
+  private def memUsed(_blocks: Seq[(BlockId, BlockStatus)]): Long =
+    _blocks.map { case (_, s) => s.memSize }.reduceOption(_ + _).getOrElse(0L)
+  private def diskUsed(_blocks: Seq[(BlockId, BlockStatus)]): Long =
+    _blocks.map { case (_, s) => s.diskSize }.reduceOption(_ + _).getOrElse(0L)
 }
 
 /** Helper methods for storage-related objects. */
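A minimal usage sketch of the split indexing introduced above (not part of this commit; `status` stands for an already-populated StorageStatus, and the RDD ID is made up):

    // Fast path: one hash lookup reaches the per-RDD bucket directly.
    val blocksOfRddOne: Seq[(BlockId, BlockStatus)] = status.rddBlocksById(1)
    // The linear-scan equivalent over all stored blocks, shown for contrast;
    // this is the pattern the new layout is designed to avoid.
    val slowEquivalent = status.blocks.filter { case (bid, _) =>
      bid.asRDDId.exists(_.rddId == 1)
    }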
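Similarly, a hedged sketch of the constant-time membership check versus the O(blocks) alternative its doc comment warns about (same hypothetical `status`):

    // O(1): at most two hash lookups (the RDD index, then its bucket).
    val isCached: Boolean = status.containsBlock(RDDBlockId(1, 0))
    // O(blocks): materializes every (BlockId, BlockStatus) pair first.
    val isCachedSlow: Boolean = status.blocks.exists { case (bid, _) => bid == RDDBlockId(1, 0) }
    // numBlocks agrees with the slow count while touching one entry per distinct RDD ID.
    assert(status.numBlocks == status.blocks.size)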
@@ -123,18 +164,13 @@ private[spark] object StorageUtils {
     val rddId = rddInfo.id
 
     // Collect all block statuses that belong to the given RDD
-    val newBlocks = updatedBlocks.filter { case (b, _) =>
-      b.asRDDId.filter(_.rddId == rddId).isDefined
+    val newBlocks = updatedBlocks.filter { case (bid, _) =>
+      bid.asRDDId.exists(_.rddId == rddId)
     }
     val newBlockIds = newBlocks.map { case (bid, _) => bid }.toSet
-    val oldBlocks = storageStatuses.flatMap { s =>
-      if (s.rddIds.contains(rddId)) {
-        // If the block is being updated, leave it out here in favor of the new status
-        s.rddBlocks(rddId).filterKeys { bid => !newBlockIds.contains(bid) }
-      } else {
-        Seq.empty
-      }
-    }
+    val oldBlocks = storageStatuses
+      .flatMap(_.rddBlocksById(rddId))
+      .filter { case (bid, _) => !newBlockIds.contains(bid) } // avoid double counting
     val blocks = (oldBlocks ++ newBlocks).map { case (_, bstatus) => bstatus }
     val persistedBlocks = blocks.filter(_.isCached)
 
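The filter above keeps each block exactly once when it appears both in an existing storage status and in updatedBlocks; a small self-contained sketch of that dedup semantics (strings stand in for block IDs and statuses, purely illustrative):

    val oldBlocks = Seq("rdd_1_0" -> "stale", "rdd_1_1" -> "ok")  // from storageStatuses
    val newBlocks = Seq("rdd_1_0" -> "fresh")                     // from updatedBlocks
    val newIds = newBlocks.map { case (id, _) => id }.toSet
    val merged = oldBlocks.filter { case (id, _) => !newIds.contains(id) } ++ newBlocks
    // merged == Seq("rdd_1_1" -> "ok", "rdd_1_0" -> "fresh"): the stale entry is dropped.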
@@ -151,30 +187,20 @@ private[spark] object StorageUtils {
     }
   }
 
-  /** Return a mapping from block ID to the locations of the associated block. */
-  def getBlockLocations(storageStatuses: Seq[StorageStatus]): Map[BlockId, Seq[String]] = {
+  /**
+   * Return a mapping from block ID to its locations for each block that belongs to the given RDD.
+   */
+  def getRDDBlockLocations(
+      storageStatuses: Seq[StorageStatus],
+      rddId: Int): Map[BlockId, Seq[String]] = {
     val blockLocations = new mutable.HashMap[BlockId, mutable.ListBuffer[String]]
     storageStatuses.foreach { status =>
-      status.blocks.foreach { case (bid, _) =>
+      status.rddBlocksById(rddId).foreach { case (bid, _) =>
         val location = status.blockManagerId.hostPort
         blockLocations.getOrElseUpdate(bid, mutable.ListBuffer.empty) += location
       }
     }
     blockLocations
   }
 
-  /**
-   * Return a filtered list of storage statuses in which the only blocks remaining are the ones
-   * that belong to the given RDD.
-   */
-  def filterByRDD(storageStatuses: Seq[StorageStatus], rddId: Int): Seq[StorageStatus] = {
-    storageStatuses
-      .filter(_.rddIds.contains(rddId))
-      .map { status =>
-        new StorageStatus(
-          status.blockManagerId,
-          status.maxMem,
-          status.rddBlocks(rddId).asInstanceOf[Map[BlockId, BlockStatus]])
-      }
-  }
 }
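A hedged usage sketch of the narrowed location lookup (the `statuses` sequence and RDD ID are hypothetical stand-ins):

    // Locations of every block of RDD 1 across the given block managers.
    // Each entry maps a block ID to the host:port of each manager holding it.
    val locations: Map[BlockId, Seq[String]] =
      StorageUtils.getRDDBlockLocations(statuses, rddId = 1)
    locations.foreach { case (bid, hosts) =>
      println(s"$bid is stored on ${hosts.mkString(", ")}")
    }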