Commit 2c3ef6a

Actually filter out only the relevant RDDs
Prior to this commit, the changes in the PR yielded little performance improvement under any workload. This is because we updated all RDDInfos, rather than only the ones whose blocks were actually updated. Thus, even though the new filter logic in StorageStatus is correct, we still iterated through all the RDD blocks every time a task reported an updated block. This commit avoids that by calling StorageUtils.updateRddInfo only on the RDDs that need to be updated.
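To make the shape of the fix concrete, here is a minimal, self-contained Scala sketch of the same filtering idea, using simplified stand-in types (SimpleBlockId and SimpleRDDInfo are hypothetical placeholders, not Spark's BlockId/RDDInfo): collect the set of RDD ids touched by the updated blocks, then filter the RDDInfos against that set.

    object RddFilterSketch {
      // Hypothetical stand-ins for Spark's BlockId and RDDInfo. A block either
      // belongs to an RDD (Some(rddId)) or to something else (broadcast,
      // shuffle, ...), modeled here as None.
      case class SimpleBlockId(rddId: Option[Int])
      case class SimpleRDDInfo(id: Int, name: String)

      /** Keep only the RDDInfos whose RDD owns at least one updated block. */
      def infosToUpdate(
          allInfos: Seq[SimpleRDDInfo],
          updatedBlocks: Seq[SimpleBlockId]): Seq[SimpleRDDInfo] = {
        // Distinct RDD ids touched by this update; a Set gives O(1) membership
        // tests, so the filter is O(#blocks + #RDDInfos) instead of rescanning
        // every RDD's blocks on every task completion.
        val touched: Set[Int] = updatedBlocks.flatMap(_.rddId).toSet
        allInfos.filter(info => touched.contains(info.id))
      }

      def main(args: Array[String]): Unit = {
        val infos = Seq(SimpleRDDInfo(0, "a"), SimpleRDDInfo(1, "b"), SimpleRDDInfo(2, "c"))
        val updates = Seq(SimpleBlockId(Some(1)), SimpleBlockId(None))
        // Only RDD 1 owns an updated block, so only its info would be rescanned.
        println(infosToUpdate(infos, updates)) // List(SimpleRDDInfo(1,b))
      }
    }

The actual change below follows the same pattern, using Spark's BlockId.asRDDId to discard non-RDD blocks before building the id set.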
Parent: 6fef86a

core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala

Lines changed: 4 additions & 2 deletions
@@ -48,9 +48,11 @@ class StorageListener(storageStatusListener: StorageStatusListener) extends Spar
   /** Filter RDD info to include only those with cached partitions */
   def rddInfoList = _rddInfoMap.values.filter(_.numCachedPartitions > 0).toSeq
 
-  /** Update each RDD's info to reflect any updates in the RDD's storage status */
+  /** Update the storage info of the RDDs whose blocks are among the given updated blocks */
   private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = {
-    StorageUtils.updateRddInfo(_rddInfoMap.values.toSeq, storageStatusList, updatedBlocks)
+    val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet
+    val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) }
+    StorageUtils.updateRddInfo(rddInfosToUpdate, storageStatusList, updatedBlocks)
   }
 
   /**
