Refactor: move unfold logic to MemoryStore

andrewor14 · andrewor14 · commit 195abd771129 · 2014-07-07T16:49:58.000-07:00
This logic is also needed in other parts of the MemoryStore, e.g.
when we try to store deserialized bytes in memory. The unfolding
logic is specific to the memory case, so it makes sense for it to
reside in MemoryStore, as opposed to the higher level CacheManager.
diff --git a/core/src/main/scala/org/apache/spark/CacheManager.scala b/core/src/main/scala/org/apache/spark/CacheManager.scala
@@ -34,13 +34,6 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
   /** Keys of RDD partitions that are being computed/loaded. */
   private val loading = new mutable.HashSet[RDDBlockId]
 
-  /**
-   * The amount of space ensured for unrolling partitions, shared across all cores.
-   * This space is not reserved in advance, but allocated dynamically by dropping existing blocks.
-   * It must be a lazy val in order to access a mocked BlockManager's conf in tests properly.
-   */
-  private lazy val globalBufferMemory = BlockManager.getBufferMemory(blockManager.conf)
-
   /** Gets or computes an RDD partition. Used by RDD.iterator() when an RDD is cached. */
   def getOrCompute[T](
       rdd: RDD[T],
@@ -137,10 +130,12 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
       updatedBlocks: ArrayBuffer[(BlockId, BlockStatus)]): Iterator[T] = {
 
     if (!storageLevel.useMemory) {
-      /* This RDD is not to be cached in memory, so we can just pass the computed values
-       * as an iterator directly to the BlockManager, rather than first fully unrolling
+      /*
+       * This RDD is not to be cached in memory, so we can just pass the computed values
+       * as an iterator directly to the BlockManager, rather than first fully unfolding
        * it in memory. The latter option potentially uses much more memory and risks OOM
-       * exceptions that can be avoided. */
+       * exceptions that can be avoided.
+       */
       updatedBlocks ++= blockManager.put(key, values, storageLevel, tellMaster = true)
       blockManager.get(key) match {
         case Some(v) => v.data.asInstanceOf[Iterator[T]]
@@ -149,86 +144,38 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
           throw new BlockException(key, s"Block manager failed to return cached value for $key!")
       }
     } else {
-      /* This RDD is to be cached in memory. In this case we cannot pass the computed values
+      /*
+       * This RDD is to be cached in memory. In this case we cannot pass the computed values
        * to the BlockManager as an iterator and expect to read it back later. This is because
        * we may end up dropping a partition from memory store before getting it back, e.g.
-       * when the entirety of the RDD does not fit in memory. */
-
-      var count = 0                   // The number of elements unrolled so far
-      var dropPartition = false       // Whether to drop the new partition from memory
-      var previousSize = 0L           // Previous estimate of the size of our buffer
-      val memoryRequestPeriod = 1000  // How frequently we request for more memory for our buffer
-
-      val threadId = Thread.currentThread().getId
-      val cacheMemoryMap = SparkEnv.get.cacheMemoryMap
-      var buffer = new SizeTrackingAppendOnlyBuffer[Any]
-
-      try {
-        /* While adding values to the in-memory buffer, periodically check whether the memory
-         * restrictions for unrolling partitions are still satisfied. If not, stop immediately,
-         * and persist the partition to disk if specified by the storage level. This check is
-         * a safeguard against the scenario when a single partition does not fit in memory. */
-        while (values.hasNext && !dropPartition) {
-          buffer += values.next()
-          count += 1
-          if (count % memoryRequestPeriod == 1) {
-            // Calculate the amount of memory to request from the global memory pool
-            val currentSize = buffer.estimateSize()
-            val delta = math.max(currentSize - previousSize, 0)
-            val memoryToRequest = currentSize + delta
-            previousSize = currentSize
-
-            // Atomically check whether there is sufficient memory in the global pool to continue
-            cacheMemoryMap.synchronized {
-              val previouslyOccupiedMemory = cacheMemoryMap.get(threadId).getOrElse(0L)
-              val otherThreadsMemory = cacheMemoryMap.values.sum - previouslyOccupiedMemory
-
-              // Request for memory for the local buffer, and return whether request is granted
-              def requestForMemory(): Boolean = {
-                val availableMemory = blockManager.memoryStore.freeMemory - otherThreadsMemory
-                val granted = availableMemory > memoryToRequest
-                if (granted) { cacheMemoryMap(threadId) = memoryToRequest }
-                granted
-              }
-
-              // If the first request is not granted, try again after ensuring free space
-              // If there is still not enough space, give up and drop the partition
-              if (!requestForMemory()) {
-                val result = blockManager.memoryStore.ensureFreeSpace(key, globalBufferMemory)
-                updatedBlocks ++= result.droppedBlocks
-                dropPartition = !requestForMemory()
-              }
-            }
-          }
-        }
-
-        if (!dropPartition) {
-          // We have successfully unrolled the entire partition, so cache it in memory
-          updatedBlocks ++= blockManager.put(key, buffer.array, storageLevel, tellMaster = true)
-          buffer.iterator.asInstanceOf[Iterator[T]]
-        } else {
-          // We have exceeded our collective quota. This partition will not be cached in memory.
+       * when the entirety of the RDD does not fit in memory.
+       *
+       * In addition, we must be careful to not unfold the entire partition in memory at once.
+       * Otherwise, we may cause an OOM exception if the JVM does not have enough space for this
+       * single partition. Instead, we unfold the values cautiously, potentially aborting and
+       * dropping the partition to disk if applicable.
+       */
+      blockManager.memoryStore.unfoldSafely(key, values, storageLevel, updatedBlocks) match {
+        case Left(arrayValues) =>
+          // We have successfully unfolded the entire partition, so cache it in memory
+          updatedBlocks ++= blockManager.put(key, arrayValues, storageLevel, tellMaster = true)
+          arrayValues.iterator.asInstanceOf[Iterator[T]]
+        case Right(iteratorValues) =>
+          // There is not enough space to cache this partition in memory
+          var returnValues = iteratorValues.asInstanceOf[Iterator[T]]
           val persistToDisk = storageLevel.useDisk
-          logWarning(s"Failed to cache $key in memory! There is not enough space to unroll the " +
+          logWarning(s"Failed to cache $key in memory! There is not enough space to unfold the " +
             s"entire partition. " + (if (persistToDisk) "Persisting to disk instead." else ""))
-          var newValues = (buffer.iterator ++ values).asInstanceOf[Iterator[T]]
           if (persistToDisk) {
             val newLevel = StorageLevel(
               storageLevel.useDisk,
               useMemory = false,
               storageLevel.useOffHeap,
               deserialized = false,
               storageLevel.replication)
-            newValues = putInBlockManager[T](key, newValues, newLevel, updatedBlocks)
+            returnValues = putInBlockManager[T](key, returnValues, newLevel, updatedBlocks)
           }
-          newValues
-        }
-      } finally {
-        // Free up buffer for other threads
-        buffer = null
-        cacheMemoryMap.synchronized {
-          cacheMemoryMap(threadId) = 0
-        }
+          returnValues
       }
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
@@ -22,7 +22,9 @@ import java.util.LinkedHashMap
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.spark.SparkEnv
 import org.apache.spark.util.{SizeEstimator, Utils}
+import org.apache.spark.util.collection.SizeTrackingAppendOnlyBuffer
 
 private case class MemoryEntry(value: Any, size: Long, deserialized: Boolean)
 
@@ -34,11 +36,20 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
   extends BlockStore(blockManager) {
 
   private val entries = new LinkedHashMap[BlockId, MemoryEntry](32, 0.75f, true)
+
   @volatile private var currentMemory = 0L
-  // Object used to ensure that only one thread is putting blocks and if necessary, dropping
-  // blocks from the memory store.
+
+  // Object used to ensure that only one thread is putting blocks and if necessary,
+  // dropping blocks from the memory store.
   private val putLock = new Object()
 
+  /**
+   * The amount of space ensured for unfolding values in memory, shared across all cores.
+   * This space is not reserved in advance, but allocated dynamically by dropping existing blocks.
+   * It must be a lazy val in order to access a mocked BlockManager's conf in tests properly.
+   */
+  private lazy val globalBufferMemory = BlockManager.getBufferMemory(blockManager.conf)
+
   logInfo("MemoryStore started with capacity %s".format(Utils.bytesToString(maxMemory)))
 
   def freeMemory: Long = maxMemory - currentMemory
@@ -137,6 +148,87 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
     logInfo("MemoryStore cleared")
   }
 
+  /**
+   * Unfold the given block in memory safely.
+   *
+   * "Safely" means avoiding OOM exceptions caused by unfolding the entire block at once.
+   * While buffering `values`, this method periodically requests memory for its buffer and
+   * stops as soon as a request cannot be granted, even after calling `ensureFreeSpace`.
+   * Any blocks reported as dropped by `ensureFreeSpace` are appended to `droppedBlocks`.
+   *
+   * Returns Left(array) if every value was buffered, or otherwise Right(iterator), where the
+   * iterator yields the values buffered so far followed by the rest of `values`.
+   */
+  def unfoldSafely(
+      blockId: BlockId,
+      values: Iterator[Any],
+      storageLevel: StorageLevel, // NOTE(review): not read anywhere in this method body
+      droppedBlocks: ArrayBuffer[(BlockId, BlockStatus)])
+    : Either[Array[Any], Iterator[Any]] = {
+
+    var count = 0                   // The number of elements unfolded so far
+    var enoughMemory = true         // Whether there is enough memory to unfold this block
+    var previousSize = 0L           // Previous estimate of the size of our buffer
+    val memoryRequestPeriod = 1000  // Number of elements between requests for more buffer memory
+
+    val threadId = Thread.currentThread().getId
+    val cacheMemoryMap = SparkEnv.get.cacheMemoryMap // Memory granted so far, keyed by thread ID
+    var buffer = new SizeTrackingAppendOnlyBuffer[Any]
+
+    try {
+      while (values.hasNext && enoughMemory) {
+        buffer += values.next()
+        count += 1
+        if (count % memoryRequestPeriod == 1) {
+          // Request the current buffer size plus its growth since the last check as headroom
+          val currentSize = buffer.estimateSize()
+          val delta = math.max(currentSize - previousSize, 0)
+          val memoryToRequest = currentSize + delta
+          previousSize = currentSize
+
+          // Atomically check whether there is sufficient memory in the global pool to continue
+          cacheMemoryMap.synchronized {
+            val previouslyOccupiedMemory = cacheMemoryMap.get(threadId).getOrElse(0L)
+            val otherThreadsMemory = cacheMemoryMap.values.sum - previouslyOccupiedMemory
+
+            // Request memory for the local buffer, and return whether the request is granted
+            def requestForMemory(): Boolean = {
+              val availableMemory = freeMemory - otherThreadsMemory
+              val granted = availableMemory > memoryToRequest
+              if (granted) { cacheMemoryMap(threadId) = memoryToRequest }
+              granted
+            }
+
+            // If the first request is not granted, try again after ensuring free space
+            // If there is still not enough space, give up and stop unfolding this block
+            if (!requestForMemory()) {
+              val result = ensureFreeSpace(blockId, globalBufferMemory)
+              droppedBlocks ++= result.droppedBlocks
+              enoughMemory = requestForMemory()
+            }
+          }
+        }
+      }
+
+      if (enoughMemory) {
+        // We successfully unfolded the entirety of this block
+        Left(buffer.array)
+      } else {
+        // We ran out of space while unfolding the values for this block
+        Right(buffer.iterator ++ values)
+      }
+
+    } finally {
+      // Unless we return an iterator that depends on the buffer, free up space for other threads
+      if (enoughMemory) { // NOTE(review): on the Right path the reservation is not reset here
+        buffer = null
+        cacheMemoryMap.synchronized {
+          cacheMemoryMap(threadId) = 0
+        }
+      }
+    }
+  }
+
   /**
    * Return the RDD ID that a given block ID is from, or None if it is not an RDD block.
    */