-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-12817] Simplify CacheManager code and remove unused BlockManager methods #10748
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,7 +75,17 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging { | |
|
|
||
| // Otherwise, cache the values and keep track of any updates in block statuses | ||
| val updatedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] | ||
| val cachedValues = putInBlockManager(key, computedValues, storageLevel, updatedBlocks) | ||
| val cachedValues = { | ||
| updatedBlocks ++= | ||
| blockManager.putIterator(key, computedValues, storageLevel, tellMaster = true) | ||
| blockManager.get(key) match { | ||
| case Some(v) => v.data.asInstanceOf[Iterator[T]] | ||
| case None => | ||
| val msg = s"Block manager failed to return cached value for $key!" | ||
| logInfo(msg) | ||
| throw new BlockException(key, msg) | ||
| } | ||
| } | ||
| val metrics = context.taskMetrics | ||
| val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]()) | ||
| metrics.updatedBlocks = Some(lastUpdatedBlocks ++ updatedBlocks.toSeq) | ||
|
|
@@ -126,67 +136,4 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging { | |
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Cache the values of a partition, keeping track of any updates in the storage statuses of | ||
| * other blocks along the way. | ||
| * | ||
| * The effective storage level refers to the level that actually specifies BlockManager put | ||
| * behavior, not the level originally specified by the user. This is mainly for forcing a | ||
| * MEMORY_AND_DISK partition to disk if there is not enough room to unroll the partition, | ||
| * while preserving the original semantics of the RDD as specified by the application. | ||
| */ | ||
| private def putInBlockManager[T]( | ||
| key: BlockId, | ||
| values: Iterator[T], | ||
| level: StorageLevel, | ||
| updatedBlocks: ArrayBuffer[(BlockId, BlockStatus)], | ||
| effectiveStorageLevel: Option[StorageLevel] = None): Iterator[T] = { | ||
|
|
||
| val putLevel = effectiveStorageLevel.getOrElse(level) | ||
| if (!putLevel.useMemory) { | ||
| /* | ||
| * This RDD is not to be cached in memory, so we can just pass the computed values as an | ||
| * iterator directly to the BlockManager rather than first fully unrolling it in memory. | ||
| */ | ||
| updatedBlocks ++= | ||
| blockManager.putIterator(key, values, level, tellMaster = true, effectiveStorageLevel) | ||
| blockManager.get(key) match { | ||
| case Some(v) => v.data.asInstanceOf[Iterator[T]] | ||
| case None => | ||
| logInfo(s"Failure to store $key") | ||
| throw new BlockException(key, s"Block manager failed to return cached value for $key!") | ||
| } | ||
| } else { | ||
| /* | ||
| * This RDD is to be cached in memory. In this case we cannot pass the computed values | ||
| * to the BlockManager as an iterator and expect to read it back later. This is because | ||
| * we may end up dropping a partition from memory store before getting it back. | ||
| * | ||
| * In addition, we must be careful to not unroll the entire partition in memory at once. | ||
| * Otherwise, we may cause an OOM exception if the JVM does not have enough space for this | ||
| * single partition. Instead, we unroll the values cautiously, potentially aborting and | ||
| * dropping the partition to disk if applicable. | ||
| */ | ||
| blockManager.memoryStore.unrollSafely(key, values, updatedBlocks) match { | ||
| case Left(arr) => | ||
| // We have successfully unrolled the entire partition, so cache it in memory | ||
| updatedBlocks ++= | ||
| blockManager.putArray(key, arr, level, tellMaster = true, effectiveStorageLevel) | ||
| arr.iterator.asInstanceOf[Iterator[T]] | ||
| case Right(it) => | ||
| // There is not enough space to cache this partition in memory | ||
| val returnValues = it.asInstanceOf[Iterator[T]] | ||
| if (putLevel.useDisk) { | ||
| logWarning(s"Persisting partition $key to disk instead.") | ||
| val diskOnlyLevel = StorageLevel(useDisk = true, useMemory = false, | ||
| useOffHeap = false, deserialized = false, putLevel.replication) | ||
| putInBlockManager[T](key, returnValues, level, updatedBlocks, Some(diskOnlyLevel)) | ||
| } else { | ||
| returnValues | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The idea behind deleting this block of code is the fact that |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -96,7 +96,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo | |
| putIterator(blockId, values, level, returnValues = true) | ||
| } else { | ||
| val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] | ||
| tryToPut(blockId, bytes, bytes.limit, deserialized = false, droppedBlocks) | ||
| tryToPut(blockId, () => bytes, bytes.limit, deserialized = false, droppedBlocks) | ||
| PutResult(bytes.limit(), Right(bytes.duplicate()), droppedBlocks) | ||
| } | ||
| } | ||
|
|
@@ -122,23 +122,6 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo | |
| PutResult(size, data, droppedBlocks) | ||
| } | ||
|
|
||
| override def putArray( | ||
| blockId: BlockId, | ||
| values: Array[Any], | ||
| level: StorageLevel, | ||
| returnValues: Boolean): PutResult = { | ||
| val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] | ||
| if (level.deserialized) { | ||
| val sizeEstimate = SizeEstimator.estimate(values.asInstanceOf[AnyRef]) | ||
| tryToPut(blockId, values, sizeEstimate, deserialized = true, droppedBlocks) | ||
| PutResult(sizeEstimate, Left(values.iterator), droppedBlocks) | ||
| } else { | ||
| val bytes = blockManager.dataSerialize(blockId, values.iterator) | ||
| tryToPut(blockId, bytes, bytes.limit, deserialized = false, droppedBlocks) | ||
| PutResult(bytes.limit(), Right(bytes.duplicate()), droppedBlocks) | ||
| } | ||
| } | ||
|
|
||
| override def putIterator( | ||
| blockId: BlockId, | ||
| values: Iterator[Any], | ||
|
|
@@ -170,9 +153,15 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo | |
| unrolledValues match { | ||
| case Left(arrayValues) => | ||
| // Values are fully unrolled in memory, so store them as an array | ||
| val res = putArray(blockId, arrayValues, level, returnValues) | ||
| droppedBlocks ++= res.droppedBlocks | ||
| PutResult(res.size, res.data, droppedBlocks) | ||
| if (level.deserialized) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an inlining of the previous `putArray` method. |
||
| val sizeEstimate = SizeEstimator.estimate(arrayValues.asInstanceOf[AnyRef]) | ||
| tryToPut(blockId, () => arrayValues, sizeEstimate, deserialized = true, droppedBlocks) | ||
| PutResult(sizeEstimate, Left(arrayValues.iterator), droppedBlocks) | ||
| } else { | ||
| val bytes = blockManager.dataSerialize(blockId, arrayValues.iterator) | ||
| tryToPut(blockId, () => bytes, bytes.limit, deserialized = false, droppedBlocks) | ||
| PutResult(bytes.limit(), Right(bytes.duplicate()), droppedBlocks) | ||
| } | ||
| case Right(iteratorValues) => | ||
| // Not enough space to unroll this block; drop to disk if applicable | ||
| if (level.useDisk && allowPersistToDisk) { | ||
|
|
@@ -246,7 +235,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo | |
| * This method returns either an array with the contents of the entire block or an iterator | ||
| * containing the values of the block (if the array would have exceeded available memory). | ||
| */ | ||
| def unrollSafely( | ||
| private[storage] def unrollSafely( | ||
| blockId: BlockId, | ||
| values: Iterator[Any], | ||
| droppedBlocks: ArrayBuffer[(BlockId, BlockStatus)]) | ||
|
|
@@ -333,15 +322,6 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo | |
| blockId.asRDDId.map(_.rddId) | ||
| } | ||
|
|
||
| private def tryToPut( | ||
| blockId: BlockId, | ||
| value: Any, | ||
| size: Long, | ||
| deserialized: Boolean, | ||
| droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { | ||
| tryToPut(blockId, () => value, size, deserialized, droppedBlocks) | ||
| } | ||
|
|
||
| /** | ||
| * Try to put in a set of values, if we can free up enough space. The value should either be | ||
| * an Array if deserialized is true or a ByteBuffer otherwise. Its (possibly estimated) size | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code is moved from the old
`putInBlockManager` method.