Changes from all commits
109 commits
5cdcd42
add some failing tests, though these probably shouldnt actually get m…
squito Feb 20, 2015
03db862
steal some code from earlier work of @mridulm
squito Feb 23, 2015
d6337f0
wip -- changed a bunch of types to LargeByteBuffer; discovered probl…
squito Feb 24, 2015
a139e97
compiling but all sorts of bad casting etc.
squito Feb 25, 2015
4965bad
move LargeByteBuffer to network-common, since we need it there for th…
squito Feb 25, 2015
149d4fa
move large byte buffer to network/common ... still lots of crud
squito Feb 25, 2015
01cafbf
tests compile too
squito Feb 25, 2015
ce391a0
failing test case (though its crappy)
squito Feb 25, 2015
29f0a8a
fix use of LargeByteBuffer in some tests, create UploadPartialBlock
squito Feb 27, 2015
dcb4669
add real test case for uploading large blocks (failing now)
squito Feb 27, 2015
660f5e3
flesh out NettyBlockTransfer#uploadBlock
squito Feb 28, 2015
4c228a0
minor cleanup
squito Feb 28, 2015
cf7c3a7
cleanup abandonded block uploads
squito Feb 28, 2015
fe90fd6
crank up memory for tests
squito Mar 2, 2015
857f3df
fix LargeByteBuffer dispose()
squito Mar 2, 2015
b700723
maven needs you to request lots of extra memory for some reason
squito Mar 2, 2015
6b102a0
passing tests! assorted little changes
squito Mar 2, 2015
84df2dd
fix tests
squito Mar 2, 2015
7a5d1ac
dont kill jenkins with huge tests
squito Mar 2, 2015
38d6993
test cleanup
squito Mar 2, 2015
fc0d118
fixup test & fix bug in WrappedLargeByteBuffer
squito Mar 2, 2015
0e3700f
cleanup
squito Mar 2, 2015
6f6a8d7
cleanup
squito Mar 2, 2015
9de8866
style fixes
squito Mar 2, 2015
ef88085
minor cleanup
squito Mar 2, 2015
17d0c1a
get streaming to compile
squito Mar 2, 2015
f603ba4
very basic assertion in large broadcast test
squito Mar 2, 2015
cd84c69
but turn test off b/c it requires large heap
squito Mar 2, 2015
8c81f1c
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Mar 24, 2015
da86124
rip out everything attempting to send > 2GB over the network; add som…
squito Mar 26, 2015
74a633a
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Mar 26, 2015
e09d2c7
oops, add some missing files
squito Mar 26, 2015
165b1c3
undo more changes
squito Mar 26, 2015
27330a5
fix test compile
squito Mar 26, 2015
23c8a7b
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Apr 2, 2015
e33b77c
add missing cast (remote data is always one ByteBuffer)
squito Apr 2, 2015
f8264e7
tests & some fixes for WrappedLargeByteBuffer
squito Apr 3, 2015
0b43d8b
LargeByteBuffer has its own dispose
squito Apr 3, 2015
db20e2c
fix some tests for LargeByteBuffer
squito Apr 3, 2015
4124ca3
cleanup of LargeBB interface; more tests
squito Apr 3, 2015
a0b6976
more tests
squito Apr 3, 2015
8cbc967
BufferUnderflowException
squito Apr 3, 2015
ec7c50c
tests (wip)
squito Apr 3, 2015
ba9fcda
ByteArrayChunkOutputStream.slice()
squito Apr 3, 2015
31381a1
LargeByteBufferOutputStream gives a merged array, if possible
squito Apr 3, 2015
879ad73
fix output.largeBuffers, add test
squito Apr 3, 2015
bae8f23
fix NPE
squito Apr 3, 2015
c2e721b
restructure tests
squito Apr 3, 2015
8920e76
another test
squito Apr 4, 2015
ce1ac7c
test placeholders
squito Apr 4, 2015
b5dab44
test for large mapped file
squito Apr 4, 2015
248dc1c
cleanup
squito Apr 4, 2015
9e8b8c7
inputstream tests & fixes
squito Apr 4, 2015
d6c92e9
cleanup
squito Apr 6, 2015
ebc0344
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Apr 6, 2015
8a65ca9
cleanup
squito Apr 6, 2015
0aad3d3
cleanup
squito Apr 6, 2015
f648e90
wip on shuffle 2GB error msgs
squito Apr 6, 2015
e57cb64
test exception message for failed replication w/ large blocks
squito Apr 6, 2015
69bd90e
get rid of BroadcastSuite test, not really relevant
squito Apr 6, 2015
13d9c0f
update tests
squito Apr 7, 2015
763e8dc
more docs & tests on LargeBB
squito Apr 7, 2015
5323dd7
style
squito Apr 7, 2015
ca32934
oops, grab the position before we close the channel
squito Apr 7, 2015
9834942
ByteArrayChunkOutputStream.slice, for grabbing aribitrary portion of …
squito Apr 7, 2015
4e1a842
LargeByteBuffer
squito Apr 7, 2015
8bd5606
update from PR feedback
squito Apr 9, 2015
ff9f968
LargeBBInputStream should dispose when we close the stream, not when …
squito Apr 9, 2015
a3dc811
add bounds checks to ByteArrayChunkOutputStream.slice()
squito Apr 9, 2015
da29c3d
cleanup rewind() a little
squito Apr 9, 2015
366f921
WrappedLargeByteBuffer.asByteBuffer shouldnt copy -- it should just e…
squito Apr 13, 2015
34c2131
sigh, need to make the WrappedLargeBB constructor w/ chunkSize public…
squito Apr 14, 2015
bf4ec0a
style
squito Apr 14, 2015
9d232d1
move package of LargeByteBufferIOStreams so that I can keep unsafe me…
squito Apr 14, 2015
6b2f751
fix typo
squito Apr 14, 2015
4da4626
error handling for get(); comments, style
squito Jun 1, 2015
2afb351
constructed WrappedLargeByteBuffer always has position==0, simplifies…
squito Jun 1, 2015
4042c1a
move LargeBBIn/Out Streams to java
squito Jun 1, 2015
3c599b2
add comments for MAX_CHUNK_SIZE
squito Jun 1, 2015
95588c2
updateCurrentBuffer --> updateCurrentBufferIfNeeded + comment
squito Jun 1, 2015
b77bbe2
style
squito Jun 1, 2015
6c2a115
add tests that buffers position is independent from underyling bytebu…
squito Jun 2, 2015
a9616e4
better variable name; more chunks in tests
squito Jun 2, 2015
0250ac5
private, @VisibleForTesting
squito Jun 2, 2015
b3b6363
fix newlines
squito Jun 2, 2015
112c49e
style
squito Jun 2, 2015
8ec2c5c
comment explaining check on subBufferSize
squito Jun 2, 2015
d0605a1
get() return this; another version of get() which just takes dest array
squito Jun 3, 2015
b6620d0
docs on LargeBBOutputStream
squito Jun 3, 2015
54d09af
@Override
squito Jun 3, 2015
31244c8
fix comment
squito Jun 3, 2015
6594c81
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Jun 10, 2015
6cf204f
fix mistakes w/ merge
squito Jun 10, 2015
05ba79a
Merge branch 'SPARK-6190_largeBB' into SPARK-1391_2g_partition_limit
squito Jun 10, 2015
ca23763
Merge branch 'master' into SPARK-6190_largeBB
squito Jun 10, 2015
4c7117b
Merge branch 'SPARK-6190_largeBB' into SPARK-1391_2g_partition_limit
squito Jun 10, 2015
160a458
more merge fixes
squito Jun 10, 2015
36d1801
Merge branch 'master' into SPARK-6190_largeBB
squito Aug 19, 2015
e3f9bfc
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Aug 19, 2015
040a461
use random numbers for test
squito Aug 19, 2015
69e3009
Merge branch 'SPARK-6190_largeBB' into SPARK-1391_2g_partition_limit
squito Aug 19, 2015
0fa150f
bring up to date
squito Aug 19, 2015
fb733c5
fix check for underlying ByteBuffer class
squito Aug 19, 2015
b921b0b
fix check, add test
squito Aug 19, 2015
e0f5130
fix check
squito Aug 19, 2015
331d517
ignore the big tests for now
squito Aug 19, 2015
9d56e1b
Merge branch 'master' into SPARK-1391_2g_partition_limit
squito Aug 19, 2015
06c8ffa
style
squito Aug 19, 2015
200afdc
whoops, swap cases
squito Aug 20, 2015
@@ -0,0 +1,92 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.network.buffer;

import java.io.InputStream;

import com.google.common.annotations.VisibleForTesting;

/**
 * Reads data from a LargeByteBuffer, and optionally cleans it up using buffer.dispose()
 * when the stream is closed (e.g. to close a memory-mapped file).
 */
public class LargeByteBufferInputStream extends InputStream {

  private LargeByteBuffer buffer;
  private final boolean dispose;

  public LargeByteBufferInputStream(LargeByteBuffer buffer, boolean dispose) {
    this.buffer = buffer;
    this.dispose = dispose;
  }

  public LargeByteBufferInputStream(LargeByteBuffer buffer) {
    this(buffer, false);
  }

  @Override
  public int read() {
    if (buffer == null || buffer.remaining() == 0) {
      return -1;
    } else {
      return buffer.get() & 0xFF;
    }
  }

  @Override
  public int read(byte[] dest) {
    return read(dest, 0, dest.length);
  }

  @Override
  public int read(byte[] dest, int offset, int length) {
    if (buffer == null || buffer.remaining() == 0) {
      return -1;
    } else {
      int amountToGet = (int) Math.min(buffer.remaining(), length);
      buffer.get(dest, offset, amountToGet);
      return amountToGet;
    }
  }

  @Override
  public long skip(long toSkip) {
    if (buffer != null) {
      return buffer.skip(toSkip);
    } else {
      return 0L;
    }
  }

  // only for testing
  @VisibleForTesting
  boolean disposed = false;

  /**
   * Clean up the buffer, and potentially dispose of it.
   */
  @Override
  public void close() {
    if (buffer != null) {
      if (dispose) {
        buffer.dispose();
        disposed = true;
      }
      buffer = null;
    }
  }
}
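A usage sketch for the stream above (illustrative only, not part of the PR): reading a buffer with dispose = true, so that closing the stream also releases the buffer. The LargeByteBufferHelper.asLargeByteBuffer(byte[]) factory is assumed here based on its use in the TorrentBroadcast diff below.

import org.apache.spark.network.buffer.LargeByteBuffer;
import org.apache.spark.network.buffer.LargeByteBufferHelper;
import org.apache.spark.network.buffer.LargeByteBufferInputStream;

public class LargeByteBufferInputStreamExample {
  public static void main(String[] args) throws Exception {
    // Wrap a small array; the same stream API is meant for buffers over 2GB.
    LargeByteBuffer buf = LargeByteBufferHelper.asLargeByteBuffer(new byte[]{1, 2, 3});

    // dispose = true: close() will also call buffer.dispose(),
    // e.g. to unmap a memory-mapped file.
    LargeByteBufferInputStream in = new LargeByteBufferInputStream(buf, true);
    int b;
    while ((b = in.read()) != -1) {
      System.out.println(b); // prints 1, 2, 3
    }
    in.close(); // buffer is disposed and dropped here
  }
}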
@@ -0,0 +1,91 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.network.buffer;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;

import com.google.common.annotations.VisibleForTesting;
import org.apache.spark.util.io.ByteArrayChunkOutputStream;

/**
 * An OutputStream that will write all data to memory. It supports writing over 2GB
 * and the resulting data can be retrieved as a
 * {@link org.apache.spark.network.buffer.LargeByteBuffer}.
 */
public class LargeByteBufferOutputStream extends OutputStream {

  private final ByteArrayChunkOutputStream output;

  /**
   * Create a new LargeByteBufferOutputStream which writes to byte arrays of the given size. Note
   * that <code>chunkSize</code> has <b>no effect</b> on the LargeByteBuffer returned by
   * {@link #largeBuffer()}.
   *
   * @param chunkSize size of the byte arrays used by this output stream, in bytes
   */
  public LargeByteBufferOutputStream(int chunkSize) {
    output = new ByteArrayChunkOutputStream(chunkSize);
  }

  @Override
  public void write(int b) {
    output.write(b);
  }

  @Override
  public void write(byte[] bytes, int off, int len) {
    output.write(bytes, off, len);
  }

  /**
   * Get all of the data written to the stream so far as a LargeByteBuffer. This method can be
   * called multiple times, and each returned buffer will be completely independent (the data
   * is copied for each returned buffer). It does not close the stream.
   *
   * @return the data written to the stream as a LargeByteBuffer
   */
  public LargeByteBuffer largeBuffer() {
    return largeBuffer(LargeByteBufferHelper.MAX_CHUNK_SIZE);
  }

  /**
   * Exposed for testing. You should not normally call this method -- the returned
   * buffer will not implement {@code asByteBuffer} correctly.
   */
  @VisibleForTesting
  LargeByteBuffer largeBuffer(int maxChunk) {
    long totalSize = output.size();
    int chunksNeeded = (int) ((totalSize + maxChunk - 1) / maxChunk);
    ByteBuffer[] chunks = new ByteBuffer[chunksNeeded];
    long remaining = totalSize;
    long pos = 0;
    for (int idx = 0; idx < chunksNeeded; idx++) {
      int nextSize = (int) Math.min(maxChunk, remaining);
      chunks[idx] = ByteBuffer.wrap(output.slice(pos, pos + nextSize));
      pos += nextSize;
      remaining -= nextSize;
    }
    return new WrappedLargeByteBuffer(chunks, maxChunk);
  }

  @Override
  public void close() throws IOException {
    output.close();
  }
}
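A quick sketch of intended usage, illustrative only. It also walks the chunk math in largeBuffer(int): with a hypothetical totalSize of 5 and maxChunk of 2, chunksNeeded = (5 + 2 - 1) / 2 = 3, producing chunks of 2, 2, and 1 bytes.

import org.apache.spark.network.buffer.LargeByteBuffer;
import org.apache.spark.network.buffer.LargeByteBufferOutputStream;

public class LargeByteBufferOutputStreamExample {
  public static void main(String[] args) throws Exception {
    // 32KB backing arrays; per the javadoc, chunkSize does not affect
    // the buffer returned by largeBuffer().
    LargeByteBufferOutputStream out = new LargeByteBufferOutputStream(32 * 1024);
    out.write(new byte[]{1, 2, 3, 4, 5}, 0, 5);

    // Each call copies the data written so far, so the buffers are independent.
    LargeByteBuffer first = out.largeBuffer();
    LargeByteBuffer second = out.largeBuffer();
    System.out.println(first.remaining() + ", " + second.remaining()); // 5, 5

    out.close();
  }
}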
@@ -18,17 +18,17 @@
 package org.apache.spark.broadcast
 
 import java.io._
-import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions.asJavaEnumeration
 import scala.reflect.ClassTag
 import scala.util.Random
 
 import org.apache.spark.{Logging, SparkConf, SparkEnv, SparkException}
 import org.apache.spark.io.CompressionCodec
+import org.apache.spark.network.buffer.{LargeByteBufferInputStream, LargeByteBufferHelper, LargeByteBuffer}
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.storage.{BroadcastBlockId, StorageLevel}
-import org.apache.spark.util.{ByteBufferInputStream, Utils}
+import org.apache.spark.util.Utils
 import org.apache.spark.util.io.ByteArrayChunkOutputStream
 
 /**
@@ -111,10 +111,10 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
   }
 
   /** Fetch torrent blocks from the driver and/or other executors. */
-  private def readBlocks(): Array[ByteBuffer] = {
+  private def readBlocks(): Array[LargeByteBuffer] = {
     // Fetch chunks of data. Note that all these chunks are stored in the BlockManager and reported
     // to the driver, so other executors can pull these chunks from this executor as well.
-    val blocks = new Array[ByteBuffer](numBlocks)
+    val blocks = new Array[LargeByteBuffer](numBlocks)
     val bm = SparkEnv.get.blockManager
 
     for (pid <- Random.shuffle(Seq.range(0, numBlocks))) {
@@ -123,8 +123,8 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
       // First try getLocalBytes because there is a chance that previous attempts to fetch the
       // broadcast blocks have already fetched some of the blocks. In that case, some blocks
       // would be available locally (on this executor).
-      def getLocal: Option[ByteBuffer] = bm.getLocalBytes(pieceId)
-      def getRemote: Option[ByteBuffer] = bm.getRemoteBytes(pieceId).map { block =>
+      def getLocal: Option[LargeByteBuffer] = bm.getLocalBytes(pieceId)
+      def getRemote: Option[LargeByteBuffer] = bm.getRemoteBytes(pieceId).map { block =>
         // If we found the block from remote executors/driver's BlockManager, put the block
         // in this executor's BlockManager.
         SparkEnv.get.blockManager.putBytes(
@@ -134,7 +134,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
           tellMaster = true)
         block
       }
-      val block: ByteBuffer = getLocal.orElse(getRemote).getOrElse(
+      val block: LargeByteBuffer = getLocal.orElse(getRemote).getOrElse(
        throw new SparkException(s"Failed to get $pieceId of $broadcastId"))
       blocks(pid) = block
     }
@@ -195,22 +195,22 @@ private object TorrentBroadcast extends Logging {
       obj: T,
       blockSize: Int,
       serializer: Serializer,
-      compressionCodec: Option[CompressionCodec]): Array[ByteBuffer] = {
+      compressionCodec: Option[CompressionCodec]): Array[LargeByteBuffer] = {
     val bos = new ByteArrayChunkOutputStream(blockSize)
     val out: OutputStream = compressionCodec.map(c => c.compressedOutputStream(bos)).getOrElse(bos)
     val ser = serializer.newInstance()
     val serOut = ser.serializeStream(out)
     serOut.writeObject[T](obj).close()
-    bos.toArrays.map(ByteBuffer.wrap)
+    bos.toArrays.map(LargeByteBufferHelper.asLargeByteBuffer)
   }
 
   def unBlockifyObject[T: ClassTag](
-      blocks: Array[ByteBuffer],
+      blocks: Array[LargeByteBuffer],
       serializer: Serializer,
       compressionCodec: Option[CompressionCodec]): T = {
     require(blocks.nonEmpty, "Cannot unblockify an empty array of blocks")
     val is = new SequenceInputStream(
-      asJavaEnumeration(blocks.iterator.map(block => new ByteBufferInputStream(block))))
+      asJavaEnumeration(blocks.iterator.map(block => new LargeByteBufferInputStream(block))))
     val in: InputStream = compressionCodec.map(c => c.compressedInputStream(is)).getOrElse(is)
     val ser = serializer.newInstance()
     val serIn = ser.deserializeStream(in)
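The unBlockifyObject change above reads all fetched blocks back-to-back as one stream. A hedged Java sketch of the same pattern, with invented block contents:

import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.apache.spark.network.buffer.LargeByteBuffer;
import org.apache.spark.network.buffer.LargeByteBufferHelper;
import org.apache.spark.network.buffer.LargeByteBufferInputStream;

public class UnBlockifyExample {
  public static void main(String[] args) throws Exception {
    // Two "torrent blocks"; in TorrentBroadcast these come from the BlockManager.
    List<LargeByteBuffer> blocks = Arrays.asList(
        LargeByteBufferHelper.asLargeByteBuffer(new byte[]{1, 2}),
        LargeByteBufferHelper.asLargeByteBuffer(new byte[]{3, 4}));

    // Wrap each block in a stream, then read them as one logical stream,
    // mirroring the SequenceInputStream in unBlockifyObject.
    List<InputStream> streams = new ArrayList<>();
    for (LargeByteBuffer block : blocks) {
      streams.add(new LargeByteBufferInputStream(block));
    }
    InputStream in = new SequenceInputStream(Collections.enumeration(streams));
    int v;
    while ((v = in.read()) != -1) {
      System.out.print(v + " "); // 1 2 3 4
    }
    in.close();
  }
}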
5 changes: 4 additions & 1 deletion core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -23,6 +23,8 @@ import java.net.URL
 import java.nio.ByteBuffer
 import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
 
+import org.apache.spark.network.buffer.LargeByteBufferHelper
+
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.util.control.NonFatal
@@ -266,7 +268,8 @@ private[spark] class Executor(
         } else if (resultSize >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
           val blockId = TaskResultBlockId(taskId)
           env.blockManager.putBytes(
-            blockId, serializedDirectResult, StorageLevel.MEMORY_AND_DISK_SER)
+            blockId, LargeByteBufferHelper.asLargeByteBuffer(serializedDirectResult),
+            StorageLevel.MEMORY_AND_DISK_SER)
           logInfo(
             s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
           ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
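The putBytes change above passes a java.nio.ByteBuffer through LargeByteBufferHelper, which suggests an asLargeByteBuffer(ByteBuffer) overload alongside the byte[] one used in TorrentBroadcast; the helper's signatures are not shown in this diff, so the sketch below rests on that assumption.

import java.nio.ByteBuffer;

import org.apache.spark.network.buffer.LargeByteBuffer;
import org.apache.spark.network.buffer.LargeByteBufferHelper;

public class WrapTaskResultExample {
  public static void main(String[] args) throws Exception {
    // Stand-in for a serialized task result.
    ByteBuffer serializedDirectResult = ByteBuffer.wrap(new byte[]{42});

    // Assumed overload: view a single ByteBuffer as a LargeByteBuffer.
    LargeByteBuffer wrapped = LargeByteBufferHelper.asLargeByteBuffer(serializedDirectResult);
    System.out.println(wrapped.remaining()); // 1
  }
}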
@@ -24,9 +24,9 @@ import scala.concurrent.{Promise, Await, Future}
 import scala.concurrent.duration.Duration
 
 import org.apache.spark.Logging
-import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
+import org.apache.spark.network.buffer.{LargeByteBufferHelper, NioManagedBuffer, ManagedBuffer}
 import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
-import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}
+import org.apache.spark.storage.{BlockId, StorageLevel}
 
 private[spark]
 abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {
@@ -23,12 +23,12 @@ import scala.collection.JavaConversions._
 
 import org.apache.spark.Logging
 import org.apache.spark.network.BlockDataManager
-import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
+import org.apache.spark.network.buffer.{BufferTooLargeException, ManagedBuffer, NioManagedBuffer}
 import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
 import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
 import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
 import org.apache.spark.serializer.Serializer
-import org.apache.spark.storage.{BlockId, StorageLevel}
+import org.apache.spark.storage.{ShuffleRemoteBlockSizeLimitException, BlockId, StorageLevel}
 
 /**
  * Serves requests to open blocks by simply registering one chunk per block requested.
@@ -53,11 +53,24 @@ class NettyBlockRpcServer(
 
     message match {
       case openBlocks: OpenBlocks =>
-        val blocks: Seq[ManagedBuffer] =
-          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
-        val streamId = streamManager.registerStream(blocks.iterator)
-        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
-        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)
+        try {
+          val blocks: Seq[ManagedBuffer] =
+            openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
+          val streamId = streamManager.registerStream(blocks.iterator)
+          logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
+          responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)
+        } catch {
+          // shouldn't ever happen, b/c we should prevent writing 2GB shuffle files,
+          // but just to be safe
+          case ex: BufferTooLargeException =>
+            // throw & catch this helper exception, just to get full stack trace
+            try {
+              throw new ShuffleRemoteBlockSizeLimitException(ex)
+            } catch {
+              case ex2: ShuffleRemoteBlockSizeLimitException =>
+                responseContext.onFailure(ex2)
+            }
+        }
 
       case uploadBlock: UploadBlock =>
         // StorageLevel is serialized as bytes using our JavaSerializer.
@@ -77,7 +77,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul
             return
           }
           val deserializedResult = serializer.get().deserialize[DirectTaskResult[_]](
-            serializedTaskResult.get)
+            serializedTaskResult.get.asByteBuffer())
           sparkEnv.blockManager.master.removeBlock(blockId)
           (deserializedResult, size)
         }
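The asByteBuffer() call above converts a fetched task result back to a single java.nio.ByteBuffer for the deserializer; the largeBuffer(int) javadoc earlier suggests this is only well-defined when the data fits in one chunk. A hedged sketch of that bridge (declared throws Exception in case asByteBuffer() reports oversized buffers via a checked exception; its signature is not shown in this diff):

import java.nio.ByteBuffer;

import org.apache.spark.network.buffer.LargeByteBuffer;
import org.apache.spark.network.buffer.LargeByteBufferHelper;

public class AsByteBufferExample {
  public static void main(String[] args) throws Exception {
    LargeByteBuffer large = LargeByteBufferHelper.asLargeByteBuffer(new byte[]{7, 8, 9});

    // Bridge back to an API that still needs one ByteBuffer,
    // as TaskResultGetter does before deserializing.
    ByteBuffer single = large.asByteBuffer();
    System.out.println(single.remaining()); // 3
  }
}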
@@ -18,6 +18,7 @@
 package org.apache.spark.shuffle
 
 import java.nio.ByteBuffer
+
 import org.apache.spark.network.buffer.ManagedBuffer
 import org.apache.spark.storage.ShuffleBlockId
 