1717
1818package org .apache .spark .sql .execution .datasources .parquet
1919
20+ import java .io .{DataOutput , OutputStream }
2021import java .math .{BigDecimal , BigInteger }
2122import java .nio .ByteOrder
2223
@@ -281,6 +282,24 @@ private[parquet] class CatalystRowConverter(
281282 }
282283 }
283284
// !! HACK ALERT !!
//
// This is a hacky utility class used to steal the underlying byte array from a Parquet
// `Binary` without copying it, as long as we know that it's safe to do so.
//
// `Binary.writeTo(OutputStream)` hands us its backing array through the three-argument
// `write` overload; we record the reference (plus offset/length) instead of copying the
// bytes. The stolen slice is only valid until the next `writeTo` call, so callers must
// consume it immediately.
private final class ByteArrayThief extends OutputStream {
  // Backing array of the most recently written `Binary` (a reference, NOT a copy).
  var bytes: Array[Byte] = _
  // Offset of the first valid byte within `bytes`.
  var offset: Int = _
  // Number of valid bytes starting at `offset`.
  var numBytes: Int = _

  override def write(bytes: Array[Byte], offset: Int, length: Int): Unit = {
    this.bytes = bytes
    this.offset = offset
    this.numBytes = length
  }

  // Fail fast instead of silently discarding data: if a `Binary` implementation ever
  // used the single-byte path, the previous no-op would leave `bytes`/`offset`/`numBytes`
  // stale and produce corrupt values downstream.
  override def write(b: Int): Unit =
    throw new UnsupportedOperationException(
      "ByteArrayThief only supports write(Array[Byte], Int, Int)")
}
302+
284303 /**
285304 * Parquet converter for strings. A dictionary is used to minimize string decoding cost.
286305 */
@@ -289,6 +308,8 @@ private[parquet] class CatalystRowConverter(
289308
290309 private var expandedDictionary : Array [UTF8String ] = null
291310
311+ private val byteArrayThief = new ByteArrayThief
312+
292313 override def hasDictionarySupport : Boolean = true
293314
294315 override def setDictionary (dictionary : Dictionary ): Unit = {
@@ -302,13 +323,11 @@ private[parquet] class CatalystRowConverter(
302323 }
303324
override def addBinary(value: Binary): Unit = {
  // Route the `Binary` through the thief so we can grab its backing byte array
  // without copying, then wrap the stolen slice directly in a `UTF8String`.
  value.writeTo(byteArrayThief)
  updater.set(
    UTF8String.fromBytes(byteArrayThief.bytes, byteArrayThief.offset, byteArrayThief.numBytes))
}
313332 }
314333
@@ -320,6 +339,8 @@ private[parquet] class CatalystRowConverter(
320339 updater : ParentContainerUpdater )
321340 extends CatalystPrimitiveConverter (updater) {
322341
342+ private val byteArrayThief = new ByteArrayThief
343+
// Converts decimals stored as INT32 by widening and delegating to the INT64 path.
override def addInt(value: Int): Unit = {
  addLong(value.toLong)
}
@@ -340,23 +361,21 @@ private[parquet] class CatalystRowConverter(
340361 val scale = decimalType.scale
341362
342363 if (precision <= CatalystSchemaConverter .MAX_PRECISION_FOR_INT64 ) {
343- // Constructs a `Decimal` with an unscaled `Long` value if possible. The underlying
344- // `ByteBuffer` implementation is guaranteed to be `HeapByteBuffer`, so here we are using
345- // `Binary.toByteBuffer.array()` to steal the underlying byte array without copying it.
346- val buffer = value.toByteBuffer
347- val bytes = buffer.array()
348- val start = buffer.position()
349- val end = buffer.limit()
364+ // Constructs a `Decimal` with an unscaled `Long` value if possible.
365+ value.writeTo(byteArrayThief)
366+ val bytes = byteArrayThief.bytes
367+ val begin = byteArrayThief.offset
368+ val end = begin + byteArrayThief.numBytes
350369
351370 var unscaled = 0L
352- var i = start
371+ var i = begin
353372
354373 while (i < end) {
355374 unscaled = (unscaled << 8 ) | (bytes(i) & 0xff )
356375 i += 1
357376 }
358377
359- val bits = 8 * (end - start)
378+ val bits = 8 * byteArrayThief.numBytes
360379 unscaled = (unscaled << (64 - bits)) >> (64 - bits)
361380 Decimal (unscaled, precision, scale)
362381 } else {
0 commit comments