Commit a244e20

Author: Davies Liu (committed)
Merge branch 'master' of github.com:apache/spark into gen_bench
2 parents 62eb43d + 99a6e3c, commit a244e20

File tree: 11 files changed, +604 −62 lines

core/src/main/scala/org/apache/spark/storage/MemoryStore.scala

Lines changed: 9 additions & 5 deletions
@@ -255,8 +255,8 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo
   var memoryThreshold = initialMemoryThreshold
   // Memory to request as a multiple of current vector size
   val memoryGrowthFactor = 1.5
-  // Previous unroll memory held by this task, for releasing later (only at the very end)
-  val previousMemoryReserved = currentUnrollMemoryForThisTask
+  // Keep track of pending unroll memory reserved by this method.
+  var pendingMemoryReserved = 0L
   // Underlying vector for unrolling the block
   var vector = new SizeTrackingVector[Any]

@@ -266,6 +266,8 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo
   if (!keepUnrolling) {
     logWarning(s"Failed to reserve initial memory threshold of " +
       s"${Utils.bytesToString(initialMemoryThreshold)} for computing block $blockId in memory.")
+  } else {
+    pendingMemoryReserved += initialMemoryThreshold
   }

   // Unroll this block safely, checking whether we have exceeded our threshold periodically
@@ -278,6 +280,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo
         if (currentSize >= memoryThreshold) {
           val amountToRequest = (currentSize * memoryGrowthFactor - memoryThreshold).toLong
           keepUnrolling = reserveUnrollMemoryForThisTask(blockId, amountToRequest)
+          if (keepUnrolling) {
+            pendingMemoryReserved += amountToRequest
+          }
           // New threshold is currentSize * memoryGrowthFactor
           memoryThreshold += amountToRequest
         }
@@ -304,10 +309,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo
           // release the unroll memory yet. Instead, we transfer it to pending unroll memory
           // so `tryToPut` can further transfer it to normal storage memory later.
           // TODO: we can probably express this without pending unroll memory (SPARK-10907)
-          val amountToTransferToPending = currentUnrollMemoryForThisTask - previousMemoryReserved
-          unrollMemoryMap(taskAttemptId) -= amountToTransferToPending
+          unrollMemoryMap(taskAttemptId) -= pendingMemoryReserved
           pendingUnrollMemoryMap(taskAttemptId) =
-            pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + amountToTransferToPending
+            pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + pendingMemoryReserved
         }
       } else {
         // Otherwise, if we return an iterator, we can only release the unroll memory when
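To make the accounting change above concrete, here is a minimal, self-contained Scala sketch (not the Spark MemoryStore API; the reserve helper and the byte amounts are made up for illustration) of the pattern the patch moves to: record only the unroll memory that this call itself reserved, and transfer exactly that amount to the pending pool at the end, leaving memory the task already held untouched.

object UnrollAccountingSketch {
  def main(args: Array[String]): Unit = {
    val alreadyHeldByTask = 512L      // unroll memory the task held before this call
    var pendingMemoryReserved = 0L    // memory reserved by this call only

    // Stand-in for reserveUnrollMemoryForThisTask; here every request succeeds.
    def reserve(bytes: Long): Boolean = bytes >= 0

    val initialThreshold = 1024L
    if (reserve(initialThreshold)) pendingMemoryReserved += initialThreshold

    val amountToRequest = 2048L
    if (reserve(amountToRequest)) pendingMemoryReserved += amountToRequest

    // At the end, only what this call reserved moves to the pending pool.
    val transferredToPending = pendingMemoryReserved
    println(s"transferred $transferredToPending bytes; pre-existing $alreadyHeldByTask bytes untouched")
  }
}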

sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala

Lines changed: 14 additions & 4 deletions
@@ -47,9 +47,9 @@ object RandomDataGenerator {
    */
   private val PROBABILITY_OF_NULL: Float = 0.1f

-  private val MAX_STR_LEN: Int = 1024
-  private val MAX_ARR_SIZE: Int = 128
-  private val MAX_MAP_SIZE: Int = 128
+  final val MAX_STR_LEN: Int = 1024
+  final val MAX_ARR_SIZE: Int = 128
+  final val MAX_MAP_SIZE: Int = 128

   /**
    * Helper function for constructing a biased random number generator which returns "interesting"
@@ -208,7 +208,17 @@ object RandomDataGenerator {
           forType(valueType, nullable = valueContainsNull, rand)
         ) yield {
           () => {
-            Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap
+            val length = rand.nextInt(MAX_MAP_SIZE)
+            val keys = scala.collection.mutable.HashSet(Seq.fill(length)(keyGenerator()): _*)
+            // In case the number of different keys is not enough, set a max iteration to avoid
+            // infinite loop.
+            var count = 0
+            while (keys.size < length && count < MAX_MAP_SIZE) {
+              keys += keyGenerator()
+              count += 1
+            }
+            val values = Seq.fill(keys.size)(valueGenerator())
+            keys.zip(values).toMap
           }
         }
       }
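As a standalone illustration of the key de-duplication added above, the sketch below (hypothetical helper names, not the Spark test utility) draws a target number of keys into a HashSet and tops it up for at most a bounded number of extra draws, so a small key space cannot cause an infinite loop; the map simply ends up smaller when not enough distinct keys exist.

import scala.collection.mutable
import scala.util.Random

object UniqueKeyMapSketch {
  def randomMap[K, V](length: Int, maxExtraDraws: Int,
      keyGen: () => K, valueGen: () => V): Map[K, V] = {
    val keys = mutable.HashSet(Seq.fill(length)(keyGen()): _*)
    var draws = 0
    // Top up with extra draws, but give up after maxExtraDraws to avoid looping forever.
    while (keys.size < length && draws < maxExtraDraws) {
      keys += keyGen()
      draws += 1
    }
    keys.iterator.map(k => k -> valueGen()).toMap
  }

  def main(args: Array[String]): Unit = {
    val rand = new Random(42)
    // Only 4 distinct keys exist, so the map stays at size <= 4 even though 10 were requested.
    val m = randomMap(10, 10, () => rand.nextInt(4), () => rand.nextInt())
    println(s"requested 10 entries, got ${m.size}")
  }
}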

sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala

Lines changed: 11 additions & 0 deletions
@@ -95,4 +95,15 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
     }
   }

+  test("check size of generated map") {
+    val mapType = MapType(IntegerType, IntegerType)
+    for (seed <- 1 to 1000) {
+      val generator = RandomDataGenerator.forType(
+        mapType, nullable = false, rand = new Random(seed)).get
+      val maps = Seq.fill(100)(generator().asInstanceOf[Map[Int, Int]])
+      val expectedTotalElements = 100 / 2 * RandomDataGenerator.MAX_MAP_SIZE
+      val deviation = math.abs(maps.map(_.size).sum - expectedTotalElements)
+      assert(deviation.toDouble / expectedTotalElements < 2e-1)
+    }
+  }
 }
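A quick check (not part of the patch) of the arithmetic behind the test's baseline: rand.nextInt(MAX_MAP_SIZE) is roughly uniform on 0 to MAX_MAP_SIZE - 1, so 100 maps should hold about 100 * 63.5 = 6350 elements, which sits well within the 20% tolerance around the test's 100 / 2 * MAX_MAP_SIZE = 6400 baseline.

object ExpectedMapSizeCheck {
  def main(args: Array[String]): Unit = {
    val maxMapSize = 128
    val numMaps = 100
    val baseline = numMaps / 2 * maxMapSize                 // 6400, as in the test
    val approxExpected = numMaps * (maxMapSize - 1) / 2.0   // 6350, mean of nextInt(128) per map
    val relativeGap = math.abs(approxExpected - baseline) / baseline
    println(f"baseline=$baseline approx=$approxExpected%.0f gap=$relativeGap%.4f") // gap is about 0.008, well under 0.2
  }
}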

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java

Lines changed: 130 additions & 16 deletions
@@ -21,6 +21,7 @@
 import java.nio.ByteBuffer;
 import java.util.List;

+import org.apache.commons.lang.NotImplementedException;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.parquet.Preconditions;
@@ -41,6 +42,7 @@
 import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter;
 import org.apache.spark.sql.execution.vectorized.ColumnVector;
 import org.apache.spark.sql.execution.vectorized.ColumnarBatch;
+import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.types.UTF8String;
@@ -207,13 +209,7 @@ public boolean nextBatch() throws IOException {

     int num = (int)Math.min((long) columnarBatch.capacity(), totalRowCount - rowsReturned);
     for (int i = 0; i < columnReaders.length; ++i) {
-      switch (columnReaders[i].descriptor.getType()) {
-        case INT32:
-          columnReaders[i].readIntBatch(num, columnarBatch.column(i));
-          break;
-        default:
-          throw new IOException("Unsupported type: " + columnReaders[i].descriptor.getType());
-      }
+      columnReaders[i].readBatch(num, columnarBatch.column(i));
     }
     rowsReturned += num;
     columnarBatch.setNumRows(num);
@@ -237,7 +233,8 @@ private void initializeInternal() throws IOException {

       // TODO: Be extremely cautious in what is supported. Expand this.
       if (originalTypes[i] != null && originalTypes[i] != OriginalType.DECIMAL &&
-          originalTypes[i] != OriginalType.UTF8 && originalTypes[i] != OriginalType.DATE) {
+          originalTypes[i] != OriginalType.UTF8 && originalTypes[i] != OriginalType.DATE &&
+          originalTypes[i] != OriginalType.INT_8 && originalTypes[i] != OriginalType.INT_16) {
         throw new IOException("Unsupported type: " + t);
       }
       if (originalTypes[i] == OriginalType.DECIMAL &&
@@ -464,6 +461,11 @@ private final class ColumnReader {
      */
     private boolean useDictionary;

+    /**
+     * If useDictionary is true, the staging vector used to decode the ids.
+     */
+    private ColumnVector dictionaryIds;
+
     /**
      * Maximum definition level for this column.
      */
@@ -587,9 +589,8 @@ private boolean next() throws IOException {

     /**
      * Reads `total` values from this columnReader into column.
-     * TODO: implement the other encodings.
      */
-    private void readIntBatch(int total, ColumnVector column) throws IOException {
+    private void readBatch(int total, ColumnVector column) throws IOException {
       int rowId = 0;
       while (total > 0) {
         // Compute the number of values we want to read in this page.
@@ -599,21 +600,134 @@ private void readIntBatch(int total, ColumnVector column) throws IOException {
           leftInPage = (int)(endOfPageValueCount - valuesRead);
         }
         int num = Math.min(total, leftInPage);
-        defColumn.readIntegers(
-            num, column, rowId, maxDefLevel, (VectorizedValuesReader)dataColumn, 0);
-
-        // Remap the values if it is dictionary encoded.
         if (useDictionary) {
-          for (int i = rowId; i < rowId + num; ++i) {
-            column.putInt(i, dictionary.decodeToInt(column.getInt(i)));
+          // Data is dictionary encoded. We will vector decode the ids and then resolve the values.
+          if (dictionaryIds == null) {
+            dictionaryIds = ColumnVector.allocate(total, DataTypes.IntegerType, MemoryMode.ON_HEAP);
+          } else {
+            dictionaryIds.reset();
+            dictionaryIds.reserve(total);
+          }
+          // Read and decode dictionary ids.
+          readIntBatch(rowId, num, dictionaryIds);
+          decodeDictionaryIds(rowId, num, column);
+        } else {
+          switch (descriptor.getType()) {
+            case INT32:
+              readIntBatch(rowId, num, column);
+              break;
+            case INT64:
+              readLongBatch(rowId, num, column);
+              break;
+            case BINARY:
+              readBinaryBatch(rowId, num, column);
+              break;
+            default:
+              throw new IOException("Unsupported type: " + descriptor.getType());
           }
         }
+
         valuesRead += num;
         rowId += num;
         total -= num;
       }
     }

+    /**
+     * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`.
+     */
+    private void decodeDictionaryIds(int rowId, int num, ColumnVector column) {
+      switch (descriptor.getType()) {
+        case INT32:
+          if (column.dataType() == DataTypes.IntegerType) {
+            for (int i = rowId; i < rowId + num; ++i) {
+              column.putInt(i, dictionary.decodeToInt(dictionaryIds.getInt(i)));
+            }
+          } else if (column.dataType() == DataTypes.ByteType) {
+            for (int i = rowId; i < rowId + num; ++i) {
+              column.putByte(i, (byte)dictionary.decodeToInt(dictionaryIds.getInt(i)));
+            }
+          } else {
+            throw new NotImplementedException("Unimplemented type: " + column.dataType());
+          }
+          break;
+
+        case INT64:
+          for (int i = rowId; i < rowId + num; ++i) {
+            column.putLong(i, dictionary.decodeToLong(dictionaryIds.getInt(i)));
+          }
+          break;
+
+        case BINARY:
+          // TODO: this is incredibly inefficient as it blows up the dictionary right here. We
+          // need to do this better. We should probably add the dictionary data to the ColumnVector
+          // and reuse it across batches. This should mean adding a ByteArray would just update
+          // the length and offset.
+          for (int i = rowId; i < rowId + num; ++i) {
+            Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
+            column.putByteArray(i, v.getBytes());
+          }
+          break;
+
+        default:
+          throw new NotImplementedException("Unsupported type: " + descriptor.getType());
+      }
+
+      if (dictionaryIds.numNulls() > 0) {
+        // Copy the NULLs over.
+        // TODO: we can improve this by decoding the NULLs directly into column. This would
+        // mean we decode the int ids into `dictionaryIds` and the NULLs into `column` and then
+        // just do the ID remapping as above.
+        for (int i = 0; i < num; ++i) {
+          if (dictionaryIds.getIsNull(rowId + i)) {
+            column.putNull(rowId + i);
+          }
+        }
+      }
+    }
+
+    /**
+     * For all the read*Batch functions, reads `num` values from this columnReader into column. It
+     * is guaranteed that num is smaller than the number of values left in the current page.
+     */
+
+    private void readIntBatch(int rowId, int num, ColumnVector column) throws IOException {
+      // This is where we implement support for the valid type conversions.
+      // TODO: implement remaining type conversions
+      if (column.dataType() == DataTypes.IntegerType) {
+        defColumn.readIntegers(
+            num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn, 0);
+      } else if (column.dataType() == DataTypes.ByteType) {
+        defColumn.readBytes(
+            num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
+      } else {
+        throw new NotImplementedException("Unimplemented type: " + column.dataType());
+      }
+    }
+
+    private void readLongBatch(int rowId, int num, ColumnVector column) throws IOException {
+      // This is where we implement support for the valid type conversions.
+      // TODO: implement remaining type conversions
+      if (column.dataType() == DataTypes.LongType) {
+        defColumn.readLongs(
+            num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
+      } else {
+        throw new NotImplementedException("Unimplemented type: " + column.dataType());
+      }
+    }
+
+    private void readBinaryBatch(int rowId, int num, ColumnVector column) throws IOException {
+      // This is where we implement support for the valid type conversions.
+      // TODO: implement remaining type conversions
+      if (column.isArray()) {
+        defColumn.readBinarys(
+            num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
+      } else {
+        throw new NotImplementedException("Unimplemented type: " + column.dataType());
+      }
+    }
+
+
     private void readPage() throws IOException {
       DataPage page = pageReader.readPage();
       // TODO: Why is this a visitor?
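To illustrate the two-phase decode the reader now performs for dictionary-encoded pages, here is a compact Scala sketch (the IntDictionary case class and the plain arrays stand in for the real Dictionary and ColumnVector types, which this snippet does not use): phase one reads the integer dictionary ids for the batch, phase two resolves each id through the dictionary into the output column.

object DictionaryDecodeSketch {
  final case class IntDictionary(values: Array[Int]) {
    def decodeToInt(id: Int): Int = values(id)
  }

  def decodeDictionaryIds(rowId: Int, num: Int,
      dictionaryIds: Array[Int],
      dictionary: IntDictionary,
      column: Array[Int]): Unit = {
    var i = rowId
    while (i < rowId + num) {
      column(i) = dictionary.decodeToInt(dictionaryIds(i))
      i += 1
    }
  }

  def main(args: Array[String]): Unit = {
    val dict = IntDictionary(Array(100, 200, 300))
    val ids = Array(2, 0, 1, 1)                          // phase 1: ids read from the data page
    val out = new Array[Int](ids.length)
    decodeDictionaryIds(0, ids.length, ids, dict, out)   // phase 2: resolve values via the dictionary
    println(out.mkString(", "))                          // 300, 100, 200, 200
  }
}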

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java

Lines changed: 43 additions & 2 deletions
@@ -18,10 +18,13 @@

 import java.io.IOException;

+import org.apache.spark.sql.Column;
 import org.apache.spark.sql.execution.vectorized.ColumnVector;
 import org.apache.spark.unsafe.Platform;

+import org.apache.commons.lang.NotImplementedException;
 import org.apache.parquet.column.values.ValuesReader;
+import org.apache.parquet.io.api.Binary;

 /**
  * An implementation of the Parquet PLAIN decoder that supports the vectorized interface.
@@ -52,15 +55,53 @@ public void skip(int n) {
   }

   @Override
-  public void readIntegers(int total, ColumnVector c, int rowId) {
+  public final void readIntegers(int total, ColumnVector c, int rowId) {
     c.putIntsLittleEndian(rowId, total, buffer, offset - Platform.BYTE_ARRAY_OFFSET);
     offset += 4 * total;
   }

   @Override
-  public int readInteger() {
+  public final void readLongs(int total, ColumnVector c, int rowId) {
+    c.putLongsLittleEndian(rowId, total, buffer, offset - Platform.BYTE_ARRAY_OFFSET);
+    offset += 8 * total;
+  }
+
+  @Override
+  public final void readBytes(int total, ColumnVector c, int rowId) {
+    for (int i = 0; i < total; i++) {
+      // Bytes are stored as a 4-byte little endian int. Just read the first byte.
+      // TODO: consider pushing this in ColumnVector by adding a readBytes with a stride.
+      c.putInt(rowId + i, buffer[offset]);
+      offset += 4;
+    }
+  }
+
+  @Override
+  public final int readInteger() {
     int v = Platform.getInt(buffer, offset);
     offset += 4;
     return v;
   }
+
+  @Override
+  public final long readLong() {
+    long v = Platform.getLong(buffer, offset);
+    offset += 8;
+    return v;
+  }
+
+  @Override
+  public final byte readByte() {
+    return (byte)readInteger();
+  }
+
+  @Override
+  public final void readBinary(int total, ColumnVector v, int rowId) {
+    for (int i = 0; i < total; i++) {
+      int len = readInteger();
+      int start = offset;
+      offset += len;
+      v.putByteArray(rowId + i, buffer, start - Platform.BYTE_ARRAY_OFFSET, len);
+    }
+  }
 }
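The sketch below (plain Scala with java.nio, not the Platform-based reader above) shows the PLAIN layout those new methods decode: 32-bit and 64-bit values are little endian, and a "byte" occupies a full 4-byte slot of which only the low byte matters, which is why readBytes advances the offset by 4 per value.

import java.nio.{ByteBuffer, ByteOrder}

object PlainDecodeSketch {
  def main(args: Array[String]): Unit = {
    val buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN)
    buf.putInt(7).putLong(42L).putInt(5)   // an int, a long, and a "byte" stored as a 4-byte int
    buf.flip()

    val i = buf.getInt()                   // readInteger(): consume 4 bytes
    val l = buf.getLong()                  // readLong(): consume 8 bytes
    val b = buf.getInt().toByte            // readByte(): keep only the low byte of the 4-byte slot
    println(s"int=$i long=$l byte=$b")
  }
}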
