Add test for VectorizedSparkOrcNewRecordReader.

viirya · viirya · commit 160e92470136 · 2016-11-25T05:38:39.000Z
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
@@ -980,9 +980,9 @@ public ColumnVector getDictionaryIds() {
     return dictionaryIds;
   }
 
-  public ColumnVector() {
+  public ColumnVector(DataType type) {
     this.capacity = 0;
-    this.type = null;
+    this.type = type;
     this.childColumns = null;
     this.resultArray = null;
     this.resultStruct = null;
diff --git a/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/OrcColumnVector.java b/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/OrcColumnVector.java
@@ -23,6 +23,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 
+import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.unsafe.types.UTF8String;
 
@@ -35,7 +36,8 @@
 public class OrcColumnVector extends org.apache.spark.sql.execution.vectorized.ColumnVector {
   private ColumnVector col;
 
-  public OrcColumnVector(ColumnVector col) {
+  public OrcColumnVector(ColumnVector col, DataType type) {
+    super(type);
     this.col = col;
   }
 
diff --git a/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/VectorizedSparkOrcNewRecordReader.java b/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/VectorizedSparkOrcNewRecordReader.java
@@ -77,6 +77,7 @@ public VectorizedSparkOrcNewRecordReader(
       Configuration conf,
       FileSplit fileSplit,
       List<Integer> columnIDs,
+      StructType requiredSchema,
       StructType partitionColumns,
       InternalRow partitionValues) throws IOException {
     List<OrcProto.Type> types = file.getTypes();
@@ -93,7 +94,7 @@ public VectorizedSparkOrcNewRecordReader(
     for (int i = 0; i < columnIDs.size(); i++) {
       org.apache.hadoop.hive.ql.exec.vector.ColumnVector col =
         this.hiveBatch.cols[columnIDs.get(i)];
-      this.orcColumns[i] = new OrcColumnVector(col);
+      this.orcColumns[i] = new OrcColumnVector(col, requiredSchema.fields()[i].dataType());
     }
 
     // Allocate Spark ColumnVectors for partition columns.
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -181,9 +181,14 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
         if (enableVectorizedReader) {
           val columnIDs =
             requiredSchema.map(a => physicalSchema.fieldIndex(a.name): Integer).sorted.asJava
-          val orcRecordReader =
-            new VectorizedSparkOrcNewRecordReader(
-              orcReader, conf, fileSplit, columnIDs, partitionSchema, file.partitionValues)
+          val orcRecordReader = new VectorizedSparkOrcNewRecordReader(
+            orcReader,
+            conf,
+            fileSplit,
+            columnIDs,
+            requiredSchema,
+            partitionSchema,
+            file.partitionValues)
 
           if (returningBatch) {
             orcRecordReader.enableReturningBatches()
@@ -226,11 +231,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
    * Returns whether the reader will return the rows as batch or not.
    */
   override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = {
-    val conf = sparkSession.sessionState.conf
-    conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled &&
-      schema.length <= conf.wholeStageMaxNumFields &&
-      schema.forall(f => f.dataType.isInstanceOf[AtomicType] &&
-        !f.dataType.isInstanceOf[DateType] && !f.dataType.isInstanceOf[TimestampType])
+    OrcRelation.supportBatch(sparkSession, schema)
   }
 }
 
@@ -374,4 +375,15 @@ private[orc] object OrcRelation extends HiveInspectors {
     val (sortedIDs, sortedNames) = ids.zip(requestedSchema.fieldNames).sorted.unzip
     HiveShim.appendReadColumns(conf, sortedIDs, sortedNames)
   }
+
+  /**
+   * Returns whether the vectorized ORC reader will return rows in batches for the given schema.
+   */
+  def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = {
+    val conf = sparkSession.sessionState.conf
+    conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled &&
+      schema.length <= conf.wholeStageMaxNumFields &&
+      schema.forall(f => f.dataType.isInstanceOf[AtomicType] &&
+        !f.dataType.isInstanceOf[DateType] && !f.dataType.isInstanceOf[TimestampType])
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/vectorized/OrcColumnVectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/vectorized/OrcColumnVectorSuite.scala
@@ -80,7 +80,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
     }
   }
 
-  private def testLongColumnVector[T](num: Int)
+  private def testLongColumnVector[T](num: Int, dt: DataType)
       (genExpected: (Seq[Long] => Seq[T]))
       (genActual: (OrcColumnVector, Int) => Seq[T]): Unit = {
     val seed = System.currentTimeMillis()
@@ -96,12 +96,12 @@ class OrcColumnVectorSuite extends SparkFunSuite {
 
     val expected = genExpected(data)
 
-    val orcCol = new OrcColumnVector(lv)
+    val orcCol = new OrcColumnVector(lv, dt)
     val actual = genActual(orcCol, num)
     assert(actual === expected)
   }
 
-  private def testDoubleColumnVector[T](num: Int)
+  private def testDoubleColumnVector[T](num: Int, dt: DataType)
       (genExpected: (Seq[Double] => Seq[T]))
       (genActual: (OrcColumnVector, Int) => Seq[T]): Unit = {
     val seed = System.currentTimeMillis()
@@ -117,12 +117,12 @@ class OrcColumnVectorSuite extends SparkFunSuite {
 
     val expected = genExpected(data)
 
-    val orcCol = new OrcColumnVector(lv)
+    val orcCol = new OrcColumnVector(lv, dt)
     val actual = genActual(orcCol, num)
     assert(actual === expected)
   }
 
-  private def testBytesColumnVector[T](num: Int)
+  private def testBytesColumnVector[T](num: Int, dt: DataType)
       (genExpected: (Seq[Seq[Byte]] => Seq[T]))
       (genActual: (OrcColumnVector, Int) => Seq[T]): Unit = {
     val seed = System.currentTimeMillis()
@@ -139,7 +139,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
 
     val expected = genExpected(data)
 
-    val orcCol = new OrcColumnVector(lv)
+    val orcCol = new OrcColumnVector(lv, dt)
     val actual = genActual(orcCol, num)
     actual.zip(expected).foreach { case (a, e) =>
       assert(a === e)
@@ -168,7 +168,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
 
       val expected = genExpected(data)
 
-      val orcCol = new OrcColumnVector(lv)
+      val orcCol = new OrcColumnVector(lv, decimalType)
       val actual = genActual(orcCol, num, decimalType.precision, decimalType.scale)
       actual.zip(expected).foreach { case (a, e) =>
         assert(a.compareTo(e) == 0)
@@ -183,7 +183,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getBoolean(rowId)
       }
     }
-    testLongColumnVector(100)(genExpected)(genActual)
+    testLongColumnVector(100, BooleanType)(genExpected)(genActual)
   }
 
   test("Hive LongColumnVector: Int") {
@@ -193,7 +193,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getInt(rowId)
       }
     }
-    testLongColumnVector(100)(genExpected)(genActual)
+    testLongColumnVector(100, IntegerType)(genExpected)(genActual)
   }
 
   test("Hive LongColumnVector: Byte") {
@@ -203,7 +203,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getByte(rowId)
       }
     }
-    testLongColumnVector(100)(genExpected)(genActual)
+    testLongColumnVector(100, ByteType)(genExpected)(genActual)
   }
 
   test("Hive LongColumnVector: Short") {
@@ -213,7 +213,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getShort(rowId)
       }
     }
-    testLongColumnVector(100)(genExpected)(genActual)
+    testLongColumnVector(100, ShortType)(genExpected)(genActual)
   }
 
   test("Hive LongColumnVector: Long") {
@@ -223,7 +223,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getLong(rowId)
       }
     }
-    testLongColumnVector(100)(genExpected)(genActual)
+    testLongColumnVector(100, LongType)(genExpected)(genActual)
   }
 
   test("Hive DoubleColumnVector: Float") {
@@ -233,7 +233,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getFloat(rowId)
       }
     }
-    testDoubleColumnVector(100)(genExpected)(genActual)
+    testDoubleColumnVector(100, FloatType)(genExpected)(genActual)
   }
 
   test("Hive DoubleColumnVector: Double") {
@@ -243,7 +243,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getDouble(rowId)
       }
     }
-    testDoubleColumnVector(100)(genExpected)(genActual)
+    testDoubleColumnVector(100, DoubleType)(genExpected)(genActual)
   }
 
   test("Hive BytesColumnVector: Binary") {
@@ -253,7 +253,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getBinary(rowId).toSeq
       }
     }
-    testBytesColumnVector(100)(genExpected)(genActual)
+    testBytesColumnVector(100, BinaryType)(genExpected)(genActual)
   }
 
   test("Hive BytesColumnVector: String") {
@@ -266,7 +266,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {
         col.getUTF8String(rowId)
       }
     }
-    testBytesColumnVector(100)(genExpected)(genActual)
+    testBytesColumnVector(100, StringType)(genExpected)(genActual)
   }
 
   test("Hive DecimalColumnVector") {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/vectorized/VectorizedSparkOrcNewRecordReaderSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/vectorized/VectorizedSparkOrcNewRecordReaderSuite.scala

Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`80`	`80`	`}`
`81`	`81`	`}`
`82`	`82`
`83`		`- private def testLongColumnVector[T](num: Int)`
	`83`	`+ private def testLongColumnVector[T](num: Int, dt: DataType)`
`84`	`84`	`(genExpected: (Seq[Long] => Seq[T]))`
`85`	`85`	`(genActual: (OrcColumnVector, Int) => Seq[T]): Unit = {`
`86`	`86`	`val seed = System.currentTimeMillis()`
`@@ -96,12 +96,12 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`96`	`96`
`97`	`97`	`val expected = genExpected(data)`
`98`	`98`
`99`		`- val orcCol = new OrcColumnVector(lv)`
	`99`	`+ val orcCol = new OrcColumnVector(lv, dt)`
`100`	`100`	`val actual = genActual(orcCol, num)`
`101`	`101`	`assert(actual === expected)`
`102`	`102`	`}`
`103`	`103`
`104`		`- private def testDoubleColumnVector[T](num: Int)`
	`104`	`+ private def testDoubleColumnVector[T](num: Int, dt: DataType)`
`105`	`105`	`(genExpected: (Seq[Double] => Seq[T]))`
`106`	`106`	`(genActual: (OrcColumnVector, Int) => Seq[T]): Unit = {`
`107`	`107`	`val seed = System.currentTimeMillis()`
`@@ -117,12 +117,12 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`117`	`117`
`118`	`118`	`val expected = genExpected(data)`
`119`	`119`
`120`		`- val orcCol = new OrcColumnVector(lv)`
	`120`	`+ val orcCol = new OrcColumnVector(lv, dt)`
`121`	`121`	`val actual = genActual(orcCol, num)`
`122`	`122`	`assert(actual === expected)`
`123`	`123`	`}`
`124`	`124`
`125`		`- private def testBytesColumnVector[T](num: Int)`
	`125`	`+ private def testBytesColumnVector[T](num: Int, dt: DataType)`
`126`	`126`	`(genExpected: (Seq[Seq[Byte]] => Seq[T]))`
`127`	`127`	`(genActual: (OrcColumnVector, Int) => Seq[T]): Unit = {`
`128`	`128`	`val seed = System.currentTimeMillis()`
`@@ -139,7 +139,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`139`	`139`
`140`	`140`	`val expected = genExpected(data)`
`141`	`141`
`142`		`- val orcCol = new OrcColumnVector(lv)`
	`142`	`+ val orcCol = new OrcColumnVector(lv, dt)`
`143`	`143`	`val actual = genActual(orcCol, num)`
`144`	`144`	`actual.zip(expected).foreach { case (a, e) =>`
`145`	`145`	`assert(a === e)`
`@@ -168,7 +168,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`168`	`168`
`169`	`169`	`val expected = genExpected(data)`
`170`	`170`
`171`		`- val orcCol = new OrcColumnVector(lv)`
	`171`	`+ val orcCol = new OrcColumnVector(lv, decimalType)`
`172`	`172`	`val actual = genActual(orcCol, num, decimalType.precision, decimalType.scale)`
`173`	`173`	`actual.zip(expected).foreach { case (a, e) =>`
`174`	`174`	`assert(a.compareTo(e) == 0)`
`@@ -183,7 +183,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`183`	`183`	`col.getBoolean(rowId)`
`184`	`184`	`}`
`185`	`185`	`}`
`186`		`- testLongColumnVector(100)(genExpected)(genActual)`
	`186`	`+ testLongColumnVector(100, BooleanType)(genExpected)(genActual)`
`187`	`187`	`}`
`188`	`188`
`189`	`189`	`test("Hive LongColumnVector: Int") {`
`@@ -193,7 +193,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`193`	`193`	`col.getInt(rowId)`
`194`	`194`	`}`
`195`	`195`	`}`
`196`		`- testLongColumnVector(100)(genExpected)(genActual)`
	`196`	`+ testLongColumnVector(100, IntegerType)(genExpected)(genActual)`
`197`	`197`	`}`
`198`	`198`
`199`	`199`	`test("Hive LongColumnVector: Byte") {`
`@@ -203,7 +203,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`203`	`203`	`col.getByte(rowId)`
`204`	`204`	`}`
`205`	`205`	`}`
`206`		`- testLongColumnVector(100)(genExpected)(genActual)`
	`206`	`+ testLongColumnVector(100, ByteType)(genExpected)(genActual)`
`207`	`207`	`}`
`208`	`208`
`209`	`209`	`test("Hive LongColumnVector: Short") {`
`@@ -213,7 +213,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`213`	`213`	`col.getShort(rowId)`
`214`	`214`	`}`
`215`	`215`	`}`
`216`		`- testLongColumnVector(100)(genExpected)(genActual)`
	`216`	`+ testLongColumnVector(100, ShortType)(genExpected)(genActual)`
`217`	`217`	`}`
`218`	`218`
`219`	`219`	`test("Hive LongColumnVector: Long") {`
`@@ -223,7 +223,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`223`	`223`	`col.getLong(rowId)`
`224`	`224`	`}`
`225`	`225`	`}`
`226`		`- testLongColumnVector(100)(genExpected)(genActual)`
	`226`	`+ testLongColumnVector(100, LongType)(genExpected)(genActual)`
`227`	`227`	`}`
`228`	`228`
`229`	`229`	`test("Hive DoubleColumnVector: Float") {`
`@@ -233,7 +233,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`233`	`233`	`col.getFloat(rowId)`
`234`	`234`	`}`
`235`	`235`	`}`
`236`		`- testDoubleColumnVector(100)(genExpected)(genActual)`
	`236`	`+ testDoubleColumnVector(100, FloatType)(genExpected)(genActual)`
`237`	`237`	`}`
`238`	`238`
`239`	`239`	`test("Hive DoubleColumnVector: Double") {`
`@@ -243,7 +243,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`243`	`243`	`col.getDouble(rowId)`
`244`	`244`	`}`
`245`	`245`	`}`
`246`		`- testDoubleColumnVector(100)(genExpected)(genActual)`
	`246`	`+ testDoubleColumnVector(100, DoubleType)(genExpected)(genActual)`
`247`	`247`	`}`
`248`	`248`
`249`	`249`	`test("Hive BytesColumnVector: Binary") {`
`@@ -253,7 +253,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`253`	`253`	`col.getBinary(rowId).toSeq`
`254`	`254`	`}`
`255`	`255`	`}`
`256`		`- testBytesColumnVector(100)(genExpected)(genActual)`
	`256`	`+ testBytesColumnVector(100, BinaryType)(genExpected)(genActual)`
`257`	`257`	`}`
`258`	`258`
`259`	`259`	`test("Hive BytesColumnVector: String") {`
`@@ -266,7 +266,7 @@ class OrcColumnVectorSuite extends SparkFunSuite {`
`266`	`266`	`col.getUTF8String(rowId)`
`267`	`267`	`}`
`268`	`268`	`}`
`269`		`- testBytesColumnVector(100)(genExpected)(genActual)`
	`269`	`+ testBytesColumnVector(100, StringType)(genExpected)(genActual)`
`270`	`270`	`}`
`271`	`271`
`272`	`272`	`test("Hive DecimalColumnVector") {`