From 1d0703466dbd8c3da420b2735d264580901cdc75 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 18 Jun 2016 15:47:11 +0900 Subject: [PATCH 01/75] Implementation of GenericArrayData specialized for primitive type array add unit tests --- .../sql/catalyst/util/GenericArrayData.scala | 443 +++++++++++++++++- .../catalyst/util/GenericArrayDataSuite.scala | 108 +++++ 2 files changed, 546 insertions(+), 5 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 7ee9581b63af..657db2223638 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -23,17 +23,38 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.{DataType, Decimal} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +object GenericArrayData { + def allocate(seq: Seq[Any]): GenericArrayData = new GenericArrayData(seq) + def allocate(list: java.util.List[Any]): GenericArrayData = new GenericArrayData(list) + def allocate(seqOrArray: Any): GenericArrayData = new GenericArrayData(seqOrArray) + def allocate(primitiveArray: Array[Int]): GenericArrayData = + new GenericIntArrayData(primitiveArray) + def allocate(primitiveArray: Array[Long]): GenericArrayData = + new GenericLongArrayData(primitiveArray) + def allocate(primitiveArray: Array[Float]): GenericArrayData = + new GenericFloatArrayData(primitiveArray) + def allocate(primitiveArray: Array[Double]): GenericArrayData = + new GenericDoubleArrayData(primitiveArray) + def allocate(primitiveArray: Array[Short]): GenericArrayData = + new GenericShortArrayData(primitiveArray) + def allocate(primitiveArray: Array[Byte]): GenericArrayData = + new GenericByteArrayData(primitiveArray) + def allocate(primitiveArray: Array[Boolean]): GenericArrayData = + new GenericBooleanArrayData(primitiveArray) +} + private object GenericArrayData { // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. 
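  // anyToSeq (below) normalizes an untyped Seq-or-Array value into a Seq[Any]; matching
  // on Array[_] rather than Array[Any] also accepts primitive arrays, whose elements are
  // boxed when accessed through the resulting Seq. The allocate() overloads above avoid
  // that boxing. As a hypothetical caller sketch (assuming the companion object compiles
  // as shown), overload resolution is static on the argument's element type:
  //   GenericArrayData.allocate(Array(1, 2, 3))  // Array[Int] overload, unboxed storage
  //   GenericArrayData.allocate(Seq(1, 2, 3))    // Seq[Any] overload, boxed storage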
def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { case seq: Seq[Any] => seq case array: Array[_] => array.toSeq + case _ => Seq.empty } } -class GenericArrayData(val array: Array[Any]) extends ArrayData { +class GenericArrayData(private val _array: Array[Any]) extends ArrayData { def this(seq: Seq[Any]) = this(seq.toArray) def this(list: java.util.List[Any]) = this(list.asScala) @@ -49,11 +70,13 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) + override def array(): Array[Any] = _array + override def copy(): ArrayData = new GenericArrayData(array.clone()) override def numElements(): Int = array.length - private def getAs[T](ordinal: Int) = array(ordinal).asInstanceOf[T] + private def getAs[T](ordinal: Int) = _array(ordinal).asInstanceOf[T] override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null override def get(ordinal: Int, elementType: DataType): AnyRef = getAs(ordinal) override def getBoolean(ordinal: Int): Boolean = getAs(ordinal) @@ -94,8 +117,8 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { return false } if (!isNullAt(i)) { - val o1 = array(i) - val o2 = other.array(i) + val o1 = _array(i) + val o2 = other._array(i) o1 match { case b1: Array[Byte] => if (!o2.isInstanceOf[Array[Byte]] || @@ -129,7 +152,7 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { if (isNullAt(i)) { 0 } else { - array(i) match { + _array(i) match { case b: Boolean => if (b) 0 else 1 case b: Byte => b.toInt case s: Short => s.toInt @@ -149,3 +172,413 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { result } } + +final class GenericIntArrayData(private val primitiveArray: Array[Int]) extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericIntArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getInt(ordinal: Int): Int = primitiveArray(ordinal) + override def toIntArray(): Array[Int] = { + val array = new Array[Int](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericIntArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericIntArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = primitiveArray(i) + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericLongArrayData(private val primitiveArray: Array[Long]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericLongArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getLong(ordinal: Int): Long = primitiveArray(ordinal) + override def toLongArray(): Array[Long] = { + val array = new Array[Long](numElements) + 
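+    // Defensive copy: hand back a fresh Array[Long] so that mutating the result
+    // cannot corrupt primitiveArray, the backing store of this GenericLongArrayData.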
System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericLongArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericLongArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val l = primitiveArray(i) + val update: Int = (l ^ (l >>> 32)).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericFloatArrayData(private val primitiveArray: Array[Float]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericFloatArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getFloat(ordinal: Int): Float = primitiveArray(ordinal) + override def toFloatArray(): Array[Float] = { + val array = new Array[Float](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericFloatArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericFloatArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (java.lang.Float.isNaN(o1)) { + if (!java.lang.Float.isNaN(o2)) { + return false; + } + } else if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val f = primitiveArray(i) + val update: Int = java.lang.Float.floatToIntBits(f) + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericDoubleArrayData(private val primitiveArray: Array[Double]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericDoubleArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getDouble(ordinal: Int): Double = primitiveArray(ordinal) + override def toDoubleArray(): Array[Double] = { + val array = new Array[Double](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericDoubleArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericDoubleArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (java.lang.Double.isNaN(o1)) { + if (!java.lang.Double.isNaN(o2)) { + return false; + } + } else if (o1 != o2) { + return false + } + i += 1 + } + true 
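+    // NaN elements were compared via isNaN above, so two NaNs at the same position
+    // count as equal, unlike the primitive == operator; this keeps equals()
+    // consistent with hashCode(), which canonicalizes NaN via doubleToLongBits.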
+ } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val d = primitiveArray(i) + val b = java.lang.Double.doubleToLongBits(d) + val update: Int = (b ^ (b >>> 32)).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericShortArrayData(private val primitiveArray: Array[Short]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericShortArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getShort(ordinal: Int): Short = primitiveArray(ordinal) + override def toShortArray(): Array[Short] = { + val array = new Array[Short](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericShortArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericShortArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = primitiveArray(i).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericByteArrayData(private val primitiveArray: Array[Byte]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericByteArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getByte(ordinal: Int): Byte = primitiveArray(ordinal) + override def toByteArray(): Array[Byte] = { + val array = new Array[Byte](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericByteArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericByteArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = primitiveArray(i).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericBooleanArrayData(private val primitiveArray: Array[Boolean]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericBooleanArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getBoolean(ordinal: Int): Boolean = primitiveArray(ordinal) + override def toBooleanArray(): Array[Boolean] = { + val array = new Array[Boolean](numElements) + 
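+    // Like the other specialized variants, this copies out rather than exposing
+    // primitiveArray directly; copy() above, by contrast, shares the backing array,
+    // which stays safe only as long as no caller mutates an ArrayData in place.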
System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericBooleanArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericBooleanArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = if (primitiveArray(i)) 1 else 0 + result = 37 * result + update + i += 1 + } + result + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala new file mode 100644 index 000000000000..8f16fa17fc71 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.SparkFunSuite + import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData + +class GenericArrayDataSuite extends SparkFunSuite { + + test("from primitive boolean array") { + val primitiveArray = Array(true, false, true) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getBoolean(0) == primitiveArray(0)) + assert(array.getBoolean(1) == primitiveArray(1)) + assert(array.getBoolean(2) == primitiveArray(2)) + assert(array.toBooleanArray()(0) == primitiveArray(0)) + } + + test("from primitive byte array") { + val primitiveArray = Array(1.toByte, 10.toByte, 100.toByte) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getByte(0) == primitiveArray(0)) + assert(array.getByte(1) == primitiveArray(1)) + assert(array.getByte(2) == primitiveArray(2)) + assert(array.toByteArray()(0) == primitiveArray(0)) + } + + test("from primitive short array") { + val primitiveArray = Array[Short](1.toShort, 100.toShort, 10000.toShort) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getShort(0) == primitiveArray(0)) + assert(array.getShort(1) == primitiveArray(1)) + assert(array.getShort(2) == primitiveArray(2)) + assert(array.toShortArray()(0) == primitiveArray(0)) + } + + test("from primitive int array") { + val primitiveArray = Array(1, 1000, 1000000) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getInt(0) == primitiveArray(0)) + assert(array.getInt(1) == primitiveArray(1)) + assert(array.getInt(2) == primitiveArray(2)) + assert(array.toIntArray()(0) == primitiveArray(0)) + } + + test("from primitive long array") { + val primitiveArray = Array(1L, 100000L, 10000000000L) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getLong(0) == primitiveArray(0)) + assert(array.getLong(1) == primitiveArray(1)) + assert(array.getLong(2) == primitiveArray(2)) + assert(array.toLongArray()(0) == primitiveArray(0)) + } + + test("from primitive float array") { + val primitiveArray = Array(1.1f, 2.2f, 3.3f) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getFloat(0) == primitiveArray(0)) + assert(array.getFloat(1) == primitiveArray(1)) + assert(array.getFloat(2) == primitiveArray(2)) + assert(array.toFloatArray()(0) == primitiveArray(0)) + } + + test("from primitive double array") { + val primitiveArray = Array(1.1, 2.2, 3.3) + val array = GenericArrayData.allocate(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getDouble(0) == primitiveArray(0)) + assert(array.getDouble(1) == 
primitiveArray(1)) + assert(array.getDouble(2) == primitiveArray(2)) + assert(array.toDoubleArray()(0) == primitiveArray(0)) + } +} From 7b48a30b1d8ba0e1ebc325f7975b5b3568dacad1 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 18 Jun 2016 22:04:52 +0900 Subject: [PATCH 02/75] fix scala style error --- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 657db2223638..ad1cb5721475 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -582,3 +582,4 @@ final class GenericBooleanArrayData(private val primitiveArray: Array[Boolean]) result } } + From b5876a6cca49218e3ac774475b65716dd3724c57 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 02:38:01 +0900 Subject: [PATCH 03/75] Introduce GenericRefArrayData --- .../sql/catalyst/util/GenericArrayData.scala | 67 ++++++++++++++----- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index ad1cb5721475..2a1db2af68e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -24,9 +24,9 @@ import org.apache.spark.sql.types.{DataType, Decimal} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { - def allocate(seq: Seq[Any]): GenericArrayData = new GenericArrayData(seq) - def allocate(list: java.util.List[Any]): GenericArrayData = new GenericArrayData(list) - def allocate(seqOrArray: Any): GenericArrayData = new GenericArrayData(seqOrArray) + def allocate(seq: Seq[Any]): GenericArrayData = new GenericRefArrayData(seq) + def allocate(list: java.util.List[Any]): GenericArrayData = new GenericRefArrayData(list) + def allocate(seqOrArray: Any): GenericArrayData = new GenericRefArrayData(seqOrArray) def allocate(primitiveArray: Array[Int]): GenericArrayData = new GenericIntArrayData(primitiveArray) def allocate(primitiveArray: Array[Long]): GenericArrayData = @@ -54,7 +54,40 @@ private object GenericArrayData { } -class GenericArrayData(private val _array: Array[Any]) extends ArrayData { +abstract class GenericArrayData extends ArrayData { + override def get(ordinal: Int, elementType: DataType): AnyRef = + throw new UnsupportedOperationException("get() method is not supported") + override def getBoolean(ordinal: Int): Boolean = + throw new UnsupportedOperationException("getBoolean() method is not supported") + override def getByte(ordinal: Int): Byte = + throw new UnsupportedOperationException("getByte() method is not supported") + override def getShort(ordinal: Int): Short = + throw new UnsupportedOperationException("getShort() method is not supported") + override def getInt(ordinal: Int): Int = + throw new UnsupportedOperationException("getInt() method is not supported") + override def getLong(ordinal: Int): Long = + throw new UnsupportedOperationException("getLong() method is not supported") + override def getFloat(ordinal: Int): Float = + throw new UnsupportedOperationException("getFloat() 
method is not supported") + override def getDouble(ordinal: Int): Double = + throw new UnsupportedOperationException("getDouble() method is not supported") + override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = + throw new UnsupportedOperationException("getDecimal() method is not supported") + override def getUTF8String(ordinal: Int): UTF8String = + throw new UnsupportedOperationException("getUTF8String() method is not supported") + override def getBinary(ordinal: Int): Array[Byte] = + throw new UnsupportedOperationException("getBinary() method is not supported") + override def getInterval(ordinal: Int): CalendarInterval = + throw new UnsupportedOperationException("getInterval() method is not supported") + override def getStruct(ordinal: Int, numFields: Int): InternalRow = + throw new UnsupportedOperationException("getStruct() method is not supported") + override def getArray(ordinal: Int): ArrayData = + throw new UnsupportedOperationException("getArray() method is not supported") + override def getMap(ordinal: Int): MapData = + throw new UnsupportedOperationException("getMap() method is not supported") +} + +final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData { def this(seq: Seq[Any]) = this(seq.toArray) def this(list: java.util.List[Any]) = this(list.asScala) @@ -70,13 +103,11 @@ class GenericArrayData(private val _array: Array[Any]) extends ArrayData { def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) - override def array(): Array[Any] = _array - - override def copy(): ArrayData = new GenericArrayData(array.clone()) + override def copy(): ArrayData = new GenericRefArrayData(array.clone()) override def numElements(): Int = array.length - private def getAs[T](ordinal: Int) = _array(ordinal).asInstanceOf[T] + private def getAs[T](ordinal: Int) = array(ordinal).asInstanceOf[T] override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null override def get(ordinal: Int, elementType: DataType): AnyRef = getAs(ordinal) override def getBoolean(ordinal: Int): Boolean = getAs(ordinal) @@ -117,8 +148,8 @@ class GenericArrayData(private val _array: Array[Any]) extends ArrayData { return false } if (!isNullAt(i)) { - val o1 = _array(i) - val o2 = other._array(i) + val o1 = array(i) + val o2 = other.array(i) o1 match { case b1: Array[Byte] => if (!o2.isInstanceOf[Array[Byte]] || @@ -152,7 +183,7 @@ class GenericArrayData(private val _array: Array[Any]) extends ArrayData { if (isNullAt(i)) { 0 } else { - _array(i) match { + array(i) match { case b: Boolean => if (b) 0 else 1 case b: Byte => b.toInt case s: Short => s.toInt @@ -173,7 +204,7 @@ class GenericArrayData(private val _array: Array[Any]) extends ArrayData { } } -final class GenericIntArrayData(private val primitiveArray: Array[Int]) extends GenericArrayData { +final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray override def copy(): ArrayData = new GenericIntArrayData(primitiveArray) @@ -229,7 +260,7 @@ final class GenericIntArrayData(private val primitiveArray: Array[Int]) extends } } -final class GenericLongArrayData(private val primitiveArray: Array[Long]) +final class GenericLongArrayData(val primitiveArray: Array[Long]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -287,7 +318,7 @@ final class GenericLongArrayData(private val primitiveArray: Array[Long]) } } -final class GenericFloatArrayData(private val 
primitiveArray: Array[Float]) +final class GenericFloatArrayData(val primitiveArray: Array[Float]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -349,7 +380,7 @@ final class GenericFloatArrayData(private val primitiveArray: Array[Float]) } } -final class GenericDoubleArrayData(private val primitiveArray: Array[Double]) +final class GenericDoubleArrayData(val primitiveArray: Array[Double]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -412,7 +443,7 @@ final class GenericDoubleArrayData(private val primitiveArray: Array[Double]) } } -final class GenericShortArrayData(private val primitiveArray: Array[Short]) +final class GenericShortArrayData(val primitiveArray: Array[Short]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -469,7 +500,7 @@ final class GenericShortArrayData(private val primitiveArray: Array[Short]) } } -final class GenericByteArrayData(private val primitiveArray: Array[Byte]) +final class GenericByteArrayData(val primitiveArray: Array[Byte]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -526,7 +557,7 @@ final class GenericByteArrayData(private val primitiveArray: Array[Byte]) } } -final class GenericBooleanArrayData(private val primitiveArray: Array[Boolean]) +final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray From ac5b73b81375b63748fca7bed286d8a715c6e165 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 02:48:04 +0900 Subject: [PATCH 04/75] replace 'new GenericArrayData' with 'GenericArrayData.allocate' --- .../spark/sql/catalyst/CatalystTypeConverters.scala | 8 ++++---- .../apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- .../apache/spark/sql/catalyst/expressions/Cast.scala | 4 ++-- .../catalyst/expressions/aggregate/PivotFirst.scala | 2 +- .../sql/catalyst/expressions/aggregate/collect.scala | 2 +- .../expressions/codegen/GenerateSafeProjection.scala | 2 +- .../catalyst/expressions/collectionOperations.scala | 2 +- .../sql/catalyst/expressions/complexTypeCreator.scala | 11 ++++++----- .../catalyst/expressions/complexTypeExtractors.scala | 4 ++-- .../sql/catalyst/expressions/objects/objects.scala | 2 +- .../sql/catalyst/expressions/regexpExpressions.scala | 4 ++-- .../spark/sql/catalyst/json/JacksonParser.scala | 2 +- .../sql/catalyst/analysis/AnalysisErrorSuite.scala | 4 ++-- .../catalyst/encoders/EncoderResolutionSuite.scala | 4 ++-- .../spark/sql/catalyst/encoders/RowEncoderSuite.scala | 2 +- .../expressions/UnsafeRowConverterSuite.scala | 2 +- .../codegen/GeneratedProjectionSuite.scala | 2 +- .../datasources/parquet/ParquetRowConverter.scala | 4 ++-- .../spark/sql/execution/python/EvaluatePython.scala | 4 ++-- .../org/apache/spark/sql/test/ExamplePointUDT.scala | 2 +- .../scala/org/apache/spark/sql/UnsafeRowSuite.scala | 2 +- .../org/apache/spark/sql/UserDefinedTypeSuite.scala | 2 +- .../sql/execution/columnar/ColumnarTestUtils.scala | 2 +- .../apache/spark/sql/hive/HiveInspectorSuite.scala | 2 +- 24 files changed, 40 insertions(+), 39 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 5b9161551a7a..2801827e7bb6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -159,9 +159,9 @@ object CatalystTypeConverters { override def toCatalystImpl(scalaValue: Any): ArrayData = { scalaValue match { case a: Array[_] => - new GenericArrayData(a.map(elementConverter.toCatalyst)) + GenericArrayData.allocate(a.map(elementConverter.toCatalyst)) case s: Seq[_] => - new GenericArrayData(s.map(elementConverter.toCatalyst).toArray) + GenericArrayData.allocate(s.map(elementConverter.toCatalyst).toArray) case i: JavaIterable[_] => val iter = i.iterator val convertedIterable = scala.collection.mutable.ArrayBuffer.empty[Any] @@ -169,7 +169,7 @@ object CatalystTypeConverters { val item = iter.next() convertedIterable += elementConverter.toCatalyst(item) } - new GenericArrayData(convertedIterable.toArray) + GenericArrayData.allocate(convertedIterable.toArray) } } @@ -410,7 +410,7 @@ object CatalystTypeConverters { case t: Timestamp => TimestampConverter.toCatalyst(t) case d: BigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) case d: JavaBigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) - case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray) + case seq: Seq[Any] => GenericArrayData.allocate(seq.map(convertToCatalyst).toArray) case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*) case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst)) case map: Map[_, _] => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7ea2f7..68df41b5802d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericArrayData], + classOf[GenericRefArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 4db1ae6faa15..039b4469b99e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -388,7 +388,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w values(i) = elementCast(e) } }) - new GenericArrayData(values) + GenericArrayData.allocate(values) }) } @@ -864,7 +864,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w } } } - $evPrim = new $arrayClass($values); + $evPrim = $arrayClass.allocate($values); """ } diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala index 087606077295..23a8d5fd4903 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala @@ -131,7 +131,7 @@ case class PivotFirst( for (i <- 0 until indexSize) { result(i) = input.get(mutableAggBufferOffset + i, valueDataType) } - new GenericArrayData(result) + GenericArrayData.allocate(result) } override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index d2880d58aefe..f97fe5fe8d51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -78,7 +78,7 @@ abstract class Collect extends ImperativeAggregate { } override def eval(input: InternalRow): Any = { - new GenericArrayData(buffer.toArray) + GenericArrayData.allocate(buffer.toArray) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index b1cb6edefb85..792d735a0d5a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -96,7 +96,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] $values[$index] = ${elementConverter.value}; } } - final ArrayData $output = new $arrayClass($values); + final ArrayData $output = $arrayClass.allocate($values); """ ExprCode(code, "false", output) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index c863ba434120..1d10a9034ab7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -206,7 +206,7 @@ case class SortArray(base: Expression, ascendingOrder: Expression) if (elementType != NullType) { java.util.Arrays.sort(data, if (ascending.asInstanceOf[Boolean]) lt else gt) } - new GenericArrayData(data.asInstanceOf[Array[Any]]) + GenericArrayData.allocate(data.asInstanceOf[Array[Any]]) } override def prettyName: String = "sort_array" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index c9f36649ec8e..640c32628cdd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -52,7 +52,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { override def 
nullable: Boolean = false override def eval(input: InternalRow): Any = { - new GenericArrayData(children.map(_.eval(input)).toArray) + GenericArrayData.allocate(children.map(_.eval(input)).toArray) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -76,7 +76,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { """ }) + s""" - final ArrayData ${ev.value} = new $arrayClass($values); + final ArrayData ${ev.value} = $arrayClass.allocate($values); this.$values = null; """) } @@ -130,7 +130,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { throw new RuntimeException("Cannot use null as map key!") } val valueArray = values.map(_.eval(input)).toArray - new ArrayBasedMapData(new GenericArrayData(keyArray), new GenericArrayData(valueArray)) + new ArrayBasedMapData( + GenericArrayData.allocate(keyArray), GenericArrayData.allocate(valueArray)) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -141,8 +142,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { ctx.addMutableState("Object[]", keyArray, s"this.$keyArray = null;") ctx.addMutableState("Object[]", valueArray, s"this.$valueArray = null;") - val keyData = s"new $arrayClass($keyArray)" - val valueData = s"new $arrayClass($valueArray)" + val keyData = s"$arrayClass.allocate($keyArray)" + val valueData = s"$arrayClass.allocate($valueArray)" ev.copy(code = s""" final boolean ${ev.isNull} = false; $keyArray = new Object[${keys.size}]; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index 0c256c3d890f..f17d0bc412b1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -176,7 +176,7 @@ case class GetArrayStructFields( } i += 1 } - new GenericArrayData(result) + GenericArrayData.allocate(result) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -201,7 +201,7 @@ case class GetArrayStructFields( } } } - ${ev.value} = new $arrayClass($values); + ${ev.value} = $arrayClass.allocate($values); """ }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 50e2ac3c36d9..b12f944566b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -537,7 +537,7 @@ case class MapObjects private( $loopIndex += 1; } - ${ev.value} = new ${classOf[GenericArrayData].getName}($convertedArray); + ${ev.value} = ${classOf[GenericArrayData].getName}.allocate($convertedArray); } """ ev.copy(code = code, isNull = genInputData.isNull) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 5648ad6b6dc1..6aebe7970443 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -191,14 +191,14 @@ case class StringSplit(str: 
Expression, pattern: Expression) override def nullSafeEval(string: Any, regex: Any): Any = { val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) - new GenericArrayData(strings.asInstanceOf[Array[Any]]) + GenericArrayData.allocate(strings.asInstanceOf[Array[Any]]) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, pattern) => // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. - s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""") + s"""${ev.value} = $arrayClass.allocate($str.split($pattern, -1));""") } override def prettyName: String = "split" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index e476cb11a351..ec87133c9d11 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -404,7 +404,7 @@ class JacksonParser( values += fieldConverter.apply(parser) } - new GenericArrayData(values.toArray) + GenericArrayData.allocate(values.toArray) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 21afe9fec594..6e59215dc8c6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -57,8 +57,8 @@ private[sql] class UngroupableUDT extends UserDefinedType[UngroupableData] { override def sqlType: DataType = MapType(IntegerType, IntegerType) override def serialize(ungroupableData: UngroupableData): MapData = { - val keyArray = new GenericArrayData(ungroupableData.data.keys.toSeq) - val valueArray = new GenericArrayData(ungroupableData.data.values.toSeq) + val keyArray = GenericArrayData.allocate(ungroupableData.data.keys.toSeq) + val valueArray = GenericArrayData.allocate(ungroupableData.data.values.toSeq) new ArrayBasedMapData(keyArray, valueArray) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala index 802397d50e85..49934354e5de 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala @@ -70,11 +70,11 @@ class EncoderResolutionSuite extends PlanTest { val bound = encoder.resolveAndBind(attrs) // If no null values appear, it should works fine - bound.fromRow(InternalRow(new GenericArrayData(Array(1, 2)))) + bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, 2)))) // If there is null value, it should throw runtime exception val e = intercept[RuntimeException] { - bound.fromRow(InternalRow(new GenericArrayData(Array(1, null)))) + bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, null)))) } assert(e.getMessage.contains("Null value appeared in non-nullable field")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index 1a5569a77dc7..46575f7d63eb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -51,7 +51,7 @@ class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - new GenericArrayData(output) + GenericArrayData.allocate(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index cf3cbe270753..b634834c67e3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -291,7 +291,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(unsafeRow.getSizeInBytes == 8 + 2 * 8 + row1.getSizeInBytes + row2.getSizeInBytes) } - private def createArray(values: Any*): ArrayData = new GenericArrayData(values.toArray) + private def createArray(values: Any*): ArrayData = GenericArrayData.allocate(values.toArray) private def createMap(keys: Any*)(values: Any*): MapData = { assert(keys.length == values.length) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index b69b74b4240b..d6c9a9c0b638 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -86,7 +86,7 @@ class GeneratedProjectionSuite extends SparkFunSuite { test("generated unsafe projection with array of binary") { val row = InternalRow( Array[Byte](1, 2), - new GenericArrayData(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) + GenericArrayData.allocate(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) val fields = (BinaryType :: ArrayType(BinaryType) :: Nil).toArray[DataType] val unsafeProj = UnsafeProjection.create(fields) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 33dcf2f3fd16..108977c23ec3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -491,7 +491,7 @@ private[parquet] class ParquetRowConverter( override def getConverter(fieldIndex: Int): Converter = elementConverter - override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) + override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the // next value. 
`Row.copy()` only copies row cells, it doesn't do deep copy to objects stored @@ -590,7 +590,7 @@ private[parquet] class ParquetRowConverter( protected def newArrayUpdater(updater: ParentContainerUpdater) = new ParentContainerUpdater { override def start(): Unit = currentArray = ArrayBuffer.empty[Any] - override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) + override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) override def set(value: Any): Unit = currentArray += value } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 46fd54e5c742..0e496dfd29e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -119,10 +119,10 @@ object EvaluatePython { case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c case (c: java.util.List[_], ArrayType(elementType, _)) => - new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) + GenericArrayData.allocate(c.asScala.map { e => fromJava(e, elementType)}.toArray) case (c, ArrayType(elementType, _)) if c.getClass.isArray => - new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + GenericArrayData.allocate(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => ArrayBasedMapData( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index a73e4272950a..6a8a5e060fd8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -49,7 +49,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - new GenericArrayData(output) + GenericArrayData.allocate(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index a32763db054f..c002dfcf4908 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -160,7 +160,7 @@ class UnsafeRowSuite extends SparkFunSuite { } test("calling hashCode on unsafe array returned by getArray(ordinal)") { - val row = InternalRow.apply(new GenericArrayData(Array(1L))) + val row = InternalRow.apply(GenericArrayData.allocate(Array(1L))) val unsafeRow = UnsafeProjection.create(Array[DataType](ArrayType(LongType))).apply(row) // Makes sure hashCode on unsafe array won't crash unsafeRow.getArray(0).hashCode() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 474f17ff7afb..17ec9315e4a6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -50,7 +50,7 @@ object UDT { override def sqlType: DataType = ArrayType(DoubleType, containsNull = false) override def serialize(features: MyDenseVector): 
ArrayData = {
-    new GenericArrayData(features.data.map(_.asInstanceOf[Any]))
+    GenericArrayData.allocate(features.data.map(_.asInstanceOf[Any]))
   }
 
   override def deserialize(datum: Any): MyDenseVector = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala
index 686c8fa6f5fa..e590d2833477 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala
@@ -56,7 +56,7 @@ object ColumnarTestUtils {
     case STRUCT(_) =>
       new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
     case ARRAY(_) =>
-      new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
+      GenericArrayData.allocate(Array[Any](Random.nextInt(), Random.nextInt()))
     case MAP(_) =>
       ArrayBasedMapData(
         Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index 3de1f4aeb74d..8cadaeedea69 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -229,7 +229,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
 
   test("wrap / unwrap Array Type") {
     val dt = ArrayType(dataTypes(0))
-    val d = new GenericArrayData(Array(row(0), row(0)))
+    val d = GenericArrayData.allocate(Array(row(0), row(0)))
     checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt)))
     checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt)))
     checkValue(d,

From dab5a8ca3ad5993adf4117a4b35e86406ec5659e Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 22 Jun 2016 12:22:23 +0900
Subject: [PATCH 05/75] Generate GenericArrayData.allocate in NewInstance()

---
 .../spark/sql/catalyst/expressions/objects/objects.scala | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
index b12f944566b4..80a0efe05fc6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
@@ -281,7 +281,11 @@ case class NewInstance(
     val constructorCall = outer.map { gen =>
       s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})"""
     }.getOrElse {
-      s"new $className(${argValues.mkString(", ")})"
+      if (cls != classOf[GenericArrayData]) {
+        s"new $className(${argValues.mkString(", ")})"
+      } else {
+        s"${cls.getName}.allocate(${argValues.mkString(", ")})"
+      }
     }
 
     val code = s"""

From a8fe2d8a1d01338d13ba9432362a5bf2c2cdd5cd Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 22 Jun 2016 14:15:51 +0900
Subject: [PATCH 06/75] initial version of Benchmark without performance
 numbers

---
 .../util/GenericArrayDataBenchmark.scala | 188 ++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala new file mode 100644 index 000000000000..5a3364c9c7a0 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.util.Benchmark + +/** + * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type + */ +object GenericArrayDataBenchmark { +/* + def allocateGenericIntArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var array: GenericArrayData = null + + val primitiveIntArray = new Array[Int](count) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + array = GenericArrayData.allocate(primitiveIntArray) + } + } + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + array = new GenericRefArrayData(primitiveIntArray) + } + } + + val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) + } + + def allocateGenericDoubleArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var array: GenericArrayData = null + + val primitiveDoubleArray = new Array[Int](count) + val denseDoubleArray = { i: Int => + for (n <- 0L until iters) { + array = GenericArrayData.allocate(primitiveDoubleArray) + } + } + val sparseDoubleArray = { i: Int => + for (n <- 0L until iters) { + array = new GenericRefArrayData(primitiveDoubleArray) + } + } + + val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters) + benchmark.addCase("Sparse")(sparseDoubleArray) + benchmark.addCase("Dense ")(denseDoubleArray) + } + + def getPrimitiveIntArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + + val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) + val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) + var primitiveIntArray: Array[Int] = null + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + primitiveIntArray = intSparseArray.toIntArray + } + } + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + primitiveIntArray = intDenseArray.toIntArray + } + } + + val benchmark = new Benchmark("Get int primitive array", count * iters) + benchmark.addCase("Sparse int")(sparseIntArray) + benchmark.addCase("Dense int")(denseIntArray) + } + + def getPrimitiveDoubleArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + + val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) + val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) + var 
primitiveDoubleArray: Array[Double] = null + val sparseDoubleArray = { i: Int => + for (n <- 0L until iters) { + primitiveDoubleArray = doubleSparseArray.toDoubleArray + } + } + val denseDoubleArray = { i: Int => + for (n <- 0L until iters) { + primitiveDoubleArray = doubleDenseArray.toDoubleArray + } + } + + val benchmark = new Benchmark("Get double primitive array", count * iters) + benchmark.addCase("Sparse double")(sparseDoubleArray) + benchmark.addCase("Dense double")(denseDoubleArray) + } + + def readGenericIntArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var result: Int = 0 + + val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = sparseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += sparseArray.getInt(i) + } + result = sum + } + } + + val denseArray = GenericArrayData.allocate(new Array[Int](count)) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = denseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += denseArray.getInt(i) + } + result = sum + } + } + + val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) + } + + def readGenericDoubleArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var result: Int = 0 + + val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = sparseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += sparseArray.getInt(i) + } + result = sum + } + } + + val denseArray = GenericArrayData.allocate(new Array[Int](count)) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = denseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += denseArray.getInt(i) + } + result = sum + } + } + + val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) + } + + def main(args: Array[String]): Unit = { + allocateGenericIntArray(1024) + allocateGenericDoubleArray(1024) + getPrimitiveIntArray(1024) + getPrimitiveDoubleArray(1024) + readGenericIntArray(512) + readGenericDoubleArray(512) + } +*/ +} From 870dc841c55e6157ca4e8c6f29717b67b17ed032 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 18:40:46 +0900 Subject: [PATCH 07/75] move Benchmark program to the sql/core project --- .../sql/execution/benchmark}/GenericArrayDataBenchmark.scala | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sql/{catalyst/src/main/scala/org/apache/spark/sql/catalyst/util => core/src/test/scala/org/apache/spark/sql/execution/benchmark}/GenericArrayDataBenchmark.scala (100%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala similarity index 100% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala From 7cdd5585143d7b858e3c3ea9e9d0e4ba8a004d72 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 18:41:08 +0900 Subject: [PATCH 08/75] update benchmark program
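PATCH 08 below rewrites the benchmark bodies from for-comprehensions into while loops, actually invokes each suite via benchmark.run, and fixes the copy-pasted Int-typed arrays in readGenericDoubleArray (the Array[Int] inside allocateGenericDoubleArray, however, survives unchanged). Every suite follows the same harness pattern; a self-contained sketch, assuming org.apache.spark.util.Benchmark as used throughout these patches, with illustrative names and sizes:

    import org.apache.spark.sql.catalyst.util.GenericArrayData
    import org.apache.spark.util.Benchmark

    object AllocateIntSketch {
      def main(args: Array[String]): Unit = {
        val count = 1024 * 1024
        val iters = 20
        val data = new Array[Int](count)
        // The second argument is the number of values processed per iteration;
        // it drives the Rate(M/s) and Per Row(ns) columns of the report.
        val benchmark = new Benchmark("Allocate GenericArrayData for int", count.toLong * iters)
        benchmark.addCase("Dense ") { _ =>
          var n = 0
          while (n < iters) { GenericArrayData.allocate(data); n += 1 }
        }
        benchmark.run()  // prints the table, each case measured relative to the first
      }
    }

--- .../benchmark/GenericArrayDataBenchmark.scala | 151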
++++++++++++------ 1 file changed, 106 insertions(+), 45 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 5a3364c9c7a0..0832830d6a87 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -15,174 +15,235 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.util +package org.apache.spark.sql.execution.benchmark +import scala.concurrent.duration._ + +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Benchmark /** * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type + * To run this: + * build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" + * + * Benchmarks in this file are skipped in normal builds. */ -object GenericArrayDataBenchmark { -/* +class GenericArrayDataBenchmark extends BenchmarkBase { + def allocateGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 var array: GenericArrayData = null val primitiveIntArray = new Array[Int](count) val denseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = GenericArrayData.allocate(primitiveIntArray) + n += 1 } } val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = new GenericRefArrayData(primitiveIntArray) + n += 1 } } - val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters) + val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters, + minNumIters = 10, minTime = 1.milliseconds) benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) + benchmark.run } def allocateGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 var array: GenericArrayData = null val primitiveDoubleArray = new Array[Int](count) val denseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = GenericArrayData.allocate(primitiveDoubleArray) + n += 1 } } val sparseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = new GenericRefArrayData(primitiveDoubleArray) + n += 1 } } - val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters) + val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters, + minNumIters = 10, minTime = 1.milliseconds) benchmark.addCase("Sparse")(sparseDoubleArray) benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.run } def getPrimitiveIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) var primitiveIntArray: Array[Int] = null val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveIntArray = intSparseArray.toIntArray + n += 1 } } val denseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveIntArray = intDenseArray.toIntArray + n += 1 } } val benchmark = new Benchmark("Get int primitive array", count * iters) benchmark.addCase("Sparse 
int")(sparseIntArray) benchmark.addCase("Dense int")(denseIntArray) + benchmark.run } def getPrimitiveDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null val sparseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveDoubleArray = doubleSparseArray.toDoubleArray + n += 1 } } val denseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveDoubleArray = doubleDenseArray.toDoubleArray + n += 1 } } val benchmark = new Benchmark("Get double primitive array", count * iters) benchmark.addCase("Sparse double")(sparseDoubleArray) benchmark.addCase("Dense double")(denseDoubleArray) + benchmark.run } def readGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 * 2 var result: Int = 0 val sparseArray = new GenericRefArrayData(new Array[Int](count)) val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { val len = sparseArray.numElements var sum = 0 - for (i <- 0 until len - 1) { + var i = 0 + while (i < len) { sum += sparseArray.getInt(i) + i += 1 } result = sum + n += 1 } } val denseArray = GenericArrayData.allocate(new Array[Int](count)) val denseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { val len = denseArray.numElements var sum = 0 - for (i <- 0 until len - 1) { + var i = 0 + while (i < len) { sum += denseArray.getInt(i) + i += 1 } result = sum + n += 1 } } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) + benchmark.run } def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - var result: Int = 0 + val count = 1024 * 1024 * 2 + var result: Double = 0 - val sparseArray = new GenericRefArrayData(new Array[Int](count)) - val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + val sparseArray = new GenericRefArrayData(new Array[Double](count)) + val sparseDoubleArray = { i: Int => + var n = 0 + while (n < iters) { val len = sparseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += sparseArray.getInt(i) + var sum = 0.toDouble + var i = 0 + while (i < len) { + sum += sparseArray.getDouble(i) + i += 1 } result = sum + n += 1 } } - val denseArray = GenericArrayData.allocate(new Array[Int](count)) - val denseIntArray = { i: Int => - for (n <- 0L until iters) { + val denseArray = GenericArrayData.allocate(new Array[Double](count)) + val denseDoubleArray = { i: Int => + var n = 0 + while (n < iters) { val len = denseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += denseArray.getInt(i) + var sum = 0.toDouble + var i = 0 + while (i < len) { + sum += denseArray.getDouble(i) + i += 1 } result = sum + n += 1 } } val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) + benchmark.addCase("Sparse")(sparseDoubleArray) + benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.run + } + + ignore("allocate GenericArrayData") { + allocateGenericIntArray(20) + allocateGenericDoubleArray(20) + } + + ignore("get primitive array") { + 
getPrimitiveIntArray(50) + getPrimitiveDoubleArray(50) + } + + test("read elements in GenericArrayData") { + readGenericIntArray(100) + readGenericDoubleArray(100) } def main(args: Array[String]): Unit = { - allocateGenericIntArray(1024) - allocateGenericDoubleArray(1024) - getPrimitiveIntArray(1024) - getPrimitiveDoubleArray(1024) - readGenericIntArray(512) - readGenericDoubleArray(512) + allocateGenericIntArray(20) + allocateGenericDoubleArray(20) + getPrimitiveIntArray(50) + getPrimitiveDoubleArray(50) + readGenericIntArray(20) + readGenericDoubleArray(20) } -*/ } From d28e256c03ab3e3260900a0c6dd06c7a37c70ad2 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 18:42:05 +0900 Subject: [PATCH 09/75] addressed review comments --- .../sql/catalyst/util/GenericArrayData.scala | 251 +++--------------- 1 file changed, 32 insertions(+), 219 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 2a1db2af68e1..18aed7bf527b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -25,19 +25,19 @@ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { def allocate(seq: Seq[Any]): GenericArrayData = new GenericRefArrayData(seq) - def allocate(list: java.util.List[Any]): GenericArrayData = new GenericRefArrayData(list) - def allocate(seqOrArray: Any): GenericArrayData = new GenericRefArrayData(seqOrArray) - def allocate(primitiveArray: Array[Int]): GenericArrayData = + def allocate(list: java.util.List[Any]): GenericRefArrayData = new GenericRefArrayData(list) + def allocate(seqOrArray: Any): GenericRefArrayData = new GenericRefArrayData(seqOrArray) + def allocate(primitiveArray: Array[Int]): GenericIntArrayData = new GenericIntArrayData(primitiveArray) - def allocate(primitiveArray: Array[Long]): GenericArrayData = + def allocate(primitiveArray: Array[Long]): GenericLongArrayData = new GenericLongArrayData(primitiveArray) - def allocate(primitiveArray: Array[Float]): GenericArrayData = + def allocate(primitiveArray: Array[Float]): GenericFloatArrayData = new GenericFloatArrayData(primitiveArray) - def allocate(primitiveArray: Array[Double]): GenericArrayData = + def allocate(primitiveArray: Array[Double]): GenericDoubleArrayData = new GenericDoubleArrayData(primitiveArray) - def allocate(primitiveArray: Array[Short]): GenericArrayData = + def allocate(primitiveArray: Array[Short]): GenericShortArrayData = new GenericShortArrayData(primitiveArray) - def allocate(primitiveArray: Array[Byte]): GenericArrayData = + def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) def allocate(primitiveArray: Array[Boolean]): GenericArrayData = new GenericBooleanArrayData(primitiveArray) @@ -85,6 +85,8 @@ abstract class GenericArrayData extends ArrayData { throw new UnsupportedOperationException("getArray() method is not supported") override def getMap(ordinal: Int): MapData = throw new UnsupportedOperationException("getMap() method is not supported") + + override def toString(): String = array.mkString("[", ",", "]") } final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData { @@ -103,7 +105,7 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData def this(seqOrArray: 
Any) = this(GenericArrayData.anyToSeq(seqOrArray)) - override def copy(): ArrayData = new GenericRefArrayData(array.clone()) + override def copy(): GenericRefArrayData = new GenericRefArrayData(array.clone()) override def numElements(): Int = array.length @@ -125,8 +127,6 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData override def getArray(ordinal: Int): ArrayData = getAs(ordinal) override def getMap(ordinal: Int): MapData = getAs(ordinal) - override def toString(): String = array.mkString("[", ",", "]") - override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericArrayData]) { return false @@ -207,7 +207,7 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericIntArrayData(primitiveArray) + override def copy(): GenericIntArrayData = new GenericIntArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -218,7 +218,6 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericIntArrayData]) { @@ -230,41 +229,17 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = primitiveArray(i) - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericLongArrayData(val primitiveArray: Array[Long]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericLongArrayData(primitiveArray) + override def copy(): GenericLongArrayData = new GenericLongArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -275,7 +250,6 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericLongArrayData]) { @@ -287,42 +261,17 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val l = primitiveArray(i) - val update: Int = (l ^ (l >>> 32)).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = 
java.util.Arrays.hashCode(primitiveArray) } final class GenericFloatArrayData(val primitiveArray: Array[Float]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericFloatArrayData(primitiveArray) + override def copy(): GenericFloatArrayData = new GenericFloatArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -333,7 +282,6 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericFloatArrayData]) { @@ -345,46 +293,17 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (java.lang.Float.isNaN(o1)) { - if (!java.lang.Float.isNaN(o2)) { - return false; - } - } else if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val f = primitiveArray(i) - val update: Int = java.lang.Float.floatToIntBits(f) - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericDoubleArrayData(val primitiveArray: Array[Double]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericDoubleArrayData(primitiveArray) + override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -395,7 +314,6 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericDoubleArrayData]) { @@ -407,47 +325,17 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (java.lang.Double.isNaN(o1)) { - if (!java.lang.Double.isNaN(o2)) { - return false; - } - } else if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val d = primitiveArray(i) - val b = java.lang.Double.doubleToLongBits(d) - val update: Int = (b ^ (b >>> 32)).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericShortArrayData(val primitiveArray: Array[Short]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericShortArrayData(primitiveArray) + override def copy(): GenericShortArrayData = new GenericShortArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -458,7 
+346,6 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericShortArrayData]) { @@ -470,41 +357,17 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = primitiveArray(i).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericByteArrayData(val primitiveArray: Array[Byte]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericByteArrayData(primitiveArray) + override def copy(): GenericByteArrayData = new GenericByteArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -515,7 +378,6 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericByteArrayData]) { @@ -527,41 +389,17 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = primitiveArray(i).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericBooleanArrayData(primitiveArray) + override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -572,7 +410,6 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericBooleanArrayData]) { @@ -584,33 +421,9 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - 
var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = if (primitiveArray(i)) 1 else 0 - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } From e95e137fe34f9e244152a132eec16be64c57fa7b Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 21:41:31 +0900 Subject: [PATCH 10/75] fix test failures --- .../apache/spark/sql/catalyst/expressions/objects/objects.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 80a0efe05fc6..2949d391a034 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -281,7 +281,7 @@ case class NewInstance( val constructorCall = outer.map { gen => s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})""" }.getOrElse { - if (!cls.isInstanceOf[GenericArrayData]) { + if (!cls.isAssignableFrom(classOf[GenericArrayData])) { s"new $className(${argValues.mkString(", ")})" } else { s"${cls.getName}.allocate(${argValues.mkString(", ")})" From 2b27bb37bb706ac3f0e2f2e6ac7eb5c5de8434c0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 02:03:53 +0900 Subject: [PATCH 11/75] Enable all benchmark suites and record performance data --- .../benchmark/GenericArrayDataBenchmark.scala | 52 ++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 0832830d6a87..e6019bd98233 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -56,6 +56,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 40 / 43 522.2 1.9 1.0X + Dense 0 / 0 209715200.0 0.0 401598.7X + */ } def allocateGenericDoubleArray(iters: Int): Unit = { @@ -83,6 +91,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseDoubleArray) benchmark.addCase("Dense ")(denseDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 40 / 44 523.2 1.9 1.0X + Dense 0 / 0 225500215.1 0.0 431013.0X + */ } def getPrimitiveIntArray(iters: Int): Unit = { @@ -110,6 +126,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse int")(sparseIntArray) benchmark.addCase("Dense int")(denseIntArray)
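// How to read the tables recorded in these comments (the arithmetic here is a
// reconstruction, not part of the patch): Rate(M/s) is values per iteration
// divided by the best time, Per Row(ns) is its inverse, and Relative compares
// each case against the first row. For the "Get int primitive array" table
// below, count * iters = 1024 * 1024 * 50 = ~52.4M rows, so the Sparse best
// time of 67 ms gives 52.4M / 0.067 s = ~783.9 M/s at ~1.3 ns per row, and
// Dense at 41 ms comes out 67 / 41 = ~1.6X faster.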
benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse int 67 / 70 783.9 1.3 1.0X + Dense int 41 / 43 1263.8 0.8 1.6X + */ } def getPrimitiveDoubleArray(iters: Int): Unit = { @@ -137,6 +161,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse double")(sparseDoubleArray) benchmark.addCase("Dense double")(denseDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse double 211 / 217 248.6 4.0 1.0X + Dense double 95 / 100 554.1 1.8 2.2X + */ } def readGenericIntArray(iters: Int): Unit = { @@ -179,6 +211,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 160 / 163 1314.5 0.8 1.0X + Dense 68 / 69 3080.0 0.3 2.3X + */ } def readGenericDoubleArray(iters: Int): Unit = { @@ -221,14 +261,22 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseDoubleArray) benchmark.addCase("Dense ")(denseDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 611 / 613 343.3 2.9 1.0X + Dense 199 / 202 1051.5 1.0 3.1X + */ } - ignore("allocate GenericArrayData") { + test("allocate GenericArrayData") { allocateGenericIntArray(20) allocateGenericDoubleArray(20) } - ignore("get primitive array") { + test("get primitive array") { getPrimitiveIntArray(50) getPrimitiveDoubleArray(50) } From 878262b52ccfb03d69d454b9324f4925bf5b0d49 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 03:38:21 +0900 Subject: [PATCH 12/75] fix test failures --- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 18aed7bf527b..6c7f1ca0b9c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -24,7 +24,8 @@ import org.apache.spark.sql.types.{DataType, Decimal} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { - def allocate(seq: Seq[Any]): GenericArrayData = new GenericRefArrayData(seq) + def allocate(array: Array[Any]): GenericRefArrayData = new GenericRefArrayData(array) + def allocate(seq: Seq[Any]): GenericRefArrayData = new 
GenericRefArrayData(seq) def allocate(list: java.util.List[Any]): GenericRefArrayData = new GenericRefArrayData(list) def allocate(seqOrArray: Any): GenericRefArrayData = new GenericRefArrayData(seqOrArray) def allocate(primitiveArray: Array[Int]): GenericIntArrayData = From 6efddf0a49bc5cf971161bf5ce2ea49996772e05 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 12:07:39 +0900 Subject: [PATCH 13/75] update test suite to resolve test failures --- .../org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 43b6afd9ad89..ded074c5abc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -287,8 +287,8 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(serializer.children.head.isInstanceOf[Literal]) assert(serializer.children.head.asInstanceOf[Literal].value === UTF8String.fromString("value")) assert(serializer.children.last.isInstanceOf[NewInstance]) - assert(serializer.children.last.asInstanceOf[NewInstance] - .cls.isAssignableFrom(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData])) + assert(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData] + isAssignableFrom(serializer.children.last.asInstanceOf[NewInstance].cls)) } private val dataTypeForComplexData = dataTypeFor[ComplexData] From 4d2b6bc6612afe0fc0b93ee637962dd87d1b8ce5 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 14:11:53 +0900 Subject: [PATCH 14/75] fix compilation error --- .../main/scala/org/apache/spark/sql/hive/HiveInspectors.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index e303065127c3..7002b6437611 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -481,7 +481,7 @@ private[hive] trait HiveInspectors { val values = li.getWritableConstantValue.asScala .map(unwrapper) .toArray - val constant = new GenericArrayData(values) + val constant = GenericArrayData.allocate(values) _ => constant case poi: VoidObjectInspector => _ => null // always be null for void object inspector @@ -637,7 +637,7 @@ private[hive] trait HiveInspectors { Option(li.getList(data)) .map { l => val values = l.asScala.map(unwrapper).toArray - new GenericArrayData(values) + GenericArrayData.allocate(values) } .orNull } else { From f55aeead699d3477ab08dceebb06a24e7a2429a4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 25 Jun 2016 12:13:39 +0900 Subject: [PATCH 15/75] addressed comments --- .../sql/catalyst/util/GenericArrayData.scala | 68 +++++++++++++------ 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 6c7f1ca0b9c7..1a59476460cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ 
-129,15 +129,16 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData override def getMap(ordinal: Int): MapData = getAs(ordinal) override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericArrayData]) { - return false + if (o == this) { + return true } - val other = o.asInstanceOf[GenericArrayData] - if (other eq null) { + if (o == null || !o.isInstanceOf[GenericArrayData]) { return false } + val other = o.asInstanceOf[GenericArrayData] + val len = numElements() if (len != other.numElements()) { return false @@ -208,7 +209,7 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericIntArrayData = new GenericIntArrayData(primitiveArray.clone()) + override def copy(): GenericIntArrayData = new GenericIntArrayData(toIntArray) override def numElements(): Int = primitiveArray.length @@ -221,15 +222,16 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericIntArrayData]) { - return false + if (o == this) { + return true } - val other = o.asInstanceOf[GenericIntArrayData] - if (other eq null) { + if (o == null || !o.isInstanceOf[GenericIntArrayData]) { return false } + val other = o.asInstanceOf[GenericIntArrayData] + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } @@ -240,7 +242,7 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericLongArrayData = new GenericLongArrayData(primitiveArray.clone()) + override def copy(): GenericLongArrayData = new GenericLongArrayData(toLongArray) override def numElements(): Int = primitiveArray.length @@ -253,7 +255,11 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericLongArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericLongArrayData]) { return false } @@ -272,7 +278,7 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericFloatArrayData = new GenericFloatArrayData(primitiveArray.clone()) + override def copy(): GenericFloatArrayData = new GenericFloatArrayData(toFloatArray) override def numElements(): Int = primitiveArray.length @@ -285,7 +291,11 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericFloatArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericFloatArrayData]) { return false } @@ -304,7 +314,7 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(primitiveArray.clone()) + override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(toDoubleArray) override def numElements(): Int = primitiveArray.length @@ -317,7 +327,11 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericDoubleArrayData]) { + if 
(o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericDoubleArrayData]) { return false } @@ -336,7 +350,7 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericShortArrayData = new GenericShortArrayData(primitiveArray.clone()) + override def copy(): GenericShortArrayData = new GenericShortArrayData(toShortArray) override def numElements(): Int = primitiveArray.length @@ -349,7 +363,11 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericShortArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericShortArrayData]) { return false } @@ -368,7 +386,7 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericByteArrayData = new GenericByteArrayData(primitiveArray.clone()) + override def copy(): GenericByteArrayData = new GenericByteArrayData(toByteArray) override def numElements(): Int = primitiveArray.length @@ -381,7 +399,11 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericByteArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericByteArrayData]) { return false } @@ -400,7 +422,7 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray.clone()) + override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(toBooleanArray) override def numElements(): Int = primitiveArray.length @@ -413,7 +435,11 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericBooleanArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericBooleanArrayData]) { return false } From 78aaf13878036c52d424959cb9c003963c0f4cb4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 25 Jun 2016 14:12:30 +0900 Subject: [PATCH 16/75] fix test failures --- .../sql/catalyst/util/GenericArrayData.scala | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 1a59476460cc..fcc3c7918925 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -129,10 +129,6 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData override def getMap(ordinal: Int): MapData = getAs(ordinal) override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericArrayData]) { return false } @@ -222,10 +218,6 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericIntArrayData]) { return false } @@ -255,10 +247,6 @@ final class 
GenericLongArrayData(val primitiveArray: Array[Long]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericLongArrayData]) { return false } @@ -291,10 +279,6 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericFloatArrayData]) { return false } @@ -327,10 +311,6 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericDoubleArrayData]) { return false } @@ -363,10 +343,6 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericShortArrayData]) { return false } @@ -399,10 +375,6 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericByteArrayData]) { return false } @@ -435,10 +407,6 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericBooleanArrayData]) { return false } From 8b908e2ab4ed324aece74f91e36caca11eb80fcb Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 00:50:09 +0900 Subject: [PATCH 17/75] fix descriptions --- .../benchmark/GenericArrayDataBenchmark.scala | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index e6019bd98233..5824dc5343e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Benchmark /** - * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type + * Benchmark [[GenericArrayData]] for specialized representation with primitive type * To run this: * build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" * @@ -36,14 +36,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var array: GenericArrayData = null val primitiveIntArray = new Array[Int](count) - val denseIntArray = { i: Int => + val specializedIntArray = { i: Int => var n = 0 while (n < iters) { array = GenericArrayData.allocate(primitiveIntArray) n += 1 } } - val sparseIntArray = { i: Int => + val genericIntArray = { i: Int => var n = 0 while (n < iters) { array = new GenericRefArrayData(primitiveIntArray) @@ -53,16 +53,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters, minNumIters = 10, minTime = 1.milliseconds) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) + benchmark.addCase("Generic ")(genericIntArray) + benchmark.addCase("Specialized")(specializedIntArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Allocate 
GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 40 / 43 522.2 1.9 1.0X - Dense 0 / 0 209715200.0 0.0 401598.7X + Generic 40 / 43 522.2 1.9 1.0X + Specialized 0 / 0 209715200.0 0.0 401598.7X */ } @@ -71,14 +71,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var array: GenericArrayData = null val primitiveDoubleArray = new Array[Int](count) - val denseDoubleArray = { i: Int => + val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { array = GenericArrayData.allocate(primitiveDoubleArray) n += 1 } } - val sparseDoubleArray = { i: Int => + val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { array = new GenericRefArrayData(primitiveDoubleArray) @@ -88,16 +88,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters, minNumIters = 10, minTime = 1.milliseconds) - benchmark.addCase("Sparse")(sparseDoubleArray) - benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.addCase("Generic ")(genericDoubleArray) + benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 40 / 44 523.2 1.9 1.0X - Dense 0 / 0 225500215.1 0.0 431013.0X + Generic 40 / 44 523.2 1.9 1.0X + Specialized 0 / 0 225500215.1 0.0 431013.0X */ } @@ -107,7 +107,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) var primitiveIntArray: Array[Int] = null - val sparseIntArray = { i: Int => + val genericIntArray = { i: Int => var n = 0 while (n < iters) { primitiveIntArray = intSparseArray.toIntArray @@ -123,16 +123,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Get int primitive array", count * iters) - benchmark.addCase("Sparse int")(sparseIntArray) - benchmark.addCase("Dense int")(denseIntArray) + benchmark.addCase("Generic ")(genericIntArray) + benchmark.addCase("Specialized")(denseIntArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse int 67 / 70 783.9 1.3 1.0X - Dense int 41 / 43 1263.8 0.8 1.6X + Generic 67 / 70 783.9 1.3 1.0X + Specialized 41 / 43 1263.8 0.8 1.6X */ } @@ -142,14 +142,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null - val sparseDoubleArray = { i: Int => + val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { primitiveDoubleArray = doubleSparseArray.toDoubleArray n += 1 } } - val denseDoubleArray = { i: Int => + val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { primitiveDoubleArray = 
doubleDenseArray.toDoubleArray @@ -158,16 +158,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Get double primitive array", count * iters) - benchmark.addCase("Sparse double")(sparseDoubleArray) - benchmark.addCase("Dense double")(denseDoubleArray) + benchmark.addCase("Generic ")(genericDoubleArray) + benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse double 211 / 217 248.6 4.0 1.0X - Dense double 95 / 100 554.1 1.8 2.2X + Generic 211 / 217 248.6 4.0 1.0X + Specialized 95 / 100 554.1 1.8 2.2X */ } @@ -176,7 +176,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var result: Int = 0 val sparseArray = new GenericRefArrayData(new Array[Int](count)) - val sparseIntArray = { i: Int => + val genericIntArray = { i: Int => var n = 0 while (n < iters) { val len = sparseArray.numElements @@ -208,7 +208,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Sparse")(genericIntArray) benchmark.addCase("Dense ")(denseIntArray) benchmark.run /* @@ -216,8 +216,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase { Intel Xeon E3-12xx v2 (Ivy Bridge) Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 160 / 163 1314.5 0.8 1.0X - Dense 68 / 69 3080.0 0.3 2.3X + Generic 160 / 163 1314.5 0.8 1.0X + Specialized 68 / 69 3080.0 0.3 2.3X */ } @@ -226,7 +226,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var result: Double = 0 val sparseArray = new GenericRefArrayData(new Array[Double](count)) - val sparseDoubleArray = { i: Int => + val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { val len = sparseArray.numElements @@ -242,7 +242,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val denseArray = GenericArrayData.allocate(new Array[Double](count)) - val denseDoubleArray = { i: Int => + val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { val len = denseArray.numElements @@ -258,16 +258,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) - benchmark.addCase("Sparse")(sparseDoubleArray) - benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.addCase("Generic")(genericDoubleArray) + benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 611 / 613 343.3 2.9 1.0X - Dense 199 / 202 1051.5 1.0 3.1X + Generic 611 / 613 343.3 2.9 1.0X + Specialized 199 / 202 1051.5 1.0 3.1X */ } From e9ec382dcd0e2f85bc085e76fe5d4f66540637aa Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 00:51:26 +0900 Subject: [PATCH 18/75] Better usage of GenericArrayData --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 
4 ++-- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 68df41b5802d..7bcaea7ea2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericRefArrayData], + classOf[GenericArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index fcc3c7918925..d241c354eed8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -40,7 +40,7 @@ object GenericArrayData { new GenericShortArrayData(primitiveArray) def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericArrayData = + def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray) } From ae9591e51ac167ab786ea97accf68bb6834a4625 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 10:20:15 +0900 Subject: [PATCH 19/75] revert part of changes --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7ea2f7..68df41b5802d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericArrayData], + classOf[GenericRefArrayData], input :: Nil, dataType = 
ArrayType(dt, schemaFor(elementType).nullable)) } From 79f4c95ab47a21dcaf3ef9c41d5623bd1be568bb Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 12:24:15 +0900 Subject: [PATCH 20/75] undo revert --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 68df41b5802d..7bcaea7ea2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericRefArrayData], + classOf[GenericArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } From f93fbc234f63e7540a9509a79635c27c8f8aef63 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 17:35:12 +0900 Subject: [PATCH 21/75] revert changes at 0800fdc5 --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7ea2f7..68df41b5802d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericArrayData], + classOf[GenericRefArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index d241c354eed8..fcc3c7918925 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -40,7 +40,7 @@ object GenericArrayData 
{ new GenericShortArrayData(primitiveArray) def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = + def allocate(primitiveArray: Array[Boolean]): GenericArrayData = new GenericBooleanArrayData(primitiveArray) } From a452bdab1334c1874700a8c736eb85d3c4421ac0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 20:38:29 +0900 Subject: [PATCH 22/75] add null check after asInstanceOf --- .../apache/spark/sql/catalyst/util/GenericArrayData.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index fcc3c7918925..0f3f4c0599c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -134,6 +134,9 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData } val other = o.asInstanceOf[GenericArrayData] + if (other eq null) { + return false; + } val len = numElements() if (len != other.numElements()) { @@ -223,6 +226,9 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA } val other = o.asInstanceOf[GenericIntArrayData] + if (other eq null) { + return false; + } java.util.Arrays.equals(primitiveArray, other.primitiveArray) } From 685d3e7130aace1d247c05f8bbf99e1d695418b7 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 27 Jun 2016 00:44:38 +0900 Subject: [PATCH 23/75] generate GenericArrayData.allocate in NewInstance --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 68df41b5802d..7bcaea7ea2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericRefArrayData], + classOf[GenericArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } From 29c8519a74590b3b67e87bba7397c7268e349bc7 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 27 Jun 2016 12:28:04 +0900 Subject: [PATCH 24/75] fix test failure --- .../catalyst/expressions/objects/objects.scala | 13 +++++++++++-- .../sql/catalyst/util/GenericArrayData.scala | 17 +++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 2949d391a034..865dae01b5ef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -226,6 +226,14 @@ case class NewInstance( outerPointer: Option[() => AnyRef]) extends Expression with NonSQLExpression { private val className = cls.getName + private val instantiatedCls: Class[_] = { + if (!cls.isAssignableFrom(classOf[GenericArrayData])) { + cls + } else { + GenericArrayData.instantiatedClass(dataType) + } + } + override def nullable: Boolean = propagateNull override def children: Seq[Expression] = arguments @@ -236,7 +244,8 @@ case class NewInstance( // Note that static inner classes (e.g., inner classes within Scala objects) don't need // outer pointer registration. val needOuterPointer = - outerPointer.isEmpty && cls.isMemberClass && !Modifier.isStatic(cls.getModifiers) + outerPointer.isEmpty && instantiatedCls.isMemberClass && + !Modifier.isStatic(instantiatedCls.getModifiers) childrenResolved && !needOuterPointer } @@ -297,7 +306,7 @@ case class NewInstance( ev.copy(code = code, isNull = isNull) } - override def toString: String = s"newInstance($cls)" + override def toString: String = s"newInstance($instantiatedCls)" } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 0f3f4c0599c6..ed13699ef3fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.{DataType, Decimal} +import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { @@ -40,8 +40,21 @@ object GenericArrayData { new GenericShortArrayData(primitiveArray) def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericArrayData = + def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray) + + def instantiatedClass(dt: DataType): Class[_] = { + dt match { + case IntegerType => classOf[GenericIntArrayData] + case LongType => classOf[GenericLongArrayData] + case FloatType => classOf[GenericFloatArrayData] + case DoubleType => classOf[GenericDoubleArrayData] + case ShortType => classOf[GenericShortArrayData] + case ByteType => classOf[GenericByteArrayData] + case BooleanType => classOf[GenericBooleanArrayData] + case _ => classOf[GenericRefArrayData] + } + } } private object GenericArrayData { From 0e43c4b0b881f88a31264a7b48b18b7f5cf6a7a2 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 27 Jun 2016 17:45:08 +0900 Subject: [PATCH 25/75] fix test failure --- .../sql/catalyst/expressions/objects/objects.scala | 13 ++----------- .../spark/sql/catalyst/util/GenericArrayData.scala | 13 ------------- .../benchmark/GenericArrayDataBenchmark.scala | 6 +++--- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 865dae01b5ef..2949d391a034 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -226,14 +226,6 @@ case class NewInstance( outerPointer: Option[() => AnyRef]) extends Expression with NonSQLExpression { private val className = cls.getName - private val instantiatedCls: Class[_] = { - if (!cls.isAssignableFrom(classOf[GenericArrayData])) { - cls - } else { - GenericArrayData.instantiatedClass(dataType) - } - } - override def nullable: Boolean = propagateNull override def children: Seq[Expression] = arguments @@ -244,8 +236,7 @@ case class NewInstance( // Note that static inner classes (e.g., inner classes within Scala objects) don't need // outer pointer registration. val needOuterPointer = - outerPointer.isEmpty && instantiatedCls.isMemberClass && - !Modifier.isStatic(instantiatedCls.getModifiers) + outerPointer.isEmpty && cls.isMemberClass && !Modifier.isStatic(cls.getModifiers) childrenResolved && !needOuterPointer } @@ -306,7 +297,7 @@ case class NewInstance( ev.copy(code = code, isNull = isNull) } - override def toString: String = s"newInstance($instantiatedCls)" + override def toString: String = s"newInstance($cls)" } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index ed13699ef3fb..d185c4fc6f41 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -42,19 +42,6 @@ object GenericArrayData { new GenericByteArrayData(primitiveArray) def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray) - - def instantiatedClass(dt: DataType): Class[_] = { - dt match { - case IntegerType => classOf[GenericIntArrayData] - case LongType => classOf[GenericLongArrayData] - case FloatType => classOf[GenericFloatArrayData] - case DoubleType => classOf[GenericDoubleArrayData] - case ShortType => classOf[GenericShortArrayData] - case ByteType => classOf[GenericByteArrayData] - case BooleanType => classOf[GenericBooleanArrayData] - case _ => classOf[GenericRefArrayData] - } - } } private object GenericArrayData { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 5824dc5343e5..c956e77e8197 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -271,17 +271,17 @@ class GenericArrayDataBenchmark extends BenchmarkBase { */ } - test("allocate GenericArrayData") { + ignore("allocate GenericArrayData") { allocateGenericIntArray(20) allocateGenericDoubleArray(20) } - test("get primitive array") { + ignore("get primitive array") { getPrimitiveIntArray(50) getPrimitiveDoubleArray(50) } - test("read elements in GenericArrayData") { + ignore("read elements in GenericArrayData") { readGenericIntArray(100) 
readGenericDoubleArray(100) } From d15a46ce584b52e4d47295fb914666cb0e437991 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 12 Jul 2016 02:32:46 +0900 Subject: [PATCH 26/75] update --- .../benchmark/GenericArrayDataBenchmark.scala | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index c956e77e8197..de3084f6b42c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -25,7 +25,8 @@ import org.apache.spark.util.Benchmark /** * Benchmark [[GenericArrayData]] for specialized representation with primitive type * To run this: - * build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" + * 1. replace ignore(...) with test(...) + * 2. build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" * * Benchmarks in this file are skipped in normal builds. */ @@ -285,13 +286,4 @@ class GenericArrayDataBenchmark extends BenchmarkBase { readGenericIntArray(100) readGenericDoubleArray(100) } - - def main(args: Array[String]): Unit = { - allocateGenericIntArray(20) - allocateGenericDoubleArray(20) - getPrimitiveIntArray(50) - getPrimitiveDoubleArray(50) - readGenericIntArray(20) - readGenericDoubleArray(20) - } } From 3e84fdbc19c7a8c02bcb35a02642d5a9fd31b2ee Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 12 Jul 2016 03:04:25 +0900 Subject: [PATCH 27/75] replace new GenericArrayData with GenericArrayData.allocate --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++-- .../org/apache/spark/sql/catalyst/expressions/xml/xpath.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 5f533fecf8d0..712c889c7aa2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1596,8 +1596,8 @@ case class Sentences( widx = wi.current if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) } - result += new GenericArrayData(words) + result += GenericArrayData.allocate(words) } - new GenericArrayData(result) + GenericArrayData.allocate(result) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index aa328045cafd..1e54acd26ab6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -210,7 +210,7 @@ case class XPathList(xml: Expression, path: Expression) extends XPathExtract { ret(i) = UTF8String.fromString(nodeList.item(i).getNodeValue) i += 1 } - new GenericArrayData(ret) + GenericArrayData.allocate(ret) } else { null } From 604293e8391fd50b79c5a0391cc14911bbedc3bc Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 26 Jul 2016 02:00:17 +0900 Subject: [PATCH 28/75] rebase --- .../sql/catalyst/util/GenericArrayData.scala | 26 ++-- 
.../execution/datasources/jdbc/JDBCRDD.scala | 131 ++++++++++++++++++ 2 files changed, 144 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index d185c4fc6f41..efed96012d57 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -44,17 +44,6 @@ object GenericArrayData { new GenericBooleanArrayData(primitiveArray) } -private object GenericArrayData { - - // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. - def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { - case seq: Seq[Any] => seq - case array: Array[_] => array.toSeq - case _ => Seq.empty - } - -} - abstract class GenericArrayData extends ArrayData { override def get(ordinal: Int, elementType: DataType): AnyRef = throw new UnsupportedOperationException("get() method is not supported") @@ -90,7 +79,18 @@ abstract class GenericArrayData extends ArrayData { override def toString(): String = array.mkString("[", ",", "]") } -final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData { +private object GenericRefArrayData { + + // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. + def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { + case seq: Seq[Any] => seq + case array: Array[_] => array.toSeq + case _ => Seq.empty + } + +} + +final class GenericRefArrayData(val array: Array[Any]) extends GenericArrayData { def this(seq: Seq[Any]) = this(seq.toArray) def this(list: java.util.List[Any]) = this(list.asScala) @@ -104,7 +104,7 @@ final class GenericRefArrayData(val _array: Array[Any]) extends GenericArrayData def this(primitiveArray: Array[Byte]) = this(primitiveArray.toSeq) def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq) - def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) + def this(seqOrArray: Any) = this(GenericRefArrayData.anyToSeq(seqOrArray)) override def copy(): GenericRefArrayData = new GenericRefArrayData(array.clone()) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index c0fabc81e42a..938f041bd565 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -229,6 +229,137 @@ private[jdbc] class JDBCRDD( } } + // A `JDBCValueSetter` is responsible for converting and setting a value from `ResultSet` + // into a field for `MutableRow`. The last argument `Int` means the index for the + // value to be set in the row and also used for the value to retrieve from `ResultSet`. + private type JDBCValueSetter = (ResultSet, MutableRow, Int) => Unit + + /** + * Creates `JDBCValueSetter`s according to [[StructType]], which can set + * each value from `ResultSet` to each field of [[MutableRow]] correctly. 
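For illustration, a sketch of how these setters are typically driven, one row at a time. The loop shape is assumed here (it is not part of this hunk), and `schema`, `rs`, and `mutableRow` are stand-ins:

    val setters: Array[JDBCValueSetter] = makeSetters(schema)
    while (rs.next()) {
      var i = 0
      while (i < setters.length) {
        setters(i)(rs, mutableRow, i) // field i of the row, column i + 1 in JDBC
        i += 1
      }
      // hand mutableRow to the consumer here
    }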
+ */ + def makeSetters(schema: StructType): Array[JDBCValueSetter] = + schema.fields.map(sf => makeSetter(sf.dataType, sf.metadata)) + + private def makeSetter(dt: DataType, metadata: Metadata): JDBCValueSetter = dt match { + case BooleanType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setBoolean(pos, rs.getBoolean(pos + 1)) + + case DateType => + (rs: ResultSet, row: MutableRow, pos: Int) => + // DateTimeUtils.fromJavaDate does not handle null value, so we need to check it. + val dateVal = rs.getDate(pos + 1) + if (dateVal != null) { + row.setInt(pos, DateTimeUtils.fromJavaDate(dateVal)) + } else { + row.update(pos, null) + } + + // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal + // object returned by ResultSet.getBigDecimal is not correctly matched to the table + // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale. + // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through + // a BigDecimal object with scale as 0. But the dataframe schema has correct type as + // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then + // retrieve it, you will get wrong result 199.99. + // So it is needed to set precision and scale for Decimal based on JDBC metadata. + case DecimalType.Fixed(p, s) => + (rs: ResultSet, row: MutableRow, pos: Int) => + val decimal = + nullSafeConvert[java.math.BigDecimal](rs.getBigDecimal(pos + 1), d => Decimal(d, p, s)) + row.update(pos, decimal) + + case DoubleType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setDouble(pos, rs.getDouble(pos + 1)) + + case FloatType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setFloat(pos, rs.getFloat(pos + 1)) + + case IntegerType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setInt(pos, rs.getInt(pos + 1)) + + case LongType if metadata.contains("binarylong") => + (rs: ResultSet, row: MutableRow, pos: Int) => + val bytes = rs.getBytes(pos + 1) + var ans = 0L + var j = 0 + while (j < bytes.size) { + ans = 256 * ans + (255 & bytes(j)) + j = j + 1 + } + row.setLong(pos, ans) + + case LongType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setLong(pos, rs.getLong(pos + 1)) + + case StringType => + (rs: ResultSet, row: MutableRow, pos: Int) => + // TODO(davies): use getBytes for better performance, if the encoding is UTF-8 + row.update(pos, UTF8String.fromString(rs.getString(pos + 1))) + + case TimestampType => + (rs: ResultSet, row: MutableRow, pos: Int) => + val t = rs.getTimestamp(pos + 1) + if (t != null) { + row.setLong(pos, DateTimeUtils.fromJavaTimestamp(t)) + } else { + row.update(pos, null) + } + + case BinaryType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.update(pos, rs.getBytes(pos + 1)) + + case ArrayType(et, _) => + val elementConversion = et match { + case TimestampType => + (array: Object) => + array.asInstanceOf[Array[java.sql.Timestamp]].map { timestamp => + nullSafeConvert(timestamp, DateTimeUtils.fromJavaTimestamp) + } + + case StringType => + (array: Object) => + array.asInstanceOf[Array[java.lang.String]] + .map(UTF8String.fromString) + + case DateType => + (array: Object) => + array.asInstanceOf[Array[java.sql.Date]].map { date => + nullSafeConvert(date, DateTimeUtils.fromJavaDate) + } + + case dt: DecimalType => + (array: Object) => + array.asInstanceOf[Array[java.math.BigDecimal]].map { decimal => + nullSafeConvert[java.math.BigDecimal]( + decimal, d => Decimal(d, dt.precision, dt.scale)) + } + + case LongType if 
metadata.contains("binarylong") => + throw new IllegalArgumentException(s"Unsupported array element " + + s"type ${dt.simpleString} based on binary") + + case ArrayType(_, _) => + throw new IllegalArgumentException("Nested arrays unsupported") + + case _ => (array: Object) => array.asInstanceOf[Array[Any]] + } + + (rs: ResultSet, row: MutableRow, pos: Int) => + val array = nullSafeConvert[Object]( + rs.getArray(pos + 1).getArray, + array => GenericArrayData.allocate(elementConversion.apply(array))) + row.update(pos, array) + + case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") + } + /** * Runs the SQL query against the JDBC driver. * From 1dcd58256f2bbb419bb34b600c76c4698f8ee1cc Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 10 Sep 2016 20:39:35 +0900 Subject: [PATCH 29/75] rebase --- .../aggregate/ApproximatePercentile.scala | 2 +- .../expressions/ObjectExpressionsSuite.scala | 16 +-- .../execution/datasources/jdbc/JDBCRDD.scala | 131 ------------------ .../datasources/jdbc/JdbcUtils.scala | 2 +- 4 files changed, 10 insertions(+), 141 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 692cbd7c0d32..4d49af394b8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -143,7 +143,7 @@ case class ApproximatePercentile( if (result.length == 0) { null } else if (returnPercentileArray) { - new GenericArrayData(result) + GenericArrayData.allocate(result) } else { result(0) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index 3edcc02f1526..ab14981ad184 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -39,7 +39,7 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) - val structExpected = new GenericArrayData( + val structExpected = GenericArrayData.allocate( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) @@ -47,8 +47,8 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) - val arrayExpected = new GenericArrayData( - Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) + val arrayExpected = GenericArrayData.allocate( + Array(GenericArrayData.allocate(Array(1, 2)), GenericArrayData.allocate(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) @@ -56,13 +56,13 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { 
val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) - val mapExpected = new GenericArrayData(Seq( + val mapExpected = GenericArrayData.allocate(Seq( new ArrayBasedMapData( - new GenericArrayData(Array(1, 2)), - new GenericArrayData(Array(100, 200))), + GenericArrayData.allocate(Array(1, 2)), + GenericArrayData.allocate(Array(100, 200))), new ArrayBasedMapData( - new GenericArrayData(Array(3, 4)), - new GenericArrayData(Array(300, 400))))) + GenericArrayData.allocate(Array(3, 4)), + GenericArrayData.allocate(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 938f041bd565..c0fabc81e42a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -229,137 +229,6 @@ private[jdbc] class JDBCRDD( } } - // A `JDBCValueSetter` is responsible for converting and setting a value from `ResultSet` - // into a field for `MutableRow`. The last argument `Int` means the index for the - // value to be set in the row and also used for the value to retrieve from `ResultSet`. - private type JDBCValueSetter = (ResultSet, MutableRow, Int) => Unit - - /** - * Creates `JDBCValueSetter`s according to [[StructType]], which can set - * each value from `ResultSet` to each field of [[MutableRow]] correctly. - */ - def makeSetters(schema: StructType): Array[JDBCValueSetter] = - schema.fields.map(sf => makeSetter(sf.dataType, sf.metadata)) - - private def makeSetter(dt: DataType, metadata: Metadata): JDBCValueSetter = dt match { - case BooleanType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setBoolean(pos, rs.getBoolean(pos + 1)) - - case DateType => - (rs: ResultSet, row: MutableRow, pos: Int) => - // DateTimeUtils.fromJavaDate does not handle null value, so we need to check it. - val dateVal = rs.getDate(pos + 1) - if (dateVal != null) { - row.setInt(pos, DateTimeUtils.fromJavaDate(dateVal)) - } else { - row.update(pos, null) - } - - // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal - // object returned by ResultSet.getBigDecimal is not correctly matched to the table - // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale. - // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through - // a BigDecimal object with scale as 0. But the dataframe schema has correct type as - // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then - // retrieve it, you will get wrong result 199.99. - // So it is needed to set precision and scale for Decimal based on JDBC metadata. 
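The precision/scale caveat being removed above (along with the rest of this block, as part of the rebase) deserves a concrete illustration; a sketch assuming Spark's `Decimal` factory:

    import org.apache.spark.sql.types.Decimal

    // A driver may report 19999 with scale 0 for a NUMBER(12, 2) column;
    // re-applying the schema's precision and scale restores the intended value.
    val fromJdbc = new java.math.BigDecimal("19999")
    val fixed = Decimal(fromJdbc, 12, 2) // behaves as DecimalType(12, 2)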
- case DecimalType.Fixed(p, s) => - (rs: ResultSet, row: MutableRow, pos: Int) => - val decimal = - nullSafeConvert[java.math.BigDecimal](rs.getBigDecimal(pos + 1), d => Decimal(d, p, s)) - row.update(pos, decimal) - - case DoubleType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setDouble(pos, rs.getDouble(pos + 1)) - - case FloatType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setFloat(pos, rs.getFloat(pos + 1)) - - case IntegerType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setInt(pos, rs.getInt(pos + 1)) - - case LongType if metadata.contains("binarylong") => - (rs: ResultSet, row: MutableRow, pos: Int) => - val bytes = rs.getBytes(pos + 1) - var ans = 0L - var j = 0 - while (j < bytes.size) { - ans = 256 * ans + (255 & bytes(j)) - j = j + 1 - } - row.setLong(pos, ans) - - case LongType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setLong(pos, rs.getLong(pos + 1)) - - case StringType => - (rs: ResultSet, row: MutableRow, pos: Int) => - // TODO(davies): use getBytes for better performance, if the encoding is UTF-8 - row.update(pos, UTF8String.fromString(rs.getString(pos + 1))) - - case TimestampType => - (rs: ResultSet, row: MutableRow, pos: Int) => - val t = rs.getTimestamp(pos + 1) - if (t != null) { - row.setLong(pos, DateTimeUtils.fromJavaTimestamp(t)) - } else { - row.update(pos, null) - } - - case BinaryType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.update(pos, rs.getBytes(pos + 1)) - - case ArrayType(et, _) => - val elementConversion = et match { - case TimestampType => - (array: Object) => - array.asInstanceOf[Array[java.sql.Timestamp]].map { timestamp => - nullSafeConvert(timestamp, DateTimeUtils.fromJavaTimestamp) - } - - case StringType => - (array: Object) => - array.asInstanceOf[Array[java.lang.String]] - .map(UTF8String.fromString) - - case DateType => - (array: Object) => - array.asInstanceOf[Array[java.sql.Date]].map { date => - nullSafeConvert(date, DateTimeUtils.fromJavaDate) - } - - case dt: DecimalType => - (array: Object) => - array.asInstanceOf[Array[java.math.BigDecimal]].map { decimal => - nullSafeConvert[java.math.BigDecimal]( - decimal, d => Decimal(d, dt.precision, dt.scale)) - } - - case LongType if metadata.contains("binarylong") => - throw new IllegalArgumentException(s"Unsupported array element " + - s"type ${dt.simpleString} based on binary") - - case ArrayType(_, _) => - throw new IllegalArgumentException("Nested arrays unsupported") - - case _ => (array: Object) => array.asInstanceOf[Array[Any]] - } - - (rs: ResultSet, row: MutableRow, pos: Int) => - val array = nullSafeConvert[Object]( - rs.getArray(pos + 1).getArray, - array => GenericArrayData.allocate(elementConversion.apply(array))) - row.update(pos, array) - - case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") - } - /** * Runs the SQL query against the JDBC driver. 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 41edb6511c2c..27beccb0c4d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -425,7 +425,7 @@ object JdbcUtils extends Logging { (rs: ResultSet, row: InternalRow, pos: Int) => val array = nullSafeConvert[Object]( rs.getArray(pos + 1).getArray, - array => new GenericArrayData(elementConversion.apply(array))) + array => GenericArrayData.allocate(elementConversion.apply(array))) row.update(pos, array) case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") From e60bf407473a383ccd907a02545169559678adaa Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 15:32:10 +0900 Subject: [PATCH 30/75] reimplement without factory method --- .../codegen/UnsafeArrayWriter.java | 114 ++++ .../sql/catalyst/CatalystTypeConverters.scala | 8 +- .../spark/sql/catalyst/expressions/Cast.scala | 4 +- .../aggregate/ApproximatePercentile.scala | 2 +- .../expressions/aggregate/PivotFirst.scala | 2 +- .../expressions/aggregate/collect.scala | 2 +- .../codegen/GenerateSafeProjection.scala | 2 +- .../codegen/GenerateUnsafeProjection.scala | 49 +- .../expressions/collectionOperations.scala | 2 +- .../expressions/complexTypeCreator.scala | 11 +- .../expressions/complexTypeExtractors.scala | 4 +- .../expressions/objects/objects.scala | 8 +- .../expressions/regexpExpressions.scala | 4 +- .../expressions/stringExpressions.scala | 4 +- .../sql/catalyst/expressions/xml/xpath.scala | 2 +- .../sql/catalyst/json/JacksonParser.scala | 2 +- .../sql/catalyst/util/GenericArrayData.scala | 492 +++++++----------- .../sql/catalyst/ScalaReflectionSuite.scala | 4 +- .../analysis/AnalysisErrorSuite.scala | 4 +- .../encoders/EncoderResolutionSuite.scala | 4 +- .../catalyst/encoders/RowEncoderSuite.scala | 2 +- .../expressions/ObjectExpressionsSuite.scala | 16 +- .../expressions/UnsafeRowConverterSuite.scala | 2 +- .../codegen/BufferHolderSuite.scala | 190 ++++++- .../codegen/GeneratedProjectionSuite.scala | 2 +- .../catalyst/util/GenericArrayDataSuite.scala | 295 +++++++---- .../datasources/jdbc/JdbcUtils.scala | 2 +- .../parquet/ParquetRowConverter.scala | 4 +- .../sql/execution/python/EvaluatePython.scala | 4 +- .../spark/sql/test/ExamplePointUDT.scala | 2 +- .../spark/sql/DataFrameComplexTypeSuite.scala | 13 + .../org/apache/spark/sql/UnsafeRowSuite.scala | 2 +- .../spark/sql/UserDefinedTypeSuite.scala | 2 +- .../columnar/ColumnarTestUtils.scala | 2 +- .../spark/sql/hive/HiveInspectors.scala | 4 +- .../spark/sql/hive/HiveInspectorSuite.scala | 2 +- 36 files changed, 778 insertions(+), 490 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java index afea4676893e..f8bbfbbac1b7 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen; +import org.apache.spark.sql.catalyst.util.ArrayData; +import org.apache.spark.sql.catalyst.util.GenericArrayData; 
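The writePrimitive*Array methods added below all share one shape: when the incoming `ArrayData` is a `GenericArrayData` whose matching primitive array is non-null, bulk-copy it with a single `Platform.copyMemory`; otherwise fall back to per-element writes. A condensed Scala sketch of the int case (`buffer` and `offset` stand in for the writer's `holder.buffer` and `startingOffset + headerInBytes`):

    import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
    import org.apache.spark.unsafe.Platform

    def writeIntsSketch(arrayData: ArrayData, buffer: Array[Byte], offset: Long): Unit =
      arrayData match {
        case g: GenericArrayData if g.intArray != null =>
          // One copy for the whole array, 4 bytes per element.
          Platform.copyMemory(g.intArray, Platform.INT_ARRAY_OFFSET,
            buffer, offset, g.intArray.length * 4)
        case other =>
          var i = 0
          while (i < other.numElements()) {
            Platform.putInt(buffer, offset + i * 4, other.getInt(i))
            i += 1
          }
      }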
import org.apache.spark.sql.types.Decimal; import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; @@ -259,4 +261,116 @@ public void write(int ordinal, CalendarInterval input) { // move the cursor forward. holder.cursor += 16; } + + public void writePrimitiveBooleanArray(ArrayData arrayData) { + boolean[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).booleanArray()) != null) { + int length = input.length; + Platform.copyMemory(input, Platform.BOOLEAN_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putBoolean(holder.buffer, startingOffset + headerInBytes + i, + arrayData.getBoolean(i)); + } + } + } + + public void writePrimitiveByteArray(ArrayData arrayData) { + byte[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).byteArray()) != null) { + int length = input.length; + Platform.copyMemory(input, Platform.BYTE_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putByte(holder.buffer, startingOffset + headerInBytes + i, + arrayData.getByte(i)); + } + } + } + + public void writePrimitiveShortArray(ArrayData arrayData) { + short[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).shortArray()) != null) { + int length = input.length * 2; + Platform.copyMemory(input, Platform.SHORT_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putShort(holder.buffer, startingOffset + headerInBytes + i * 2, + arrayData.getShort(i)); + } + } + } + + public void writePrimitiveIntArray(ArrayData arrayData) { + int[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).intArray()) != null) { + int length = input.length * 4; + Platform.copyMemory(input, Platform.INT_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putInt(holder.buffer, startingOffset + headerInBytes + i * 4, + arrayData.getInt(i)); + } + } + } + + public void writePrimitiveLongArray(ArrayData arrayData) { + long[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).longArray()) != null) { + int length = input.length * 8; + Platform.copyMemory(input, Platform.LONG_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putLong(holder.buffer, startingOffset + headerInBytes + i * 8, + arrayData.getLong(i)); + } + } + } + + public void writePrimitiveFloatArray(ArrayData arrayData) { + float[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).floatArray()) != null) { + int length = input.length * 4; + Platform.copyMemory(input, Platform.FLOAT_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putFloat(holder.buffer, startingOffset + headerInBytes + i * 4, + arrayData.getFloat(i)); + } + } + } + + public void 
writePrimitiveDoubleArray(ArrayData arrayData) { + double[] input; + if (arrayData instanceof GenericArrayData && + (input = ((GenericArrayData)arrayData).doubleArray()) != null) { + int length = input.length * 8; + Platform.copyMemory(input, Platform.DOUBLE_ARRAY_OFFSET, + holder.buffer, startingOffset + headerInBytes, length); + } else { + int length = arrayData.numElements(); + for (int i = 0; i < length; i++) { + Platform.putDouble(holder.buffer, startingOffset + headerInBytes + i * 8, + arrayData.getDouble(i)); + } + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 2801827e7bb6..5b9161551a7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -159,9 +159,9 @@ object CatalystTypeConverters { override def toCatalystImpl(scalaValue: Any): ArrayData = { scalaValue match { case a: Array[_] => - GenericArrayData.allocate(a.map(elementConverter.toCatalyst)) + new GenericArrayData(a.map(elementConverter.toCatalyst)) case s: Seq[_] => - GenericArrayData.allocate(s.map(elementConverter.toCatalyst).toArray) + new GenericArrayData(s.map(elementConverter.toCatalyst).toArray) case i: JavaIterable[_] => val iter = i.iterator val convertedIterable = scala.collection.mutable.ArrayBuffer.empty[Any] @@ -169,7 +169,7 @@ object CatalystTypeConverters { val item = iter.next() convertedIterable += elementConverter.toCatalyst(item) } - GenericArrayData.allocate(convertedIterable.toArray) + new GenericArrayData(convertedIterable.toArray) } } @@ -410,7 +410,7 @@ object CatalystTypeConverters { case t: Timestamp => TimestampConverter.toCatalyst(t) case d: BigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) case d: JavaBigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) - case seq: Seq[Any] => GenericArrayData.allocate(seq.map(convertToCatalyst).toArray) + case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray) case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*) case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst)) case map: Map[_, _] => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 039b4469b99e..4db1ae6faa15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -388,7 +388,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w values(i) = elementCast(e) } }) - GenericArrayData.allocate(values) + new GenericArrayData(values) }) } @@ -864,7 +864,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w } } } - $evPrim = $arrayClass.allocate($values); + $evPrim = new $arrayClass($values); """ } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 4d49af394b8c..692cbd7c0d32 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -143,7 +143,7 @@ case class ApproximatePercentile( if (result.length == 0) { null } else if (returnPercentileArray) { - GenericArrayData.allocate(result) + new GenericArrayData(result) } else { result(0) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala index 23a8d5fd4903..087606077295 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala @@ -131,7 +131,7 @@ case class PivotFirst( for (i <- 0 until indexSize) { result(i) = input.get(mutableAggBufferOffset + i, valueDataType) } - GenericArrayData.allocate(result) + new GenericArrayData(result) } override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index f97fe5fe8d51..d2880d58aefe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -78,7 +78,7 @@ abstract class Collect extends ImperativeAggregate { } override def eval(input: InternalRow): Any = { - GenericArrayData.allocate(buffer.toArray) + new GenericArrayData(buffer.toArray) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 792d735a0d5a..b1cb6edefb85 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -96,7 +96,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] $values[$index] = ${elementConverter.value}; } } - final ArrayData $output = $arrayClass.allocate($values); + final ArrayData $output = new $arrayClass($values); """ ExprCode(code, "false", output) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 7e4c9089a2cb..0a802046c2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -117,12 +117,12 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro $rowWriter.setOffsetAndSize($index, $tmpCursor, $bufferHolder.cursor - $tmpCursor); """ - case a @ ArrayType(et, _) => + case a @ ArrayType(et, cn) => s""" // Remember the current cursor so that we can calculate how many bytes are // written later. 
final int $tmpCursor = $bufferHolder.cursor; - ${writeArrayToBuffer(ctx, input.value, et, bufferHolder)} + ${writeArrayToBuffer(ctx, input.value, et, cn, bufferHolder)} $rowWriter.setOffsetAndSize($index, $tmpCursor, $bufferHolder.cursor - $tmpCursor); """ @@ -171,6 +171,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ctx: CodegenContext, input: String, elementType: DataType, + containsNull: Boolean, bufferHolder: String): String = { val arrayWriterClass = classOf[UnsafeArrayWriter].getName val arrayWriter = ctx.freshName("arrayWriter") @@ -202,10 +203,10 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro $arrayWriter.setOffsetAndSize($index, $tmpCursor, $bufferHolder.cursor - $tmpCursor); """ - case a @ ArrayType(et, _) => + case a @ ArrayType(et, cn) => s""" final int $tmpCursor = $bufferHolder.cursor; - ${writeArrayToBuffer(ctx, element, et, bufferHolder)} + ${writeArrayToBuffer(ctx, element, et, cn, bufferHolder)} $arrayWriter.setOffsetAndSize($index, $tmpCursor, $bufferHolder.cursor - $tmpCursor); """ @@ -225,6 +226,31 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro } val primitiveTypeName = if (ctx.isPrimitiveType(jt)) ctx.primitiveTypeName(et) else "" + val storeElements = if (containsNull) { + s""" + for (int $index = 0; $index < $numElements; $index++) { + if ($input.isNullAt($index)) { + $arrayWriter.setNull$primitiveTypeName($index); + } else { + final $jt $element = ${ctx.getValue(input, et, index)}; + $writeElement + } + } + """ + } else { + if (ctx.isPrimitiveType(et)) { + val typeName = ctx.primitiveTypeName(et) + s"$arrayWriter.writePrimitive${typeName}Array($input);" + } else { + s""" + for (int $index = 0; $index < $numElements; $index++) { + final $jt $element = ${ctx.getValue(input, et, index)}; + $writeElement + } + """ + } + } + s""" if ($input instanceof UnsafeArrayData) { ${writeUnsafeData(ctx, s"((UnsafeArrayData) $input)", bufferHolder)} @@ -232,14 +258,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro final int $numElements = $input.numElements(); $arrayWriter.initialize($bufferHolder, $numElements, $elementOrOffsetSize); - for (int $index = 0; $index < $numElements; $index++) { - if ($input.isNullAt($index)) { - $arrayWriter.setNull$primitiveTypeName($index); - } else { - final $jt $element = ${ctx.getValue(input, et, index)}; - $writeElement - } - } + $storeElements } """ } @@ -271,11 +290,11 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro // Remember the current cursor so that we can write numBytes of key array later. final int $tmpCursor = $bufferHolder.cursor; - ${writeArrayToBuffer(ctx, keys, keyType, bufferHolder)} + ${writeArrayToBuffer(ctx, keys, keyType, false, bufferHolder)} // Write the numBytes of key array into the first 8 bytes. 
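The storeElements branch added above picks among three code shapes; a self-contained Scala sketch of just the selection logic (stand-in names, mirroring the codegen decision only):

    sealed trait WritePath
    case object NullCheckedLoop extends WritePath   // containsNull: per-element writes plus setNull<T>
    case object BulkPrimitiveCopy extends WritePath // no nulls and a primitive element type
    case object PlainLoop extends WritePath         // no nulls, non-primitive element type

    def choosePath(containsNull: Boolean, elementIsPrimitive: Boolean): WritePath =
      if (containsNull) NullCheckedLoop
      else if (elementIsPrimitive) BulkPrimitiveCopy // emits writePrimitive<T>Array($input)
      else PlainLoop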
- Platform.putLong($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); + Platform.putInt($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); - ${writeArrayToBuffer(ctx, values, valueType, bufferHolder)} + ${writeArrayToBuffer(ctx, values, valueType, true, bufferHolder)} } """ } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 1d10a9034ab7..c863ba434120 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -206,7 +206,7 @@ case class SortArray(base: Expression, ascendingOrder: Expression) if (elementType != NullType) { java.util.Arrays.sort(data, if (ascending.asInstanceOf[Boolean]) lt else gt) } - GenericArrayData.allocate(data.asInstanceOf[Array[Any]]) + new GenericArrayData(data.asInstanceOf[Array[Any]]) } override def prettyName: String = "sort_array" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 640c32628cdd..c9f36649ec8e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -52,7 +52,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { override def nullable: Boolean = false override def eval(input: InternalRow): Any = { - GenericArrayData.allocate(children.map(_.eval(input)).toArray) + new GenericArrayData(children.map(_.eval(input)).toArray) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -76,7 +76,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { """ }) + s""" - final ArrayData ${ev.value} = $arrayClass.allocate($values); + final ArrayData ${ev.value} = new $arrayClass($values); this.$values = null; """) } @@ -130,8 +130,7 @@ case class CreateMap(children: Seq[Expression]) extends Expression { throw new RuntimeException("Cannot use null as map key!") } val valueArray = values.map(_.eval(input)).toArray - new ArrayBasedMapData( - GenericArrayData.allocate(keyArray), GenericArrayData.allocate(valueArray)) + new ArrayBasedMapData(new GenericArrayData(keyArray), new GenericArrayData(valueArray)) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -142,8 +141,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { ctx.addMutableState("Object[]", keyArray, s"this.$keyArray = null;") ctx.addMutableState("Object[]", valueArray, s"this.$valueArray = null;") - val keyData = s"$arrayClass.allocate($keyArray)" - val valueData = s"$arrayClass.allocate($valueArray)" + val keyData = s"new $arrayClass($keyArray)" + val valueData = s"new $arrayClass($valueArray)" ev.copy(code = s""" final boolean ${ev.isNull} = false; $keyArray = new Object[${keys.size}]; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index f17d0bc412b1..0c256c3d890f 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -176,7 +176,7 @@ case class GetArrayStructFields( } i += 1 } - GenericArrayData.allocate(result) + new GenericArrayData(result) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -201,7 +201,7 @@ case class GetArrayStructFields( } } } - ${ev.value} = $arrayClass.allocate($values); + ${ev.value} = new $arrayClass($values); """ }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 2949d391a034..50e2ac3c36d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -281,11 +281,7 @@ case class NewInstance( val constructorCall = outer.map { gen => s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})""" }.getOrElse { - if (!cls.isAssignableFrom(classOf[GenericArrayData])) { - s"new $className(${argValues.mkString(", ")})" - } else { - s"${cls.getName}.allocate(${argValues.mkString(", ")})" - } + s"new $className(${argValues.mkString(", ")})" } val code = s""" @@ -541,7 +537,7 @@ case class MapObjects private( $loopIndex += 1; } - ${ev.value} = ${classOf[GenericArrayData].getName}.allocate($convertedArray); + ${ev.value} = new ${classOf[GenericArrayData].getName}($convertedArray); } """ ev.copy(code = code, isNull = genInputData.isNull) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 6aebe7970443..5648ad6b6dc1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -191,14 +191,14 @@ case class StringSplit(str: Expression, pattern: Expression) override def nullSafeEval(string: Any, regex: Any): Any = { val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) - GenericArrayData.allocate(strings.asInstanceOf[Array[Any]]) + new GenericArrayData(strings.asInstanceOf[Array[Any]]) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, pattern) => // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. 
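The covariance note above is what lets the generated Java hand a UTF8String[] to a constructor declared over Object[]. Scala's `Array` is invariant, so the interpreted path spells the cast out. A minimal sketch (assuming the patched constructor overloads):

    import org.apache.spark.sql.catalyst.util.GenericArrayData
    import org.apache.spark.unsafe.types.UTF8String

    val parts: Array[UTF8String] =
      UTF8String.fromString("a,b,c").split(UTF8String.fromString(","), -1)
    // Explicit cast here; the generated Java relies on array covariance instead.
    val arr = new GenericArrayData(parts.asInstanceOf[Array[Any]])
    // arr.numElements() == 3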
- s"""${ev.value} = $arrayClass.allocate($str.split($pattern, -1));""") + s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""") } override def prettyName: String = "split" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 712c889c7aa2..5f533fecf8d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1596,8 +1596,8 @@ case class Sentences( widx = wi.current if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) } - result += GenericArrayData.allocate(words) + result += new GenericArrayData(words) } - GenericArrayData.allocate(result) + new GenericArrayData(result) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index 1e54acd26ab6..aa328045cafd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -210,7 +210,7 @@ case class XPathList(xml: Expression, path: Expression) extends XPathExtract { ret(i) = UTF8String.fromString(nodeList.item(i).getNodeValue) i += 1 } - GenericArrayData.allocate(ret) + new GenericArrayData(ret) } else { null } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index ec87133c9d11..e476cb11a351 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -404,7 +404,7 @@ class JacksonParser( values += fieldConverter.apply(parser) } - GenericArrayData.allocate(values.toArray) + new GenericArrayData(values.toArray) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index efed96012d57..7b2347546d3e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -23,103 +23,82 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} -object GenericArrayData { - def allocate(array: Array[Any]): GenericRefArrayData = new GenericRefArrayData(array) - def allocate(seq: Seq[Any]): GenericRefArrayData = new GenericRefArrayData(seq) - def allocate(list: java.util.List[Any]): GenericRefArrayData = new GenericRefArrayData(list) - def allocate(seqOrArray: Any): GenericRefArrayData = new GenericRefArrayData(seqOrArray) - def allocate(primitiveArray: Array[Int]): GenericIntArrayData = - new GenericIntArrayData(primitiveArray) - def allocate(primitiveArray: Array[Long]): GenericLongArrayData = - new GenericLongArrayData(primitiveArray) - def allocate(primitiveArray: Array[Float]): GenericFloatArrayData = - new GenericFloatArrayData(primitiveArray) - def allocate(primitiveArray: Array[Double]): GenericDoubleArrayData = - new 
GenericDoubleArrayData(primitiveArray) - def allocate(primitiveArray: Array[Short]): GenericShortArrayData = - new GenericShortArrayData(primitiveArray) - def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = - new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = - new GenericBooleanArrayData(primitiveArray) -} - -abstract class GenericArrayData extends ArrayData { - override def get(ordinal: Int, elementType: DataType): AnyRef = - throw new UnsupportedOperationException("get() method is not supported") - override def getBoolean(ordinal: Int): Boolean = - throw new UnsupportedOperationException("getBoolean() method is not supported") - override def getByte(ordinal: Int): Byte = - throw new UnsupportedOperationException("getByte() method is not supported") - override def getShort(ordinal: Int): Short = - throw new UnsupportedOperationException("getShort() method is not supported") - override def getInt(ordinal: Int): Int = - throw new UnsupportedOperationException("getInt() method is not supported") - override def getLong(ordinal: Int): Long = - throw new UnsupportedOperationException("getLong() method is not supported") - override def getFloat(ordinal: Int): Float = - throw new UnsupportedOperationException("getFloat() method is not supported") - override def getDouble(ordinal: Int): Double = - throw new UnsupportedOperationException("getDouble() method is not supported") - override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = - throw new UnsupportedOperationException("getDecimal() method is not supported") - override def getUTF8String(ordinal: Int): UTF8String = - throw new UnsupportedOperationException("getUTF8String() method is not supported") - override def getBinary(ordinal: Int): Array[Byte] = - throw new UnsupportedOperationException("getBinary() method is not supported") - override def getInterval(ordinal: Int): CalendarInterval = - throw new UnsupportedOperationException("getInterval() method is not supported") - override def getStruct(ordinal: Int, numFields: Int): InternalRow = - throw new UnsupportedOperationException("getStruct() method is not supported") - override def getArray(ordinal: Int): ArrayData = - throw new UnsupportedOperationException("getArray() method is not supported") - override def getMap(ordinal: Int): MapData = - throw new UnsupportedOperationException("getMap() method is not supported") - - override def toString(): String = array.mkString("[", ",", "]") -} - -private object GenericRefArrayData { +private object GenericArrayData { // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { case seq: Seq[Any] => seq case array: Array[_] => array.toSeq - case _ => Seq.empty } } -final class GenericRefArrayData(val array: Array[Any]) extends GenericArrayData { +class GenericArrayData(val array: Array[Any], + val booleanArray: Array[Boolean], val byteArray: Array[Byte], val shortArray: Array[Short], + val intArray: Array[Int], val longArray: Array[Long], val floatArray: Array[Float], + val doubleArray: Array[Double]) extends ArrayData { - def this(seq: Seq[Any]) = this(seq.toArray) + def this(seq: Seq[Any]) = this(seq.toArray, null, null, null, null, null, null, null) def this(list: java.util.List[Any]) = this(list.asScala) // TODO: This is boxing. We should specialize. 
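The GenericArrayData.scala rewrite below folds the earlier one-subclass-per-type design into a single class that carries one nullable backing field per primitive type. Exactly one backing field is non-null for any given instance; each accessor consults its specialized field first and falls back to the boxed array otherwise, which lets generated code keep instantiating new GenericArrayData(...) directly while avoiding element boxing. A rough illustration of that invariant, mirroring the assertions in the updated GenericArrayDataSuite:

    // The Array[Double] constructor sets only doubleArray; the boxed
    // storage stays null for this instance.
    val g = new GenericArrayData(Array(1.5, 2.5))
    assert(g.array == null && g.doubleArray != null)
    assert(g.getDouble(1) == 2.5)  // read straight from doubleArray
    assert(!g.isNullAt(0))         // primitive-backed storage never holds null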
- def this(primitiveArray: Array[Int]) = this(primitiveArray.toSeq) - def this(primitiveArray: Array[Long]) = this(primitiveArray.toSeq) - def this(primitiveArray: Array[Float]) = this(primitiveArray.toSeq) - def this(primitiveArray: Array[Double]) = this(primitiveArray.toSeq) - def this(primitiveArray: Array[Short]) = this(primitiveArray.toSeq) - def this(primitiveArray: Array[Byte]) = this(primitiveArray.toSeq) - def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq) - - def this(seqOrArray: Any) = this(GenericRefArrayData.anyToSeq(seqOrArray)) - - override def copy(): GenericRefArrayData = new GenericRefArrayData(array.clone()) + def this(primitiveArray: Array[Boolean]) = + this(null, primitiveArray, null, null, null, null, null, null) + def this(primitiveArray: Array[Byte]) = + this(null, null, primitiveArray, null, null, null, null, null) + def this(primitiveArray: Array[Short]) = + this(null, null, null, primitiveArray, null, null, null, null) + def this(primitiveArray: Array[Int]) = + this(null, null, null, null, primitiveArray, null, null, null) + def this(primitiveArray: Array[Long]) = + this(null, null, null, null, null, primitiveArray, null, null) + def this(primitiveArray: Array[Float]) = + this(null, null, null, null, null, null, primitiveArray, null) + def this(primitiveArray: Array[Double]) = + this(null, null, null, null, null, null, null, primitiveArray) + + def this(primitiveArray: Array[Any]) = this(primitiveArray.toSeq) + + def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) + + override def copy(): ArrayData = { + if (booleanArray != null) new GenericArrayData(booleanArray.clone()) + else if (byteArray != null) new GenericArrayData(byteArray.clone()) + else if (shortArray != null) new GenericArrayData(shortArray.clone()) + else if (intArray != null) new GenericArrayData(intArray.clone()) + else if (longArray != null) new GenericArrayData(longArray.clone()) + else if (floatArray != null) new GenericArrayData(floatArray.clone()) + else if (doubleArray != null) new GenericArrayData(doubleArray.clone()) + else new GenericArrayData(array.clone()) + } - override def numElements(): Int = array.length + override def numElements(): Int = { + if (booleanArray != null) booleanArray.length + else if (byteArray != null) byteArray.length + else if (shortArray != null) shortArray.length + else if (intArray != null) intArray.length + else if (longArray != null) longArray.length + else if (floatArray != null) floatArray.length + else if (doubleArray != null) doubleArray.length + else array.length + } private def getAs[T](ordinal: Int) = array(ordinal).asInstanceOf[T] - override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null override def get(ordinal: Int, elementType: DataType): AnyRef = getAs(ordinal) - override def getBoolean(ordinal: Int): Boolean = getAs(ordinal) - override def getByte(ordinal: Int): Byte = getAs(ordinal) - override def getShort(ordinal: Int): Short = getAs(ordinal) - override def getInt(ordinal: Int): Int = getAs(ordinal) - override def getLong(ordinal: Int): Long = getAs(ordinal) - override def getFloat(ordinal: Int): Float = getAs(ordinal) - override def getDouble(ordinal: Int): Double = getAs(ordinal) + override def getBoolean(ordinal: Int): Boolean = + if (booleanArray != null) booleanArray(ordinal) else getAs(ordinal) + override def getByte(ordinal: Int): Byte = + if (byteArray != null) byteArray(ordinal) else getAs(ordinal) + override def getShort(ordinal: Int): Short = + if (shortArray != null) 
shortArray(ordinal) else getAs(ordinal) + override def getInt(ordinal: Int): Int = + if (intArray != null) intArray(ordinal) else getAs(ordinal) + override def getLong(ordinal: Int): Long = + if (longArray != null) longArray(ordinal) else getAs(ordinal) + override def getFloat(ordinal: Int): Float = + if (floatArray != null) floatArray(ordinal) else getAs(ordinal) + override def getDouble(ordinal: Int): Double = + if (doubleArray != null) doubleArray(ordinal) else getAs(ordinal) override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal) override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal) override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal) @@ -128,14 +107,111 @@ final class GenericRefArrayData(val array: Array[Any]) extends GenericArrayData override def getArray(ordinal: Int): ArrayData = getAs(ordinal) override def getMap(ordinal: Int): MapData = getAs(ordinal) + override def isNullAt(ordinal: Int): Boolean = { + if (booleanArray != null || byteArray != null || shortArray != null || intArray != null || + longArray != null || floatArray != null || doubleArray != null) { + false + } else { + getAs[AnyRef](ordinal) eq null + } + } + + override def toBooleanArray(): Array[Boolean] = { + if (booleanArray != null) { + val len = booleanArray.length + val array = new Array[Boolean](len) + System.arraycopy(booleanArray, 0, array, 0, len) + array + } else { + super.toBooleanArray + } + } + + override def toByteArray(): Array[Byte] = { + if (byteArray != null) { + val len = byteArray.length + val array = new Array[Byte](len) + System.arraycopy(byteArray, 0, array, 0, len) + array + } else { + super.toByteArray + } + } + + override def toShortArray(): Array[Short] = { + if (shortArray != null) { + val len = shortArray.length + val array = new Array[Short](len) + System.arraycopy(shortArray, 0, array, 0, len) + array + } else { + super.toShortArray + } + } + + override def toIntArray(): Array[Int] = { + if (intArray != null) { + val len = intArray.length + val array = new Array[Int](len) + System.arraycopy(intArray, 0, array, 0, len) + array + } else { + super.toIntArray + } + } + + override def toLongArray(): Array[Long] = { + if (longArray != null) { + val len = longArray.length + val array = new Array[Long](len) + System.arraycopy(longArray, 0, array, 0, len) + array + } else { + super.toLongArray + } + } + + override def toFloatArray(): Array[Float] = { + if (floatArray != null) { + val len = floatArray.length + val array = new Array[Float](len) + System.arraycopy(floatArray, 0, array, 0, len) + array + } else { + super.toFloatArray + } + } + + override def toDoubleArray(): Array[Double] = { + if (doubleArray != null) { + val len = doubleArray.length + val array = new Array[Double](len) + System.arraycopy(doubleArray, 0, array, 0, len) + array + } else { + super.toDoubleArray + } + } + + override def toString(): String = { + if (booleanArray != null) booleanArray.mkString("[", ",", "]") + else if (byteArray != null) byteArray.mkString("[", ",", "]") + else if (shortArray != null) shortArray.mkString("[", ",", "]") + else if (intArray != null) intArray.mkString("[", ",", "]") + else if (longArray != null) longArray.mkString("[", ",", "]") + else if (floatArray != null) floatArray.mkString("[", ",", "]") + else if (doubleArray != null) doubleArray.mkString("[", ",", "]") + else array.mkString("[", ",", "]") + } + override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericArrayData]) { + if 
(!o.isInstanceOf[GenericArrayData]) { return false } val other = o.asInstanceOf[GenericArrayData] if (other eq null) { - return false; + return false } val len = numElements() @@ -143,6 +219,26 @@ final class GenericRefArrayData(val array: Array[Any]) extends GenericArrayData return false } + if ((booleanArray != null) && (other.booleanArray != null)) { + return java.util.Arrays.equals(booleanArray, other.booleanArray) + } else if ((byteArray != null) && (other.byteArray != null)) { + return java.util.Arrays.equals(byteArray, other.byteArray) + } else if ((shortArray != null) && (other.shortArray != null)) { + return java.util.Arrays.equals(shortArray, other.shortArray) + } else if ((intArray != null) && (other.intArray != null)) { + return java.util.Arrays.equals(intArray, other.intArray) + } else if ((longArray != null) && (other.longArray != null)) { + return java.util.Arrays.equals(longArray, other.longArray) + } else if ((floatArray != null) && (other.floatArray != null)) { + return java.util.Arrays.equals(floatArray, other.floatArray) + } else if ((doubleArray != null) && (other.doubleArray != null)) { + return java.util.Arrays.equals(doubleArray, other.doubleArray) + } + + if ((array == null) || (other.array == null)) { + return false + } + var i = 0 while (i < len) { if (isNullAt(i) != other.isNullAt(i)) { @@ -176,6 +272,14 @@ final class GenericRefArrayData(val array: Array[Any]) extends GenericArrayData } override def hashCode: Int = { + if (booleanArray != null) return java.util.Arrays.hashCode(booleanArray) + else if (byteArray != null) return java.util.Arrays.hashCode(byteArray) + else if (shortArray != null) return java.util.Arrays.hashCode(shortArray) + else if (intArray != null) return java.util.Arrays.hashCode(intArray) + else if (longArray != null) return java.util.Arrays.hashCode(longArray) + else if (floatArray != null) return java.util.Arrays.hashCode(floatArray) + else if (doubleArray != null) return java.util.Arrays.hashCode(doubleArray) + var result: Int = 37 var i = 0 val len = numElements() @@ -204,227 +308,3 @@ final class GenericRefArrayData(val array: Array[Any]) extends GenericArrayData result } } - -final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericIntArrayData = new GenericIntArrayData(toIntArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getInt(ordinal: Int): Int = primitiveArray(ordinal) - override def toIntArray(): Array[Int] = { - val array = new Array[Int](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericIntArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericIntArrayData] - if (other eq null) { - return false; - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericLongArrayData(val primitiveArray: Array[Long]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericLongArrayData = new GenericLongArrayData(toLongArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getLong(ordinal: Int): Long = primitiveArray(ordinal) - override 
def toLongArray(): Array[Long] = { - val array = new Array[Long](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericLongArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericLongArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericFloatArrayData(val primitiveArray: Array[Float]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericFloatArrayData = new GenericFloatArrayData(toFloatArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getFloat(ordinal: Int): Float = primitiveArray(ordinal) - override def toFloatArray(): Array[Float] = { - val array = new Array[Float](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericFloatArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericFloatArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericDoubleArrayData(val primitiveArray: Array[Double]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(toDoubleArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getDouble(ordinal: Int): Double = primitiveArray(ordinal) - override def toDoubleArray(): Array[Double] = { - val array = new Array[Double](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericDoubleArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericDoubleArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericShortArrayData(val primitiveArray: Array[Short]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericShortArrayData = new GenericShortArrayData(toShortArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getShort(ordinal: Int): Short = primitiveArray(ordinal) - override def toShortArray(): Array[Short] = { - val array = new Array[Short](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericShortArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericShortArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericByteArrayData(val primitiveArray: Array[Byte]) - extends GenericArrayData { - override 
def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericByteArrayData = new GenericByteArrayData(toByteArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getByte(ordinal: Int): Byte = primitiveArray(ordinal) - override def toByteArray(): Array[Byte] = { - val array = new Array[Byte](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericByteArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericByteArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(toBooleanArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getBoolean(ordinal: Int): Boolean = primitiveArray(ordinal) - override def toBooleanArray(): Array[Boolean] = { - val array = new Array[Boolean](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericBooleanArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericBooleanArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index ded074c5abc7..43b6afd9ad89 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -287,8 +287,8 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(serializer.children.head.isInstanceOf[Literal]) assert(serializer.children.head.asInstanceOf[Literal].value === UTF8String.fromString("value")) assert(serializer.children.last.isInstanceOf[NewInstance]) - assert(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData] - isAssignableFrom(serializer.children.last.asInstanceOf[NewInstance].cls)) + assert(serializer.children.last.asInstanceOf[NewInstance] + .cls.isAssignableFrom(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData])) } private val dataTypeForComplexData = dataTypeFor[ComplexData] diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 6e59215dc8c6..21afe9fec594 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -57,8 +57,8 @@ private[sql] class UngroupableUDT extends UserDefinedType[UngroupableData] { override def sqlType: DataType = MapType(IntegerType, IntegerType) override def serialize(ungroupableData: UngroupableData): MapData 
= { - val keyArray = GenericArrayData.allocate(ungroupableData.data.keys.toSeq) - val valueArray = GenericArrayData.allocate(ungroupableData.data.values.toSeq) + val keyArray = new GenericArrayData(ungroupableData.data.keys.toSeq) + val valueArray = new GenericArrayData(ungroupableData.data.values.toSeq) new ArrayBasedMapData(keyArray, valueArray) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala index 49934354e5de..802397d50e85 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala @@ -70,11 +70,11 @@ class EncoderResolutionSuite extends PlanTest { val bound = encoder.resolveAndBind(attrs) // If no null values appear, it should work fine - bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, 2)))) + bound.fromRow(InternalRow(new GenericArrayData(Array(1, 2)))) // If there is a null value, it should throw a runtime exception val e = intercept[RuntimeException] { - bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, null)))) + bound.fromRow(InternalRow(new GenericArrayData(Array(1, null)))) } assert(e.getMessage.contains("Null value appeared in non-nullable field")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index 46575f7d63eb..1a5569a77dc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -51,7 +51,7 @@ class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - GenericArrayData.allocate(output) + new GenericArrayData(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index ab14981ad184..3edcc02f1526 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -39,7 +39,7 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) - val structExpected = GenericArrayData.allocate( + val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) @@ -47,8 +47,8 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) - val arrayExpected = GenericArrayData.allocate( - Array(GenericArrayData.allocate(Array(1, 2)), GenericArrayData.allocate(Array(3, 4)))) + val arrayExpected = new
GenericArrayData( + Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) @@ -56,13 +56,13 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) - val mapExpected = GenericArrayData.allocate(Seq( + val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( - GenericArrayData.allocate(Array(1, 2)), - GenericArrayData.allocate(Array(100, 200))), + new GenericArrayData(Array(1, 2)), + new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( - GenericArrayData.allocate(Array(3, 4)), - GenericArrayData.allocate(Array(300, 400))))) + new GenericArrayData(Array(3, 4)), + new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index b634834c67e3..cf3cbe270753 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -291,7 +291,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(unsafeRow.getSizeInBytes == 8 + 2 * 8 + row1.getSizeInBytes + row2.getSizeInBytes) } - private def createArray(values: Any*): ArrayData = GenericArrayData.allocate(values.toArray) + private def createArray(values: Any*): ArrayData = new GenericArrayData(values.toArray) private def createMap(keys: Any*)(values: Any*): MapData = { assert(keys.length == values.length) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSuite.scala index c7c386b5b838..0653302e09c0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSuite.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.{UnsafeArrayData, UnsafeRow} +import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.unsafe.Platform class BufferHolderSuite extends SparkFunSuite { @@ -36,4 +38,190 @@ class BufferHolderSuite extends SparkFunSuite { } assert(e.getMessage.contains("exceeds size limitation")) } + + def performUnsafeArrayWriter(length: Int, elementSize: Int, f: (UnsafeArrayWriter) => Unit): + UnsafeArrayData = { + val unsafeRow = new UnsafeRow(1) + val unsafeArrayWriter = new UnsafeArrayWriter + val bufferHolder = new BufferHolder(unsafeRow, 32) + bufferHolder.reset() + val cursor = bufferHolder.cursor + unsafeArrayWriter.initialize(bufferHolder, length, elementSize) + // invoke the UnsafeArrayWriter write calls under test via f() + f(unsafeArrayWriter) + + val unsafeArray = new UnsafeArrayData + unsafeArray.pointTo(bufferHolder.buffer, cursor.toLong, bufferHolder.cursor - cursor) +
assert(unsafeArray.numElements() == length) + unsafeArray + } + + def initializeUnsafeArrayData(data: Seq[Any], elementSize: Int): UnsafeArrayData = { + val length = data.length + val unsafeArray = new UnsafeArrayData + val headerSize = UnsafeArrayData.calculateHeaderPortionInBytes(length) + val size = headerSize + elementSize * length + val buffer = new Array[Byte](size) + Platform.putInt(buffer, Platform.BYTE_ARRAY_OFFSET, length) + unsafeArray.pointTo(buffer, Platform.BYTE_ARRAY_OFFSET, size) + assert(unsafeArray.numElements == length) + data.zipWithIndex.map { case (e, i) => + val offset = Platform.BYTE_ARRAY_OFFSET + headerSize + elementSize * i + e match { + case _ : Boolean => Platform.putBoolean(buffer, offset, e.asInstanceOf[Boolean]) + case _ : Byte => Platform.putByte(buffer, offset, e.asInstanceOf[Byte]) + case _ : Short => Platform.putShort(buffer, offset, e.asInstanceOf[Short]) + case _ : Int => Platform.putInt(buffer, offset, e.asInstanceOf[Int]) + case _ : Long => Platform.putLong(buffer, offset, e.asInstanceOf[Long]) + case _ : Float => Platform.putFloat(buffer, offset, e.asInstanceOf[Float]) + case _ : Double => Platform.putDouble(buffer, offset, e.asInstanceOf[Double]) + case _ => throw new UnsupportedOperationException() + } + } + unsafeArray + } + + val booleanData = Seq(true, false) + val byteData = Seq(0.toByte, 1.toByte, Byte.MaxValue, Byte.MinValue) + val shortData = Seq(0.toShort, 1.toShort, Short.MaxValue, Short.MinValue) + val intData = Seq(0, 1, -1, Int.MaxValue, Int.MinValue) + val longData = Seq(0.toLong, 1.toLong, -1.toLong, Long.MaxValue, Long.MinValue) + val floatData = Seq(0.toFloat, 1.1.toFloat, -1.1.toFloat, Float.MaxValue, Float.MinValue) + val doubleData = Seq(0.toDouble, 1.1.toDouble, -1.1.toDouble, Double.MaxValue, Double.MinValue) + + test("UnsafeArrayDataWriter write") { + val boolUnsafeArray = performUnsafeArrayWriter(booleanData.length, 1, + (writer: UnsafeArrayWriter) => booleanData.zipWithIndex.map { + case (e, i) => writer.write(i, e) }) + booleanData.zipWithIndex.map { case (e, i) => assert(boolUnsafeArray.getBoolean(i) == e) } + + val byteUnsafeArray = performUnsafeArrayWriter(byteData.length, 1, + (writer: UnsafeArrayWriter) => byteData.zipWithIndex.map { + case (e, i) => writer.write(i, e) }) + byteData.zipWithIndex.map { case (e, i) => assert(byteUnsafeArray.getByte(i) == e) } + + val shortUnsafeArray = performUnsafeArrayWriter(shortData.length, 2, + (writer: UnsafeArrayWriter) => shortData.zipWithIndex.map { + case (e, i) => writer.write(i, e) }) + shortData.zipWithIndex.map { case (e, i) => assert(shortUnsafeArray.getShort(i) == e) } + + val intUnsafeArray = performUnsafeArrayWriter(intData.length, 4, + (writer: UnsafeArrayWriter) => intData.zipWithIndex.map { + case (e, i) => writer.write(i, e) }) + intData.zipWithIndex.map { case (e, i) => assert(intUnsafeArray.getInt(i) == e) } + + val longUnsafeArray = performUnsafeArrayWriter(longData.length, 8, + (writer: UnsafeArrayWriter) => longData.zipWithIndex.map { + case (e, i) => writer.write(i, e) }) + longData.zipWithIndex.map { case (e, i) => assert(longUnsafeArray.getLong(i) == e) } + + val floatUnsafeArray = performUnsafeArrayWriter(floatData.length, 8, + (writer: UnsafeArrayWriter) => floatData.zipWithIndex.map { + case (e, i) => writer.write(i, e) }) + floatData.zipWithIndex.map { case (e, i) => assert(floatUnsafeArray.getFloat(i) == e) } + + val doubleUnsafeArray = performUnsafeArrayWriter(doubleData.length, 8, + (writer: UnsafeArrayWriter) => doubleData.zipWithIndex.map { + case 
(e, i) => writer.write(i, e) }) + doubleData.zipWithIndex.map { case (e, i) => assert(doubleUnsafeArray.getDouble(i) == e) } + } + + test("toPrimitiveArray") { + val booleanUnsafeArray = initializeUnsafeArrayData(booleanData, 1) + booleanUnsafeArray.toBooleanArray(). + zipWithIndex.map { case (e, i) => assert(e == booleanData(i)) } + + val byteUnsafeArray = initializeUnsafeArrayData(byteData, 1) + byteUnsafeArray.toByteArray().zipWithIndex.map { case (e, i) => assert(e == byteData(i)) } + + val shortUnsafeArray = initializeUnsafeArrayData(shortData, 2) + shortUnsafeArray.toShortArray().zipWithIndex.map { case (e, i) => assert(e == shortData(i)) } + + val intUnsafeArray = initializeUnsafeArrayData(intData, 4) + intUnsafeArray.toIntArray().zipWithIndex.map { case (e, i) => assert(e == intData(i)) } + + val longUnsafeArray = initializeUnsafeArrayData(longData, 8) + longUnsafeArray.toLongArray().zipWithIndex.map { case (e, i) => assert(e == longData(i)) } + + val floatUnsafeArray = initializeUnsafeArrayData(floatData, 4) + floatUnsafeArray.toFloatArray().zipWithIndex.map { case (e, i) => assert(e == floatData(i)) } + + val doubleUnsafeArray = initializeUnsafeArrayData(doubleData, 8) + doubleUnsafeArray.toDoubleArray(). + zipWithIndex.map { case (e, i) => assert(e == doubleData(i)) } + } + + test("fromPrimitiveArray") { + val booleanArray = booleanData.toArray + val booleanUnsafeArray = UnsafeArrayData.fromPrimitiveArray(booleanArray) + booleanArray.zipWithIndex.map { case (e, i) => assert(booleanUnsafeArray.getBoolean(i) == e) } + + val byteArray = byteData.toArray + val byteUnsafeArray = UnsafeArrayData.fromPrimitiveArray(byteArray) + byteArray.zipWithIndex.map { case (e, i) => assert(byteUnsafeArray.getByte(i) == e) } + + val shortArray = shortData.toArray + val shortUnsafeArray = UnsafeArrayData.fromPrimitiveArray(shortArray) + shortArray.zipWithIndex.map { case (e, i) => assert(shortUnsafeArray.getShort(i) == e) } + + val intArray = intData.toArray + val intUnsafeArray = UnsafeArrayData.fromPrimitiveArray(intArray) + intArray.zipWithIndex.map { case (e, i) => assert(intUnsafeArray.getInt(i) == e) } + + val longArray = longData.toArray + val longUnsafeArray = UnsafeArrayData.fromPrimitiveArray(longArray) + longArray.zipWithIndex.map { case (e, i) => assert(longUnsafeArray.getLong(i) == e) } + + val floatArray = floatData.toArray + val floatUnsafeArray = UnsafeArrayData.fromPrimitiveArray(floatArray) + floatArray.zipWithIndex.map { case (e, i) => assert(floatUnsafeArray.getFloat(i) == e) } + + val doubleArray = doubleData.toArray + val doubleUnsafeArray = UnsafeArrayData.fromPrimitiveArray(doubleArray) + doubleArray.zipWithIndex.map { case (e, i) => assert(doubleUnsafeArray.getDouble(i) == e) } + } + + test("writePrimitiveArray") { + val booleanArray = booleanData.toArray + val booleanUnsafeArray = performUnsafeArrayWriter(booleanArray.length, 4, + (writer: UnsafeArrayWriter) => + writer.writePrimitiveBooleanArray(new GenericArrayData(booleanArray))) + booleanArray.zipWithIndex.map { case (e, i) => assert(booleanUnsafeArray.getBoolean(i) == e) } + + val byteArray = byteData.toArray + val byteUnsafeArray = performUnsafeArrayWriter(byteArray.length, 4, + (writer: UnsafeArrayWriter) => + writer.writePrimitiveByteArray(new GenericArrayData(byteArray))) + byteArray.zipWithIndex.map { case (e, i) => assert(byteUnsafeArray.getByte(i) == e) } + + val shortArray = shortData.toArray + val shortUnsafeArray = performUnsafeArrayWriter(shortArray.length, 4, + (writer: UnsafeArrayWriter) => + 
writer.writePrimitiveShortArray(new GenericArrayData(shortArray))) + shortArray.zipWithIndex.map { case (e, i) => assert(shortUnsafeArray.getShort(i) == e) } + + val intArray = intData.toArray + val intUnsafeArray = performUnsafeArrayWriter(intArray.length, 4, + (writer: UnsafeArrayWriter) => + writer.writePrimitiveIntArray(new GenericArrayData(intArray))) + intArray.zipWithIndex.map { case (e, i) => assert(intUnsafeArray.getInt(i) == e) } + + val longArray = longData.toArray + val longUnsafeArray = performUnsafeArrayWriter(longArray.length, 8, + (writer: UnsafeArrayWriter) => + writer.writePrimitiveLongArray(new GenericArrayData(longArray))) + longArray.zipWithIndex.map { case (e, i) => assert(longUnsafeArray.getLong(i) == e) } + + val floatArray = floatData.toArray + val floatUnsafeArray = performUnsafeArrayWriter(floatArray.length, 4, + (writer: UnsafeArrayWriter) => + writer.writePrimitiveFloatArray(new GenericArrayData(floatArray))) + floatArray.zipWithIndex.map { case (e, i) => assert(floatUnsafeArray.getFloat(i) == e) } + + val doubleArray = doubleData.toArray + val doubleUnsafeArray = performUnsafeArrayWriter(doubleArray.length, 8, + (writer: UnsafeArrayWriter) => + writer.writePrimitiveDoubleArray(new GenericArrayData(doubleArray))) + doubleArray.zipWithIndex.map { case (e, i) => assert(doubleUnsafeArray.getDouble(i) == e) } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index d6c9a9c0b638..b69b74b4240b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -86,7 +86,7 @@ class GeneratedProjectionSuite extends SparkFunSuite { test("generated unsafe projection with array of binary") { val row = InternalRow( Array[Byte](1, 2), - GenericArrayData.allocate(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) + new GenericArrayData(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) val fields = (BinaryType :: ArrayType(BinaryType) :: Nil).toArray[DataType] val unsafeProj = UnsafeProjection.create(fields) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala index 8f16fa17fc71..4c4c5d6b4e9f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataSuite.scala @@ -1,108 +1,187 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.util - -import org.apache.spark.SparkFunSuite - import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData - -class GenericArrayDataSuite extends SparkFunSuite { - - test("from primitive boolean array") { - val primitiveArray = Array(true, false, true) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) == false) - assert(array.getBoolean(0) == primitiveArray(0)) - assert(array.getBoolean(1) == primitiveArray(1)) - assert(array.getBoolean(2) == primitiveArray(2)) - assert(array.toBooleanArray()(0) == primitiveArray(0)) - } - - test("from primitive byte array") { - val primitiveArray = Array(1.toByte, 10.toByte, 100.toByte) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) == false) - assert(array.getByte(0) == primitiveArray(0)) - assert(array.getByte(1) == primitiveArray(1)) - assert(array.getByte(2) == primitiveArray(2)) - assert(array.toByteArray()(0) == primitiveArray(0)) - } - - test("from primitive short array") { - val primitiveArray = Array[Short](1.toShort, 100.toShort, 10000.toShort) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) == false) - assert(array.getShort(0) == primitiveArray(0)) - assert(array.getShort(1) == primitiveArray(1)) - assert(array.getShort(2) == primitiveArray(2)) - assert(array.toShortArray()(0) == primitiveArray(0)) - } - - test("from primitive int array") { - val primitiveArray = Array(1, 1000, 1000000) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) == false) - assert(array.getInt(0) == primitiveArray(0)) - assert(array.getInt(1) == primitiveArray(1)) - assert(array.getInt(2) == primitiveArray(2)) - assert(array.toIntArray()(0) == primitiveArray(0)) - } - - test("from primitive long array") { - val primitiveArray = Array(1L, 100000L, 10000000000L) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) == false) - assert(array.getLong(0) == primitiveArray(0)) - assert(array.getLong(1) == primitiveArray(1)) - assert(array.getLong(2) == primitiveArray(2)) - assert(array.toLongArray()(0) == primitiveArray(0)) - } - - test("from primitive float array") { - val primitiveArray = Array(1.1f, 2.2f, 3.3f) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) == false) - assert(array.getFloat(0) == primitiveArray(0)) - assert(array.getFloat(1) == primitiveArray(1)) - assert(array.getFloat(2) == primitiveArray(2)) - assert(array.toFloatArray()(0) == primitiveArray(0)) - } - - test("from primitive double array") { - val primitiveArray = Array(1.1, 2.2, 3.3) - val array = GenericArrayData.allocate(primitiveArray) - assert(array.isInstanceOf[GenericArrayData]) - assert(array.numElements == primitiveArray.length) - assert(array.isNullAt(0) 
== false) - assert(array.getDouble(0) == primitiveArray(0)) - assert(array.getDouble(1) == primitiveArray(1)) - assert(array.getDouble(2) == primitiveArray(2)) - assert(array.toDoubleArray()(0) == primitiveArray(0)) - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData + +class GenericArrayDataSuite extends SparkFunSuite { + + test("equals/hash") { + val booleanPrimitiveArray = Array(true, false, true) + val booleanArray1 = new GenericArrayData(booleanPrimitiveArray) + val booleanArray2 = new GenericArrayData(booleanPrimitiveArray) + val anyBooleanArray = new GenericArrayData(booleanPrimitiveArray.toArray[Any]) + assert(booleanArray1.equals(booleanArray2)) + assert(!booleanArray1.equals(anyBooleanArray)) + assert(booleanArray1.hashCode == booleanArray2.hashCode) + assert(booleanArray1.hashCode != anyBooleanArray.hashCode) + + val bytePrimitiveArray = Array(1.toByte, 10.toByte, 100.toByte) + val byteArray1 = new GenericArrayData(bytePrimitiveArray) + val byteArray2 = new GenericArrayData(bytePrimitiveArray) + val anyByteArray = new GenericArrayData(bytePrimitiveArray.toArray[Any]) + assert(byteArray1.equals(byteArray2)) + assert(!byteArray1.equals(anyByteArray)) + assert(byteArray1.hashCode == byteArray2.hashCode) + assert(byteArray1.hashCode != anyByteArray.hashCode) + + val shortPrimitiveArray = Array[Short](1.toShort, 100.toShort, 10000.toShort) + val shortArray1 = new GenericArrayData(shortPrimitiveArray) + val shortArray2 = new GenericArrayData(shortPrimitiveArray) + val anyShortArray = new GenericArrayData(shortPrimitiveArray.toArray[Any]) + assert(shortArray1.equals(shortArray2)) + assert(!shortArray1.equals(anyShortArray)) + assert(shortArray1.hashCode == shortArray2.hashCode) + assert(shortArray1.hashCode != anyShortArray.hashCode) + + val intPrimitiveArray = Array(1, 1000, 1000000) + val intArray1 = new GenericArrayData(intPrimitiveArray) + val intArray2 = new GenericArrayData(intPrimitiveArray) + val anyIntArray = new GenericArrayData(intPrimitiveArray.toArray[Any]) + assert(intArray1.equals(intArray2)) + assert(!intArray1.equals(anyIntArray)) + assert(intArray1.hashCode == intArray2.hashCode) + assert(intArray2.hashCode != anyIntArray.hashCode) + + val longPrimitiveArray = Array(1L, 100000L, 10000000000L) + val longArray1 = new GenericArrayData(longPrimitiveArray) + val longArray2 = new GenericArrayData(longPrimitiveArray) + val anyLongArray = new GenericArrayData(longPrimitiveArray.toArray[Any]) + assert(longArray1.equals(longArray2)) + assert(!longArray1.equals(anyLongArray)) + assert(longArray1.hashCode == longArray2.hashCode) + assert(longArray1.hashCode != 
anyLongArray.hashCode) + + val floatPrimitiveArray = Array(1.1f, 2.2f, 3.3f) + val floatArray1 = new GenericArrayData(floatPrimitiveArray) + val floatArray2 = new GenericArrayData(floatPrimitiveArray) + val anyFloatArray = new GenericArrayData(floatPrimitiveArray.toArray[Any]) + assert(floatArray1.equals(floatArray2)) + assert(!floatArray1.equals(anyFloatArray)) + assert(floatArray1.hashCode == floatArray2.hashCode) + assert(floatArray1.hashCode != anyFloatArray.hashCode) + + val doublePrimitiveArray = Array(1.1, 2.2, 3.3) + val doubleArray1 = new GenericArrayData(doublePrimitiveArray) + val doubleArray2 = new GenericArrayData(doublePrimitiveArray) + val anyDoubleArray = new GenericArrayData(doublePrimitiveArray.toArray[Any]) + assert(doubleArray1.equals(doubleArray2)) + assert(!doubleArray1.equals(anyDoubleArray)) + assert(doubleArray1.hashCode == doubleArray2.hashCode) + assert(doubleArray1.hashCode != anyDoubleArray.hashCode) + } + + test("from primitive boolean array") { + val primitiveArray = Array(true, false, true) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.array == null) + assert(array.booleanArray != null) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getBoolean(0) == primitiveArray(0)) + assert(array.getBoolean(1) == primitiveArray(1)) + assert(array.getBoolean(2) == primitiveArray(2)) + assert(array.toBooleanArray()(0) == primitiveArray(0)) + } + + test("from primitive byte array") { + val primitiveArray = Array(1.toByte, 10.toByte, 100.toByte) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.array == null) + assert(array.byteArray != null) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getByte(0) == primitiveArray(0)) + assert(array.getByte(1) == primitiveArray(1)) + assert(array.getByte(2) == primitiveArray(2)) + assert(array.toByteArray()(0) == primitiveArray(0)) + } + + test("from primitive short array") { + val primitiveArray = Array[Short](1.toShort, 100.toShort, 10000.toShort) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.array == null) + assert(array.shortArray != null) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getShort(0) == primitiveArray(0)) + assert(array.getShort(1) == primitiveArray(1)) + assert(array.getShort(2) == primitiveArray(2)) + assert(array.toShortArray()(0) == primitiveArray(0)) + } + + test("from primitive int array") { + val primitiveArray = Array(1, 1000, 1000000) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.array == null) + assert(array.intArray != null) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getInt(0) == primitiveArray(0)) + assert(array.getInt(1) == primitiveArray(1)) + assert(array.getInt(2) == primitiveArray(2)) + assert(array.toIntArray()(0) == primitiveArray(0)) + } + + test("from primitive long array") { + val primitiveArray = Array(1L, 100000L, 10000000000L) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.array == null) + assert(array.longArray != null) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getLong(0) == 
primitiveArray(0)) + assert(array.getLong(1) == primitiveArray(1)) + assert(array.getLong(2) == primitiveArray(2)) + assert(array.toLongArray()(0) == primitiveArray(0)) + } + + test("from primitive float array") { + val primitiveArray = Array(1.1f, 2.2f, 3.3f) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.array == null) + assert(array.floatArray != null) + assert(array.numElements == primitiveArray.length) + assert(array.isNullAt(0) == false) + assert(array.getFloat(0) == primitiveArray(0)) + assert(array.getFloat(1) == primitiveArray(1)) + assert(array.getFloat(2) == primitiveArray(2)) + assert(array.toFloatArray()(0) == primitiveArray(0)) + } + + test("from primitive double array") { + val primitiveArray = Array(1.1, 2.2, 3.3) + val array = new GenericArrayData(primitiveArray) + assert(array.isInstanceOf[GenericArrayData]) + assert(array.numElements == primitiveArray.length) + assert(array.array == null) + assert(array.doubleArray != null) + assert(array.isNullAt(0) == false) + assert(array.getDouble(0) == primitiveArray(0)) + assert(array.getDouble(1) == primitiveArray(1)) + assert(array.getDouble(2) == primitiveArray(2)) + assert(array.toDoubleArray()(0) == primitiveArray(0)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 27beccb0c4d2..41edb6511c2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -425,7 +425,7 @@ object JdbcUtils extends Logging { (rs: ResultSet, row: InternalRow, pos: Int) => val array = nullSafeConvert[Object]( rs.getArray(pos + 1).getArray, - array => GenericArrayData.allocate(elementConversion.apply(array))) + array => new GenericArrayData(elementConversion.apply(array))) row.update(pos, array) case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 108977c23ec3..33dcf2f3fd16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -491,7 +491,7 @@ private[parquet] class ParquetRowConverter( override def getConverter(fieldIndex: Int): Converter = elementConverter - override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) + override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the // next value. 
`Row.copy()` only copies row cells, it doesn't do deep copy to objects stored @@ -590,7 +590,7 @@ private[parquet] class ParquetRowConverter( protected def newArrayUpdater(updater: ParentContainerUpdater) = new ParentContainerUpdater { override def start(): Unit = currentArray = ArrayBuffer.empty[Any] - override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) + override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) override def set(value: Any): Unit = currentArray += value } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 0e496dfd29e8..46fd54e5c742 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -119,10 +119,10 @@ object EvaluatePython { case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c case (c: java.util.List[_], ArrayType(elementType, _)) => - GenericArrayData.allocate(c.asScala.map { e => fromJava(e, elementType)}.toArray) + new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) case (c, ArrayType(elementType, _)) if c.getClass.isArray => - GenericArrayData.allocate(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => ArrayBasedMapData( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index 6a8a5e060fd8..a73e4272950a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -49,7 +49,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - GenericArrayData.allocate(output) + new GenericArrayData(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala index 1230b921aa27..7d7ded7d1cad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala @@ -27,6 +27,19 @@ import org.apache.spark.sql.test.SharedSQLContext class DataFrameComplexTypeSuite extends QueryTest with SharedSQLContext { import testImplicits._ + test("primitive type and null on array") { + val rows = sparkContext.parallelize(Seq(1, 2), 1).toDF("v"). + selectExpr("Array(v + 2, null, v + 3)") + checkAnswer(rows, Seq(Row(Array(3, null, 4)), Row(Array(4, null, 5)))) + } + + test("array with null on array") { + val rows = sparkContext.parallelize(Seq(1, 2), 1).toDF("v"). 
+ selectExpr("Array(Array(v, v + 1)," + + "null," + + "Array(v, v - 1))").collect + } + test("UDF on struct") { val f = udf((a: String) => a) val df = sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index c002dfcf4908..a32763db054f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -160,7 +160,7 @@ class UnsafeRowSuite extends SparkFunSuite { } test("calling hashCode on unsafe array returned by getArray(ordinal)") { - val row = InternalRow.apply(GenericArrayData.allocate(Array(1L))) + val row = InternalRow.apply(new GenericArrayData(Array(1L))) val unsafeRow = UnsafeProjection.create(Array[DataType](ArrayType(LongType))).apply(row) // Makes sure hashCode on unsafe array won't crash unsafeRow.getArray(0).hashCode() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 17ec9315e4a6..474f17ff7afb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -50,7 +50,7 @@ object UDT { override def sqlType: DataType = ArrayType(DoubleType, containsNull = false) override def serialize(features: MyDenseVector): ArrayData = { - GenericArrayData.allocate(features.data.map(_.asInstanceOf[Any])) + new GenericArrayData(features.data.map(_.asInstanceOf[Any])) } override def deserialize(datum: Any): MyDenseVector = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala index e590d2833477..686c8fa6f5fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala @@ -56,7 +56,7 @@ object ColumnarTestUtils { case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => - GenericArrayData.allocate(Array[Any](Random.nextInt(), Random.nextInt())) + new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 7002b6437611..e303065127c3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -481,7 +481,7 @@ private[hive] trait HiveInspectors { val values = li.getWritableConstantValue.asScala .map(unwrapper) .toArray - val constant = GenericArrayData.allocate(values) + val constant = new GenericArrayData(values) _ => constant case poi: VoidObjectInspector => _ => null // always be null for void object inspector @@ -637,7 +637,7 @@ private[hive] trait HiveInspectors { Option(li.getList(data)) .map { l => val values = l.asScala.map(unwrapper).toArray - GenericArrayData.allocate(values) + new GenericArrayData(values) } .orNull } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index 8cadaeedea69..3de1f4aeb74d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -229,7 +229,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { test("wrap / unwrap Array Type") { val dt = ArrayType(dataTypes(0)) - val d = GenericArrayData.allocate(Array(row(0), row(0))) + val d = new GenericArrayData(Array(row(0), row(0))) checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt))) checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt))) checkValue(d, From e541f35ebd37f39e95445a3dc408196d94afbd95 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 15:57:12 +0900 Subject: [PATCH 31/75] update benchmark programs --- .../benchmark/GenericArrayDataBenchmark.scala | 128 ++++++------------ .../benchmark/PrimitiveArrayBenchmark.scala | 32 +++++ 2 files changed, 73 insertions(+), 87 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index de3084f6b42c..38bdbf9e448e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -40,14 +40,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val specializedIntArray = { i: Int => var n = 0 while (n < iters) { - array = GenericArrayData.allocate(primitiveIntArray) + array = new GenericArrayData(primitiveIntArray) n += 1 } } + val anyArray = primitiveIntArray.toArray[Any] val genericIntArray = { i: Int => var n = 0 while (n < iters) { - array = new GenericRefArrayData(primitiveIntArray) + array = new GenericArrayData(anyArray) n += 1 } } @@ -57,14 +58,6 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericIntArray) benchmark.addCase("Specialized")(specializedIntArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 40 / 43 522.2 1.9 1.0X - Specialized 0 / 0 209715200.0 0.0 401598.7X - */ } def allocateGenericDoubleArray(iters: Int): Unit = { @@ -75,14 +68,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { - array = GenericArrayData.allocate(primitiveDoubleArray) + array = new GenericArrayData(primitiveDoubleArray) n += 1 } } + val anyArray = primitiveDoubleArray.toArray[Any] val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { - array = new GenericRefArrayData(primitiveDoubleArray) + array = new GenericArrayData(anyArray) n += 1 } } @@ -92,99 +86,75 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - Generic 40 / 44 523.2 1.9 1.0X - Specialized 0 / 0 225500215.1 0.0 431013.0X - */ } def getPrimitiveIntArray(iters: Int): Unit = { - val count = 1024 * 1024 + val count = 1024 * 1024 * 8 - val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) - val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) + val anyArray: GenericArrayData = new GenericArrayData(new Array[Int](count).toArray[Any]) + val intArray: GenericArrayData = new GenericArrayData(new Array[Int](count)) var primitiveIntArray: Array[Int] = null val genericIntArray = { i: Int => var n = 0 while (n < iters) { - primitiveIntArray = intSparseArray.toIntArray + primitiveIntArray = anyArray.toIntArray n += 1 } } - val denseIntArray = { i: Int => + val specializedIntArray = { i: Int => var n = 0 while (n < iters) { - primitiveIntArray = intDenseArray.toIntArray + primitiveIntArray = intArray.toIntArray n += 1 } } val benchmark = new Benchmark("Get int primitive array", count * iters) - benchmark.addCase("Generic ")(genericIntArray) - benchmark.addCase("Specialized")(denseIntArray) + benchmark.addCase("Generic")(genericIntArray) + benchmark.addCase("Specialized")(specializedIntArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 67 / 70 783.9 1.3 1.0X - Specialized 41 / 43 1263.8 0.8 1.6X - */ } def getPrimitiveDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 + val count = 1024 * 1024 * 8 - val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) - val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) + val anyArray: GenericArrayData = new GenericArrayData(new Array[Double](count).toArray[Any]) + val doubleArray: GenericArrayData = new GenericArrayData(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { - primitiveDoubleArray = doubleSparseArray.toDoubleArray + primitiveDoubleArray = anyArray.toDoubleArray n += 1 } } val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { - primitiveDoubleArray = doubleDenseArray.toDoubleArray + primitiveDoubleArray = doubleArray.toDoubleArray n += 1 } } val benchmark = new Benchmark("Get double primitive array", count * iters) - benchmark.addCase("Generic ")(genericDoubleArray) + benchmark.addCase("Generic")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 211 / 217 248.6 4.0 1.0X - Specialized 95 / 100 554.1 1.8 2.2X - */ } def readGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 2 + val count = 1024 * 1024 * 8 var result: Int = 0 - val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val anyArray = new GenericArrayData(new Array[Int](count).toArray[Any]) val genericIntArray = { i: Int => var n = 0 while (n < iters) { - val len 
= sparseArray.numElements + val len = anyArray.numElements var sum = 0 var i = 0 while (i < len) { - sum += sparseArray.getInt(i) + sum += anyArray.getInt(i) i += 1 } result = sum @@ -192,15 +162,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } } - val denseArray = GenericArrayData.allocate(new Array[Int](count)) - val denseIntArray = { i: Int => + val intArray = new GenericArrayData(new Array[Int](count)) + val specializedIntArray = { i: Int => var n = 0 while (n < iters) { - val len = denseArray.numElements + val len = intArray.numElements var sum = 0 var i = 0 while (i < len) { - sum += denseArray.getInt(i) + sum += intArray.getInt(i) i += 1 } result = sum @@ -209,32 +179,24 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) - benchmark.addCase("Sparse")(genericIntArray) - benchmark.addCase("Dense ")(denseIntArray) + benchmark.addCase("Generic")(genericIntArray) + benchmark.addCase("Specialized")(specializedIntArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 160 / 163 1314.5 0.8 1.0X - Specialized 68 / 69 3080.0 0.3 2.3X - */ } def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 2 + val count = 1024 * 1024 *8 var result: Double = 0 - val sparseArray = new GenericRefArrayData(new Array[Double](count)) + val anyArray = new GenericArrayData(new Array[Double](count).toArray[Any]) val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { - val len = sparseArray.numElements + val len = anyArray.numElements var sum = 0.toDouble var i = 0 while (i < len) { - sum += sparseArray.getDouble(i) + sum += anyArray.getDouble(i) i += 1 } result = sum @@ -242,15 +204,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } } - val denseArray = GenericArrayData.allocate(new Array[Double](count)) + val doubleArray = new GenericArrayData(new Array[Double](count)) val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { - val len = denseArray.numElements + val len = doubleArray.numElements var sum = 0.toDouble var i = 0 while (i < len) { - sum += denseArray.getDouble(i) + sum += doubleArray.getDouble(i) i += 1 } result = sum @@ -262,14 +224,6 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 611 / 613 343.3 2.9 1.0X - Specialized 199 / 202 1051.5 1.0 3.1X - */ } ignore("allocate GenericArrayData") { @@ -278,12 +232,12 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } ignore("get primitive array") { - getPrimitiveIntArray(50) - getPrimitiveDoubleArray(50) + getPrimitiveIntArray(20) + getPrimitiveDoubleArray(20) } ignore("read elements in GenericArrayData") { - readGenericIntArray(100) - readGenericDoubleArray(100) + readGenericIntArray(25) + readGenericDoubleArray(25) } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index e7c8f2717fd7..f50318469108 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -79,4 +79,36 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { ignore("Write an array in Dataset") { writeDatasetArray(4) } + + def writeArray(iters: Int): Unit = { + import sparkSession.implicits._ + + val iters = 5 + val n = 1024 * 1024 + val rows = 15 + + val benchmark = new Benchmark("Read primitive array", n) + + val intDF = sparkSession.sparkContext.parallelize(0 until rows, 1) + .map(i => Array.tabulate(n)(i => i)).toDF() + intDF.count() // force to create df + + benchmark.addCase(s"Write int array in DataFrame", numIters = iters)(iter => { + intDF.selectExpr("value as a").collect + }) + + val doubleDF = sparkSession.sparkContext.parallelize(0 until rows, 1) + .map(i => Array.tabulate(n)(i => i.toDouble)).toDF() + doubleDF.count() // force to create df + + benchmark.addCase(s"Write double array in DataFrame", numIters = iters)(iter => { + doubleDF.selectExpr("value as a").collect + }) + + benchmark.run() + } + + ignore("Write an array in DataFrame") { + writeArray(1) + } } From 22d310bc1d34dae4c4c7550b0089b57bd419ea25 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 16:08:54 +0900 Subject: [PATCH 32/75] fix scala style error --- .../sql/execution/benchmark/GenericArrayDataBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 38bdbf9e448e..7bbf01277cac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -185,7 +185,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 *8 + val count = 1024 * 1024 * 8 var result: Double = 0 val anyArray = new GenericArrayData(new Array[Double](count).toArray[Any]) From 8684ad66448959dc58f57925bd7abdce906645ff Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 17:53:20 +0900 Subject: [PATCH 33/75] fix test failure in OrcQuerySuite --- .../catalyst/expressions/codegen/GenerateUnsafeProjection.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 0a802046c2f7..0854990f5ef8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -292,7 +292,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ${writeArrayToBuffer(ctx, keys, keyType, false, bufferHolder)} // Write the numBytes of key array into the first 8 bytes. 
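          // (Editor's note, not part of the original patch: the length slot
          // reserved at `$tmpCursor - 8` is 8 bytes wide, so the 4-byte putInt
          // below left the upper half of the slot uninitialized; the fix writes
          // the value with putLong so the whole 8-byte slot is filled.)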
- Platform.putInt($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); + Platform.putLong($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); ${writeArrayToBuffer(ctx, values, valueType, true, bufferHolder)} } From aa3ada8f5595b6bb5dd0cfe84b2b5c5c9738947f Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 21:50:45 +0900 Subject: [PATCH 34/75] fix test failure (DatasetPrimitiveSuites) --- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 7b2347546d3e..34236ad6f4b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -57,7 +57,7 @@ class GenericArrayData(val array: Array[Any], def this(primitiveArray: Array[Double]) = this(null, null, null, null, null, null, null, primitiveArray) - def this(primitiveArray: Array[Any]) = this(primitiveArray.toSeq) + def this(array: Array[Any]) = this(array, null, null, null, null, null, null, null) def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) From a81ee1420176ff10ed3d92d599517b89b3baa6dd Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 21:51:17 +0900 Subject: [PATCH 35/75] update benchmark results --- .../benchmark/GenericArrayDataBenchmark.scala | 48 +++++++++++++++++++ .../benchmark/PrimitiveArrayBenchmark.scala | 8 ++++ 2 files changed, 56 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 7bbf01277cac..7303e4ea2f07 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -58,6 +58,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericIntArray) benchmark.addCase("Specialized")(specializedIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 0 / 0 46500044.3 0.0 1.0X + Specialized 0 / 0 170500162.6 0.0 3.7X + */ } def allocateGenericDoubleArray(iters: Int): Unit = { @@ -86,6 +94,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 0 / 0 55627374.0 0.0 1.0X + Specialized 0 / 0 177724745.8 0.0 3.2X + */ } def getPrimitiveIntArray(iters: Int): Unit = { @@ -113,6 +129,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic")(genericIntArray) 
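    // (Editor's note, not part of the original patch: the "Generic" case reads
    // through an Array[Any]-backed GenericArrayData, so toIntArray must unbox
    // every element, while the "Specialized" case is backed by an Array[Int]
    // and its toIntArray is a single System.arraycopy.)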
benchmark.addCase("Specialized")(specializedIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 334 / 382 502.4 2.0 1.0X + Specialized 282 / 314 595.4 1.7 1.2X + */ } def getPrimitiveDoubleArray(iters: Int): Unit = { @@ -140,6 +164,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 1720 / 1883 97.6 10.3 1.0X + Specialized 703 / 1117 238.7 4.2 2.4X + */ } def readGenericIntArray(iters: Int): Unit = { @@ -182,6 +214,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic")(genericIntArray) benchmark.addCase("Specialized")(specializedIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 206 / 212 1017.6 1.0 1.0X + Specialized 161 / 167 1301.0 0.8 1.3X + */ } def readGenericDoubleArray(iters: Int): Unit = { @@ -224,6 +264,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 547 / 581 383.3 2.6 1.0X + Specialized 237 / 260 884.0 1.1 2.3X + */ } ignore("allocate GenericArrayData") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index f50318469108..a3f32ec3da04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -106,6 +106,14 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { }) benchmark.run() + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Write int array in DataFrame 1290 / 1748 0.8 1230.1 1.0X + Write double array in DataFrame 1761 / 2236 0.6 1679.0 0.7X + */ } ignore("Write an array in DataFrame") { From 681ae032e9f7fb7d27a89dace0d2c8dc3d238a86 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 18 Jun 2016 15:47:11 +0900 Subject: [PATCH 36/75] Implementation of GenericArrayData specialized for primitive type array 
add unit tests --- .../sql/catalyst/util/GenericArrayData.scala | 437 +++++++++++++++++- 1 file changed, 434 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 34236ad6f4b4..515a9d0825dc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -23,12 +23,33 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +object GenericArrayData { + def allocate(seq: Seq[Any]): GenericArrayData = new GenericArrayData(seq) + def allocate(list: java.util.List[Any]): GenericArrayData = new GenericArrayData(list) + def allocate(seqOrArray: Any): GenericArrayData = new GenericArrayData(seqOrArray) + def allocate(primitiveArray: Array[Int]): GenericArrayData = + new GenericIntArrayData(primitiveArray) + def allocate(primitiveArray: Array[Long]): GenericArrayData = + new GenericLongArrayData(primitiveArray) + def allocate(primitiveArray: Array[Float]): GenericArrayData = + new GenericFloatArrayData(primitiveArray) + def allocate(primitiveArray: Array[Double]): GenericArrayData = + new GenericDoubleArrayData(primitiveArray) + def allocate(primitiveArray: Array[Short]): GenericArrayData = + new GenericShortArrayData(primitiveArray) + def allocate(primitiveArray: Array[Byte]): GenericArrayData = + new GenericByteArrayData(primitiveArray) + def allocate(primitiveArray: Array[Boolean]): GenericArrayData = + new GenericBooleanArrayData(primitiveArray) +} + private object GenericArrayData { // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. 
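  // (Editor's note, not part of the original patch: anyToSeq accepts either a
  // Seq or an Array, and the default case added below degrades to Seq.empty
  // for any other input instead of throwing a MatchError. A rough sketch of
  // the resulting behaviour:
  //   anyToSeq(Array(1, 2)) == Seq(1, 2)
  //   anyToSeq(42) == Seq.empty
  // )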
def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { case seq: Seq[Any] => seq case array: Array[_] => array.toSeq + case _ => Seq.empty } } @@ -245,8 +266,8 @@ class GenericArrayData(val array: Array[Any], return false } if (!isNullAt(i)) { - val o1 = array(i) - val o2 = other.array(i) + val o1 = _array(i) + val o2 = other._array(i) o1 match { case b1: Array[Byte] => if (!o2.isInstanceOf[Array[Byte]] || @@ -288,7 +309,7 @@ class GenericArrayData(val array: Array[Any], if (isNullAt(i)) { 0 } else { - array(i) match { + _array(i) match { case b: Boolean => if (b) 0 else 1 case b: Byte => b.toInt case s: Short => s.toInt @@ -308,3 +329,413 @@ class GenericArrayData(val array: Array[Any], result } } + +final class GenericIntArrayData(private val primitiveArray: Array[Int]) extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericIntArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getInt(ordinal: Int): Int = primitiveArray(ordinal) + override def toIntArray(): Array[Int] = { + val array = new Array[Int](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericIntArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericIntArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = primitiveArray(i) + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericLongArrayData(private val primitiveArray: Array[Long]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericLongArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getLong(ordinal: Int): Long = primitiveArray(ordinal) + override def toLongArray(): Array[Long] = { + val array = new Array[Long](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericLongArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericLongArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val l = primitiveArray(i) + val update: Int = (l ^ (l >>> 32)).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericFloatArrayData(private val primitiveArray: Array[Float]) + extends GenericArrayData { + override def array(): Array[Any] = 
primitiveArray.toArray + + override def copy(): ArrayData = new GenericFloatArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getFloat(ordinal: Int): Float = primitiveArray(ordinal) + override def toFloatArray(): Array[Float] = { + val array = new Array[Float](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericFloatArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericFloatArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (java.lang.Float.isNaN(o1)) { + if (!java.lang.Float.isNaN(o2)) { + return false; + } + } else if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val f = primitiveArray(i) + val update: Int = java.lang.Float.floatToIntBits(f) + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericDoubleArrayData(private val primitiveArray: Array[Double]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericDoubleArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getDouble(ordinal: Int): Double = primitiveArray(ordinal) + override def toDoubleArray(): Array[Double] = { + val array = new Array[Double](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericDoubleArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericDoubleArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (java.lang.Double.isNaN(o1)) { + if (!java.lang.Double.isNaN(o2)) { + return false; + } + } else if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val d = primitiveArray(i) + val b = java.lang.Double.doubleToLongBits(d) + val update: Int = (b ^ (b >>> 32)).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericShortArrayData(private val primitiveArray: Array[Short]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericShortArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getShort(ordinal: Int): Short = primitiveArray(ordinal) + override def toShortArray(): Array[Short] = { + val array = new Array[Short](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override 
def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericShortArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericShortArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = primitiveArray(i).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericByteArrayData(private val primitiveArray: Array[Byte]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericByteArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getByte(ordinal: Int): Byte = primitiveArray(ordinal) + override def toByteArray(): Array[Byte] = { + val array = new Array[Byte](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericByteArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericByteArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = primitiveArray(i).toInt + result = 37 * result + update + i += 1 + } + result + } +} + +final class GenericBooleanArrayData(private val primitiveArray: Array[Boolean]) + extends GenericArrayData { + override def array(): Array[Any] = primitiveArray.toArray + + override def copy(): ArrayData = new GenericBooleanArrayData(primitiveArray) + + override def numElements(): Int = primitiveArray.length + + override def isNullAt(ordinal: Int): Boolean = false + override def getBoolean(ordinal: Int): Boolean = primitiveArray(ordinal) + override def toBooleanArray(): Array[Boolean] = { + val array = new Array[Boolean](numElements) + System.arraycopy(primitiveArray, 0, array, 0, numElements) + array + } + override def toString(): String = primitiveArray.mkString("[", ",", "]") + + override def equals(o: Any): Boolean = { + if (!o.isInstanceOf[GenericBooleanArrayData]) { + return false + } + + val other = o.asInstanceOf[GenericBooleanArrayData] + if (other eq null) { + return false + } + + val len = numElements() + if (len != other.numElements()) { + return false + } + + var i = 0 + while (i < len) { + val o1 = primitiveArray(i) + val o2 = other.primitiveArray(i) + if (o1 != o2) { + return false + } + i += 1 + } + true + } + + override def hashCode: Int = { + var result: Int = 37 + var i = 0 + val len = numElements() + while (i < len) { + val update: Int = if (primitiveArray(i)) 1 else 0 + result = 37 * result + update + i += 1 + } + result + } +} From 556e76ee95971234d94a8e8bd15a21dfaaa675d4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 18 Jun 2016 22:04:52 +0900 Subject: [PATCH 37/75] fix scala style error --- 
.../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 515a9d0825dc..91a0c5323323 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -739,3 +739,4 @@ final class GenericBooleanArrayData(private val primitiveArray: Array[Boolean]) result } } + From ff457030a67c0c33f650d9d0430649b584c369be Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 02:38:01 +0900 Subject: [PATCH 38/75] Introduce GenericRefArrayData --- .../sql/catalyst/util/GenericArrayData.scala | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 91a0c5323323..e90361e22617 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -24,9 +24,9 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { - def allocate(seq: Seq[Any]): GenericArrayData = new GenericArrayData(seq) - def allocate(list: java.util.List[Any]): GenericArrayData = new GenericArrayData(list) - def allocate(seqOrArray: Any): GenericArrayData = new GenericArrayData(seqOrArray) + def allocate(seq: Seq[Any]): GenericArrayData = new GenericRefArrayData(seq) + def allocate(list: java.util.List[Any]): GenericArrayData = new GenericRefArrayData(list) + def allocate(seqOrArray: Any): GenericArrayData = new GenericRefArrayData(seqOrArray) def allocate(primitiveArray: Array[Int]): GenericArrayData = new GenericIntArrayData(primitiveArray) def allocate(primitiveArray: Array[Long]): GenericArrayData = @@ -266,8 +266,8 @@ class GenericArrayData(val array: Array[Any], return false } if (!isNullAt(i)) { - val o1 = _array(i) - val o2 = other._array(i) + val o1 = array(i) + val o2 = other.array(i) o1 match { case b1: Array[Byte] => if (!o2.isInstanceOf[Array[Byte]] || @@ -309,7 +309,7 @@ class GenericArrayData(val array: Array[Any], if (isNullAt(i)) { 0 } else { - _array(i) match { + array(i) match { case b: Boolean => if (b) 0 else 1 case b: Byte => b.toInt case s: Short => s.toInt @@ -330,7 +330,7 @@ class GenericArrayData(val array: Array[Any], } } -final class GenericIntArrayData(private val primitiveArray: Array[Int]) extends GenericArrayData { +final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray override def copy(): ArrayData = new GenericIntArrayData(primitiveArray) @@ -386,7 +386,7 @@ final class GenericIntArrayData(private val primitiveArray: Array[Int]) extends } } -final class GenericLongArrayData(private val primitiveArray: Array[Long]) +final class GenericLongArrayData(val primitiveArray: Array[Long]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -444,7 +444,7 @@ final class GenericLongArrayData(private val primitiveArray: Array[Long]) } } -final class GenericFloatArrayData(private val primitiveArray: Array[Float]) +final 
class GenericFloatArrayData(val primitiveArray: Array[Float]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -506,7 +506,7 @@ final class GenericFloatArrayData(private val primitiveArray: Array[Float]) } } -final class GenericDoubleArrayData(private val primitiveArray: Array[Double]) +final class GenericDoubleArrayData(val primitiveArray: Array[Double]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -569,7 +569,7 @@ final class GenericDoubleArrayData(private val primitiveArray: Array[Double]) } } -final class GenericShortArrayData(private val primitiveArray: Array[Short]) +final class GenericShortArrayData(val primitiveArray: Array[Short]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -626,7 +626,7 @@ final class GenericShortArrayData(private val primitiveArray: Array[Short]) } } -final class GenericByteArrayData(private val primitiveArray: Array[Byte]) +final class GenericByteArrayData(val primitiveArray: Array[Byte]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray @@ -683,7 +683,7 @@ final class GenericByteArrayData(private val primitiveArray: Array[Byte]) } } -final class GenericBooleanArrayData(private val primitiveArray: Array[Boolean]) +final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray From 23f9f65361e194bac0bcc7eda9957c8135cd075e Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 02:48:04 +0900 Subject: [PATCH 39/75] replace 'new GenericArrayData' with 'GenericArrayData.allocate' --- .../spark/sql/catalyst/CatalystTypeConverters.scala | 8 ++++---- .../apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- .../apache/spark/sql/catalyst/expressions/Cast.scala | 4 ++-- .../catalyst/expressions/aggregate/PivotFirst.scala | 2 +- .../sql/catalyst/expressions/aggregate/collect.scala | 2 +- .../expressions/codegen/GenerateSafeProjection.scala | 2 +- .../catalyst/expressions/collectionOperations.scala | 2 +- .../sql/catalyst/expressions/complexTypeCreator.scala | 11 ++++++----- .../catalyst/expressions/complexTypeExtractors.scala | 4 ++-- .../sql/catalyst/expressions/objects/objects.scala | 2 +- .../sql/catalyst/expressions/regexpExpressions.scala | 4 ++-- .../spark/sql/catalyst/json/JacksonParser.scala | 2 +- .../sql/catalyst/analysis/AnalysisErrorSuite.scala | 4 ++-- .../catalyst/encoders/EncoderResolutionSuite.scala | 4 ++-- .../spark/sql/catalyst/encoders/RowEncoderSuite.scala | 2 +- .../expressions/UnsafeRowConverterSuite.scala | 2 +- .../codegen/GeneratedProjectionSuite.scala | 2 +- .../datasources/parquet/ParquetRowConverter.scala | 4 ++-- .../spark/sql/execution/python/EvaluatePython.scala | 4 ++-- .../org/apache/spark/sql/test/ExamplePointUDT.scala | 2 +- .../scala/org/apache/spark/sql/UnsafeRowSuite.scala | 2 +- .../org/apache/spark/sql/UserDefinedTypeSuite.scala | 2 +- .../sql/execution/columnar/ColumnarTestUtils.scala | 2 +- .../apache/spark/sql/hive/HiveInspectorSuite.scala | 2 +- 24 files changed, 40 insertions(+), 39 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 5b9161551a7a..2801827e7bb6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -159,9 +159,9 @@ object CatalystTypeConverters { override def toCatalystImpl(scalaValue: Any): ArrayData = { scalaValue match { case a: Array[_] => - new GenericArrayData(a.map(elementConverter.toCatalyst)) + GenericArrayData.allocate(a.map(elementConverter.toCatalyst)) case s: Seq[_] => - new GenericArrayData(s.map(elementConverter.toCatalyst).toArray) + GenericArrayData.allocate(s.map(elementConverter.toCatalyst).toArray) case i: JavaIterable[_] => val iter = i.iterator val convertedIterable = scala.collection.mutable.ArrayBuffer.empty[Any] @@ -169,7 +169,7 @@ object CatalystTypeConverters { val item = iter.next() convertedIterable += elementConverter.toCatalyst(item) } - new GenericArrayData(convertedIterable.toArray) + GenericArrayData.allocate(convertedIterable.toArray) } } @@ -410,7 +410,7 @@ object CatalystTypeConverters { case t: Timestamp => TimestampConverter.toCatalyst(t) case d: BigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) case d: JavaBigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) - case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray) + case seq: Seq[Any] => GenericArrayData.allocate(seq.map(convertToCatalyst).toArray) case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*) case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst)) case map: Map[_, _] => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7ea2f7..68df41b5802d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericArrayData], + classOf[GenericRefArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 4db1ae6faa15..039b4469b99e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -388,7 +388,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w values(i) = elementCast(e) } }) - new GenericArrayData(values) + GenericArrayData.allocate(values) }) } @@ -864,7 +864,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w } } } - $evPrim = new $arrayClass($values); + $evPrim = $arrayClass.allocate($values); """ } diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala index 087606077295..23a8d5fd4903 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala @@ -131,7 +131,7 @@ case class PivotFirst( for (i <- 0 until indexSize) { result(i) = input.get(mutableAggBufferOffset + i, valueDataType) } - new GenericArrayData(result) + GenericArrayData.allocate(result) } override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index d2880d58aefe..f97fe5fe8d51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -78,7 +78,7 @@ abstract class Collect extends ImperativeAggregate { } override def eval(input: InternalRow): Any = { - new GenericArrayData(buffer.toArray) + GenericArrayData.allocate(buffer.toArray) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index b1cb6edefb85..792d735a0d5a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -96,7 +96,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] $values[$index] = ${elementConverter.value}; } } - final ArrayData $output = new $arrayClass($values); + final ArrayData $output = $arrayClass.allocate($values); """ ExprCode(code, "false", output) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index c863ba434120..1d10a9034ab7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -206,7 +206,7 @@ case class SortArray(base: Expression, ascendingOrder: Expression) if (elementType != NullType) { java.util.Arrays.sort(data, if (ascending.asInstanceOf[Boolean]) lt else gt) } - new GenericArrayData(data.asInstanceOf[Array[Any]]) + GenericArrayData.allocate(data.asInstanceOf[Array[Any]]) } override def prettyName: String = "sort_array" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index c9f36649ec8e..640c32628cdd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -52,7 +52,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { override def 
nullable: Boolean = false override def eval(input: InternalRow): Any = { - new GenericArrayData(children.map(_.eval(input)).toArray) + GenericArrayData.allocate(children.map(_.eval(input)).toArray) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -76,7 +76,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { """ }) + s""" - final ArrayData ${ev.value} = new $arrayClass($values); + final ArrayData ${ev.value} = $arrayClass.allocate($values); this.$values = null; """) } @@ -130,7 +130,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { throw new RuntimeException("Cannot use null as map key!") } val valueArray = values.map(_.eval(input)).toArray - new ArrayBasedMapData(new GenericArrayData(keyArray), new GenericArrayData(valueArray)) + new ArrayBasedMapData( + GenericArrayData.allocate(keyArray), GenericArrayData.allocate(valueArray)) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -141,8 +142,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { ctx.addMutableState("Object[]", keyArray, s"this.$keyArray = null;") ctx.addMutableState("Object[]", valueArray, s"this.$valueArray = null;") - val keyData = s"new $arrayClass($keyArray)" - val valueData = s"new $arrayClass($valueArray)" + val keyData = s"$arrayClass.allocate($keyArray)" + val valueData = s"$arrayClass.allocate($valueArray)" ev.copy(code = s""" final boolean ${ev.isNull} = false; $keyArray = new Object[${keys.size}]; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index 0c256c3d890f..f17d0bc412b1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -176,7 +176,7 @@ case class GetArrayStructFields( } i += 1 } - new GenericArrayData(result) + GenericArrayData.allocate(result) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -201,7 +201,7 @@ case class GetArrayStructFields( } } } - ${ev.value} = new $arrayClass($values); + ${ev.value} = $arrayClass.allocate($values); """ }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 50e2ac3c36d9..b12f944566b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -537,7 +537,7 @@ case class MapObjects private( $loopIndex += 1; } - ${ev.value} = new ${classOf[GenericArrayData].getName}($convertedArray); + ${ev.value} = ${classOf[GenericArrayData].getName}.allocate($convertedArray); } """ ev.copy(code = code, isNull = genInputData.isNull) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 5648ad6b6dc1..6aebe7970443 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -191,14 +191,14 @@ case class StringSplit(str: 
Expression, pattern: Expression) override def nullSafeEval(string: Any, regex: Any): Any = { val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) - new GenericArrayData(strings.asInstanceOf[Array[Any]]) + GenericArrayData.allocate(strings.asInstanceOf[Array[Any]]) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, pattern) => // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. - s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""") + s"""${ev.value} = $arrayClass.allocate($str.split($pattern, -1));""") } override def prettyName: String = "split" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index e476cb11a351..ec87133c9d11 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -404,7 +404,7 @@ class JacksonParser( values += fieldConverter.apply(parser) } - new GenericArrayData(values.toArray) + GenericArrayData.allocate(values.toArray) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 21afe9fec594..6e59215dc8c6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -57,8 +57,8 @@ private[sql] class UngroupableUDT extends UserDefinedType[UngroupableData] { override def sqlType: DataType = MapType(IntegerType, IntegerType) override def serialize(ungroupableData: UngroupableData): MapData = { - val keyArray = new GenericArrayData(ungroupableData.data.keys.toSeq) - val valueArray = new GenericArrayData(ungroupableData.data.values.toSeq) + val keyArray = GenericArrayData.allocate(ungroupableData.data.keys.toSeq) + val valueArray = GenericArrayData.allocate(ungroupableData.data.values.toSeq) new ArrayBasedMapData(keyArray, valueArray) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala index 802397d50e85..49934354e5de 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala @@ -70,11 +70,11 @@ class EncoderResolutionSuite extends PlanTest { val bound = encoder.resolveAndBind(attrs) // If no null values appear, it should works fine - bound.fromRow(InternalRow(new GenericArrayData(Array(1, 2)))) + bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, 2)))) // If there is null value, it should throw runtime exception val e = intercept[RuntimeException] { - bound.fromRow(InternalRow(new GenericArrayData(Array(1, null)))) + bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, null)))) } assert(e.getMessage.contains("Null value appeared in non-nullable field")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index 1a5569a77dc7..46575f7d63eb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -51,7 +51,7 @@ class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - new GenericArrayData(output) + GenericArrayData.allocate(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index cf3cbe270753..b634834c67e3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -291,7 +291,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(unsafeRow.getSizeInBytes == 8 + 2 * 8 + row1.getSizeInBytes + row2.getSizeInBytes) } - private def createArray(values: Any*): ArrayData = new GenericArrayData(values.toArray) + private def createArray(values: Any*): ArrayData = GenericArrayData.allocate(values.toArray) private def createMap(keys: Any*)(values: Any*): MapData = { assert(keys.length == values.length) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index b69b74b4240b..d6c9a9c0b638 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -86,7 +86,7 @@ class GeneratedProjectionSuite extends SparkFunSuite { test("generated unsafe projection with array of binary") { val row = InternalRow( Array[Byte](1, 2), - new GenericArrayData(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) + GenericArrayData.allocate(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) val fields = (BinaryType :: ArrayType(BinaryType) :: Nil).toArray[DataType] val unsafeProj = UnsafeProjection.create(fields) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 33dcf2f3fd16..108977c23ec3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -491,7 +491,7 @@ private[parquet] class ParquetRowConverter( override def getConverter(fieldIndex: Int): Converter = elementConverter - override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) + override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the // next value. 
`Row.copy()` only copies row cells, it doesn't do deep copy to objects stored @@ -590,7 +590,7 @@ private[parquet] class ParquetRowConverter( protected def newArrayUpdater(updater: ParentContainerUpdater) = new ParentContainerUpdater { override def start(): Unit = currentArray = ArrayBuffer.empty[Any] - override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) + override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) override def set(value: Any): Unit = currentArray += value } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 46fd54e5c742..0e496dfd29e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -119,10 +119,10 @@ object EvaluatePython { case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c case (c: java.util.List[_], ArrayType(elementType, _)) => - new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) + GenericArrayData.allocate(c.asScala.map { e => fromJava(e, elementType)}.toArray) case (c, ArrayType(elementType, _)) if c.getClass.isArray => - new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + GenericArrayData.allocate(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => ArrayBasedMapData( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index a73e4272950a..6a8a5e060fd8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -49,7 +49,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - new GenericArrayData(output) + GenericArrayData.allocate(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index a32763db054f..c002dfcf4908 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -160,7 +160,7 @@ class UnsafeRowSuite extends SparkFunSuite { } test("calling hashCode on unsafe array returned by getArray(ordinal)") { - val row = InternalRow.apply(new GenericArrayData(Array(1L))) + val row = InternalRow.apply(GenericArrayData.allocate(Array(1L))) val unsafeRow = UnsafeProjection.create(Array[DataType](ArrayType(LongType))).apply(row) // Makes sure hashCode on unsafe array won't crash unsafeRow.getArray(0).hashCode() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 474f17ff7afb..17ec9315e4a6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -50,7 +50,7 @@ object UDT { override def sqlType: DataType = ArrayType(DoubleType, containsNull = false) override def serialize(features: MyDenseVector): 
ArrayData = { - new GenericArrayData(features.data.map(_.asInstanceOf[Any])) + GenericArrayData.allocate(features.data.map(_.asInstanceOf[Any])) } override def deserialize(datum: Any): MyDenseVector = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala index 686c8fa6f5fa..e590d2833477 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala @@ -56,7 +56,7 @@ object ColumnarTestUtils { case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => - new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) + GenericArrayData.allocate(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index 3de1f4aeb74d..8cadaeedea69 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -229,7 +229,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { test("wrap / unwrap Array Type") { val dt = ArrayType(dataTypes(0)) - val d = new GenericArrayData(Array(row(0), row(0))) + val d = GenericArrayData.allocate(Array(row(0), row(0))) checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt))) checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt))) checkValue(d, From 839bcb83323d1ca21af4b85e5006aad17024bd18 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 12:22:23 +0900 Subject: [PATCH 40/75] Generate GenericArrayData.allocate in NewInstance() --- .../spark/sql/catalyst/expressions/objects/objects.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index b12f944566b4..80a0efe05fc6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -281,7 +281,11 @@ case class NewInstance( val constructorCall = outer.map { gen => s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})""" }.getOrElse { - s"new $className(${argValues.mkString(", ")})" + if (!cls.isInstanceOf[GenericArrayData]) { + s"new $className(${argValues.mkString(", ")})" + } else { + s"${cls.getName}.allocate(${argValues.mkString(", ")})" + } } val code = s""" From 97068d413dd2ec8a0b3e748c808dc19a9e243a51 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 14:15:51 +0900 Subject: [PATCH 41/75] Initial version of Benchmark without performance results --- .../util/GenericArrayDataBenchmark.scala | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala new file mode 100644 index 000000000000..5a3364c9c7a0 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.util.Benchmark + +/** + * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type + */ +object GenericArrayDataBenchmark { +/* + def allocateGenericIntArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var array: GenericArrayData = null + + val primitiveIntArray = new Array[Int](count) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + array = GenericArrayData.allocate(primitiveIntArray) + } + } + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + array = new GenericRefArrayData(primitiveIntArray) + } + } + + val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) + } + + def allocateGenericDoubleArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var array: GenericArrayData = null + + val primitiveDoubleArray = new Array[Int](count) + val denseDoubleArray = { i: Int => + for (n <- 0L until iters) { + array = GenericArrayData.allocate(primitiveDoubleArray) + } + } + val sparseDoubleArray = { i: Int => + for (n <- 0L until iters) { + array = new GenericRefArrayData(primitiveDoubleArray) + } + } + + val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters) + benchmark.addCase("Sparse")(sparseDoubleArray) + benchmark.addCase("Dense ")(denseDoubleArray) + } + + def getPrimitiveIntArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + + val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) + val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) + var primitiveIntArray: Array[Int] = null + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + primitiveIntArray = intSparseArray.toIntArray + } + } + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + primitiveIntArray = intDenseArray.toIntArray + } + } + + val benchmark = new Benchmark("Get int primitive array", count * iters) + benchmark.addCase("Sparse int")(sparseIntArray) + benchmark.addCase("Dense int")(denseIntArray) + } + + def getPrimitiveDoubleArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + + val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) + val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) + var 
primitiveDoubleArray: Array[Double] = null + val sparseDoubleArray = { i: Int => + for (n <- 0L until iters) { + primitiveDoubleArray = doubleSparseArray.toDoubleArray + } + } + val denseDoubleArray = { i: Int => + for (n <- 0L until iters) { + primitiveDoubleArray = doubleDenseArray.toDoubleArray + } + } + + val benchmark = new Benchmark("Get double primitive array", count * iters) + benchmark.addCase("Sparse double")(sparseDoubleArray) + benchmark.addCase("Dense double")(denseDoubleArray) + } + + def readGenericIntArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var result: Int = 0 + + val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = sparseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += sparseArray.getInt(i) + } + result = sum + } + } + + val denseArray = GenericArrayData.allocate(new Array[Int](count)) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = denseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += denseArray.getInt(i) + } + result = sum + } + } + + val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) + } + + def readGenericDoubleArray(iters: Int): Unit = { + val count = 1024 * 1024 * 10 + var result: Int = 0 + + val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = sparseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += sparseArray.getInt(i) + } + result = sum + } + } + + val denseArray = GenericArrayData.allocate(new Array[Int](count)) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = denseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += denseArray.getInt(i) + } + result = sum + } + } + + val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) + } + + def main(args: Array[String]): Unit = { + allocateGenericIntArray(1024) + allocateGenericDoubleArray(1024) + getPrimitiveIntArray(1024) + getPrimitiveDoubleArray(1024) + readGenericIntArray(512) + readGenericDoubleArray(512) + } +*/ +} From cf2b2160a34a10f31433bbbd080a625fd2985f7c Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 18:40:46 +0900 Subject: [PATCH 42/75] Move Benchmark program to another project --- .../util/GenericArrayDataBenchmark.scala | 188 ------------ .../benchmark/GenericArrayDataBenchmark.scala | 277 ++++++------------ 2 files changed, 87 insertions(+), 378 deletions(-) delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala deleted file mode 100644 index 5a3364c9c7a0..000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.util - -import org.apache.spark.util.Benchmark - -/** - * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type - */ -object GenericArrayDataBenchmark { -/* - def allocateGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - var array: GenericArrayData = null - - val primitiveIntArray = new Array[Int](count) - val denseIntArray = { i: Int => - for (n <- 0L until iters) { - array = GenericArrayData.allocate(primitiveIntArray) - } - } - val sparseIntArray = { i: Int => - for (n <- 0L until iters) { - array = new GenericRefArrayData(primitiveIntArray) - } - } - - val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) - } - - def allocateGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - var array: GenericArrayData = null - - val primitiveDoubleArray = new Array[Int](count) - val denseDoubleArray = { i: Int => - for (n <- 0L until iters) { - array = GenericArrayData.allocate(primitiveDoubleArray) - } - } - val sparseDoubleArray = { i: Int => - for (n <- 0L until iters) { - array = new GenericRefArrayData(primitiveDoubleArray) - } - } - - val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters) - benchmark.addCase("Sparse")(sparseDoubleArray) - benchmark.addCase("Dense ")(denseDoubleArray) - } - - def getPrimitiveIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - - val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) - val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) - var primitiveIntArray: Array[Int] = null - val sparseIntArray = { i: Int => - for (n <- 0L until iters) { - primitiveIntArray = intSparseArray.toIntArray - } - } - val denseIntArray = { i: Int => - for (n <- 0L until iters) { - primitiveIntArray = intDenseArray.toIntArray - } - } - - val benchmark = new Benchmark("Get int primitive array", count * iters) - benchmark.addCase("Sparse int")(sparseIntArray) - benchmark.addCase("Dense int")(denseIntArray) - } - - def getPrimitiveDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - - val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) - val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) - var primitiveDoubleArray: Array[Double] = null - val sparseDoubleArray = { i: Int => - for (n <- 0L until iters) { - primitiveDoubleArray = doubleSparseArray.toDoubleArray - } - } - val denseDoubleArray = { i: Int => - for (n <- 0L until iters) { - primitiveDoubleArray = doubleDenseArray.toDoubleArray - } - } - - val benchmark = new Benchmark("Get double primitive array", count * iters) - benchmark.addCase("Sparse double")(sparseDoubleArray) - benchmark.addCase("Dense double")(denseDoubleArray) - } - - 
def readGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - var result: Int = 0 - - val sparseArray = new GenericRefArrayData(new Array[Int](count)) - val sparseIntArray = { i: Int => - for (n <- 0L until iters) { - val len = sparseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += sparseArray.getInt(i) - } - result = sum - } - } - - val denseArray = GenericArrayData.allocate(new Array[Int](count)) - val denseIntArray = { i: Int => - for (n <- 0L until iters) { - val len = denseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += denseArray.getInt(i) - } - result = sum - } - } - - val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) - } - - def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - var result: Int = 0 - - val sparseArray = new GenericRefArrayData(new Array[Int](count)) - val sparseIntArray = { i: Int => - for (n <- 0L until iters) { - val len = sparseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += sparseArray.getInt(i) - } - result = sum - } - } - - val denseArray = GenericArrayData.allocate(new Array[Int](count)) - val denseIntArray = { i: Int => - for (n <- 0L until iters) { - val len = denseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += denseArray.getInt(i) - } - result = sum - } - } - - val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) - } - - def main(args: Array[String]): Unit = { - allocateGenericIntArray(1024) - allocateGenericDoubleArray(1024) - getPrimitiveIntArray(1024) - getPrimitiveDoubleArray(1024) - readGenericIntArray(512) - readGenericDoubleArray(512) - } -*/ -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 7303e4ea2f07..5a3364c9c7a0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -15,277 +15,174 @@ * limitations under the License. */ -package org.apache.spark.sql.execution.benchmark +package org.apache.spark.sql.catalyst.util -import scala.concurrent.duration._ - -import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Benchmark /** - * Benchmark [[GenericArrayData]] for specialized representation with primitive type - * To run this: - * 1. replace ignore(...) with test(...) - * 2. build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" - * - * Benchmarks in this file are skipped in normal builds. 
+ * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type */ -class GenericArrayDataBenchmark extends BenchmarkBase { - +object GenericArrayDataBenchmark { +/* def allocateGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 + val count = 1024 * 1024 * 10 var array: GenericArrayData = null val primitiveIntArray = new Array[Int](count) - val specializedIntArray = { i: Int => - var n = 0 - while (n < iters) { - array = new GenericArrayData(primitiveIntArray) - n += 1 + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + array = GenericArrayData.allocate(primitiveIntArray) } } - val anyArray = primitiveIntArray.toArray[Any] - val genericIntArray = { i: Int => - var n = 0 - while (n < iters) { - array = new GenericArrayData(anyArray) - n += 1 + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + array = new GenericRefArrayData(primitiveIntArray) } } - val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters, - minNumIters = 10, minTime = 1.milliseconds) - benchmark.addCase("Generic ")(genericIntArray) - benchmark.addCase("Specialized")(specializedIntArray) - benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 0 / 0 46500044.3 0.0 1.0X - Specialized 0 / 0 170500162.6 0.0 3.7X - */ + val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) } def allocateGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 + val count = 1024 * 1024 * 10 var array: GenericArrayData = null val primitiveDoubleArray = new Array[Int](count) - val specializedDoubleArray = { i: Int => - var n = 0 - while (n < iters) { - array = new GenericArrayData(primitiveDoubleArray) - n += 1 + val denseDoubleArray = { i: Int => + for (n <- 0L until iters) { + array = GenericArrayData.allocate(primitiveDoubleArray) } } - val anyArray = primitiveDoubleArray.toArray[Any] - val genericDoubleArray = { i: Int => - var n = 0 - while (n < iters) { - array = new GenericArrayData(anyArray) - n += 1 + val sparseDoubleArray = { i: Int => + for (n <- 0L until iters) { + array = new GenericRefArrayData(primitiveDoubleArray) } } - val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters, - minNumIters = 10, minTime = 1.milliseconds) - benchmark.addCase("Generic ")(genericDoubleArray) - benchmark.addCase("Specialized")(specializedDoubleArray) - benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 0 / 0 55627374.0 0.0 1.0X - Specialized 0 / 0 177724745.8 0.0 3.2X - */ + val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters) + benchmark.addCase("Sparse")(sparseDoubleArray) + benchmark.addCase("Dense ")(denseDoubleArray) } def getPrimitiveIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 8 + val count = 1024 * 1024 * 10 - val anyArray: GenericArrayData = new GenericArrayData(new Array[Int](count).toArray[Any]) - val intArray: GenericArrayData = new 
GenericArrayData(new Array[Int](count)) + val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) + val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) var primitiveIntArray: Array[Int] = null - val genericIntArray = { i: Int => - var n = 0 - while (n < iters) { - primitiveIntArray = anyArray.toIntArray - n += 1 + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + primitiveIntArray = intSparseArray.toIntArray } } - val specializedIntArray = { i: Int => - var n = 0 - while (n < iters) { - primitiveIntArray = intArray.toIntArray - n += 1 + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + primitiveIntArray = intDenseArray.toIntArray } } val benchmark = new Benchmark("Get int primitive array", count * iters) - benchmark.addCase("Generic")(genericIntArray) - benchmark.addCase("Specialized")(specializedIntArray) - benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 334 / 382 502.4 2.0 1.0X - Specialized 282 / 314 595.4 1.7 1.2X - */ + benchmark.addCase("Sparse int")(sparseIntArray) + benchmark.addCase("Dense int")(denseIntArray) } def getPrimitiveDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 8 + val count = 1024 * 1024 * 10 - val anyArray: GenericArrayData = new GenericArrayData(new Array[Double](count).toArray[Any]) - val doubleArray: GenericArrayData = new GenericArrayData(new Array[Double](count)) + val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) + val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null - val genericDoubleArray = { i: Int => - var n = 0 - while (n < iters) { - primitiveDoubleArray = anyArray.toDoubleArray - n += 1 + val sparseDoubleArray = { i: Int => + for (n <- 0L until iters) { + primitiveDoubleArray = doubleSparseArray.toDoubleArray } } - val specializedDoubleArray = { i: Int => - var n = 0 - while (n < iters) { - primitiveDoubleArray = doubleArray.toDoubleArray - n += 1 + val denseDoubleArray = { i: Int => + for (n <- 0L until iters) { + primitiveDoubleArray = doubleDenseArray.toDoubleArray } } val benchmark = new Benchmark("Get double primitive array", count * iters) - benchmark.addCase("Generic")(genericDoubleArray) - benchmark.addCase("Specialized")(specializedDoubleArray) - benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 1720 / 1883 97.6 10.3 1.0X - Specialized 703 / 1117 238.7 4.2 2.4X - */ + benchmark.addCase("Sparse double")(sparseDoubleArray) + benchmark.addCase("Dense double")(denseDoubleArray) } def readGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 8 + val count = 1024 * 1024 * 10 var result: Int = 0 - val anyArray = new GenericArrayData(new Array[Int](count).toArray[Any]) - val genericIntArray = { i: Int => - var n = 0 - while (n < iters) { - val len = anyArray.numElements + val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val sparseIntArray = { i: Int => + for (n <- 0L 
until iters) { + val len = sparseArray.numElements var sum = 0 - var i = 0 - while (i < len) { - sum += anyArray.getInt(i) - i += 1 + for (i <- 0 until len - 1) { + sum += sparseArray.getInt(i) } result = sum - n += 1 } } - val intArray = new GenericArrayData(new Array[Int](count)) - val specializedIntArray = { i: Int => - var n = 0 - while (n < iters) { - val len = intArray.numElements + val denseArray = GenericArrayData.allocate(new Array[Int](count)) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = denseArray.numElements var sum = 0 - var i = 0 - while (i < len) { - sum += intArray.getInt(i) - i += 1 + for (i <- 0 until len - 1) { + sum += denseArray.getInt(i) } result = sum - n += 1 } } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) - benchmark.addCase("Generic")(genericIntArray) - benchmark.addCase("Specialized")(specializedIntArray) - benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 206 / 212 1017.6 1.0 1.0X - Specialized 161 / 167 1301.0 0.8 1.3X - */ + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) } def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 8 - var result: Double = 0 + val count = 1024 * 1024 * 10 + var result: Int = 0 - val anyArray = new GenericArrayData(new Array[Double](count).toArray[Any]) - val genericDoubleArray = { i: Int => - var n = 0 - while (n < iters) { - val len = anyArray.numElements - var sum = 0.toDouble - var i = 0 - while (i < len) { - sum += anyArray.getDouble(i) - i += 1 + val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val sparseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = sparseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += sparseArray.getInt(i) } result = sum - n += 1 } } - val doubleArray = new GenericArrayData(new Array[Double](count)) - val specializedDoubleArray = { i: Int => - var n = 0 - while (n < iters) { - val len = doubleArray.numElements - var sum = 0.toDouble - var i = 0 - while (i < len) { - sum += doubleArray.getDouble(i) - i += 1 + val denseArray = GenericArrayData.allocate(new Array[Int](count)) + val denseIntArray = { i: Int => + for (n <- 0L until iters) { + val len = denseArray.numElements + var sum = 0 + for (i <- 0 until len - 1) { + sum += denseArray.getInt(i) } result = sum - n += 1 } } val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) - benchmark.addCase("Generic")(genericDoubleArray) - benchmark.addCase("Specialized")(specializedDoubleArray) - benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 547 / 581 383.3 2.6 1.0X - Specialized 237 / 260 884.0 1.1 2.3X - */ - } - - ignore("allocate GenericArrayData") { - allocateGenericIntArray(20) - allocateGenericDoubleArray(20) - } - - ignore("get primitive array") { - getPrimitiveIntArray(20) - getPrimitiveDoubleArray(20) + benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Dense ")(denseIntArray) } - ignore("read elements in 
GenericArrayData") { - readGenericIntArray(25) - readGenericDoubleArray(25) + def main(args: Array[String]): Unit = { + allocateGenericIntArray(1024) + allocateGenericDoubleArray(1024) + getPrimitiveIntArray(1024) + getPrimitiveDoubleArray(1024) + readGenericIntArray(512) + readGenericDoubleArray(512) } +*/ } From 8df1c34a012f074c4c970374cbd9ef6b43d2a4ab Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 18:41:08 +0900 Subject: [PATCH 43/75] update benchmark program --- .../benchmark/GenericArrayDataBenchmark.scala | 151 ++++++++++++------ 1 file changed, 106 insertions(+), 45 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 5a3364c9c7a0..0832830d6a87 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -15,174 +15,235 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.util +package org.apache.spark.sql.execution.benchmark +import scala.concurrent.duration._ + +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Benchmark /** * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type + * To run this: + * build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" + * + * Benchmarks in this file are skipped in normal builds. */ -object GenericArrayDataBenchmark { -/* +class GenericArrayDataBenchmark extends BenchmarkBase { + def allocateGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 var array: GenericArrayData = null val primitiveIntArray = new Array[Int](count) val denseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = GenericArrayData.allocate(primitiveIntArray) + n += 1 } } val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = new GenericRefArrayData(primitiveIntArray) + n += 1 } } - val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters) + val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters, + minNumIters = 10, minTime = 1.milliseconds) benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) + benchmark.run } def allocateGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 var array: GenericArrayData = null val primitiveDoubleArray = new Array[Int](count) val denseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = GenericArrayData.allocate(primitiveDoubleArray) + n += 1 } } val sparseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { array = new GenericRefArrayData(primitiveDoubleArray) + n += 1 } } - val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters) + val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters, + minNumIters = 10, minTime = 1.milliseconds) benchmark.addCase("Sparse")(sparseDoubleArray) benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.run } def getPrimitiveIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) 
val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) var primitiveIntArray: Array[Int] = null val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveIntArray = intSparseArray.toIntArray + n += 1 } } val denseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveIntArray = intDenseArray.toIntArray + n += 1 } } val benchmark = new Benchmark("Get int primitive array", count * iters) benchmark.addCase("Sparse int")(sparseIntArray) benchmark.addCase("Dense int")(denseIntArray) + benchmark.run } def getPrimitiveDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null val sparseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveDoubleArray = doubleSparseArray.toDoubleArray + n += 1 } } val denseDoubleArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { primitiveDoubleArray = doubleDenseArray.toDoubleArray + n += 1 } } val benchmark = new Benchmark("Get double primitive array", count * iters) benchmark.addCase("Sparse double")(sparseDoubleArray) benchmark.addCase("Dense double")(denseDoubleArray) + benchmark.run } def readGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 + val count = 1024 * 1024 * 2 var result: Int = 0 val sparseArray = new GenericRefArrayData(new Array[Int](count)) val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { val len = sparseArray.numElements var sum = 0 - for (i <- 0 until len - 1) { + var i = 0 + while (i < len) { sum += sparseArray.getInt(i) + i += 1 } result = sum + n += 1 } } val denseArray = GenericArrayData.allocate(new Array[Int](count)) val denseIntArray = { i: Int => - for (n <- 0L until iters) { + var n = 0 + while (n < iters) { val len = denseArray.numElements var sum = 0 - for (i <- 0 until len - 1) { + var i = 0 + while (i < len) { sum += denseArray.getInt(i) + i += 1 } result = sum + n += 1 } } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) + benchmark.run } def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 * 10 - var result: Int = 0 + val count = 1024 * 1024 * 2 + var result: Double = 0 - val sparseArray = new GenericRefArrayData(new Array[Int](count)) - val sparseIntArray = { i: Int => - for (n <- 0L until iters) { + val sparseArray = new GenericRefArrayData(new Array[Double](count)) + val sparseDoubleArray = { i: Int => + var n = 0 + while (n < iters) { val len = sparseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += sparseArray.getInt(i) + var sum = 0.toDouble + var i = 0 + while (i < len) { + sum += sparseArray.getDouble(i) + i += 1 } result = sum + n += 1 } } - val denseArray = GenericArrayData.allocate(new Array[Int](count)) - val denseIntArray = { i: Int => - for (n <- 0L until iters) { + val denseArray = GenericArrayData.allocate(new Array[Double](count)) + val denseDoubleArray = { i: Int => + var n = 0 + while (n < iters) { val len = denseArray.numElements - var sum = 0 - for (i <- 0 until len - 1) { - sum += denseArray.getInt(i) + var sum = 0.toDouble + var i = 
0 + while (i < len) { + sum += denseArray.getDouble(i) + i += 1 } result = sum + n += 1 } } val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) + benchmark.addCase("Sparse")(sparseDoubleArray) + benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.run + } + + ignore("allocate GenericArrayData") { + allocateGenericIntArray(20) + allocateGenericDoubleArray(20) + } + + ignore("get primitive array") { + getPrimitiveIntArray(50) + getPrimitiveDoubleArray(50) + } + + test("read elements in GenericArrayData") { + readGenericIntArray(100) + readGenericDoubleArray(100) } def main(args: Array[String]): Unit = { - allocateGenericIntArray(1024) - allocateGenericDoubleArray(1024) - getPrimitiveIntArray(1024) - getPrimitiveDoubleArray(1024) - readGenericIntArray(512) - readGenericDoubleArray(512) + allocateGenericIntArray(20) + allocateGenericDoubleArray(20) + getPrimitiveIntArray(50) + getPrimitiveDoubleArray(50) + readGenericIntArray(20) + readGenericDoubleArray(20) } -*/ } From 040d9aa10b9f7240188770e541a69bd156d0d0b0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 18:42:05 +0900 Subject: [PATCH 44/75] addressed review comments --- .../sql/catalyst/util/GenericArrayData.scala | 246 +++--------------- 1 file changed, 30 insertions(+), 216 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index e90361e22617..949d4ee39135 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -25,19 +25,19 @@ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { def allocate(seq: Seq[Any]): GenericArrayData = new GenericRefArrayData(seq) - def allocate(list: java.util.List[Any]): GenericArrayData = new GenericRefArrayData(list) - def allocate(seqOrArray: Any): GenericArrayData = new GenericRefArrayData(seqOrArray) - def allocate(primitiveArray: Array[Int]): GenericArrayData = + def allocate(list: java.util.List[Any]): GenericRefArrayData = new GenericRefArrayData(list) + def allocate(seqOrArray: Any): GenericRefArrayData = new GenericRefArrayData(seqOrArray) + def allocate(primitiveArray: Array[Int]): GenericIntArrayData = new GenericIntArrayData(primitiveArray) - def allocate(primitiveArray: Array[Long]): GenericArrayData = + def allocate(primitiveArray: Array[Long]): GenericLongArrayData = new GenericLongArrayData(primitiveArray) - def allocate(primitiveArray: Array[Float]): GenericArrayData = + def allocate(primitiveArray: Array[Float]): GenericFloatArrayData = new GenericFloatArrayData(primitiveArray) - def allocate(primitiveArray: Array[Double]): GenericArrayData = + def allocate(primitiveArray: Array[Double]): GenericDoubleArrayData = new GenericDoubleArrayData(primitiveArray) - def allocate(primitiveArray: Array[Short]): GenericArrayData = + def allocate(primitiveArray: Array[Short]): GenericShortArrayData = new GenericShortArrayData(primitiveArray) - def allocate(primitiveArray: Array[Byte]): GenericArrayData = + def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) def allocate(primitiveArray: Array[Boolean]): GenericArrayData = new GenericBooleanArrayData(primitiveArray) @@ -128,6 +128,7 @@ 
class GenericArrayData(val array: Array[Any], override def getArray(ordinal: Int): ArrayData = getAs(ordinal) override def getMap(ordinal: Int): MapData = getAs(ordinal) +<<<<<<< 8df1c34a012f074c4c970374cbd9ef6b43d2a4ab override def isNullAt(ordinal: Int): Boolean = { if (booleanArray != null || byteArray != null || shortArray != null || intArray != null || longArray != null || floatArray != null || doubleArray != null) { @@ -333,7 +334,7 @@ class GenericArrayData(val array: Array[Any], final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericIntArrayData(primitiveArray) + override def copy(): GenericIntArrayData = new GenericIntArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -344,7 +345,6 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericIntArrayData]) { @@ -356,41 +356,17 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = primitiveArray(i) - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericLongArrayData(val primitiveArray: Array[Long]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericLongArrayData(primitiveArray) + override def copy(): GenericLongArrayData = new GenericLongArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -401,7 +377,6 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericLongArrayData]) { @@ -413,42 +388,17 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val l = primitiveArray(i) - val update: Int = (l ^ (l >>> 32)).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericFloatArrayData(val primitiveArray: Array[Float]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericFloatArrayData(primitiveArray) + override 
def copy(): GenericFloatArrayData = new GenericFloatArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -459,7 +409,6 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericFloatArrayData]) { @@ -471,46 +420,17 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (java.lang.Float.isNaN(o1)) { - if (!java.lang.Float.isNaN(o2)) { - return false; - } - } else if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val f = primitiveArray(i) - val update: Int = java.lang.Float.floatToIntBits(f) - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericDoubleArrayData(val primitiveArray: Array[Double]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericDoubleArrayData(primitiveArray) + override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -521,7 +441,6 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericDoubleArrayData]) { @@ -533,47 +452,17 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (java.lang.Double.isNaN(o1)) { - if (!java.lang.Double.isNaN(o2)) { - return false; - } - } else if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val d = primitiveArray(i) - val b = java.lang.Double.doubleToLongBits(d) - val update: Int = (b ^ (b >>> 32)).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericShortArrayData(val primitiveArray: Array[Short]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericShortArrayData(primitiveArray) + override def copy(): GenericShortArrayData = new GenericShortArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -584,7 +473,6 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if 
(!o.isInstanceOf[GenericShortArrayData]) { @@ -596,41 +484,17 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = primitiveArray(i).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericByteArrayData(val primitiveArray: Array[Byte]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericByteArrayData(primitiveArray) + override def copy(): GenericByteArrayData = new GenericByteArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -641,7 +505,6 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericByteArrayData]) { @@ -653,41 +516,17 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = primitiveArray(i).toInt - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): ArrayData = new GenericBooleanArrayData(primitiveArray) + override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray.clone()) override def numElements(): Int = primitiveArray.length @@ -698,7 +537,6 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) System.arraycopy(primitiveArray, 0, array, 0, numElements) array } - override def toString(): String = primitiveArray.mkString("[", ",", "]") override def equals(o: Any): Boolean = { if (!o.isInstanceOf[GenericBooleanArrayData]) { @@ -710,33 +548,9 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) return false } - val len = numElements() - if (len != other.numElements()) { - return false - } - - var i = 0 - while (i < len) { - val o1 = primitiveArray(i) - val o2 = other.primitiveArray(i) - if (o1 != o2) { - return false - } - i += 1 - } - true + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } - override def hashCode: Int = { - var result: Int = 37 - var i = 0 - val len = numElements() - while (i < len) { - val update: Int = if (primitiveArray(i)) 1 else 0 - result = 37 * result + update - i += 1 - } - result - } + override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) } 
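The simplification above works because java.util.Arrays compares float and double elements by their bit patterns, so the NaN handling that the deleted hand-written loops spelled out explicitly is preserved; the one semantic shift is that 0.0 and -0.0, equal under the old `o1 != o2` test, now compare unequal. A minimal sketch of these semantics, assuming only the JDK (the object name is illustrative, not part of the patch):

import java.{util => ju}

object ArraysEqualsSemantics {
  def main(args: Array[String]): Unit = {
    val a = Array(0.0, Double.NaN)
    val b = Array(0.0, Double.NaN)
    // Element-wise == treats NaN as unequal to itself...
    assert(!a.sameElements(b))
    // ...but Arrays.equals compares doubleToLongBits, so the arrays match,
    assert(ju.Arrays.equals(a, b))
    // and the bit-pattern-based hash codes agree as well.
    assert(ju.Arrays.hashCode(a) == ju.Arrays.hashCode(b))
    // The flip side: 0.0 and -0.0 have different bit patterns.
    assert(!ju.Arrays.equals(Array(0.0), Array(-0.0)))
  }
}
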
From cabc27aaa91b695f9a8dad7cd90ab2a8c4592a84 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 22 Jun 2016 21:41:31 +0900 Subject: [PATCH 45/75] fix test failures --- .../apache/spark/sql/catalyst/expressions/objects/objects.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 80a0efe05fc6..2949d391a034 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -281,7 +281,7 @@ case class NewInstance( val constructorCall = outer.map { gen => s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})""" }.getOrElse { - if (!cls.isInstanceOf[GenericArrayData]) { + if (!cls.isAssignableFrom(classOf[GenericArrayData])) { s"new $className(${argValues.mkString(", ")})" } else { s"${cls.getName}.allocate(${argValues.mkString(", ")})" From 80abdbb88945ad76605f9c73bb05c504bab69008 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 02:03:53 +0900 Subject: [PATCH 46/75] Enabled all of benchmark suites with performance data --- .../benchmark/GenericArrayDataBenchmark.scala | 52 ++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 0832830d6a87..e6019bd98233 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -56,6 +56,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 40 / 43 522.2 1.9 1.0X + Dense 0 / 0 209715200.0 0.0 401598.7X + */ } def allocateGenericDoubleArray(iters: Int): Unit = { @@ -83,6 +91,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseDoubleArray) benchmark.addCase("Dense ")(denseDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 40 / 44 523.2 1.9 1.0X + Dense 0 / 0 225500215.1 0.0 431013.0X + */ } def getPrimitiveIntArray(iters: Int): Unit = { @@ -110,6 +126,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse int")(sparseIntArray) benchmark.addCase("Dense int")(denseIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + 
------------------------------------------------------------------------------------------------ + Sparse int 67 / 70 783.9 1.3 1.0X + Dense int 41 / 43 1263.8 0.8 1.6X + */ } def getPrimitiveDoubleArray(iters: Int): Unit = { @@ -137,6 +161,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse double")(sparseDoubleArray) benchmark.addCase("Dense double")(denseDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse double 211 / 217 248.6 4.0 1.0X + Dense double 95 / 100 554.1 1.8 2.2X + */ } def readGenericIntArray(iters: Int): Unit = { @@ -179,6 +211,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseIntArray) benchmark.addCase("Dense ")(denseIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 160 / 163 1314.5 0.8 1.0X + Dense 68 / 69 3080.0 0.3 2.3X + */ } def readGenericDoubleArray(iters: Int): Unit = { @@ -221,14 +261,22 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Sparse")(sparseDoubleArray) benchmark.addCase("Dense ")(denseDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Sparse 611 / 613 343.3 2.9 1.0X + Dense 199 / 202 1051.5 1.0 3.1X + */ } - ignore("allocate GenericArrayData") { + test("allocate GenericArrayData") { allocateGenericIntArray(20) allocateGenericDoubleArray(20) } - ignore("get primitive array") { + test("get primitive array") { getPrimitiveIntArray(50) getPrimitiveDoubleArray(50) } From 9311664b4d25d08797c1d760e50f6c7e4745d5ba Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 03:38:21 +0900 Subject: [PATCH 47/75] fix test failures --- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 949d4ee39135..3d886ab0a785 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -24,7 +24,8 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object GenericArrayData { - def allocate(seq: Seq[Any]): GenericArrayData = new GenericRefArrayData(seq) + def allocate(array: Array[Any]): GenericRefArrayData = new GenericRefArrayData(array) + def allocate(seq: Seq[Any]): GenericRefArrayData = new GenericRefArrayData(seq) def allocate(list: java.util.List[Any]): GenericRefArrayData = new GenericRefArrayData(list) def allocate(seqOrArray: Any): GenericRefArrayData = new GenericRefArrayData(seqOrArray) def 
allocate(primitiveArray: Array[Int]): GenericIntArrayData = From 25105f3d93fcf02561b197b13997627935af73ea Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 12:07:39 +0900 Subject: [PATCH 48/75] update test suite to resolve test failures --- .../org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 43b6afd9ad89..ded074c5abc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -287,8 +287,8 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(serializer.children.head.isInstanceOf[Literal]) assert(serializer.children.head.asInstanceOf[Literal].value === UTF8String.fromString("value")) assert(serializer.children.last.isInstanceOf[NewInstance]) - assert(serializer.children.last.asInstanceOf[NewInstance] - .cls.isAssignableFrom(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData])) + assert(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData] + isAssignableFrom(serializer.children.last.asInstanceOf[NewInstance].cls)) } private val dataTypeForComplexData = dataTypeFor[ComplexData] From f3286627d10621687d1a58debf7f78c474193133 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 23 Jun 2016 14:11:53 +0900 Subject: [PATCH 49/75] fix compilation error --- .../main/scala/org/apache/spark/sql/hive/HiveInspectors.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index e303065127c3..7002b6437611 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -481,7 +481,7 @@ private[hive] trait HiveInspectors { val values = li.getWritableConstantValue.asScala .map(unwrapper) .toArray - val constant = new GenericArrayData(values) + val constant = GenericArrayData.allocate(values) _ => constant case poi: VoidObjectInspector => _ => null // always be null for void object inspector @@ -637,7 +637,7 @@ private[hive] trait HiveInspectors { Option(li.getList(data)) .map { l => val values = l.asScala.map(unwrapper).toArray - new GenericArrayData(values) + GenericArrayData.allocate(values) } .orNull } else { From 16de76f68e388da88e040926777ae23fbfe82f82 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 25 Jun 2016 12:13:39 +0900 Subject: [PATCH 50/75] addressed comments --- .../sql/catalyst/util/GenericArrayData.scala | 68 +++++++++++++------ 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 3d886ab0a785..b81c6b5c45bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -228,15 +228,16 @@ class GenericArrayData(val array: Array[Any], } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericArrayData]) { - return false + if (o == this) { + return true } - val 
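The `equals` shape these hunks converge on (null and type guard, cast, then field comparison) has a more idiomatic Scala spelling via a pattern match, which covers both the null case and the type test in one step. A sketch, not the patch's code:

  final class IntArraySketch(val values: Array[Int]) {
    override def equals(o: Any): Boolean = o match {
      case other: IntArraySketch => java.util.Arrays.equals(values, other.values)
      case _ => false   // also covers null: null matches no typed pattern
    }
    override def hashCode: Int = java.util.Arrays.hashCode(values)
  }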
other = o.asInstanceOf[GenericArrayData] - if (other eq null) { + if (o == null || !o.isInstanceOf[GenericArrayData]) { return false } + val other = o.asInstanceOf[GenericArrayData] + val len = numElements() if (len != other.numElements()) { return false @@ -335,7 +336,7 @@ class GenericArrayData(val array: Array[Any], final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericIntArrayData = new GenericIntArrayData(primitiveArray.clone()) + override def copy(): GenericIntArrayData = new GenericIntArrayData(toIntArray) override def numElements(): Int = primitiveArray.length @@ -348,15 +349,16 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericIntArrayData]) { - return false + if (o == this) { + return true } - val other = o.asInstanceOf[GenericIntArrayData] - if (other eq null) { + if (o == null || !o.isInstanceOf[GenericIntArrayData]) { return false } + val other = o.asInstanceOf[GenericIntArrayData] + java.util.Arrays.equals(primitiveArray, other.primitiveArray) } @@ -367,7 +369,7 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericLongArrayData = new GenericLongArrayData(primitiveArray.clone()) + override def copy(): GenericLongArrayData = new GenericLongArrayData(toLongArray) override def numElements(): Int = primitiveArray.length @@ -380,7 +382,11 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericLongArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericLongArrayData]) { return false } @@ -399,7 +405,7 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericFloatArrayData = new GenericFloatArrayData(primitiveArray.clone()) + override def copy(): GenericFloatArrayData = new GenericFloatArrayData(toFloatArray) override def numElements(): Int = primitiveArray.length @@ -412,7 +418,11 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericFloatArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericFloatArrayData]) { return false } @@ -431,7 +441,7 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(primitiveArray.clone()) + override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(toDoubleArray) override def numElements(): Int = primitiveArray.length @@ -444,7 +454,11 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericDoubleArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericDoubleArrayData]) { return false } @@ -463,7 +477,7 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericShortArrayData = new 
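`java.util.Arrays.equals` is the right primitive-array comparison here, because `==` on JVM arrays (and therefore on Scala `Array`) is reference identity, not element-wise equality:

  val a = Array(1, 2, 3)
  val b = Array(1, 2, 3)
  a == b                          // false: compares references
  java.util.Arrays.equals(a, b)   // true: compares elements
  a.sameElements(b)               // true: the Scala-library equivalent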
GenericShortArrayData(primitiveArray.clone()) + override def copy(): GenericShortArrayData = new GenericShortArrayData(toShortArray) override def numElements(): Int = primitiveArray.length @@ -476,7 +490,11 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericShortArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericShortArrayData]) { return false } @@ -495,7 +513,7 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericByteArrayData = new GenericByteArrayData(primitiveArray.clone()) + override def copy(): GenericByteArrayData = new GenericByteArrayData(toByteArray) override def numElements(): Int = primitiveArray.length @@ -508,7 +526,11 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericByteArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericByteArrayData]) { return false } @@ -527,7 +549,7 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) extends GenericArrayData { override def array(): Array[Any] = primitiveArray.toArray - override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray.clone()) + override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(toBooleanArray) override def numElements(): Int = primitiveArray.length @@ -540,7 +562,11 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) } override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[GenericBooleanArrayData]) { + if (o == this) { + return true + } + + if (o == null || !o.isInstanceOf[GenericBooleanArrayData]) { return false } From 12d138eacf779bb5c47d809a8795743459580a32 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 25 Jun 2016 14:12:30 +0900 Subject: [PATCH 51/75] fix test failures --- .../sql/catalyst/util/GenericArrayData.scala | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index b81c6b5c45bf..db5ca5f5c7a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -228,10 +228,6 @@ class GenericArrayData(val array: Array[Any], } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericArrayData]) { return false } @@ -349,10 +345,6 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericIntArrayData]) { return false } @@ -382,10 +374,6 @@ final class GenericLongArrayData(val primitiveArray: Array[Long]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericLongArrayData]) { return false } @@ -418,10 +406,6 @@ final class GenericFloatArrayData(val primitiveArray: Array[Float]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericFloatArrayData]) { return 
false } @@ -454,10 +438,6 @@ final class GenericDoubleArrayData(val primitiveArray: Array[Double]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericDoubleArrayData]) { return false } @@ -490,10 +470,6 @@ final class GenericShortArrayData(val primitiveArray: Array[Short]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericShortArrayData]) { return false } @@ -526,10 +502,6 @@ final class GenericByteArrayData(val primitiveArray: Array[Byte]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericByteArrayData]) { return false } @@ -562,10 +534,6 @@ final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) } override def equals(o: Any): Boolean = { - if (o == this) { - return true - } - if (o == null || !o.isInstanceOf[GenericBooleanArrayData]) { return false } From 19c453be7487c57abb5f8f505847355c3f8c1835 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 00:50:09 +0900 Subject: [PATCH 52/75] fix descriptions --- .../benchmark/GenericArrayDataBenchmark.scala | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index e6019bd98233..5824dc5343e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.util.Benchmark /** - * Benchmark [[GenericArrayData]] for Dense and Sparse with primitive type + * Benchmark [[GenericArrayData]] for specialized representation with primitive type * To run this: * build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" * @@ -36,14 +36,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var array: GenericArrayData = null val primitiveIntArray = new Array[Int](count) - val denseIntArray = { i: Int => + val specializedIntArray = { i: Int => var n = 0 while (n < iters) { array = GenericArrayData.allocate(primitiveIntArray) n += 1 } } - val sparseIntArray = { i: Int => + val genericIntArray = { i: Int => var n = 0 while (n < iters) { array = new GenericRefArrayData(primitiveIntArray) @@ -53,16 +53,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters, minNumIters = 10, minTime = 1.milliseconds) - benchmark.addCase("Sparse")(sparseIntArray) - benchmark.addCase("Dense ")(denseIntArray) + benchmark.addCase("Generic ")(genericIntArray) + benchmark.addCase("Specialized")(specializedIntArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 40 / 43 522.2 1.9 1.0X - Dense 0 / 0 209715200.0 0.0 401598.7X + Generic 40 / 43 522.2 1.9 1.0X + Specialized 0 / 0 209715200.0 0.0 401598.7X */ } @@ -71,14 +71,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var array: GenericArrayData = null val 
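The renames in PATCH 52 describe what the benchmark actually contrasts: the same primitive input held behind the generic boxed representation versus the specialized one. The two construction paths, sketched with the names this series introduces:

  val input = new Array[Int](1024)
  val generic: GenericArrayData     = new GenericRefArrayData(input)    // boxes into Array[Any]
  val specialized: GenericArrayData = GenericArrayData.allocate(input)  // keeps the Array[Int] as is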
primitiveDoubleArray = new Array[Int](count) - val denseDoubleArray = { i: Int => + val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { array = GenericArrayData.allocate(primitiveDoubleArray) n += 1 } } - val sparseDoubleArray = { i: Int => + val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { array = new GenericRefArrayData(primitiveDoubleArray) @@ -88,16 +88,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters, minNumIters = 10, minTime = 1.milliseconds) - benchmark.addCase("Sparse")(sparseDoubleArray) - benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.addCase("Generic ")(genericDoubleArray) + benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 40 / 44 523.2 1.9 1.0X - Dense 0 / 0 225500215.1 0.0 431013.0X + Generic 40 / 44 523.2 1.9 1.0X + Specialized 0 / 0 225500215.1 0.0 431013.0X */ } @@ -107,7 +107,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) var primitiveIntArray: Array[Int] = null - val sparseIntArray = { i: Int => + val genericIntArray = { i: Int => var n = 0 while (n < iters) { primitiveIntArray = intSparseArray.toIntArray @@ -123,16 +123,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Get int primitive array", count * iters) - benchmark.addCase("Sparse int")(sparseIntArray) - benchmark.addCase("Dense int")(denseIntArray) + benchmark.addCase("Generic ")(genericIntArray) + benchmark.addCase("Specialized")(denseIntArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Get int primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse int 67 / 70 783.9 1.3 1.0X - Dense int 41 / 43 1263.8 0.8 1.6X + Generic 67 / 70 783.9 1.3 1.0X + Specialized 41 / 43 1263.8 0.8 1.6X */ } @@ -142,14 +142,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null - val sparseDoubleArray = { i: Int => + val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { primitiveDoubleArray = doubleSparseArray.toDoubleArray n += 1 } } - val denseDoubleArray = { i: Int => + val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { primitiveDoubleArray = doubleDenseArray.toDoubleArray @@ -158,16 +158,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Get double primitive array", count * iters) - benchmark.addCase("Sparse double")(sparseDoubleArray) - benchmark.addCase("Dense double")(denseDoubleArray) + benchmark.addCase("Generic ")(genericDoubleArray) + benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run /* OpenJDK 64-Bit Server VM 
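The gap in the "Get ... primitive array" numbers follows from what `to*Array` must do on each representation: the generic form unboxes element by element, the specialized form is one `System.arraycopy`. A self-contained sketch of the two shapes (method names are illustrative):

  def toIntArrayGeneric(values: Array[Any]): Array[Int] = {
    val out = new Array[Int](values.length)
    var i = 0
    while (i < values.length) {
      out(i) = values(i).asInstanceOf[Int]   // per-element unboxing
      i += 1
    }
    out
  }

  def toIntArraySpecialized(values: Array[Int]): Array[Int] = {
    val out = new Array[Int](values.length)
    System.arraycopy(values, 0, out, 0, values.length)   // single bulk copy
    out
  }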
1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse double 211 / 217 248.6 4.0 1.0X - Dense double 95 / 100 554.1 1.8 2.2X + Generic 211 / 217 248.6 4.0 1.0X + Specialized 95 / 100 554.1 1.8 2.2X */ } @@ -176,7 +176,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var result: Int = 0 val sparseArray = new GenericRefArrayData(new Array[Int](count)) - val sparseIntArray = { i: Int => + val genericIntArray = { i: Int => var n = 0 while (n < iters) { val len = sparseArray.numElements @@ -208,7 +208,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) - benchmark.addCase("Sparse")(sparseIntArray) + benchmark.addCase("Sparse")(genericIntArray) benchmark.addCase("Dense ")(denseIntArray) benchmark.run /* @@ -216,8 +216,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase { Intel Xeon E3-12xx v2 (Ivy Bridge) Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 160 / 163 1314.5 0.8 1.0X - Dense 68 / 69 3080.0 0.3 2.3X + Generic 160 / 163 1314.5 0.8 1.0X + Specialized 68 / 69 3080.0 0.3 2.3X */ } @@ -226,7 +226,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { var result: Double = 0 val sparseArray = new GenericRefArrayData(new Array[Double](count)) - val sparseDoubleArray = { i: Int => + val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { val len = sparseArray.numElements @@ -242,7 +242,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val denseArray = GenericArrayData.allocate(new Array[Double](count)) - val denseDoubleArray = { i: Int => + val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { val len = denseArray.numElements @@ -258,16 +258,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Read GenericArrayData Double", count * iters) - benchmark.addCase("Sparse")(sparseDoubleArray) - benchmark.addCase("Dense ")(denseDoubleArray) + benchmark.addCase("Generic")(genericDoubleArray) + benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run /* OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Sparse 611 / 613 343.3 2.9 1.0X - Dense 199 / 202 1051.5 1.0 3.1X + Generic 611 / 613 343.3 2.9 1.0X + Specialized 199 / 202 1051.5 1.0 3.1X */ } From bf34ec4076f01904ebce8c29113c4c63a3f1b6c3 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 00:51:26 +0900 Subject: [PATCH 53/75] Better usage of GenericArrayData --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 68df41b5802d..7bcaea7ea2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericRefArrayData], + classOf[GenericArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index db5ca5f5c7a6..0b1ad58da83f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -40,7 +40,7 @@ object GenericArrayData { new GenericShortArrayData(primitiveArray) def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericArrayData = + def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray) } From 9cd7776ad9cf698a3c26185860fba16c7d794576 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 10:20:15 +0900 Subject: [PATCH 54/75] revert part of changes --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7ea2f7..68df41b5802d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericArrayData], + classOf[GenericRefArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } From d0b3f60be7694dcebfb5f9ba9e85357e6d82a023 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 12:24:15 +0900 Subject: [PATCH 55/75] undo revert --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 68df41b5802d..7bcaea7ea2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericRefArrayData], + classOf[GenericArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } From 8b19f75eaea4fe648570cadf88d4aed73a070537 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 17:35:12 +0900 Subject: [PATCH 56/75] revert changes at 0800fdc5 --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- .../org/apache/spark/sql/catalyst/util/GenericArrayData.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 7bcaea7ea2f7..68df41b5802d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericArrayData], + classOf[GenericRefArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 0b1ad58da83f..db5ca5f5c7a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -40,7 +40,7 @@ object GenericArrayData { new GenericShortArrayData(primitiveArray) def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = + def allocate(primitiveArray: Array[Boolean]): GenericArrayData = new GenericBooleanArrayData(primitiveArray) } From 
7b0e7696a7ceafab2ab3707419ba2d4ed27079ac Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jun 2016 20:38:29 +0900 Subject: [PATCH 57/75] add null check after asInstanceOf --- .../apache/spark/sql/catalyst/util/GenericArrayData.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index db5ca5f5c7a6..46209a0b65d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -233,6 +233,9 @@ class GenericArrayData(val array: Array[Any], } val other = o.asInstanceOf[GenericArrayData] + if (other eq null) { + return false; + } val len = numElements() if (len != other.numElements()) { @@ -350,6 +353,9 @@ final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericA } val other = o.asInstanceOf[GenericIntArrayData] + if (other eq null) { + return false; + } java.util.Arrays.equals(primitiveArray, other.primitiveArray) } From 1a0486f8525c16286070e68c22509918887d7674 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 27 Jun 2016 00:44:38 +0900 Subject: [PATCH 58/75] generate GenericArrayData.allocate in NewInstance --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 68df41b5802d..7bcaea7ea2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericRefArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -459,7 +459,7 @@ object ScalaReflection extends ScalaReflection { case dt => NewInstance( - classOf[GenericRefArrayData], + classOf[GenericArrayData], input :: Nil, dataType = ArrayType(dt, schemaFor(elementType).nullable)) } From 963876f360c4321ab05b3bae4886dca4c8ed85f3 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 27 Jun 2016 12:28:04 +0900 Subject: [PATCH 59/75] fix test failure --- .../catalyst/expressions/objects/objects.scala | 13 +++++++++++-- .../sql/catalyst/util/GenericArrayData.scala | 15 ++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 2949d391a034..865dae01b5ef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -226,6 +226,14 @@ case class NewInstance( outerPointer: Option[() 
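On the guard added in PATCH 57: in Scala, `asInstanceOf` never fails on `null` (the cast simply yields `null`), whereas `isInstanceOf` is always `false` for `null`. A cast alone therefore establishes nothing about nullness, which is what the post-cast check is reacting to:

  val o: Any = null
  o.isInstanceOf[String]   // false: isInstanceOf rejects null
  o.asInstanceOf[String]   // null: the cast succeeds and propagates null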
=> AnyRef]) extends Expression with NonSQLExpression { private val className = cls.getName + private val instantiatedCls: Class[_] = { + if (!cls.isAssignableFrom(classOf[GenericArrayData])) { + cls + } else { + GenericArrayData.instantiatedClass(dataType) + } + } + override def nullable: Boolean = propagateNull override def children: Seq[Expression] = arguments @@ -236,7 +244,8 @@ case class NewInstance( // Note that static inner classes (e.g., inner classes within Scala objects) don't need // outer pointer registration. val needOuterPointer = - outerPointer.isEmpty && cls.isMemberClass && !Modifier.isStatic(cls.getModifiers) + outerPointer.isEmpty && instantiatedCls.isMemberClass && + !Modifier.isStatic(instantiatedCls.getModifiers) childrenResolved && !needOuterPointer } @@ -297,7 +306,7 @@ case class NewInstance( ev.copy(code = code, isNull = isNull) } - override def toString: String = s"newInstance($cls)" + override def toString: String = s"newInstance($instantiatedCls)" } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 46209a0b65d0..7aa298a34b32 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -40,8 +40,21 @@ object GenericArrayData { new GenericShortArrayData(primitiveArray) def allocate(primitiveArray: Array[Byte]): GenericByteArrayData = new GenericByteArrayData(primitiveArray) - def allocate(primitiveArray: Array[Boolean]): GenericArrayData = + def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray) + + def instantiatedClass(dt: DataType): Class[_] = { + dt match { + case IntegerType => classOf[GenericIntArrayData] + case LongType => classOf[GenericLongArrayData] + case FloatType => classOf[GenericFloatArrayData] + case DoubleType => classOf[GenericDoubleArrayData] + case ShortType => classOf[GenericShortArrayData] + case ByteType => classOf[GenericByteArrayData] + case BooleanType => classOf[GenericBooleanArrayData] + case _ => classOf[GenericRefArrayData] + } + } } private object GenericArrayData { From d50ee045b897ba07a9fe87c488f7713e8cffc52a Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 27 Jun 2016 17:45:08 +0900 Subject: [PATCH 60/75] fix test failure --- .../sql/catalyst/expressions/objects/objects.scala | 13 ++----------- .../spark/sql/catalyst/util/GenericArrayData.scala | 13 ------------- .../benchmark/GenericArrayDataBenchmark.scala | 6 +++--- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 865dae01b5ef..2949d391a034 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -226,14 +226,6 @@ case class NewInstance( outerPointer: Option[() => AnyRef]) extends Expression with NonSQLExpression { private val className = cls.getName - private val instantiatedCls: Class[_] = { - if (!cls.isAssignableFrom(classOf[GenericArrayData])) { - cls - } else { - GenericArrayData.instantiatedClass(dataType) - } - } - override def nullable: Boolean = propagateNull override def 
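The `instantiatedClass` mapping introduced in PATCH 59 (and backed out again in PATCH 60) gives `NewInstance` the concrete class that `allocate` would return for a given element type. A usage sketch, assuming the patched object is in scope and using types from `org.apache.spark.sql.types`:

  GenericArrayData.instantiatedClass(IntegerType)   // classOf[GenericIntArrayData]
  GenericArrayData.instantiatedClass(BooleanType)   // classOf[GenericBooleanArrayData]
  GenericArrayData.instantiatedClass(StringType)    // classOf[GenericRefArrayData], the fallback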
children: Seq[Expression] = arguments @@ -244,8 +236,7 @@ case class NewInstance( // Note that static inner classes (e.g., inner classes within Scala objects) don't need // outer pointer registration. val needOuterPointer = - outerPointer.isEmpty && instantiatedCls.isMemberClass && - !Modifier.isStatic(instantiatedCls.getModifiers) + outerPointer.isEmpty && cls.isMemberClass && !Modifier.isStatic(cls.getModifiers) childrenResolved && !needOuterPointer } @@ -306,7 +297,7 @@ case class NewInstance( ev.copy(code = code, isNull = isNull) } - override def toString: String = s"newInstance($instantiatedCls)" + override def toString: String = s"newInstance($cls)" } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 7aa298a34b32..138340a3a55e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -42,19 +42,6 @@ object GenericArrayData { new GenericByteArrayData(primitiveArray) def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData = new GenericBooleanArrayData(primitiveArray) - - def instantiatedClass(dt: DataType): Class[_] = { - dt match { - case IntegerType => classOf[GenericIntArrayData] - case LongType => classOf[GenericLongArrayData] - case FloatType => classOf[GenericFloatArrayData] - case DoubleType => classOf[GenericDoubleArrayData] - case ShortType => classOf[GenericShortArrayData] - case ByteType => classOf[GenericByteArrayData] - case BooleanType => classOf[GenericBooleanArrayData] - case _ => classOf[GenericRefArrayData] - } - } } private object GenericArrayData { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 5824dc5343e5..c956e77e8197 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -271,17 +271,17 @@ class GenericArrayDataBenchmark extends BenchmarkBase { */ } - test("allocate GenericArrayData") { + ignore("allocate GenericArrayData") { allocateGenericIntArray(20) allocateGenericDoubleArray(20) } - test("get primitive array") { + ignore("get primitive array") { getPrimitiveIntArray(50) getPrimitiveDoubleArray(50) } - test("read elements in GenericArrayData") { + ignore("read elements in GenericArrayData") { readGenericIntArray(100) readGenericDoubleArray(100) } From d5814397154b7f66148f2b73b63b4504a6d11c6d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 12 Jul 2016 02:32:46 +0900 Subject: [PATCH 61/75] update --- .../benchmark/GenericArrayDataBenchmark.scala | 12 ++------- .../benchmark/PrimitiveArrayBenchmark.scala | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index c956e77e8197..de3084f6b42c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -25,7 
+25,8 @@ import org.apache.spark.util.Benchmark /** * Benchmark [[GenericArrayData]] for specialized representation with primitive type * To run this: - * build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" + * 1. replace ignore(...) with test(...) + * 2. build/sbt "sql/test-only *benchmark.GenericArrayDataBenchmark" * * Benchmarks in this file are skipped in normal builds. */ @@ -285,13 +286,4 @@ class GenericArrayDataBenchmark extends BenchmarkBase { readGenericIntArray(100) readGenericDoubleArray(100) } - - def main(args: Array[String]): Unit = { - allocateGenericIntArray(20) - allocateGenericDoubleArray(20) - getPrimitiveIntArray(50) - getPrimitiveDoubleArray(50) - readGenericIntArray(20) - readGenericDoubleArray(20) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index a3f32ec3da04..ea4be010f160 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -27,16 +27,28 @@ import org.apache.spark.util.Benchmark * Benchmark [[PrimitiveArray]] for DataFrame and Dataset program using primitive array * To run this: * 1. replace ignore(...) with test(...) +<<<<<<< 1daa9216fe2a928d9c0b6153b12b043fac3c4fa0 * 2. build/sbt "sql/test-only *benchmark.PrimitiveArrayBenchmark" +======= + * 2. build/sbt "sql/test-only *benchmark.PrimitiveArrayDataBenchmark" +>>>>>>> update * * Benchmarks in this file are skipped in normal builds. */ class PrimitiveArrayBenchmark extends BenchmarkBase { +<<<<<<< 1daa9216fe2a928d9c0b6153b12b043fac3c4fa0 def writeDatasetArray(iters: Int): Unit = { import sparkSession.implicits._ val count = 1024 * 1024 * 2 +======= + + def showArray(iters: Int): Unit = { + import sparkSession.implicits._ + + val count = 1024 * 1024 * 24 +>>>>>>> update val sc = sparkSession.sparkContext val primitiveIntArray = Array.fill[Int](count)(65535) @@ -67,6 +79,7 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { benchmark.addCase("Double")(doubleArray) benchmark.run /* +<<<<<<< 1daa9216fe2a928d9c0b6153b12b043fac3c4fa0 OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Write an array in Dataset: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative @@ -78,6 +91,19 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { ignore("Write an array in Dataset") { writeDatasetArray(4) +======= + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Read an array in DataFrame: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Int 502 / 530 50.1 20.0 1.0X + Double 1111 / 1170 22.7 44.1 0.5X + */ + } + + ignore("Read an array in DataFrame") { + showArray(1) +>>>>>>> update } def writeArray(iters: Int): Unit = { From 8063e634146058c7ebc2d465b87c3d4fd16c3516 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 12 Jul 2016 03:04:25 +0900 Subject: [PATCH 62/75] replace new GenericArrayData with GenericArrayData.allocate --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++-- .../org/apache/spark/sql/catalyst/expressions/xml/xpath.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 5f533fecf8d0..712c889c7aa2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1596,8 +1596,8 @@ case class Sentences( widx = wi.current if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) } - result += new GenericArrayData(words) + result += GenericArrayData.allocate(words) } - new GenericArrayData(result) + GenericArrayData.allocate(result) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index aa328045cafd..1e54acd26ab6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -210,7 +210,7 @@ case class XPathList(xml: Expression, path: Expression) extends XPathExtract { ret(i) = UTF8String.fromString(nodeList.item(i).getNodeValue) i += 1 } - new GenericArrayData(ret) + GenericArrayData.allocate(ret) } else { null } From 090869d19f588da7729ea3c4580b5f3dfbffbc6e Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 19 Jul 2016 21:00:44 +0900 Subject: [PATCH 63/75] update benchmark program --- .../benchmark/PrimitiveArrayBenchmark.scala | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index ea4be010f160..a3f32ec3da04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -27,28 +27,16 @@ import org.apache.spark.util.Benchmark * Benchmark [[PrimitiveArray]] for DataFrame and Dataset program using primitive array * To run this: * 1. replace ignore(...) with test(...) -<<<<<<< 1daa9216fe2a928d9c0b6153b12b043fac3c4fa0 * 2. build/sbt "sql/test-only *benchmark.PrimitiveArrayBenchmark" -======= - * 2. build/sbt "sql/test-only *benchmark.PrimitiveArrayDataBenchmark" ->>>>>>> update * * Benchmarks in this file are skipped in normal builds. 
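The call-site rewrite in PATCH 62 is mechanical but not cosmetic: a constructor call is pinned to the generic boxed form, while the factory is free to return a specialized subclass when it is handed a primitive array. In sketch form:

  new GenericArrayData(values)        // always the generic representation
  GenericArrayData.allocate(values)   // may return a Generic*ArrayData for primitive input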
*/ class PrimitiveArrayBenchmark extends BenchmarkBase { -<<<<<<< 1daa9216fe2a928d9c0b6153b12b043fac3c4fa0 def writeDatasetArray(iters: Int): Unit = { import sparkSession.implicits._ val count = 1024 * 1024 * 2 -======= - - def showArray(iters: Int): Unit = { - import sparkSession.implicits._ - - val count = 1024 * 1024 * 24 ->>>>>>> update val sc = sparkSession.sparkContext val primitiveIntArray = Array.fill[Int](count)(65535) @@ -79,7 +67,6 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { benchmark.addCase("Double")(doubleArray) benchmark.run /* -<<<<<<< 1daa9216fe2a928d9c0b6153b12b043fac3c4fa0 OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 Intel Xeon E3-12xx v2 (Ivy Bridge) Write an array in Dataset: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative @@ -91,19 +78,6 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { ignore("Write an array in Dataset") { writeDatasetArray(4) -======= - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read an array in DataFrame: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Int 502 / 530 50.1 20.0 1.0X - Double 1111 / 1170 22.7 44.1 0.5X - */ - } - - ignore("Read an array in DataFrame") { - showArray(1) ->>>>>>> update } def writeArray(iters: Int): Unit = { From 1837792885f09043badd760ae2750590a63cb931 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 26 Jul 2016 02:00:17 +0900 Subject: [PATCH 64/75] rebase --- .../sql/catalyst/util/GenericArrayData.scala | 2 +- .../execution/datasources/jdbc/JDBCRDD.scala | 131 ++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 138340a3a55e..10d863f0cfdb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -81,7 +81,7 @@ class GenericArrayData(val array: Array[Any], def this(array: Array[Any]) = this(array, null, null, null, null, null, null, null) - def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) + def this(seqOrArray: Any) = this(GenericRefArrayData.anyToSeq(seqOrArray)) override def copy(): ArrayData = { if (booleanArray != null) new GenericArrayData(booleanArray.clone()) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index c0fabc81e42a..938f041bd565 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -229,6 +229,137 @@ private[jdbc] class JDBCRDD( } } + // A `JDBCValueSetter` is responsible for converting and setting a value from `ResultSet` + // into a field for `MutableRow`. The last argument `Int` means the index for the + // value to be set in the row and also used for the value to retrieve from `ResultSet`. + private type JDBCValueSetter = (ResultSet, MutableRow, Int) => Unit + + /** + * Creates `JDBCValueSetter`s according to [[StructType]], which can set + * each value from `ResultSet` to each field of [[MutableRow]] correctly. 
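The `JDBCValueSetter` machinery being added here builds one conversion closure per schema field up front, so the per-row read loop never re-matches on column types. A simplified, self-contained sketch of the same pattern (object name and SQL type strings are illustrative):

  import java.sql.ResultSet

  object SetterSketch {
    type Setter = (ResultSet, Array[Any], Int) => Unit

    def makeSetter(typeName: String): Setter = typeName match {
      case "INT"    => (rs, row, pos) => row(pos) = rs.getInt(pos + 1)
      case "DOUBLE" => (rs, row, pos) => row(pos) = rs.getDouble(pos + 1)
      case other    => throw new IllegalArgumentException(s"Unsupported type: $other")
    }

    // Built once per schema, then reused for every row:
    // val setters: Array[Setter] = typeNames.map(makeSetter)
  }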
+ */ + def makeSetters(schema: StructType): Array[JDBCValueSetter] = + schema.fields.map(sf => makeSetter(sf.dataType, sf.metadata)) + + private def makeSetter(dt: DataType, metadata: Metadata): JDBCValueSetter = dt match { + case BooleanType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setBoolean(pos, rs.getBoolean(pos + 1)) + + case DateType => + (rs: ResultSet, row: MutableRow, pos: Int) => + // DateTimeUtils.fromJavaDate does not handle null value, so we need to check it. + val dateVal = rs.getDate(pos + 1) + if (dateVal != null) { + row.setInt(pos, DateTimeUtils.fromJavaDate(dateVal)) + } else { + row.update(pos, null) + } + + // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal + // object returned by ResultSet.getBigDecimal is not correctly matched to the table + // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale. + // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through + // a BigDecimal object with scale as 0. But the dataframe schema has correct type as + // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then + // retrieve it, you will get wrong result 199.99. + // So it is needed to set precision and scale for Decimal based on JDBC metadata. + case DecimalType.Fixed(p, s) => + (rs: ResultSet, row: MutableRow, pos: Int) => + val decimal = + nullSafeConvert[java.math.BigDecimal](rs.getBigDecimal(pos + 1), d => Decimal(d, p, s)) + row.update(pos, decimal) + + case DoubleType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setDouble(pos, rs.getDouble(pos + 1)) + + case FloatType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setFloat(pos, rs.getFloat(pos + 1)) + + case IntegerType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setInt(pos, rs.getInt(pos + 1)) + + case LongType if metadata.contains("binarylong") => + (rs: ResultSet, row: MutableRow, pos: Int) => + val bytes = rs.getBytes(pos + 1) + var ans = 0L + var j = 0 + while (j < bytes.size) { + ans = 256 * ans + (255 & bytes(j)) + j = j + 1 + } + row.setLong(pos, ans) + + case LongType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.setLong(pos, rs.getLong(pos + 1)) + + case StringType => + (rs: ResultSet, row: MutableRow, pos: Int) => + // TODO(davies): use getBytes for better performance, if the encoding is UTF-8 + row.update(pos, UTF8String.fromString(rs.getString(pos + 1))) + + case TimestampType => + (rs: ResultSet, row: MutableRow, pos: Int) => + val t = rs.getTimestamp(pos + 1) + if (t != null) { + row.setLong(pos, DateTimeUtils.fromJavaTimestamp(t)) + } else { + row.update(pos, null) + } + + case BinaryType => + (rs: ResultSet, row: MutableRow, pos: Int) => + row.update(pos, rs.getBytes(pos + 1)) + + case ArrayType(et, _) => + val elementConversion = et match { + case TimestampType => + (array: Object) => + array.asInstanceOf[Array[java.sql.Timestamp]].map { timestamp => + nullSafeConvert(timestamp, DateTimeUtils.fromJavaTimestamp) + } + + case StringType => + (array: Object) => + array.asInstanceOf[Array[java.lang.String]] + .map(UTF8String.fromString) + + case DateType => + (array: Object) => + array.asInstanceOf[Array[java.sql.Date]].map { date => + nullSafeConvert(date, DateTimeUtils.fromJavaDate) + } + + case dt: DecimalType => + (array: Object) => + array.asInstanceOf[Array[java.math.BigDecimal]].map { decimal => + nullSafeConvert[java.math.BigDecimal]( + decimal, d => Decimal(d, dt.precision, dt.scale)) + } + + case LongType if 
metadata.contains("binarylong") => + throw new IllegalArgumentException(s"Unsupported array element " + + s"type ${dt.simpleString} based on binary") + + case ArrayType(_, _) => + throw new IllegalArgumentException("Nested arrays unsupported") + + case _ => (array: Object) => array.asInstanceOf[Array[Any]] + } + + (rs: ResultSet, row: MutableRow, pos: Int) => + val array = nullSafeConvert[Object]( + rs.getArray(pos + 1).getArray, + array => GenericArrayData.allocate(elementConversion.apply(array))) + row.update(pos, array) + + case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") + } + /** * Runs the SQL query against the JDBC driver. * From 06b07da4fbfd6e7482fd0bc1a4b064ef37c857d3 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 10 Sep 2016 20:39:35 +0900 Subject: [PATCH 65/75] rebase --- .../aggregate/ApproximatePercentile.scala | 2 +- .../expressions/ObjectExpressionsSuite.scala | 16 +-- .../execution/datasources/jdbc/JDBCRDD.scala | 131 ------------------ .../datasources/jdbc/JdbcUtils.scala | 2 +- 4 files changed, 10 insertions(+), 141 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 692cbd7c0d32..4d49af394b8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -143,7 +143,7 @@ case class ApproximatePercentile( if (result.length == 0) { null } else if (returnPercentileArray) { - new GenericArrayData(result) + GenericArrayData.allocate(result) } else { result(0) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index 3edcc02f1526..ab14981ad184 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -39,7 +39,7 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) - val structExpected = new GenericArrayData( + val structExpected = GenericArrayData.allocate( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) @@ -47,8 +47,8 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) - val arrayExpected = new GenericArrayData( - Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) + val arrayExpected = GenericArrayData.allocate( + Array(GenericArrayData.allocate(Array(1, 2)), GenericArrayData.allocate(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) @@ -56,13 +56,13 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { 
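`nullSafeConvert`, used throughout the array-element conversions above, is a small guard that keeps converters from ever seeing `null`. Its assumed shape (the real definition lives elsewhere in the JDBC datasource code; this is a sketch, not a quote):

  def nullSafeConvert[T](input: T, f: T => Any): Any =
    if (input == null) null else f(input)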
val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) - val mapExpected = new GenericArrayData(Seq( + val mapExpected = GenericArrayData.allocate(Seq( new ArrayBasedMapData( - new GenericArrayData(Array(1, 2)), - new GenericArrayData(Array(100, 200))), + GenericArrayData.allocate(Array(1, 2)), + GenericArrayData.allocate(Array(100, 200))), new ArrayBasedMapData( - new GenericArrayData(Array(3, 4)), - new GenericArrayData(Array(300, 400))))) + GenericArrayData.allocate(Array(3, 4)), + GenericArrayData.allocate(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 938f041bd565..c0fabc81e42a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -229,137 +229,6 @@ private[jdbc] class JDBCRDD( } } - // A `JDBCValueSetter` is responsible for converting and setting a value from `ResultSet` - // into a field for `MutableRow`. The last argument `Int` means the index for the - // value to be set in the row and also used for the value to retrieve from `ResultSet`. - private type JDBCValueSetter = (ResultSet, MutableRow, Int) => Unit - - /** - * Creates `JDBCValueSetter`s according to [[StructType]], which can set - * each value from `ResultSet` to each field of [[MutableRow]] correctly. - */ - def makeSetters(schema: StructType): Array[JDBCValueSetter] = - schema.fields.map(sf => makeSetter(sf.dataType, sf.metadata)) - - private def makeSetter(dt: DataType, metadata: Metadata): JDBCValueSetter = dt match { - case BooleanType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setBoolean(pos, rs.getBoolean(pos + 1)) - - case DateType => - (rs: ResultSet, row: MutableRow, pos: Int) => - // DateTimeUtils.fromJavaDate does not handle null value, so we need to check it. - val dateVal = rs.getDate(pos + 1) - if (dateVal != null) { - row.setInt(pos, DateTimeUtils.fromJavaDate(dateVal)) - } else { - row.update(pos, null) - } - - // When connecting with Oracle DB through JDBC, the precision and scale of BigDecimal - // object returned by ResultSet.getBigDecimal is not correctly matched to the table - // schema reported by ResultSetMetaData.getPrecision and ResultSetMetaData.getScale. - // If inserting values like 19999 into a column with NUMBER(12, 2) type, you get through - // a BigDecimal object with scale as 0. But the dataframe schema has correct type as - // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then - // retrieve it, you will get wrong result 199.99. - // So it is needed to set precision and scale for Decimal based on JDBC metadata. 
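The expected values in this test spell out how Catalyst encodes a map: two parallel arrays, one of keys and one of values, wrapped in `ArrayBasedMapData`. In isolation:

  val keys   = GenericArrayData.allocate(Array(1, 2))
  val values = GenericArrayData.allocate(Array(100, 200))
  val map    = new ArrayBasedMapData(keys, values)   // represents Map(1 -> 100, 2 -> 200)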
- case DecimalType.Fixed(p, s) => - (rs: ResultSet, row: MutableRow, pos: Int) => - val decimal = - nullSafeConvert[java.math.BigDecimal](rs.getBigDecimal(pos + 1), d => Decimal(d, p, s)) - row.update(pos, decimal) - - case DoubleType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setDouble(pos, rs.getDouble(pos + 1)) - - case FloatType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setFloat(pos, rs.getFloat(pos + 1)) - - case IntegerType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setInt(pos, rs.getInt(pos + 1)) - - case LongType if metadata.contains("binarylong") => - (rs: ResultSet, row: MutableRow, pos: Int) => - val bytes = rs.getBytes(pos + 1) - var ans = 0L - var j = 0 - while (j < bytes.size) { - ans = 256 * ans + (255 & bytes(j)) - j = j + 1 - } - row.setLong(pos, ans) - - case LongType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.setLong(pos, rs.getLong(pos + 1)) - - case StringType => - (rs: ResultSet, row: MutableRow, pos: Int) => - // TODO(davies): use getBytes for better performance, if the encoding is UTF-8 - row.update(pos, UTF8String.fromString(rs.getString(pos + 1))) - - case TimestampType => - (rs: ResultSet, row: MutableRow, pos: Int) => - val t = rs.getTimestamp(pos + 1) - if (t != null) { - row.setLong(pos, DateTimeUtils.fromJavaTimestamp(t)) - } else { - row.update(pos, null) - } - - case BinaryType => - (rs: ResultSet, row: MutableRow, pos: Int) => - row.update(pos, rs.getBytes(pos + 1)) - - case ArrayType(et, _) => - val elementConversion = et match { - case TimestampType => - (array: Object) => - array.asInstanceOf[Array[java.sql.Timestamp]].map { timestamp => - nullSafeConvert(timestamp, DateTimeUtils.fromJavaTimestamp) - } - - case StringType => - (array: Object) => - array.asInstanceOf[Array[java.lang.String]] - .map(UTF8String.fromString) - - case DateType => - (array: Object) => - array.asInstanceOf[Array[java.sql.Date]].map { date => - nullSafeConvert(date, DateTimeUtils.fromJavaDate) - } - - case dt: DecimalType => - (array: Object) => - array.asInstanceOf[Array[java.math.BigDecimal]].map { decimal => - nullSafeConvert[java.math.BigDecimal]( - decimal, d => Decimal(d, dt.precision, dt.scale)) - } - - case LongType if metadata.contains("binarylong") => - throw new IllegalArgumentException(s"Unsupported array element " + - s"type ${dt.simpleString} based on binary") - - case ArrayType(_, _) => - throw new IllegalArgumentException("Nested arrays unsupported") - - case _ => (array: Object) => array.asInstanceOf[Array[Any]] - } - - (rs: ResultSet, row: MutableRow, pos: Int) => - val array = nullSafeConvert[Object]( - rs.getArray(pos + 1).getArray, - array => GenericArrayData.allocate(elementConversion.apply(array))) - row.update(pos, array) - - case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") - } - /** * Runs the SQL query against the JDBC driver. 
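For orientation, the JDBCValueSetter array produced by makeSetters is consumed once per fetched row; the parallel implementation that survives in JdbcUtils (patched in the next hunk) drives it roughly as follows. This is a sketch under the names above, not code carried by the patch:

    val setters: Array[JDBCValueSetter] = makeSetters(schema)
    while (rs.next()) {
      var i = 0
      while (i < setters.length) {
        setters(i).apply(rs, mutableRow, i)
        if (rs.wasNull) mutableRow.setNullAt(i)
        i += 1
      }
    }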
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 41edb6511c2c..27beccb0c4d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -425,7 +425,7 @@ object JdbcUtils extends Logging { (rs: ResultSet, row: InternalRow, pos: Int) => val array = nullSafeConvert[Object]( rs.getArray(pos + 1).getArray, - array => new GenericArrayData(elementConversion.apply(array))) + array => GenericArrayData.allocate(elementConversion.apply(array))) row.update(pos, array) case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") From 06c22acf4b96961d84f51773517939cc5df1c41b Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 15:32:10 +0900 Subject: [PATCH 66/75] reimplement without factory method --- .../sql/catalyst/CatalystTypeConverters.scala | 8 +- .../spark/sql/catalyst/expressions/Cast.scala | 4 +- .../aggregate/ApproximatePercentile.scala | 2 +- .../expressions/aggregate/PivotFirst.scala | 2 +- .../expressions/aggregate/collect.scala | 2 +- .../codegen/GenerateSafeProjection.scala | 2 +- .../codegen/GenerateUnsafeProjection.scala | 2 +- .../expressions/collectionOperations.scala | 2 +- .../expressions/complexTypeCreator.scala | 11 +- .../expressions/complexTypeExtractors.scala | 4 +- .../expressions/objects/objects.scala | 8 +- .../expressions/regexpExpressions.scala | 4 +- .../expressions/stringExpressions.scala | 4 +- .../sql/catalyst/expressions/xml/xpath.scala | 2 +- .../sql/catalyst/json/JacksonParser.scala | 2 +- .../sql/catalyst/util/GenericArrayData.scala | 230 +----------------- .../sql/catalyst/ScalaReflectionSuite.scala | 4 +- .../analysis/AnalysisErrorSuite.scala | 4 +- .../encoders/EncoderResolutionSuite.scala | 4 +- .../catalyst/encoders/RowEncoderSuite.scala | 2 +- .../expressions/ObjectExpressionsSuite.scala | 16 +- .../expressions/UnsafeRowConverterSuite.scala | 2 +- .../codegen/GeneratedProjectionSuite.scala | 2 +- .../datasources/jdbc/JdbcUtils.scala | 2 +- .../parquet/ParquetRowConverter.scala | 4 +- .../sql/execution/python/EvaluatePython.scala | 4 +- .../spark/sql/test/ExamplePointUDT.scala | 2 +- .../org/apache/spark/sql/UnsafeRowSuite.scala | 2 +- .../spark/sql/UserDefinedTypeSuite.scala | 2 +- .../columnar/ColumnarTestUtils.scala | 2 +- .../spark/sql/hive/HiveInspectors.scala | 4 +- .../spark/sql/hive/HiveInspectorSuite.scala | 2 +- 32 files changed, 58 insertions(+), 289 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala index 2801827e7bb6..5b9161551a7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala @@ -159,9 +159,9 @@ object CatalystTypeConverters { override def toCatalystImpl(scalaValue: Any): ArrayData = { scalaValue match { case a: Array[_] => - GenericArrayData.allocate(a.map(elementConverter.toCatalyst)) + new GenericArrayData(a.map(elementConverter.toCatalyst)) case s: Seq[_] => - GenericArrayData.allocate(s.map(elementConverter.toCatalyst).toArray) + new GenericArrayData(s.map(elementConverter.toCatalyst).toArray) case i: JavaIterable[_] => val 
iter = i.iterator val convertedIterable = scala.collection.mutable.ArrayBuffer.empty[Any] @@ -169,7 +169,7 @@ object CatalystTypeConverters { val item = iter.next() convertedIterable += elementConverter.toCatalyst(item) } - GenericArrayData.allocate(convertedIterable.toArray) + new GenericArrayData(convertedIterable.toArray) } } @@ -410,7 +410,7 @@ object CatalystTypeConverters { case t: Timestamp => TimestampConverter.toCatalyst(t) case d: BigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) case d: JavaBigDecimal => new DecimalConverter(DecimalType(d.precision, d.scale)).toCatalyst(d) - case seq: Seq[Any] => GenericArrayData.allocate(seq.map(convertToCatalyst).toArray) + case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray) case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*) case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst)) case map: Map[_, _] => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 039b4469b99e..4db1ae6faa15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -388,7 +388,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w values(i) = elementCast(e) } }) - GenericArrayData.allocate(values) + new GenericArrayData(values) }) } @@ -864,7 +864,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w } } } - $evPrim = $arrayClass.allocate($values); + $evPrim = new $arrayClass($values); """ } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 4d49af394b8c..692cbd7c0d32 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -143,7 +143,7 @@ case class ApproximatePercentile( if (result.length == 0) { null } else if (returnPercentileArray) { - GenericArrayData.allocate(result) + new GenericArrayData(result) } else { result(0) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala index 23a8d5fd4903..087606077295 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PivotFirst.scala @@ -131,7 +131,7 @@ case class PivotFirst( for (i <- 0 until indexSize) { result(i) = input.get(mutableAggBufferOffset + i, valueDataType) } - GenericArrayData.allocate(result) + new GenericArrayData(result) } override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index f97fe5fe8d51..d2880d58aefe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -78,7 +78,7 @@ abstract class Collect extends ImperativeAggregate { } override def eval(input: InternalRow): Any = { - GenericArrayData.allocate(buffer.toArray) + new GenericArrayData(buffer.toArray) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 792d735a0d5a..b1cb6edefb85 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -96,7 +96,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] $values[$index] = ${elementConverter.value}; } } - final ArrayData $output = $arrayClass.allocate($values); + final ArrayData $output = new $arrayClass($values); """ ExprCode(code, "false", output) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 0854990f5ef8..0a802046c2f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -292,7 +292,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ${writeArrayToBuffer(ctx, keys, keyType, false, bufferHolder)} // Write the numBytes of key array into the first 8 bytes. 
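The comment above matters for a bug that surfaces later in this series: the size slot for the key array is 8 bytes wide, so it must be written as a long. Schematically, the unsafe map value is laid out as follows (a sketch inferred from the surrounding codegen, with tmpCursor pointing just past the size slot):

    // [ 8-byte numBytes of key array ][ key array bytes ][ value array bytes ]
    // the write therefore lands at tmpCursor - 8 and must cover all 8 bytes:
    Platform.putLong(bufferHolder.buffer, tmpCursor - 8, bufferHolder.cursor - tmpCursor)

The change below swaps this write to putInt, which fills only half of the slot and leaves the other 4 bytes stale; patch 69 further down reverts it after OrcQuerySuite caught the corruption.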
- Platform.putLong($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); + Platform.putInt($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); ${writeArrayToBuffer(ctx, values, valueType, true, bufferHolder)} } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 1d10a9034ab7..c863ba434120 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -206,7 +206,7 @@ case class SortArray(base: Expression, ascendingOrder: Expression) if (elementType != NullType) { java.util.Arrays.sort(data, if (ascending.asInstanceOf[Boolean]) lt else gt) } - GenericArrayData.allocate(data.asInstanceOf[Array[Any]]) + new GenericArrayData(data.asInstanceOf[Array[Any]]) } override def prettyName: String = "sort_array" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 640c32628cdd..c9f36649ec8e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -52,7 +52,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { override def nullable: Boolean = false override def eval(input: InternalRow): Any = { - GenericArrayData.allocate(children.map(_.eval(input)).toArray) + new GenericArrayData(children.map(_.eval(input)).toArray) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -76,7 +76,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { """ }) + s""" - final ArrayData ${ev.value} = $arrayClass.allocate($values); + final ArrayData ${ev.value} = new $arrayClass($values); this.$values = null; """) } @@ -130,8 +130,7 @@ case class CreateMap(children: Seq[Expression]) extends Expression { throw new RuntimeException("Cannot use null as map key!") } val valueArray = values.map(_.eval(input)).toArray - new ArrayBasedMapData( - GenericArrayData.allocate(keyArray), GenericArrayData.allocate(valueArray)) + new ArrayBasedMapData(new GenericArrayData(keyArray), new GenericArrayData(valueArray)) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -142,8 +141,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { ctx.addMutableState("Object[]", keyArray, s"this.$keyArray = null;") ctx.addMutableState("Object[]", valueArray, s"this.$valueArray = null;") - val keyData = s"$arrayClass.allocate($keyArray)" - val valueData = s"$arrayClass.allocate($valueArray)" + val keyData = s"new $arrayClass($keyArray)" + val valueData = s"new $arrayClass($valueArray)" ev.copy(code = s""" final boolean ${ev.isNull} = false; $keyArray = new Object[${keys.size}]; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index f17d0bc412b1..0c256c3d890f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -176,7 +176,7 @@ case class GetArrayStructFields( } i += 1 } - GenericArrayData.allocate(result) + new GenericArrayData(result) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -201,7 +201,7 @@ case class GetArrayStructFields( } } } - ${ev.value} = $arrayClass.allocate($values); + ${ev.value} = new $arrayClass($values); """ }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 2949d391a034..50e2ac3c36d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -281,11 +281,7 @@ case class NewInstance( val constructorCall = outer.map { gen => s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})""" }.getOrElse { - if (!cls.isAssignableFrom(classOf[GenericArrayData])) { - s"new $className(${argValues.mkString(", ")})" - } else { - s"${cls.getName}.allocate(${argValues.mkString(", ")})" - } + s"new $className(${argValues.mkString(", ")})" } val code = s""" @@ -541,7 +537,7 @@ case class MapObjects private( $loopIndex += 1; } - ${ev.value} = ${classOf[GenericArrayData].getName}.allocate($convertedArray); + ${ev.value} = new ${classOf[GenericArrayData].getName}($convertedArray); } """ ev.copy(code = code, isNull = genInputData.isNull) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 6aebe7970443..5648ad6b6dc1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -191,14 +191,14 @@ case class StringSplit(str: Expression, pattern: Expression) override def nullSafeEval(string: Any, regex: Any): Any = { val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) - GenericArrayData.allocate(strings.asInstanceOf[Array[Any]]) + new GenericArrayData(strings.asInstanceOf[Array[Any]]) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, pattern) => // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. 
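The covariance note explains an asymmetry with the interpreted path: generated Java can hand a UTF8String[] straight to a constructor expecting Object[], whereas Scala arrays are invariant, so nullSafeEval needs an explicit cast. A sketch of the interpreted side, mirroring the StringSplit code above:

    val strings: Array[UTF8String] =
      string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
    // Array[UTF8String] does not conform to Array[Any] in Scala, hence the cast:
    val result = new GenericArrayData(strings.asInstanceOf[Array[Any]])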
- s"""${ev.value} = $arrayClass.allocate($str.split($pattern, -1));""") + s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""") } override def prettyName: String = "split" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 712c889c7aa2..5f533fecf8d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1596,8 +1596,8 @@ case class Sentences( widx = wi.current if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) } - result += GenericArrayData.allocate(words) + result += new GenericArrayData(words) } - GenericArrayData.allocate(result) + new GenericArrayData(result) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index 1e54acd26ab6..aa328045cafd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -210,7 +210,7 @@ case class XPathList(xml: Expression, path: Expression) extends XPathExtract { ret(i) = UTF8String.fromString(nodeList.item(i).getNodeValue) i += 1 } - GenericArrayData.allocate(ret) + new GenericArrayData(ret) } else { null } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index ec87133c9d11..e476cb11a351 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -404,7 +404,7 @@ class JacksonParser( values += fieldConverter.apply(parser) } - GenericArrayData.allocate(values.toArray) + new GenericArrayData(values.toArray) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 10d863f0cfdb..64eed3d65b07 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -50,7 +50,6 @@ private object GenericArrayData { def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { case seq: Seq[Any] => seq case array: Array[_] => array.toSeq - case _ => Seq.empty } } @@ -129,7 +128,6 @@ class GenericArrayData(val array: Array[Any], override def getArray(ordinal: Int): ArrayData = getAs(ordinal) override def getMap(ordinal: Int): MapData = getAs(ordinal) -<<<<<<< 8df1c34a012f074c4c970374cbd9ef6b43d2a4ab override def isNullAt(ordinal: Int): Boolean = { if (booleanArray != null || byteArray != null || shortArray != null || intArray != null || longArray != null || floatArray != null || doubleArray != null) { @@ -228,13 +226,13 @@ class GenericArrayData(val array: Array[Any], } override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericArrayData]) { + if (!o.isInstanceOf[GenericArrayData]) { return false } val other = o.asInstanceOf[GenericArrayData] if (other eq null) { - return false; + return false } val len = 
numElements() @@ -331,227 +329,3 @@ class GenericArrayData(val array: Array[Any], result } } - -final class GenericIntArrayData(val primitiveArray: Array[Int]) extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericIntArrayData = new GenericIntArrayData(toIntArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getInt(ordinal: Int): Int = primitiveArray(ordinal) - override def toIntArray(): Array[Int] = { - val array = new Array[Int](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericIntArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericIntArrayData] - if (other eq null) { - return false; - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericLongArrayData(val primitiveArray: Array[Long]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericLongArrayData = new GenericLongArrayData(toLongArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getLong(ordinal: Int): Long = primitiveArray(ordinal) - override def toLongArray(): Array[Long] = { - val array = new Array[Long](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericLongArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericLongArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericFloatArrayData(val primitiveArray: Array[Float]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericFloatArrayData = new GenericFloatArrayData(toFloatArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getFloat(ordinal: Int): Float = primitiveArray(ordinal) - override def toFloatArray(): Array[Float] = { - val array = new Array[Float](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericFloatArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericFloatArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericDoubleArrayData(val primitiveArray: Array[Double]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericDoubleArrayData = new GenericDoubleArrayData(toDoubleArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getDouble(ordinal: Int): Double = primitiveArray(ordinal) - override def toDoubleArray(): Array[Double] = { - val array = new Array[Double](numElements) - System.arraycopy(primitiveArray, 0, array, 
0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericDoubleArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericDoubleArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericShortArrayData(val primitiveArray: Array[Short]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericShortArrayData = new GenericShortArrayData(toShortArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getShort(ordinal: Int): Short = primitiveArray(ordinal) - override def toShortArray(): Array[Short] = { - val array = new Array[Short](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericShortArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericShortArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericByteArrayData(val primitiveArray: Array[Byte]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericByteArrayData = new GenericByteArrayData(toByteArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getByte(ordinal: Int): Byte = primitiveArray(ordinal) - override def toByteArray(): Array[Byte] = { - val array = new Array[Byte](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericByteArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericByteArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - -final class GenericBooleanArrayData(val primitiveArray: Array[Boolean]) - extends GenericArrayData { - override def array(): Array[Any] = primitiveArray.toArray - - override def copy(): GenericBooleanArrayData = new GenericBooleanArrayData(toBooleanArray) - - override def numElements(): Int = primitiveArray.length - - override def isNullAt(ordinal: Int): Boolean = false - override def getBoolean(ordinal: Int): Boolean = primitiveArray(ordinal) - override def toBooleanArray(): Array[Boolean] = { - val array = new Array[Boolean](numElements) - System.arraycopy(primitiveArray, 0, array, 0, numElements) - array - } - - override def equals(o: Any): Boolean = { - if (o == null || !o.isInstanceOf[GenericBooleanArrayData]) { - return false - } - - val other = o.asInstanceOf[GenericBooleanArrayData] - if (other eq null) { - return false - } - - java.util.Arrays.equals(primitiveArray, other.primitiveArray) - } - - override def hashCode: Int = java.util.Arrays.hashCode(primitiveArray) -} - diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index ded074c5abc7..43b6afd9ad89 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -287,8 +287,8 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(serializer.children.head.isInstanceOf[Literal]) assert(serializer.children.head.asInstanceOf[Literal].value === UTF8String.fromString("value")) assert(serializer.children.last.isInstanceOf[NewInstance]) - assert(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData] - isAssignableFrom(serializer.children.last.asInstanceOf[NewInstance].cls)) + assert(serializer.children.last.asInstanceOf[NewInstance] + .cls.isAssignableFrom(classOf[org.apache.spark.sql.catalyst.util.GenericArrayData])) } private val dataTypeForComplexData = dataTypeFor[ComplexData] diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 6e59215dc8c6..21afe9fec594 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -57,8 +57,8 @@ private[sql] class UngroupableUDT extends UserDefinedType[UngroupableData] { override def sqlType: DataType = MapType(IntegerType, IntegerType) override def serialize(ungroupableData: UngroupableData): MapData = { - val keyArray = GenericArrayData.allocate(ungroupableData.data.keys.toSeq) - val valueArray = GenericArrayData.allocate(ungroupableData.data.values.toSeq) + val keyArray = new GenericArrayData(ungroupableData.data.keys.toSeq) + val valueArray = new GenericArrayData(ungroupableData.data.values.toSeq) new ArrayBasedMapData(keyArray, valueArray) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala index 49934354e5de..802397d50e85 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/EncoderResolutionSuite.scala @@ -70,11 +70,11 @@ class EncoderResolutionSuite extends PlanTest { val bound = encoder.resolveAndBind(attrs) // If no null values appear, it should works fine - bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, 2)))) + bound.fromRow(InternalRow(new GenericArrayData(Array(1, 2)))) // If there is null value, it should throw runtime exception val e = intercept[RuntimeException] { - bound.fromRow(InternalRow(GenericArrayData.allocate(Array(1, null)))) + bound.fromRow(InternalRow(new GenericArrayData(Array(1, null)))) } assert(e.getMessage.contains("Null value appeared in non-nullable field")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index 46575f7d63eb..1a5569a77dc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -51,7 +51,7 @@ class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - GenericArrayData.allocate(output) + new GenericArrayData(output) } override def 
deserialize(datum: Any): ExamplePoint = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index ab14981ad184..3edcc02f1526 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -39,7 +39,7 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) - val structExpected = GenericArrayData.allocate( + val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) @@ -47,8 +47,8 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) - val arrayExpected = GenericArrayData.allocate( - Array(GenericArrayData.allocate(Array(1, 2)), GenericArrayData.allocate(Array(3, 4)))) + val arrayExpected = new GenericArrayData( + Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) @@ -56,13 +56,13 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) - val mapExpected = GenericArrayData.allocate(Seq( + val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( - GenericArrayData.allocate(Array(1, 2)), - GenericArrayData.allocate(Array(100, 200))), + new GenericArrayData(Array(1, 2)), + new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( - GenericArrayData.allocate(Array(3, 4)), - GenericArrayData.allocate(Array(300, 400))))) + new GenericArrayData(Array(3, 4)), + new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index b634834c67e3..cf3cbe270753 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -291,7 +291,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(unsafeRow.getSizeInBytes == 8 + 2 * 8 + row1.getSizeInBytes + row2.getSizeInBytes) } - private def createArray(values: Any*): ArrayData = GenericArrayData.allocate(values.toArray) + private def createArray(values: Any*): ArrayData = new GenericArrayData(values.toArray) private def createMap(keys: Any*)(values: Any*): MapData = { assert(keys.length == values.length) diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index d6c9a9c0b638..b69b74b4240b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -86,7 +86,7 @@ class GeneratedProjectionSuite extends SparkFunSuite { test("generated unsafe projection with array of binary") { val row = InternalRow( Array[Byte](1, 2), - GenericArrayData.allocate(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) + new GenericArrayData(Array(Array[Byte](1, 2), null, Array[Byte](3, 4)))) val fields = (BinaryType :: ArrayType(BinaryType) :: Nil).toArray[DataType] val unsafeProj = UnsafeProjection.create(fields) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 27beccb0c4d2..41edb6511c2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -425,7 +425,7 @@ object JdbcUtils extends Logging { (rs: ResultSet, row: InternalRow, pos: Int) => val array = nullSafeConvert[Object]( rs.getArray(pos + 1).getArray, - array => GenericArrayData.allocate(elementConversion.apply(array))) + array => new GenericArrayData(elementConversion.apply(array))) row.update(pos, array) case _ => throw new IllegalArgumentException(s"Unsupported type ${dt.simpleString}") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 108977c23ec3..33dcf2f3fd16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -491,7 +491,7 @@ private[parquet] class ParquetRowConverter( override def getConverter(fieldIndex: Int): Converter = elementConverter - override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) + override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the // next value. 
`Row.copy()` only copies row cells, it doesn't do deep copy to objects stored @@ -590,7 +590,7 @@ private[parquet] class ParquetRowConverter( protected def newArrayUpdater(updater: ParentContainerUpdater) = new ParentContainerUpdater { override def start(): Unit = currentArray = ArrayBuffer.empty[Any] - override def end(): Unit = updater.set(GenericArrayData.allocate(currentArray.toArray)) + override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) override def set(value: Any): Unit = currentArray += value } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 0e496dfd29e8..46fd54e5c742 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -119,10 +119,10 @@ object EvaluatePython { case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c case (c: java.util.List[_], ArrayType(elementType, _)) => - GenericArrayData.allocate(c.asScala.map { e => fromJava(e, elementType)}.toArray) + new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) case (c, ArrayType(elementType, _)) if c.getClass.isArray => - GenericArrayData.allocate(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => ArrayBasedMapData( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index 6a8a5e060fd8..a73e4272950a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -49,7 +49,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { val output = new Array[Any](2) output(0) = p.x output(1) = p.y - GenericArrayData.allocate(output) + new GenericArrayData(output) } override def deserialize(datum: Any): ExamplePoint = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index c002dfcf4908..a32763db054f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -160,7 +160,7 @@ class UnsafeRowSuite extends SparkFunSuite { } test("calling hashCode on unsafe array returned by getArray(ordinal)") { - val row = InternalRow.apply(GenericArrayData.allocate(Array(1L))) + val row = InternalRow.apply(new GenericArrayData(Array(1L))) val unsafeRow = UnsafeProjection.create(Array[DataType](ArrayType(LongType))).apply(row) // Makes sure hashCode on unsafe array won't crash unsafeRow.getArray(0).hashCode() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 17ec9315e4a6..474f17ff7afb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -50,7 +50,7 @@ object UDT { override def sqlType: DataType = ArrayType(DoubleType, containsNull = false) override def serialize(features: MyDenseVector): 
ArrayData = { - GenericArrayData.allocate(features.data.map(_.asInstanceOf[Any])) + new GenericArrayData(features.data.map(_.asInstanceOf[Any])) } override def deserialize(datum: Any): MyDenseVector = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala index e590d2833477..686c8fa6f5fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala @@ -56,7 +56,7 @@ object ColumnarTestUtils { case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => - GenericArrayData.allocate(Array[Any](Random.nextInt(), Random.nextInt())) + new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 7002b6437611..e303065127c3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -481,7 +481,7 @@ private[hive] trait HiveInspectors { val values = li.getWritableConstantValue.asScala .map(unwrapper) .toArray - val constant = GenericArrayData.allocate(values) + val constant = new GenericArrayData(values) _ => constant case poi: VoidObjectInspector => _ => null // always be null for void object inspector @@ -637,7 +637,7 @@ private[hive] trait HiveInspectors { Option(li.getList(data)) .map { l => val values = l.asScala.map(unwrapper).toArray - GenericArrayData.allocate(values) + new GenericArrayData(values) } .orNull } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index 8cadaeedea69..3de1f4aeb74d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -229,7 +229,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { test("wrap / unwrap Array Type") { val dt = ArrayType(dataTypes(0)) - val d = GenericArrayData.allocate(Array(row(0), row(0))) + val d = new GenericArrayData(Array(row(0), row(0))) checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt))) checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt))) checkValue(d, From 4c6f41ea3f1854b9b741c92ebe3d2b6a314d945d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 15:57:12 +0900 Subject: [PATCH 67/75] update benchmark programs --- .../benchmark/GenericArrayDataBenchmark.scala | 128 ++++++------------ .../benchmark/PrimitiveArrayBenchmark.scala | 2 +- 2 files changed, 42 insertions(+), 88 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index de3084f6b42c..38bdbf9e448e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -40,14 +40,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val specializedIntArray = { i: Int => var n = 0 while (n < iters) { - array = GenericArrayData.allocate(primitiveIntArray) + array = new GenericArrayData(primitiveIntArray) n += 1 } } + val anyArray = primitiveIntArray.toArray[Any] val genericIntArray = { i: Int => var n = 0 while (n < iters) { - array = new GenericRefArrayData(primitiveIntArray) + array = new GenericArrayData(anyArray) n += 1 } } @@ -57,14 +58,6 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericIntArray) benchmark.addCase("Specialized")(specializedIntArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 40 / 43 522.2 1.9 1.0X - Specialized 0 / 0 209715200.0 0.0 401598.7X - */ } def allocateGenericDoubleArray(iters: Int): Unit = { @@ -75,14 +68,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { - array = GenericArrayData.allocate(primitiveDoubleArray) + array = new GenericArrayData(primitiveDoubleArray) n += 1 } } + val anyArray = primitiveDoubleArray.toArray[Any] val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { - array = new GenericRefArrayData(primitiveDoubleArray) + array = new GenericArrayData(anyArray) n += 1 } } @@ -92,99 +86,75 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 40 / 44 523.2 1.9 1.0X - Specialized 0 / 0 225500215.1 0.0 431013.0X - */ } def getPrimitiveIntArray(iters: Int): Unit = { - val count = 1024 * 1024 + val count = 1024 * 1024 * 8 - val intSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Int](count)) - val intDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Int](count)) + val anyArray: GenericArrayData = new GenericArrayData(new Array[Int](count).toArray[Any]) + val intArray: GenericArrayData = new GenericArrayData(new Array[Int](count)) var primitiveIntArray: Array[Int] = null val genericIntArray = { i: Int => var n = 0 while (n < iters) { - primitiveIntArray = intSparseArray.toIntArray + primitiveIntArray = anyArray.toIntArray n += 1 } } - val denseIntArray = { i: Int => + val specializedIntArray = { i: Int => var n = 0 while (n < iters) { - primitiveIntArray = intDenseArray.toIntArray + primitiveIntArray = intArray.toIntArray n += 1 } } val benchmark = new Benchmark("Get int primitive array", count * iters) - benchmark.addCase("Generic ")(genericIntArray) - benchmark.addCase("Specialized")(denseIntArray) + benchmark.addCase("Generic")(genericIntArray) + benchmark.addCase("Specialized")(specializedIntArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get int 
primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 67 / 70 783.9 1.3 1.0X - Specialized 41 / 43 1263.8 0.8 1.6X - */ } def getPrimitiveDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 + val count = 1024 * 1024 * 8 - val doubleSparseArray: GenericArrayData = new GenericRefArrayData(new Array[Double](count)) - val doubleDenseArray: GenericArrayData = GenericArrayData.allocate(new Array[Double](count)) + val anyArray: GenericArrayData = new GenericArrayData(new Array[Double](count).toArray[Any]) + val doubleArray: GenericArrayData = new GenericArrayData(new Array[Double](count)) var primitiveDoubleArray: Array[Double] = null val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { - primitiveDoubleArray = doubleSparseArray.toDoubleArray + primitiveDoubleArray = anyArray.toDoubleArray n += 1 } } val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { - primitiveDoubleArray = doubleDenseArray.toDoubleArray + primitiveDoubleArray = doubleArray.toDoubleArray n += 1 } } val benchmark = new Benchmark("Get double primitive array", count * iters) - benchmark.addCase("Generic ")(genericDoubleArray) + benchmark.addCase("Generic")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get double primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 211 / 217 248.6 4.0 1.0X - Specialized 95 / 100 554.1 1.8 2.2X - */ } def readGenericIntArray(iters: Int): Unit = { - val count = 1024 * 1024 * 2 + val count = 1024 * 1024 * 8 var result: Int = 0 - val sparseArray = new GenericRefArrayData(new Array[Int](count)) + val anyArray = new GenericArrayData(new Array[Int](count).toArray[Any]) val genericIntArray = { i: Int => var n = 0 while (n < iters) { - val len = sparseArray.numElements + val len = anyArray.numElements var sum = 0 var i = 0 while (i < len) { - sum += sparseArray.getInt(i) + sum += anyArray.getInt(i) i += 1 } result = sum @@ -192,15 +162,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } } - val denseArray = GenericArrayData.allocate(new Array[Int](count)) - val denseIntArray = { i: Int => + val intArray = new GenericArrayData(new Array[Int](count)) + val specializedIntArray = { i: Int => var n = 0 while (n < iters) { - val len = denseArray.numElements + val len = intArray.numElements var sum = 0 var i = 0 while (i < len) { - sum += denseArray.getInt(i) + sum += intArray.getInt(i) i += 1 } result = sum @@ -209,32 +179,24 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } val benchmark = new Benchmark("Read GenericArrayData Int", count * iters) - benchmark.addCase("Sparse")(genericIntArray) - benchmark.addCase("Dense ")(denseIntArray) + benchmark.addCase("Generic")(genericIntArray) + benchmark.addCase("Specialized")(specializedIntArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read GenericArrayData Int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 160 / 163 1314.5 0.8 1.0X - Specialized 68 / 69 3080.0 0.3 2.3X - */ } def readGenericDoubleArray(iters: 
Int): Unit = { - val count = 1024 * 1024 * 2 + val count = 1024 * 1024 *8 var result: Double = 0 - val sparseArray = new GenericRefArrayData(new Array[Double](count)) + val anyArray = new GenericArrayData(new Array[Double](count).toArray[Any]) val genericDoubleArray = { i: Int => var n = 0 while (n < iters) { - val len = sparseArray.numElements + val len = anyArray.numElements var sum = 0.toDouble var i = 0 while (i < len) { - sum += sparseArray.getDouble(i) + sum += anyArray.getDouble(i) i += 1 } result = sum @@ -242,15 +204,15 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } } - val denseArray = GenericArrayData.allocate(new Array[Double](count)) + val doubleArray = new GenericArrayData(new Array[Double](count)) val specializedDoubleArray = { i: Int => var n = 0 while (n < iters) { - val len = denseArray.numElements + val len = doubleArray.numElements var sum = 0.toDouble var i = 0 while (i < len) { - sum += denseArray.getDouble(i) + sum += doubleArray.getDouble(i) i += 1 } result = sum @@ -262,14 +224,6 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.0.4-301.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read GenericArrayData Double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Generic 611 / 613 343.3 2.9 1.0X - Specialized 199 / 202 1051.5 1.0 3.1X - */ } ignore("allocate GenericArrayData") { @@ -278,12 +232,12 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } ignore("get primitive array") { - getPrimitiveIntArray(50) - getPrimitiveDoubleArray(50) + getPrimitiveIntArray(20) + getPrimitiveDoubleArray(20) } ignore("read elements in GenericArrayData") { - readGenericIntArray(100) - readGenericDoubleArray(100) + readGenericIntArray(25) + readGenericDoubleArray(25) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index a3f32ec3da04..5eba0de1215d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -87,7 +87,7 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { val n = 1024 * 1024 val rows = 15 - val benchmark = new Benchmark("Read primitive array", n) + val benchmark = new Benchmark("Write an array in Dataframe", n) val intDF = sparkSession.sparkContext.parallelize(0 until rows, 1) .map(i => Array.tabulate(n)(i => i)).toDF() From 903122463d9ba18cc7e1dc3e5185c10a192f1323 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 16:08:54 +0900 Subject: [PATCH 68/75] fix scala style error --- .../sql/execution/benchmark/GenericArrayDataBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 38bdbf9e448e..7bbf01277cac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala 
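Before the one-character style fix below, it is worth sketching the harness these benchmark patches keep reshaping: each case is a closure over an iteration count, registered and run against a shared element count. A condensed sketch using the same Benchmark API the hunks above call (the element count and iters are illustrative, as in the real suite they are tuned per case):

    def allocateGenericIntArray(iters: Int): Unit = {
      val count = 1024 * 1024
      val primitiveIntArray = new Array[Int](count)
      val anyArray = primitiveIntArray.toArray[Any]
      var array: GenericArrayData = null

      val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters)
      benchmark.addCase("Generic") { _ =>
        var n = 0
        while (n < iters) { array = new GenericArrayData(anyArray); n += 1 }
      }
      benchmark.addCase("Specialized") { _ =>
        var n = 0
        while (n < iters) { array = new GenericArrayData(primitiveIntArray); n += 1 }
      }
      benchmark.run
    }

The Generic case boxes every element up front via toArray[Any]; the Specialized case hands the primitive array to the new constructor overload, which is where the roughly 3-4x allocation speedups reported in patch 70 come from.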
@@ -185,7 +185,7 @@ class GenericArrayDataBenchmark extends BenchmarkBase { } def readGenericDoubleArray(iters: Int): Unit = { - val count = 1024 * 1024 *8 + val count = 1024 * 1024 * 8 var result: Double = 0 val anyArray = new GenericArrayData(new Array[Double](count).toArray[Any]) From ecbc32e9464d40d44042f9932f5c13131c3a7961 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 17:53:20 +0900 Subject: [PATCH 69/75] fix test failure in OrcQuerySuite --- .../catalyst/expressions/codegen/GenerateUnsafeProjection.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 0a802046c2f7..0854990f5ef8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -292,7 +292,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ${writeArrayToBuffer(ctx, keys, keyType, false, bufferHolder)} // Write the numBytes of key array into the first 8 bytes. - Platform.putInt($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); + Platform.putLong($bufferHolder.buffer, $tmpCursor - 8, $bufferHolder.cursor - $tmpCursor); ${writeArrayToBuffer(ctx, values, valueType, true, bufferHolder)} } From 503dbdeaa66953639d733857983f4a48c28111a0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Oct 2016 21:51:17 +0900 Subject: [PATCH 70/75] update benchmark results --- .../benchmark/GenericArrayDataBenchmark.scala | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala index 7bbf01277cac..7303e4ea2f07 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala @@ -58,6 +58,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericIntArray) benchmark.addCase("Specialized")(specializedIntArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for int: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 0 / 0 46500044.3 0.0 1.0X + Specialized 0 / 0 170500162.6 0.0 3.7X + */ } def allocateGenericDoubleArray(iters: Int): Unit = { @@ -86,6 +94,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase { benchmark.addCase("Generic ")(genericDoubleArray) benchmark.addCase("Specialized")(specializedDoubleArray) benchmark.run + /* + OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 + Intel Xeon E3-12xx v2 (Ivy Bridge) + Allocate GenericArrayData for double: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Generic 0 / 0 55627374.0 0.0 1.0X + Specialized 0 / 0 177724745.8 0.0 3.2X + */ } def getPrimitiveIntArray(iters: 
From 503dbdeaa66953639d733857983f4a48c28111a0 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Sat, 1 Oct 2016 21:51:17 +0900
Subject: [PATCH 70/75] update benchmark results

---
 .../benchmark/GenericArrayDataBenchmark.scala | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
index 7bbf01277cac..7303e4ea2f07 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
@@ -58,6 +58,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     benchmark.addCase("Generic    ")(genericIntArray)
     benchmark.addCase("Specialized")(specializedIntArray)
     benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Allocate GenericArrayData for int:       Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                          0 /    0   46500044.3           0.0       1.0X
+    Specialized                                      0 /    0  170500162.6           0.0       3.7X
+    */
   }
 
   def allocateGenericDoubleArray(iters: Int): Unit = {
@@ -86,6 +94,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     benchmark.addCase("Generic    ")(genericDoubleArray)
     benchmark.addCase("Specialized")(specializedDoubleArray)
     benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Allocate GenericArrayData for double:    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                          0 /    0   55627374.0           0.0       1.0X
+    Specialized                                      0 /    0  177724745.8           0.0       3.2X
+    */
   }
 
   def getPrimitiveIntArray(iters: Int): Unit = {
@@ -113,6 +129,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     benchmark.addCase("Generic")(genericIntArray)
     benchmark.addCase("Specialized")(specializedIntArray)
     benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Get int primitive array:                 Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                        334 /  382        502.4           2.0       1.0X
+    Specialized                                    282 /  314        595.4           1.7       1.2X
+    */
   }
 
   def getPrimitiveDoubleArray(iters: Int): Unit = {
@@ -140,6 +164,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     benchmark.addCase("Generic")(genericDoubleArray)
     benchmark.addCase("Specialized")(specializedDoubleArray)
     benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Get double primitive array:              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                       1720 / 1883         97.6          10.3       1.0X
+    Specialized                                    703 / 1117        238.7           4.2       2.4X
+    */
   }
 
   def readGenericIntArray(iters: Int): Unit = {
@@ -182,6 +214,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     benchmark.addCase("Generic")(genericIntArray)
     benchmark.addCase("Specialized")(specializedIntArray)
     benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Read GenericArrayData Int:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                        206 /  212       1017.6           1.0       1.0X
+    Specialized                                    161 /  167       1301.0           0.8       1.3X
+    */
   }
 
   def readGenericDoubleArray(iters: Int): Unit = {
@@ -224,6 +264,14 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     benchmark.addCase("Generic")(genericDoubleArray)
     benchmark.addCase("Specialized")(specializedDoubleArray)
     benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Read GenericArrayData Double:            Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                        547 /  581        383.3           2.6       1.0X
+    Specialized                                    237 /  260        884.0           1.1       2.3X
+    */
   }
 
   ignore("allocate GenericArrayData") {
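For readers skimming the Generic vs Specialized rows: the two cases differ only in how the backing array is stored, boxed elements in an Object[] versus the primitive array kept as-is. A toy model of that distinction (the class names here are illustrative, not the patch's classes):

object BoxingSketch {
  // "Generic": elements live as boxed java.lang.Integer values in an Object[].
  final class BoxedArrayData(values: Array[Any]) {
    def numElements: Int = values.length
    def getInt(ordinal: Int): Int = values(ordinal).asInstanceOf[Int] // unboxes on every read
  }

  // "Specialized": the primitive array is stored directly, so reads never box.
  final class IntArrayData(values: Array[Int]) {
    def numElements: Int = values.length
    def getInt(ordinal: Int): Int = values(ordinal)
  }

  def main(args: Array[String]): Unit = {
    val prim = Array.tabulate(1024)(identity)
    val generic = new BoxedArrayData(prim.toArray[Any]) // allocates 1024 Integer boxes up front
    val special = new IntArrayData(prim)                // wraps the int[] with no extra allocation
    assert(generic.getInt(7) == special.getInt(7))
  }
}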
From c7ed68f0ea20bd6c9cfeeeec9e8292ac617f56d7 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 9 Nov 2016 02:07:53 +0900
Subject: [PATCH 71/75] fix compilation error

---
 .../sql/catalyst/util/GenericArrayData.scala | 23 +------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
index 64eed3d65b07..34236ad6f4b4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
@@ -23,27 +23,6 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 
-object GenericArrayData {
-  def allocate(array: Array[Any]): GenericRefArrayData = new GenericRefArrayData(array)
-  def allocate(seq: Seq[Any]): GenericRefArrayData = new GenericRefArrayData(seq)
-  def allocate(list: java.util.List[Any]): GenericRefArrayData = new GenericRefArrayData(list)
-  def allocate(seqOrArray: Any): GenericRefArrayData = new GenericRefArrayData(seqOrArray)
-  def allocate(primitiveArray: Array[Int]): GenericIntArrayData =
-    new GenericIntArrayData(primitiveArray)
-  def allocate(primitiveArray: Array[Long]): GenericLongArrayData =
-    new GenericLongArrayData(primitiveArray)
-  def allocate(primitiveArray: Array[Float]): GenericFloatArrayData =
-    new GenericFloatArrayData(primitiveArray)
-  def allocate(primitiveArray: Array[Double]): GenericDoubleArrayData =
-    new GenericDoubleArrayData(primitiveArray)
-  def allocate(primitiveArray: Array[Short]): GenericShortArrayData =
-    new GenericShortArrayData(primitiveArray)
-  def allocate(primitiveArray: Array[Byte]): GenericByteArrayData =
-    new GenericByteArrayData(primitiveArray)
-  def allocate(primitiveArray: Array[Boolean]): GenericBooleanArrayData =
-    new GenericBooleanArrayData(primitiveArray)
-}
-
 private object GenericArrayData {
   // SPARK-16634: Workaround for JVM bug present in some 1.7 versions.
@@ -80,7 +59,7 @@ class GenericArrayData(val array: Array[Any],
 
   def this(array: Array[Any]) = this(array, null, null, null, null, null, null, null)
 
-  def this(seqOrArray: Any) = this(GenericRefArrayData.anyToSeq(seqOrArray))
+  def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray))
 
   override def copy(): ArrayData = {
     if (booleanArray != null) new GenericArrayData(booleanArray.clone())
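The compilation error being fixed: the file had ended up with both the public `object GenericArrayData` holding the allocate factories and the pre-existing `private object GenericArrayData` helper, and Scala rejects two objects of the same name in one scope. A minimal sketch of the rule and of one working shape (stand-in method bodies, not the real implementations):

// Two objects with one name in the same scope do not compile:
//
//   object GenericArrayData { def allocate(seq: Seq[Any]) = ??? }
//   private object GenericArrayData { def anyToSeq(x: Any) = ??? }
//   // error: GenericArrayData is already defined as object GenericArrayData
//
// A single object can hold both the public factories and the private helper
// (the patch instead drops the factory object entirely):
object GenericArrayDataSketch {
  def allocate(seq: Seq[Any]): Seq[Any] = helper(seq)  // public factory stand-in
  private def helper(x: Any): Seq[Any] = x match {     // private helper stand-in
    case s: Seq[_] => s.asInstanceOf[Seq[Any]]
    case other     => Seq(other)
  }
}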
From c82fbf3d9e5668edb57886909670142dc3e9872d Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 9 Nov 2016 05:33:20 +0900
Subject: [PATCH 72/75] add another use case from #13909

---
 .../expressions/complexTypeCreator.scala      | 38 ++++++++++++----
 .../benchmark/PrimitiveArrayBenchmark.scala   | 45 +++++++++----------
 2 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index c9f36649ec8e..e1150506cc1b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -60,25 +60,47 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
     val values = ctx.freshName("values")
     ctx.addMutableState("Object[]", values, s"this.$values = null;")
 
-    ev.copy(code = s"""
+    val ArrayType(dt, _) = dataType
+    val isPrimitive = ctx.isPrimitiveType(dt) && false
+    val evals = children.map(e => e.genCode(ctx))
+    val allNonNull = evals.forall(_.isNull == "false")
+    if (!isPrimitive || !allNonNull) {
+      ev.copy(code = s"""
       final boolean ${ev.isNull} = false;
       this.$values = new Object[${children.size}];""" +
-      ctx.splitExpressions(
-        ctx.INPUT_ROW,
-        children.zipWithIndex.map { case (e, i) =>
-          val eval = e.genCode(ctx)
-          eval.code + s"""
+        ctx.splitExpressions(
+          ctx.INPUT_ROW,
+          children.zipWithIndex.map { case (e, i) =>
+            val eval = e.genCode(ctx)
+            eval.code +
+              s"""
             if (${eval.isNull}) {
               $values[$i] = null;
             } else {
              $values[$i] = ${eval.value};
             }
           """
-        }) +
-      s"""
+          }) +
+        s"""
         final ArrayData ${ev.value} = new $arrayClass($values);
         this.$values = null;
       """)
+    } else {
+      val javaDataType = ctx.javaType(dt)
+      ctx.addMutableState(s"${javaDataType}[]", values,
+        s"this.$values = new ${javaDataType}[${children.size}];")
+      ev.copy(code =
+        ctx.splitExpressions(
+          ctx.INPUT_ROW,
+          evals.zipWithIndex.map { case (eval, i) =>
+            eval.code +
+              s"\n$values[$i] = ${eval.value};"
+          }) +
+          s"""
+        final ArrayData ${ev.value} = new $arrayClass($values);
+      """,
+        isNull = "false")
+    }
   }
 
   override def prettyName: String = "array"

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala
index 5eba0de1215d..11792a16fb2f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala
@@ -80,43 +80,38 @@ class PrimitiveArrayBenchmark extends BenchmarkBase {
     writeDatasetArray(4)
   }
 
-  def writeArray(iters: Int): Unit = {
+  def readDataFrameArray(iters: Int): Unit = {
     import sparkSession.implicits._
+    val n = 1500
+    val rows = 3
 
-    val iters = 5
-    val n = 1024 * 1024
-    val rows = 15
+    val intStatement = (0 to n - 1).map(i => s"value + $i").mkString("Array(", ",", ")")
+    val ints = Array.tabulate(rows)(i => i)
+    val intDF = sparkSession.sparkContext.parallelize(ints, 1).toDF
 
-    val benchmark = new Benchmark("Write an array in Dataframe", n)
+    val doubleStatement = (0 to n - 1).map(i => s"value + $i.0d").mkString("Array(", ",", ")")
+    val doubles = Array.tabulate(rows)(i => i.toDouble)
+    val doubleDF = sparkSession.sparkContext.parallelize(doubles, 1).toDF
 
-    val intDF = sparkSession.sparkContext.parallelize(0 until rows, 1)
-      .map(i => Array.tabulate(n)(i => i)).toDF()
-    intDF.count() // force to create df
-
-    benchmark.addCase(s"Write int array in DataFrame", numIters = iters)(iter => {
-      intDF.selectExpr("value as a").collect
+    val benchmark = new Benchmark("Read a primitive array in DataFrame", n * iters)
+    benchmark.addCase(s"Int   ", numIters = iters)(iter => {
+      intDF.selectExpr(intStatement).queryExecution.toRdd.collect.length
     })
-
-    val doubleDF = sparkSession.sparkContext.parallelize(0 until rows, 1)
-      .map(i => Array.tabulate(n)(i => i.toDouble)).toDF()
-    doubleDF.count() // force to create df
-
-    benchmark.addCase(s"Write double array in DataFrame", numIters = iters)(iter => {
-      doubleDF.selectExpr("value as a").collect
+    benchmark.addCase(s"Double", numIters = iters)(iter => {
+      doubleDF.selectExpr(doubleStatement).queryExecution.toRdd.collect.length
     })
-
-    benchmark.run()
+    benchmark.run
 
     /*
     OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
     Intel Xeon E3-12xx v2 (Ivy Bridge)
-    Read primitive array:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    Read a primitive array in DataFrame:     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Write int array in DataFrame                  1290 / 1748          0.8        1230.1       1.0X
-    Write double array in DataFrame               1761 / 2236          0.6        1679.0       0.7X
+    Int                                            241 /  340          0.0       32140.2       1.0X
+    Double                                         212 /  220          0.0       28319.2       1.1X
     */
   }
 
-  ignore("Write an array in DataFrame") {
-    writeArray(1)
+  test("Read an array in DataFrame") {
+    readDataFrameArray(5)
   }
 }
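The new branch in CreateArray.doGenCode chooses between an Object[] with per-element null checks and a primitive backing array, taking the latter only when the element type is primitive and every child is provably non-null. (Note the `&& false` guard in this commit, which keeps the specialized path switched off.) A standalone sketch of just that decision, with illustrative names, not Spark's codegen types:

object CreateArrayPathSketch {
  sealed trait Path
  case object ObjectArrayPath    extends Path // Object[] plus per-element null checks
  case object PrimitiveArrayPath extends Path // e.g. int[] with no null checks

  final case class ElementInfo(isPrimitive: Boolean, nullable: Boolean)

  // Mirrors the isPrimitive / allNonNull test in the patch.
  def choosePath(children: Seq[ElementInfo]): Path = {
    val allPrimitive = children.forall(_.isPrimitive)
    val allNonNull   = children.forall(!_.nullable)
    if (allPrimitive && allNonNull) PrimitiveArrayPath else ObjectArrayPath
  }

  def main(args: Array[String]): Unit = {
    println(choosePath(Seq(ElementInfo(isPrimitive = true, nullable = false)))) // PrimitiveArrayPath
    println(choosePath(Seq(ElementInfo(isPrimitive = true, nullable = true))))  // ObjectArrayPath
  }
}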
From 6bf54ec5e227689d69f6db991e9ecbc54e153d0a Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 9 Nov 2016 05:46:29 +0900
Subject: [PATCH 73/75] update benchmark results

---
 .../benchmark/GenericArrayDataBenchmark.scala | 95 ++-----------------
 1 file changed, 9 insertions(+), 86 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
index 7303e4ea2f07..f253251d0522 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
@@ -32,79 +32,7 @@ import org.apache.spark.util.Benchmark
  */
 class GenericArrayDataBenchmark extends BenchmarkBase {
 
-  def allocateGenericIntArray(iters: Int): Unit = {
-    val count = 1024 * 1024
-    var array: GenericArrayData = null
-
-    val primitiveIntArray = new Array[Int](count)
-    val specializedIntArray = { i: Int =>
-      var n = 0
-      while (n < iters) {
-        array = new GenericArrayData(primitiveIntArray)
-        n += 1
-      }
-    }
-    val anyArray = primitiveIntArray.toArray[Any]
-    val genericIntArray = { i: Int =>
-      var n = 0
-      while (n < iters) {
-        array = new GenericArrayData(anyArray)
-        n += 1
-      }
-    }
-
-    val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters,
-      minNumIters = 10, minTime = 1.milliseconds)
-    benchmark.addCase("Generic    ")(genericIntArray)
-    benchmark.addCase("Specialized")(specializedIntArray)
-    benchmark.run
-    /*
-    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
-    Intel Xeon E3-12xx v2 (Ivy Bridge)
-    Allocate GenericArrayData for int:       Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Generic                                          0 /    0   46500044.3           0.0       1.0X
-    Specialized                                      0 /    0  170500162.6           0.0       3.7X
-    */
-  }
-
-  def allocateGenericDoubleArray(iters: Int): Unit = {
-    val count = 1024 * 1024
-    var array: GenericArrayData = null
-
-    val primitiveDoubleArray = new Array[Int](count)
-    val specializedDoubleArray = { i: Int =>
-      var n = 0
-      while (n < iters) {
-        array = new GenericArrayData(primitiveDoubleArray)
-        n += 1
-      }
-    }
-    val anyArray = primitiveDoubleArray.toArray[Any]
-    val genericDoubleArray = { i: Int =>
-      var n = 0
-      while (n < iters) {
-        array = new GenericArrayData(anyArray)
-        n += 1
-      }
-    }
-
-    val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters,
-      minNumIters = 10, minTime = 1.milliseconds)
-    benchmark.addCase("Generic    ")(genericDoubleArray)
-    benchmark.addCase("Specialized")(specializedDoubleArray)
-    benchmark.run
-    /*
-    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
-    Intel Xeon E3-12xx v2 (Ivy Bridge)
-    Allocate GenericArrayData for double:    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Generic                                          0 /    0   55627374.0           0.0       1.0X
-    Specialized                                      0 /    0  177724745.8           0.0       3.2X
-    */
-  }
-
-  def getPrimitiveIntArray(iters: Int): Unit = {
+  def getPrimitiveIntArray(iters: Int): Unit = {
     val count = 1024 * 1024 * 8
 
     val anyArray: GenericArrayData = new GenericArrayData(new Array[Int](count).toArray[Any])
@@ -134,8 +62,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Get int primitive array:                 Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                        334 /  382        502.4           2.0       1.0X
-    Specialized                                    282 /  314        595.4           1.7       1.2X
+    Generic                                        277 /  366        605.0           1.7       1.0X
+    Specialized                                    214 /  251        785.1           1.3       1.3X
     */
   }
 
@@ -169,8 +97,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Get double primitive array:              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                       1720 / 1883         97.6          10.3       1.0X
-    Specialized                                    703 / 1117        238.7           4.2       2.4X
+    Generic                                       1976 / 1991         84.9          11.8       1.0X
+    Specialized                                    589 / 1050        285.1           3.5       3.4X
    */
  }
 
@@ -219,8 +147,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Read GenericArrayData Int:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                        206 /  212       1017.6           1.0       1.0X
-    Specialized                                    161 /  167       1301.0           0.8       1.3X
+    Generic                                        208 /  214       1008.3           1.0       1.0X
+    Specialized                                    142 /  158       1471.7           0.7       1.5X
     */
   }
 
@@ -269,16 +197,11 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Read GenericArrayData Double:            Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                        547 /  581        383.3           2.6       1.0X
-    Specialized                                    237 /  260        884.0           1.1       2.3X
+    Generic                                        621 /  683        337.7           3.0       1.0X
+    Specialized                                    265 /  297        790.4           1.3       2.3X
     */
   }
 
-  ignore("allocate GenericArrayData") {
-    allocateGenericIntArray(20)
-    allocateGenericDoubleArray(20)
-  }
-
   ignore("get primitive array") {
     getPrimitiveIntArray(20)
     getPrimitiveDoubleArray(20)
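How to read the updated tables, as a worked check against the `Get int primitive array` row: the suite drives the harness with getPrimitiveIntArray(20), and assuming the Benchmark is constructed with count * iters values per iteration (the convention visible in the allocate benchmarks above), the printed Rate and Per Row columns follow directly from the best time. A sketch of the arithmetic; the small gap to the printed 605.0 comes from the best time being rounded to whole milliseconds:

object BenchmarkMathSketch {
  def main(args: Array[String]): Unit = {
    val count  = 1024 * 1024 * 8      // elements per case in the suite
    val iters  = 20                   // getPrimitiveIntArray(20) above
    val rows   = count.toLong * iters // 167,772,160 values per iteration
    val bestMs = 277.0                // "Generic" best time in patch 73

    println(f"Rate(M/s)   = ${rows / 1e6 / (bestMs / 1e3)}%.1f") // ~605.7 vs printed 605.0
    println(f"Per Row(ns) = ${bestMs * 1e6 / rows}%.1f")         // ~1.7
  }
}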
From 8bcba32cbefa6acf12cc9762d224658ffdbc4edc Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 9 Nov 2016 17:26:27 +0900
Subject: [PATCH 74/75] Revert "update benchmark results"

This reverts commit 6bf54ec5e227689d69f6db991e9ecbc54e153d0a.

---
 .../benchmark/GenericArrayDataBenchmark.scala | 95 +++++++++++++++++--
 1 file changed, 86 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
index f253251d0522..7303e4ea2f07 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/GenericArrayDataBenchmark.scala
@@ -32,7 +32,79 @@ import org.apache.spark.util.Benchmark
  */
 class GenericArrayDataBenchmark extends BenchmarkBase {
 
-  def getPrimitiveIntArray(iters: Int): Unit = {
+  def allocateGenericIntArray(iters: Int): Unit = {
+    val count = 1024 * 1024
+    var array: GenericArrayData = null
+
+    val primitiveIntArray = new Array[Int](count)
+    val specializedIntArray = { i: Int =>
+      var n = 0
+      while (n < iters) {
+        array = new GenericArrayData(primitiveIntArray)
+        n += 1
+      }
+    }
+    val anyArray = primitiveIntArray.toArray[Any]
+    val genericIntArray = { i: Int =>
+      var n = 0
+      while (n < iters) {
+        array = new GenericArrayData(anyArray)
+        n += 1
+      }
+    }
+
+    val benchmark = new Benchmark("Allocate GenericArrayData for int", count * iters,
+      minNumIters = 10, minTime = 1.milliseconds)
+    benchmark.addCase("Generic    ")(genericIntArray)
+    benchmark.addCase("Specialized")(specializedIntArray)
+    benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Allocate GenericArrayData for int:       Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                          0 /    0   46500044.3           0.0       1.0X
+    Specialized                                      0 /    0  170500162.6           0.0       3.7X
+    */
+  }
+
+  def allocateGenericDoubleArray(iters: Int): Unit = {
+    val count = 1024 * 1024
+    var array: GenericArrayData = null
+
+    val primitiveDoubleArray = new Array[Int](count)
+    val specializedDoubleArray = { i: Int =>
+      var n = 0
+      while (n < iters) {
+        array = new GenericArrayData(primitiveDoubleArray)
+        n += 1
+      }
+    }
+    val anyArray = primitiveDoubleArray.toArray[Any]
+    val genericDoubleArray = { i: Int =>
+      var n = 0
+      while (n < iters) {
+        array = new GenericArrayData(anyArray)
+        n += 1
+      }
+    }
+
+    val benchmark = new Benchmark("Allocate GenericArrayData for double", count * iters,
+      minNumIters = 10, minTime = 1.milliseconds)
+    benchmark.addCase("Generic    ")(genericDoubleArray)
+    benchmark.addCase("Specialized")(specializedDoubleArray)
+    benchmark.run
+    /*
+    OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
+    Intel Xeon E3-12xx v2 (Ivy Bridge)
+    Allocate GenericArrayData for double:    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    ------------------------------------------------------------------------------------------------
+    Generic                                          0 /    0   55627374.0           0.0       1.0X
+    Specialized                                      0 /    0  177724745.8           0.0       3.2X
+    */
+  }
+
+  def getPrimitiveIntArray(iters: Int): Unit = {
     val count = 1024 * 1024 * 8
 
     val anyArray: GenericArrayData = new GenericArrayData(new Array[Int](count).toArray[Any])
@@ -62,8 +134,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Get int primitive array:                 Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                        277 /  366        605.0           1.7       1.0X
-    Specialized                                    214 /  251        785.1           1.3       1.3X
+    Generic                                        334 /  382        502.4           2.0       1.0X
+    Specialized                                    282 /  314        595.4           1.7       1.2X
     */
   }
 
@@ -97,8 +169,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Get double primitive array:              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                       1976 / 1991         84.9          11.8       1.0X
-    Specialized                                    589 / 1050        285.1           3.5       3.4X
+    Generic                                       1720 / 1883         97.6          10.3       1.0X
+    Specialized                                    703 / 1117        238.7           4.2       2.4X
     */
   }
 
@@ -147,8 +219,8 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Read GenericArrayData Int:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                        208 /  214       1008.3           1.0       1.0X
-    Specialized                                    142 /  158       1471.7           0.7       1.5X
+    Generic                                        206 /  212       1017.6           1.0       1.0X
+    Specialized                                    161 /  167       1301.0           0.8       1.3X
     */
   }
 
@@ -197,11 +269,16 @@ class GenericArrayDataBenchmark extends BenchmarkBase {
     Intel Xeon E3-12xx v2 (Ivy Bridge)
     Read GenericArrayData Double:            Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Generic                                        621 /  683        337.7           3.0       1.0X
-    Specialized                                    265 /  297        790.4           1.3       2.3X
+    Generic                                        547 /  581        383.3           2.6       1.0X
+    Specialized                                    237 /  260        884.0           1.1       2.3X
     */
   }
 
+  ignore("allocate GenericArrayData") {
+    allocateGenericIntArray(20)
+    allocateGenericDoubleArray(20)
+  }
+
   ignore("get primitive array") {
     getPrimitiveIntArray(20)
     getPrimitiveDoubleArray(20)

From 7697e5f49c8d05806b26ad360f2d789fd8c707c7 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki
Date: Wed, 9 Nov 2016 17:26:48 +0900
Subject: [PATCH 75/75] Revert "add another use case from #13909"

This reverts commit c82fbf3d9e5668edb57886909670142dc3e9872d.
---
 .../expressions/complexTypeCreator.scala      | 38 ++++------------
 .../benchmark/PrimitiveArrayBenchmark.scala   | 45 ++++++++++---------
 2 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index e1150506cc1b..c9f36649ec8e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -60,47 +60,25 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
     val values = ctx.freshName("values")
     ctx.addMutableState("Object[]", values, s"this.$values = null;")
 
-    val ArrayType(dt, _) = dataType
-    val isPrimitive = ctx.isPrimitiveType(dt) && false
-    val evals = children.map(e => e.genCode(ctx))
-    val allNonNull = evals.forall(_.isNull == "false")
-    if (!isPrimitive || !allNonNull) {
-      ev.copy(code = s"""
+    ev.copy(code = s"""
       final boolean ${ev.isNull} = false;
       this.$values = new Object[${children.size}];""" +
-        ctx.splitExpressions(
-          ctx.INPUT_ROW,
-          children.zipWithIndex.map { case (e, i) =>
-            val eval = e.genCode(ctx)
-            eval.code +
-              s"""
+      ctx.splitExpressions(
+        ctx.INPUT_ROW,
+        children.zipWithIndex.map { case (e, i) =>
+          val eval = e.genCode(ctx)
+          eval.code + s"""
            if (${eval.isNull}) {
              $values[$i] = null;
            } else {
              $values[$i] = ${eval.value};
            }
          """
-          }) +
-        s"""
+        }) +
+      s"""
        final ArrayData ${ev.value} = new $arrayClass($values);
        this.$values = null;
      """)
-    } else {
-      val javaDataType = ctx.javaType(dt)
-      ctx.addMutableState(s"${javaDataType}[]", values,
-        s"this.$values = new ${javaDataType}[${children.size}];")
-      ev.copy(code =
-        ctx.splitExpressions(
-          ctx.INPUT_ROW,
-          evals.zipWithIndex.map { case (eval, i) =>
-            eval.code +
-              s"\n$values[$i] = ${eval.value};"
-          }) +
-          s"""
-        final ArrayData ${ev.value} = new $arrayClass($values);
-      """,
-        isNull = "false")
-    }
   }
 
   override def prettyName: String = "array"

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala
index 11792a16fb2f..5eba0de1215d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala
@@ -80,38 +80,43 @@ class PrimitiveArrayBenchmark extends BenchmarkBase {
     writeDatasetArray(4)
   }
 
-  def readDataFrameArray(iters: Int): Unit = {
+  def writeArray(iters: Int): Unit = {
     import sparkSession.implicits._
-    val n = 1500
-    val rows = 3
 
-    val intStatement = (0 to n - 1).map(i => s"value + $i").mkString("Array(", ",", ")")
-    val ints = Array.tabulate(rows)(i => i)
-    val intDF = sparkSession.sparkContext.parallelize(ints, 1).toDF
+    val iters = 5
+    val n = 1024 * 1024
+    val rows = 15
 
-    val doubleStatement = (0 to n - 1).map(i => s"value + $i.0d").mkString("Array(", ",", ")")
-    val doubles = Array.tabulate(rows)(i => i.toDouble)
-    val doubleDF = sparkSession.sparkContext.parallelize(doubles, 1).toDF
+    val benchmark = new Benchmark("Write an array in Dataframe", n)
 
-    val benchmark = new Benchmark("Read a primitive array in DataFrame", n * iters)
-    benchmark.addCase(s"Int   ", numIters = iters)(iter => {
-      intDF.selectExpr(intStatement).queryExecution.toRdd.collect.length
+    val intDF = sparkSession.sparkContext.parallelize(0 until rows, 1)
+      .map(i => Array.tabulate(n)(i => i)).toDF()
+    intDF.count() // force to create df
+
+    benchmark.addCase(s"Write int array in DataFrame", numIters = iters)(iter => {
+      intDF.selectExpr("value as a").collect
     })
-    benchmark.addCase(s"Double", numIters = iters)(iter => {
-      doubleDF.selectExpr(doubleStatement).queryExecution.toRdd.collect.length
+
+    val doubleDF = sparkSession.sparkContext.parallelize(0 until rows, 1)
+      .map(i => Array.tabulate(n)(i => i.toDouble)).toDF()
+    doubleDF.count() // force to create df
+
+    benchmark.addCase(s"Write double array in DataFrame", numIters = iters)(iter => {
+      doubleDF.selectExpr("value as a").collect
     })
-    benchmark.run
+
+    benchmark.run()
 
     /*
     OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64
     Intel Xeon E3-12xx v2 (Ivy Bridge)
-    Read a primitive array in DataFrame:     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    Read primitive array:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     ------------------------------------------------------------------------------------------------
-    Int                                            241 /  340          0.0       32140.2       1.0X
-    Double                                         212 /  220          0.0       28319.2       1.1X
+    Write int array in DataFrame                  1290 / 1748          0.8        1230.1       1.0X
+    Write double array in DataFrame               1761 / 2236          0.6        1679.0       0.7X
     */
   }
 
-  test("Read an array in DataFrame") {
-    readDataFrameArray(5)
+  ignore("Write an array in DataFrame") {
+    writeArray(1)
   }
 }
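For completeness, the minimal shape of a driver for the harness used throughout these suites. It assumes this patched Spark test tree, where org.apache.spark.util.Benchmark accepts the minNumIters and minTime arguments seen above; a sketch, not a drop-in suite:

import scala.concurrent.duration._

import org.apache.spark.util.Benchmark

object HarnessSketch {
  def main(args: Array[String]): Unit = {
    val iters = 5
    val n = 1024 * 1024
    // valuesPerIteration drives the Rate(M/s) and Per Row(ns) columns.
    val benchmark = new Benchmark("Example case", n, minNumIters = 10, minTime = 1.milliseconds)
    benchmark.addCase("noop", numIters = iters) { _ =>
      // code under measurement goes here
    }
    benchmark.run()
  }
}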