From 0701ccbd28153d060ceae55e7ef532c290472aa4 Mon Sep 17 00:00:00 2001 From: Carmen Kwan Date: Fri, 2 Sep 2022 14:30:08 +0200 Subject: [PATCH 1/3] [SPARK-40315] Add equals() and hashCode() to ArrayBasedMapData --- .../sql/catalyst/util/ArrayBasedMapData.scala | 25 +++++++++++++++ .../util/ArrayBasedMapBuilderSuite.scala | 32 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala index 3768f7a1824f1..bb9fc6248ded9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.util +import scala.util.hashing.MurmurHash3 + import java.util.{Map => JavaMap} /** @@ -35,6 +37,29 @@ class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) exte override def toString: String = { s"keys: $keyArray, values: $valueArray" } + + override def equals(obj: Any): Boolean = { + if (obj == null && this == null) { + return true + } + + if (obj == null || !obj.isInstanceOf[ArrayBasedMapData]) { + return false + } + + val other = obj.asInstanceOf[ArrayBasedMapData] + + keyArray.equals(other.keyArray) && valueArray.equals(other.valueArray) + } + + // Hash this class as a Product of two hashCodes. We don't know the DataType which prevents us + // from getting individual rows for hashing as a Map. + override def hashCode(): Int = { + val seed = MurmurHash3.productSeed + val keyHash = scala.util.hashing.MurmurHash3.mix(seed, keyArray.hashCode()) + val valueHash = scala.util.hashing.MurmurHash3.mix(keyHash, valueArray.hashCode()) + scala.util.hashing.MurmurHash3.finalizeHash(valueHash, 2) + } } object ArrayBasedMapData { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala index 6e07cd5d6415d..4752ba1d6450c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala @@ -142,4 +142,36 @@ class ArrayBasedMapBuilderSuite extends SparkFunSuite with SQLHelper { Map(new GenericArrayData(Seq(1, 1)) -> 3, new GenericArrayData(Seq(2, 2)) -> 2)) } } + + test("simple equal() and hashCode() semantics") { + val dataToAdd: Map[Int, Int] = Map(0 -> -7, 1 -> 3, 10 -> 4, 20 -> 5) + val builder1 = new ArrayBasedMapBuilder(IntegerType, IntegerType) + val builder2 = new ArrayBasedMapBuilder(IntegerType, IntegerType) + val builder3 = new ArrayBasedMapBuilder(IntegerType, IntegerType) + dataToAdd.foreach { case (key, value) => + builder1.put(key, value) + builder2.put(key, value) + // Replace the value by something slightly different in builder3 for one of the keys. + if (key == 20) { + builder3.put(key, value - 1) + } else { + builder3.put(key, value) + } + } + val arrayBasedMapData1 = builder1.build() + val arrayBasedMapData2 = builder2.build() + val arrayBasedMapData3 = builder3.build() + + // We expect two objects to be equal and to have the same hashCode if they have the same + // elements. + assert(arrayBasedMapData1 == arrayBasedMapData2) + assert(arrayBasedMapData1.equals(arrayBasedMapData2)) + assert(arrayBasedMapData1.hashCode() == arrayBasedMapData2.hashCode()) + + // If two objects have different elements, we expect them to not be equal and their hashCode + // to be different. + assert(arrayBasedMapData1 != arrayBasedMapData3) + assert(!arrayBasedMapData1.equals(arrayBasedMapData3)) + assert(arrayBasedMapData1.hashCode() != arrayBasedMapData3.hashCode()) + } } From fa27f2c672931064a5dd8d4804546169c8029d72 Mon Sep 17 00:00:00 2001 From: Carmen Kwan Date: Fri, 2 Sep 2022 14:50:21 +0200 Subject: [PATCH 2/3] Add Spark ticket number to test --- .../spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala index 4752ba1d6450c..4679c5c96afda 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala @@ -143,7 +143,7 @@ class ArrayBasedMapBuilderSuite extends SparkFunSuite with SQLHelper { } } - test("simple equal() and hashCode() semantics") { + test("SPARK-40315: simple equal() and hashCode() semantics") { val dataToAdd: Map[Int, Int] = Map(0 -> -7, 1 -> 3, 10 -> 4, 20 -> 5) val builder1 = new ArrayBasedMapBuilder(IntegerType, IntegerType) val builder2 = new ArrayBasedMapBuilder(IntegerType, IntegerType) From 02a55323aa4640c80c51a39d556ee3d46e48ee52 Mon Sep 17 00:00:00 2001 From: Carmen Kwan Date: Tue, 6 Sep 2022 11:22:25 +0200 Subject: [PATCH 3/3] Fix equals and tests --- .../spark/sql/catalyst/util/ArrayBasedMapData.scala | 13 +++---------- .../apache/spark/sql/catalyst/util/MapData.scala | 5 ++--- .../catalyst/util/ArrayBasedMapBuilderSuite.scala | 4 +--- .../spark/sql/catalyst/util/ComplexDataSuite.scala | 2 -- 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala index bb9fc6248ded9..5e02bc3b0de49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala @@ -39,17 +39,10 @@ class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) exte } override def equals(obj: Any): Boolean = { - if (obj == null && this == null) { - return true + obj match { + case other: ArrayBasedMapData => keyArray == other.keyArray && valueArray == other.valueArray + case _ => false } - - if (obj == null || !obj.isInstanceOf[ArrayBasedMapData]) { - return false - } - - val other = obj.asInstanceOf[ArrayBasedMapData] - - keyArray.equals(other.keyArray) && valueArray.equals(other.valueArray) } // Hash this class as a Product of two hashCodes. We don't know the DataType which prevents us diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala index 94e8824cd18cc..fd34c9f1e657c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.catalyst.util import org.apache.spark.sql.types.DataType /** - * This is an internal data representation for map type in Spark SQL. This should not implement - * `equals` and `hashCode` because the type cannot be used as join keys, grouping keys, or - * in equality tests. See SPARK-9415 and PR#13847 for the discussions. + * This is an internal data representation for map type in Spark SQL. This type cannot be used as + * join keys, grouping keys, or in equality tests. See SPARK-9415 and PR#13847 for the discussions. */ abstract class MapData extends Serializable { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala index 4679c5c96afda..8176984e99366 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilderSuite.scala @@ -164,13 +164,11 @@ class ArrayBasedMapBuilderSuite extends SparkFunSuite with SQLHelper { // We expect two objects to be equal and to have the same hashCode if they have the same // elements. - assert(arrayBasedMapData1 == arrayBasedMapData2) assert(arrayBasedMapData1.equals(arrayBasedMapData2)) assert(arrayBasedMapData1.hashCode() == arrayBasedMapData2.hashCode()) - // If two objects have different elements, we expect them to not be equal and their hashCode + // If two objects have different elements, we expect them not to be equal and their hashCode // to be different. - assert(arrayBasedMapData1 != arrayBasedMapData3) assert(!arrayBasedMapData1.equals(arrayBasedMapData3)) assert(arrayBasedMapData1.hashCode() != arrayBasedMapData3.hashCode()) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala index f921f06537080..c6fb2ea14a021 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala @@ -39,8 +39,6 @@ class ComplexDataSuite extends SparkFunSuite { val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) - assert(testArrayMap1 !== testArrayMap3) - assert(testArrayMap2 !== testArrayMap4) // UnsafeMapData val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))