Commit 36902e1

ala authored and cloud-fan committed
[SPARK-26878] QueryTest.compare() does not handle maps with array keys correctly
## What changes were proposed in this pull request?

The previous strategy for comparing Maps leveraged sorting (key, value) tuples by their _.toString. However, the _.toString representation of an array has nothing to do with its content. If a map has array keys, its (key, value) pairs would be compared with those of other maps essentially at random. This could result in false negatives in tests.

This change first compares keys together to find the matching ones, and then compares the associated values.

## How was this patch tested?

New unit test added.

Closes apache#23789 from ala/compare-map.

Authored-by: Ala Luszczak <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent dcdbd06 commit 36902e1
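
To see why the old strategy misbehaves, here is a minimal standalone Scala sketch (illustrative only, not part of the patch): an array's _.toString reflects object identity, not contents, so sorting map entries by it cannot line up equal array keys.

```scala
// Illustrative only, not part of the commit. Scala arrays inherit
// java.lang.Object.toString, which encodes the object's identity hash,
// not its contents.
object ArrayToStringDemo {
  def main(args: Array[String]): Unit = {
    val a = Array[Byte](0, 1)
    val b = Array[Byte](0, 1)
    println(a.toString)        // e.g. "[B@1b6d3586"
    println(b.toString)        // e.g. "[B@4554617c" -- unrelated to the first
    println(a.sameElements(b)) // true: the contents are in fact equal
    // Sorting the (key, value) pairs of two equal maps by _.toString therefore
    // orders them arbitrarily, so comparing the sorted sequences pairwise can
    // report the maps as different (or equal) essentially at random.
  }
}
```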

File tree

2 files changed: +40 -3 lines changed


sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 37 additions & 0 deletions
@@ -20,6 +20,8 @@ package org.apache.spark.sql
 import java.io.{Externalizable, ObjectInput, ObjectOutput}
 import java.sql.{Date, Timestamp}
 
+import org.scalatest.exceptions.TestFailedException
+
 import org.apache.spark.{SparkException, TaskContext}
 import org.apache.spark.sql.catalyst.ScroogeLikeExample
 import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder}
@@ -67,6 +69,41 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       data: _*)
   }
 
+  test("toDS should compare map with byte array keys correctly") {
+    // Choose the order of arrays in such way, that sorting keys of different maps by _.toString
+    // will not incidentally put equal keys together.
+    val arrays = (1 to 5).map(_ => Array[Byte](0.toByte, 0.toByte)).sortBy(_.toString).toArray
+    arrays(0)(1) = 1.toByte
+    arrays(1)(1) = 2.toByte
+    arrays(2)(1) = 2.toByte
+    arrays(3)(1) = 1.toByte
+
+    val mapA = Map(arrays(0) -> "one", arrays(2) -> "two")
+    val subsetOfA = Map(arrays(0) -> "one")
+    val equalToA = Map(arrays(1) -> "two", arrays(3) -> "one")
+    val notEqualToA1 = Map(arrays(1) -> "two", arrays(3) -> "not one")
+    val notEqualToA2 = Map(arrays(1) -> "two", arrays(4) -> "one")
+
+    // Comparing map with itself
+    checkDataset(Seq(mapA).toDS(), mapA)
+
+    // Comparing map with equivalent map
+    checkDataset(Seq(equalToA).toDS(), mapA)
+    checkDataset(Seq(mapA).toDS(), equalToA)
+
+    // Comparing map with it's subset
+    intercept[TestFailedException](checkDataset(Seq(subsetOfA).toDS(), mapA))
+    intercept[TestFailedException](checkDataset(Seq(mapA).toDS(), subsetOfA))
+
+    // Comparing map with another map differing by single value
+    intercept[TestFailedException](checkDataset(Seq(notEqualToA1).toDS(), mapA))
+    intercept[TestFailedException](checkDataset(Seq(mapA).toDS(), notEqualToA1))
+
+    // Comparing map with another map differing by single key
+    intercept[TestFailedException](checkDataset(Seq(notEqualToA2).toDS(), mapA))
+    intercept[TestFailedException](checkDataset(Seq(mapA).toDS(), notEqualToA2))
+  }
+
   test("toDS with RDD") {
     val ds = sparkContext.makeRDD(Seq("a", "b", "c"), 3).toDS()
     checkDataset(

sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala

Lines changed: 3 additions & 3 deletions
@@ -341,9 +341,9 @@ object QueryTest {
     case (a: Array[_], b: Array[_]) =>
       a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r)}
     case (a: Map[_, _], b: Map[_, _]) =>
-      val entries1 = a.iterator.toSeq.sortBy(_.toString())
-      val entries2 = b.iterator.toSeq.sortBy(_.toString())
-      compare(entries1, entries2)
+      a.size == b.size && a.keys.forall { aKey =>
+        b.keys.find(bKey => compare(aKey, bKey)).exists(bKey => compare(a(aKey), b(bKey)))
+      }
     case (a: Iterable[_], b: Iterable[_]) =>
      a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r)}
     case (a: Product, b: Product) =>
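
As a usage-level illustration of the new strategy, here is a simplified sketch with concrete types (not the actual QueryTest.compare(), which is generic and recurses into arrays, products, and nested maps): each key of one map is matched against a content-equal key of the other map before the associated values are compared.

```scala
// Simplified sketch of the key-matching strategy, specialized to
// Map[Array[Byte], String] for clarity; names and types here are
// illustrative, not part of the committed code.
object MapCompareSketch {
  def compareMaps(a: Map[Array[Byte], String], b: Map[Array[Byte], String]): Boolean =
    a.size == b.size && a.keys.forall { aKey =>
      // Find a key in b with the same contents, then compare the associated values.
      b.keys.find(bKey => aKey.sameElements(bKey)).exists(bKey => a(aKey) == b(bKey))
    }

  def main(args: Array[String]): Unit = {
    val m1 = Map(Array[Byte](0, 1) -> "one", Array[Byte](0, 2) -> "two")
    val m2 = Map(Array[Byte](0, 2) -> "two", Array[Byte](0, 1) -> "one")
    println(compareMaps(m1, m2)) // true: keys are matched by content, not by toString
    println(compareMaps(m1, Map(Array[Byte](0, 1) -> "one"))) // false: sizes differ
  }
}
```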
