address comments

allisonwang-db · allisonwang-db · commit fa6050aa6b72 · 2020-11-17T17:46:28.000-08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1256,7 +1256,7 @@ object SQLConf {
   val REMOVE_REDUNDANT_SORTS_ENABLED = buildConf("spark.sql.execution.removeRedundantSorts")
     .internal()
     .doc("Whether to remove redundant physical sort node")
-    .version("2.4.8")
+    .version("3.1.0")
     .booleanConf
     .createWithDefault(true)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -344,6 +344,8 @@ object QueryExecution {
       PlanSubqueries,
       RemoveRedundantProjects,
       EnsureRequirements,
+      // `RemoveRedundantSorts` needs to be added after `EnsureRequirements` to guarantee the same
+      // number of partitions when instantiating PartitioningCollection.
       RemoveRedundantSorts,
       DisableUnnecessaryBucketedScan,
       ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.columnarRules),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala
@@ -18,7 +18,9 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.sql.{DataFrame, QueryTest}
+import org.apache.spark.sql.catalyst.plans.physical.UnknownPartitioning
 import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite}
+import org.apache.spark.sql.execution.joins.ShuffledJoin
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 
@@ -136,13 +138,11 @@ abstract class RemoveRedundantSortsSuiteBase
     }
   }
 
-  test("shuffled join with different left and right side partition numbers") {
+  test("SPARK-33472: shuffled join with different left and right side partition numbers") {
     withTempView("t1", "t2") {
       spark.range(0, 100, 1, 2).select('id as "key").createOrReplaceTempView("t1")
       (0 to 100).toDF("key").createOrReplaceTempView("t2")
 
-      // left side partitioning: RangePartitioning(key ASC, 2)
-      // right side partitioning: UnknownPartitioning(0)
       val queryTemplate = """
         |SELECT /*+ %s(t1) */ t1.key
         |FROM t1 JOIN t2 ON t1.key = t2.key
@@ -151,7 +151,14 @@ abstract class RemoveRedundantSortsSuiteBase
       """.stripMargin
 
       Seq(("MERGE", 3), ("SHUFFLE_HASH", 1)).foreach { case (hint, count) =>
-        checkSorts(queryTemplate.format(hint), count, count)
+        val query = queryTemplate.format(hint)
+        val df = sql(query)
+        val sparkPlan = df.queryExecution.sparkPlan
+        val join = sparkPlan.collect { case j: ShuffledJoin => j }.head
+        val range = sparkPlan.collect { case r: RangeExec => r }.head
+        assert(join.left.outputPartitioning == range.outputPartitioning)
+        assert(join.right.outputPartitioning == UnknownPartitioning(0))
+        checkSorts(query, count, count)
       }
     }
   }