Commit 156d8d9

xingchaozh authored and GitHub Enterprise committed
[CARMEL-6151] Exchange Push Down through Aggregate (#1053)
* [CARMEL-6151] Exchange Push Down through Aggregate
* fix ut
* fix ut
1 parent 53f3368 commit 156d8d9

File tree

5 files changed: +346 −2 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 0 deletions
@@ -1303,6 +1303,14 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val EXCHANGE_PUSH_DOWN_THROUGH_AGGREGATE_ENABLED =
+    buildConf("spark.sql.exchangePushDownThroughAggregate.enabled")
+      .internal()
+      .doc("When true, we will try to push down exchange through aggregate.")
+      .version("3.0.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val AUTO_APPLY_STAGE_FALLBACK_PLAN_ENABLED =
     buildConf("spark.sql.applyStageFallbackPlan.enabled")
       .internal()
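
The new entry is internal and defaults to false, so the rule below is opt-in. A minimal sketch of enabling it for a session (illustrative only, not part of this commit):

// Illustrative only: opt in to the new rule for the current session.
// The key and default come from the SQLConf entry added above.
spark.conf.set("spark.sql.exchangePushDownThroughAggregate.enabled", "true")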

sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala

Lines changed: 2 additions & 2 deletions
@@ -37,8 +37,7 @@ import org.apache.spark.sql.execution.QueryExecution.skipAuthTag
 import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, EnsureRepartitionForWriting, InsertAdaptiveSparkPlan}
 import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan
 import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters
-import org.apache.spark.sql.execution.exchange.EliminateShuffleExec
-import org.apache.spark.sql.execution.exchange.EnsureRequirements
+import org.apache.spark.sql.execution.exchange.{EliminateShuffleExec, EnsureRequirements, ExchangePushDownThroughAggregate}
 import org.apache.spark.sql.execution.reuse.ReuseExchangeAndSubquery
 import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata}
 import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
@@ -366,6 +365,7 @@ object QueryExecution {
     PlanSubqueries(sparkSession),
     EliminateHintPlaceHolder,
     EnsureRequirements,
+    ExchangePushDownThroughAggregate,
     // `RemoveRedundantSorts` needs to be added after `EnsureRequirements` to guarantee the same
     // number of partitions when instantiating PartitioningCollection.
     RemoveRedundantSorts,

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ case class AdaptiveSparkPlanExec(
   private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq(
     eliminateHintPlaceHolder,
     ensureRequirements,
+    ExchangePushDownThroughAggregate,
     removeRedundantSorts,
     RemoveRedundantPartialAggregates,
     EnsureRepartitionForWriting,
Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.exchange

import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial}
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, HashClusteredDistribution, HashPartitioning}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.aggregate.{BaseAggregateExec, HashAggregateExec}
import org.apache.spark.sql.internal.SQLConf

/**
 * An Exchange can be pushed down through an aggregate to reduce the shuffle count or
 * improve the parallelism of the aggregate.
 */
object ExchangePushDownThroughAggregate extends Rule[SparkPlan] {
  private def isPushDownSupport(agg: BaseAggregateExec,
      partialAgg: BaseAggregateExec,
      shuffle: ShuffleExchangeExec): Boolean = {
    val validFinalAggregate = agg.aggregateExpressions.forall(_.mode == Final)
    val validPartialAggregate = partialAgg.aggregateExpressions.forall(_.mode == Partial)

    val ensureRequirement = agg.requiredChildDistribution
      .forall(shuffle.outputPartitioning.satisfies)

    val containsAll = shuffle.outputPartitioning match {
      case h: HashPartitioning => h.expressions.forall {
        e => agg.groupingExpressions.find(_.semanticEquals(e)).nonEmpty
      }
      case _ => false
    }

    validFinalAggregate && validPartialAggregate && ensureRequirement && containsAll
  }

  private def exchangePushDown(plan: SparkPlan): SparkPlan = plan transform {
    /* Aggregate on bucket table
     *
     * Shuffle(j)                           HashAggregate(i, ..., Final)
     *   |                                    |
     * HashAggregate(i, ..., Final)         HashAggregate(i, ..., Partial)
     *   |                           ->       |
     * HashAggregate(i, ..., Partial)       Shuffle(j)
     *   |                                    |
     * Project                              Project
     *   |                                    |
     * Filter                               Filter
     *   |                                    |
     * Scan(t1: i, j)                       Scan(t1: i, j)
     *   -- bucket on i
     */
    case shuffle @ ShuffleExchangeExec(_: HashPartitioning,
        agg @ HashAggregateExec(_, _, _, _, _, _,
          partialAgg @ HashAggregateExec(_, _, _, _, _, _,
            ProjectExec(_, FilterExec(_, scan: FileSourceScanExec)))), _)
        if isPushDownSupport(agg, partialAgg, shuffle) && scan.relation.bucketSpec.nonEmpty =>
      val newShuffle = shuffle.withNewChildren(partialAgg.children)
      newShuffle.addOptimizeTag(s"created by ${this.simpleRuleName}")

      partialAgg.logicalLink.foreach(newShuffle.setLogicalLink)
      val newPartialAgg = partialAgg.withNewChildren(newShuffle :: Nil)
      agg.withNewChildren(newPartialAgg :: Nil)

    case shuffle @ ShuffleExchangeExec(_: HashPartitioning,
        agg @ HashAggregateExec(_, _, _, _, _, _,
          partialAgg @ HashAggregateExec(_, _, _, _, _, _, scan: FileSourceScanExec)), _)
        if isPushDownSupport(agg, partialAgg, shuffle) && scan.relation.bucketSpec.nonEmpty =>
      val newShuffle = shuffle.withNewChildren(partialAgg.children)
      newShuffle.addOptimizeTag(s"created by ${this.simpleRuleName}")

      partialAgg.logicalLink.foreach(newShuffle.setLogicalLink)
      val newPartialAgg = partialAgg.withNewChildren(newShuffle :: Nil)
      agg.withNewChildren(newPartialAgg :: Nil)

    /* Aggregate on non-bucket table
     *
     * Shuffle(j)
     *   |
     * HashAggregate(i, j, Final)           HashAggregate(i, j, Final)
     *   |                                    |
     * Shuffle(i, j)                 ->     HashAggregate(i, j, Partial)
     *   |                                    |
     * HashAggregate(i, j, Partial)         Shuffle(j)
     *   |                                    |
     * Project                              Project
     *   |                                    |
     * Filter                               Filter
     *   |                                    |
     * Scan(t1: i, j)                       Scan(t1: i, j)
     */
    case shuffle @ ShuffleExchangeExec(_: HashPartitioning,
        agg @ HashAggregateExec(_, _, _, _, _, _, ShuffleExchangeExec(_: HashPartitioning,
          partialAgg @ HashAggregateExec(_, _, _, _, _, _,
            ProjectExec(_, FilterExec(_, _: FileSourceScanExec))), _)), _)
        if isPushDownSupport(agg, partialAgg, shuffle) =>
      val newShuffle = shuffle.withNewChildren(partialAgg.children)
      newShuffle.addOptimizeTag(s"created by ${this.simpleRuleName}")

      partialAgg.logicalLink.foreach(newShuffle.setLogicalLink)
      val newPartialAgg = partialAgg.withNewChildren(newShuffle :: Nil)
      agg.withNewChildren(newPartialAgg :: Nil)

    case shuffle @ ShuffleExchangeExec(_: HashPartitioning,
        agg @ HashAggregateExec(_, _, _, _, _, _, ShuffleExchangeExec(_: HashPartitioning,
          partialAgg @ HashAggregateExec(_, _, _, _, _, _, _: FileSourceScanExec), _)), _)
        if isPushDownSupport(agg, partialAgg, shuffle) =>
      val newShuffle = shuffle.withNewChildren(partialAgg.children)
      newShuffle.addOptimizeTag(s"created by ${this.simpleRuleName}")

      partialAgg.logicalLink.foreach(newShuffle.setLogicalLink)
      val newPartialAgg = partialAgg.withNewChildren(newShuffle :: Nil)
      agg.withNewChildren(newPartialAgg :: Nil)
  }

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.getConf(SQLConf.EXCHANGE_PUSH_DOWN_THROUGH_AGGREGATE_ENABLED)) {
      plan
    } else {
      val newPlan = exchangePushDown(plan)
      newPlan
    }
  }
}
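
The class comment and diagrams above describe the rewrite at the plan level. As a rough sketch of the kind of query that hits the non-bucket pattern (illustrative, not part of this commit, and assuming the normal_table1/normal_table2 tables created by the test suite below), one could run:

import org.apache.spark.sql.internal.SQLConf

// Illustrative only; mirrors the "basic test for normal table" case in the suite below.
// Assumes an active SparkSession `spark` and the two tables created as in that test.
spark.conf.set(SQLConf.EXCHANGE_PUSH_DOWN_THROUGH_AGGREGATE_ENABLED.key, "true")
spark.conf.set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "1") // force a shuffle join, as the suite does

val df = spark.sql(
  """
    |select l, i, j, k from
    |normal_table2 t0 left join (select distinct i, j, k from normal_table1) t1
    |on t0.l = t1.j
    |""".stripMargin)

// Matches the non-bucket diagram above: the exchange on the join key j is pushed below the
// partial HashAggregate and the exchange on (i, j) goes away, so the executed plan keeps
// two shuffles instead of three (the suite asserts 2 vs. 3 for this query shape).
df.explain()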
Lines changed: 196 additions & 0 deletions
@@ -0,0 +1,196 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite}
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}

class ExchangePushDownThroughAggregateWithoutHiveSupportSuite
  extends ExchangePushDownThroughAggregateSuite
  with SharedSparkSession
  with DisableAdaptiveExecutionSuite {

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory")
  }
}

class ExchangePushDownThroughAggregateWithoutHiveSupportSuiteAE
  extends ExchangePushDownThroughAggregateSuite
  with SharedSparkSession
  with EnableAdaptiveExecutionSuite {

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory")
  }
}

abstract class ExchangePushDownThroughAggregateSuite extends QueryTest
  with SQLTestUtils with AdaptiveSparkPlanHelper {

  // protected override def beforeAll(): Unit = {
  //   super.beforeAll()
  //   assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory")
  // }

  import testImplicits._

  private lazy val df1 =
    (0 until 50).map(i => (i % 2, i % 4, i % 8)).toDF("i", "j", "k").as("df1")
  private lazy val df2 =
    (0 until 50).map(i => (i % 3, i % 6, i % 9)).toDF("i", "j", "k").as("df2")
  private lazy val df3 =
    (0 until 50).map(i => (i % 3, i % 6, i % 9)).toDF("l", "m", "n").as("df3")

  private def checkExchangePushDown(query: String,
      shuffleCountIfEnable: Int,
      shuffleCountIfDisable: Int): Unit = {

    def checkExchangePushDownResult(query: String, enabled: Boolean,
        expectedNumShuffle: Int): DataFrame = {
      val df = sql(query)
      df.collect()

      val plan = df.queryExecution.executedPlan

      // scalastyle:off println
      println(s"query: ${query},\nenabled: ${enabled},\nplan: ${plan}")
      // scalastyle:on println

      val shuffles = collect(plan) { case s: ShuffleExchangeExec => s }
      assert(shuffles.length == expectedNumShuffle)
      df
    }

    withSQLConf(SQLConf.EXCHANGE_PUSH_DOWN_THROUGH_AGGREGATE_ENABLED.key -> "true") {
      val result = checkExchangePushDownResult(query, true, shuffleCountIfEnable)

      withSQLConf(SQLConf.EXCHANGE_PUSH_DOWN_THROUGH_AGGREGATE_ENABLED.key -> "false") {
        val result2 = checkExchangePushDownResult(query, false, shuffleCountIfDisable)
        checkAnswer(result, result2)
      }
    }
  }

  test("Exchange push down through aggregate - basic test") {
    withTable("bucket_table1", "normal_table1", "normal_table2") {
      df1.write.format("parquet").bucketBy(8, "i").
        saveAsTable("bucket_table1")
      df2.write.format("parquet").saveAsTable("normal_table1")
      df3.write.format("parquet").saveAsTable("normal_table2")
      // df1.write.format("parquet").saveAsTable("t3")

      Seq(
        // (
        //   """
        //     |select * from (select i, j, k from
        //     |(select distinct i, j, k from
        //     |(select i, j, k from bucket_table1 cluster by j)
        //     | t2 ) t1 cluster by j) t0 order by i, j, k
        //     |""".stripMargin, 1, 2),

        // (
        //   """
        //     |select * from (select i, j, k from
        //     |(select distinct i, j, k from bucket_table1
        //     | t2 ) t1 cluster by j ) t0 order by i, j, k
        //     |""".stripMargin, 1, 1),

        // basic test for bucket table
        (
          """
            |select l, i, j, k from
            |normal_table2 t0 left join (select distinct i, j, k from bucket_table1
            |) t1 on t0.l = t1.j
            |""".stripMargin, 2, 2),
        (
          """
            |select l, i, j, k from
            |normal_table2 t0 left join (
            |select i, j, k, count(*) as c from bucket_table1 group by 1, 2, 3
            |) t1 on t0.l = t1.c
            |""".stripMargin, 2, 2), // No support since clustered on c instead of i, j, k
        // with filter
        (
          """
            |select l, i, j, k from
            |normal_table2 t0 left join (select distinct i, j, k from bucket_table1
            |where i > 0) t1 on t0.l = t1.j
            |""".stripMargin, 2, 2),
        // with one aggregate function
        (
          """
            |select l, i, j, k, mi from
            |normal_table2 t0 left join (select distinct i, j, k, max(i) as mi from bucket_table1
            |where i > 0 group by 1, 2, 3) t1 on t0.l = t1.j
            |""".stripMargin, 2, 2),
        // with 2 aggregate functions
        (
          """
            |select l, i, j, k, mi, cnt from
            |normal_table2 t0 left join
            |(select distinct i, j, k, max(i) as mi, count(1) as cnt from bucket_table1
            |where i > 0 group by 1, 2, 3) t1 on t0.l = t1.j
            |""".stripMargin, 2, 2),

        // basic test for normal table
        (
          """
            |select l, i, j, k from
            |normal_table2 t0 left join (select distinct i, j, k from normal_table1
            |) t1 on t0.l = t1.j
            |""".stripMargin, 2, 3),
        // with filter
        (
          """
            |select l, i, j, k from
            |normal_table2 t0 left join (select distinct i, j, k from normal_table1
            |where i > 0) t1 on t0.l = t1.j
            |""".stripMargin, 2, 3),
        // with one aggregate function
        (
          """
            |select l, i, j, k, mi from
            |normal_table2 t0 left join (select distinct i, j, k, max(i) as mi from normal_table1
            |where i > 0 group by 1, 2, 3) t1 on t0.l = t1.j
            |""".stripMargin, 2, 3),
        // with 2 aggregate functions
        (
          """
            |select l, i, j, k, mi, cnt from
            |normal_table2 t0 left join
            |(select distinct i, j, k, max(i) as mi, count(1) as cnt from normal_table1
            |where i > 0 group by 1, 2, 3) t1 on t0.l = t1.j
            |""".stripMargin, 2, 3)
      ).foreach { case (query, shuffleCountIfEnable, shuffleCountIfDisable) =>
        withSQLConf(SQLConf.AUTO_BUCKETED_SCAN_ENABLED.key -> "true",
            SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1") {
          checkExchangePushDown(query, shuffleCountIfEnable, shuffleCountIfDisable)
        }
      }
    }
  }
}
