Change the default plan for single-distinct aggregation

Davies Liu · Davies Liu · commit 192a04dfa9f1 · 2015-12-01T13:26:31.000-08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst
 private[spark] trait CatalystConf {
   def caseSensitiveAnalysis: Boolean
 
-  protected[spark] def specializeSingleDistinctAggPlanning: Boolean
+  protected[spark] def aggregationPlanning15: Boolean
 }
 
 /**
@@ -32,12 +32,12 @@ object EmptyConf extends CatalystConf {
     throw new UnsupportedOperationException
   }
 
-  protected[spark] override def specializeSingleDistinctAggPlanning: Boolean = {
+  protected[spark] override def aggregationPlanning15: Boolean = {
     throw new UnsupportedOperationException
   }
 }
 
 /** A CatalystConf that can be used for local testing. */
 case class SimpleCatalystConf(caseSensitiveAnalysis: Boolean) extends CatalystConf {
-  protected[spark] override def specializeSingleDistinctAggPlanning: Boolean = true
+  protected[spark] override def aggregationPlanning15: Boolean = false
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DistinctAggregationRewriter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DistinctAggregationRewriter.scala
@@ -123,13 +123,11 @@ case class DistinctAggregationRewriter(conf: CatalystConf) extends Rule[LogicalP
       .filter(_.isDistinct)
       .groupBy(_.aggregateFunction.children.toSet)
 
-    val shouldRewrite = if (conf.specializeSingleDistinctAggPlanning) {
-      // When the flag is set to specialize single distinct agg planning,
-      // we will rely on our Aggregation strategy to handle queries with a single
-      // distinct column.
+    val shouldRewrite = if (conf.aggregationPlanning15) {
+      // Use the same plan as Spark 1.5 (one shuffle) for a single distinct aggregation.
       distinctAggGroups.size > 1
     } else {
-      distinctAggGroups.size >= 1
+      distinctAggGroups.nonEmpty
     }
     if (shouldRewrite) {
       // Create the attributes for the grouping id and the group by clause.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -449,17 +449,13 @@ private[spark] object SQLConf {
     doc = "When true, we could use `datasource`.`path` as table in SQL query"
   )
 
-  val SPECIALIZE_SINGLE_DISTINCT_AGG_PLANNING =
-    booleanConf("spark.sql.specializeSingleDistinctAggPlanning",
-      defaultValue = Some(true),
+  val AGG_PLANNING_15 =
+    booleanConf("spark.sql.aggregationPlanning15",
+      defaultValue = Some(false),
       isPublic = false,
-      doc = "When true, if a query only has a single distinct column and it has " +
-        "grouping expressions, we will use our planner rule to handle this distinct " +
-        "column (other cases are handled by DistinctAggregationRewriter). " +
-        "When false, we will always use DistinctAggregationRewriter to plan " +
-        "aggregation queries with DISTINCT keyword. This is an internal flag that is " +
-        "used to benchmark the performance impact of using DistinctAggregationRewriter to " +
-        "plan aggregation queries with a single distinct column.")
+      doc = "When true, will generate the plan as Spark 1.5 (using one shuffle for a query have " +
+        "single distinct aggregation). When false, will generate more robust plan (using two " +
+        "shuffle)")
 
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
@@ -579,8 +575,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def runSQLOnFile: Boolean = getConf(RUN_SQL_ON_FILES)
 
-  protected[spark] override def specializeSingleDistinctAggPlanning: Boolean =
-    getConf(SPECIALIZE_SINGLE_DISTINCT_AGG_PLANNING)
+  protected[spark] override def aggregationPlanning15: Boolean = getConf(AGG_PLANNING_15)
 
   /** ********************** SQLConf functionality methods ************ */
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
@@ -527,7 +527,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
   test("single distinct column set") {
     Seq(true, false).foreach { specializeSingleDistinctAgg =>
       val conf =
-        (SQLConf.SPECIALIZE_SINGLE_DISTINCT_AGG_PLANNING.key,
+        (SQLConf.AGG_PLANNING_15.key,
           specializeSingleDistinctAgg.toString)
       withSQLConf(conf) {
         // DISTINCT is not meaningful with Max and Min, so we just ignore the DISTINCT keyword.

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst`
`20`	`20`	`private[spark] trait CatalystConf {`
`21`	`21`	`def caseSensitiveAnalysis: Boolean`
`22`	`22`
`23`		`- protected[spark] def specializeSingleDistinctAggPlanning: Boolean`
	`23`	`+ protected[spark] def aggregationPlanning15: Boolean`
`24`	`24`	`}`
`25`	`25`
`26`	`26`	`/**`
`@@ -32,12 +32,12 @@ object EmptyConf extends CatalystConf {`
`32`	`32`	`throw new UnsupportedOperationException`
`33`	`33`	`}`
`34`	`34`
`35`		`- protected[spark] override def specializeSingleDistinctAggPlanning: Boolean = {`
	`35`	`+ protected[spark] override def aggregationPlanning15: Boolean = {`
`36`	`36`	`throw new UnsupportedOperationException`
`37`	`37`	`}`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`/** A CatalystConf that can be used for local testing. */`
`41`	`41`	`case class SimpleCatalystConf(caseSensitiveAnalysis: Boolean) extends CatalystConf {`
`42`		`- protected[spark] override def specializeSingleDistinctAggPlanning: Boolean = true`
	`42`	`+ protected[spark] override def aggregationPlanning15: Boolean = false`
`43`	`43`	`}`