Push predicates through Expand

cloud-fan · cloud-fan · commit 30dbdc646153 · 2016-04-19T21:20:43.000+08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -300,6 +300,9 @@ class Analyzer(
           a.toAttribute.withNullability((nonNullBitmask & 1 << idx) == 0)
         }
 
+        val expand = Expand(x.bitmasks, groupByAliases, groupByAttributes, gid, x.child)
+        val finalGroupingAttrs = expand.output.drop(x.child.output.length)
+
         val aggregations: Seq[NamedExpression] = x.aggregations.map { case expr =>
           // collect all the found AggregateExpression, so we can check an expression is part of
           // any AggregateExpression or not.
@@ -321,15 +324,12 @@ class Analyzer(
               if (index == -1) {
                 e
               } else {
-                groupByAttributes(index)
+                finalGroupingAttrs(index)
               }
           }.asInstanceOf[NamedExpression]
         }
 
-        Aggregate(
-          groupByAttributes :+ gid,
-          aggregations,
-          Expand(x.bitmasks, groupByAliases, groupByAttributes, gid, x.child))
+        Aggregate(finalGroupingAttrs, aggregations, expand)
 
       case f @ Filter(cond, child) if hasGroupingFunction(cond) =>
         val groupingExprs = findGroupingExprs(child)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1019,8 +1019,6 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper {
     case filter @ Filter(_, f: Filter) => filter
     // should not push predicates through sample, or will generate different results.
     case filter @ Filter(_, s: Sample) => filter
-    // TODO: push predicates through expand
-    case filter @ Filter(_, e: Expand) => filter
 
     case filter @ Filter(condition, u: UnaryNode) if u.expressions.forall(_.deterministic) =>
       pushDownPredicate(filter, u.child) { predicate =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -516,7 +516,10 @@ private[sql] object Expand {
       // groupingId is the last output, here we use the bit mask as the concrete value for it.
       } :+ Literal.create(bitmask, IntegerType)
     }
-    val output = child.output ++ groupByAttrs :+ gid
+
+    // the `groupByAttrs` has different meaning in `Expand.output`, it could be the original
+    // grouping expression or null, so here we create new instance of it.
+    val output = child.output ++ groupByAttrs.map(_.newInstance) :+ gid
     Expand(projections, output, Project(child.output ++ groupByAliases, child))
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -743,4 +743,19 @@ class FilterPushdownSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
+
+  test("expand") {
+    val agg = testRelation
+      .groupBy(Cube(Seq('a, 'b)))('a, 'b, sum('c))
+      .analyze
+      .asInstanceOf[Aggregate]
+
+    val a = agg.output(0)
+    val b = agg.output(1)
+
+    val query = agg.where(a > 1 && b > 2)
+    val optimized = Optimize.execute(query)
+    val correctedAnswer = agg.copy(child = agg.child.where(a > 1 && b > 2)).analyze
+    comparePlans(optimized, correctedAnswer)
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala
@@ -288,8 +288,9 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi
 
   private def isGroupingSet(a: Aggregate, e: Expand, p: Project): Boolean = {
     assert(a.child == e && e.child == p)
-    a.groupingExpressions.forall(_.isInstanceOf[Attribute]) &&
-      sameOutput(e.output, p.child.output ++ a.groupingExpressions.map(_.asInstanceOf[Attribute]))
+    a.groupingExpressions.forall(_.isInstanceOf[Attribute]) && sameOutput(
+      e.output.drop(p.child.output.length),
+      a.groupingExpressions.map(_.asInstanceOf[Attribute]))
   }
 
   private def groupingSetToSQL(
@@ -302,13 +303,10 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi
     val gid = expand.output.last
 
     val numOriginalOutput = project.child.output.length
-    // Assumption: Aggregate's groupingExpressions is composed of
-    // 1) the attributes of aliased group by expressions
-    // 2) gid, which is always the last one
-    val groupByAttributes = agg.groupingExpressions.dropRight(1).map(_.asInstanceOf[Attribute])
     // Assumption: Project's projectList is composed of
     // 1) the original output (Project's child.output),
     // 2) the aliased group by expressions.
+    val groupByAttributes = project.output.drop(numOriginalOutput)
     val groupByExprs = project.projectList.drop(numOriginalOutput).map(_.asInstanceOf[Alias].child)
     val groupingSQL = groupByExprs.map(_.sql).mkString(", ")
 

Original file line number	Diff line number	Diff line change
`@@ -300,6 +300,9 @@ class Analyzer(`
`300`	`300`	`a.toAttribute.withNullability((nonNullBitmask & 1 << idx) == 0)`
`301`	`301`	`}`
`302`	`302`
	`303`	`+ val expand = Expand(x.bitmasks, groupByAliases, groupByAttributes, gid, x.child)`
	`304`	`+ val finalGroupingAttrs = expand.output.drop(x.child.output.length)`
	`305`	`+`
`303`	`306`	`val aggregations: Seq[NamedExpression] = x.aggregations.map { case expr =>`
`304`	`307`	`// collect all the found AggregateExpression, so we can check an expression is part of`
`305`	`308`	`// any AggregateExpression or not.`
`@@ -321,15 +324,12 @@ class Analyzer(`
`321`	`324`	`if (index == -1) {`
`322`	`325`	`e`
`323`	`326`	`} else {`
`324`		`- groupByAttributes(index)`
	`327`	`+ finalGroupingAttrs(index)`
`325`	`328`	`}`
`326`	`329`	`}.asInstanceOf[NamedExpression]`
`327`	`330`	`}`
`328`	`331`
`329`		`- Aggregate(`
`330`		`- groupByAttributes :+ gid,`
`331`		`- aggregations,`
`332`		`- Expand(x.bitmasks, groupByAliases, groupByAttributes, gid, x.child))`
	`332`	`+ Aggregate(finalGroupingAttrs, aggregations, expand)`
`333`	`333`
`334`	`334`	`case f @ Filter(cond, child) if hasGroupingFunction(cond) =>`
`335`	`335`	`val groupingExprs = findGroupingExprs(child)`
Original file line number	Diff line number	Diff line change
`@@ -516,7 +516,10 @@ private[sql] object Expand {`
`516`	`516`	`// groupingId is the last output, here we use the bit mask as the concrete value for it.`
`517`	`517`	`} :+ Literal.create(bitmask, IntegerType)`
`518`	`518`	`}`
`519`		`- val output = child.output ++ groupByAttrs :+ gid`
	`519`	`+`
	`520`	``+ // the `groupByAttrs` has different meaning in `Expand.output`, it could be the original``
	`521`	`+ // grouping expression or null, so here we create new instance of it.`
	`522`	`+ val output = child.output ++ groupByAttrs.map(_.newInstance) :+ gid`
`520`	`523`	`Expand(projections, output, Project(child.output ++ groupByAliases, child))`
`521`	`524`	`}`
`522`	`525`	`}`