Handle aliases

Tanel Kiis · Tanel Kiis · commit ef64abfdd891 · 2020-10-16T15:00:11.000+03:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -97,7 +97,7 @@ object Predicate extends CodeGeneratorWithInterpretedFallback[Expression, BasePr
   }
 }
 
-trait PredicateHelper extends Logging {
+trait PredicateHelper extends Logging with AliasHelper {
   protected def splitConjunctivePredicates(condition: Expression): Seq[Expression] = {
     condition match {
       case And(cond1, cond2) =>
@@ -150,18 +150,6 @@ trait PredicateHelper extends Logging {
     }
   }
 
-  // Substitute any known alias from a map.
-  protected def replaceAlias(
-      condition: Expression,
-      aliases: AttributeMap[Expression]): Expression = {
-    // Use transformUp to prevent infinite recursion when the replacement expression
-    // redefines the same ExprId,
-    condition.transformUp {
-      case a: Attribute =>
-        aliases.getOrElse(a, a)
-    }
-  }
-
   /**
    * Returns true if `expr` can be evaluated using only the output of `plan`.  This method
    * can be used to determine when it is acceptable to move expression evaluation within a query
@@ -249,6 +237,41 @@ trait PredicateHelper extends Logging {
   }
 }
 
+/**
+ * Helper methods for collecting and replacing aliases.
+ */
+trait AliasHelper {
+
+  protected def getAliasMap(plan: Project): AttributeMap[Expression] = {
+    // Create a map of Aliases to their values from the child projection.
+    // e.g., 'SELECT a + b AS c, d ...' produces Map(c -> a + b).
+    AttributeMap(plan.projectList.collect { case a: Alias => (a.toAttribute, a.child) })
+  }
+
+  protected def getAliasMap(plan: Aggregate): AttributeMap[Expression] = {
+    // Find all the aliased expressions in the aggregate list that don't include any actual
+    // AggregateExpression or grouped aggregate Pandas UDF, and map each alias to its expression.
+    val aliasMap = plan.aggregateExpressions.collect {
+      case a: Alias if a.child.find(e => e.isInstanceOf[AggregateExpression] ||
+        PythonUDF.isGroupedAggPandasUDF(e)).isEmpty =>
+        (a.toAttribute, a.child)
+    }
+    AttributeMap(aliasMap)
+  }
+
+  // Substitute any known alias from a map.
+  protected def replaceAlias(
+    condition: Expression,
+    aliases: AttributeMap[Expression]): Expression = {
+    // Use transformUp to prevent infinite recursion when the replacement expression
+    // redefines the same ExprId.
+    condition.transformUp {
+      case a: Attribute =>
+        aliases.getOrElse(a, a)
+    }
+  }
+}
+
 @ExpressionDescription(
   usage = "_FUNC_ expr - Logical not.",
   examples = """
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -482,18 +482,51 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
  * Remove redundant aggregates from a query plan. A redundant aggregate is an aggregate whose
  * only goal is to keep distinct values, while its parent aggregate would ignore duplicate values.
  */
-object RemoveRedundantAggregates extends Rule[LogicalPlan] {
+object RemoveRedundantAggregates extends Rule[LogicalPlan] with AliasHelper {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
     case upper @ Aggregate(_, _, lower: Aggregate) if lowerIsRedundant(upper, lower) =>
-      upper.copy(child = lower.child)
+      val aliasMap = getAliasMap(lower)
+      upper.copy(
+        child = lower.child,
+        groupingExpressions = upper.groupingExpressions.map(replaceAlias(_, aliasMap)),
+        aggregateExpressions = upper.aggregateExpressions.map(
+          replaceAliasButKeepOuter(_, aliasMap))
+      )
   }
 
   private def lowerIsRedundant(upper: Aggregate, lower: Aggregate): Boolean = {
-    val upperReferencesOnlyGrouping = upper.references
-      .subsetOf(AttributeSet(lower.groupingExpressions))
+    val isDeterministic = upper.aggregateExpressions.forall(_.deterministic) &&
+      lower.aggregateExpressions.forall(_.deterministic)
+
+    val upperReferencesOnlyGrouping = upper.references.subsetOf(AttributeSet(
+      lower.aggregateExpressions.filter(!isAggregate(_)).map(_.toAttribute)))
+
     val upperHasNoAggregateExpressions = upper.aggregateExpressions
-      .forall(_.find(_.isInstanceOf[AggregateExpression]).isEmpty)
-    upperReferencesOnlyGrouping && upperHasNoAggregateExpressions
+      .forall(_.find(isAggregate).isEmpty)
+
+    isDeterministic && upperReferencesOnlyGrouping && upperHasNoAggregateExpressions
+  }
+
+  private def isAggregate(expr: Expression): Boolean = {
+    expr.find(e => e.isInstanceOf[AggregateExpression] ||
+      PythonUDF.isGroupedAggPandasUDF(e)).isDefined
+  }
+
+  /**
+   * Replace every attribute that references an alias with the aliased expression,
+   * but keep the name of the outermost attribute.
+   */
+  private def replaceAliasButKeepOuter(
+    expr: NamedExpression,
+    aliasMap: AttributeMap[Expression]): NamedExpression = {
+
+    val replaced = expr match {
+      case a: Attribute if aliasMap.contains(a) =>
+        Alias(replaceAlias(a, aliasMap), a.name)(a.exprId, a.qualifier)
+      case _ => replaceAlias(expr, aliasMap)
+    }
+
+    replaced.asInstanceOf[NamedExpression]
   }
 }
 
@@ -1258,23 +1291,6 @@ object PushPredicateThroughNonJoin extends Rule[LogicalPlan] with PredicateHelpe
       }
   }
 
-  def getAliasMap(plan: Project): AttributeMap[Expression] = {
-    // Create a map of Aliases to their values from the child projection.
-    // e.g., 'SELECT a + b AS c, d ...' produces Map(c -> a + b).
-    AttributeMap(plan.projectList.collect { case a: Alias => (a.toAttribute, a.child) })
-  }
-
-  def getAliasMap(plan: Aggregate): AttributeMap[Expression] = {
-    // Find all the aliased expressions in the aggregate list that don't include any actual
-    // AggregateExpression or PythonUDF, and create a map from the alias to the expression
-    val aliasMap = plan.aggregateExpressions.collect {
-      case a: Alias if a.child.find(e => e.isInstanceOf[AggregateExpression] ||
-          PythonUDF.isGroupedAggPandasUDF(e)).isEmpty =>
-        (a.toAttribute, a.child)
-    }
-    AttributeMap(aliasMap)
-  }
-
   def canPushThrough(p: UnaryNode): Boolean = p match {
     // Note that some operators (e.g. project, aggregate, union) are being handled separately
     // (earlier in this rule).
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala
@@ -42,7 +42,7 @@ object PushDownLeftSemiAntiJoin extends Rule[LogicalPlan] with PredicateHelper {
         // No join condition, just push down the Join below Project
         p.copy(child = Join(gChild, rightOp, joinType, joinCond, hint))
       } else {
-        val aliasMap = PushPredicateThroughNonJoin.getAliasMap(p)
+        val aliasMap = getAliasMap(p)
         val newJoinCond = if (aliasMap.nonEmpty) {
           Option(replaceAlias(joinCond.get, aliasMap))
         } else {
@@ -55,7 +55,7 @@ object PushDownLeftSemiAntiJoin extends Rule[LogicalPlan] with PredicateHelper {
     case join @ Join(agg: Aggregate, rightOp, LeftSemiOrAnti(_), _, _)
         if agg.aggregateExpressions.forall(_.deterministic) && agg.groupingExpressions.nonEmpty &&
         !agg.aggregateExpressions.exists(ScalarSubquery.hasCorrelatedScalarSubquery) =>
-      val aliasMap = PushPredicateThroughNonJoin.getAliasMap(agg)
+      val aliasMap = getAliasMap(agg)
       val canPushDownPredicate = (predicate: Expression) => {
         val replaced = replaceAlias(predicate, aliasMap)
         predicate.references.nonEmpty &&
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregatesSuite.scala
@@ -70,14 +70,37 @@ class RemoveRedundantAggregatesSuite extends PlanTest {
     comparePlans(optimized, expected)
   }
 
-  test("Keep non-redundant aggregate") {
+  test("Remove redundant aggregate with aliases") {
     val relation = LocalRelation('a.int, 'b.int)
     val query = relation
-      .groupBy('a)('a, first('b) as 'b)
+      .groupBy('a + 'b)(('a + 'b) as 'c, count('b))
+      .groupBy('c)('c)
+      .analyze
+    val expected = relation
+      .groupBy('a + 'b)(('a + 'b) as 'c)
+      .analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, expected)
+  }
+
+  test("Keep non-redundant aggregate - upper has agg expression") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation
+      .groupBy('a, 'b)('a, 'b)
       // The count would change if we remove the first aggregate
       .groupBy('a)('a, count('b))
       .analyze
     val optimized = Optimize.execute(query)
     comparePlans(optimized, query)
   }
+
+  test("Keep non-redundant aggregate - upper references non-grouping") {
+    val relation = LocalRelation('a.int, 'b.int)
+    val query = relation
+      .groupBy('a)('a, count('b) as 'c)
+      .groupBy('c)('c)
+      .analyze
+    val optimized = Optimize.execute(query)
+    comparePlans(optimized, query)
+  }
 }