
Commit 01a7d33

cloud-fan authored and hvanhovell committed
[SPARK-18711][SQL] should disable subexpression elimination for LambdaVariable
## What changes were proposed in this pull request?

This is a long-standing bug that stayed hidden until #15780, which may add `AssertNotNull` on top of `LambdaVariable` and thus enables subexpression elimination. However, subexpression elimination evaluates the common expressions up front, which is invalid for `LambdaVariable`: a `LambdaVariable` usually represents a loop variable, which can't be evaluated ahead of the loop. This PR skips expressions containing `LambdaVariable` when doing subexpression elimination.

## How was this patch tested?

Updated test in `DatasetAggregatorSuite`.

Author: Wenchen Fan <[email protected]>

Closes #16143 from cloud-fan/aggregator.
1 parent 2460128 commit 01a7d33
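
The core of the fix is a guard in `EquivalentExpressions` that refuses to treat any subtree containing a `LambdaVariable` as a candidate for subexpression elimination, since hoisting such a subtree would evaluate the loop variable before the loop assigns it. Below is a minimal, self-contained Scala sketch of that idea using a toy expression tree rather than Spark's actual `Expression` classes; the names `Expr`, `LoopVar`, `containsNode`, and `skipForElimination` are illustrative only.

```scala
// Toy model (not Spark's classes) of the guard added in this patch: an expression
// subtree is skipped for subexpression elimination if it contains a loop variable,
// because hoisting it out of the loop would read the variable before it is set.
object LambdaVariableGuardSketch {
  sealed trait Expr { def children: Seq[Expr] }
  case class LoopVar(name: String) extends Expr { def children: Seq[Expr] = Nil }
  case class Literal(value: Int) extends Expr { def children: Seq[Expr] = Nil }
  case class Add(left: Expr, right: Expr) extends Expr {
    def children: Seq[Expr] = Seq(left, right)
  }

  // Mirrors the spirit of Expression.find: does any node in the tree satisfy the predicate?
  def containsNode(e: Expr)(p: Expr => Boolean): Boolean =
    p(e) || e.children.exists(containsNode(_)(p))

  // The candidate subtree is skipped when it refers to a loop variable anywhere below it.
  def skipForElimination(root: Expr): Boolean =
    containsNode(root)(_.isInstanceOf[LoopVar])

  def main(args: Array[String]): Unit = {
    // `item + 1` may appear twice in the loop body, but it cannot be hoisted ahead of the loop.
    val common = Add(LoopVar("item"), Literal(1))
    assert(skipForElimination(common))
    // A constant subexpression contains no loop variable, so it is still eligible.
    assert(!skipForElimination(Add(Literal(2), Literal(1))))
    println("guard sketch ok")
  }
}
```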

File tree

2 files changed: +9, -5 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala

Lines changed: 5 additions & 1 deletion
```diff
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable
 
 /**
  * This class is used to compute equality of (sub)expression trees. Expressions can be added
@@ -72,7 +73,10 @@ class EquivalentExpressions {
       root: Expression,
       ignoreLeaf: Boolean = true,
       skipReferenceToExpressions: Boolean = true): Unit = {
-    val skip = root.isInstanceOf[LeafExpression] && ignoreLeaf
+    val skip = (root.isInstanceOf[LeafExpression] && ignoreLeaf) ||
+      // `LambdaVariable` is usually used as a loop variable, which can't be evaluated ahead of the
+      // loop. So we can't evaluate sub-expressions containing `LambdaVariable` at the beginning.
+      root.find(_.isInstanceOf[LambdaVariable]).isDefined
     // There are some special expressions that we should not recurse into children.
     // 1. CodegenFallback: it's children will not be used to generate code (call eval() instead)
     // 2. ReferenceToExpressions: it's kind of an explicit sub-expression elimination.
```

sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala

Lines changed: 4 additions & 4 deletions
```diff
@@ -92,13 +92,13 @@ object NameAgg extends Aggregator[AggData, String, String] {
 }
 
 
-object SeqAgg extends Aggregator[AggData, Seq[Int], Seq[Int]] {
+object SeqAgg extends Aggregator[AggData, Seq[Int], Seq[(Int, Int)]] {
   def zero: Seq[Int] = Nil
   def reduce(b: Seq[Int], a: AggData): Seq[Int] = a.a +: b
   def merge(b1: Seq[Int], b2: Seq[Int]): Seq[Int] = b1 ++ b2
-  def finish(r: Seq[Int]): Seq[Int] = r
+  def finish(r: Seq[Int]): Seq[(Int, Int)] = r.map(i => i -> i)
   override def bufferEncoder: Encoder[Seq[Int]] = ExpressionEncoder()
-  override def outputEncoder: Encoder[Seq[Int]] = ExpressionEncoder()
+  override def outputEncoder: Encoder[Seq[(Int, Int)]] = ExpressionEncoder()
 }
 
 
@@ -281,7 +281,7 @@ class DatasetAggregatorSuite extends QueryTest with SharedSQLContext {
 
     checkDataset(
       ds.groupByKey(_.b).agg(SeqAgg.toColumn),
-      "a" -> Seq(1, 2)
+      "a" -> Seq(1 -> 1, 2 -> 2)
     )
   }
```
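
For reference, here is a minimal, self-contained sketch of how an aggregator like the updated `SeqAgg` is exercised end to end, assuming a local `SparkSession`; the names `PairSeqAgg`, `PairSeqAggExample`, and the sample rows are illustrative and not part of the commit. It mirrors the updated test, where the `Seq[(Int, Int)]` output type is what drives the code path described above.

```scala
// Illustrative usage of a SeqAgg-style aggregator on a typed Dataset, mirroring the
// updated test in DatasetAggregatorSuite. PairSeqAgg and the sample data are hypothetical.
import org.apache.spark.sql.{Encoder, SparkSession}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

case class AggData(a: Int, b: String)

object PairSeqAgg extends Aggregator[AggData, Seq[Int], Seq[(Int, Int)]] {
  def zero: Seq[Int] = Nil
  def reduce(b: Seq[Int], a: AggData): Seq[Int] = a.a +: b
  def merge(b1: Seq[Int], b2: Seq[Int]): Seq[Int] = b1 ++ b2
  def finish(r: Seq[Int]): Seq[(Int, Int)] = r.map(i => i -> i)
  override def bufferEncoder: Encoder[Seq[Int]] = ExpressionEncoder()
  override def outputEncoder: Encoder[Seq[(Int, Int)]] = ExpressionEncoder()
}

object PairSeqAggExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("seq-agg-example").getOrCreate()
    import spark.implicits._

    val ds = Seq(AggData(1, "a"), AggData(2, "a")).toDS()
    // Group by the string key and collect the values as (i, i) pairs; with this fix,
    // the generated code no longer hoists subexpressions over the loop variable.
    val result = ds.groupByKey(_.b).agg(PairSeqAgg.toColumn).collect()
    result.foreach(println)  // e.g. (a, List((2,2), (1,1))), ordering may vary

    spark.stop()
  }
}
```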
