Defer the evaluation of expressions in Project

Davies Liu · Davies Liu · commit f6139e6c4330 · 2016-02-22T13:57:14.000-08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -63,10 +63,9 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
     val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString)
     if (ctx.currentVars != null && ctx.currentVars(ordinal) != null) {
       val oev = ctx.currentVars(ordinal)
-      // assert(oev.code == "", s"$this has not been evaluated yet.")
       ev.isNull = oev.isNull
       ev.value = oev.value
-      ""
+      oev.code
     } else if (nullable) {
       s"""
         boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal);
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala
@@ -80,7 +80,7 @@ trait CodegenSupport extends SparkPlan {
     ctx.freshNamePrefix = variablePrefix
     waitForSubqueries()
     s"""
-       |/*** PRODUCE: ${commentSafe(this.simpleString)} */
+       |/*** PRODUCE: ${toCommentSafeString(this.simpleString)} */
        |${doProduce(ctx)}
      """.stripMargin
   }
@@ -142,9 +142,10 @@ trait CodegenSupport extends SparkPlan {
     evaluateVars
   }
 
-  protected def commentSafe(s: String): String = {
-    s.replace("*/", "\\*\\/").replace("\\u", "\\\\u")
-  }
+  /**
+   * The subset of inputSet that should be evaluated before this plan.
+   */
+  def usedInputs: AttributeSet = references
 
   /**
     * Consume the columns generated from it's child, call doConsume() or emit the rows.
@@ -167,8 +168,8 @@ trait CodegenSupport extends SparkPlan {
       }
     s"""
        |
-       |/*** CONSUME: ${commentSafe(this.simpleString)} */
-       |${evaluateRequiredVariables(child.output, inputVars, references)}
+       |/*** CONSUME: ${toCommentSafeString(this.simpleString)} */
+       |${evaluateRequiredVariables(child.output, inputVars, usedInputs)}
        |${doConsume(ctx, inputVars)}
      """.stripMargin
   }
@@ -292,11 +293,7 @@ case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan])
       }
 
       /** Codegened pipeline for:
-<<<<<<< HEAD
-        * ${commentSafe(plan.treeString.trim)}
-=======
         * ${toCommentSafeString(plan.treeString.trim)}
->>>>>>> 00461bb911c31aff9c945a14e23df2af4c280c23
         */
       class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator {
 
@@ -358,11 +355,12 @@ case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan])
         val colExprs = output.zipWithIndex.map { case (attr, i) =>
           BoundReference(i, attr.dataType, attr.nullable)
         }
+        val evaluateInputs = evaluateVariables(input)
         // generate the code to create a UnsafeRow
         ctx.currentVars = input
         val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false)
         s"""
-           |${evaluateVariables(input)}
+           |$evaluateInputs
            |${code.code.trim}
            |currentRows.add(${code.value}.copy());
          """.stripMargin
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala
@@ -116,13 +116,7 @@ case class TungstenAggregate(
   // all the mode of aggregate expressions
   private val modes = aggregateExpressions.map(_.mode).distinct
 
-  override def references: AttributeSet = {
-    AttributeSet(groupingExpressions.flatMap(_.references) ++ aggregateExpressions.flatMap {
-      case AggregateExpression(f, Final | PartialMerge, _) => f.inputAggBufferAttributes
-      case AggregateExpression(f, Partial | Complete, _) => f.references
-    })
-    child.outputSet
-  }
+  override def usedInputs: AttributeSet = inputSet
 
   override def supportCodegen: Boolean = {
     // ImperativeAggregate is not supported right now
@@ -387,25 +381,28 @@ case class TungstenAggregate(
       val keyVars = groupingExpressions.zipWithIndex.map { case (e, i) =>
         BoundReference(i, e.dataType, e.nullable).gen(ctx)
       }
+      val evaluateKeyVars = evaluateVariables(keyVars)
       ctx.INPUT_ROW = bufferTerm
       val bufferVars = aggregateBufferAttributes.zipWithIndex.map { case (e, i) =>
         BoundReference(i, e.dataType, e.nullable).gen(ctx)
       }
+      val evaluateBufferVars = evaluateVariables(bufferVars)
       // evaluate the aggregation result
       ctx.currentVars = bufferVars
       val aggResults = declFunctions.map(_.evaluateExpression).map { e =>
         BindReferences.bindReference(e, aggregateBufferAttributes).gen(ctx)
       }
+      val evaluateAggResults = evaluateVariables(aggResults)
       // generate the final result
       ctx.currentVars = keyVars ++ aggResults
       val inputAttrs = groupingAttributes ++ aggregateAttributes
       val resultVars = resultExpressions.map { e =>
         BindReferences.bindReference(e, inputAttrs).gen(ctx)
       }
       s"""
-       ${evaluateVariables(keyVars)}
-       ${evaluateVariables(bufferVars)}
-       ${evaluateVariables(aggResults)}
+       $evaluateKeyVars
+       $evaluateBufferVars
+       $evaluateAggResults
        ${consume(ctx, resultVars)}
        """
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -39,6 +39,16 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan)
     child.asInstanceOf[CodegenSupport].produce(ctx, this)
   }
 
+  override def usedInputs: AttributeSet = {
+    // only the attributes that are used at least twice should be evaluated before this plan,
+    // otherwise we could defer the evaluation until the output attribute is actually used.
+    val usedExprIds = projectList.flatMap(_.collect {
+      case a: Attribute => a.exprId
+    })
+    val usedMoreThanOnce = usedExprIds.groupBy(id => id).filter(_._2.size > 1).keySet
+    references.filter(a => usedMoreThanOnce.contains(a.exprId))
+  }
+
   override def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String = {
     val exprs = projectList.map(x =>
       ExpressionCanonicalizer.execute(BindReferences.bindReference(x, child.output)))