Commit ffc9d8c

Author: Davies Liu
Parent: 76ca6c6

fix tests

3 files changed: +27 −17 lines

sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 18 additions & 12 deletions
@@ -151,17 +151,16 @@ private[sql] case class PhysicalRDD(
     val exprs = output.zipWithIndex.map(x => new BoundReference(x._2, x._1.dataType, true))
     val row = ctx.freshName("row")
     val numOutputRows = metricTerm(ctx, "numOutputRows")
-    ctx.INPUT_ROW = row
-    ctx.currentVars = null
-    val columns = exprs.map(_.gen(ctx))
 
     // The input RDD can either return (all) ColumnarBatches or InternalRows. We determine this
     // by looking at the first value of the RDD and then calling the function which will process
     // the remaining. It is faster to return batches.
     // TODO: The abstractions between this class and SqlNewHadoopRDD makes it difficult to know
     // here which path to use. Fix this.
 
-
+    ctx.INPUT_ROW = row
+    ctx.currentVars = null
+    val columns1 = exprs.map(_.gen(ctx))
     val scanBatches = ctx.freshName("processBatches")
     ctx.addNewFunction(scanBatches,
       s"""
@@ -170,11 +169,11 @@ private[sql] case class PhysicalRDD(
       |     int numRows = $batch.numRows();
       |     if ($idx == 0) $numOutputRows.add(numRows);
       |
-      |     while ($idx < numRows) {
+      |     while (!shouldStop() && $idx < numRows) {
       |       InternalRow $row = $batch.getRow($idx++);
-      |       ${consume(ctx, columns).trim}
-      |       if (shouldStop()) return;
+      |       ${consume(ctx, columns1).trim}
       |     }
+      |     if (shouldStop()) return;
       |
       |     if (!$input.hasNext()) {
       |       $batch = null;
@@ -185,16 +184,23 @@ private[sql] case class PhysicalRDD(
       |   }
       | }""".stripMargin)
 
+    ctx.INPUT_ROW = row
+    ctx.currentVars = null
+    val columns2 = exprs.map(_.gen(ctx))
+    val inputRow = if (isUnsafeRow) row else null
     val scanRows = ctx.freshName("processRows")
     ctx.addNewFunction(scanRows,
       s"""
       | private void $scanRows(InternalRow $row) throws java.io.IOException {
-      |   while (true) {
+      |   boolean firstRow = true;
+      |   while (!shouldStop() && (firstRow || $input.hasNext())) {
+      |     if (firstRow) {
+      |       firstRow = false;
+      |     } else {
+      |       $row = (InternalRow) $input.next();
+      |     }
       |     $numOutputRows.add(1);
-      |     ${consume(ctx, columns).trim}
-      |     if (shouldStop()) return;
-      |     if (!$input.hasNext()) break;
-      |     $row = (InternalRow)$input.next();
+      |     ${consume(ctx, columns2, inputRow).trim}
       |   }
       | }""".stripMargin)
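Why the column accessors are now generated twice: CodegenContext is mutable state, and gen() binds to whatever ctx.INPUT_ROW and ctx.currentVars hold at the moment it is called. Generating columns once up front (as before this commit) and reusing the result for both the batch path and the row path would bind both paths to the same row variable. A minimal, self-contained sketch of that failure mode, using a toy context rather than Spark's actual CodegenContext API:

    // Toy model (not Spark's API): generated code captures the context's
    // current input-row variable, so it cannot be reused across paths.
    object StatefulCodegenSketch {
      class Ctx {
        var inputRow: String = _          // analogous to ctx.INPUT_ROW
        def gen(ordinal: Int): String =   // bakes the current inputRow into the code
          s"$inputRow.get($ordinal)"
      }

      def main(args: Array[String]): Unit = {
        val ctx = new Ctx
        ctx.inputRow = "batchRow"
        val columns1 = (0 until 2).map(ctx.gen) // accessors bound to batchRow

        ctx.inputRow = "scanRow"
        val columns2 = (0 until 2).map(ctx.gen) // accessors bound to scanRow

        println(columns1) // Vector(batchRow.get(0), batchRow.get(1))
        println(columns2) // Vector(scanRow.get(0), scanRow.get(1))
      }
    }

This is why the commit resets ctx.INPUT_ROW and ctx.currentVars immediately before each exprs.map(_.gen(ctx)) call, producing columns1 for the batch loop and columns2 for the row loop.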

sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala

Lines changed: 1 addition & 2 deletions
@@ -97,12 +97,11 @@ trait CodegenSupport extends SparkPlan {
    *     # call child.produce()
    *     initialized = true;
    *   }
-   *   while (hashmap.hasNext()) {
+   *   while (!shouldStop() && hashmap.hasNext()) {
    *     row = hashmap.next();
    *     # build the aggregation results
    *     # create variables for results
    *     # call consume(), which will call parent.doConsume()
-   *     if (shouldStop()) return;
    *   }
    */
   protected def doProduce(ctx: CodegenContext): String
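The contract change documented here is subtle: checking shouldStop() in the loop condition stops before the next row is produced, whereas the old `if (shouldStop()) return;` after the consume step produced one extra row first. A self-contained sketch of the new loop shape, with toy names in plain Scala rather than generated Java:

    object ShouldStopSketch {
      // New shape from the comment: while (!shouldStop() && hasNext) { consume }
      def run(rows: Iterator[Int], shouldStop: () => Boolean): Seq[Int] = {
        val produced = scala.collection.mutable.Buffer[Int]()
        while (!shouldStop() && rows.hasNext) {
          produced += rows.next() // stands in for "# call consume()"
        }
        produced.toSeq
      }

      def main(args: Array[String]): Unit = {
        var checks = 0
        val stopAfterOneRow = () => { checks += 1; checks > 1 }
        println(run(Iterator(1, 2, 3), stopAfterOneRow)) // List(1)
      }
    }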

sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala

Lines changed: 8 additions & 3 deletions
@@ -40,8 +40,13 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan)
   }
 
   override def usedInputs: AttributeSet = {
-    // filter out the expressions that just pass the input to next operator.
-    AttributeSet(projectList.filterNot(inputSet.contains).flatMap(_.references))
+    // Only the attributes that are used at least twice should be evaluated before this plan;
+    // otherwise we can defer the evaluation until the output attribute is actually used.
+    val usedExprIds = projectList.flatMap(_.collect {
+      case a: Attribute => a.exprId
+    })
+    val usedMoreThanOnce = usedExprIds.groupBy(id => id).filter(_._2.size > 1).keySet
+    references.filter(a => usedMoreThanOnce.contains(a.exprId))
   }
 
   override def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String = {
@@ -50,7 +55,7 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan)
     ctx.currentVars = input
     val resultVars = exprs.map(_.gen(ctx))
     // Evaluation of non-deterministic expressions can't be deferred.
-    val nonDeterministicAttrs = projectList.zip(output).filter(!_._1.deterministic).unzip._2
+    val nonDeterministicAttrs = projectList.filterNot(_.deterministic).map(_.toAttribute)
     s"""
       |${evaluateRequiredVariables(output, resultVars, AttributeSet(nonDeterministicAttrs))}
       |${consume(ctx, resultVars)}
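To make the new usedInputs rule concrete: with a project list such as `a + a AS x, b AS y`, attribute `a` is referenced twice and must be evaluated before Project, while `b` is referenced once and can be evaluated lazily at its single point of use (non-deterministic expressions are still forced separately in doConsume above). A self-contained sketch of the counting logic over plain strings instead of Catalyst attributes:

    object UsedInputsSketch {
      def main(args: Array[String]): Unit = {
        // exprIds collected from: a + a AS x, b AS y  (a twice, b once)
        val usedExprIds = Seq("a", "a", "b")
        // Same grouping as the commit: keep only ids that occur more than once.
        val usedMoreThanOnce = usedExprIds.groupBy(identity).filter(_._2.size > 1).keySet
        println(usedMoreThanOnce) // Set(a): only `a` is evaluated before Project
      }
    }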
