[SPARK-13123][SQL] Implement whole stage codegen for sort. #11008
Changes from all commits
The diff shown here touches `Sort.scala` (the `org.apache.spark.sql.execution.Sort` physical operator). The imports drop `InternalAccumulator` and pull in the codegen classes:

```diff
@@ -17,10 +17,11 @@
 package org.apache.spark.sql.execution

-import org.apache.spark.{InternalAccumulator, SparkEnv, TaskContext}
+import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, GenerateUnsafeProjection}
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.metric.SQLMetrics

```
```diff
@@ -37,7 +38,7 @@ case class Sort(
     global: Boolean,
     child: SparkPlan,
     testSpillFrequency: Int = 0)
-  extends UnaryNode {
+  extends UnaryNode with CodegenSupport {

   override def output: Seq[Attribute] = child.output

```
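The rest of the diff leans on a handful of `CodegenSupport` members. Their signatures, inferred purely from how `Sort` uses them below, look roughly like this (a summary sketch, not the actual trait definition in the Spark source):

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}

// Sketch of the CodegenSupport surface this patch relies on; names and shapes are
// inferred from the diff, so treat the details as approximate.
trait CodegenSupportSketch {
  // The RDD that feeds the generated code; Sort simply delegates to its child.
  def upstream(): RDD[InternalRow]

  // Emits Java source that produces rows and pushes them to the parent operator.
  protected def doProduce(ctx: CodegenContext): String

  // Emits Java source that handles one row arriving from the child operator.
  def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String

  // Provided by the real trait: ask the child to produce rows into `this`, and hand a
  // row (as column variables, or as the name of an UnsafeRow variable) to the parent.
  def produce(ctx: CodegenContext, parent: CodegenSupportSketch): String
  def consume(ctx: CodegenContext, input: Seq[ExprCode], row: String = null): String
}
```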
The sorter construction moves out of `doExecute` into a reusable `createSorter()` method, so the generated code can call it as well:

```diff
@@ -50,34 +51,38 @@ case class Sort(
     "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"),
     "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size"))

+  def createSorter(): UnsafeExternalRowSorter = {
+    val ordering = newOrdering(sortOrder, output)
+
+    // The comparator for comparing prefix
+    val boundSortExpression = BindReferences.bindReference(sortOrder.head, output)
+    val prefixComparator = SortPrefixUtils.getPrefixComparator(boundSortExpression)
+
+    // The generator for prefix
+    val prefixProjection = UnsafeProjection.create(Seq(SortPrefix(boundSortExpression)))
+    val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer {
+      override def computePrefix(row: InternalRow): Long = {
+        prefixProjection.apply(row).getLong(0)
+      }
+    }
+
+    val pageSize = SparkEnv.get.memoryManager.pageSizeBytes
+    val sorter = new UnsafeExternalRowSorter(
+      schema, ordering, prefixComparator, prefixComputer, pageSize)
+    if (testSpillFrequency > 0) {
+      sorter.setTestSpillFrequency(testSpillFrequency)
+    }
+    sorter
+  }
+
   protected override def doExecute(): RDD[InternalRow] = {
     val schema = child.schema
     val childOutput = child.output

     val dataSize = longMetric("dataSize")
     val spillSize = longMetric("spillSize")

     child.execute().mapPartitionsInternal { iter =>
-      val ordering = newOrdering(sortOrder, childOutput)
-
-      // The comparator for comparing prefix
-      val boundSortExpression = BindReferences.bindReference(sortOrder.head, childOutput)
-      val prefixComparator = SortPrefixUtils.getPrefixComparator(boundSortExpression)
-
-      // The generator for prefix
-      val prefixProjection = UnsafeProjection.create(Seq(SortPrefix(boundSortExpression)))
-      val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer {
-        override def computePrefix(row: InternalRow): Long = {
-          prefixProjection.apply(row).getLong(0)
-        }
-      }
-
-      val pageSize = SparkEnv.get.memoryManager.pageSizeBytes
-      val sorter = new UnsafeExternalRowSorter(
-        schema, ordering, prefixComparator, prefixComputer, pageSize)
-      if (testSpillFrequency > 0) {
-        sorter.setTestSpillFrequency(testSpillFrequency)
-      }
+      val sorter = createSorter()

       val metrics = TaskContext.get().taskMetrics()
       // Remember spill data size of this task before execute this operator so that we can
```
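Both execution paths now share `createSorter()`. Purely as an illustration (not part of the PR), the two paths can be compared from a 2.0-era `spark-shell`, assuming the whole-stage codegen switch of that code base, `spark.sql.codegen.wholeStage`:

```scala
// Illustrative only: toggle whole-stage codegen around a simple sorted query.
// Assumes a spark-shell where `sqlContext` is predefined and the config key
// spark.sql.codegen.wholeStage exists (both assumptions, not taken from this PR).
sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
println(sqlContext.range(10L * 1000 * 1000).sort("id").queryExecution.executedPlan)
// Sort shows up as a regular operator and runs through doExecute().

sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
println(sqlContext.range(10L * 1000 * 1000).sort("id").queryExecution.executedPlan)
// Sort is planned inside a WholeStageCodegen node and is driven by doProduce/doConsume.
```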
Finally, the codegen hooks themselves. `doProduce` drains the child into the sorter the first time the generated code runs, then loops over the sorted iterator; `doConsume` packs each incoming row into an `UnsafeRow` and inserts it into the sorter:

```diff
@@ -93,4 +98,63 @@ case class Sort(
       sortedIterator
     }
   }
+
+  override def upstream(): RDD[InternalRow] = {
+    child.asInstanceOf[CodegenSupport].upstream()
+  }
+
+  // Name of sorter variable used in codegen.
+  private var sorterVariable: String = _
+
+  override protected def doProduce(ctx: CodegenContext): String = {
+    val needToSort = ctx.freshName("needToSort")
+    ctx.addMutableState("boolean", needToSort, s"$needToSort = true;")
+
+
+    // Initialize the class member variables. This includes the instance of the Sorter and
+    // the iterator to return sorted rows.
+    val thisPlan = ctx.addReferenceObj("plan", this)
+    sorterVariable = ctx.freshName("sorter")
+    ctx.addMutableState(classOf[UnsafeExternalRowSorter].getName, sorterVariable,
+      s"$sorterVariable = $thisPlan.createSorter();")
+    val sortedIterator = ctx.freshName("sortedIter")
+    ctx.addMutableState("scala.collection.Iterator<UnsafeRow>", sortedIterator, "")
+
+    val addToSorter = ctx.freshName("addToSorter")
+    ctx.addNewFunction(addToSorter,
+      s"""
+        | private void $addToSorter() throws java.io.IOException {
+        |   ${child.asInstanceOf[CodegenSupport].produce(ctx, this)}
+        | }
+      """.stripMargin.trim)
+
+    val outputRow = ctx.freshName("outputRow")
+    s"""
+      | if ($needToSort) {
+      |   $addToSorter();
+      |   $sortedIterator = $sorterVariable.sort();
+      |   $needToSort = false;
+      | }
+      |
+      | while ($sortedIterator.hasNext()) {
+      |   UnsafeRow $outputRow = (UnsafeRow)$sortedIterator.next();
+      |   ${consume(ctx, null, outputRow)}
+      | }
    """.stripMargin.trim
+  }
+
+  override def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String = {
+    val colExprs = child.output.zipWithIndex.map { case (attr, i) =>
+      BoundReference(i, attr.dataType, attr.nullable)
+    }
+
+    ctx.currentVars = input
+    val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false)
+
+    s"""
+      | // Convert the input attributes to an UnsafeRow and add it to the sorter
+      | ${code.code}
+      | $sorterVariable.insertRow(${code.value});
+    """.stripMargin.trim
+  }
 }
```

A reviewer commented on the `GenerateUnsafeProjection.createCode(...)` call in `doConsume`:

> If the child can produce UnsafeRow (for example, Exchange), we should have a way to avoid this unpack and pack again, or we will see regression (generated version slower than non-generated). I think we can pass the variable for input row into …
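As a rough sketch of that suggestion (hypothetical code, not from this PR: the extra `row` parameter and the null check are assumptions about how the API could evolve), `doConsume` could accept the name of an UnsafeRow variable that the child has already materialized and insert it directly, falling back to the projection otherwise:

```scala
// Hypothetical sketch of the reviewer's idea; not part of this patch.
def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: String): String = {
  if (row != null) {
    // The child already produced an UnsafeRow variable (e.g. rows coming out of
    // Exchange): insert it as-is, skipping the unpack/repack round trip.
    s"$sorterVariable.insertRow((UnsafeRow) $row);"
  } else {
    // Otherwise pack the column variables into an UnsafeRow first, as the PR does today.
    val colExprs = child.output.zipWithIndex.map { case (attr, i) =>
      BoundReference(i, attr.dataType, attr.nullable)
    }
    ctx.currentVars = input
    val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false)
    s"""
       | ${code.code}
       | $sorterVariable.insertRow(${code.value});
     """.stripMargin.trim
  }
}
```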
Review discussion on the mutable `sorterVariable` state used to pass the variable name from `doProduce` to `doConsume`:

> this is pretty ghetto... (although I understand maybe it's the simplest way to implement this)

> Why? This is the state that needs to be kept between the two member functions in this class.

> It's OK here, as discussed offline. I just found mutable state used as a way to pass variable names through pretty brittle. Maybe it would be good to have a more general abstraction for this in codegen, but it's not that big of a deal right now.
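One possible shape for that "more general abstraction" (purely a sketch; the registry class and its methods are invented here and do not exist in Spark) would be to let the codegen context own per-operator state, so `doProduce` registers generated variable names and `doConsume` looks them up, instead of each operator carrying a mutable field:

```scala
import scala.collection.mutable

// Hypothetical sketch only: a name registry a CodegenContext could expose so operators
// don't need mutable fields like `sorterVariable` to pass names between doProduce and
// doConsume. Nothing here is taken from the Spark code base.
final class CodegenVarRegistry {
  private val names = mutable.Map.empty[(AnyRef, String), String]

  // Called from doProduce: remember the fresh name generated for (operator, key).
  def register(operator: AnyRef, key: String, freshName: String): Unit =
    names((operator, key)) = freshName

  // Called from doConsume: retrieve the name that doProduce registered earlier.
  def lookup(operator: AnyRef, key: String): String =
    names((operator, key))
}
```

With something like this hanging off the context, `doProduce` would call `registry.register(this, "sorter", sorterVariable)` and `doConsume` would call `registry.lookup(this, "sorter")`, and the `private var` on the operator could go away.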