
Commit 4dc1007

pgandhi committed
[SPARK-27207]: Fix SortBasedAggregator to run with different aggregate functions and add a unit test
Fix SortBasedAggregator to ensure that update and merge are performed with two different sets of aggregate functions: one for update and one for merge, respectively.
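In short: when ObjectHashAggregateExec falls back to sort-based aggregation, update must run over buffers built from raw input rows, while merge must run over buffers already serialized by a partial aggregation. Sharing one set of aggregate functions (and one buffer) between the two phases lets merge see an unserialized object buffer. A minimal, self-contained sketch of that contract (MaxState and these helpers are hypothetical, not Spark's API):

```scala
// Sketch of the update/serialize/merge contract this commit enforces.
object SortFallbackSketch extends App {
  case class MaxState(var value: Int)

  def update(state: MaxState, input: Int): MaxState = {
    if (input > state.value) state.value = input   // works on the in-memory object
    state
  }

  def serialize(state: MaxState): Array[Byte] =
    BigInt(state.value).toByteArray                // the form merge's input must take

  def merge(state: MaxState, serialized: Array[Byte]): MaxState = {
    val other = BigInt(serialized).toInt           // merge expects serialized input
    if (other > state.value) state.value = other
    state
  }

  // Correct flow: update into one buffer, serialize it, merge into a second.
  val updateBuffer = MaxState(Int.MinValue)
  Seq(3, 7, 5).foreach(update(updateBuffer, _))
  val finalBuffer = merge(MaxState(Int.MinValue), serialize(updateBuffer))
  println(finalBuffer.value)                       // 7
}
```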
1 parent b4eaf31 commit 4dc1007

File tree

4 files changed (+73, -114 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala

Lines changed: 39 additions & 13 deletions
@@ -58,7 +58,7 @@ class ObjectAggregationIterator(
 
   private[this] var aggBufferIterator: Iterator[AggregationBufferEntry] = _
 
-  val (sortBasedAggExpressions, sortBasedAggFunctions): (
+  val (sortBasedMergeAggExpressions, sortBasedMergeAggFunctions): (
       Seq[AggregateExpression], Array[AggregateFunction]) = {
     val newExpressions = aggregateExpressions.map {
       case agg @ AggregateExpression(_, Partial, _, _) =>
@@ -72,8 +72,9 @@ class ObjectAggregationIterator(
 
   // Hacking the aggregation mode to call AggregateFunction.merge to merge two aggregation buffers
   private val mergeAggregationBuffers: (InternalRow, InternalRow) => Unit = {
-    val newInputAttributes = sortBasedAggFunctions.flatMap(_.inputAggBufferAttributes)
-    generateProcessRow(sortBasedAggExpressions, sortBasedAggFunctions, newInputAttributes)
+    val newInputAttributes = sortBasedMergeAggFunctions.flatMap(_.inputAggBufferAttributes)
+    generateProcessRow(
+      sortBasedMergeAggExpressions, sortBasedMergeAggFunctions, newInputAttributes)
   }
 
   /**
@@ -184,7 +185,9 @@ class ObjectAggregationIterator(
       StructType.fromAttributes(groupingAttributes),
       processRow,
       mergeAggregationBuffers,
-      createNewAggregationBuffer(sortBasedAggFunctions))
+      createNewAggregationBuffer(aggregateFunctions),
+      createNewAggregationBuffer(sortBasedMergeAggFunctions),
+      aggregateFunctions)
 
     while (inputRows.hasNext) {
       // NOTE: The input row is always UnsafeRow
@@ -212,7 +215,12 @@ class ObjectAggregationIterator(
  * @param processRow Function to update the aggregation buffer with input rows
  * @param mergeAggregationBuffers Function used to merge the input aggregation buffers into existing
  *                                aggregation buffers
- * @param makeEmptyAggregationBuffer Creates an empty aggregation buffer
+ * @param makeEmptyAggregationBufferForSortBasedUpdateAggFunctions Creates an empty aggregation
+ *                                                                 buffer for update operation
+ * @param makeEmptyAggregationBufferForSortBasedMergeAggFunctions Creates an empty aggregation
+ *                                                                buffer for merge operation
+ * @param sortBasedUpdateAggFunctions aggregate functions needed to serialize the
+ *                                    aggregation buffer
  *
 * @todo Try to eliminate this class by refactor and reuse code paths in [[SortAggregateExec]].
 */
@@ -222,7 +230,9 @@ class SortBasedAggregator(
     groupingSchema: StructType,
     processRow: (InternalRow, InternalRow) => Unit,
     mergeAggregationBuffers: (InternalRow, InternalRow) => Unit,
-    makeEmptyAggregationBuffer: => InternalRow) {
+    makeEmptyAggregationBufferForSortBasedUpdateAggFunctions: => InternalRow,
+    makeEmptyAggregationBufferForSortBasedMergeAggFunctions: => InternalRow,
+    sortBasedUpdateAggFunctions: Array[AggregateFunction]) {
 
   // external sorter to sort the input (grouping key + input row) with grouping key.
   private val inputSorter = createExternalSorterForInput()
@@ -231,6 +241,10 @@ class SortBasedAggregator(
   def addInput(groupingKey: UnsafeRow, inputRow: UnsafeRow): Unit = {
     inputSorter.insertKV(groupingKey, inputRow)
   }
+
+  private def serializeBuffer(buffer: InternalRow): Unit = {
+    sortBasedUpdateAggFunctions.collect { case f: TypedImperativeAggregate[_] => f }.foreach(
+      _.serializeAggregateBufferInPlace(buffer))
+  }
 
   /**
    * Returns a destructive iterator of AggregationBufferEntry.
@@ -241,16 +255,18 @@ class SortBasedAggregator(
       val inputIterator = inputSorter.sortedIterator()
       var hasNextInput: Boolean = inputIterator.next()
       var hasNextAggBuffer: Boolean = initialAggBufferIterator.next()
-      private var result: AggregationBufferEntry = _
+      private var updateResult: AggregationBufferEntry = _
+      private var finalResult: AggregationBufferEntry = _
       private var groupingKey: UnsafeRow = _
 
       override def hasNext(): Boolean = {
-        result != null || findNextSortedGroup()
+        updateResult != null || finalResult != null || findNextSortedGroup()
       }
 
       override def next(): AggregationBufferEntry = {
-        val returnResult = result
-        result = null
+        val returnResult = finalResult
+        updateResult = null
+        finalResult = null
        returnResult
       }
 
@@ -259,21 +275,31 @@ class SortBasedAggregator(
        if (hasNextInput || hasNextAggBuffer) {
          // Find smaller key of the initialAggBufferIterator and initialAggBufferIterator
          groupingKey = findGroupingKey()
-          result = new AggregationBufferEntry(groupingKey, makeEmptyAggregationBuffer)
+          updateResult = new AggregationBufferEntry(
+            groupingKey, makeEmptyAggregationBufferForSortBasedUpdateAggFunctions)
+          finalResult = new AggregationBufferEntry(
+            groupingKey, makeEmptyAggregationBufferForSortBasedMergeAggFunctions)
 
          // Firstly, update the aggregation buffer with input rows.
          while (hasNextInput &&
            groupingKeyOrdering.compare(inputIterator.getKey, groupingKey) == 0) {
-            processRow(result.aggregationBuffer, inputIterator.getValue)
+            processRow(updateResult.aggregationBuffer, inputIterator.getValue)
            hasNextInput = inputIterator.next()
          }
 
+          // This step ensures that the contents of the updateResult aggregation buffer are
+          // merged with the finalResult aggregation buffer to maintain consistency
+          if (hasNextAggBuffer) {
+            serializeBuffer(updateResult.aggregationBuffer)
+            mergeAggregationBuffers(finalResult.aggregationBuffer, updateResult.aggregationBuffer)
+          }
          // Secondly, merge the aggregation buffer with existing aggregation buffers.
          // NOTE: the ordering of these two while-block matter, mergeAggregationBuffer() should
          // be called after calling processRow.
          while (hasNextAggBuffer &&
            groupingKeyOrdering.compare(initialAggBufferIterator.getKey, groupingKey) == 0) {
-            mergeAggregationBuffers(result.aggregationBuffer, initialAggBufferIterator.getValue)
+            mergeAggregationBuffers(
+              finalResult.aggregationBuffer, initialAggBufferIterator.getValue)
            hasNextAggBuffer = initialAggBufferIterator.next()
          }
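Condensed, the reworked group loop does the following for each grouping key (a simplified sketch, not the literal Spark code; as the diff above shows, the real implementation runs the serialize-and-merge step conditionally, only when spilled partial buffers remain):

```scala
object TwoBufferFlow {
  // Sketch of the new per-group flow with simplified types. Update and merge
  // each get their own buffer, so merge never sees an unserialized update buffer.
  def nextGroup[Row, Buf](
      inputsForKey: Iterator[Row],
      spilledBuffersForKey: Iterator[Buf],
      newUpdateBuffer: => Buf,                 // update-shaped empty buffer
      newMergeBuffer: => Buf,                  // merge-shaped empty buffer
      processRow: (Buf, Row) => Unit,
      serializeInPlace: Buf => Unit,           // stands in for serializeBuffer above
      mergeBuffers: (Buf, Buf) => Unit): Buf = {
    val updateResult = newUpdateBuffer
    val finalResult = newMergeBuffer
    inputsForKey.foreach(processRow(updateResult, _))          // 1. update with raw rows
    serializeInPlace(updateResult)                             // 2. serialize the update buffer
    mergeBuffers(finalResult, updateResult)                    //    and fold it into finalResult
    spilledBuffersForKey.foreach(mergeBuffers(finalResult, _)) // 3. merge spilled partial buffers
    finalResult                                                // next() hands out finalResult only
  }
}
```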

sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala

Lines changed: 0 additions & 100 deletions
@@ -20,7 +20,6 @@ package org.apache.spark.sql
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
 
 import org.apache.spark.sql.TypedImperativeAggregateSuite.TypedMax
-import org.apache.spark.sql.TypedImperativeAggregateSuite.TypedMax2
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, GenericInternalRow, ImplicitCastInputTypes, SpecificInternalRow}
 import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate
@@ -211,20 +210,6 @@ class TypedImperativeAggregateSuite extends QueryTest with SharedSQLContext {
     checkAnswer(query, expected)
   }
 
-  test("SPARK-27207: Ensure aggregate buffers are initialized again for SortBasedAggregate") {
-    withSQLConf("spark.sql.objectHashAggregate.sortBased.fallbackThreshold" -> "5") {
-      val df = data.toDF("value", "key").coalesce(2)
-      val query = df.groupBy($"key").agg(typedMax2($"value"), count($"value"), typedMax2($"value"))
-      val expected = data.groupBy(_._2).toSeq.map { group =>
-        val (key, values) = group
-        val valueMax = values.map(_._1).max
-        val countValue = values.size
-        Row(key, valueMax, countValue, valueMax)
-      }
-      checkAnswer(query, expected)
-    }
-  }
-
   private def typedMax(column: Column): Column = {
     val max = TypedMax(column.expr, nullable = false)
     Column(max.toAggregateExpression())
@@ -235,10 +220,6 @@
     Column(max.toAggregateExpression())
   }
 
-  private def typedMax2(column: Column): Column = {
-    val max = TypedMax2(column.expr, nullable = false)
-    Column(max.toAggregateExpression())
-  }
 }
 
 object TypedImperativeAggregateSuite {
@@ -319,86 +300,5 @@ object TypedImperativeAggregateSuite {
     }
   }
 
-  /**
-   * Calculate the max value with object aggregation buffer. This stores class MaxValue
-   * in aggregation buffer.
-   */
-  private case class TypedMax2(
-      child: Expression,
-      nullable: Boolean = false,
-      mutableAggBufferOffset: Int = 0,
-      inputAggBufferOffset: Int = 0)
-    extends TypedImperativeAggregate[MaxValue] with ImplicitCastInputTypes {
-
-    var maxValueBuffer: MaxValue = null
-    override def createAggregationBuffer(): MaxValue = {
-      // Returns Int.MinValue if all inputs are null
-      maxValueBuffer = new MaxValue(Int.MinValue)
-      maxValueBuffer
-    }
-
-    override def update(buffer: MaxValue, input: InternalRow): MaxValue = {
-      child.eval(input) match {
-        case inputValue: Int =>
-          if (inputValue > buffer.value) {
-            buffer.value = inputValue
-            buffer.isValueSet = true
-          }
-        case null => // skip
-      }
-      buffer
-    }
-
-    override def merge(bufferMax: MaxValue, inputMax: MaxValue): MaxValue = {
-      // The below if condition will throw a Null Pointer Exception if initialize() is not called
-      if (maxValueBuffer.isValueSet) {
-        // do nothing
-      }
-      if (inputMax.value > bufferMax.value) {
-        bufferMax.value = inputMax.value
-        bufferMax.isValueSet = bufferMax.isValueSet || inputMax.isValueSet
-      }
-      bufferMax
-    }
-
-    override def eval(bufferMax: MaxValue): Any = {
-      if (nullable && bufferMax.isValueSet == false) {
-        null
-      } else {
-        bufferMax.value
-      }
-    }
-
-    override lazy val deterministic: Boolean = true
-
-    override def children: Seq[Expression] = Seq(child)
-
-    override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType)
-
-    override def dataType: DataType = IntegerType
-
-    override def withNewMutableAggBufferOffset(newOffset: Int): TypedImperativeAggregate[MaxValue] =
-      copy(mutableAggBufferOffset = newOffset)
-
-    override def withNewInputAggBufferOffset(newOffset: Int): TypedImperativeAggregate[MaxValue] =
-      copy(inputAggBufferOffset = newOffset)
-
-    override def serialize(buffer: MaxValue): Array[Byte] = {
-      val out = new ByteArrayOutputStream()
-      val stream = new DataOutputStream(out)
-      stream.writeBoolean(buffer.isValueSet)
-      stream.writeInt(buffer.value)
-      out.toByteArray
-    }
-
-    override def deserialize(storageFormat: Array[Byte]): MaxValue = {
-      val in = new ByteArrayInputStream(storageFormat)
-      val stream = new DataInputStream(in)
-      val isValueSet = stream.readBoolean()
-      val value = stream.readInt()
-      new MaxValue(value, isValueSet)
-    }
-  }
   private class MaxValue(var value: Int, var isValueSet: Boolean = false)
 }

sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala

Lines changed: 4 additions & 1 deletion
@@ -25,6 +25,7 @@ import org.apache.spark._
 import org.apache.spark.memory.{TaskMemoryManager, TestMemoryManager}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction
 import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
 import org.apache.spark.unsafe.KVIterator
 
@@ -78,7 +79,9 @@ class SortBasedAggregationStoreSuite extends SparkFunSuite with LocalSparkConte
       groupingSchema,
       updateInputRow,
       mergeAggBuffer,
-      createNewAggregationBuffer)
+      createNewAggregationBuffer,
+      createNewAggregationBuffer,
+      sortBasedUpdateAggFunctions = new Array[AggregateFunction](5))
 
     (5000 to 100000).foreach { _ =>
       randomKV(inputRow, group)
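One note on the `new Array[AggregateFunction](5)` placeholder above: the array is null-filled, and the new serializeBuffer method filters it with a type pattern, which never matches null, so serialization becomes a harmless no-op in this suite. A plain-Scala sketch of that behavior (no Spark required):

```scala
object NullCollectSketch extends App {
  // new Array[T](n) on a reference type yields n null slots.
  val slots = new Array[String](5)
  // A type pattern (case s: String) never matches null, so collect drops them all.
  val matched = slots.collect { case s: String => s }
  assert(matched.isEmpty)
  println(s"matched ${matched.length} of ${slots.length} slots")
}
```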

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala

Lines changed: 30 additions & 0 deletions
@@ -49,6 +49,16 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
       (2: Integer) -> null,
       (3: Integer) -> null
     ).toDF("key", "value").repartition(2).createOrReplaceTempView("t")
+    Seq(
+      (0: Integer) -> "val_0",
+      (1: Integer) -> "val_1",
+      (2: Integer) -> "val_2",
+      (3: Integer) -> "val_3",
+      (4: Integer) -> "val_4",
+      (5: Integer) -> "val_5",
+      (6: Integer) -> null,
+      (7: Integer) -> null
+    ).toDF("key", "value").repartition(2).createOrReplaceTempView("t2")
   }
 
   protected override def afterAll(): Unit = {
@@ -111,6 +121,26 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils {
     ))
   }
 
+  test("SPARK-27207: customized Hive UDAF with two aggregation buffers for Sort" +
+    " Based Aggregation") {
+    withSQLConf("spark.sql.objectHashAggregate.sortBased.fallbackThreshold" -> "2") {
+      val df = sql("SELECT key % 2, mock2(value) FROM t2 GROUP BY key % 2")
+
+      val aggs = df.queryExecution.executedPlan.collect {
+        case agg: ObjectHashAggregateExec => agg
+      }
+
+      // There should be two aggregate operators, one for partial aggregation, and the other for
+      // global aggregation.
+      assert(aggs.length == 2)
+
+      checkAnswer(df, Seq(
+        Row(0, Row(3, 1)),
+        Row(1, Row(3, 1))
+      ))
+    }
+  }
+
   test("call JAVA UDAF") {
     withTempView("temp") {
       withUserDefinedFunction("myDoubleAvg" -> false) {
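For manual reproduction, the same fallback can be triggered interactively; a sketch assuming a session where the suite's `mock2` Hive UDAF is registered (the conf key and threshold are taken verbatim from the test above):

```scala
// Hypothetical spark-shell session; assumes `spark` and the mock2 UDAF are available.
// With the threshold at 2, ObjectHashAggregateExec gives up on its hash map after
// two distinct grouping keys and falls back to the sort-based path exercised here.
spark.conf.set("spark.sql.objectHashAggregate.sortBased.fallbackThreshold", "2")
spark.sql("SELECT key % 2, mock2(value) FROM t2 GROUP BY key % 2").show()
```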
