
Commit ececd57

fix tests
1 parent fc6b8cb commit ececd57

File tree (4 files changed, +51 -36 lines):

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ColumnarAggMapCodeGenerator.scala
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala


sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ColumnarAggMapCodeGenerator.scala

Lines changed: 2 additions & 1 deletion
@@ -191,10 +191,11 @@ class ColumnarAggMapCodeGenerator(
        | ${groupingKeys.zipWithIndex.map(k =>
            s"batch.column(${k._2}).putLong(numRows, ${k._1._2});").mkString("\n")}
        | ${bufferValues.zipWithIndex.map(k =>
-           s"batch.column(${groupingKeys.length + k._2}).putLong(numRows, 0);")
+           s"batch.column(${groupingKeys.length + k._2}).putNull(numRows);")
            .mkString("\n")}
        | buckets[idx] = numRows++;
        | batch.setNumRows(numRows);
+       | aggregateBufferBatch.setNumRows(numRows);
        | return aggregateBufferBatch.getRow(buckets[idx]);
        | } else if (equals(idx, ${groupingKeys.map(_._2).mkString(", ")})) {
        | return aggregateBufferBatch.getRow(buckets[idx]);
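
Two things change here: newly inserted aggregation-buffer columns are initialized with putNull rather than putLong(..., 0), and aggregateBufferBatch, which exposes only the buffer columns of batch, now has its row count bumped in lockstep, so the getRow call on the freshly inserted bucket stays in bounds. For orientation, a minimal self-contained Scala sketch of the find-or-insert shape this template emits; the real generator emits Java against ColumnarBatch, and everything here besides buckets/numRows and the put semantics is illustrative:

    object ColumnarMapSketch {
      val capacity = 1 << 16
      val keys = new Array[Long](capacity)              // key column of `batch`
      val buffers = new Array[java.lang.Long](capacity) // buffer column; null models putNull
      val buckets = Array.fill(capacity)(-1)            // hash bucket -> row id; -1 = empty
      var numRows = 0                                   // shared by batch and aggregateBufferBatch

      def findOrInsert(key: Long): Int = {
        val idx = java.lang.Long.hashCode(key) & (capacity - 1)
        if (buckets(idx) == -1) {
          keys(numRows) = key       // batch.column(...).putLong(numRows, key)
          buffers(numRows) = null   // putNull(numRows): buffer starts as null, not 0
          buckets(idx) = numRows
          numRows += 1              // batch.setNumRows AND aggregateBufferBatch.setNumRows
          buckets(idx)              // caller reads aggregateBufferBatch.getRow(buckets(idx))
        } else if (keys(buckets(idx)) == key) {
          buckets(idx)              // key already present: reuse its buffer row
        } else {
          -1                        // bucket collision: caller falls back to the regular map
        }
      }
    }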

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala

Lines changed: 19 additions & 10 deletions
@@ -70,12 +70,14 @@ case class TungstenAggregate(
     }
   }
 
-  // This is for testing. We force TungstenAggregationIterator to fall back to sort-based
-  // aggregation once it has processed a given number of input rows.
-  private val testFallbackStartsAt: Option[Int] = {
+  // This is for testing. We force TungstenAggregationIterator to fall back to the bytes to bytes
+  // map and the sort-based aggregation once it has processed a given number of input rows.
+  private val testFallbackStartsAt: Option[(Int, Int)] = {
     sqlContext.getConf("spark.sql.TungstenAggregate.testFallbackStartsAt", null) match {
       case null | "" => None
-      case fallbackStartsAt => Some(fallbackStartsAt.toInt)
+      case fallbackStartsAt =>
+        val splits = fallbackStartsAt.split(",").map(_.trim)
+        Some((splits.head.toInt, splits.last.toInt))
     }
   }
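
The spark.sql.TungstenAggregate.testFallbackStartsAt conf now carries two comma-separated thresholds: the row count at which the generated hash map is abandoned, and the row count at which the bytes-to-bytes map is abandoned. A stand-alone restatement of the parsing above (the helper name is illustrative); note that a single-value string still parses, since splits.head and splits.last are then the same element:

    def parseFallbackStartsAt(conf: String): Option[(Int, Int)] = conf match {
      case null | "" => None
      case s =>
        val splits = s.split(",").map(_.trim)
        Some((splits.head.toInt, splits.last.toInt))
    }

    // parseFallbackStartsAt("2, 3") == Some((2, 3))
    // parseFallbackStartsAt("5")    == Some((5, 5))   (head and last coincide)
    // parseFallbackStartsAt("")     == None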

@@ -593,20 +595,27 @@ case class TungstenAggregate(
       ctx.updateColumn(buffer, dt, i, ev, updateExpr(i).nullable)
     }
 
-    val (checkFallback, resetCounter, incCounter) = if (testFallbackStartsAt.isDefined) {
+    val (checkFallbackForGeneratedHashMap, checkFallbackForBytesToBytesMap, resetCounter,
+      incCounter) = if (testFallbackStartsAt.isDefined) {
       val countTerm = ctx.freshName("fallbackCounter")
       ctx.addMutableState("int", countTerm, s"$countTerm = 0;")
-      (s"$countTerm < ${testFallbackStartsAt.get}", s"$countTerm = 0;", s"$countTerm += 1;")
+      (s"$countTerm < ${testFallbackStartsAt.get._1}",
+        s"$countTerm < ${testFallbackStartsAt.get._2}", s"$countTerm = 0;", s"$countTerm += 1;")
     } else {
-      ("true", "", "")
+      ("true", "true", "", "")
    }
 
     val findOrInsertInGeneratedHashMap: Option[String] = {
       if (isAggregateHashMapEnabled) {
         Option(
           s"""
-             | $aggregateRow =
-             |   $aggregateHashMapTerm.findOrInsert(${groupByKeys.map(_.value).mkString(", ")});
+             |if ($checkFallbackForGeneratedHashMap) {
+             | ${groupByKeys.map(_.code).mkString("\n")}
+             | if (${groupByKeys.map("!" + _.isNull).mkString(" && ")}) {
+             |   $aggregateRow =
+             |     $aggregateHashMapTerm.findOrInsert(${groupByKeys.map(_.value).mkString(", ")});
+             | }
+             |}
           """.stripMargin)
       } else {
         None
@@ -619,7 +628,7 @@
       | // generate grouping key
       | ${keyCode.code.trim}
       | ${hashEval.code.trim}
-      | if ($checkFallback) {
+      | if ($checkFallbackForBytesToBytesMap) {
       | // try to get the buffer from hash map
       | $buffer = $hashMapTerm.getAggregationBufferFromUnsafeRow($key, ${hashEval.value});
       | }
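
Taken with the hunk above it, this wires one test counter through both lookup paths: checkFallbackForGeneratedHashMap guards the generated columnar map (which is now also skipped whenever any grouping key is null), while checkFallbackForBytesToBytesMap, the renamed $checkFallback, guards the regular hash map. Roughly, the four fragments come out as below; a sketch with a hypothetical counter name, not exact codegen output:

    // testFallbackStartsAt = Some((2, 3)); freshName would suffix the counter.
    val testMode = (
      "fallbackCounter < 2",   // generated hash map used only for the first 2 rows
      "fallbackCounter < 3",   // BytesToBytesMap used only for the first 3 rows
      "fallbackCounter = 0;",  // resetCounter
      "fallbackCounter += 1;") // incCounter

    // testFallbackStartsAt = None (production): both maps always eligible.
    val production = ("true", "true", "", "")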

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala

Lines changed: 4 additions & 4 deletions
@@ -85,7 +85,7 @@ class TungstenAggregationIterator(
     newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
     originalInputAttributes: Seq[Attribute],
     inputIter: Iterator[InternalRow],
-    testFallbackStartsAt: Option[Int],
+    testFallbackStartsAt: Option[(Int, Int)],
     numOutputRows: LongSQLMetric,
     dataSize: LongSQLMetric,
     spillSize: LongSQLMetric)
@@ -171,7 +171,7 @@ class TungstenAggregationIterator(
   // hashMap. If there is not enough memory, it will multiple hash-maps, spilling
   // after each becomes full then using sort to merge these spills, finally do sort
   // based aggregation.
-  private def processInputs(fallbackStartsAt: Int): Unit = {
+  private def processInputs(fallbackStartsAt: (Int, Int)): Unit = {
     if (groupingExpressions.isEmpty) {
       // If there is no grouping expressions, we can just reuse the same buffer over and over again.
       // Note that it would be better to eliminate the hash map entirely in the future.
@@ -187,7 +187,7 @@ class TungstenAggregationIterator(
         val newInput = inputIter.next()
         val groupingKey = groupingProjection.apply(newInput)
         var buffer: UnsafeRow = null
-        if (i < fallbackStartsAt) {
+        if (i < fallbackStartsAt._2) {
           buffer = hashMap.getAggregationBufferFromUnsafeRow(groupingKey)
         }
         if (buffer == null) {
@@ -352,7 +352,7 @@ class TungstenAggregationIterator(
   /**
    * Start processing input rows.
    */
-  processInputs(testFallbackStartsAt.getOrElse(Int.MaxValue))
+  processInputs(testFallbackStartsAt.getOrElse((Int.MaxValue, Int.MaxValue)))
 
   // If we did not switch to sort-based aggregation in processInputs,
   // we pre-load the first key-value pair from the map (to make hasNext idempotent).
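
Inside the iterator only the second element of the pair matters: once i reaches fallbackStartsAt._2 the hash map is no longer consulted, buffer stays null, and processing switches to sort-based aggregation; the new default (Int.MaxValue, Int.MaxValue) keeps production behavior at "never force a fallback". A condensed, self-contained sketch of that decision, with a plain mutable map standing in for the real hash map and sorter:

    object FallbackSketch {
      private val map = scala.collection.mutable.Map.empty[String, Array[Long]]

      // Returns the keys that bypassed the hash map and would go to the sorter.
      def processInputs(keys: Seq[String], fallbackStartsAt: (Int, Int)): Seq[String] =
        keys.zipWithIndex.flatMap { case (key, i) =>
          val buffer =
            if (i < fallbackStartsAt._2) map.getOrElseUpdate(key, Array(0L)) // find or insert
            else null                                   // forced fallback: map not consulted
          if (buffer == null) Some(key)                 // would be fed to the external sorter
          else { buffer(0) += 1; None }                 // normal hash-aggregation update
        }
    }

    // FallbackSketch.processInputs(Seq("a", "b", "a", "c"), (0, 2))
    //   == Seq("a", "c")   // rows 2 and 3 skip the hash map entirely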

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala

Lines changed: 26 additions & 21 deletions
@@ -967,27 +967,32 @@ class TungstenAggregationQuerySuite extends AggregationQuerySuite
 class TungstenAggregationQueryWithControlledFallbackSuite extends AggregationQuerySuite {
 
   override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = {
-    (0 to 2).foreach { fallbackStartsAt =>
-      withSQLConf("spark.sql.TungstenAggregate.testFallbackStartsAt" -> fallbackStartsAt.toString) {
-        // Create a new df to make sure its physical operator picks up
-        // spark.sql.TungstenAggregate.testFallbackStartsAt.
-        // todo: remove it?
-        val newActual = Dataset.ofRows(sqlContext, actual.logicalPlan)
-
-        QueryTest.checkAnswer(newActual, expectedAnswer) match {
-          case Some(errorMessage) =>
-            val newErrorMessage =
-              s"""
-                |The following aggregation query failed when using TungstenAggregate with
-                |controlled fallback (it falls back to sort-based aggregation once it has processed
-                |$fallbackStartsAt input rows). The query is
-                |${actual.queryExecution}
-                |
-                |$errorMessage
-              """.stripMargin
-
-            fail(newErrorMessage)
-          case None =>
+    Seq(false, true).foreach { enableColumnarHashMap =>
+      withSQLConf("spark.sql.codegen.aggregate.map.enabled" -> enableColumnarHashMap.toString) {
+        (1 to 3).foreach { fallbackStartsAt =>
+          withSQLConf("spark.sql.TungstenAggregate.testFallbackStartsAt" ->
+            s"${(fallbackStartsAt - 1).toString}, ${fallbackStartsAt.toString}") {
+            // Create a new df to make sure its physical operator picks up
+            // spark.sql.TungstenAggregate.testFallbackStartsAt.
+            // todo: remove it?
+            val newActual = Dataset.ofRows(sqlContext, actual.logicalPlan)
+
+            QueryTest.checkAnswer(newActual, expectedAnswer) match {
+              case Some(errorMessage) =>
+                val newErrorMessage =
+                  s"""
+                    |The following aggregation query failed when using TungstenAggregate with
+                    |controlled fallback (it falls back to bytes to bytes map once it has processed
+                    |${fallbackStartsAt - 1} input rows and to sort-based aggregation once it has
+                    |processed $fallbackStartsAt input rows). The query is ${actual.queryExecution}
+                    |
+                    |$errorMessage
+                  """.stripMargin
+
+                fail(newErrorMessage)
+              case None => // Success
+            }
+          }
         }
       }
     }
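
Each query in the suite therefore now runs under six configurations: the columnar hash map toggled off and on, crossed with three threshold pairs in which the generated map gives up one row before the bytes-to-bytes map. An illustrative expansion of that matrix:

    for {
      enableColumnarHashMap <- Seq(false, true)
      fallbackStartsAt <- 1 to 3
    } println(s"map.enabled=$enableColumnarHashMap, " +
      s"testFallbackStartsAt=${fallbackStartsAt - 1}, $fallbackStartsAt")
    // map.enabled=false, testFallbackStartsAt=0, 1
    // map.enabled=false, testFallbackStartsAt=1, 2
    // ... up to map.enabled=true, testFallbackStartsAt=2, 3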
