@@ -20,7 +20,20 @@ package org.apache.spark.sql.execution.aggregate
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
 import org.apache.spark.sql.types.StructType
 
-class TungstenAggregateHashMap(
+/**
+ * This is a helper object to generate an append-only single-key/single-value aggregate hash
+ * map that can act as a 'cache' for extremely fast key-value lookups while evaluating aggregates
+ * (and fall back to the `BytesToBytesMap` if a given key isn't found). This is 'codegened' in
+ * TungstenAggregate to speed up aggregates with keys.
+ *
+ * It is backed by a power-of-2-sized array for index lookups and a columnar batch that stores the
+ * key-value pairs. The index lookups in the array rely on linear probing (with a small maximum
+ * number of tries) and use an inexpensive hash function, which makes it very efficient for the
+ * majority of lookups. However, the linear probing and inexpensive hash function also make it
+ * less robust than the `BytesToBytesMap` (especially for a large number of keys or for certain
+ * distributions of keys), so we must fall back on the latter for correctness.
+ */
+class ColumnarAggMapCodeGenerator(
     ctx: CodegenContext,
     generatedClassName: String,
     groupingKeySchema: StructType,
@@ -43,7 +56,7 @@ class TungstenAggregateHashMap(
     """.stripMargin
   }
 
-  def initializeAggregateHashMap(): String = {
+  private def initializeAggregateHashMap(): String = {
     val generatedSchema: String =
       s"""
          |new org.apache.spark.sql.types.StructType()
@@ -76,16 +89,38 @@ class TungstenAggregateHashMap(
     """.stripMargin
   }
 
-  def generateHashFunction(): String = {
+  /**
+   * Generates a method that currently computes a hash by xor-ing all individual group-by keys.
+   * For instance, if we have 2 long group-by keys, the generated function would be of the form:
+   *
+   * {{{
+   * private long hash(long agg_key, long agg_key1) {
+   *   return agg_key ^ agg_key1;
+   * }
+   * }}}
+   */
+  private def generateHashFunction(): String = {
     s"""
-       |// TODO: Improve this Hash Function
+       |// TODO: Improve this hash function
        |private long hash($groupingKeySignature) {
        |  return ${groupingKeys.map(_._2).mkString(" ^ ")};
        |}
     """.stripMargin
   }
 
-  def generateEquals(): String = {
+  /**
+   * Generates a method that returns true if the group-by keys exist at a given index in the
+   * associated [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]. For instance, if we
+   * have 2 long group-by keys, the generated function would be of the form:
+   *
+   * {{{
+   * private boolean equals(int idx, long agg_key, long agg_key1) {
+   *   return batch.column(0).getLong(buckets[idx]) == agg_key &&
+   *     batch.column(1).getLong(buckets[idx]) == agg_key1;
+   * }
+   * }}}
+   */
+  private def generateEquals(): String = {
     s"""
        |private boolean equals(int idx, $groupingKeySignature) {
        |  return ${groupingKeys.zipWithIndex.map(k =>
@@ -94,10 +129,43 @@ class TungstenAggregateHashMap(
     """.stripMargin
   }
 
-  def generateFindOrInsert(): String = {
+  /**
+   * Generates a method that returns a mutable
+   * [[org.apache.spark.sql.execution.vectorized.ColumnarBatch.Row]] which keeps track of the
+   * aggregate value(s) for a given set of keys. If the corresponding row doesn't exist, the
+   * generated method adds the corresponding row in the associated
+   * [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]. For instance, if we
+   * have 2 long group-by keys, the generated function would be of the form:
+   *
+   * {{{
+   * public org.apache.spark.sql.execution.vectorized.ColumnarBatch.Row findOrInsert(
+   *     long agg_key, long agg_key1) {
+   *   long h = hash(agg_key, agg_key1);
+   *   int step = 0;
+   *   int idx = (int) h & (numBuckets - 1);
+   *   while (step < maxSteps) {
+   *     // Return bucket index if it's either an empty slot or already contains the key
+   *     if (buckets[idx] == -1) {
+   *       batch.column(0).putLong(numRows, agg_key);
+   *       batch.column(1).putLong(numRows, agg_key1);
+   *       batch.column(2).putLong(numRows, 0);
+   *       buckets[idx] = numRows++;
+   *       return batch.getRow(buckets[idx]);
+   *     } else if (equals(idx, agg_key, agg_key1)) {
+   *       return batch.getRow(buckets[idx]);
+   *     }
+   *     idx = (idx + 1) & (numBuckets - 1);
+   *     step++;
+   *   }
+   *   // Didn't find it
+   *   return null;
+   * }
+   * }}}
+   */
+  private def generateFindOrInsert(): String = {
     s"""
        |public org.apache.spark.sql.execution.vectorized.ColumnarBatch.Row findOrInsert(${
-      groupingKeySignature}) {
+        groupingKeySignature}) {
        |  long h = hash(${groupingKeys.map(_._2).mkString(", ")});
        |  int step = 0;
        |  int idx = (int) h & (numBuckets - 1);
@@ -117,8 +185,8 @@ class TungstenAggregateHashMap(
        |    idx = (idx + 1) & (numBuckets - 1);
        |    step++;
        |  }
-       |// Didn't find it
-       |return null;
+       |  // Didn't find it
+       |  return null;
        |}
     """.stripMargin
   }
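
For readers who want to see the generated map's behavior end to end, here is a minimal standalone Java sketch that collapses the three generated methods (hash, equals, findOrInsert) into one class for a single long key and a single long aggregate value. It is illustrative only: the names `SingleKeyAggMap`, `NUM_BUCKETS`, and `MAX_STEPS` are invented for this sketch, and the real generated code stores its key-value pairs in a `ColumnarBatch` rather than in flat arrays.

```java
// Hypothetical standalone model of the generated hash map; not Spark code.
import java.util.Arrays;

public class SingleKeyAggMap {
  private static final int NUM_BUCKETS = 1 << 16; // must be a power of 2
  private static final int MAX_STEPS = 2;         // bounded linear probing

  private final long[] keys = new long[NUM_BUCKETS];   // plays "column 0"
  private final long[] values = new long[NUM_BUCKETS]; // plays "column 1"
  private final int[] buckets = new int[NUM_BUCKETS];  // bucket -> row id
  private int numRows = 0;

  public SingleKeyAggMap() {
    Arrays.fill(buckets, -1); // -1 marks an empty bucket
  }

  // Inexpensive hash: with a single group-by key there is nothing to xor.
  private long hash(long aggKey) {
    return aggKey;
  }

  private boolean equals(int idx, long aggKey) {
    return keys[buckets[idx]] == aggKey;
  }

  // Returns the row id for the key, appending a fresh row (value 0) on a
  // miss; returns -1 after MAX_STEPS failed probes, signalling that the
  // caller must fall back to the slower-but-robust BytesToBytesMap.
  public int findOrInsert(long aggKey) {
    long h = hash(aggKey);
    int idx = (int) h & (NUM_BUCKETS - 1); // cheap modulo via masking
    for (int step = 0; step < MAX_STEPS; step++) {
      if (buckets[idx] == -1) {            // empty slot: append a new row
        keys[numRows] = aggKey;
        values[numRows] = 0;
        buckets[idx] = numRows++;
        return buckets[idx];
      } else if (equals(idx, aggKey)) {    // existing row for this key
        return buckets[idx];
      }
      idx = (idx + 1) & (NUM_BUCKETS - 1); // linear probe to the next bucket
    }
    return -1; // didn't find it
  }
}
```

Capping the probe count at `MAX_STEPS` is what keeps the fast path constant-time: a pathological key distribution simply misses and falls through to `BytesToBytesMap`, which is why the class comment stresses that the fallback is required for correctness.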