
Commit b9a0c56

huaxingao authored and viirya committed
[SPARK-36646][SQL] Push down group by partition column for aggregate
### What changes were proposed in this pull request?
Lift the restriction on aggregate push down for Parquet and ORC when the group by columns are the same as the partition columns.

### Why are the changes needed?
Previously, if there were any group by columns, we did not push down aggregates to the data source. After this change, if the group by columns are the same as the partition columns, the aggregates are pushed down.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
New tests.

Closes #34445 from huaxingao/group_by.

Authored-by: Huaxin Gao <[email protected]>
Signed-off-by: Liang-Chi Hsieh <[email protected]>
1 parent 6450f6b commit b9a0c56
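
For illustration, a hedged usage sketch (not part of the commit): the table `t`, partition column `p`, and data column `c` are made up, and the config keys are the ones referenced by the changed files. The first query groups by exactly the partition column, so after this change its MAX can be answered from file footers; the second still needs a regular scan.

```scala
// Hypothetical session/table; assumes the DS v2 Parquet reader and aggregate push down are enabled.
spark.conf.set("spark.sql.sources.useV1SourceList", "")        // use the v2 Parquet source
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")  // PARQUET_AGGREGATE_PUSHDOWN_ENABLED

// Pushed down after this change: group by columns == partition columns.
spark.sql("SELECT p, MAX(c) FROM t GROUP BY p").explain()
// Still not pushed down: grouping by a data column (footers carry no per-group statistics).
spark.sql("SELECT c, COUNT(*) FROM t GROUP BY c").explain()
```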

6 files changed: +187 -37 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/AggregatePushDownUtils.scala

Lines changed: 62 additions & 4 deletions
```diff
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.{Expression, GenericInternalRow}
 import org.apache.spark.sql.connector.expressions.NamedReference
 import org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, Aggregation, Count, CountStar, Max, Min}
 import org.apache.spark.sql.execution.RowToColumnConverter
@@ -81,19 +81,37 @@ object AggregatePushDownUtils {
       }
     }
 
-    if (aggregation.groupByColumns.nonEmpty || dataFilters.nonEmpty) {
+    if (dataFilters.nonEmpty) {
       // Parquet/ORC footer has max/min/count for columns
       // e.g. SELECT COUNT(col1) FROM t
       // but footer doesn't have max/min/count for a column if max/min/count
       // are combined with filter or group by
       // e.g. SELECT COUNT(col1) FROM t WHERE col2 = 8
       //      SELECT COUNT(col1) FROM t GROUP BY col2
       // However, if the filter is on partition column, max/min/count can still be pushed down
-      // Todo: add support if groupby column is partition col
-      // (https://issues.apache.org/jira/browse/SPARK-36646)
       return None
     }
 
+    if (aggregation.groupByColumns.nonEmpty &&
+      partitionNames.size != aggregation.groupByColumns.length) {
+      // If there are group by columns, we only push down if the group by columns are the same as
+      // the partition columns. In theory, if group by columns are a subset of partition columns,
+      // we should still be able to push down. e.g. if table t has partition columns p1, p2, and p3,
+      // SELECT MAX(c) FROM t GROUP BY p1, p2 should still be able to push down. However, the
+      // partial aggregation pushed down to data source needs to be
+      // SELECT p1, p2, p3, MAX(c) FROM t GROUP BY p1, p2, p3, and Spark layer
+      // needs to have a final aggregation such as SELECT MAX(c) FROM t GROUP BY p1, p2, then the
+      // pushed down query schema is different from the query schema at Spark. We will keep
+      // aggregate push down simple and don't handle this complicate case for now.
+      return None
+    }
+    aggregation.groupByColumns.foreach { col =>
+      // don't push down if the group by columns are not the same as the partition columns (orders
+      // doesn't matter because reorder can be done at data source layer)
+      if (col.fieldNames.length != 1 || !isPartitionCol(col)) return None
+      finalSchema = finalSchema.add(getStructFieldForCol(col))
+    }
+
     aggregation.aggregateExpressions.foreach {
       case max: Max =>
         if (!processMinOrMax(max)) return None
@@ -138,4 +156,44 @@ object AggregatePushDownUtils {
     converter.convert(aggregatesAsRow, columnVectors.toArray)
     new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
   }
+
+  /**
+   * Return the schema for aggregates only (exclude group by columns)
+   */
+  def getSchemaWithoutGroupingExpression(
+      aggSchema: StructType,
+      aggregation: Aggregation): StructType = {
+    val numOfGroupByColumns = aggregation.groupByColumns.length
+    if (numOfGroupByColumns > 0) {
+      new StructType(aggSchema.fields.drop(numOfGroupByColumns))
+    } else {
+      aggSchema
+    }
+  }
+
+  /**
+   * Reorder partition cols if they are not in the same order as group by columns
+   */
+  def reOrderPartitionCol(
+      partitionSchema: StructType,
+      aggregation: Aggregation,
+      partitionValues: InternalRow): InternalRow = {
+    val groupByColNames = aggregation.groupByColumns.map(_.fieldNames.head)
+    assert(groupByColNames.length == partitionSchema.length &&
+      groupByColNames.length == partitionValues.numFields, "The number of group by columns " +
+      s"${groupByColNames.length} should be the same as partition schema length " +
+      s"${partitionSchema.length} and the number of fields ${partitionValues.numFields} " +
+      s"in partitionValues")
+    var reorderedPartColValues = Array.empty[Any]
+    if (!partitionSchema.names.sameElements(groupByColNames)) {
+      groupByColNames.foreach { col =>
+        val index = partitionSchema.names.indexOf(col)
+        val v = partitionValues.asInstanceOf[GenericInternalRow].values(index)
+        reorderedPartColValues = reorderedPartColValues :+ v
+      }
+      new GenericInternalRow(reorderedPartColValues)
+    } else {
+      partitionValues
+    }
+  }
 }
```
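
A minimal, standalone sketch of the reordering idea behind `reOrderPartitionCol` above, under the assumption that the group by columns are exactly the partition columns in some order. The object name, schema, and values are made up for illustration; this is not the Spark source.

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{IntegerType, StructType}

object ReorderSketch {
  // Partition values arrive in partition-schema order; the pushed-down group by may list the
  // same columns in a different order, so pick each value at the index of its group by column.
  def reorder(
      partitionSchema: StructType,
      groupByColNames: Seq[String],
      partitionValues: InternalRow): InternalRow = {
    if (partitionSchema.names.sameElements(groupByColNames)) return partitionValues
    val reordered = groupByColNames.map { name =>
      val idx = partitionSchema.names.indexOf(name)
      partitionValues.asInstanceOf[GenericInternalRow].values(idx)
    }.toArray
    new GenericInternalRow(reordered)
  }

  def main(args: Array[String]): Unit = {
    val partitionSchema = new StructType().add("p1", IntegerType).add("p2", IntegerType)
    val partitionValues = new GenericInternalRow(Array[Any](1, 2))        // p1 = 1, p2 = 2
    val row = reorder(partitionSchema, Seq("p2", "p1"), partitionValues)  // GROUP BY p2, p1
    println(s"${row.getInt(0)}, ${row.getInt(1)}")                        // prints "2, 1"
  }
}
```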

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala

Lines changed: 21 additions & 8 deletions
```diff
@@ -35,11 +35,12 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{SPARK_VERSION_METADATA_KEY, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
+import org.apache.spark.sql.catalyst.expressions.JoinedRow
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.util.{quoteIdentifier, CharVarcharUtils}
 import org.apache.spark.sql.connector.expressions.aggregate.{Aggregation, Count, CountStar, Max, Min}
 import org.apache.spark.sql.errors.QueryExecutionErrors
-import org.apache.spark.sql.execution.datasources.SchemaMergeUtils
+import org.apache.spark.sql.execution.datasources.{AggregatePushDownUtils, SchemaMergeUtils}
 import org.apache.spark.sql.types._
 import org.apache.spark.util.{ThreadUtils, Utils}
 
@@ -396,9 +397,8 @@ object OrcUtils extends Logging {
       dataSchema: StructType,
       partitionSchema: StructType,
       aggregation: Aggregation,
-      aggSchema: StructType): InternalRow = {
-    require(aggregation.groupByColumns.length == 0,
-      s"aggregate $aggregation with group-by column shouldn't be pushed down")
+      aggSchema: StructType,
+      partitionValues: InternalRow): InternalRow = {
     var columnsStatistics: OrcColumnStatistics = null
     try {
       columnsStatistics = OrcFooterReader.readStatistics(reader)
@@ -457,17 +457,22 @@
       }
     }
 
+    // if there are group by columns, we will build result row first,
+    // and then append group by columns values (partition columns values) to the result row.
+    val schemaWithoutGroupBy =
+      AggregatePushDownUtils.getSchemaWithoutGroupingExpression(aggSchema, aggregation)
+
     val aggORCValues: Seq[WritableComparable[_]] =
       aggregation.aggregateExpressions.zipWithIndex.map {
         case (max: Max, index) =>
           val columnName = max.column.fieldNames.head
           val statistics = getColumnStatistics(columnName)
-          val dataType = aggSchema(index).dataType
+          val dataType = schemaWithoutGroupBy(index).dataType
           getMinMaxFromColumnStatistics(statistics, dataType, isMax = true)
         case (min: Min, index) =>
           val columnName = min.column.fieldNames.head
           val statistics = getColumnStatistics(columnName)
-          val dataType = aggSchema.apply(index).dataType
+          val dataType = schemaWithoutGroupBy.apply(index).dataType
           getMinMaxFromColumnStatistics(statistics, dataType, isMax = false)
         case (count: Count, _) =>
           val columnName = count.column.fieldNames.head
@@ -490,7 +495,15 @@
           s"createAggInternalRowFromFooter should not take $x as the aggregate expression")
       }
 
-    val orcValuesDeserializer = new OrcDeserializer(aggSchema, (0 until aggSchema.length).toArray)
-    orcValuesDeserializer.deserializeFromValues(aggORCValues)
+    val orcValuesDeserializer = new OrcDeserializer(schemaWithoutGroupBy,
+      (0 until schemaWithoutGroupBy.length).toArray)
+    val resultRow = orcValuesDeserializer.deserializeFromValues(aggORCValues)
+    if (aggregation.groupByColumns.nonEmpty) {
+      val reOrderedPartitionValues = AggregatePushDownUtils.reOrderPartitionCol(
+        partitionSchema, aggregation, partitionValues)
+      new JoinedRow(reOrderedPartitionValues, resultRow)
+    } else {
+      resultRow
+    }
   }
 }
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala

Lines changed: 21 additions & 11 deletions
```diff
@@ -31,8 +31,9 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
 import org.apache.spark.SparkException
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.JoinedRow
 import org.apache.spark.sql.connector.expressions.aggregate.{Aggregation, Count, CountStar, Max, Min}
-import org.apache.spark.sql.execution.datasources.PartitioningUtils
+import org.apache.spark.sql.execution.datasources.AggregatePushDownUtils
 import org.apache.spark.sql.internal.SQLConf.{LegacyBehaviorPolicy, PARQUET_AGGREGATE_PUSHDOWN_ENABLED}
 import org.apache.spark.sql.types.StructType
 
@@ -157,17 +158,22 @@
       partitionSchema: StructType,
       aggregation: Aggregation,
       aggSchema: StructType,
-      datetimeRebaseMode: LegacyBehaviorPolicy.Value,
-      isCaseSensitive: Boolean): InternalRow = {
+      partitionValues: InternalRow,
+      datetimeRebaseMode: LegacyBehaviorPolicy.Value): InternalRow = {
     val (primitiveTypes, values) = getPushedDownAggResult(
-      footer, filePath, dataSchema, partitionSchema, aggregation, isCaseSensitive)
+      footer, filePath, dataSchema, partitionSchema, aggregation)
 
     val builder = Types.buildMessage
     primitiveTypes.foreach(t => builder.addField(t))
     val parquetSchema = builder.named("root")
 
+    // if there are group by columns, we will build result row first,
+    // and then append group by columns values (partition columns values) to the result row.
+    val schemaWithoutGroupBy =
+      AggregatePushDownUtils.getSchemaWithoutGroupingExpression(aggSchema, aggregation)
+
     val schemaConverter = new ParquetToSparkSchemaConverter
-    val converter = new ParquetRowConverter(schemaConverter, parquetSchema, aggSchema,
+    val converter = new ParquetRowConverter(schemaConverter, parquetSchema, schemaWithoutGroupBy,
       None, datetimeRebaseMode, LegacyBehaviorPolicy.CORRECTED, NoopUpdater)
     val primitiveTypeNames = primitiveTypes.map(_.getPrimitiveTypeName)
     primitiveTypeNames.zipWithIndex.foreach {
@@ -195,7 +201,14 @@
       case (_, i) =>
         throw new SparkException("Unexpected parquet type name: " + primitiveTypeNames(i))
     }
-    converter.currentRecord
+
+    if (aggregation.groupByColumns.nonEmpty) {
+      val reorderedPartitionValues = AggregatePushDownUtils.reOrderPartitionCol(
+        partitionSchema, aggregation, partitionValues)
+      new JoinedRow(reorderedPartitionValues, converter.currentRecord)
+    } else {
+      converter.currentRecord
+    }
   }
 
   /**
@@ -211,16 +224,14 @@
       filePath: String,
       dataSchema: StructType,
       partitionSchema: StructType,
-      aggregation: Aggregation,
-      isCaseSensitive: Boolean)
+      aggregation: Aggregation)
     : (Array[PrimitiveType], Array[Any]) = {
     val footerFileMetaData = footer.getFileMetaData
     val fields = footerFileMetaData.getSchema.getFields
     val blocks = footer.getBlocks
     val primitiveTypeBuilder = mutable.ArrayBuilder.make[PrimitiveType]
     val valuesBuilder = mutable.ArrayBuilder.make[Any]
 
-    assert(aggregation.groupByColumns.length == 0, "group by shouldn't be pushed down")
     aggregation.aggregateExpressions.foreach { agg =>
       var value: Any = None
       var rowCount = 0L
@@ -250,8 +261,7 @@
             schemaName = "count(" + count.column.fieldNames.head + ")"
             rowCount += block.getRowCount
             var isPartitionCol = false
-            if (partitionSchema.fields.map(PartitioningUtils.getColName(_, isCaseSensitive))
-              .toSet.contains(count.column.fieldNames.head)) {
+            if (partitionSchema.fields.map(_.name).toSet.contains(count.column.fieldNames.head)) {
              isPartitionCol = true
            }
            isCount = true
```
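
A hedged sketch of the footer-only evaluation that `getPushedDownAggResult` relies on for count push down: every Parquet row group records its row count in the footer metadata, so `count(*)` — and `count(partitionCol)`, since a partition column is never null within a file — can be summed without reading data pages. The helper name is made up; `footer` is assumed to be the `ParquetMetadata` already obtained by the reader factory.

```scala
import scala.collection.JavaConverters._

import org.apache.parquet.hadoop.metadata.ParquetMetadata

object FooterCountSketch {
  // Sum the per-row-group row counts recorded in the footer; no data pages are touched.
  def countFromFooter(footer: ParquetMetadata): Long =
    footer.getBlocks.asScala.map(_.getRowCount).sum
}
```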

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala

Lines changed: 11 additions & 10 deletions
```diff
@@ -83,11 +83,10 @@ case class OrcPartitionReaderFactory(
 
   override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = {
     val conf = broadcastedConf.value.value
-    val filePath = new Path(new URI(file.filePath))
-
     if (aggregation.nonEmpty) {
-      return buildReaderWithAggregates(filePath, conf)
+      return buildReaderWithAggregates(file, conf)
     }
+    val filePath = new Path(new URI(file.filePath))
 
     val resultedColPruneInfo =
       Utils.tryWithResource(createORCReader(filePath, conf)) { reader =>
@@ -127,11 +126,10 @@
 
   override def buildColumnarReader(file: PartitionedFile): PartitionReader[ColumnarBatch] = {
     val conf = broadcastedConf.value.value
-    val filePath = new Path(new URI(file.filePath))
-
     if (aggregation.nonEmpty) {
-      return buildColumnarReaderWithAggregates(filePath, conf)
+      return buildColumnarReaderWithAggregates(file, conf)
     }
+    val filePath = new Path(new URI(file.filePath))
 
     val resultedColPruneInfo =
       Utils.tryWithResource(createORCReader(filePath, conf)) { reader =>
@@ -183,14 +181,16 @@
    * Build reader with aggregate push down.
    */
   private def buildReaderWithAggregates(
-      filePath: Path,
+      file: PartitionedFile,
       conf: Configuration): PartitionReader[InternalRow] = {
+    val filePath = new Path(new URI(file.filePath))
     new PartitionReader[InternalRow] {
       private var hasNext = true
       private lazy val row: InternalRow = {
         Utils.tryWithResource(createORCReader(filePath, conf)) { reader =>
           OrcUtils.createAggInternalRowFromFooter(
-            reader, filePath.toString, dataSchema, partitionSchema, aggregation.get, readDataSchema)
+            reader, filePath.toString, dataSchema, partitionSchema, aggregation.get,
+            readDataSchema, file.partitionValues)
         }
       }
 
@@ -209,15 +209,16 @@
    * Build columnar reader with aggregate push down.
    */
   private def buildColumnarReaderWithAggregates(
-      filePath: Path,
+      file: PartitionedFile,
       conf: Configuration): PartitionReader[ColumnarBatch] = {
+    val filePath = new Path(new URI(file.filePath))
    new PartitionReader[ColumnarBatch] {
      private var hasNext = true
      private lazy val batch: ColumnarBatch = {
        Utils.tryWithResource(createORCReader(filePath, conf)) { reader =>
          val row = OrcUtils.createAggInternalRowFromFooter(
            reader, filePath.toString, dataSchema, partitionSchema, aggregation.get,
-            readDataSchema)
+            readDataSchema, file.partitionValues)
          AggregatePushDownUtils.convertAggregatesRowToBatch(row, readDataSchema, offHeap = false)
        }
      }
```
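
The readers built above follow a single-row pattern: each file contributes exactly one aggregate row computed from its footer. A simplified sketch of that pattern, assuming the row has already been computed elsewhere (this is not the Spark source):

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.read.PartitionReader

// Yields one precomputed row, then reports exhaustion.
class SingleRowPartitionReader(row: InternalRow) extends PartitionReader[InternalRow] {
  private var hasNextRow = true

  override def next(): Boolean = {
    val result = hasNextRow
    hasNextRow = false
    result
  }

  override def get(): InternalRow = row

  override def close(): Unit = () // nothing to release in this sketch
}
```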

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala

Lines changed: 5 additions & 4 deletions
```diff
@@ -129,10 +129,11 @@ case class ParquetPartitionReaderFactory(
       private var hasNext = true
       private lazy val row: InternalRow = {
         val footer = getFooter(file)
+
         if (footer != null && footer.getBlocks.size > 0) {
           ParquetUtils.createAggInternalRowFromFooter(footer, file.filePath, dataSchema,
-            partitionSchema, aggregation.get, readDataSchema,
-            getDatetimeRebaseMode(footer.getFileMetaData), isCaseSensitive)
+            partitionSchema, aggregation.get, readDataSchema, file.partitionValues,
+            getDatetimeRebaseMode(footer.getFileMetaData))
         } else {
           null
         }
@@ -174,8 +175,8 @@
         val footer = getFooter(file)
         if (footer != null && footer.getBlocks.size > 0) {
           val row = ParquetUtils.createAggInternalRowFromFooter(footer, file.filePath,
-            dataSchema, partitionSchema, aggregation.get, readDataSchema,
-            getDatetimeRebaseMode(footer.getFileMetaData), isCaseSensitive)
+            dataSchema, partitionSchema, aggregation.get, readDataSchema, file.partitionValues,
+            getDatetimeRebaseMode(footer.getFileMetaData))
           AggregatePushDownUtils.convertAggregatesRowToBatch(
             row, readDataSchema, enableOffHeapColumnVector && Option(TaskContext.get()).isDefined)
         } else {
```
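
One way to check whether the new path is taken (path and column names hypothetical): the DS v2 scan node in the physical plan lists the pushed aggregation and grouping columns when push down applies; if they are absent, Spark fell back to a regular scan plus aggregate.

```scala
// Hypothetical path; the table is assumed to be partitioned by column "p".
val df = spark.read.parquet("/path/to/partitioned/table").groupBy("p").max("c")
df.explain() // inspect the scan node for the pushed aggregation / group by columns
```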
