Commit 68ace26

add org.apache.spark.sql.internal.connector.SupportsPushDownCatalystFilters
1 parent 095a7b4

File tree: 3 files changed, +41 −3 lines changed
org/apache/spark/sql/internal/connector/SupportsPushDownCatalystFilters.scala (new file)

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.internal.connector
+
+import org.apache.spark.sql.catalyst.expressions.Expression
+
+/**
+ * A mix-in interface for {@link FileScanBuilder}. This can be used to push down partitionFilters
+ * and dataFilters to FileIndex in the format of catalyst Expression.
+ */
+trait SupportsPushDownCatalystFilters {
+  /**
+   * Pushes down partitionFilters and dataFilters to FileIndex in the format of catalyst
+   * Expression. These catalyst Expression filters are used for partition pruning. The dataFilters
+   * are also translated into data source filters and used for selecting records.
+   */
+  def pushCatalystFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Unit
+}
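
For context, a minimal sketch of how a file-source ScanBuilder could mix in the new trait, assuming only what the scaladoc above states; MyFileScanBuilder and its build() body are hypothetical and not part of this commit:

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.connector.read.{Scan, ScanBuilder}
import org.apache.spark.sql.internal.connector.SupportsPushDownCatalystFilters

class MyFileScanBuilder extends ScanBuilder with SupportsPushDownCatalystFilters {
  private var partitionFilters: Seq[Expression] = Seq.empty
  private var dataFilters: Seq[Expression] = Seq.empty

  // Keep the catalyst Expressions as-is so a FileIndex can prune partition
  // directories before any translation into sources.Filter happens.
  override def pushCatalystFilters(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression]): Unit = {
    this.partitionFilters = partitionFilters
    this.dataFilters = dataFilters
  }

  override def build(): Scan = ??? // a real builder would return a FileScan here
}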

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala

Lines changed: 8 additions & 2 deletions
@@ -22,13 +22,17 @@ import org.apache.spark.sql.{sources, SparkSession}
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.connector.read.{ScanBuilder, SupportsPushDownRequiredColumns}
 import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, PartitioningAwareFileIndex, PartitioningUtils}
+import org.apache.spark.sql.internal.connector.SupportsPushDownCatalystFilters
 import org.apache.spark.sql.sources.Filter
 import org.apache.spark.sql.types.StructType

 abstract class FileScanBuilder(
     sparkSession: SparkSession,
     fileIndex: PartitioningAwareFileIndex,
-    dataSchema: StructType) extends ScanBuilder with SupportsPushDownRequiredColumns {
+    dataSchema: StructType)
+  extends ScanBuilder
+  with SupportsPushDownRequiredColumns
+  with SupportsPushDownCatalystFilters {
   private val partitionSchema = fileIndex.partitionSchema
   private val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
   protected val supportsNestedSchemaPruning = false
@@ -66,7 +70,9 @@ abstract class FileScanBuilder(

   // Note: The partitionFilters and dataFilters need to be pushed to FileIndex in the format of
   // Expression because partition pruning uses the Expression Filters, not sources.Filters.
-  def pushFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Unit = {
+  override def pushCatalystFilters(
+      partitionFilters: Seq[Expression],
+      dataFilters: Seq[Expression]): Unit = {
     this.partitionFilters = partitionFilters
     this.dataFilters = dataFilters
     val translatedFilters = mutable.ArrayBuffer.empty[sources.Filter]
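
The hunk is truncated after the translatedFilters buffer. Judging from the trait's scaladoc, the rest of the method presumably translates each catalyst data filter into a source-level Filter; a rough sketch of that step (not the verbatim body), using the real DataSourceStrategy.translateFilter API:

// Sketch: predicates with no source-level equivalent are simply skipped.
dataFilters.foreach { expr =>
  DataSourceStrategy.translateFilter(expr, supportNestedPredicatePushdown = true)
    .foreach(translatedFilters += _)
}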

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ object PushDownUtils extends PredicateHelper {
       val (partitionFilters, dataFilters) =
         DataSourceUtils.getPartitionKeyFiltersAndDataFilters(
           f.getSparkSession, scanBuilderHolder.relation, f.readPartitionSchema(), filters)
-      f.pushFilters(ExpressionSet(partitionFilters).toSeq, dataFilters)
+      f.pushCatalystFilters(ExpressionSet(partitionFilters).toSeq, dataFilters)
       (Nil, dataFilters)
     case _ => (Nil, filters)
   }
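
A side note on the unchanged ExpressionSet(partitionFilters).toSeq wrapper at this call site: ExpressionSet deduplicates semantically equal predicates, so equivalent partition filters are pushed only once. A small standalone illustration (not from this commit):

import org.apache.spark.sql.catalyst.expressions.{Add, AttributeReference, ExpressionSet}
import org.apache.spark.sql.types.IntegerType

val a = AttributeReference("a", IntegerType)()
val b = AttributeReference("b", IntegerType)()
// a + b and b + a canonicalize to the same expression, so the set keeps one.
assert(ExpressionSet(Seq(Add(a, b), Add(b, a))).size == 1)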
