
Commit 6b10ea5

cloud-fan authored and yhuai committed
[SPARK-10829] [SPARK-11301] [SQL] fix 2 bugs for filter on partitioned columns (1.5 backport)
[SPARK-10829](#8916) Filter combine partition key and attribute doesn't work in DataSource scan
[SPARK-11301](#9271) fix case sensitivity for filter on partitioned columns

Author: Wenchen Fan <[email protected]>

This patch had conflicts when merged, resolved by
Committer: Yin Huai <[email protected]>

Closes #9371 from cloud-fan/branch-1.5.
1 parent 06d3257 commit 6b10ea5
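
For context, here is a minimal, self-contained sketch of the kind of query SPARK-10829 is about: a single predicate that mixes a partition column and a data column (here inside an OR), so it can neither drive partition pruning on its own nor be pushed down to the data source. The paths, table layout, and expected plan shape are illustrative assumptions, not taken from this commit.

// Sketch only (Spark 1.5 API); the table layout and paths are hypothetical.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object MixedPartitionFilterExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("mixed-filter").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Write a tiny table partitioned by "year" to an illustrative temp path.
    val path = java.nio.file.Files.createTempDirectory("mixed-filter").toString + "/table"
    Seq((2012, "a"), (2013, "b"), (2013, "c")).toDF("year", "val")
      .write.partitionBy("year").parquet(path)

    val df = sqlContext.read.parquet(path)

    // "year" is a partition column, "val" is a data column. This OR predicate
    // references both, so it can only be evaluated as a Filter above the scan;
    // this commit makes scan planning handle such predicates correctly.
    val mixed = df.filter($"year" === 2012 || $"val" === "b")
    mixed.explain()            // expect a Filter node above the partitioned scan
    assert(mixed.count() == 2) // (2012, "a") and (2013, "b")

    sc.stop()
  }
}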

2 files changed: 30 additions, 12 deletions


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 20 additions & 12 deletions
@@ -62,7 +62,20 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
    // Scanning partitioned HadoopFsRelation
    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _))
        if t.partitionSpec.partitionColumns.nonEmpty =>
-      val selectedPartitions = prunePartitions(filters, t.partitionSpec).toArray
+      // We divide the filter expressions into 3 parts
+      val partitionColumns = AttributeSet(
+        t.partitionColumns.map(c => l.output.find(_.name == c.name).get))
+
+      // Only pruning the partition keys
+      val partitionFilters = filters.filter(_.references.subsetOf(partitionColumns))
+
+      // Only pushes down predicates that do not reference partition keys.
+      val pushedFilters = filters.filter(_.references.intersect(partitionColumns).isEmpty)
+
+      // Predicates with both partition keys and attributes
+      val combineFilters = filters.toSet -- partitionFilters.toSet -- pushedFilters.toSet
+
+      val selectedPartitions = prunePartitions(partitionFilters, t.partitionSpec).toArray

      logInfo {
        val total = t.partitionSpec.partitions.length
@@ -71,21 +84,16 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
        s"Selected $selected partitions out of $total, pruned $percentPruned% partitions."
      }

-      // Only pushes down predicates that do not reference partition columns.
-      val pushedFilters = {
-        val partitionColumnNames = t.partitionSpec.partitionColumns.map(_.name).toSet
-        filters.filter { f =>
-          val referencedColumnNames = f.references.map(_.name).toSet
-          referencedColumnNames.intersect(partitionColumnNames).isEmpty
-        }
-      }
-
-      buildPartitionedTableScan(
+      val scan = buildPartitionedTableScan(
        l,
        projects,
        pushedFilters,
        t.partitionSpec.partitionColumns,
-        selectedPartitions) :: Nil
+        selectedPartitions)
+
+      combineFilters
+        .reduceLeftOption(expressions.And)
+        .map(execution.Filter(_, scan)).getOrElse(scan) :: Nil

    // Scanning non-partitioned HadoopFsRelation
    case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _)) =>
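
The heart of the change above is the three-way split of the candidate filters. A simplified, self-contained model of that split (plain Scala sets standing in for Catalyst's Expression and AttributeSet; the Predicate class and its fields are invented for illustration) might look like this:

// A simplified model of the three-way filter split (not the Catalyst API):
// each predicate just records which column names it references.
case class Predicate(description: String, references: Set[String])

object FilterSplitModel extends App {
  val partitionColumns = Set("year")

  val filters = Seq(
    Predicate("year = 2012", Set("year")),                      // partition-only
    Predicate("val = 'a'", Set("val")),                         // data-only
    Predicate("year = 2012 OR val = 'a'", Set("year", "val"))   // mixes both
  )

  // References only partition keys: usable for partition pruning.
  val partitionFilters = filters.filter(_.references.subsetOf(partitionColumns))

  // References no partition keys: safe to push down to the data source.
  val pushedFilters = filters.filter(_.references.intersect(partitionColumns).isEmpty)

  // Everything else references both kinds of columns and must be evaluated
  // as a Filter on top of the partitioned scan.
  val combineFilters = filters.toSet -- partitionFilters.toSet -- pushedFilters.toSet

  println(s"prune with:    $partitionFilters")
  println(s"push down:     $pushedFilters")
  println(s"filter on top: $combineFilters")
}

Predicates that fall into the third group are the ones the patch now wraps in an execution.Filter on top of the scan returned by buildPartitionedTableScan.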

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 10 additions & 0 deletions
@@ -937,4 +937,14 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
      expected(except)
    )
  }
+
+  test("SPARK-11301: fix case sensitivity for filter on partitioned columns") {
+    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
+      withTempPath { path =>
+        Seq(2012 -> "a").toDF("year", "val").write.partitionBy("year").parquet(path.getAbsolutePath)
+        val df = sqlContext.read.parquet(path.getAbsolutePath)
+        checkAnswer(df.filter($"yEAr" > 2000).select($"val"), Row("a"))
+      }
+    }
+  }
}
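
To try the SPARK-11301 scenario outside the test harness (withSQLConf and withTempPath are helpers from Spark's test utilities), a roughly equivalent standalone sketch, with an illustrative temp path, could look like this:

// Sketch of the new test's scenario without the test-suite helpers (Spark 1.5 API).
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object CaseInsensitivePartitionFilterCheck {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("spark-11301").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Equivalent of withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { ... }
    sqlContext.setConf("spark.sql.caseSensitive", "false")

    // Equivalent of withTempPath { path => ... }
    val path = java.nio.file.Files.createTempDirectory("spark-11301").toString + "/table"
    Seq(2012 -> "a").toDF("year", "val").write.partitionBy("year").parquet(path)

    val df = sqlContext.read.parquet(path)

    // The partition column is written as "year" but filtered as "yEAr";
    // with case-insensitive resolution this should return the single row ("a").
    val result = df.filter($"yEAr" > 2000).select($"val").collect()
    assert(result.map(_.getString(0)).toSeq == Seq("a"))

    sc.stop()
  }
}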
