@@ -314,6 +314,17 @@ class ParquetFileFormat
SQLConf.CASE_SENSITIVE.key,
sparkSession.sessionState.conf.caseSensitiveAnalysis)

// There are two things to note here.
//
// 1. Dictionary filtering has an issue with predicates on null values. For this reason,
// this filtering is disabled. See SPARK-26677.
//
// 2. We should disable 'parquet.filter.dictionary.enabled', but
// 'parquet.filter.stats.enabled' and 'parquet.filter.dictionary.enabled' were
// mistakenly swapped on the Parquet side. Spark should switch to
// 'parquet.filter.dictionary.enabled' when it upgrades Parquet. See PARQUET-1309.
hadoopConf.setIfUnset(ParquetInputFormat.STATS_FILTERING_ENABLED, "false")
Member

I think we should fix it on the Spark side. Do the swap in Spark. cc @cloud-fan @rxin @rdblue

Member Author

I can do that, but let me wait for other feedback before I do, since it's a Parquet property.

Member

If we want to disable it safely, shall we use set instead of setIfUnset?

Member Author

Yea, I was thinking about that too. I wanted to allow for other possibilities, like users taking that risk and enabling them themselves (partly also since this is going to be backported into Spark 2.4.x).

Both configurations are swapped as of PARQUET-1309, but I was wondering if there might be extreme corner cases where users know about that issue and intentionally enable or disable them. I think both configurations are pretty much for advanced users, and regular users won't set them or hit this issue anyway.
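For readers outside the thread, a minimal sketch of the tradeoff being weighed here, using Hadoop's Configuration API with the key from this PR (the values are illustrative):

import org.apache.hadoop.conf.Configuration

val conf = new Configuration()

// Suppose a user who knows about the PARQUET-1309 swap explicitly opts in.
conf.set("parquet.filter.stats.enabled", "true")

// setIfUnset respects the explicit choice and is a no-op here...
conf.setIfUnset("parquet.filter.stats.enabled", "false")
assert(conf.get("parquet.filter.stats.enabled") == "true")

// ...whereas set would override the user's value unconditionally.
conf.set("parquet.filter.stats.enabled", "false")
assert(conf.get("parquet.filter.stats.enabled") == "false")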

Member Author

WDYT on this @rdblue?

Contributor

I'll make sure the fix for this is in Parquet 1.10.1.

As for fixing this problem, I think that Spark should avoid pushing down these notEquals expressions, or rewrite them as isNull(col) or notEquals(col, "A"). That's going to be much better for performance than disabling dictionary filtering.
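A hedged sketch of the rewrite suggested above, assuming Spark SQL's null-safe equality (<=>) semantics; the column name value comes from this PR's test case, and df stands for any DataFrame read from Parquet:

// NOT (value <=> 'A') keeps exactly the rows where value IS NULL or value <> 'A',
// so it can be rewritten into two simple, individually pushable filters
// (isNull and notEquals) instead of one negated null-safe equality.
val problematic = df.where("NOT (value <=> 'A')")
val rewritten = df.where("value IS NULL OR value <> 'A'") // same result set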

Member Author (@HyukjinKwon, Jan 27, 2019)

Ah, so we're targeting the upgrade to Parquet 1.10.1? Yea, sounds okay to me. Also, that way users can disable parquet.filter.dictionary.enabled explicitly, I guess.

BTW, is it something we should enable by default on the Parquet side, @rdblue? I see there can be a performance improvement, but I was wondering how stable dictionary filtering is.

Contributor (@rdblue, Jan 27, 2019)

Dictionary filtering has been available for more than 2 years and Netflix has been using it by default for almost 3 years without problems. The feature is stable.

Keep in mind how narrow the use case that hits this bug is. First, an entire row group in a file needs to have just one value plus nulls: either the column has just one non-null value for all rows, or a sort or write pattern has clustered null rows with one other value (enough for an entire row group). Next, a query must use null-safe equality and negate it. In my experience, most people don't know what null-safe equality is. Going back to the use cases that produce data like this, the first one (only one non-null value) would probably result in a filter like col IS NULL instead. The second write pattern is the problematic one, but how likely is the intersection of data with that write pattern and a search for all values except one using null-safe equality? I don't think it is likely, and I think that's why we haven't found this until now.
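For anyone unfamiliar with null-safe equality, a minimal Spark SQL illustration (runnable in spark-shell) of why negating it must keep null rows:

spark.sql("SELECT NULL = 'A'").show()           // null: ordinary equality is three-valued
spark.sql("SELECT NULL <=> 'A'").show()         // false: null-safe equality never returns null
spark.sql("SELECT NOT (NULL <=> 'A')").show()   // true: the negation must match null rows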

Member Author

Thanks for all the details. +1 for going to 1.10.1. If that's the plan here, I will close this PR in a couple of days.

Contributor

The vote has started. Feel free to test the new binaries with this repository URI: https://repository.apache.org/content/repositories/orgapacheparquet-1022/
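To summarize the swap discussed in this thread, a sketch using the two constants from parquet-hadoop's ParquetInputFormat; the commented-out call is hypothetical until the PARQUET-1309 fix ships and Spark upgrades:

import org.apache.parquet.hadoop.ParquetInputFormat

// What this PR sets: because of the swap, the "stats" key currently controls
// dictionary filtering on the Parquet side.
hadoopConf.setIfUnset(ParquetInputFormat.STATS_FILTERING_ENABLED, "false") // "parquet.filter.stats.enabled"

// Intended form after the upgrade, per the comment in the diff above:
// hadoopConf.setIfUnset(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "false") // "parquet.filter.dictionary.enabled"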


ParquetWriteSupport.setSchema(requiredSchema, hadoopConf)

// Sets flags for `ParquetToSparkSchemaConverter`
@@ -890,6 +890,21 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
}
}
}

test("SPARK-26677: negated null-safe equality comparison should not filter matched row groups") {
(true :: false :: Nil).foreach { vectorized =>
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
withTempPath { path =>
// Repeated values for dictionary encoding.
Seq(Some("A"), Some("A"), None).toDF.repartition(1)
.write.parquet(path.getAbsolutePath)
val df = spark.read.parquet(path.getAbsolutePath)
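// stripSparkFilter removes Spark's own Filter operator, so the result reflects
// only Parquet-level row-group filtering; no rows should be dropped here.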
checkAnswer(stripSparkFilter(df.where("NOT (value <=> 'A')")), df)
}
}
}
}

}

object TestingUDT {