
Commit 3469f5c

rdblue authored and cloud-fan committed
[SPARK-24230][SQL] Fix SpecificParquetRecordReaderBase with dictionary filters.
## What changes were proposed in this pull request?

I missed this commit when preparing #21070. When Parquet is able to filter blocks with dictionary filtering, the total value count Spark expected was too high, leading to an error when there were fewer row groups to process than expected. Spark should get the row groups from Parquet so that it picks up new filter schemes in Parquet, like dictionary filtering.

## How was this patch tested?

Running in production at Netflix. Added a test case for dictionary-filtered blocks.

Author: Ryan Blue <[email protected]>

Closes #21295 from rdblue/SPARK-24230-fix-parquet-block-tracking.
1 parent 4a14dc0 commit 3469f5c
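
Editor's note: the heart of the fix is which list of row groups feeds the totalRowCount sum. Below is a minimal sketch of the before/after counting strategies; ParquetFileReader and BlockMetaData are the real parquet-mr types used in the diff, while the class and method names here are illustrative, not Spark code.

import java.util.List;

import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;

class RowCountSketch {
  // Before the fix: sums over the footer's block list, which still includes
  // row groups that Parquet's dictionary filter will drop, so the expected
  // count can overshoot the rows the reader actually returns.
  static long countFromFooter(List<BlockMetaData> footerBlocks) {
    long total = 0;
    for (BlockMetaData block : footerBlocks) {
      total += block.getRowCount();
    }
    return total;
  }

  // After the fix: asks the reader itself; its row-group list already
  // reflects every filter scheme parquet-mr applied, dictionary filtering
  // included.
  static long countFromReader(ParquetFileReader reader) {
    long total = 0;
    for (BlockMetaData block : reader.getRowGroups()) {
      total += block.getRowCount();
    }
    return total;
  }
}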

2 files changed: 16 additions & 2 deletions

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java

Lines changed: 4 additions & 2 deletions
@@ -146,7 +146,8 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont
     this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);
     this.reader = new ParquetFileReader(
         configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
-    for (BlockMetaData block : blocks) {
+    // use the blocks from the reader in case some do not match filters and will not be read
+    for (BlockMetaData block : reader.getRowGroups()) {
       this.totalRowCount += block.getRowCount();
     }

@@ -224,7 +225,8 @@ protected void initialize(String path, List<String> columns) throws IOException
     this.sparkSchema = new ParquetToSparkSchemaConverter(config).convert(requestedSchema);
     this.reader = new ParquetFileReader(
         config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
-    for (BlockMetaData block : blocks) {
+    // use the blocks from the reader in case some do not match filters and will not be read
+    for (BlockMetaData block : reader.getRowGroups()) {
       this.totalRowCount += block.getRowCount();
     }
   }

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 12 additions & 0 deletions
@@ -879,6 +879,18 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
       }
     }
   }
+
+  test("SPARK-24230: filter row group using dictionary") {
+    withSQLConf(("parquet.filter.dictionary.enabled", "true")) {
+      // create a table with values from 0, 2, ..., 18 that will be dictionary-encoded
+      withParquetTable((0 until 100).map(i => ((i * 2) % 20, s"data-$i")), "t") {
+        // search for a key that is not present so the dictionary filter eliminates all row groups
+        // Fails without SPARK-24230:
+        //   java.io.IOException: expecting more rows but reached last block. Read 0 out of 50
+        checkAnswer(sql("SELECT _2 FROM t WHERE t._1 = 5"), Seq.empty)
+      }
+    }
+  }
 }
 
 object TestingUDT {
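
Editor's note: the IOException quoted in the test comment comes from the reader running out of filtered row groups while the footer-based count still expects more rows. Below is a self-contained toy simulation of that mechanism; every name in it is illustrative, and none of it is Spark or parquet-mr API.

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

class EndOfRowGroupSketch {
  public static void main(String[] args) throws IOException {
    // The footer promises 50 rows, but dictionary filtering eliminated every
    // row group, so the reader has nothing left to hand back.
    long totalRowCount = 50;                        // footer-based expectation
    List<Long> filteredRowGroups = Collections.emptyList();

    long rowsReturned = 0;
    Iterator<Long> groups = filteredRowGroups.iterator();
    while (rowsReturned < totalRowCount) {
      if (!groups.hasNext()) {
        // Mirrors the failure quoted in the test comment above:
        // "expecting more rows but reached last block. Read 0 out of 50"
        throw new IOException("expecting more rows but reached last block. "
            + "Read " + rowsReturned + " out of " + totalRowCount);
      }
      rowsReturned += groups.next();                // consume one row group
    }
    System.out.println("read " + rowsReturned + " rows");
  }
}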
