[SPARK-12355][SQL] Implement unhandledFilter interface for Parquet #10502
Changes from all commits
```diff
@@ -141,6 +141,11 @@ private[sql] class ParquetRelation(
       .map(_.toBoolean)
       .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED))
 
+  // When merging schemas is enabled and the column of the given filter does not exist,
+  // Parquet emits an exception which is an issue of Parquet (PARQUET-389).
+  private val safeParquetFilterPushDown =
+    sqlContext.conf.parquetFilterPushDown && !shouldMergeSchemas
+
   private val mergeRespectSummaries =
     sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES)
```
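For context, the hazard this guard avoids can be sketched as follows. Everything here (paths, data, session setup) is a hypothetical illustration of PARQUET-389, not code from the PR:

```scala
// Hypothetical repro, assuming a Spark 1.6-era sqlContext is in scope.
// Two part-files with different schemas: only the second contains column `b`.
sqlContext.range(10).selectExpr("id AS a").write.parquet("/tmp/t/part=1")
sqlContext.range(10).selectExpr("id AS a", "id AS b").write.parquet("/tmp/t/part=2")

val df = sqlContext.read.option("mergeSchema", "true").parquet("/tmp/t")
// The merged schema contains `b`, but pushing `b > 0` down into the file that
// lacks `b` makes Parquet throw (PARQUET-389). With safeParquetFilterPushDown,
// push-down is simply skipped whenever shouldMergeSchemas is true.
df.filter("b > 0").count()
```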
```diff
@@ -300,13 +305,26 @@ private[sql] class ParquetRelation(
     }
   }
 
+  override def unhandledFilters(filters: Array[Filter]): Array[Filter] = {
+    // The unsafe row RecordReader does not support row by row filtering so for this case
+    // it should wrap this with Spark-side filtering.
+    val enableUnsafeRowParquetReader =
+      sqlContext.getConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key).toBoolean
```
**Member:** Since …

**Author:** Hm.. I think we can implement this for now and then simply change the condition when … The option for Parquet filter push-down was also false by default, but it was implemented. I don't think a default value can decide whether a feature should be implemented or not.
```diff
+    val shouldHandleFilters = safeParquetFilterPushDown && !enableUnsafeRowParquetReader
+    if (shouldHandleFilters) {
+      filters.filter(ParquetFilters.createFilter(dataSchema, _).isEmpty)
+    } else {
+      filters
+    }
+  }
+
   override def buildInternalScan(
       requiredColumns: Array[String],
       filters: Array[Filter],
       inputFiles: Array[FileStatus],
       broadcastedConf: Broadcast[SerializableConfiguration]): RDD[InternalRow] = {
     val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA)
-    val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
+    val parquetFilterPushDown = safeParquetFilterPushDown
     val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
     val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
```
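The contract of `unhandledFilters` is that whatever it returns is re-evaluated by Spark on top of the scan, so returning a filter is always safe, merely redundant. A self-contained sketch of the same pattern, using hypothetical stand-in types rather than Spark's actual `Filter` hierarchy:

```scala
// Hypothetical stand-ins for org.apache.spark.sql.sources.Filter.
sealed trait SimpleFilter
case class GreaterThan(attribute: String, value: Int) extends SimpleFilter
case class StringContains(attribute: String, value: String) extends SimpleFilter

// Pretend this source can only translate numeric comparisons, mirroring how
// ParquetFilters.createFilter returns None for predicates it cannot convert.
def createFilter(f: SimpleFilter): Option[String] = f match {
  case GreaterThan(a, v) => Some(s"$a > $v")
  case _                 => None
}

// As in the PR: a filter is "unhandled" exactly when it cannot be converted.
def unhandledFilters(filters: Array[SimpleFilter]): Array[SimpleFilter] =
  filters.filter(f => createFilter(f).isEmpty)

val remaining = unhandledFilters(
  Array[SimpleFilter](GreaterThan("a", 1), StringContains("b", "x")))
assert(remaining.sameElements(Array[SimpleFilter](StringContains("b", "x"))))
```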
```diff
@@ -576,6 +594,15 @@ private[sql] object ParquetRelation extends Logging {
     conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName)
 
+    // Try to push down filters when filter push-down is enabled.
+    val safeRequiredColumns = if (parquetFilterPushDown) {
+      val referencedColumns = filters
+        // Collects all columns referenced in Parquet filter predicates.
+        .flatMap(filter => ParquetFilters.referencedColumns(dataSchema, filter))
+      (requiredColumns ++ referencedColumns).distinct
+    } else {
+      requiredColumns
+    }
+
     if (parquetFilterPushDown) {
       filters
         // Collects all converted Parquet filter predicates. Notice that not all predicates can be
```
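`ParquetFilters.referencedColumns` is introduced by this PR, but its body is not shown in the diff. As a hedged sketch, such a helper plausibly walks Spark's public `Filter` tree and collects attribute names; the simplified version below drops the `dataSchema` parameter the real one takes (assumes a Spark 1.6-era spark-sql on the classpath):

```scala
import org.apache.spark.sql.sources._

// Sketch only: collect every attribute name mentioned in a source filter tree.
def referencedColumns(filter: Filter): Seq[String] = filter match {
  case EqualTo(attr, _)            => Seq(attr)
  case GreaterThan(attr, _)        => Seq(attr)
  case GreaterThanOrEqual(attr, _) => Seq(attr)
  case LessThan(attr, _)           => Seq(attr)
  case LessThanOrEqual(attr, _)    => Seq(attr)
  case In(attr, _)                 => Seq(attr)
  case IsNull(attr)                => Seq(attr)
  case IsNotNull(attr)             => Seq(attr)
  case And(left, right)            => referencedColumns(left) ++ referencedColumns(right)
  case Or(left, right)             => referencedColumns(left) ++ referencedColumns(right)
  case Not(child)                  => referencedColumns(child)
  case _                           => Seq.empty // e.g. string predicates this sketch ignores
}
```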
```diff
@@ -587,7 +614,7 @@ private[sql] object ParquetRelation extends Logging {
     }
 
     conf.set(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA, {
-      val requestedSchema = StructType(requiredColumns.map(dataSchema(_)))
+      val requestedSchema = StructType(safeRequiredColumns.map(dataSchema(_)))
       CatalystSchemaConverter.checkFieldNames(requestedSchema).json
     })
```
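To see why the swap matters, consider a hypothetical query `SELECT a FROM t WHERE b > 0`: column pruning alone would request only `a`, yet the pushed-down predicate needs `b` present in the requested Parquet schema. The union computed above then amounts to:

```scala
// Hypothetical values for SELECT a FROM t WHERE b > 0:
val requiredColumns = Array("a")    // from column pruning
val referencedColumns = Array("b")  // columns mentioned in pushed-down predicates
val safeRequiredColumns = (requiredColumns ++ referencedColumns).distinct
// Array("a", "b"): the scan now also reads b, so Parquet can evaluate b > 0.
```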
**Member:** Better to add `private[parquet]`?

**Author:** Oh, yes, it looks so. I think I might also have to change `createFilter()` in that way; I just followed the style of `createFilter()` for this function, since both `createFilter()` and `referencedColumns()` are called in the same places.

**Member:** Agreed.