[SPARK-25207][SQL] Case-insensitive field resolution for filter pushdown when reading Parquet #22197
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala

@@ -25,6 +25,7 @@ import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operato
 import org.apache.parquet.filter2.predicate.FilterApi._
 import org.apache.parquet.filter2.predicate.Operators.{Column => _, _}

+import org.apache.spark.SparkException
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
@@ -60,7 +61,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
   private lazy val parquetFilters =
     new ParquetFilters(conf.parquetFilterPushDownDate, conf.parquetFilterPushDownTimestamp,
       conf.parquetFilterPushDownDecimal, conf.parquetFilterPushDownStringStartWith,
-      conf.parquetFilterPushDownInFilterThreshold)
+      conf.parquetFilterPushDownInFilterThreshold, conf.caseSensitiveAnalysis)

   override def beforeEach(): Unit = {
     super.beforeEach()
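For context on what the new `caseSensitive` flag has to do: below is a minimal, hypothetical sketch of case-insensitive field resolution over a Parquet schema. The `ParquetField` type, method names, and wiring are illustrative assumptions for this note, not Spark's actual implementation.

import java.util.Locale

// Hypothetical stand-in for a Parquet field: just a name and a type tag.
case class ParquetField(fieldName: String, fieldType: String)

// Build the lookup map once. In case-insensitive mode the keys are
// lowercased, and a name shared by several fields maps to None so that
// ambiguous filters are never pushed down.
def buildNameMap(
    fields: Seq[ParquetField],
    caseSensitive: Boolean): Map[String, Option[ParquetField]] = {
  if (caseSensitive) {
    fields.map(f => f.fieldName -> Option(f)).toMap
  } else {
    fields.groupBy(_.fieldName.toLowerCase(Locale.ROOT)).map {
      case (lowerName, Seq(single)) => lowerName -> Some(single)
      case (lowerName, _)           => lowerName -> None
    }
  }
}

// Resolve a filter's column name against the map built above.
def resolve(
    nameMap: Map[String, Option[ParquetField]],
    name: String,
    caseSensitive: Boolean): Option[ParquetField] = {
  val key = if (caseSensitive) name else name.toLowerCase(Locale.ROOT)
  nameMap.get(key).flatten
}

Mapping a colliding lowercased name to None, rather than picking one field arbitrarily, is what lets a `createFilter`-style method return None for ambiguous columns, matching the dedup assertion in the new test below.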
|
@@ -1021,6 +1022,118 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
       }
     }
   }

+  test("SPARK-25207: Case-insensitive field resolution for pushdown when reading parquet") {
+    def createParquetFilter(caseSensitive: Boolean): ParquetFilters = {
+      new ParquetFilters(conf.parquetFilterPushDownDate, conf.parquetFilterPushDownTimestamp,
+        conf.parquetFilterPushDownDecimal, conf.parquetFilterPushDownStringStartWith,
+        conf.parquetFilterPushDownInFilterThreshold, caseSensitive)
+    }
+    val caseSensitiveParquetFilters = createParquetFilter(caseSensitive = true)
+    val caseInsensitiveParquetFilters = createParquetFilter(caseSensitive = false)
+
+    def testCaseInsensitiveResolution(
+        schema: StructType,
+        expected: FilterPredicate,
+        filter: sources.Filter): Unit = {
+      val parquetSchema = new SparkToParquetSchemaConverter(conf).convert(schema)
+
+      assertResult(Some(expected)) {
+        caseInsensitiveParquetFilters.createFilter(parquetSchema, filter)
+      }
+      assertResult(None) {
+        caseSensitiveParquetFilters.createFilter(parquetSchema, filter)
+      }
+    }
+
+    val schema = StructType(Seq(StructField("cint", IntegerType)))
+
+    testCaseInsensitiveResolution(
+      schema, FilterApi.eq(intColumn("cint"), null.asInstanceOf[Integer]), sources.IsNull("CINT"))
+
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.notEq(intColumn("cint"), null.asInstanceOf[Integer]),
+      sources.IsNotNull("CINT"))
+
+    testCaseInsensitiveResolution(
+      schema, FilterApi.eq(intColumn("cint"), 1000: Integer), sources.EqualTo("CINT", 1000))
+
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.notEq(intColumn("cint"), 1000: Integer),
+      sources.Not(sources.EqualTo("CINT", 1000)))
+
+    testCaseInsensitiveResolution(
+      schema, FilterApi.eq(intColumn("cint"), 1000: Integer), sources.EqualNullSafe("CINT", 1000))
+
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.notEq(intColumn("cint"), 1000: Integer),
+      sources.Not(sources.EqualNullSafe("CINT", 1000)))
+
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.lt(intColumn("cint"), 1000: Integer), sources.LessThan("CINT", 1000))
+
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.ltEq(intColumn("cint"), 1000: Integer),
+      sources.LessThanOrEqual("CINT", 1000))
+
+    testCaseInsensitiveResolution(
+      schema, FilterApi.gt(intColumn("cint"), 1000: Integer), sources.GreaterThan("CINT", 1000))
+
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.gtEq(intColumn("cint"), 1000: Integer),
+      sources.GreaterThanOrEqual("CINT", 1000))
Member:
nit: maybe we don't need to test against so many predicates. We just want to make sure case-insensitive resolution works.
Contributor (Author):
Each test corresponds to a one-line code change in `ParquetFilters.scala`; all the tests together cover all of my changes in `ParquetFilters.scala`.
+    testCaseInsensitiveResolution(
+      schema,
+      FilterApi.or(
+        FilterApi.eq(intColumn("cint"), 10: Integer),
+        FilterApi.eq(intColumn("cint"), 20: Integer)),
+      sources.In("CINT", Array(10, 20)))
+
+    val dupFieldSchema = StructType(
+      Seq(StructField("cint", IntegerType), StructField("cINT", IntegerType)))
+    val dupParquetSchema = new SparkToParquetSchemaConverter(conf).convert(dupFieldSchema)
+    assertResult(None) {
+      caseInsensitiveParquetFilters.createFilter(
+        dupParquetSchema, sources.EqualTo("CINT", 1000))
+    }
|
Member:
Can we add one negative test for duplicate names in case-insensitive mode, for example, `cint` and `cINT`?

Contributor (Author):
Added, thanks!
+  }
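The dedup check above pairs with a read-side error whose message the next test asserts on. A hedged sketch of how such a duplicate-field check could produce that message — the helper name and wiring are assumptions; only the message format follows the test's expectation:

// Hypothetical duplicate-field check on the read side. Throws when a
// required column matches more than one physical Parquet column.
def checkDuplicateFields(requiredName: String, parquetFieldNames: Seq[String]): Unit = {
  val matched = parquetFieldNames.filter(_.equalsIgnoreCase(requiredName))
  if (matched.size > 1) {
    // e.g. requiredName = "B" against file fields Seq("A", "B", "b")
    throw new RuntimeException(
      s"""Found duplicate field(s) "$requiredName": [${matched.mkString(", ")}] in case-insensitive mode""")
  }
}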
|
|
+
+  test("SPARK-25207: exception when duplicate fields in case-insensitive mode") {
+    withTempPath { dir =>
+      val count = 10
+      val tableName = "spark_25207"
+      val tableDir = dir.getAbsoluteFile + "/table"
+      withTable(tableName) {
+        withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
+          spark.range(count).selectExpr("id as A", "id as B", "id as b")
+            .write.mode("overwrite").parquet(tableDir)
+        }
+        sql(
+          s"""
+             |CREATE TABLE $tableName (A LONG, B LONG) USING PARQUET LOCATION '$tableDir'
+           """.stripMargin)
+
+        withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
+          val e = intercept[SparkException] {
+            sql(s"select a from $tableName where b > 0").collect()
Contributor:
Can we read this table with case-sensitive mode?

Contributor (Author):
Yes, we can; see below. Let me add one test case.

Member:
nit: to be consistent with the following query, I'd make this query as …
+          }
+          assert(e.getCause.isInstanceOf[RuntimeException] && e.getCause.getMessage.contains(
+            """Found duplicate field(s) "B": [B, b] in case-insensitive mode"""))
+        }
+
+        withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
+          checkAnswer(sql(s"select A from $tableName where B > 0"), (1 until count).map(Row(_)))
+        }
+      }
+    }
+  }

 class NumRowGroupsAcc extends AccumulatorV2[Integer, Integer] {
||
If we don't need to consider ambiguity, can't we just lowercase `f.getName` above instead of doing dedup here?
It is a good question! Consider the following scenario: without dedup, we might push down "a > 0" instead of "A > 0". Although that is wrong, it will still trigger the exception eventually when reading the Parquet file, so with or without dedup we get the same result. @cloud-fan, @gatorsmile, any idea?
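To spell the ambiguity out before the replies below: with physical columns `A` and `a` in the file, lowercase-only resolution cannot tell which column the query meant. A tiny illustrative sketch (the values are hypothetical):

// File has two physical columns that collide case-insensitively.
val fileColumns = Seq("A", "a")

// Lowercase-only resolution: both columns map to the key "a", and a plain
// toMap silently keeps the last entry, so a filter on "A" may be pushed
// down against the wrong physical column.
val lowercased = fileColumns.map(c => c.toLowerCase -> c).toMap
assert(lowercased("a") == "a") // the query's "A" was silently shadowed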
Can we do the dedup before Parquet filter pushdown and Parquet column pruning? Then we can simplify the code in both cases.
ping @yucai
@cloud-fan, it is a great idea, thanks! Though I think it is not quite a "dedup" before pushdown and pruning: maybe we should clip the Parquet schema before pushdown and pruning. If duplicate fields are detected, throw the exception; if not, pass the clipped Parquet schema to the Parquet library via the Hadoop conf. I am trying this way and will update soon.
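To make that proposal concrete, here is a rough, hypothetical sketch of clipping a Parquet schema against the requested Catalyst fields once, before both pushdown and pruning. All names are illustrative, and this is not the code that eventually landed:

// Hypothetical schema clipping: field names stand in for the real
// Catalyst/Parquet schema types.
def clipParquetSchema(
    requestedFieldNames: Seq[String],
    fileFieldNames: Seq[String],
    caseSensitive: Boolean): Seq[String] = {
  if (caseSensitive) {
    // Exact-name match only.
    fileFieldNames.filter(requestedFieldNames.contains)
  } else {
    requestedFieldNames.flatMap { requested =>
      val matched = fileFieldNames.filter(_.equalsIgnoreCase(requested))
      if (matched.size > 1) {
        // Fail fast instead of guessing which physical column was meant.
        throw new RuntimeException(
          s"""Found duplicate field(s) "$requested": [${matched.mkString(", ")}] in case-insensitive mode""")
      }
      matched.headOption
    }
  }
}

The clipped result could then be handed to the Parquet reader (e.g. via the Hadoop conf, as the comment suggests), so that filter pushdown and column pruning both see a single, unambiguous schema.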