[SPARK-26709][SQL] OptimizeMetadataOnlyQuery does not handle empty records correctly #23635
@@ -582,12 +582,14 @@ object SQLConf {
     .createWithDefault(HiveCaseSensitiveInferenceMode.INFER_AND_SAVE.toString)

   val OPTIMIZER_METADATA_ONLY = buildConf("spark.sql.optimizer.metadataOnly")
+    .internal()
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
       "to produce the partition columns instead of table scans. It applies when all the columns " +
       "scanned are partition columns and the query has an aggregate operator that satisfies " +
-      "distinct semantics.")
+      "distinct semantics. By default the optimization is disabled, since it may return " +
+      "incorrect results when the files are empty.")
     .booleanConf
-    .createWithDefault(true)
+    .createWithDefault(false)
Member: How about making this conf internal and always printing a warning message when this flag is enabled and the rule is applied?
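A minimal sketch of that suggestion, assuming the code sits inside Spark's catalyst module next to the real rule; the object name `WarnOnMetadataOnlyQuery` is invented for illustration, and the actual rewrite done by `OptimizeMetadataOnlyQuery` is elided:

```scala
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Hypothetical stand-in rule that only shows where the warning could be emitted.
object WarnOnMetadataOnlyQuery extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = {
    // The real OptimizeMetadataOnlyQuery would rewrite eligible aggregates here;
    // logWarning is inherited from the Logging trait that Rule mixes in.
    logWarning("spark.sql.optimizer.metadataOnly is enabled and the metadata-only " +
      "rewrite was applied; results may be wrong if the underlying files are empty.")
    plan
  }
}
```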
   val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord")
     .doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " +
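For reference, this is roughly how a user could opt back in to the optimization after this change; the snippet is illustrative only, and the table/column names (`t`, `p1`) are borrowed from the new test below rather than from any real dataset:

```scala
// Explicitly re-enable the metadata-only rewrite (now disabled by default).
spark.conf.set("spark.sql.optimizer.metadataOnly", "true")

// A query that scans only partition columns and aggregates with distinct semantics
// (e.g. MAX) is eligible for the rewrite. With the flag on, a partition that holds
// only empty files can still contribute its partition value to the result, which is
// exactly the wrong answer this PR guards against.
spark.sql("SELECT MAX(p1) FROM t").show()
```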
@@ -2966,6 +2966,43 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") {
Contributor: Also add a case that illustrates that …
+    Seq(true, false).foreach { enableOptimizeMetadataOnlyQuery =>
+      withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> enableOptimizeMetadataOnlyQuery.toString) {
+        withTable("t") {
+          sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)")
+          sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)")
+          if (enableOptimizeMetadataOnlyQuery) {
+            // The result is wrong if we enable the configuration.
+            checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(5))
+          } else {
+            checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null))
+          }
+          checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null))
+        }
+
+        withTempPath { path =>
+          val tabLocation = path.getCanonicalPath
+          val partLocation1 = tabLocation + "/p=3"
+          val partLocation2 = tabLocation + "/p=1"
+          // SPARK-23271 empty RDD when saved should write a metadata only file
+          val df = spark.emptyDataFrame.select(lit(1).as("col"))
+          df.write.parquet(partLocation1)
+          val df2 = spark.range(10).toDF("col")
+          df2.write.parquet(partLocation2)
+          val readDF = spark.read.parquet(tabLocation)
+          if (enableOptimizeMetadataOnlyQuery) {
+            // The result is wrong if we enable the configuration.
+            checkAnswer(readDF.selectExpr("max(p)"), Row(3))
+          } else {
+            checkAnswer(readDF.selectExpr("max(p)"), Row(1))
+          }
+          checkAnswer(readDF.selectExpr("max(col)"), Row(9))
+        }
+      }
+    }
+  }
 }
 case class Foo(bar: Option[String])
The configuration is not only about Parquet. Since it is disabled, and to avoid confusing users, I think we can just remove the doc here.