[SPARK-39731][SQL] Fix issue in CSV and JSON data sources when parsing dates in "yyyyMMdd" format with CORRECTED time parser policy #37147
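For context, here is a minimal reproduction sketch of the behavior this PR targets, distilled from the new tests below. The session setup, file path, and input rows are assumptions for illustration, not part of the change:

```scala
// Hypothetical reproduction (path and data are illustrative, taken from
// the tests added in this PR). Under the CORRECTED time parser policy,
// "20201203" matches the yyyyMMdd pattern and should parse to 2020-12-03,
// while "2020011" does not match and should yield null rather than the
// lenient legacy result (2020-01-01).
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.conf.set("spark.sql.legacy.timeParserPolicy", "CORRECTED")

val df = spark.read
  .schema("id INT, date DATE")
  .option("dateFormat", "yyyyMMdd")
  .csv("/tmp/dates.csv") // assumed contents: "1,2020011" and "2,20201203"

df.show()
```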
Changes from all commits
File: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

```diff
@@ -36,7 +36,7 @@ import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.logging.log4j.Level
 
-import org.apache.spark.{SparkConf, SparkException, TestUtils}
+import org.apache.spark.{SparkConf, SparkException, SparkUpgradeException, TestUtils}
 import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Encoders, QueryTest, Row}
 import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils}
 import org.apache.spark.sql.execution.datasources.CommonFileDataSourceSuite
```
```diff
@@ -2837,7 +2837,80 @@ abstract class CSVSuite
         )
         assert(results.collect().toSeq.map(_.toSeq) == expected)
       }
     }
   }
+
+  test("SPARK-39731: Correctly parse dates and timestamps with yyyyMMdd pattern") {
+    withTempPath { path =>
+      Seq(
+        "1,2020011,2020011",
+        "2,20201203,20201203").toDF()
+        .repartition(1)
+        .write.text(path.getAbsolutePath)
+      val schema = new StructType()
+        .add("id", IntegerType)
+        .add("date", DateType)
+        .add("ts", TimestampType)
+      val output = spark.read
+        .schema(schema)
+        .option("dateFormat", "yyyyMMdd")
+        .option("timestampFormat", "yyyyMMdd")
+        .csv(path.getAbsolutePath)
+
+      def check(mode: String, res: Seq[Row]): Unit = {
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> mode) {
+          checkAnswer(output, res)
+        }
+      }
+
+      check(
+        "legacy",
+        Seq(
+          Row(1, Date.valueOf("2020-01-01"), Timestamp.valueOf("2020-01-01 00:00:00")),
+          Row(2, Date.valueOf("2020-12-03"), Timestamp.valueOf("2020-12-03 00:00:00"))
+        )
+      )
+
+      check(
+        "corrected",
+        Seq(
+          Row(1, null, null),
+          Row(2, Date.valueOf("2020-12-03"), Timestamp.valueOf("2020-12-03 00:00:00"))
+        )
+      )
```

Review thread on lines +2866 to +2879:

Contributor: For completeness, would you consider adding a check for the case covered in spark/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala, lines 2598 to 2601 (at commit 1193ce7)?

Author (Contributor): Done!

```diff
+
+      val err = intercept[SparkException] {
+        check("exception", Nil)
+      }.getCause
+      assert(err.isInstanceOf[SparkUpgradeException])
+    }
+  }
```
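As a side note on the three modes the `check(...)` helper cycles through: they correspond to the `spark.sql.legacy.timeParserPolicy` configuration exposed by `SQLConf.LEGACY_TIME_PARSER_POLICY`. A short sketch of how a user would select each; the comments summarize, roughly, the behavior the test asserts:

```scala
// LEGACY: pre-Spark-3.0 SimpleDateFormat parsing; "2020011" is leniently
// parsed as 2020-01-01, matching the test's legacy expectation.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

// CORRECTED: strict java.time parsing; values that do not match the
// pattern come back as null.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "CORRECTED")

// EXCEPTION: raise SparkUpgradeException where the two policies would
// give different results, as the intercept[SparkException] block asserts.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "EXCEPTION")
```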
```diff
+
+  test("SPARK-39731: Handle date and timestamp parsing fallback") {
+    withTempPath { path =>
+      Seq("2020-01-01,2020-01-01").toDF()
+        .repartition(1)
+        .write.text(path.getAbsolutePath)
+      val schema = new StructType()
+        .add("date", DateType)
+        .add("ts", TimestampType)
+
+      def output(enableFallback: Boolean): DataFrame = spark.read
+        .schema(schema)
+        .option("dateFormat", "invalid")
+        .option("timestampFormat", "invalid")
+        .option("enableDateTimeParsingFallback", enableFallback)
+        .csv(path.getAbsolutePath)
+
+      checkAnswer(
+        output(enableFallback = true),
+        Seq(Row(Date.valueOf("2020-01-01"), Timestamp.valueOf("2020-01-01 00:00:00")))
+      )
+
+      checkAnswer(
+        output(enableFallback = false),
+        Seq(Row(null, null))
+      )
```

Review thread on the fallback-disabled expectation:

Contributor: Sorry, I'm a bit confused. Why does date parsing fail here?

Contributor: Ah, because the format pattern is given but invalid.

```diff
+    }
+  }
 }
```
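For completeness, a sketch of how the `enableDateTimeParsingFallback` option exercised above reads from the user side; the path and data are illustrative assumptions:

```scala
// With fallback disabled, values that do not match the configured
// (here deliberately invalid) pattern come back as null instead of
// being rescued by the default parsers; with it enabled, the ISO
// string still parses, mirroring the test expectations above.
val strict = spark.read
  .schema("date DATE, ts TIMESTAMP")
  .option("dateFormat", "invalid")
  .option("timestampFormat", "invalid")
  .option("enableDateTimeParsingFallback", "false")
  .csv("/tmp/input.csv") // assumed contents: "2020-01-01,2020-01-01"

strict.show() // expected per the test: [null, null]
```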