-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-19228][SQL] Migrate on Java 8 time from FastDateFormat for meet the ISO8601 #21363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -90,6 +90,7 @@ private[csv] object CSVInferSchema { | |
| // DecimalTypes have different precisions and scales, so we try to find the common type. | ||
| findTightestCommonType(typeSoFar, tryParseDecimal(field, options)).getOrElse(StringType) | ||
| case DoubleType => tryParseDouble(field, options) | ||
| case DateType => tryParseDate(field, options) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this also is a behavior change. Shall we document it?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can do it, but where exactly it should be documented? |
||
| case TimestampType => tryParseTimestamp(field, options) | ||
| case BooleanType => tryParseBoolean(field, options) | ||
| case StringType => StringType | ||
|
|
@@ -140,14 +141,23 @@ private[csv] object CSVInferSchema { | |
| private def tryParseDouble(field: String, options: CSVOptions): DataType = { | ||
| if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field, options)) { | ||
| DoubleType | ||
| } else { | ||
| tryParseDate(field, options) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a behavior change? Previously timestamp type, now date type/timestamp type?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For example, by mistake we have identical "timestampFormat" and "dateFormat" options.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the moment, DateType here is ignored at all, I'm not sure that it was conceived when the type was created |
||
| } | ||
| } | ||
|
|
||
| private def tryParseDate(field: String, options: CSVOptions): DataType = { | ||
| // This case infers a custom `dateFormat` is set. | ||
| if ((allCatch opt options.dateFormatter.parse(field)).isDefined) { | ||
| DateType | ||
| } else { | ||
| tryParseTimestamp(field, options) | ||
| } | ||
| } | ||
|
|
||
| private def tryParseTimestamp(field: String, options: CSVOptions): DataType = { | ||
| // This case infers a custom `dataFormat` is set. | ||
| if ((allCatch opt options.timestampFormat.parse(field)).isDefined) { | ||
| // This case infers a custom `timestampFormat` is set. | ||
| if ((allCatch opt options.timestampFormatter.parse(field)).isDefined) { | ||
| TimestampType | ||
| } else if ((allCatch opt DateTimeUtils.stringToTime(field)).isDefined) { | ||
| // We keep this for backwards compatibility. | ||
|
|
@@ -216,6 +226,9 @@ private[csv] object CSVInferSchema { | |
| } else { | ||
| Some(DecimalType(range + scale, scale)) | ||
| } | ||
| // By design 'TimestampType' (8 bytes) is larger than 'DateType' (4 bytes). | ||
| case (t1: DateType, t2: TimestampType) => Some(TimestampType) | ||
| case (t1: TimestampType, t2: DateType) => Some(TimestampType) | ||
|
|
||
| case _ => None | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,10 +18,10 @@ | |
| package org.apache.spark.sql.execution.datasources.csv | ||
|
|
||
| import java.nio.charset.StandardCharsets | ||
| import java.time.format.{DateTimeFormatter, ResolverStyle} | ||
| import java.util.{Locale, TimeZone} | ||
|
|
||
| import com.univocity.parsers.csv.{CsvParserSettings, CsvWriterSettings, UnescapedQuoteHandling} | ||
| import org.apache.commons.lang3.time.FastDateFormat | ||
|
|
||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql.catalyst.util._ | ||
|
|
@@ -119,7 +119,6 @@ class CSVOptions( | |
| val positiveInf = parameters.getOrElse("positiveInf", "Inf") | ||
| val negativeInf = parameters.getOrElse("negativeInf", "-Inf") | ||
|
|
||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds unrelated change. |
||
| val compressionCodec: Option[String] = { | ||
| val name = parameters.get("compression").orElse(parameters.get("codec")) | ||
| name.map(CompressionCodecs.getCodecClassName) | ||
|
|
@@ -128,13 +127,20 @@ class CSVOptions( | |
| val timeZone: TimeZone = DateTimeUtils.getTimeZone( | ||
| parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)) | ||
|
|
||
| // Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe. | ||
| val dateFormat: FastDateFormat = | ||
| FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), Locale.US) | ||
| val dateFormat: String = parameters.getOrElse("dateFormat", "yyyy-MM-dd") | ||
|
|
||
| val timestampFormat: String = parameters | ||
| .getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX") | ||
|
|
||
| val timestampFormat: FastDateFormat = | ||
| FastDateFormat.getInstance( | ||
| parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), timeZone, Locale.US) | ||
| @transient lazy val dateFormatter: DateTimeFormatter = { | ||
| DateTimeFormatter.ofPattern(dateFormat) | ||
| .withLocale(Locale.US).withZone(timeZone.toZoneId).withResolverStyle(ResolverStyle.SMART) | ||
| } | ||
|
|
||
| @transient lazy val timestampFormatter: DateTimeFormatter = { | ||
| DateTimeFormatter.ofPattern(timestampFormat) | ||
| .withLocale(Locale.US).withZone(timeZone.toZoneId).withResolverStyle(ResolverStyle.SMART) | ||
| } | ||
|
|
||
| val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| timestamp,date | ||
| 26/08/2015 22:31:46.913,27/09/2015 | ||
| 27/10/2014 22:33:31.601,26/12/2016 | ||
| 28/01/2016 22:33:52.888,28/01/2017 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,13 +59,21 @@ class CSVInferSchemaSuite extends SparkFunSuite { | |
| assert(CSVInferSchema.inferField(IntegerType, textValueOne, options) == expectedTypeOne) | ||
| } | ||
|
|
||
| test("Timestamp field types are inferred correctly via custom data format") { | ||
| var options = new CSVOptions(Map("timestampFormat" -> "yyyy-mm"), "GMT") | ||
| test("Timestamp field types are inferred correctly via custom date format") { | ||
| var options = new CSVOptions(Map("timestampFormat" -> "yyyy-MM"), "GMT") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why we need to change this? The format string change is also a behavior change for users. We might need a config for that.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "yyyy-mm" means years and minutes, this is a date format, not a time format |
||
| assert(CSVInferSchema.inferField(TimestampType, "2015-08", options) == TimestampType) | ||
| options = new CSVOptions(Map("timestampFormat" -> "yyyy"), "GMT") | ||
| assert(CSVInferSchema.inferField(TimestampType, "2015", options) == TimestampType) | ||
| } | ||
|
|
||
| test("Date field types are inferred correctly via custom date and timestamp format") { | ||
| val options = new CSVOptions(Map("dateFormat" -> "dd/MM/yyyy", | ||
| "timestampFormat" -> "dd/MM/yyyy HH:mm:ss.SSS"), "GMT") | ||
| assert(CSVInferSchema.inferField(TimestampType, | ||
| "28/01/2017 22:31:46.913", options) == TimestampType) | ||
| assert(CSVInferSchema.inferField(DateType, "16/12/2012", options) == DateType) | ||
| } | ||
|
|
||
| test("Timestamp field types are inferred correctly from other types") { | ||
| val options = new CSVOptions(Map.empty[String, String], "GMT") | ||
| assert(CSVInferSchema.inferField(IntegerType, "2015-08-20 14", options) == StringType) | ||
|
|
@@ -111,7 +119,7 @@ class CSVInferSchemaSuite extends SparkFunSuite { | |
| } | ||
|
|
||
| test("SPARK-18433: Improve DataSource option keys to be more case-insensitive") { | ||
| val options = new CSVOptions(Map("TiMeStampFormat" -> "yyyy-mm"), "GMT") | ||
| val options = new CSVOptions(Map("TiMeStampFormat" -> "yyyy-MM"), "GMT") | ||
| assert(CSVInferSchema.inferField(TimestampType, "2015-08", options) == TimestampType) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,7 @@ | |
| package org.apache.spark.sql.execution.datasources.csv | ||
|
|
||
| import java.math.BigDecimal | ||
| import java.util.Locale | ||
| import java.time.{LocalDate, LocalDateTime} | ||
|
|
||
| import org.apache.spark.SparkFunSuite | ||
| import org.apache.spark.sql.catalyst.util.DateTimeUtils | ||
|
|
@@ -107,20 +107,26 @@ class UnivocityParserSuite extends SparkFunSuite { | |
| assert(parser.makeConverter("_1", BooleanType, options = options).apply("true") == true) | ||
|
|
||
| val timestampsOptions = | ||
| new CSVOptions(Map("timestampFormat" -> "dd/MM/yyyy hh:mm"), "GMT") | ||
| new CSVOptions(Map("timestampFormat" -> "dd/MM/yyyy HH:mm"), "GMT") | ||
| val customTimestamp = "31/01/2015 00:00" | ||
| val expectedTime = timestampsOptions.timestampFormat.parse(customTimestamp).getTime | ||
|
|
||
| val expectedTime = LocalDateTime.parse(customTimestamp, timestampsOptions.timestampFormatter) | ||
| .atZone(options.timeZone.toZoneId) | ||
| .toInstant.toEpochMilli | ||
| val castedTimestamp = | ||
| parser.makeConverter("_1", TimestampType, nullable = true, options = timestampsOptions) | ||
| parser.makeConverter("_1", TimestampType, nullable = true, timestampsOptions) | ||
| .apply(customTimestamp) | ||
| assert(castedTimestamp == expectedTime * 1000L) | ||
|
|
||
| val customDate = "31/01/2015" | ||
| val dateOptions = new CSVOptions(Map("dateFormat" -> "dd/MM/yyyy"), "GMT") | ||
| val expectedDate = dateOptions.dateFormat.parse(customDate).getTime | ||
| val customDate = "31/01/2015" | ||
|
|
||
| val expectedDate = LocalDate.parse(customDate, dateOptions.dateFormatter) | ||
| .atStartOfDay(options.timeZone.toZoneId) | ||
| .toInstant.toEpochMilli | ||
| val castedDate = | ||
| parser.makeConverter("_1", DateType, nullable = true, options = dateOptions) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would keep this line as was.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
| .apply(customTimestamp) | ||
| parser.makeConverter("_1", DateType, nullable = true, dateOptions) | ||
| .apply(customDate) | ||
| assert(castedDate == DateTimeUtils.millisToDays(expectedDate)) | ||
|
|
||
| val timestamp = "2015-01-01 00:00:00" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
1000000L->MICROS_PER_SECOND?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks