Closed

Commits (32)
74a76c2
New and legacy time parser
MaxGekk Nov 24, 2018
63cf611
Add config spark.sql.legacy.timeParser.enabled
MaxGekk Nov 24, 2018
2a2ab83
Fallback legacy parser
MaxGekk Nov 24, 2018
667bf9f
something
MaxGekk Nov 24, 2018
227a7bd
Using instances
MaxGekk Nov 25, 2018
73ee560
Added generator
MaxGekk Nov 25, 2018
f35f6e1
Refactoring of TimeFormatter
MaxGekk Nov 25, 2018
1c09b58
Renaming to DateTimeFormatter
MaxGekk Nov 25, 2018
7b213d5
Added DateFormatter
MaxGekk Nov 25, 2018
242ba47
Default values in parsing
MaxGekk Nov 25, 2018
db48ee6
Parse as date type because format for timestamp is not matched to…
MaxGekk Nov 25, 2018
e18841b
Fix tests
MaxGekk Nov 25, 2018
8db0238
CSVSuite passed
MaxGekk Nov 25, 2018
0b9ed92
Fix imports
MaxGekk Nov 26, 2018
799ebb3
Revert test back
MaxGekk Nov 26, 2018
5a22391
Set timeZone
MaxGekk Nov 26, 2018
f287b77
Removing default for micros because it causes conflicts in parsing
MaxGekk Nov 26, 2018
52074f7
Set timezone otherwise default is used
MaxGekk Nov 27, 2018
647b09c
Removing CSVOptions param from CsvInferSchema methods
MaxGekk Nov 29, 2018
4d6c86b
Use constants
MaxGekk Nov 29, 2018
6552dcf
Merge remote-tracking branch 'origin/master' into time-parser
MaxGekk Nov 30, 2018
f3f46c7
Merging followup
MaxGekk Nov 30, 2018
3f3ca70
Updating the migration guide
MaxGekk Dec 1, 2018
1dd9ed1
Inlining method's arguments
MaxGekk Dec 1, 2018
83bf58b
Additional fallback
MaxGekk Dec 1, 2018
00509d3
Removing unrelated changes
MaxGekk Dec 1, 2018
9b0570e
Merge remote-tracking branch 'origin/master' into time-parser
MaxGekk Dec 2, 2018
e9d6bb0
Using floorDiv to take days from seconds
MaxGekk Dec 2, 2018
1ad1184
A test for roundtrip timestamp parsing
MaxGekk Dec 2, 2018
f8097b4
Tests for DateTimeFormatter
MaxGekk Dec 2, 2018
3848795
Fix typo
MaxGekk Dec 3, 2018
60c0974
Merge remote-tracking branch 'fork/time-parser' into time-parser
MaxGekk Dec 3, 2018
docs/sql-migration-guide-upgrade.md (2 additions & 0 deletions)
@@ -33,6 +33,8 @@ displayTitle: Spark SQL Upgrading Guide

- Spark applications which are built with Spark version 2.4 and prior, and call methods of `UserDefinedFunction`, need to be re-compiled with Spark 3.0, as they are not binary compatible with Spark 3.0.

- Since Spark 3.0, the CSV datasource uses the java.time API for parsing and generating CSV content. The new formatting implementation supports date/timestamp patterns that conform to ISO 8601. To switch back to the implementation used in Spark 2.4 and earlier, set `spark.sql.legacy.timeParser.enabled` to `true`.
@HyukjinKwon (Member), Dec 17, 2018:

@MaxGekk, can you check if this legacy configuration works or not?

I checked it as below:

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 2b8d22dde92..08795972fb7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -26,6 +26,7 @@ import scala.util.Try

 import org.apache.commons.lang3.time.FastDateFormat

+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.internal.SQLConf

 sealed trait TimestampFormatter {
@@ -112,11 +113,13 @@ class LegacyFallbackTimestampFormatter(
   }
 }

-object TimestampFormatter {
+object TimestampFormatter extends Logging {
   def apply(format: String, timeZone: TimeZone, locale: Locale): TimestampFormatter = {
     if (SQLConf.get.legacyTimeParserEnabled) {
+      logError("LegacyFallbackTimestampFormatter is being used")
       new LegacyFallbackTimestampFormatter(format, timeZone, locale)
     } else {
+      logError("Iso8601TimestampFormatter is being used")
       new Iso8601TimestampFormatter(format, timeZone, locale)
     }
   }
$ ./bin/spark-shell --conf spark.sql.legacy.timeParser.enabled=true
scala> spark.conf.get("spark.sql.legacy.timeParser.enabled")
res0: String = true

scala> Seq("2010|10|10").toDF.repartition(1).write.mode("overwrite").text("/tmp/foo")

scala> spark.read.option("inferSchema", "true").option("header", "false").option("timestampFormat", "yyyy|MM|dd").csv("/tmp/foo").printSchema()
18/12/17 12:11:47 ERROR TimestampFormatter: Iso8601TimestampFormatter is being used
root
 |-- _c0: timestamp (nullable = true)

Member:

adding @cloud-fan

Contributor:

I'm surprised it doesn't work, as this pattern of using SQLConf appears in many places.

Can you create a ticket for it? Is this only a problem when setting conf via spark shell?

@MaxGekk (Member, Author):

The flag definitely switches the behavior, since I used it in a test recently: https://github.com/apache/spark/pull/23196/files#diff-fde14032b0e6ef8086461edf79a27c5dR1454

Member:

Yea, I saw the test, but weirdly it doesn't work in the shell. Do you mind if I check it as I did? Something is weird. I want to be very sure whether it's an issue or something I did wrong myself.

@MaxGekk (Member, Author):

I see the same, but it is interesting that:

scala> spark.range(1).map(_ => org.apache.spark.sql.internal.SQLConf.get.legacyTimeParserEnabled).show
+-----+
|value|
+-----+
| true|
+-----+

It seems that when an instance of CSVInferSchema is created, SQL configs haven't been set on the executor side yet.

Member:

Ah, good that I wasn't doing something stupid alone. Let's file a JIRA.


Member:

We removed this conf in the follow-up PRs?

Member:

#23495 is the PR that removed the conf.
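
The thread above comes down to conf-propagation timing: `SQLConf.get` read on an executor can still return the default value if the session conf set in the shell has not yet been propagated to that executor's thread. A minimal sketch of one way to sidestep this, assuming a running `spark` session (an illustration only, not the fix adopted here): read the flag once on the driver and let the closure capture the plain Boolean.

import spark.implicits._

// Driver side: read the flag once; the closure captures the plain Boolean,
// so executors do not consult SQLConf.get at task time.
val legacyEnabled =
  spark.conf.get("spark.sql.legacy.timeParser.enabled", "false").toBoolean
spark.range(1).map(_ => legacyEnabled).show()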


## Upgrading From Spark SQL 2.3 to 2.4

- In Spark version 2.3 and earlier, the second parameter to the `array_contains` function is implicitly promoted to the element type of the first array type parameter. This type promotion can be lossy and may cause the `array_contains` function to return a wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some changes in behavior, which are illustrated in the table below.
CSVInferSchema.scala
@@ -22,10 +22,16 @@ import scala.util.control.Exception.allCatch
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.analysis.TypeCoercion
import org.apache.spark.sql.catalyst.expressions.ExprUtils
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.catalyst.util.DateTimeFormatter
import org.apache.spark.sql.types._

class CSVInferSchema(options: CSVOptions) extends Serializable {
class CSVInferSchema(val options: CSVOptions) extends Serializable {
Contributor:

Since we get the CSVOptions in the constructor, shall we remove it as a parameter of the several methods? It is pretty confusing which one is used right now...


@transient
private lazy val timeParser = DateTimeFormatter(
options.timestampFormat,
options.timeZone,
options.locale)

private val decimalParser = {
ExprUtils.getDecimalParser(options.locale)
@@ -154,10 +160,7 @@ class CSVInferSchema(options: CSVOptions) extends Serializable {

private def tryParseTimestamp(field: String): DataType = {
// This case infers a custom `dataFormat` is set.
if ((allCatch opt options.timestampFormat.parse(field)).isDefined) {
TimestampType
} else if ((allCatch opt DateTimeUtils.stringToTime(field)).isDefined) {
// We keep this for backwards compatibility.
if ((allCatch opt timeParser.parse(field)).isDefined) {
TimestampType
} else {
tryParseBoolean(field)
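A note on the `@transient private lazy val timeParser` above: `CSVInferSchema` is `Serializable` and shipped to executors, while formatter instances generally are not serializable, so the field is excluded from serialization and rebuilt lazily in each JVM that deserializes the object. A minimal sketch of the pattern under that assumption (the class and names here are illustrative, not from the PR):

import java.time.format.{DateTimeFormatter => JavaFormatter}
import scala.util.Try

class FieldInferencer(pattern: String) extends Serializable {
  // Not written out during serialization; re-created on first use per JVM.
  @transient private lazy val formatter = JavaFormatter.ofPattern(pattern)

  def looksLikeTimestamp(field: String): Boolean =
    Try(formatter.parse(field)).isSuccess
}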
CSVOptions.scala
@@ -21,7 +21,6 @@ import java.nio.charset.StandardCharsets
import java.util.{Locale, TimeZone}

import com.univocity.parsers.csv.{CsvParserSettings, CsvWriterSettings, UnescapedQuoteHandling}
import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util._
@@ -146,13 +145,10 @@ class CSVOptions(
// A language tag in IETF BCP 47 format
val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US)

// Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe.
val dateFormat: FastDateFormat =
FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), locale)
val dateFormat: String = parameters.getOrElse("dateFormat", "yyyy-MM-dd")

val timestampFormat: FastDateFormat =
FastDateFormat.getInstance(
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), timeZone, locale)
val timestampFormat: String =
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")

val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)

UnivocityGenerator.scala
@@ -22,7 +22,7 @@ import java.io.Writer
import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeFormatter}
import org.apache.spark.sql.types._

class UnivocityGenerator(
@@ -41,14 +41,18 @@ class UnivocityGenerator(
private val valueConverters: Array[ValueConverter] =
schema.map(_.dataType).map(makeConverter).toArray

private val timeFormatter = DateTimeFormatter(
options.timestampFormat,
options.timeZone,
options.locale)
private val dateFormatter = DateFormatter(options.dateFormat, options.timeZone, options.locale)

private def makeConverter(dataType: DataType): ValueConverter = dataType match {
case DateType =>
(row: InternalRow, ordinal: Int) =>
options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))
(row: InternalRow, ordinal: Int) => dateFormatter.format(row.getInt(ordinal))

case TimestampType =>
(row: InternalRow, ordinal: Int) =>
options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))
(row: InternalRow, ordinal: Int) => timeFormatter.format(row.getLong(ordinal))

case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

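The converters above operate on Spark's internal representations: a `DateType` value is the number of days since the epoch (an Int) and a `TimestampType` value is microseconds since the epoch (a Long). A quick illustration of the date half using plain java.time (the epoch-day value is just an example):

import java.time.LocalDate

LocalDate.ofEpochDay(17867L)              // 2018-12-02
LocalDate.parse("2018-12-02").toEpochDay  // 17867L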
UnivocityParser.scala
@@ -19,15 +19,14 @@ package org.apache.spark.sql.catalyst.csv

import java.io.InputStream

import scala.util.Try
import scala.util.control.NonFatal

import com.univocity.parsers.csv.CsvParser

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericInternalRow}
import org.apache.spark.sql.catalyst.util.{BadRecordException, DateTimeUtils, FailureSafeParser}
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

@@ -75,6 +74,12 @@ class UnivocityParser(

private val row = new GenericInternalRow(requiredSchema.length)

private val timeFormatter = DateTimeFormatter(
options.timestampFormat,
options.timeZone,
options.locale)
private val dateFormatter = DateFormatter(options.dateFormat, options.timeZone, options.locale)

// Retrieve the raw record string.
private def getCurrentInput: UTF8String = {
UTF8String.fromString(tokenizer.getContext.currentParsedContent().stripLineEnd)
@@ -100,7 +105,7 @@
//
// output row - ["A", 2]
private val valueConverters: Array[ValueConverter] = {
requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable)).toArray
}

private val decimalParser = ExprUtils.getDecimalParser(options.locale)
@@ -115,8 +120,7 @@
def makeConverter(
name: String,
dataType: DataType,
nullable: Boolean = true,
options: CSVOptions): ValueConverter = dataType match {
nullable: Boolean = true): ValueConverter = dataType match {
case _: ByteType => (d: String) =>
nullSafeDatum(d, name, nullable, options)(_.toByte)

@@ -154,34 +158,16 @@
}

case _: TimestampType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
// This one will lose microseconds parts.
// See https://issues.apache.org/jira/browse/SPARK-10681.
Try(options.timestampFormat.parse(datum).getTime * 1000L)
.getOrElse {
// If it fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
DateTimeUtils.stringToTime(datum).getTime * 1000L
}
}
nullSafeDatum(d, name, nullable, options)(timeFormatter.parse)

case _: DateType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
// This one will lose microseconds parts.
// See https://issues.apache.org/jira/browse/SPARK-10681.x
Try(DateTimeUtils.millisToDays(options.dateFormat.parse(datum).getTime))
.getOrElse {
// If it fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(datum).getTime)
}
}
nullSafeDatum(d, name, nullable, options)(dateFormatter.parse)

case _: StringType => (d: String) =>
nullSafeDatum(d, name, nullable, options)(UTF8String.fromString)

case udt: UserDefinedType[_] => (datum: String) =>
makeConverter(name, udt.sqlType, nullable, options)
makeConverter(name, udt.sqlType, nullable)

// We don't actually hit this exception though, we keep it for understandability
case _ => throw new RuntimeException(s"Unsupported type: ${dataType.typeName}")
DateTimeFormatter.scala (new file)
@@ -0,0 +1,179 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.util

import java.time._
import java.time.format.DateTimeFormatterBuilder
import java.time.temporal.{ChronoField, TemporalQueries}
import java.util.{Locale, TimeZone}

import scala.util.Try

import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.sql.internal.SQLConf

sealed trait DateTimeFormatter {
def parse(s: String): Long // returns microseconds since epoch
def format(us: Long): String
}

class Iso8601DateTimeFormatter(
pattern: String,
timeZone: TimeZone,
locale: Locale) extends DateTimeFormatter {
val formatter = new DateTimeFormatterBuilder()
.appendPattern(pattern)
.parseDefaulting(ChronoField.YEAR_OF_ERA, 1970)
.parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
.parseDefaulting(ChronoField.DAY_OF_MONTH, 1)
.parseDefaulting(ChronoField.HOUR_OF_DAY, 0)
.parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0)
.parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0)
.toFormatter(locale)

def toInstant(s: String): Instant = {
val temporalAccessor = formatter.parse(s)
if (temporalAccessor.query(TemporalQueries.offset()) == null) {
val localDateTime = LocalDateTime.from(temporalAccessor)
val zonedDateTime = ZonedDateTime.of(localDateTime, timeZone.toZoneId)
Instant.from(zonedDateTime)
} else {
Instant.from(temporalAccessor)
}
}

private def instantToMicros(instant: Instant): Long = {
val sec = Math.multiplyExact(instant.getEpochSecond, DateTimeUtils.MICROS_PER_SECOND)
val result = Math.addExact(sec, instant.getNano / DateTimeUtils.NANOS_PER_MICROS)
result
}

def parse(s: String): Long = instantToMicros(toInstant(s))

def format(us: Long): String = {
val secs = Math.floorDiv(us, DateTimeUtils.MICROS_PER_SECOND)
val mos = Math.floorMod(us, DateTimeUtils.MICROS_PER_SECOND)
val instant = Instant.ofEpochSecond(secs, mos * DateTimeUtils.NANOS_PER_MICROS)

formatter.withZone(timeZone.toZoneId).format(instant)
}
}

class LegacyDateTimeFormatter(
pattern: String,
timeZone: TimeZone,
locale: Locale) extends DateTimeFormatter {
val format = FastDateFormat.getInstance(pattern, timeZone, locale)

protected def toMillis(s: String): Long = format.parse(s).getTime

def parse(s: String): Long = toMillis(s) * DateTimeUtils.MICROS_PER_MILLIS

def format(us: Long): String = {
format.format(DateTimeUtils.toJavaTimestamp(us))
}
}

class LegacyFallbackDateTimeFormatter(
pattern: String,
timeZone: TimeZone,
locale: Locale) extends LegacyDateTimeFormatter(pattern, timeZone, locale) {
override def toMillis(s: String): Long = {
Try {super.toMillis(s)}.getOrElse(DateTimeUtils.stringToTime(s).getTime)
}
}

object DateTimeFormatter {
def apply(format: String, timeZone: TimeZone, locale: Locale): DateTimeFormatter = {
if (SQLConf.get.legacyTimeParserEnabled) {
new LegacyFallbackDateTimeFormatter(format, timeZone, locale)
} else {
new Iso8601DateTimeFormatter(format, timeZone, locale)
}
}
}

sealed trait DateFormatter {
def parse(s: String): Int // returns days since epoch
def format(days: Int): String
}

class Iso8601DateFormatter(
pattern: String,
timeZone: TimeZone,
locale: Locale) extends DateFormatter {

val dateTimeFormatter = new Iso8601DateTimeFormatter(pattern, timeZone, locale)

override def parse(s: String): Int = {
val seconds = dateTimeFormatter.toInstant(s).getEpochSecond
val days = Math.floorDiv(seconds, DateTimeUtils.SECONDS_PER_DAY)

days.toInt
}

override def format(days: Int): String = {
val instant = Instant.ofEpochSecond(days * DateTimeUtils.SECONDS_PER_DAY)
dateTimeFormatter.formatter.withZone(timeZone.toZoneId).format(instant)
}
}

class LegacyDateFormatter(
pattern: String,
timeZone: TimeZone,
locale: Locale) extends DateFormatter {
val format = FastDateFormat.getInstance(pattern, timeZone, locale)

def parse(s: String): Int = {
val milliseconds = format.parse(s).getTime
DateTimeUtils.millisToDays(milliseconds)
}

def format(days: Int): String = {
val date = DateTimeUtils.toJavaDate(days)
format.format(date)
}
}

class LegacyFallbackDateFormatter(
pattern: String,
timeZone: TimeZone,
locale: Locale) extends LegacyDateFormatter(pattern, timeZone, locale) {
override def parse(s: String): Int = {
Try(super.parse(s)).orElse {
// If it fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
Try(DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(s).getTime))
}.getOrElse {
// In Spark 1.5.0, we store the data as number of days since epoch in string.
// So, we just convert it to Int.
s.toInt
}
}
}

object DateFormatter {
def apply(format: String, timeZone: TimeZone, locale: Locale): DateFormatter = {
if (SQLConf.get.legacyTimeParserEnabled) {
new LegacyFallbackDateFormatter(format, timeZone, locale)
} else {
new Iso8601DateFormatter(format, timeZone, locale)
}
}
}
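
For orientation, a small usage sketch of the API defined in this new file, assuming the default (non-legacy) ISO 8601 path; the pattern, zone, and values are arbitrary examples:

import java.util.{Locale, TimeZone}
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeFormatter}

val utc = TimeZone.getTimeZone("UTC")

val ts = DateTimeFormatter("yyyy-MM-dd HH:mm:ss", utc, Locale.US)
val micros = ts.parse("2018-12-02 10:11:12")  // microseconds since the epoch
assert(ts.format(micros) == "2018-12-02 10:11:12")

val dates = DateFormatter("yyyy-MM-dd", utc, Locale.US)
val days = dates.parse("2018-12-02")          // days since the epoch
assert(dates.format(days) == "2018-12-02")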
DateTimeUtils.scala
@@ -50,7 +50,7 @@ object DateTimeUtils {
final val MILLIS_PER_SECOND = 1000L
final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
final val MICROS_PER_DAY = MICROS_PER_SECOND * SECONDS_PER_DAY

final val NANOS_PER_MICROS = 1000L
final val MILLIS_PER_DAY = SECONDS_PER_DAY * 1000L

// number of days in 400 years
SQLConf.scala
@@ -1618,6 +1618,13 @@
"a SparkConf entry.")
.booleanConf
.createWithDefault(true)

val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled")
.doc("When set to true, java.text.SimpleDateFormat is used for formatting and parsing " +
" dates/timestamps in a locale-sensitive manner. When set to false, classes from " +
"java.time.* packages are used for the same purpose.")
.booleanConf
.createWithDefault(false)
}

/**
@@ -2040,6 +2047,8 @@ class SQLConf extends Serializable with Logging {

def setCommandRejectsSparkConfs: Boolean = getConf(SQLConf.SET_COMMAND_REJECTS_SPARK_CONFS)

def legacyTimeParserEnabled: Boolean = getConf(SQLConf.LEGACY_TIME_PARSER_ENABLED)

/** ********************** SQLConf functionality methods ************ */

/** Set Spark SQL configuration properties. */
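Finally, the new flag behaves like any other SQL conf and can be toggled per session; just keep in mind the executor-propagation caveat discussed in the review thread above. For example:

spark.conf.set("spark.sql.legacy.timeParser.enabled", "true")
spark.sql("SET spark.sql.legacy.timeParser.enabled=false")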