Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions docs/sql-ref-datetime-pattern.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing
|**M/L**|month-of-year|month|7; 07; Jul; July|
|**d**|day-of-month|number(3)|28|
|**Q/q**|quarter-of-year|number/text|3; 03; Q3; 3rd quarter|
|**Y**|week-based-year|year|1996; 96|
|**w**|week-of-week-based-year|number(2)|27|
|**W**|week-of-month|number(1)|4|
|**E**|day-of-week|text|Tue; Tuesday|
|**u**|localized day-of-week|number/text|2; 02; Tue; Tuesday|
|**F**|week-of-month|number(1)|3|
|**a**|am-pm-of-day|am-pm|PM|
|**h**|clock-hour-of-am-pm (1-12)|number(2)|12|
Expand All @@ -63,7 +59,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing

The count of pattern letters determines the format.

- Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. 5 or more letters will fail.
- Text: The text style is determined based on the number of pattern letters used. Fewer than 4 pattern letters will use the short text form, typically an abbreviation, e.g. day-of-week Monday might output "Mon". Exactly 4 pattern letters will use the full text form, typically the full description, e.g. day-of-week Monday might output "Monday". 5 or more letters will fail.

- Number(n): The n here represents the maximum number of letters that can be used for this type of datetime pattern. If the count of letters is one, then the value is output using the minimum number of digits and without padding. Otherwise, the count of digits is used as the width of the output field, with the value zero-padded as necessary.

Expand Down Expand Up @@ -137,10 +133,4 @@ The count of pattern letters determines the format.
During parsing, the whole section may be missing from the parsed string.
An optional section is started by `[` and ended using `]` (or at the end of the pattern).

- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed used for datetime parsing, e.g. `to_timestamp`.

More details for the text style:

- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".

- Full Form: Full text, typically the full description. For example, day-of-week Monday might output "Monday".
- Symbols of 'E', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class Iso8601DateFormatter(
extends DateFormatter with DateTimeFormatterHelper {

@transient
private lazy val formatter = getOrCreateFormatter(pattern, locale)
private lazy val formatter = getOrCreateFormatter(pattern, locale, isParsing)

@transient
private lazy val legacyFormatter = DateFormatter.getLegacyFormatter(
Expand Down Expand Up @@ -126,7 +126,7 @@ object DateFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = true): DateFormatter = {
isParsing: Boolean): DateFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
Expand Down Expand Up @@ -159,11 +159,11 @@ object DateFormatter {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}

def apply(format: String, zoneId: ZoneId): DateFormatter = {
getFormatter(Some(format), zoneId)
def apply(format: String, zoneId: ZoneId, isParsing: Boolean = false): DateFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}

def apply(zoneId: ZoneId): DateFormatter = {
getFormatter(None, zoneId)
getFormatter(None, zoneId, isParsing = false)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ trait DateTimeFormatterHelper {
protected def getOrCreateFormatter(
pattern: String,
locale: Locale,
isParsing: Boolean = false): DateTimeFormatter = {
isParsing: Boolean): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern, isParsing)
val useVarLen = isParsing && newPattern.contains('S')
val key = (newPattern, locale, useVarLen)
Expand Down Expand Up @@ -234,22 +234,27 @@ private object DateTimeFormatterHelper {
val formatter = DateTimeFormatter.ofPattern("LLL qqq", Locale.US)
formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
}
final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
// SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
// to datetime, especially when they are mixed with other non-week-based ones
// to datetime, especially when they are mixed with other non-week-based ones;
// SPARK-31879: It's also difficult for us to restore the behavior of week-based date fields
// formatting. In DateTimeFormatter, the first day of week for week-based date fields becomes
// localized: for the default Locale.US, it uses Sunday as the first day of week, while in Spark
// 2.4, the SimpleDateFormat uses Monday as the first day of week.
final val weekBasedLetters = Set('Y', 'W', 'w', 'u', 'e', 'c')
final val unsupportedLetters = Set('A', 'n', 'N', 'p')
// The quarter fields will also be parsed strangely, e.g. when the pattern contains `yMd` and can
// be directly resolved, then `q` does check whether the month is valid, but if the date
// fields are incomplete, e.g. `yM`, the check will be bypassed.
final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
final val unsupportedLettersForParsing = Set('E', 'F', 'q', 'Q')
final val unsupportedPatternLengths = {
// SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
// 2.4
Seq("G", "M", "L", "E", "u", "Q", "q").map(_ * 5) ++
Seq("G", "M", "L", "E", "Q", "q").map(_ * 5) ++
// SPARK-31867: Disable year patterns longer than 10, which cause the Java time library to throw
// an unchecked `ArrayIndexOutOfBoundsException` from the `NumberPrinterParser` when formatting.
// That makes it difficult for callers to handle exceptions and easily leads to silent data
// change because the exceptions are suppressed.
Seq("y", "Y").map(_ * 11)
Seq("y").map(_ * 11)
}.toSet

/**
Expand All @@ -260,7 +265,7 @@ private object DateTimeFormatterHelper {
* @param pattern The input pattern.
* @return The pattern for new parser
*/
def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
def convertIncompatiblePattern(pattern: String, isParsing: Boolean): String = {
val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
case (patternPart, index) =>
// Text can be quoted using single quotes, we only check the non-quote parts.
Expand All @@ -269,6 +274,10 @@ private object DateTimeFormatterHelper {
(pattern + " ").split("'").zipWithIndex.map {
case (patternPart, index) =>
if (index % 2 == 0) {
for (c <- patternPart if weekBasedLetters.contains(c)) {
throw new IllegalArgumentException(s"All week-based patterns are unsupported since" +
s" Spark 3.0, detected: $c, Please use the SQL function EXTRACT instead")
}
for (c <- patternPart if unsupportedLetters.contains(c) ||
(isParsing && unsupportedLettersForParsing.contains(c))) {
throw new IllegalArgumentException(s"Illegal pattern character: $c")
Expand All @@ -282,20 +291,13 @@ private object DateTimeFormatterHelper {
"or upgrade your Java version. For more details, please read " +
"https://bugs.openjdk.java.net/browse/JDK-8114833")
}
// The meaning of 'u' was day number of week in SimpleDateFormat, it was changed to year
// in DateTimeFormatter. Substitute 'u' to 'e' and use DateTimeFormatter to parse the
// string. If parsable, return the result; otherwise, fall back to 'u', and then use the
// legacy SimpleDateFormat parser to parse. When it is successfully parsed, throw an
// exception and ask users to change the pattern strings or turn on the legacy mode;
// otherwise, return NULL as what Spark 2.4 does.
val res = patternPart.replace("u", "e")
// In DateTimeFormatter, 'u' supports negative years. We substitute 'y' to 'u' here for
// keeping the support in Spark 3.0. If parse failed in Spark 3.0, fall back to 'y'.
// We only do this substitution when there is no era designator found in the pattern.
if (!eraDesignatorContained) {
res.replace("y", "u")
patternPart.replace("y", "u")
} else {
res
patternPart
}
} else {
patternPart
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ class Iso8601TimestampFormatter(
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction: Boolean)
isParsing: Boolean)
extends TimestampFormatter with DateTimeFormatterHelper {
@transient
protected lazy val formatter: DateTimeFormatter =
getOrCreateFormatter(pattern, locale, needVarLengthSecondFraction)
getOrCreateFormatter(pattern, locale, isParsing)

@transient
protected lazy val legacyFormatter = TimestampFormatter.getLegacyFormatter(
Expand Down Expand Up @@ -122,7 +122,7 @@ class FractionTimestampFormatter(zoneId: ZoneId)
zoneId,
TimestampFormatter.defaultLocale,
LegacyDateFormats.FAST_DATE_FORMAT,
needVarLengthSecondFraction = false) {
isParsing = false) {

@transient
override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
Expand Down Expand Up @@ -287,7 +287,7 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = false): TimestampFormatter = {
isParsing: Boolean): TimestampFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
Expand Down Expand Up @@ -334,12 +334,12 @@ object TimestampFormatter {
def apply(
format: String,
zoneId: ZoneId,
isParsing: Boolean = false): TimestampFormatter = {
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}

def apply(zoneId: ZoneId): TimestampFormatter = {
getFormatter(None, zoneId)
getFormatter(None, zoneId, isParsing = false)
}

def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
private val JST_OPT = Option(JST.getId)

def toMillis(timestamp: String): Long = {
val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC)
val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC, isParsing = true)
DateTimeUtils.microsToMillis(tf.parse(timestamp))
}
val date = "2015-04-08 13:10:15"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,22 @@
* limitations under the License.
*/

package org.apache.spark.sql.util
package org.apache.spark.sql.catalyst.util

import java.time.{DateTimeException, LocalDate}

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats}
import org.apache.spark.SparkUpgradeException
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy

class DateFormatterSuite extends SparkFunSuite with SQLHelper {
class DateFormatterSuite extends DatetimeFormatterSuite {

override def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit = {
DateFormatter(pattern, UTC, isParsing)
}

test("parsing dates") {
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,32 @@ import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._

class DateTimeFormatterHelperSuite extends SparkFunSuite {

private def convertIncompatiblePattern(pattern: String): String = {
DateTimeFormatterHelper.convertIncompatiblePattern(pattern, isParsing = false)
}

test("check incompatible pattern") {
assert(convertIncompatiblePattern("MM-DD-u") === "MM-DD-e")
assert(convertIncompatiblePattern("yyyy-MM-dd'T'HH:mm:ss.SSSz")
=== "uuuu-MM-dd'T'HH:mm:ss.SSSz")
assert(convertIncompatiblePattern("yyyy-MM'y contains in quoted text'HH:mm:ss")
=== "uuuu-MM'y contains in quoted text'HH:mm:ss")
assert(convertIncompatiblePattern("yyyy-MM-dd-u'T'HH:mm:ss.SSSz")
=== "uuuu-MM-dd-e'T'HH:mm:ss.SSSz")
assert(convertIncompatiblePattern("yyyy-MM'u contains in quoted text'HH:mm:ss")
=== "uuuu-MM'u contains in quoted text'HH:mm:ss")
assert(convertIncompatiblePattern("yyyy-MM'u contains in quoted text'''''HH:mm:ss")
=== "uuuu-MM'u contains in quoted text'''''HH:mm:ss")
assert(convertIncompatiblePattern("yyyy-MM-dd'T'HH:mm:ss.SSSz G")
=== "yyyy-MM-dd'T'HH:mm:ss.SSSz G")
weekBasedLetters.foreach { l =>
val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
assert(e.getMessage.contains("week-based"))
}
unsupportedLetters.foreach { l =>
val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
assert(e.getMessage === s"Illegal pattern character: $l")
}
unsupportedLettersForParsing.foreach { l =>
val e = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"$l", isParsing = true)
DateTimeFormatterHelper.convertIncompatiblePattern(s"$l", isParsing = true)
}
assert(e.getMessage === s"Illegal pattern character: $l")
assert(convertIncompatiblePattern(s"$l").nonEmpty)
Expand All @@ -57,7 +62,6 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
}
assert(e2.getMessage === s"Too many pattern letters: ${style.head}")
}
assert(convertIncompatiblePattern("yyyy-MM-dd uuuu") === "uuuu-MM-dd eeee")
assert(convertIncompatiblePattern("yyyy-MM-dd EEEE") === "uuuu-MM-dd EEEE")
assert(convertIncompatiblePattern("yyyy-MM-dd'e'HH:mm:ss") === "uuuu-MM-dd'e'HH:mm:ss")
assert(convertIncompatiblePattern("yyyy-MM-dd'T'") === "uuuu-MM-dd'T'")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.util

import org.scalatest.Matchers

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper

/**
 * Shared test logic for suites that exercise datetime formatter construction.
 *
 * Concrete suites supply [[checkFormatterCreation]]; this trait registers a test asserting
 * which exception type is raised for each explicitly forbidden pattern.
 */
trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers {
  import DateTimeFormatterHelper._

  /**
   * Builds a formatter for `pattern` in the requested mode.
   *
   * @param pattern the datetime pattern under test
   * @param isParsing whether the formatter is created for parsing (`true`) or formatting
   */
  def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit

  test("explicitly forbidden datetime patterns") {
    // Patterns expected to raise IllegalArgumentException in both parsing and formatting mode
    // (noted in the original as not being supported by the legacy formatter either).
    val rejectedInBothModes = Seq("QQQQQ", "qqqqq", "eeeee", "A", "c", "n", "N", "p", "e")

    for (isParsing <- Seq(true, false)) {
      for (pattern <- rejectedInBothModes) {
        intercept[IllegalArgumentException](checkFormatterCreation(pattern, isParsing))
      }

      // Every remaining forbidden letter or over-long pattern is expected to surface as a
      // SparkUpgradeException instead (noted in the original as supported by the legacy one).
      val forbiddenSingleLetters = (weekBasedLetters ++ unsupportedLetters).map(_.toString)
      val upgradeOnlyPatterns =
        (forbiddenSingleLetters ++ unsupportedPatternLengths) -- rejectedInBothModes
      for (pattern <- upgradeOnlyPatterns) {
        intercept[SparkUpgradeException](checkFormatterCreation(pattern, isParsing))
      }
    }

    // Parsing-only restrictions: the quarter letters are expected to raise
    // IllegalArgumentException when parsing.
    val rejectedWhenParsing = Seq("q", "Q")
    for (pattern <- rejectedWhenParsing) {
      intercept[IllegalArgumentException](checkFormatterCreation(pattern, true))
    }

    // The other parsing-only letters are expected to raise SparkUpgradeException.
    val upgradeOnlyWhenParsing =
      unsupportedLettersForParsing.map(_.toString) -- rejectedWhenParsing
    for (pattern <- upgradeOnlyWhenParsing) {
      intercept[SparkUpgradeException](checkFormatterCreation(pattern, true))
    }
  }
}
Loading