apache · yaooqinn · Jun 4, 2020 · Jun 4, 2020 · Jun 4, 2020 · Jun 4, 2020
diff --git a/docs/sql-ref-datetime-pattern.md b/docs/sql-ref-datetime-pattern.md
@@ -36,11 +36,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing
 |**M/L**|month-of-year|month|7; 07; Jul; July|
 |**d**|day-of-month|number(3)|28|
 |**Q/q**|quarter-of-year|number/text|3; 03; Q3; 3rd quarter|
-|**Y**|week-based-year|year|1996; 96|
-|**w**|week-of-week-based-year|number(2)|27|
-|**W**|week-of-month|number(1)|4|
 |**E**|day-of-week|text|Tue; Tuesday|
-|**u**|localized day-of-week|number/text|2; 02; Tue; Tuesday|
 |**F**|week-of-month|number(1)|3|
 |**a**|am-pm-of-day|am-pm|PM|
 |**h**|clock-hour-of-am-pm (1-12)|number(2)|12|
@@ -63,7 +59,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing
 
 The count of pattern letters determines the format.
 
-- Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. 5 or more letters will fail.
+- Text: The text style is determined based on the number of pattern letters used. Fewer than 4 pattern letters will use the short text form, typically an abbreviation, e.g. day-of-week Monday might output "Mon". Exactly 4 pattern letters will use the full text form, typically the full description, e.g. day-of-week Monday might output "Monday". 5 or more letters will fail.
 
 - Number(n): The n here represents the maximum count of letters this type of datetime pattern can be used. If the count of letters is one, then the value is output using the minimum number of digits and without padding. Otherwise, the count of digits is used as the width of the output field, with the value zero-padded as necessary.
 
@@ -137,10 +133,4 @@ The count of pattern letters determines the format.
   During parsing, the whole section may be missing from the parsed string.
   An optional section is started by `[` and ended using `]` (or at the end of the pattern).
 
-- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed used for datetime parsing, e.g. `to_timestamp`.
-
-More details for the text style:
-
-- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".
-
-- Full Form: Full text, typically the full description. For example, day-of-week Monday might output "Monday".
+- Symbols of 'E', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -46,7 +46,7 @@ class Iso8601DateFormatter(
   extends DateFormatter with DateTimeFormatterHelper {
 
   @transient
-  private lazy val formatter = getOrCreateFormatter(pattern, locale)
+  private lazy val formatter = getOrCreateFormatter(pattern, locale, isParsing)
 
   @transient
   private lazy val legacyFormatter = DateFormatter.getLegacyFormatter(
@@ -126,7 +126,7 @@ object DateFormatter {
       zoneId: ZoneId,
       locale: Locale = defaultLocale,
       legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
-      isParsing: Boolean = true): DateFormatter = {
+      isParsing: Boolean): DateFormatter = {
     val pattern = format.getOrElse(defaultPattern)
     if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
       getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
@@ -159,11 +159,11 @@ object DateFormatter {
     getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
   }
 
-  def apply(format: String, zoneId: ZoneId): DateFormatter = {
-    getFormatter(Some(format), zoneId)
+  def apply(format: String, zoneId: ZoneId, isParsing: Boolean = false): DateFormatter = {
+    getFormatter(Some(format), zoneId, isParsing = isParsing)
   }
 
   def apply(zoneId: ZoneId): DateFormatter = {
-    getFormatter(None, zoneId)
+    getFormatter(None, zoneId, isParsing = false)
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
@@ -97,7 +97,7 @@ trait DateTimeFormatterHelper {
   protected def getOrCreateFormatter(
       pattern: String,
       locale: Locale,
-      isParsing: Boolean = false): DateTimeFormatter = {
+      isParsing: Boolean): DateTimeFormatter = {
     val newPattern = convertIncompatiblePattern(pattern, isParsing)
     val useVarLen = isParsing && newPattern.contains('S')
     val key = (newPattern, locale, useVarLen)
@@ -234,22 +234,27 @@ private object DateTimeFormatterHelper {
     val formatter = DateTimeFormatter.ofPattern("LLL qqq", Locale.US)
     formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
   }
-  final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
   // SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
-  // to datetime, especially when they are mixed with other non-week-based ones
+  // to datetime, especially when they are mixed with other non-week-based ones;
+  // SPARK-31879: It's also difficult for us to restore the behavior of week-based date field
+  // formatting. In DateTimeFormatter, the first day of week for week-based date fields becomes
+  // localized: for the default Locale.US it is Sunday, while in Spark 2.4 the SimpleDateFormat
+  // uses Monday as the first day of week.
+  final val weekBasedLetters = Set('Y', 'W', 'w', 'u', 'e', 'c')
+  final val unsupportedLetters = Set('A', 'n', 'N', 'p')
   // The quarter fields will also be parsed strangely, e.g. when the pattern contains `yMd` and can
   // be directly resolved then the `q` do check for whether the month is valid, but if the date
   // fields is incomplete, e.g. `yM`, the checking will be bypassed.
-  final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
+  final val unsupportedLettersForParsing = Set('E', 'F', 'q', 'Q')
   final val unsupportedPatternLengths = {
     // SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
     // 2.4
-    Seq("G", "M", "L", "E", "u", "Q", "q").map(_ * 5) ++
+    Seq("G", "M", "L", "E", "Q", "q").map(_ * 5) ++
       // SPARK-31867: Disable year pattern longer than 10 which will cause Java time library throw
       // unchecked `ArrayIndexOutOfBoundsException` by the `NumberPrinterParser` for formatting. It
       // makes the call side difficult to handle exceptions and easily leads to silent data change
       // because of the exceptions being suppressed.
-      Seq("y", "Y").map(_ * 11)
+      Seq("y").map(_ * 11)
   }.toSet
 
   /**
@@ -260,7 +265,7 @@ private object DateTimeFormatterHelper {
    * @param pattern The input pattern.
    * @return The pattern for new parser
    */
-  def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
+  def convertIncompatiblePattern(pattern: String, isParsing: Boolean): String = {
     val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
       case (patternPart, index) =>
         // Text can be quoted using single quotes, we only check the non-quote parts.
@@ -269,6 +274,10 @@ private object DateTimeFormatterHelper {
     (pattern + " ").split("'").zipWithIndex.map {
       case (patternPart, index) =>
         if (index % 2 == 0) {
+          for (c <- patternPart if weekBasedLetters.contains(c)) {
+            throw new IllegalArgumentException(s"All week-based patterns are unsupported since" +
+              s" Spark 3.0, detected: $c, Please use the SQL function EXTRACT instead")
+          }
           for (c <- patternPart if unsupportedLetters.contains(c) ||
             (isParsing && unsupportedLettersForParsing.contains(c))) {
             throw new IllegalArgumentException(s"Illegal pattern character: $c")
@@ -282,20 +291,13 @@ private object DateTimeFormatterHelper {
               "or upgrade your Java version. For more details, please read " +
               "https://bugs.openjdk.java.net/browse/JDK-8114833")
           }
-          // The meaning of 'u' was day number of week in SimpleDateFormat, it was changed to year
-          // in DateTimeFormatter. Substitute 'u' to 'e' and use DateTimeFormatter to parse the
-          // string. If parsable, return the result; otherwise, fall back to 'u', and then use the
-          // legacy SimpleDateFormat parser to parse. When it is successfully parsed, throw an
-          // exception and ask users to change the pattern strings or turn on the legacy mode;
-          // otherwise, return NULL as what Spark 2.4 does.
-          val res = patternPart.replace("u", "e")
           // In DateTimeFormatter, 'u' supports negative years. We substitute 'y' to 'u' here for
           // keeping the support in Spark 3.0. If parse failed in Spark 3.0, fall back to 'y'.
           // We only do this substitution when there is no era designator found in the pattern.
           if (!eraDesignatorContained) {
-            res.replace("y", "u")
+            patternPart.replace("y", "u")
           } else {
-            res
+            patternPart
           }
         } else {
           patternPart

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -62,11 +62,11 @@ class Iso8601TimestampFormatter(
     zoneId: ZoneId,
     locale: Locale,
     legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
-    needVarLengthSecondFraction: Boolean)
+    isParsing: Boolean)
   extends TimestampFormatter with DateTimeFormatterHelper {
   @transient
   protected lazy val formatter: DateTimeFormatter =
-    getOrCreateFormatter(pattern, locale, needVarLengthSecondFraction)
+    getOrCreateFormatter(pattern, locale, isParsing)
 
   @transient
   protected lazy val legacyFormatter = TimestampFormatter.getLegacyFormatter(
@@ -122,7 +122,7 @@ class FractionTimestampFormatter(zoneId: ZoneId)
     zoneId,
     TimestampFormatter.defaultLocale,
     LegacyDateFormats.FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = false) {
+    isParsing = false) {
 
   @transient
   override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
@@ -287,7 +287,7 @@ object TimestampFormatter {
       zoneId: ZoneId,
       locale: Locale = defaultLocale,
       legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
-      isParsing: Boolean = false): TimestampFormatter = {
+      isParsing: Boolean): TimestampFormatter = {
     val pattern = format.getOrElse(defaultPattern)
     if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
       getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
@@ -334,12 +334,12 @@ object TimestampFormatter {
   def apply(
       format: String,
       zoneId: ZoneId,
-      isParsing: Boolean = false): TimestampFormatter = {
+      isParsing: Boolean): TimestampFormatter = {
     getFormatter(Some(format), zoneId, isParsing = isParsing)
   }
 
   def apply(zoneId: ZoneId): TimestampFormatter = {
-    getFormatter(None, zoneId)
+    getFormatter(None, zoneId, isParsing = false)
   }
 
   def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {

diff --git a/...alyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/...alyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -41,7 +41,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   private val JST_OPT = Option(JST.getId)
 
   def toMillis(timestamp: String): Long = {
-    val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC)
+    val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC, isParsing = true)
     DateTimeUtils.microsToMillis(tf.parse(timestamp))
   }
   val date = "2015-04-08 13:10:15"

diff --git a/...e/spark/sql/util/DateFormatterSuite.scala → ...ql/catalyst/util/DateFormatterSuite.scala b/...e/spark/sql/util/DateFormatterSuite.scala → ...ql/catalyst/util/DateFormatterSuite.scala
@@ -15,19 +15,22 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.util
+package org.apache.spark.sql.catalyst.util
 
 import java.time.{DateTimeException, LocalDate}
 
-import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
-import org.apache.spark.sql.catalyst.plans.SQLHelper
-import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats}
+import org.apache.spark.SparkUpgradeException
 import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
 
-class DateFormatterSuite extends SparkFunSuite with SQLHelper {
+class DateFormatterSuite extends DatetimeFormatterSuite {
+
+  override def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit = {
+    DateFormatter(pattern, UTC, isParsing)
+  }
+
   test("parsing dates") {
     outstandingTimezonesIds.foreach { timeZone =>
       withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {

diff --git a/...lyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelperSuite.scala b/...lyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelperSuite.scala
@@ -22,27 +22,32 @@ import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._
 
 class DateTimeFormatterHelperSuite extends SparkFunSuite {
 
+  private def convertIncompatiblePattern(pattern: String): String = {
+    DateTimeFormatterHelper.convertIncompatiblePattern(pattern, isParsing = false)
+  }
+
   test("check incompatible pattern") {
-    assert(convertIncompatiblePattern("MM-DD-u") === "MM-DD-e")
     assert(convertIncompatiblePattern("yyyy-MM-dd'T'HH:mm:ss.SSSz")
       === "uuuu-MM-dd'T'HH:mm:ss.SSSz")
     assert(convertIncompatiblePattern("yyyy-MM'y contains in quoted text'HH:mm:ss")
       === "uuuu-MM'y contains in quoted text'HH:mm:ss")
-    assert(convertIncompatiblePattern("yyyy-MM-dd-u'T'HH:mm:ss.SSSz")
-      === "uuuu-MM-dd-e'T'HH:mm:ss.SSSz")
     assert(convertIncompatiblePattern("yyyy-MM'u contains in quoted text'HH:mm:ss")
       === "uuuu-MM'u contains in quoted text'HH:mm:ss")
     assert(convertIncompatiblePattern("yyyy-MM'u contains in quoted text'''''HH:mm:ss")
       === "uuuu-MM'u contains in quoted text'''''HH:mm:ss")
     assert(convertIncompatiblePattern("yyyy-MM-dd'T'HH:mm:ss.SSSz G")
       === "yyyy-MM-dd'T'HH:mm:ss.SSSz G")
+    weekBasedLetters.foreach { l =>
+      val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
+      assert(e.getMessage.contains("week-based"))
+    }
     unsupportedLetters.foreach { l =>
       val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
       assert(e.getMessage === s"Illegal pattern character: $l")
     }
     unsupportedLettersForParsing.foreach { l =>
       val e = intercept[IllegalArgumentException] {
-        convertIncompatiblePattern(s"$l", isParsing = true)
+        DateTimeFormatterHelper.convertIncompatiblePattern(s"$l", isParsing = true)
       }
       assert(e.getMessage === s"Illegal pattern character: $l")
       assert(convertIncompatiblePattern(s"$l").nonEmpty)
@@ -57,7 +62,6 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
       }
       assert(e2.getMessage === s"Too many pattern letters: ${style.head}")
     }
-    assert(convertIncompatiblePattern("yyyy-MM-dd uuuu") === "uuuu-MM-dd eeee")
     assert(convertIncompatiblePattern("yyyy-MM-dd EEEE") === "uuuu-MM-dd EEEE")
     assert(convertIncompatiblePattern("yyyy-MM-dd'e'HH:mm:ss") === "uuuu-MM-dd'e'HH:mm:ss")
     assert(convertIncompatiblePattern("yyyy-MM-dd'T'") === "uuuu-MM-dd'T'")

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import org.scalatest.Matchers
+
+import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
+import org.apache.spark.sql.catalyst.plans.SQLHelper
+
+trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers {
+  import DateTimeFormatterHelper._
+  def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit
+
+  test("explicitly forbidden datetime patterns") {
+
+    Seq(true, false).foreach { isParsing =>
+      // Not supported by the legacy formatter either, so IllegalArgumentException is expected
+      val unsupportedBoth = Seq("QQQQQ", "qqqqq", "eeeee", "A", "c", "n", "N", "p", "e")
+      unsupportedBoth.foreach { pattern =>
+        intercept[IllegalArgumentException](checkFormatterCreation(pattern, isParsing))
+      }
+      // Supported by the legacy formatter, so a SparkUpgradeException is raised to guide users
+      ((weekBasedLetters ++ unsupportedLetters).map(_.toString)
+        ++ unsupportedPatternLengths -- unsupportedBoth).foreach {
+        pattern => intercept[SparkUpgradeException](checkFormatterCreation(pattern, isParsing))
+      }
+    }
+
+    // Parsing-only cases: not supported by the legacy formatter either
+    val unsupportedBoth = Seq("q", "Q")
+    unsupportedBoth.foreach { pattern =>
+      intercept[IllegalArgumentException](checkFormatterCreation(pattern, true))
+    }
+    // Parsing-only cases: supported by the legacy formatter, so SparkUpgradeException is expected
+    (unsupportedLettersForParsing.map(_.toString) -- unsupportedBoth).foreach {
+      pattern => intercept[SparkUpgradeException](checkFormatterCreation(pattern, true))
+    }
+  }
+}