Skip to content

Commit 12d2757

Browse files
committed
[SPARK-31892][SQL] Disable week-based date fields for parsing
1 parent b806fc4 commit 12d2757

File tree

17 files changed

+88
-48
lines changed

17 files changed

+88
-48
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ object CatalogColumnStat extends Logging {
525525
TimestampFormatter(
526526
format = "yyyy-MM-dd HH:mm:ss.SSSSSS",
527527
zoneId = ZoneOffset.UTC,
528-
needVarLengthSecondFraction = isParsing)
528+
isParsing = isParsing)
529529
}
530530

531531
/**

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
3535
options.zoneId,
3636
options.locale,
3737
legacyFormat = FAST_DATE_FORMAT,
38-
needVarLengthSecondFraction = true)
38+
isParsing = true)
3939

4040
private val decimalParser = if (options.locale == Locale.US) {
4141
// Special handling the default locale for backward compatibility

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,13 @@ class UnivocityGenerator(
4747
options.zoneId,
4848
options.locale,
4949
legacyFormat = FAST_DATE_FORMAT,
50-
needVarLengthSecondFraction = false)
50+
isParsing = false)
5151
private val dateFormatter = DateFormatter(
5252
options.dateFormat,
5353
options.zoneId,
5454
options.locale,
55-
legacyFormat = FAST_DATE_FORMAT)
55+
legacyFormat = FAST_DATE_FORMAT,
56+
isParsing = false)
5657

5758
private def makeConverter(dataType: DataType): ValueConverter = dataType match {
5859
case DateType =>

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,13 @@ class UnivocityParser(
9090
options.zoneId,
9191
options.locale,
9292
legacyFormat = FAST_DATE_FORMAT,
93-
needVarLengthSecondFraction = true)
93+
isParsing = true)
9494
private lazy val dateFormatter = DateFormatter(
9595
options.dateFormat,
9696
options.zoneId,
9797
options.locale,
98-
legacyFormat = FAST_DATE_FORMAT)
98+
legacyFormat = FAST_DATE_FORMAT,
99+
isParsing = true)
99100

100101
private val csvFilters = new CSVFilters(filters, requiredSchema)
101102

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -734,7 +734,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
734734
format.toString,
735735
zoneId,
736736
legacyFormat = SIMPLE_DATE_FORMAT,
737-
needVarLengthSecondFraction = false)
737+
isParsing = false)
738738
}
739739
} else None
740740
}
@@ -745,7 +745,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
745745
format.toString,
746746
zoneId,
747747
legacyFormat = SIMPLE_DATE_FORMAT,
748-
needVarLengthSecondFraction = false)
748+
isParsing = false)
749749
} else {
750750
formatter.get
751751
}
@@ -890,7 +890,7 @@ abstract class ToTimestamp
890890
constFormat.toString,
891891
zoneId,
892892
legacyFormat = SIMPLE_DATE_FORMAT,
893-
needVarLengthSecondFraction = true)
893+
isParsing = true)
894894
} catch {
895895
case e: SparkUpgradeException => throw e
896896
case NonFatal(_) => null
@@ -929,7 +929,7 @@ abstract class ToTimestamp
929929
formatString,
930930
zoneId,
931931
legacyFormat = SIMPLE_DATE_FORMAT,
932-
needVarLengthSecondFraction = true)
932+
isParsing = true)
933933
.parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
934934
} catch {
935935
case e: SparkUpgradeException => throw e
@@ -1072,7 +1072,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
10721072
constFormat.toString,
10731073
zoneId,
10741074
legacyFormat = SIMPLE_DATE_FORMAT,
1075-
needVarLengthSecondFraction = false)
1075+
isParsing = false)
10761076
} catch {
10771077
case e: SparkUpgradeException => throw e
10781078
case NonFatal(_) => null
@@ -1105,7 +1105,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
11051105
f.toString,
11061106
zoneId,
11071107
legacyFormat = SIMPLE_DATE_FORMAT,
1108-
needVarLengthSecondFraction = false)
1108+
isParsing = false)
11091109
.format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
11101110
} catch {
11111111
case e: SparkUpgradeException => throw e

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,13 @@ private[sql] class JacksonGenerator(
8383
options.zoneId,
8484
options.locale,
8585
legacyFormat = FAST_DATE_FORMAT,
86-
needVarLengthSecondFraction = false)
86+
isParsing = false)
8787
private val dateFormatter = DateFormatter(
8888
options.dateFormat,
8989
options.zoneId,
9090
options.locale,
91-
legacyFormat = FAST_DATE_FORMAT)
91+
legacyFormat = FAST_DATE_FORMAT,
92+
isParsing = false)
9293

9394
private def makeWriter(dataType: DataType): ValueWriter = dataType match {
9495
case NullType =>

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,13 @@ class JacksonParser(
6161
options.zoneId,
6262
options.locale,
6363
legacyFormat = FAST_DATE_FORMAT,
64-
needVarLengthSecondFraction = true)
64+
isParsing = true)
6565
private lazy val dateFormatter = DateFormatter(
6666
options.dateFormat,
6767
options.zoneId,
6868
options.locale,
69-
legacyFormat = FAST_DATE_FORMAT)
69+
legacyFormat = FAST_DATE_FORMAT,
70+
isParsing = true)
7071

7172
/**
7273
* Create a converter which converts the JSON documents held by the `JsonParser`

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
4343
options.zoneId,
4444
options.locale,
4545
legacyFormat = FAST_DATE_FORMAT,
46-
needVarLengthSecondFraction = true)
46+
isParsing = true)
4747

4848
/**
4949
* Infer the type of a collection of json records in three stages:

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.util
1919

2020
import java.text.SimpleDateFormat
2121
import java.time.{LocalDate, ZoneId}
22-
import java.time.format.DateTimeFormatter
2322
import java.util.{Date, Locale}
2423

2524
import org.apache.commons.lang3.time.FastDateFormat
@@ -42,7 +41,8 @@ class Iso8601DateFormatter(
4241
pattern: String,
4342
zoneId: ZoneId,
4443
locale: Locale,
45-
legacyFormat: LegacyDateFormats.LegacyDateFormat)
44+
legacyFormat: LegacyDateFormats.LegacyDateFormat,
45+
isParsing: Boolean)
4646
extends DateFormatter with DateTimeFormatterHelper {
4747

4848
@transient
@@ -125,12 +125,13 @@ object DateFormatter {
125125
format: Option[String],
126126
zoneId: ZoneId,
127127
locale: Locale = defaultLocale,
128-
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = {
128+
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
129+
isParsing: Boolean = true): DateFormatter = {
129130
val pattern = format.getOrElse(defaultPattern)
130131
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
131132
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
132133
} else {
133-
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat)
134+
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat, isParsing)
134135
df.validatePatternString()
135136
df
136137
}
@@ -153,8 +154,9 @@ object DateFormatter {
153154
format: String,
154155
zoneId: ZoneId,
155156
locale: Locale,
156-
legacyFormat: LegacyDateFormat): DateFormatter = {
157-
getFormatter(Some(format), zoneId, locale, legacyFormat)
157+
legacyFormat: LegacyDateFormat,
158+
isParsing: Boolean): DateFormatter = {
159+
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
158160
}
159161

160162
def apply(format: String, zoneId: ZoneId): DateFormatter = {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,9 @@ trait DateTimeFormatterHelper {
8989
protected def getOrCreateFormatter(
9090
pattern: String,
9191
locale: Locale,
92-
needVarLengthSecondFraction: Boolean = false): DateTimeFormatter = {
93-
val newPattern = convertIncompatiblePattern(pattern)
94-
val useVarLen = needVarLengthSecondFraction && newPattern.contains('S')
92+
isParsing: Boolean = false): DateTimeFormatter = {
93+
val newPattern = convertIncompatiblePattern(pattern, isParsing)
94+
val useVarLen = isParsing && newPattern.contains('S')
9595
val key = (newPattern, locale, useVarLen)
9696
var formatter = cache.getIfPresent(key)
9797
if (formatter == null) {
@@ -227,6 +227,11 @@ private object DateTimeFormatterHelper {
227227
formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
228228
}
229229
final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
230+
// SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
231+
// to timestamp, especially when they are mixed with other non-week-based ones. we have tried our
232+
// best to restore the behavior change between 2.4 and 3.0 and failed, see
233+
// https://github.com/apache/spark/pull/28674
234+
final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F')
230235
final val unsupportedPatternLengths = {
231236
// SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
232237
// 2.4
@@ -246,7 +251,7 @@ private object DateTimeFormatterHelper {
246251
* @param pattern The input pattern.
247252
* @return The pattern for new parser
248253
*/
249-
def convertIncompatiblePattern(pattern: String): String = {
254+
def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
250255
val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
251256
case (patternPart, index) =>
252257
// Text can be quoted using single quotes, we only check the non-quote parts.
@@ -255,7 +260,8 @@ private object DateTimeFormatterHelper {
255260
(pattern + " ").split("'").zipWithIndex.map {
256261
case (patternPart, index) =>
257262
if (index % 2 == 0) {
258-
for (c <- patternPart if unsupportedLetters.contains(c)) {
263+
for (c <- patternPart if unsupportedLetters.contains(c) ||
264+
(isParsing && unsupportedLettersForParsing.contains(c))) {
259265
throw new IllegalArgumentException(s"Illegal pattern character: $c")
260266
}
261267
for (style <- unsupportedPatternLengths if patternPart.contains(style)) {

0 commit comments

Comments
 (0)