Commit f39dd48
[SPARK-31830][SQL] Consistent error handling for datetime formatting functions
1 parent 695cb61

7 files changed: +141 −89 lines

docs/sql-migration-guide.md
Lines changed: 2 additions & 0 deletions

@@ -27,6 +27,8 @@ license: |
 - In Spark 3.1, grouping_id() returns long values. In Spark version 3.0 and earlier, this function returns int values. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.integerGroupingId` to `true`.
 
 - In Spark 3.1, SQL UI data adopts the `formatted` mode for the query plan explain results. To restore the behavior before Spark 3.0, you can set `spark.sql.ui.explainMode` to `extended`.
+
+- In Spark 3.1, `from_unixtime` will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, it results in `NULL`.
 
 ## Upgrading from Spark SQL 2.4 to 3.0
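To make the migration note concrete, here is a sketch of the behavior change as seen from the DataFrame API (assumes a spark-shell session bound to `spark`; the exact exception text depends on the parser policy):

import org.apache.spark.sql.functions.{from_unixtime, lit}

// An invalid datetime pattern: letters like 'i' and 'n' are rejected.
val df = spark.range(1).select(from_unixtime(lit(12345L), "invalid"))

df.show()
// Spark 3.0 and earlier: prints a single NULL row.
// Spark 3.1 with this patch: throws
//   java.lang.IllegalArgumentException: Illegal pattern character: n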

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
Lines changed: 23 additions & 77 deletions

@@ -34,7 +34,6 @@ import org.apache.spark.sql.catalyst.util.{DateTimeUtils, LegacyDateFormats, Tim
 import org.apache.spark.sql.catalyst.util.DateTimeConstants._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils._
 import org.apache.spark.sql.catalyst.util.LegacyDateFormats.SIMPLE_DATE_FORMAT
-import org.apache.spark.sql.catalyst.util.toPrettySQL
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
@@ -1053,91 +1052,38 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
     copy(timeZoneId = Option(timeZoneId))
 
-  private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String]
-  private lazy val formatter: TimestampFormatter =
-    try {
-      TimestampFormatter(
-        constFormat.toString,
-        zoneId,
-        legacyFormat = SIMPLE_DATE_FORMAT,
+  private lazy val formatter: Option[TimestampFormatter] =
+    if (right.foldable) {
+      Option(right.eval()).map { format =>
+        TimestampFormatter(format.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT,
          needVarLengthSecondFraction = false)
-    } catch {
-      case e: SparkUpgradeException => throw e
-      case NonFatal(_) => null
      }
+    } else None
 
-  override def eval(input: InternalRow): Any = {
-    val time = left.eval(input)
-    if (time == null) {
-      null
-    } else {
-      if (format.foldable) {
-        if (constFormat == null || formatter == null) {
-          null
-        } else {
-          try {
-            UTF8String.fromString(formatter.format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
-          } catch {
-            case e: SparkUpgradeException => throw e
-            case NonFatal(_) => null
-          }
-        }
-      } else {
-        val f = format.eval(input)
-        if (f == null) {
-          null
-        } else {
-          try {
-            UTF8String.fromString(
-              TimestampFormatter(
-                f.toString,
-                zoneId,
-                legacyFormat = SIMPLE_DATE_FORMAT,
-                needVarLengthSecondFraction = false)
-                .format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
-          } catch {
-            case e: SparkUpgradeException => throw e
-            case NonFatal(_) => null
-          }
-        }
-      }
-    }
+  override def nullSafeEval(seconds: Any, format: Any): Any = {
+    val ft = formatter.getOrElse(TimestampFormatter(format.toString, zoneId,
+      legacyFormat = SIMPLE_DATE_FORMAT, needVarLengthSecondFraction = false))
+    UTF8String.fromString(ft.format(seconds.asInstanceOf[Long] * MICROS_PER_SECOND))
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val df = classOf[TimestampFormatter].getName
-    if (format.foldable) {
-      if (formatter == null) {
-        ExprCode.forNullValue(StringType)
-      } else {
-        val formatterName = ctx.addReferenceObj("formatter", formatter, df)
-        val t = left.genCode(ctx)
-        ev.copy(code = code"""
-          ${t.code}
-          boolean ${ev.isNull} = ${t.isNull};
-          ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)};
-          if (!${ev.isNull}) {
-            try {
-              ${ev.value} = UTF8String.fromString($formatterName.format(${t.value} * 1000000L));
-            } catch (java.lang.IllegalArgumentException e) {
-              ${ev.isNull} = true;
-            }
-          }""")
-      }
-    } else {
-      val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName)
+    formatter.map { f =>
+      val formatterName = ctx.addReferenceObj("formatter", f)
+      defineCodeGen(ctx, ev, (seconds, _) =>
+        s"UTF8String.fromString($formatterName.format($seconds * 1000000L))")
+    }.getOrElse {
       val tf = TimestampFormatter.getClass.getName.stripSuffix("$")
      val ldf = LegacyDateFormats.getClass.getName.stripSuffix("$")
-      nullSafeCodeGen(ctx, ev, (seconds, f) => {
+      val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName)
+      defineCodeGen(ctx, ev, (seconds, format) =>
        s"""
-          try {
-            ${ev.value} = UTF8String.fromString(
-              $tf$$.MODULE$$.apply($f.toString(), $zid, $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT(), false)
-              .format($seconds * 1000000L));
-          } catch (java.lang.IllegalArgumentException e) {
-            ${ev.isNull} = true;
-          }"""
-      })
+           |UTF8String.fromString(
+           |  $tf$$.MODULE$$.apply($format.toString(),
+           |    $zid,
+           |    $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT(),
+           |    false)
+           |  .format($seconds * 1000000L))
+           |""".stripMargin)
     }
   }
 }
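In short, the rewrite above replaces the hand-rolled null/exception plumbing with the expression framework's defaults: when the format is foldable, the TimestampFormatter is built once (and an invalid constant pattern now fails at that point instead of degrading to a null formatter), while non-constant formats get a formatter per row; nullSafeEval and defineCodeGen supply the null checks the old eval/doGenCode wrote out by hand. A minimal standalone sketch of the same build-once-if-constant idea, in plain java.time with illustrative names (not Spark's API):

import java.time.{Instant, ZoneId}
import java.time.format.DateTimeFormatter

// Hypothetical demo class: caches the formatter when the pattern is known
// up front, builds one per call otherwise. Invalid patterns throw
// IllegalArgumentException in both paths -- nothing is swallowed into null.
final class EpochFormatter(constPattern: Option[String], zone: ZoneId) {
  private lazy val cached: Option[DateTimeFormatter] =
    constPattern.map(p => DateTimeFormatter.ofPattern(p).withZone(zone))

  def format(seconds: Long, pattern: String): String = {
    val f = cached.getOrElse(DateTimeFormatter.ofPattern(pattern).withZone(zone))
    f.format(Instant.ofEpochSecond(seconds))
  }
}

// Constant pattern: built once, reused for every row.
val fixed = new EpochFormatter(Some("yyyy-MM-dd"), ZoneId.of("UTC"))
assert(fixed.format(0L, "ignored") == "1970-01-01")

// Per-row pattern: an invalid one now surfaces as an exception.
val dynamic = new EpochFormatter(None, ZoneId.of("UTC"))
// dynamic.format(0L, "invalid")  // throws IllegalArgumentException

The design choice mirrors the diff: fail fast on a constant bad pattern rather than hiding errors behind a per-row try/catch that turns them into NULLs.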

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
Lines changed: 34 additions & 3 deletions

@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.time.{Instant, LocalDate, ZoneId}
+import java.time.format.DateTimeFormatter
 import java.util.{Calendar, Locale, TimeZone}
 import java.util.concurrent.TimeUnit._
 
@@ -777,8 +778,6 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       FromUnixTime(Literal(1000L), Literal.create(null, StringType), timeZoneId),
       null)
-    checkEvaluation(
-      FromUnixTime(Literal(0L), Literal("not a valid format"), timeZoneId), null)
 
     // SPARK-28072 The codegen path for non-literal input should also work
     checkEvaluation(
@@ -792,7 +791,39 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       }
     }
     // Test escaping of format
-    GenerateUnsafeProjection.generate(FromUnixTime(Literal(0L), Literal("\"quote")) :: Nil)
+    val e = FromUnixTime(Literal(0L), Literal("\""))
+    GenerateUnsafeProjection.generate(e.withTimeZone(conf.sessionLocalTimeZone) :: Nil)
+  }
+
+  test("from_unixtime with invalid datetime pattern") {
+    val invalidForBoth = Seq("A", "c", "n", "e", "n", "p")
+    val invalidForNew = Seq("MMMMM", "GGGGG")
+
+    invalidForBoth.foreach { format =>
+      Seq("exception", "legacy", "corrected").foreach { policy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
+          checkExceptionInExpression[IllegalArgumentException](
+            FromUnixTime(Literal(0L), Literal(format)), s"${format.head}")
+        }
+      }
+    }
+
+    invalidForNew.foreach { format =>
+      Seq("exception", "corrected").foreach { policy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
+          checkExceptionInExpression[SparkUpgradeException](
+            FromUnixTime(Literal(0L), Literal(format)), s"${format.head}")
+        }
+      }
+    }
+
+    invalidForNew.foreach { format =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "legacy") {
+        checkEvaluation(
+          FromUnixTime(Literal(0L), Literal(format)),
+          new SimpleDateFormat(format).format(new Date(0)))
+      }
+    }
   }
 
   test("unix_timestamp") {

sql/core/src/test/resources/sql-tests/inputs/datetime.sql
Lines changed: 4 additions & 0 deletions

@@ -160,3 +160,7 @@ select from_json('{"time":"26/October/2015"}', 'time Timestamp', map('timestampF
 select from_json('{"date":"26/October/2015"}', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy'));
 select from_csv('26/October/2015', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy'));
 select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy'));
+
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b);
+select from_unixtime(12345, 'invalid');
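The two queries added above feed the three golden files below (ANSI, legacy, and default parser modes). The expected outputs are regenerated rather than hand-edited; by convention this is done by re-running SQLQueryTestSuite with the SPARK_GENERATE_GOLDEN_FILES=1 environment variable set (treat the exact invocation as an assumption; the suite's scaladoc documents the current form).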

sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
Lines changed: 26 additions & 3 deletions

@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 116
+-- Number of queries: 118
 
 
 -- !query
@@ -951,9 +951,10 @@ You may get a different result due to the upgrading of Spark 3.0: Fail to recogn
 -- !query
 select from_unixtime(54321, 'QQQQQ')
 -- !query schema
-struct<from_unixtime(CAST(54321 AS BIGINT), QQQQQ):string>
+struct<>
 -- !query output
-NULL
+java.lang.IllegalArgumentException
+Too many pattern letters: Q
 
 
 -- !query
@@ -999,3 +1000,25 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b)
+-- !query schema
+struct<from_unixtime(CAST(a AS BIGINT), b):string>
+-- !query output
+1970-01-01
+NULL
+NULL
+NULL
+NULL
+
+
+-- !query
+select from_unixtime(12345, 'invalid')
+-- !query schema
+struct<>
+-- !query output
+java.lang.IllegalArgumentException
+Illegal pattern character: n

sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
Lines changed: 26 additions & 3 deletions

@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 116
+-- Number of queries: 118
 
 
 -- !query
@@ -913,9 +913,10 @@ December
 -- !query
 select from_unixtime(54321, 'QQQQQ')
 -- !query schema
-struct<from_unixtime(CAST(54321 AS BIGINT), QQQQQ):string>
+struct<>
 -- !query output
-NULL
+java.lang.IllegalArgumentException
+Illegal pattern character 'Q'
 
 
 -- !query
@@ -956,3 +957,25 @@ select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy
 struct<from_csv(26/October/2015):struct<date:date>>
 -- !query output
 {"date":2015-10-26}
+
+
+-- !query
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b)
+-- !query schema
+struct<from_unixtime(CAST(a AS BIGINT), b):string>
+-- !query output
+1970-01-01
+NULL
+NULL
+NULL
+NULL
+
+
+-- !query
+select from_unixtime(12345, 'invalid')
+-- !query schema
+struct<>
+-- !query output
+java.lang.IllegalArgumentException
+Illegal pattern character 'i'

sql/core/src/test/resources/sql-tests/results/datetime.sql.out
Lines changed: 26 additions & 3 deletions

@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 116
+-- Number of queries: 118
 
 
 -- !query
@@ -923,9 +923,10 @@ You may get a different result due to the upgrading of Spark 3.0: Fail to recogn
 -- !query
 select from_unixtime(54321, 'QQQQQ')
 -- !query schema
-struct<from_unixtime(CAST(54321 AS BIGINT), QQQQQ):string>
+struct<>
 -- !query output
-NULL
+java.lang.IllegalArgumentException
+Too many pattern letters: Q
 
 
 -- !query
@@ -971,3 +972,25 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b)
+-- !query schema
+struct<from_unixtime(CAST(a AS BIGINT), b):string>
+-- !query output
+1970-01-01
+NULL
+NULL
+NULL
+NULL
+
+
+-- !query
+select from_unixtime(12345, 'invalid')
+-- !query schema
+struct<>
+-- !query output
+java.lang.IllegalArgumentException
+Illegal pattern character: n
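Note how the error message for the same bad pattern differs across the three result files: the default and ANSI modes go through the new DateTimeFormatter-based path ("Too many pattern letters: Q", "Illegal pattern character: n"), while the legacy golden file reflects SimpleDateFormat's wording ("Illegal pattern character 'Q'", "Illegal pattern character 'i'"). The failure behavior is now consistent; only the parser-specific message text varies.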
