Commit f39dd48
[SPARK-31830][SQL] Consistent error handling for datetime formatting functions
1 parent 695cb61

7 files changed: +141 −89 lines

docs/sql-migration-guide.md
Lines changed: 2 additions & 0 deletions

@@ -27,6 +27,8 @@ license: |
 - In Spark 3.1, grouping_id() returns long values. In Spark version 3.0 and earlier, this function returns int values. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.integerGroupingId` to `true`.
 
 - In Spark 3.1, SQL UI data adopts the `formatted` mode for the query plan explain results. To restore the behavior before Spark 3.0, you can set `spark.sql.ui.explainMode` to `extended`.
+
+- In Spark 3.1, `from_unixtime` will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, it results in `NULL`.
 
 ## Upgrading from Spark SQL 2.4 to 3.0
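To make the migration note concrete, here is a sketch of the behavior change as seen from the DataFrame API (assumes a spark-shell session bound to `spark`; the exact exception text depends on the parser policy):

import org.apache.spark.sql.functions.{from_unixtime, lit}

// An invalid datetime pattern: letters like 'i' and 'n' are rejected.
val df = spark.range(1).select(from_unixtime(lit(12345L), "invalid"))

df.show()
// Spark 3.0 and earlier: prints a single NULL row.
// Spark 3.1 with this patch: throws
//   java.lang.IllegalArgumentException: Illegal pattern character: n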

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
Lines changed: 23 additions & 77 deletions

@@ -34,7 +34,6 @@ import org.apache.spark.sql.catalyst.util.{DateTimeUtils, LegacyDateFormats, Tim
 import org.apache.spark.sql.catalyst.util.DateTimeConstants._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils._
 import org.apache.spark.sql.catalyst.util.LegacyDateFormats.SIMPLE_DATE_FORMAT
-import org.apache.spark.sql.catalyst.util.toPrettySQL
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
@@ -1053,91 +1052,38 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
     copy(timeZoneId = Option(timeZoneId))
 
-  private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String]
-  private lazy val formatter: TimestampFormatter =
-    try {
-      TimestampFormatter(
-        constFormat.toString,
-        zoneId,
-        legacyFormat = SIMPLE_DATE_FORMAT,
+  private lazy val formatter: Option[TimestampFormatter] =
+    if (right.foldable) {
+      Option(right.eval()).map { format =>
+        TimestampFormatter(format.toString, zoneId, legacyFormat = SIMPLE_DATE_FORMAT,
          needVarLengthSecondFraction = false)
-    } catch {
-      case e: SparkUpgradeException => throw e
-      case NonFatal(_) => null
      }
+    } else None
 
-  override def eval(input: InternalRow): Any = {
-    val time = left.eval(input)
-    if (time == null) {
-      null
-    } else {
-      if (format.foldable) {
-        if (constFormat == null || formatter == null) {
-          null
-        } else {
-          try {
-            UTF8String.fromString(formatter.format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
-          } catch {
-            case e: SparkUpgradeException => throw e
-            case NonFatal(_) => null
-          }
-        }
-      } else {
-        val f = format.eval(input)
-        if (f == null) {
-          null
-        } else {
-          try {
-            UTF8String.fromString(
-              TimestampFormatter(
-                f.toString,
-                zoneId,
-                legacyFormat = SIMPLE_DATE_FORMAT,
-                needVarLengthSecondFraction = false)
-                .format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
-          } catch {
-            case e: SparkUpgradeException => throw e
-            case NonFatal(_) => null
-          }
-        }
-      }
-    }
+  override def nullSafeEval(seconds: Any, format: Any): Any = {
+    val ft = formatter.getOrElse(TimestampFormatter(format.toString, zoneId,
+      legacyFormat = SIMPLE_DATE_FORMAT, needVarLengthSecondFraction = false))
+    UTF8String.fromString(ft.format(seconds.asInstanceOf[Long] * MICROS_PER_SECOND))
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val df = classOf[TimestampFormatter].getName
-    if (format.foldable) {
-      if (formatter == null) {
-        ExprCode.forNullValue(StringType)
-      } else {
-        val formatterName = ctx.addReferenceObj("formatter", formatter, df)
-        val t = left.genCode(ctx)
-        ev.copy(code = code"""
-          ${t.code}
-          boolean ${ev.isNull} = ${t.isNull};
-          ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)};
-          if (!${ev.isNull}) {
-            try {
-              ${ev.value} = UTF8String.fromString($formatterName.format(${t.value} * 1000000L));
-            } catch (java.lang.IllegalArgumentException e) {
-              ${ev.isNull} = true;
-            }
-          }""")
-      }
-    } else {
-      val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName)
+    formatter.map { f =>
+      val formatterName = ctx.addReferenceObj("formatter", f)
+      defineCodeGen(ctx, ev, (seconds, _) =>
+        s"UTF8String.fromString($formatterName.format($seconds * 1000000L))")
+    }.getOrElse {
       val tf = TimestampFormatter.getClass.getName.stripSuffix("$")
      val ldf = LegacyDateFormats.getClass.getName.stripSuffix("$")
-      nullSafeCodeGen(ctx, ev, (seconds, f) => {
+      val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName)
+      defineCodeGen(ctx, ev, (seconds, format) =>
        s"""
-          try {
-            ${ev.value} = UTF8String.fromString(
-              $tf$$.MODULE$$.apply($f.toString(), $zid, $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT(), false)
-              .format($seconds * 1000000L));
-          } catch (java.lang.IllegalArgumentException e) {
-            ${ev.isNull} = true;
-          }"""
-      })
+           |UTF8String.fromString(
+           |  $tf$$.MODULE$$.apply($format.toString(),
+           |    $zid,
+           |    $ldf$$.MODULE$$.SIMPLE_DATE_FORMAT(),
+           |    false)
+           |  .format($seconds * 1000000L))
+           |""".stripMargin)
     }
   }
 }
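In short, the rewrite above replaces the hand-rolled null/exception plumbing with the expression framework's defaults: when the format is foldable, the TimestampFormatter is built once (and an invalid constant pattern now fails at that point instead of degrading to a null formatter), while non-constant formats get a formatter per row; nullSafeEval and defineCodeGen supply the null checks the old eval/doGenCode wrote out by hand. A minimal standalone sketch of the same build-once-if-constant idea, in plain java.time with illustrative names (not Spark's API):

import java.time.{Instant, ZoneId}
import java.time.format.DateTimeFormatter

// Hypothetical demo class: caches the formatter when the pattern is known
// up front, builds one per call otherwise. Invalid patterns throw
// IllegalArgumentException in both paths -- nothing is swallowed into null.
final class EpochFormatter(constPattern: Option[String], zone: ZoneId) {
  private lazy val cached: Option[DateTimeFormatter] =
    constPattern.map(p => DateTimeFormatter.ofPattern(p).withZone(zone))

  def format(seconds: Long, pattern: String): String = {
    val f = cached.getOrElse(DateTimeFormatter.ofPattern(pattern).withZone(zone))
    f.format(Instant.ofEpochSecond(seconds))
  }
}

// Constant pattern: built once, reused for every row.
val fixed = new EpochFormatter(Some("yyyy-MM-dd"), ZoneId.of("UTC"))
assert(fixed.format(0L, "ignored") == "1970-01-01")

// Per-row pattern: an invalid one now surfaces as an exception.
val dynamic = new EpochFormatter(None, ZoneId.of("UTC"))
// dynamic.format(0L, "invalid")  // throws IllegalArgumentException

The design choice mirrors the diff: fail fast on a constant bad pattern rather than hiding errors behind a per-row try/catch that turns them into NULLs.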

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
Lines changed: 34 additions & 3 deletions

@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.time.{Instant, LocalDate, ZoneId}
+import java.time.format.DateTimeFormatter
 import java.util.{Calendar, Locale, TimeZone}
 import java.util.concurrent.TimeUnit._
 
@@ -777,8 +778,6 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       FromUnixTime(Literal(1000L), Literal.create(null, StringType), timeZoneId),
       null)
-    checkEvaluation(
-      FromUnixTime(Literal(0L), Literal("not a valid format"), timeZoneId), null)
 
     // SPARK-28072 The codegen path for non-literal input should also work
     checkEvaluation(
@@ -792,7 +791,39 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       }
     }
     // Test escaping of format
-    GenerateUnsafeProjection.generate(FromUnixTime(Literal(0L), Literal("\"quote")) :: Nil)
+    val e = FromUnixTime(Literal(0L), Literal("\""))
+    GenerateUnsafeProjection.generate(e.withTimeZone(conf.sessionLocalTimeZone) :: Nil)
+  }
+
+  test("from_unixtime with invalid datetime pattern") {
+    val invalidForBoth = Seq("A", "c", "n", "e", "n", "p")
+    val invalidForNew = Seq("MMMMM", "GGGGG")
+
+    invalidForBoth.foreach { format =>
+      Seq("exception", "legacy", "corrected").foreach { policy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
+          checkExceptionInExpression[IllegalArgumentException](
+            FromUnixTime(Literal(0L), Literal(format)), s"${format.head}")
+        }
+      }
+    }
+
+    invalidForNew.foreach { format =>
+      Seq("exception", "corrected").foreach { policy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
+          checkExceptionInExpression[SparkUpgradeException](
+            FromUnixTime(Literal(0L), Literal(format)), s"${format.head}")
+        }
+      }
+    }
+
+    invalidForNew.foreach { format =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "legacy") {
+        checkEvaluation(
+          FromUnixTime(Literal(0L), Literal(format)),
+          new SimpleDateFormat(format).format(new Date(0)))
+      }
+    }
   }
 
   test("unix_timestamp") {

sql/core/src/test/resources/sql-tests/inputs/datetime.sql
Lines changed: 4 additions & 0 deletions

@@ -160,3 +160,7 @@ select from_json('{"time":"26/October/2015"}', 'time Timestamp', map('timestampF
 select from_json('{"date":"26/October/2015"}', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy'));
 select from_csv('26/October/2015', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy'));
 select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy'));
+
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b);
+select from_unixtime(12345, 'invalid');
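The two queries added above feed the three golden files below (ANSI, legacy, and default parser modes). The expected outputs are regenerated rather than hand-edited; by convention this is done by re-running SQLQueryTestSuite with the SPARK_GENERATE_GOLDEN_FILES=1 environment variable set (treat the exact invocation as an assumption; the suite's scaladoc documents the current form).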

sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
Lines changed: 26 additions & 3 deletions

@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 116
+-- Number of queries: 118
 
 
 -- !query
@@ -951,9 +951,10 @@ You may get a different result due to the upgrading of Spark 3.0: Fail to recogn
 -- !query
 select from_unixtime(54321, 'QQQQQ')
 -- !query schema
-struct<from_unixtime(CAST(54321 AS BIGINT), QQQQQ):string>
+struct<>
 -- !query output
-NULL
+java.lang.IllegalArgumentException
+Too many pattern letters: Q
 
 
 -- !query
@@ -999,3 +1000,25 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b)
+-- !query schema
+struct<from_unixtime(CAST(a AS BIGINT), b):string>
+-- !query output
+1970-01-01
+NULL
+NULL
+NULL
+NULL
+
+
+-- !query
+select from_unixtime(12345, 'invalid')
+-- !query schema
+struct<>
+-- !query output
+java.lang.IllegalArgumentException
+Illegal pattern character: n

sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
Lines changed: 26 additions & 3 deletions

@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 116
+-- Number of queries: 118
 
 
 -- !query
@@ -913,9 +913,10 @@ December
 -- !query
 select from_unixtime(54321, 'QQQQQ')
 -- !query schema
-struct<from_unixtime(CAST(54321 AS BIGINT), QQQQQ):string>
+struct<>
 -- !query output
-NULL
+java.lang.IllegalArgumentException
+Illegal pattern character 'Q'
 
 
 -- !query
@@ -956,3 +957,25 @@ select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy
 struct<from_csv(26/October/2015):struct<date:date>>
 -- !query output
 {"date":2015-10-26}
+
+
+-- !query
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b)
+-- !query schema
+struct<from_unixtime(CAST(a AS BIGINT), b):string>
+-- !query output
+1970-01-01
+NULL
+NULL
+NULL
+NULL
+
+
+-- !query
+select from_unixtime(12345, 'invalid')
+-- !query schema
+struct<>
+-- !query output
+java.lang.IllegalArgumentException
+Illegal pattern character 'i'

sql/core/src/test/resources/sql-tests/results/datetime.sql.out
Lines changed: 26 additions & 3 deletions

@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 116
+-- Number of queries: 118
 
 
 -- !query
@@ -923,9 +923,10 @@ You may get a different result due to the upgrading of Spark 3.0: Fail to recogn
 -- !query
 select from_unixtime(54321, 'QQQQQ')
 -- !query schema
-struct<from_unixtime(CAST(54321 AS BIGINT), QQQQQ):string>
+struct<>
 -- !query output
-NULL
+java.lang.IllegalArgumentException
+Too many pattern letters: Q
 
 
 -- !query
@@ -971,3 +972,25 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select from_unixtime(a, b) from
+values (null, null), (12345, null), (null, 'invalid'), (null, 'yyyy-MM-dd'), (67890, 'yyyy-MM-dd') t(a, b)
+-- !query schema
+struct<from_unixtime(CAST(a AS BIGINT), b):string>
+-- !query output
+1970-01-01
+NULL
+NULL
+NULL
+NULL
+
+
+-- !query
+select from_unixtime(12345, 'invalid')
+-- !query schema
+struct<>
+-- !query output
+java.lang.IllegalArgumentException
+Illegal pattern character: n
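Note how the error message for the same bad pattern differs across the three result files: the default and ANSI modes go through the new DateTimeFormatter-based path ("Too many pattern letters: Q", "Illegal pattern character: n"), while the legacy golden file reflects SimpleDateFormat's wording ("Illegal pattern character 'Q'", "Illegal pattern character 'i'"). The failure behavior is now consistent; only the parser-specific message text varies.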
