From ab0517dbb8935775bdb4026ba3c832b6bd534d1a Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Mon, 1 Jun 2020 16:38:46 +0800
Subject: [PATCH 1/3] [SPARK-31879][SQL] Use GB as the default Locale for datetime formatters

---
 .../spark/sql/catalyst/util/DateFormatter.scala       | 11 ++++++++++-
 .../spark/sql/catalyst/util/TimestampFormatter.scala  | 11 ++++++++++-
 .../src/test/resources/sql-tests/inputs/datetime.sql  |  2 ++
 .../resources/sql-tests/results/ansi/datetime.sql.out | 10 +++++++++-
 .../sql-tests/results/datetime-legacy.sql.out         | 10 +++++++++-
 .../test/resources/sql-tests/results/datetime.sql.out | 10 +++++++++-
 6 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index 06e1cdc27e7d5..a58d26cbe8e93 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -117,7 +117,16 @@ class LegacySimpleDateFormatter(pattern: String, locale: Locale) extends LegacyD
 object DateFormatter {
   import LegacyDateFormats._
 
-  val defaultLocale: Locale = Locale.US
+  /**
+   * This was changed from Locale.US to GB because:
+   * The first day-of-week varies by culture.
+   * For example, the US uses Sunday, while the United Kingdom and the ISO-8601 standard use Monday.
+   *
+   * Using `US` makes functions which rely on the Locale to express the first day of week
+   * inconsistent with Spark 2.4.
+   * See https://issues.apache.org/jira/browse/SPARK-31879
+   */
+  val defaultLocale: Locale = new Locale("en", "GB")
 
   val defaultPattern: String = "yyyy-MM-dd"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 3e302e2170390..ca567b6cc4169 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -278,7 +278,16 @@ object LegacyDateFormats extends Enumeration {
 object TimestampFormatter {
   import LegacyDateFormats._
 
-  val defaultLocale: Locale = Locale.US
+  /**
+   * This was changed from Locale.US to GB because:
+   * The first day-of-week varies by culture.
+   * For example, the US uses Sunday, while the United Kingdom and the ISO-8601 standard use Monday.
+   *
+   * Using `US` makes functions which rely on the Locale to express the first day of week
+   * inconsistent with Spark 2.4.
+   * See https://issues.apache.org/jira/browse/SPARK-31879
+   */
+  val defaultLocale: Locale = new Locale("en", "GB")
 
   def defaultPattern(): String = s"${DateFormatter.defaultPattern} HH:mm:ss"
 
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index 9bd936f6f441f..e7b3898334f4b 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -164,3 +164,5 @@ select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy
 select from_unixtime(1, 'yyyyyyyyyyy-MM-dd');
 select date_format(timestamp '2018-11-17 13:33:33', 'yyyyyyyyyy-MM-dd HH:mm:ss');
 select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd');
+
+select to_timestamp('2020-01-01', 'YYYY-ww-uu');
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
index ca04b008d6537..a9a3bccadce2d 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 119
+-- Number of queries: 120
 
 
 -- !query
@@ -1025,3 +1025,11 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyyyyyy-MM-dd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select to_timestamp('2020-01-01', 'YYYY-ww-uu')
+-- !query schema
+struct
+-- !query output
+2019-12-30 00:00:00
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
index fe932d3a706a8..414249d97fdfa 100644
--- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 119
+-- Number of queries: 120
 
 
 -- !query
@@ -980,3 +980,11 @@ select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd')
 struct
 -- !query output
 00000002018-11-17
+
+
+-- !query
+select to_timestamp('2020-01-01', 'YYYY-ww-uu')
+-- !query schema
+struct
+-- !query output
+2019-12-30 00:00:00
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
index 06a41da2671e6..fb374a07a1241 100755
--- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 119
+-- Number of queries: 120
 
 
 -- !query
@@ -997,3 +997,11 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyyyyyy-MM-dd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select to_timestamp('2020-01-01', 'YYYY-ww-uu')
+-- !query schema
+struct
+-- !query output
+2019-12-30 00:00:00
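
Note on the new golden result above: the first-day-of-week and week-of-year rules that turn `'2020-01-01'` into `2019-12-30 00:00:00` come straight from the JDK's locale data, so the behavior can be reproduced outside Spark with plain java.time. The following standalone sketch is illustrative (the object name `WeekLocaleDemo` is not part of the patch); it uses java.time's pattern letter `e`, the localized numeric day-of-week field that Spark's SimpleDateFormat-style `uu` is exercising here, since java.time's own `u` means year:

    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import java.time.temporal.WeekFields
    import java.util.Locale

    object WeekLocaleDemo {
      def main(args: Array[String]): Unit = {
        // The claim in the new comment: first day-of-week is locale data.
        // US weeks start on Sunday; en_GB (like ISO-8601) starts on Monday.
        println(WeekFields.of(Locale.US).getFirstDayOfWeek)              // SUNDAY
        println(WeekFields.of(new Locale("en", "GB")).getFirstDayOfWeek) // MONDAY
        println(WeekFields.ISO.getFirstDayOfWeek)                        // MONDAY

        // 'YYYY-ww-ee' parses week-based year 2020, week 01, day-of-week 01.
        // Under en_GB these fields resolve to the Monday that opens week 1
        // of 2020, i.e. 2019-12-30 -- the value in the golden files above.
        val gb = DateTimeFormatter.ofPattern("YYYY-ww-ee", new Locale("en", "GB"))
        println(LocalDate.parse("2020-01-01", gb)) // 2019-12-30

        // Under the old Locale.US default the same fields resolve to the
        // Sunday that opens the US-style week 1, a different day entirely.
        val us = DateTimeFormatter.ofPattern("YYYY-ww-ee", Locale.US)
        println(LocalDate.parse("2020-01-01", us)) // 2019-12-29
      }
    }

The one-day difference between the last two lines is exactly the Spark 2.4 incompatibility the commit message describes.
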
From 33392744c4c9f2ffebe8e05411908cea76de17cc Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Mon, 1 Jun 2020 22:17:09 +0800
Subject: [PATCH 2/3] comment

---
 .../spark/sql/catalyst/util/DateFormatter.scala      | 11 ++++-------
 .../spark/sql/catalyst/util/TimestampFormatter.scala | 11 ++++-------
 .../src/test/resources/sql-tests/inputs/datetime.sql |  1 +
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index a58d26cbe8e93..fe20e546f5d24 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -118,13 +118,10 @@ object DateFormatter {
   import LegacyDateFormats._
 
   /**
-   * This was changed from Locale.US to GB because:
-   * The first day-of-week varies by culture.
-   * For example, the US uses Sunday, while the United Kingdom and the ISO-8601 standard use Monday.
-   *
-   * Using `US` makes functions which rely on the Locale to express the first day of week
-   * inconsistent with Spark 2.4.
-   * See https://issues.apache.org/jira/browse/SPARK-31879
+   * Before Spark 3.0, the first day-of-week was always Monday. Since Spark 3.0, it depends on the
+   * locale.
+   * We pick GB as the default locale instead of US, to be compatible with Spark 2.x, as the US
+   * locale uses Sunday as the first day-of-week. See SPARK-31879.
    */
   val defaultLocale: Locale = new Locale("en", "GB")
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index ca567b6cc4169..1f14c70164c1a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -279,13 +279,10 @@ object TimestampFormatter {
   import LegacyDateFormats._
 
   /**
-   * This was changed from Locale.US to GB because:
-   * The first day-of-week varies by culture.
-   * For example, the US uses Sunday, while the United Kingdom and the ISO-8601 standard use Monday.
-   *
-   * Using `US` makes functions which rely on the Locale to express the first day of week
-   * inconsistent with Spark 2.4.
-   * See https://issues.apache.org/jira/browse/SPARK-31879
+   * Before Spark 3.0, the first day-of-week was always Monday. Since Spark 3.0, it depends on the
+   * locale.
+   * We pick GB as the default locale instead of US, to be compatible with Spark 2.x, as the US
+   * locale uses Sunday as the first day-of-week. See SPARK-31879.
    */
   val defaultLocale: Locale = new Locale("en", "GB")
 
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index e7b3898334f4b..626afa62fe74b 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -165,4 +165,5 @@ select from_unixtime(1, 'yyyyyyyyyyy-MM-dd');
 select date_format(timestamp '2018-11-17 13:33:33', 'yyyyyyyyyy-MM-dd HH:mm:ss');
 select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd');
 
+-- SPARK-31879: the first day of week
 select to_timestamp('2020-01-01', 'YYYY-ww-uu');
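
For readers who want to see the compatibility claim through the SQL API rather than the golden files, a minimal sketch (assuming a local build with this branch applied; the object name `Spark31879Demo` and the `local[1]` master are illustrative, not part of the patch):

    import org.apache.spark.sql.SparkSession

    object Spark31879Demo {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[1]")
          .appName("SPARK-31879")
          .getOrCreate()
        // Expected per the golden files above: 2019-12-30 00:00:00,
        // the same answer Spark 2.4 gave for this pattern.
        spark.sql("select to_timestamp('2020-01-01', 'YYYY-ww-uu')").show(truncate = false)
        spark.stop()
      }
    }
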
From f94c72be5edc5846b64bef8ed81a24a9109716c8 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 2 Jun 2020 21:03:51 +0800
Subject: [PATCH 3/3] use formatter

---
 .../test/resources/sql-tests/inputs/datetime.sql |  3 ++-
 .../sql-tests/results/ansi/datetime.sql.out      | 16 ++++++++++++----
 .../sql-tests/results/datetime-legacy.sql.out    | 16 ++++++++++++----
 .../resources/sql-tests/results/datetime.sql.out | 16 ++++++++++++----
 4 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index 626afa62fe74b..5636e0b670362 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -166,4 +166,5 @@ select date_format(timestamp '2018-11-17 13:33:33', 'yyyyyyyyyy-MM-dd HH:mm:ss')
 select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd');
 
 -- SPARK-31879: the first day of week
-select to_timestamp('2020-01-01', 'YYYY-ww-uu');
+select date_format('2020-01-01', 'YYYY-MM-dd uu');
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu');
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
index a9a3bccadce2d..3803460f3f083 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 120
+-- Number of queries: 121
 
 
 -- !query
@@ -1028,8 +1028,16 @@ You may get a different result due to the upgrading of Spark 3.0: Fail to recogn
 
 
 -- !query
-select to_timestamp('2020-01-01', 'YYYY-ww-uu')
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
 -- !query schema
-struct
+struct
 -- !query output
-2019-12-30 00:00:00
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct
+-- !query output
+2020-01-01 Wednesday
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
index 414249d97fdfa..99dd14d21e6fd 100644
--- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 120
+-- Number of queries: 121
 
 
 -- !query
@@ -983,8 +983,16 @@ select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd')
 
 
 -- !query
-select to_timestamp('2020-01-01', 'YYYY-ww-uu')
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
 -- !query schema
-struct
+struct
 -- !query output
-2019-12-30 00:00:00
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct
+-- !query output
+2020-01-01 0003
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
index fb374a07a1241..c8c568c736d76 100755
--- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 120
+-- Number of queries: 121
 
 
 -- !query
@@ -1000,8 +1000,16 @@ You may get a different result due to the upgrading of Spark 3.0: Fail to recogn
 
 
 -- !query
-select to_timestamp('2020-01-01', 'YYYY-ww-uu')
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
 -- !query schema
-struct
+struct
 -- !query output
-2019-12-30 00:00:00
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct
+-- !query output
+2020-01-01 Wednesday
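
The three golden files in PATCH 3/3 diverge on purpose, and the split falls along the new-formatter/legacy-formatter line: `uu` prints `03` and `uuuu` prints `Wednesday` under the java.time path, while the legacy path prints `0003`. A standalone sketch of why (the object name `DayOfWeekPatterns` is illustrative; the java.time calls spell the field `e`, the localized day-of-week that Spark's `u` maps to on the new path, because java.time's own `u` means year):

    import java.text.SimpleDateFormat
    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import java.util.Locale

    object DayOfWeekPatterns {
      def main(args: Array[String]): Unit = {
        val gb = new Locale("en", "GB")
        val wed = LocalDate.of(2020, 1, 1) // 2020-01-01 is a Wednesday

        // java.time: two pattern letters print the localized number
        // (Monday = 1 under en_GB), four letters print the full name.
        // Hence "03" and "Wednesday" in datetime.sql.out and
        // ansi/datetime.sql.out.
        println(wed.format(DateTimeFormatter.ofPattern("ee", gb)))   // 03
        println(wed.format(DateTimeFormatter.ofPattern("eeee", gb))) // Wednesday

        // Legacy SimpleDateFormat: 'u' is numeric-only (1 = Monday), so a
        // four-letter pattern merely zero-pads the number. Hence "0003"
        // in datetime-legacy.sql.out.
        val legacy = new SimpleDateFormat("uuuu", gb)
        val date = new SimpleDateFormat("yyyy-MM-dd", gb).parse("2020-01-01")
        println(legacy.format(date)) // 0003
      }
    }
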