From 65385a1bb51586bac8f160267a08a889faf253ef Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:11:36 +0300
Subject: [PATCH 01/20] Add SECONDS_PER_MONTH and SECONDS_PER_YEAR

---
 .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 34e8012106bb..75248b95eb0e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -59,6 +59,15 @@ object DateTimeUtils {
   final val MILLIS_PER_MINUTE: Long = 60 * MILLIS_PER_SECOND
   final val MILLIS_PER_HOUR: Long = 60 * MILLIS_PER_MINUTE
   final val MILLIS_PER_DAY: Long = SECONDS_PER_DAY * MILLIS_PER_SECOND
+  // The average year of the Gregorian calendar is 365.2425 days long, see
+  // https://en.wikipedia.org/wiki/Gregorian_calendar
+  // A leap year occurs every 4 years, except for years that are divisible by 100
+  // and not divisible by 400. So, the mean length of the Gregorian calendar year is:
+  // 1 mean year = (365 + 1/4 - 1/100 + 1/400) days = 365.2425 days
+  // The mean year length in seconds is:
+  // 60 * 60 * 24 * 365.2425 = 31556952.0 = 12 * 2629746
+  final val SECONDS_PER_MONTH: Int = 2629746
+  final val SECONDS_PER_YEAR: Int = 12 * SECONDS_PER_MONTH
   // number of days between 1.1.1970 and 1.1.2001
   final val to2001 = -11323

From 9b58059cd081c5ec9ea00e488f07033a9988eb95 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:17:53 +0300
Subject: [PATCH 02/20] Use SECONDS_PER_MONTH in monthsBetween

---
 .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 75248b95eb0e..18f860e4daed 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -628,8 +628,7 @@ object DateTimeUtils {
     val secondsInDay1 = MILLISECONDS.toSeconds(millis1 - daysToMillis(date1, timeZone))
     val secondsInDay2 = MILLISECONDS.toSeconds(millis2 - daysToMillis(date2, timeZone))
     val secondsDiff = (dayInMonth1 - dayInMonth2) * SECONDS_PER_DAY + secondsInDay1 - secondsInDay2
-    val secondsInMonth = DAYS.toSeconds(31)
-    val diff = monthDiff + secondsDiff / secondsInMonth.toDouble
+    val diff = monthDiff + secondsDiff / SECONDS_PER_MONTH.toDouble
     if (roundOff) {
       // rounding to 8 digits
       math.round(diff * 1e8) / 1e8

From b0f765a50ac0cbdea85364bbf2d2275aa4a48e1e Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:18:23 +0300
Subject: [PATCH 03/20] Fix expected values in DateTimeUtilsSuite

---
 .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index a9e3c9006a33..d2b6eb139852 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -385,8 +385,8 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers {
   test("monthsBetween") {
     val date1 = date(1997, 2, 28, 10, 30, 0)
     var date2 = date(1996, 10, 30)
-    assert(monthsBetween(date1, date2, true, TimeZoneUTC) === 3.94959677)
-    assert(monthsBetween(date1, date2, false, TimeZoneUTC) === 3.9495967741935485)
+    assert(monthsBetween(date1, date2, true, TimeZoneUTC) === 3.94866424)
+    assert(monthsBetween(date1, date2, false, TimeZoneUTC) === 3.9486642436189654)
     Seq(true, false).foreach { roundOff =>
       date2 = date(2000, 2, 28)
       assert(monthsBetween(date1, date2, roundOff, TimeZoneUTC) === -36)
@@ -399,8 +399,8 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers {
     val date3 = date(2000, 2, 28, 16, tz = TimeZonePST)
     val date4 = date(1997, 2, 28, 16, tz = TimeZonePST)
     assert(monthsBetween(date3, date4, true, TimeZonePST) === 36.0)
-    assert(monthsBetween(date3, date4, true, TimeZoneGMT) === 35.90322581)
-    assert(monthsBetween(date3, date4, false, TimeZoneGMT) === 35.903225806451616)
+    assert(monthsBetween(date3, date4, true, TimeZoneGMT) === 35.91993675)
+    assert(monthsBetween(date3, date4, false, TimeZoneGMT) === 35.919936754348136)
   }

   test("from UTC timestamp") {

From 10196454a3774b46863ca65bfd6673e77888bace Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:24:53 +0300
Subject: [PATCH 04/20] Fix expected values in DateExpressionsSuite

---
 .../spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index 1d109fad149b..70c5398a9aa6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -488,13 +488,13 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           Literal(new Timestamp(sdf.parse("1997-02-28 10:30:00").getTime)),
           Literal(new Timestamp(sdf.parse("1996-10-30 00:00:00").getTime)),
           Literal.TrueLiteral,
-          timeZoneId = timeZoneId), 3.94959677)
+          timeZoneId = timeZoneId), 3.94866424)
       checkEvaluation(
         MonthsBetween(
           Literal(new Timestamp(sdf.parse("1997-02-28 10:30:00").getTime)),
           Literal(new Timestamp(sdf.parse("1996-10-30 00:00:00").getTime)),
           Literal.FalseLiteral,
-          timeZoneId = timeZoneId), 3.9495967741935485)
+          timeZoneId = timeZoneId), 3.9486642436189654)
       Seq(Literal.FalseLiteral, Literal.TrueLiteral).
        foreach { roundOff =>
          checkEvaluation(

From 285af30af5a30855466aab0f9a0ecd70d7690798 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:33:45 +0300
Subject: [PATCH 05/20] Fix expected values in DateFunctionsSuite

---
 .../scala/org/apache/spark/sql/DateFunctionsSuite.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 99189a96b299..a8394283a3a7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -341,13 +341,13 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
     val s2 = "2015-10-01 00:00:00"
     val df = Seq((t1, d1, s1), (t2, d2, s2)).toDF("t", "d", "s")
     checkAnswer(df.select(months_between(col("t"), col("d"))), Seq(Row(-10.0), Row(7.0)))
-    checkAnswer(df.selectExpr("months_between(t, s)"), Seq(Row(0.5), Row(-0.5)))
-    checkAnswer(df.selectExpr("months_between(t, s, true)"), Seq(Row(0.5), Row(-0.5)))
+    checkAnswer(df.selectExpr("months_between(t, s)"), Seq(Row(0.5092507), Row(-0.4907493)))
+    checkAnswer(df.selectExpr("months_between(t, s, true)"), Seq(Row(0.5092507), Row(-0.4907493)))
     Seq(true, false).foreach { roundOff =>
       checkAnswer(df.select(months_between(col("t"), col("d"), roundOff)),
         Seq(Row(-10.0), Row(7.0)))
       checkAnswer(df.withColumn("r", lit(false)).selectExpr("months_between(t, s, r)"),
-        Seq(Row(0.5), Row(-0.5)))
+        Seq(Row(0.5092507032998624), Row(-0.49074929670013756)))
     }
   }

From f59e0062e707c7263989d069c9c6980b15c9a9f2 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:39:52 +0300
Subject: [PATCH 06/20] Use SECONDS_PER_MONTH in GroupStateImpl

---
 .../apache/spark/sql/execution/streaming/GroupStateImpl.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala
index dda9d41f630e..f861d7afd23e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala
@@ -21,6 +21,7 @@ import java.sql.Date
 import java.util.concurrent.TimeUnit

 import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout}
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.streaming.GroupStateImpl._
 import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}
 import org.apache.spark.unsafe.types.CalendarInterval
@@ -164,7 +165,7 @@ private[sql] class GroupStateImpl[S] private(
       throw new IllegalArgumentException(s"Provided duration ($duration) is not positive")
     }

-    val millisPerMonth = TimeUnit.MICROSECONDS.toMillis(CalendarInterval.MICROS_PER_DAY) * 31
+    val millisPerMonth = TimeUnit.SECONDS.toMillis(DateTimeUtils.SECONDS_PER_MONTH)
     cal.milliseconds + cal.months * millisPerMonth
   }

From e7c9920d3ab2cb905ee39d9c4e95367c2e01dcd7 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:50:36 +0300
Subject: [PATCH 07/20] Add MILLIS_PER_MONTH

---
 .../scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 18f860e4daed..a6a586da04e7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -68,6 +68,7 @@ object DateTimeUtils {
   // 60 * 60 * 24 * 365.2425 = 31556952.0 = 12 * 2629746
   final val SECONDS_PER_MONTH: Int = 2629746
   final val SECONDS_PER_YEAR: Int = 12 * SECONDS_PER_MONTH
+  final val MILLIS_PER_MONTH: Long = SECONDS_PER_MONTH * MILLIS_PER_SECOND
   // number of days between 1.1.1970 and 1.1.2001
   final val to2001 = -11323

From 71dc2c0ce1d7118c4d8e8ed0b3dfca19a4203ab1 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:51:11 +0300
Subject: [PATCH 08/20] Use MILLIS_PER_MONTH in EventTimeWatermark

---
 .../sql/catalyst/plans/logical/EventTimeWatermark.scala | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala
index 8441c2c481ec..2309aa42b80c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala
@@ -17,9 +17,8 @@

 package org.apache.spark.sql.catalyst.plans.logical

-import java.util.concurrent.TimeUnit
-
 import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.MILLIS_PER_MONTH
 import org.apache.spark.sql.types.MetadataBuilder
 import org.apache.spark.unsafe.types.CalendarInterval

@@ -28,9 +27,7 @@ object EventTimeWatermark {
   val delayKey = "spark.watermarkDelayMs"

   def getDelayMs(delay: CalendarInterval): Long = {
-    // We define month as `31 days` to simplify calculation.
-    val millisPerMonth = TimeUnit.MICROSECONDS.toMillis(CalendarInterval.MICROS_PER_DAY) * 31
-    delay.milliseconds + delay.months * millisPerMonth
+    delay.milliseconds + delay.months * MILLIS_PER_MONTH
   }
 }

From e97f4198da7d7ac243fed06bfe211315106adfa3 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 12:51:26 +0300
Subject: [PATCH 09/20] Use MILLIS_PER_MONTH in GroupStateImpl

---
 .../spark/sql/execution/streaming/GroupStateImpl.scala | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala
index f861d7afd23e..f459a2c1f8e2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala
@@ -18,10 +18,9 @@
 package org.apache.spark.sql.execution.streaming

 import java.sql.Date
-import java.util.concurrent.TimeUnit

 import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout}
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.MILLIS_PER_MONTH
 import org.apache.spark.sql.execution.streaming.GroupStateImpl._
 import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}
 import org.apache.spark.unsafe.types.CalendarInterval
@@ -165,8 +164,7 @@ private[sql] class GroupStateImpl[S] private(
       throw new IllegalArgumentException(s"Provided duration ($duration) is not positive")
     }

-    val millisPerMonth = TimeUnit.SECONDS.toMillis(DateTimeUtils.SECONDS_PER_MONTH)
-    cal.milliseconds + cal.months * millisPerMonth
+    cal.milliseconds + cal.months * MILLIS_PER_MONTH
   }

   private def checkTimeoutTimestampAllowed(): Unit = {

From 6125a6b673c0388e332d0d4c8665a9e5885ebade Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 13:10:47 +0300
Subject: [PATCH 10/20] Remove SECONDS_PER_YEAR

---
 .../scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index a6a586da04e7..f8afd23ea7af 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -67,7 +67,6 @@ object DateTimeUtils {
   // The mean year length in seconds is:
   // 60 * 60 * 24 * 365.2425 = 31556952.0 = 12 * 2629746
   final val SECONDS_PER_MONTH: Int = 2629746
-  final val SECONDS_PER_YEAR: Int = 12 * SECONDS_PER_MONTH
   final val MILLIS_PER_MONTH: Long = SECONDS_PER_MONTH * MILLIS_PER_SECOND
   // number of days between 1.1.1970 and 1.1.2001
   final val to2001 = -11323

From 7e30eced7a4fcf50303c31025bc820790e9d74fe Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 13:47:57 +0300
Subject: [PATCH 11/20] Fix examples of months_between

---
 .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index feeb8d2e9b29..8ac7c6c1140b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -1273,9 +1273,9 @@ case class AddMonths(startDate: Expression, numMonths: Expression)
   examples = """
     Examples:
       > SELECT _FUNC_('1997-02-28 10:30:00', '1996-10-30');
-       3.94959677
+       3.94866424
       > SELECT _FUNC_('1997-02-28 10:30:00', '1996-10-30', false);
-       3.9495967741935485
+       3.9486642436189654
   """,
   since = "1.5.0")
 // scalastyle:on line.size.limit

From b4ecba4c09fe471df9dca54a2a4dbef44ad920ac Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 18:10:21 +0300
Subject: [PATCH 12/20] Update expected values in pyspark.sql.functions

---
 python/pyspark/sql/functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 95b78175d556..ba3fb12ffda1 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1122,9 +1122,9 @@ def months_between(date1, date2, roundOff=True):

     >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
     >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
-    [Row(months=3.94959677)]
+    [Row(months=3.94866424)]
     >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect()
-    [Row(months=3.9495967741935485)]
+    [Row(months=3.9486642436189654)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.months_between(

From f4441237fa7c00151f70e40d678d23d8dcb3e2de Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Wed, 2 Oct 2019 18:36:30 +0300
Subject: [PATCH 13/20] Update expected values in test_sparkSQL.R

---
 R/pkg/tests/fulltests/test_sparkSQL.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 035525a7a849..7dfde3cd0737 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1444,9 +1444,9 @@ test_that("column functions", {
   df <- createDataFrame(list(list(a = as.Date("1997-02-28"), b = as.Date("1996-10-30"))))
   result1 <- collect(select(df, alias(months_between(df[[1]], df[[2]]), "month")))[[1]]
-  expect_equal(result1, 3.93548387)
+  expect_equal(result1, 3.93429023)
   result2 <- collect(select(df, alias(months_between(df[[1]], df[[2]], FALSE), "month")))[[1]]
-  expect_equal(result2, 3.935483870967742)
+  expect_equal(result2, 3.934290231832276)

   # Test array_contains(), array_max(), array_min(), array_position(), element_at() and reverse()
   df <- createDataFrame(list(list(list(1L, 2L, 3L)), list(list(6L, 5L, 4L))))

From 2e33203dfac126c70ada3082478e6f9f9ac1dfe8 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:10:27 +0300
Subject: [PATCH 14/20] Revert "Update expected values in test_sparkSQL.R"

This reverts commit f4441237fa7c00151f70e40d678d23d8dcb3e2de.
---
 R/pkg/tests/fulltests/test_sparkSQL.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 7dfde3cd0737..035525a7a849 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1444,9 +1444,9 @@ test_that("column functions", {
   df <- createDataFrame(list(list(a = as.Date("1997-02-28"), b = as.Date("1996-10-30"))))
   result1 <- collect(select(df, alias(months_between(df[[1]], df[[2]]), "month")))[[1]]
-  expect_equal(result1, 3.93429023)
+  expect_equal(result1, 3.93548387)
   result2 <- collect(select(df, alias(months_between(df[[1]], df[[2]], FALSE), "month")))[[1]]
-  expect_equal(result2, 3.934290231832276)
+  expect_equal(result2, 3.935483870967742)

   # Test array_contains(), array_max(), array_min(), array_position(), element_at() and reverse()
   df <- createDataFrame(list(list(list(1L, 2L, 3L)), list(list(6L, 5L, 4L))))

From 00a1988207f298f74047216d433d4c4d7109e7a4 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:10:40 +0300
Subject: [PATCH 15/20] Revert "Update expected values in pyspark.sql.functions"

This reverts commit b4ecba4c09fe471df9dca54a2a4dbef44ad920ac.

---
 python/pyspark/sql/functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index ba3fb12ffda1..95b78175d556 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1122,9 +1122,9 @@ def months_between(date1, date2, roundOff=True):

     >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
     >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
-    [Row(months=3.94866424)]
+    [Row(months=3.94959677)]
     >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect()
-    [Row(months=3.9486642436189654)]
+    [Row(months=3.9495967741935485)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.months_between(

From 5256ff46910e21ea7c32e55d8f78ee0e2666717c Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:10:54 +0300
Subject: [PATCH 16/20] Revert "Fix examples of months_between"

This reverts commit 7e30eced7a4fcf50303c31025bc820790e9d74fe.

---
 .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index 8ac7c6c1140b..feeb8d2e9b29 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -1273,9 +1273,9 @@ case class AddMonths(startDate: Expression, numMonths: Expression)
   examples = """
     Examples:
       > SELECT _FUNC_('1997-02-28 10:30:00', '1996-10-30');
-       3.94866424
+       3.94959677
       > SELECT _FUNC_('1997-02-28 10:30:00', '1996-10-30', false);
-       3.9486642436189654
+       3.9495967741935485
   """,
   since = "1.5.0")
 // scalastyle:on line.size.limit

From c59443f1845e339920f08b0c34480b0db038d99d Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:12:02 +0300
Subject: [PATCH 17/20] Revert "Fix expected values in DateFunctionsSuite"

This reverts commit 285af30af5a30855466aab0f9a0ecd70d7690798.
---
 .../scala/org/apache/spark/sql/DateFunctionsSuite.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index a8394283a3a7..99189a96b299 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -341,13 +341,13 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
     val s2 = "2015-10-01 00:00:00"
     val df = Seq((t1, d1, s1), (t2, d2, s2)).toDF("t", "d", "s")
     checkAnswer(df.select(months_between(col("t"), col("d"))), Seq(Row(-10.0), Row(7.0)))
-    checkAnswer(df.selectExpr("months_between(t, s)"), Seq(Row(0.5092507), Row(-0.4907493)))
-    checkAnswer(df.selectExpr("months_between(t, s, true)"), Seq(Row(0.5092507), Row(-0.4907493)))
+    checkAnswer(df.selectExpr("months_between(t, s)"), Seq(Row(0.5), Row(-0.5)))
+    checkAnswer(df.selectExpr("months_between(t, s, true)"), Seq(Row(0.5), Row(-0.5)))
     Seq(true, false).foreach { roundOff =>
       checkAnswer(df.select(months_between(col("t"), col("d"), roundOff)),
         Seq(Row(-10.0), Row(7.0)))
       checkAnswer(df.withColumn("r", lit(false)).selectExpr("months_between(t, s, r)"),
-        Seq(Row(0.5092507032998624), Row(-0.49074929670013756)))
+        Seq(Row(0.5), Row(-0.5)))
     }
   }

From 93a680ba238fe489fa173fee4506c7a26fff7c11 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:12:24 +0300
Subject: [PATCH 18/20] Revert "Fix expected values in DateExpressionsSuite"

This reverts commit 10196454a3774b46863ca65bfd6673e77888bace.

---
 .../spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index 70c5398a9aa6..1d109fad149b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -488,13 +488,13 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           Literal(new Timestamp(sdf.parse("1997-02-28 10:30:00").getTime)),
           Literal(new Timestamp(sdf.parse("1996-10-30 00:00:00").getTime)),
           Literal.TrueLiteral,
-          timeZoneId = timeZoneId), 3.94866424)
+          timeZoneId = timeZoneId), 3.94959677)
       checkEvaluation(
         MonthsBetween(
           Literal(new Timestamp(sdf.parse("1997-02-28 10:30:00").getTime)),
          Literal(new Timestamp(sdf.parse("1996-10-30 00:00:00").getTime)),
           Literal.FalseLiteral,
-          timeZoneId = timeZoneId), 3.9486642436189654)
+          timeZoneId = timeZoneId), 3.9495967741935485)
       Seq(Literal.FalseLiteral, Literal.TrueLiteral).
        foreach { roundOff =>
          checkEvaluation(

From df2d97a1e36683e2a9ce99eaae758c09f3fef2a4 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:12:42 +0300
Subject: [PATCH 19/20] Revert "Fix expected values in DateTimeUtilsSuite"

This reverts commit b0f765a50ac0cbdea85364bbf2d2275aa4a48e1e.
---
 .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d2b6eb139852..a9e3c9006a33 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -385,8 +385,8 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers {
   test("monthsBetween") {
     val date1 = date(1997, 2, 28, 10, 30, 0)
     var date2 = date(1996, 10, 30)
-    assert(monthsBetween(date1, date2, true, TimeZoneUTC) === 3.94866424)
-    assert(monthsBetween(date1, date2, false, TimeZoneUTC) === 3.9486642436189654)
+    assert(monthsBetween(date1, date2, true, TimeZoneUTC) === 3.94959677)
+    assert(monthsBetween(date1, date2, false, TimeZoneUTC) === 3.9495967741935485)
     Seq(true, false).foreach { roundOff =>
       date2 = date(2000, 2, 28)
       assert(monthsBetween(date1, date2, roundOff, TimeZoneUTC) === -36)
@@ -399,8 +399,8 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers {
     val date3 = date(2000, 2, 28, 16, tz = TimeZonePST)
     val date4 = date(1997, 2, 28, 16, tz = TimeZonePST)
     assert(monthsBetween(date3, date4, true, TimeZonePST) === 36.0)
-    assert(monthsBetween(date3, date4, true, TimeZoneGMT) === 35.91993675)
-    assert(monthsBetween(date3, date4, false, TimeZoneGMT) === 35.919936754348136)
+    assert(monthsBetween(date3, date4, true, TimeZoneGMT) === 35.90322581)
+    assert(monthsBetween(date3, date4, false, TimeZoneGMT) === 35.903225806451616)
   }

   test("from UTC timestamp") {

From 9d78910c4b89855c734e8c24e5cdb27738831631 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Thu, 3 Oct 2019 20:13:05 +0300
Subject: [PATCH 20/20] Revert "Use SECONDS_PER_MONTH in monthsBetween"

This reverts commit 9b58059cd081c5ec9ea00e488f07033a9988eb95.

---
 .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index f8afd23ea7af..79fc45ec8947 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -628,7 +628,8 @@ object DateTimeUtils {
     val secondsInDay1 = MILLISECONDS.toSeconds(millis1 - daysToMillis(date1, timeZone))
     val secondsInDay2 = MILLISECONDS.toSeconds(millis2 - daysToMillis(date2, timeZone))
    val secondsDiff = (dayInMonth1 - dayInMonth2) * SECONDS_PER_DAY + secondsInDay1 - secondsInDay2
-    val diff = monthDiff + secondsDiff / SECONDS_PER_MONTH.toDouble
+    val secondsInMonth = DAYS.toSeconds(31)
+    val diff = monthDiff + secondsDiff / secondsInMonth.toDouble
     if (roundOff) {
       // rounding to 8 digits
       math.round(diff * 1e8) / 1e8
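
As a cross-check of the constant introduced in PATCH 01, here is a standalone Scala sketch (illustrative only; the object name is invented, and nothing below is Spark code) that derives SECONDS_PER_MONTH with exact integer arithmetic, using the fact that one 400-year Gregorian cycle contains 400 * 365 + 97 = 146097 days:

  object MeanMonthCheck extends App {
    // 97 leap days per 400 years: one every 4th year (100), minus the
    // 4 century years, plus the single year divisible by 400.
    val daysPer400Years = 400L * 365 + 100 - 4 + 1    // 146097
    val secondsPer400Years = daysPer400Years * 86400  // 60 * 60 * 24 seconds per day
    println(secondsPer400Years / 400)                 // 31556952, seconds per mean year
    println(secondsPer400Years / (400 * 12))          // 2629746, i.e. SECONDS_PER_MONTH
    println(secondsPer400Years / (400 * 12) * 1000)   // 2629746000, i.e. MILLIS_PER_MONTH
  }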
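
The new expected values in the test suites follow directly from the formula touched by PATCH 02. A minimal sketch (the fractional-month computation is reimplemented here outside Spark; monthDiff and secondsDiff are worked out by hand for the test dates date1 = 1997-02-28 10:30:00 UTC and date2 = 1996-10-30 00:00:00 UTC):

  object MonthsBetweenCheck extends App {
    val monthDiff = 4.0                     // whole months from 1996-10 to 1997-02
    val secondsDiff = (28 - 30) * 86400L +  // dayInMonth1 - dayInMonth2, in seconds
      (10 * 3600 + 30 * 60)                 // 10:30:00 into date1's day; date2 is midnight

    val oldDiff = monthDiff + secondsDiff / (31L * 86400).toDouble  // month as 31 days
    val newDiff = monthDiff + secondsDiff / 2629746.0               // mean Gregorian month

    println(math.round(oldDiff * 1e8) / 1e8)  // 3.94959677, the old expected value
    println(math.round(newDiff * 1e8) / 1e8)  // 3.94866424, the new expected value
    println(newDiff)                          // ~3.9486642436189654 (roundOff = false)
  }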
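
PATCH 08 and PATCH 09 also change how a month-valued delay or timeout is converted to milliseconds. A small sketch of the effect (the conversion below paraphrases getDelayMs for an interval of 1 month and 0 milliseconds; it does not import any Spark classes):

  object MonthDelayCheck extends App {
    val oldMillisPerMonth = 31L * 24 * 60 * 60 * 1000  // 2678400000, month defined as 31 days
    val newMillisPerMonth = 2629746L * 1000            // 2629746000, mean Gregorian month
    println(0L + 1 * oldMillisPerMonth)                // delay for "1 month" before the change
    println(0L + 1 * newMillisPerMonth)                // delay for "1 month" after the change
  }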