diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 268cd191c171..04994a1b3fbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1033,6 +1033,44 @@ object DateTimeUtils { instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant) } + /** + * Rebases days since the epoch from an original to a target calendar, for instance + * from a hybrid (Julian + Gregorian) to Proleptic Gregorian calendar. + * + * It finds the latest switch day which is not greater than `days`, and adds the difference + * in days associated with the switch day to the given `days`. The function is based + * on linear search which starts from the most recent switch days. This allows performing + * fewer comparisons for modern dates. + * + * @param switchDays The days when difference in days between original and target + * calendar was changed. + * @param diffs The differences in days between calendars. + * @param days The number of days since the epoch 1970-01-01 to be rebased to the + * target calendar. + * @return The rebased day + */ + private def rebaseDays(switchDays: Array[Int], diffs: Array[Int], days: Int): Int = { + var i = switchDays.length - 1 + while (i >= 0 && days < switchDays(i)) { + i -= 1 + } + val rebased = days + diffs(if (i < 0) 0 else i) + rebased + } + + // The differences in days between Julian and Proleptic Gregorian dates. + // The diff at the index `i` is applicable for all days in the date interval: + // [julianGregDiffSwitchDay(i), julianGregDiffSwitchDay(i+1)) + private val julianGregDiffs = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0) + // The sorted days in Julian calendar when difference in days between Julian and + // Proleptic Gregorian calendars was changed. 
+ // The starting point is the `0001-01-01` (-719164 days since the epoch in + // Julian calendar). All dates before the starting point have the same difference + // of 2 days in Julian and Proleptic Gregorian calendars. + private val julianGregDiffSwitchDay = Array( + -719164, -682945, -646420, -609895, -536845, -500320, -463795, + -390745, -354220, -317695, -244645, -208120, -171595, -141427) + /** * Converts the given number of days since the epoch day 1970-01-01 to * a local date in Julian calendar, interprets the result as a local @@ -1043,25 +1081,22 @@ object DateTimeUtils { * @return The rebased number of days in Gregorian calendar. */ def rebaseJulianToGregorianDays(days: Int): Int = { - val utcCal = new Calendar.Builder() - // `gregory` is a hybrid calendar that supports both - // the Julian and Gregorian calendar systems - .setCalendarType("gregory") - .setTimeZone(TimeZoneUTC) - .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) - .build() - val localDate = LocalDate.of( - utcCal.get(Calendar.YEAR), - utcCal.get(Calendar.MONTH) + 1, - // The number of days will be added later to handle non-existing - // Julian dates in Proleptic Gregorian calendar. - // For example, 1000-02-29 exists in Julian calendar because 1000 - // is a leap year but it is not a leap year in Gregorian calendar. - 1) - .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1) - Math.toIntExact(localDate.toEpochDay) + rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days) } + // The differences in days between Proleptic Gregorian and Julian dates. + // The diff at the index `i` is applicable for all days in the date interval: + // [gregJulianDiffSwitchDay(i), gregJulianDiffSwitchDay(i+1)) + private val grepJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0) + // The sorted days in Proleptic Gregorian calendar when difference in days between + // Proleptic Gregorian and Julian was changed. 
+ // The starting point is the `0001-01-01` (-719162 days since the epoch in + // Proleptic Gregorian calendar). All dates before the starting point have the same + // difference of -2 days in Proleptic Gregorian and Julian calendars. + private val gregJulianDiffSwitchDay = Array( + -719162, -682944, -646420, -609896, -536847, -500323, -463799, + -390750, -354226, -317702, -244653, -208129, -171605, -141427) + /** * Rebasing days since the epoch to store the same number of days * as by Spark 2.4 and earlier versions. Spark 3.0 switched to @@ -1079,14 +1114,6 @@ object DateTimeUtils { * @return The rebased number of days since the epoch in Julian calendar. */ def rebaseGregorianToJulianDays(days: Int): Int = { - val localDate = LocalDate.ofEpochDay(days) - val utcCal = new Calendar.Builder() - // `gregory` is a hybrid calendar that supports both - // the Julian and Gregorian calendar systems - .setCalendarType("gregory") - .setTimeZone(TimeZoneUTC) - .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) - .build() - Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) + rebaseDays(gregJulianDiffSwitchDay, grepJulianDiffs, days) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 6a164311a447..268a978b0dfd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.time.{Instant, LocalDate, LocalDateTime, LocalTime, ZoneId} -import java.util.{Locale, TimeZone} +import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit import org.scalatest.Matchers @@ -765,4 +765,60 @@ class 
DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { } } } + + test("optimization of days rebasing - Gregorian to Julian") { + def refRebaseGregorianToJulianDays(days: Int): Int = { + val localDate = LocalDate.ofEpochDay(days) + val utcCal = new Calendar.Builder() + // `gregory` is a hybrid calendar that supports both + // the Julian and Gregorian calendar systems + .setCalendarType("gregory") + .setTimeZone(TimeZoneUTC) + .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) + .build() + Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) + } + + val start = localDateToDays(LocalDate.of(1, 1, 1)) + val end = localDateToDays(LocalDate.of(2030, 1, 1)) + + var days = start + while (days < end) { + assert(rebaseGregorianToJulianDays(days) === refRebaseGregorianToJulianDays(days)) + days += 1 + } + } + + test("optimization of days rebasing - Julian to Gregorian") { + def refRebaseJulianToGregorianDays(days: Int): Int = { + val utcCal = new Calendar.Builder() + // `gregory` is a hybrid calendar that supports both + // the Julian and Gregorian calendar systems + .setCalendarType("gregory") + .setTimeZone(TimeZoneUTC) + .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) + .build() + val localDate = LocalDate.of( + utcCal.get(Calendar.YEAR), + utcCal.get(Calendar.MONTH) + 1, + // The number of days will be added later to handle non-existing + // Julian dates in Proleptic Gregorian calendar. + // For example, 1000-02-29 exists in Julian calendar because 1000 + // is a leap year but it is not a leap year in Gregorian calendar. 
+ 1) + .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1) + Math.toIntExact(localDate.toEpochDay) + } + + val start = rebaseGregorianToJulianDays( + localDateToDays(LocalDate.of(1, 1, 1))) + val end = rebaseGregorianToJulianDays( + localDateToDays(LocalDate.of(2030, 1, 1))) + + var days = start + while (days < end) { + assert(rebaseJulianToGregorianDays(days) === refRebaseJulianToGregorianDays(days)) + days += 1 + } + } } diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index 52522f8f88c7..4fed51113912 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -2,52 +2,52 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 9272 9272 0 10.8 92.7 1.0X -before 1582, noop 9142 9142 0 10.9 91.4 1.0X -after 1582, rebase off 21841 21841 0 4.6 218.4 0.4X -after 1582, rebase on 58245 58245 0 1.7 582.4 0.2X -before 1582, rebase off 19813 19813 0 5.0 198.1 0.5X -before 1582, rebase on 63737 63737 0 1.6 637.4 0.1X +after 1582, noop 9304 9304 0 10.7 93.0 1.0X +before 1582, noop 9187 9187 0 10.9 91.9 1.0X +after 1582, rebase off 22054 22054 0 4.5 220.5 0.4X +after 1582, rebase on 20361 20361 0 4.9 203.6 0.5X +before 1582, rebase off 20286 20286 0 4.9 202.9 0.5X +before 1582, rebase on 22230 22230 0 4.5 222.3 0.4X -OpenJDK 64-Bit Server VM 
11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 13004 13063 67 7.7 130.0 1.0X -after 1582, vec off, rebase on 36224 36253 26 2.8 362.2 0.4X -after 1582, vec on, rebase off 3596 3654 54 27.8 36.0 3.6X -after 1582, vec on, rebase on 26144 26253 112 3.8 261.4 0.5X -before 1582, vec off, rebase off 12872 12914 51 7.8 128.7 1.0X -before 1582, vec off, rebase on 37762 37904 153 2.6 377.6 0.3X -before 1582, vec on, rebase off 3522 3592 94 28.4 35.2 3.7X -before 1582, vec on, rebase on 27580 27615 59 3.6 275.8 0.5X +after 1582, vec off, rebase off 12773 12866 129 7.8 127.7 1.0X +after 1582, vec off, rebase on 13063 13086 39 7.7 130.6 1.0X +after 1582, vec on, rebase off 3678 3719 61 27.2 36.8 3.5X +after 1582, vec on, rebase on 5078 5121 52 19.7 50.8 2.5X +before 1582, vec off, rebase off 12942 12972 42 7.7 129.4 1.0X +before 1582, vec off, rebase on 13866 13904 58 7.2 138.7 0.9X +before 1582, vec on, rebase off 3678 3711 43 27.2 36.8 3.5X +before 1582, vec on, rebase on 5621 5657 44 17.8 56.2 2.3X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 3113 3113 0 32.1 31.1 1.0X -before 1582, noop 3078 3078 0 32.5 30.8 1.0X -after 1582, rebase off 15749 15749 0 6.3 157.5 0.2X -after 1582, rebase 
on 69106 69106 0 1.4 691.1 0.0X -before 1582, rebase off 15967 15967 0 6.3 159.7 0.2X -before 1582, rebase on 76843 76843 0 1.3 768.4 0.0X +after 1582, noop 2983 2983 0 33.5 29.8 1.0X +before 1582, noop 2979 2979 0 33.6 29.8 1.0X +after 1582, rebase off 17452 17452 0 5.7 174.5 0.2X +after 1582, rebase on 70193 70193 0 1.4 701.9 0.0X +before 1582, rebase off 17784 17784 0 5.6 177.8 0.2X +before 1582, rebase on 83498 83498 0 1.2 835.0 0.0X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 15070 15172 94 6.6 150.7 1.0X -after 1582, vec off, rebase on 43748 43867 157 2.3 437.5 0.3X -after 1582, vec on, rebase off 4805 4859 60 20.8 48.1 3.1X -after 1582, vec on, rebase on 33960 34027 61 2.9 339.6 0.4X -before 1582, vec off, rebase off 15037 15071 52 6.7 150.4 1.0X -before 1582, vec off, rebase on 44590 44749 156 2.2 445.9 0.3X -before 1582, vec on, rebase off 4831 4852 30 20.7 48.3 3.1X -before 1582, vec on, rebase on 35460 35481 18 2.8 354.6 0.4X +after 1582, vec off, rebase off 15114 15151 32 6.6 151.1 1.0X +after 1582, vec off, rebase on 45804 45912 126 2.2 458.0 0.3X +after 1582, vec on, rebase off 4900 4947 56 20.4 49.0 3.1X +after 1582, vec on, rebase on 34599 34650 45 2.9 346.0 0.4X +before 1582, vec off, rebase off 15093 15174 70 6.6 150.9 1.0X +before 1582, vec off, rebase on 47367 47472 121 2.1 473.7 0.3X +before 1582, vec on, rebase off 4884 4952 80 20.5 48.8 3.1X +before 1582, vec on, rebase on 35831 35883 59 2.8 358.3 0.4X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt 
b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index c9320cfe660f..ee486276653f 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,52 +2,52 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 9472 9472 0 10.6 94.7 1.0X -before 1582, noop 9226 9226 0 10.8 92.3 1.0X -after 1582, rebase off 21201 21201 0 4.7 212.0 0.4X -after 1582, rebase on 56471 56471 0 1.8 564.7 0.2X -before 1582, rebase off 20179 20179 0 5.0 201.8 0.5X -before 1582, rebase on 65717 65717 0 1.5 657.2 0.1X +after 1582, noop 9582 9582 0 10.4 95.8 1.0X +before 1582, noop 9473 9473 0 10.6 94.7 1.0X +after 1582, rebase off 21431 21431 0 4.7 214.3 0.4X +after 1582, rebase on 22156 22156 0 4.5 221.6 0.4X +before 1582, rebase off 21399 21399 0 4.7 214.0 0.4X +before 1582, rebase on 22927 22927 0 4.4 229.3 0.4X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12294 12434 205 8.1 122.9 1.0X -after 1582, vec off, rebase on 36959 36967 12 2.7 369.6 0.3X -after 1582, 
vec on, rebase off 3644 3691 49 27.4 36.4 3.4X -after 1582, vec on, rebase on 26764 26852 92 3.7 267.6 0.5X -before 1582, vec off, rebase off 12830 12917 85 7.8 128.3 1.0X -before 1582, vec off, rebase on 38897 39053 229 2.6 389.0 0.3X -before 1582, vec on, rebase off 3638 3693 85 27.5 36.4 3.4X -before 1582, vec on, rebase on 28956 29007 44 3.5 289.6 0.4X +after 1582, vec off, rebase off 12637 12736 111 7.9 126.4 1.0X +after 1582, vec off, rebase on 13463 13531 61 7.4 134.6 0.9X +after 1582, vec on, rebase off 3693 3703 8 27.1 36.9 3.4X +after 1582, vec on, rebase on 5242 5252 9 19.1 52.4 2.4X +before 1582, vec off, rebase off 13055 13169 126 7.7 130.5 1.0X +before 1582, vec off, rebase on 14067 14270 185 7.1 140.7 0.9X +before 1582, vec on, rebase off 3697 3702 7 27.1 37.0 3.4X +before 1582, vec on, rebase on 6058 6097 34 16.5 60.6 2.1X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2952 2952 0 33.9 29.5 1.0X -before 1582, noop 2880 2880 0 34.7 28.8 1.0X -after 1582, rebase off 15928 15928 0 6.3 159.3 0.2X -after 1582, rebase on 82816 82816 0 1.2 828.2 0.0X -before 1582, rebase off 15988 15988 0 6.3 159.9 0.2X -before 1582, rebase on 92636 92636 0 1.1 926.4 0.0X +after 1582, noop 2713 2713 0 36.9 27.1 1.0X +before 1582, noop 2715 2715 0 36.8 27.2 1.0X +after 1582, rebase off 16768 16768 0 6.0 167.7 0.2X +after 1582, rebase on 82811 82811 0 1.2 828.1 0.0X +before 1582, rebase off 17052 17052 0 5.9 170.5 0.2X +before 1582, rebase on 95134 95134 0 1.1 951.3 0.0X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 
64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 14863 14917 78 6.7 148.6 1.0X -after 1582, vec off, rebase on 54819 54939 140 1.8 548.2 0.3X -after 1582, vec on, rebase off 4905 4941 32 20.4 49.0 3.0X -after 1582, vec on, rebase on 44914 45008 124 2.2 449.1 0.3X -before 1582, vec off, rebase off 14928 14970 48 6.7 149.3 1.0X -before 1582, vec off, rebase on 59752 59996 245 1.7 597.5 0.2X -before 1582, vec on, rebase off 4892 4916 33 20.4 48.9 3.0X -before 1582, vec on, rebase on 46854 46977 198 2.1 468.5 0.3X +after 1582, vec off, rebase off 15200 15321 194 6.6 152.0 1.0X +after 1582, vec off, rebase on 63160 63337 177 1.6 631.6 0.2X +after 1582, vec on, rebase off 4891 4928 43 20.4 48.9 3.1X +after 1582, vec on, rebase on 45474 45484 10 2.2 454.7 0.3X +before 1582, vec off, rebase off 15203 15330 110 6.6 152.0 1.0X +before 1582, vec off, rebase on 65588 65664 73 1.5 655.9 0.2X +before 1582, vec on, rebase off 4844 4916 105 20.6 48.4 3.1X +before 1582, vec on, rebase on 47815 47943 162 2.1 478.2 0.3X