Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,44 @@ object DateTimeUtils {
instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant)
}

/**
* Rebases days since the epoch from an original to a target calendar, for instance
* from a hybrid (Julian + Gregorian) to Proleptic Gregorian calendar.
*
* It finds the latest switch day which is less than or equal to `days`, and adds the
* difference in days associated with that switch day to the given `days`. If `days` is
* earlier than the first switch day, the first difference is used. The function is based
* on a linear search which starts from the most recent switch days. This allows
* performing fewer comparisons for modern dates.
*
* @param switchDays The days when difference in days between original and target
* calendar was changed.
* @param diffs The differences in days between calendars.
* @param days The number of days since the epoch 1970-01-01 to be rebased to the
* target calendar.
* @return The rebased day
*/
private def rebaseDays(switchDays: Array[Int], diffs: Array[Int], days: Int): Int = {
  // Walk backwards from the most recent switch day until we reach one that is not
  // after `days`. The scan never goes below index 0, so every day that precedes the
  // first switch day is shifted by the first difference.
  var pos = switchDays.length - 1
  while (pos > 0 && days < switchDays(pos)) {
    pos -= 1
  }
  days + diffs(pos)
}

// The differences in days between Julian and Proleptic Gregorian dates.
// The diff at the index `i` is applicable for all days in the date interval:
// [julianGregDiffSwitchDay(i), julianGregDiffSwitchDay(i+1))
// The last diff applies to every day on or after the last switch day.
// This array is index-aligned with `julianGregDiffSwitchDay` (same length).
private val julianGregDiffs = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0)
// The sorted days in Julian calendar when difference in days between Julian and
// Proleptic Gregorian calendars was changed.
// The starting point is the `0001-01-01` (-719164 days since the epoch in
// Julian calendar). All dates before the starting point have the same difference
// of 2 days in Julian and Proleptic Gregorian calendars.
// NOTE(review): the last entry (-141427) is paired with a diff of 0, so days from
// that point on are returned unchanged; presumably it is the Gregorian cutover
// in 1582 - confirm.
private val julianGregDiffSwitchDay = Array(
-719164, -682945, -646420, -609895, -536845, -500320, -463795,
-390745, -354220, -317695, -244645, -208120, -171595, -141427)

/**
* Converts the given number of days since the epoch day 1970-01-01 to
* a local date in Julian calendar, interprets the result as a local
Expand All @@ -1043,25 +1081,22 @@ object DateTimeUtils {
* @return The rebased number of days in Gregorian calendar.
*/
def rebaseJulianToGregorianDays(days: Int): Int = {
  // The rebased value comes from a lookup in the precomputed tables: `rebaseDays`
  // picks the latest entry of `julianGregDiffSwitchDay` that is not after `days`
  // and adds the matching entry of `julianGregDiffs`.
  //
  // The former implementation built a hybrid `java.util.Calendar` and a `LocalDate`
  // for every call. Its result was no longer used (only the table lookup below was
  // returned), so that dead and costly computation has been dropped here.
  rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days)
}

// The differences in days between Proleptic Gregorian and Julian dates.
// The diff at the index `i` is applicable for all days in the date interval:
// [gregJulianDiffSwitchDay(i), gregJulianDiffSwitchDay(i+1))
// The last diff applies to every day on or after the last switch day.
// This array is index-aligned with `gregJulianDiffSwitchDay` (same length).
// NOTE(review): the name `grepJulianDiffs` looks like a typo for `gregJulianDiffs`;
// renaming it requires updating its use in `rebaseGregorianToJulianDays` as well.
private val grepJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0)
// The sorted days in Proleptic Gregorian calendar when difference in days between
// Proleptic Gregorian and Julian was changed.
// The starting point is the `0001-01-01` (-719162 days since the epoch in
// Proleptic Gregorian calendar). All dates before the starting point have the same
// difference of -2 days in Proleptic Gregorian and Julian calendars.
// NOTE(review): the last entry (-141427) is paired with a diff of 0, so days from
// that point on are returned unchanged; presumably it is the Gregorian cutover
// in 1582 - confirm.
private val gregJulianDiffSwitchDay = Array(
-719162, -682944, -646420, -609896, -536847, -500323, -463799,
-390750, -354226, -317702, -244653, -208129, -171605, -141427)

/**
* Rebasing days since the epoch to store the same number of days
* as by Spark 2.4 and earlier versions. Spark 3.0 switched to
Expand All @@ -1079,14 +1114,6 @@ object DateTimeUtils {
* @return The rebased number of days since the epoch in Julian calendar.
*/
def rebaseGregorianToJulianDays(days: Int): Int = {
  // The rebased value comes from a lookup in the precomputed tables: `rebaseDays`
  // picks the latest entry of `gregJulianDiffSwitchDay` that is not after `days`
  // and adds the matching entry of `grepJulianDiffs`.
  //
  // The former implementation converted `days` to a `LocalDate` and built a hybrid
  // `java.util.Calendar` for every call. Its result was no longer used (only the
  // table lookup below was returned), so that dead and costly computation has been
  // dropped here.
  rebaseDays(gregJulianDiffSwitchDay, grepJulianDiffs, days)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util
import java.sql.{Date, Timestamp}
import java.text.SimpleDateFormat
import java.time.{Instant, LocalDate, LocalDateTime, LocalTime, ZoneId}
import java.util.{Locale, TimeZone}
import java.util.{Calendar, Locale, TimeZone}
import java.util.concurrent.TimeUnit

import org.scalatest.Matchers
Expand Down Expand Up @@ -765,4 +765,60 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
}
}
}

test("optimization of days rebasing - Gregorian to Julian") {
  // Reference implementation: interprets the Proleptic Gregorian local date as a
  // date of the hybrid calendar and returns its day number since the epoch.
  def refRebaseGregorianToJulianDays(days: Int): Int = {
    val date = LocalDate.ofEpochDay(days)
    // `gregory` is a hybrid calendar that supports both
    // the Julian and Gregorian calendar systems
    val hybridCal = new Calendar.Builder()
      .setCalendarType("gregory")
      .setTimeZone(TimeZoneUTC)
      .setDate(date.getYear, date.getMonthValue - 1, date.getDayOfMonth)
      .build()
    val millis = hybridCal.getTimeInMillis
    Math.toIntExact(Math.floorDiv(millis, MILLIS_PER_DAY))
  }

  val firstDay = localDateToDays(LocalDate.of(1, 1, 1))
  val endDayExclusive = localDateToDays(LocalDate.of(2030, 1, 1))

  // Compare the table-based rebasing with the reference for every day in
  // [0001-01-01, 2030-01-01).
  (firstDay until endDayExclusive).foreach { day =>
    assert(rebaseGregorianToJulianDays(day) === refRebaseGregorianToJulianDays(day))
  }
}

test("optimization of days rebasing - Julian to Gregorian") {
  // Reference implementation: reads year/month/day of the given day number from the
  // hybrid calendar and re-interprets them as a Proleptic Gregorian local date.
  def refRebaseJulianToGregorianDays(days: Int): Int = {
    // `gregory` is a hybrid calendar that supports both
    // the Julian and Gregorian calendar systems
    val hybridCal = new Calendar.Builder()
      .setCalendarType("gregory")
      .setTimeZone(TimeZoneUTC)
      .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY))
      .build()
    val year = hybridCal.get(Calendar.YEAR)
    val month = hybridCal.get(Calendar.MONTH) + 1
    val dayOffset = hybridCal.get(Calendar.DAY_OF_MONTH) - 1
    // Start from the first day of the month and add the remaining days afterwards
    // to handle Julian dates that do not exist in Proleptic Gregorian calendar.
    // For example, 1000-02-29 exists in Julian calendar because 1000
    // is a leap year but it is not a leap year in Gregorian calendar.
    val firstOfMonth = LocalDate.of(year, month, 1)
    Math.toIntExact(firstOfMonth.plusDays(dayOffset).toEpochDay)
  }

  val firstDay = rebaseGregorianToJulianDays(localDateToDays(LocalDate.of(1, 1, 1)))
  val endDayExclusive = rebaseGregorianToJulianDays(localDateToDays(LocalDate.of(2030, 1, 1)))

  // Compare the table-based rebasing with the reference for every day of the range.
  (firstDay until endDayExclusive).foreach { day =>
    assert(rebaseJulianToGregorianDays(day) === refRebaseJulianToGregorianDays(day))
  }
}
}
64 changes: 32 additions & 32 deletions sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,52 @@
Rebasing dates/timestamps in Parquet datasource
================================================================================================

OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws
OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
after 1582, noop 9272 9272 0 10.8 92.7 1.0X
before 1582, noop 9142 9142 0 10.9 91.4 1.0X
after 1582, rebase off 21841 21841 0 4.6 218.4 0.4X
after 1582, rebase on 58245 58245 0 1.7 582.4 0.2X
before 1582, rebase off 19813 19813 0 5.0 198.1 0.5X
before 1582, rebase on 63737 63737 0 1.6 637.4 0.1X
after 1582, noop 9304 9304 0 10.7 93.0 1.0X
before 1582, noop 9187 9187 0 10.9 91.9 1.0X
after 1582, rebase off 22054 22054 0 4.5 220.5 0.4X
after 1582, rebase on 20361 20361 0 4.9 203.6 0.5X
before 1582, rebase off 20286 20286 0 4.9 202.9 0.5X
before 1582, rebase on 22230 22230 0 4.5 222.3 0.4X

OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws
OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
after 1582, vec off, rebase off 13004 13063 67 7.7 130.0 1.0X
after 1582, vec off, rebase on 36224 36253 26 2.8 362.2 0.4X
after 1582, vec on, rebase off 3596 3654 54 27.8 36.0 3.6X
after 1582, vec on, rebase on 26144 26253 112 3.8 261.4 0.5X
before 1582, vec off, rebase off 12872 12914 51 7.8 128.7 1.0X
before 1582, vec off, rebase on 37762 37904 153 2.6 377.6 0.3X
before 1582, vec on, rebase off 3522 3592 94 28.4 35.2 3.7X
before 1582, vec on, rebase on 27580 27615 59 3.6 275.8 0.5X
after 1582, vec off, rebase off 12773 12866 129 7.8 127.7 1.0X
after 1582, vec off, rebase on 13063 13086 39 7.7 130.6 1.0X
after 1582, vec on, rebase off 3678 3719 61 27.2 36.8 3.5X
after 1582, vec on, rebase on 5078 5121 52 19.7 50.8 2.5X
before 1582, vec off, rebase off 12942 12972 42 7.7 129.4 1.0X
before 1582, vec off, rebase on 13866 13904 58 7.2 138.7 0.9X
before 1582, vec on, rebase off 3678 3711 43 27.2 36.8 3.5X
before 1582, vec on, rebase on 5621 5657 44 17.8 56.2 2.3X

OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws
OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
after 1582, noop 3113 3113 0 32.1 31.1 1.0X
before 1582, noop 3078 3078 0 32.5 30.8 1.0X
after 1582, rebase off 15749 15749 0 6.3 157.5 0.2X
after 1582, rebase on 69106 69106 0 1.4 691.1 0.0X
before 1582, rebase off 15967 15967 0 6.3 159.7 0.2X
before 1582, rebase on 76843 76843 0 1.3 768.4 0.0X
after 1582, noop 2983 2983 0 33.5 29.8 1.0X
before 1582, noop 2979 2979 0 33.6 29.8 1.0X
after 1582, rebase off 17452 17452 0 5.7 174.5 0.2X
after 1582, rebase on 70193 70193 0 1.4 701.9 0.0X
before 1582, rebase off 17784 17784 0 5.6 177.8 0.2X
before 1582, rebase on 83498 83498 0 1.2 835.0 0.0X

OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws
OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
after 1582, vec off, rebase off 15070 15172 94 6.6 150.7 1.0X
after 1582, vec off, rebase on 43748 43867 157 2.3 437.5 0.3X
after 1582, vec on, rebase off 4805 4859 60 20.8 48.1 3.1X
after 1582, vec on, rebase on 33960 34027 61 2.9 339.6 0.4X
before 1582, vec off, rebase off 15037 15071 52 6.7 150.4 1.0X
before 1582, vec off, rebase on 44590 44749 156 2.2 445.9 0.3X
before 1582, vec on, rebase off 4831 4852 30 20.7 48.3 3.1X
before 1582, vec on, rebase on 35460 35481 18 2.8 354.6 0.4X
after 1582, vec off, rebase off 15114 15151 32 6.6 151.1 1.0X
after 1582, vec off, rebase on 45804 45912 126 2.2 458.0 0.3X
after 1582, vec on, rebase off 4900 4947 56 20.4 49.0 3.1X
after 1582, vec on, rebase on 34599 34650 45 2.9 346.0 0.4X
before 1582, vec off, rebase off 15093 15174 70 6.6 150.9 1.0X
before 1582, vec off, rebase on 47367 47472 121 2.1 473.7 0.3X
before 1582, vec on, rebase off 4884 4952 80 20.5 48.8 3.1X
before 1582, vec on, rebase on 35831 35883 59 2.8 358.3 0.4X


Loading