From 839b0294bf3f54167009c63d22d6b5e82ca53be8 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 29 Mar 2020 22:01:59 +0300 Subject: [PATCH 01/13] Optimize greg to jul days rebasing --- .../sql/catalyst/util/DateTimeUtils.scala | 19 +++++++------- .../catalyst/util/DateTimeUtilsSuite.scala | 26 +++++++++++++++++-- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 268cd191c171..d9ae2bc5c95b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.time._ import java.time.temporal.{ChronoField, ChronoUnit, IsoFields} +import java.util import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit._ @@ -1062,6 +1063,11 @@ object DateTimeUtils { Math.toIntExact(localDate.toEpochDay) } + private val gregToJulDay = Array( + -719162, -682944, -646420, -609896, -536847, -500323, -463799, + -390750, -354226, -317702, -244653, -208129, -171605, -141427) + private val gregToJulDiff = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0) + /** * Rebasing days since the epoch to store the same number of days * as by Spark 2.4 and earlier versions. Spark 3.0 switched to @@ -1079,14 +1085,9 @@ object DateTimeUtils { * @return The rebased number of days since the epoch in Julian calendar. 
*/ def rebaseGregorianToJulianDays(days: Int): Int = { - val localDate = LocalDate.ofEpochDay(days) - val utcCal = new Calendar.Builder() - // `gregory` is a hybrid calendar that supports both - // the Julian and Gregorian calendar systems - .setCalendarType("gregory") - .setTimeZone(TimeZoneUTC) - .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) - .build() - Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) + val index = util.Arrays.binarySearch(gregToJulDay, days) + val diff = if (index >= 0) gregToJulDiff(index) else gregToJulDiff(-index - 2) + val rebased = days + diff + rebased } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 6a164311a447..562e9e844fda 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -20,11 +20,10 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.time.{Instant, LocalDate, LocalDateTime, LocalTime, ZoneId} -import java.util.{Locale, TimeZone} +import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit import org.scalatest.Matchers - import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.util.DateTimeConstants._ @@ -765,4 +764,27 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { } } } + + test("optimization of days rebasing - Gregorian to Julian") { + def refRebaseGregorianToJulianDays(days: Int): Int = { + val localDate = LocalDate.ofEpochDay(days) + val utcCal = new Calendar.Builder() + // `gregory` is a hybrid calendar that supports both + // the Julian and Gregorian calendar systems + 
.setCalendarType("gregory") + .setTimeZone(TimeZoneUTC) + .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) + .build() + Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) + } + + val start = localDateToDays(LocalDate.of(1, 1, 1)) + val end = localDateToDays(LocalDate.of(2030, 1, 1)) + + var days = start + while (days < end) { + assert(rebaseGregorianToJulianDays(days) === refRebaseGregorianToJulianDays(days)) + days += 1 + } + } } From cc4e3ecde313b66bf43de54f622e53336c3ef6be Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 29 Mar 2020 22:02:46 +0300 Subject: [PATCH 02/13] Fix imports --- .../org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 562e9e844fda..2d8de07c9a44 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -24,6 +24,7 @@ import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit import org.scalatest.Matchers + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.util.DateTimeConstants._ From 203fa5419e274b415111eb5f76b30655272582e0 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 29 Mar 2020 22:20:03 +0300 Subject: [PATCH 03/13] Optimize jul to greg days rebasing --- .../sql/catalyst/util/DateTimeUtils.scala | 35 ++++++--------- .../catalyst/util/DateTimeUtilsSuite.scala | 44 +++++++++++++++++-- 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index d9ae2bc5c95b..0a31d3be6408 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1034,6 +1034,18 @@ object DateTimeUtils { instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant) } + private def rebaseDays(daysArr: Array[Int], diffArr: Array[Int], days: Int): Int = { + val index = util.Arrays.binarySearch(daysArr, days) + val diff = if (index >= 0) diffArr(index) else diffArr(-index - 2) + val rebased = days + diff + rebased + } + + private val julToGregDay = Array( + -719164, -682945, -646420, -609895, -536845, -500320, -463795, + -390745, -354220, -317695, -244645, -208120, -171595, -141427) + private val julToGregDiff = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0) + /** * Converts the given number of days since the epoch day 1970-01-01 to * a local date in Julian calendar, interprets the result as a local @@ -1044,23 +1056,7 @@ object DateTimeUtils { * @return The rebased number of days in Gregorian calendar. */ def rebaseJulianToGregorianDays(days: Int): Int = { - val utcCal = new Calendar.Builder() - // `gregory` is a hybrid calendar that supports both - // the Julian and Gregorian calendar systems - .setCalendarType("gregory") - .setTimeZone(TimeZoneUTC) - .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) - .build() - val localDate = LocalDate.of( - utcCal.get(Calendar.YEAR), - utcCal.get(Calendar.MONTH) + 1, - // The number of days will be added later to handle non-existing - // Julian dates in Proleptic Gregorian calendar. - // For example, 1000-02-29 exists in Julian calendar because 1000 - // is a leap year but it is not a leap year in Gregorian calendar. 
- 1) - .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1) - Math.toIntExact(localDate.toEpochDay) + rebaseDays(julToGregDay, julToGregDiff, days) } private val gregToJulDay = Array( @@ -1085,9 +1081,6 @@ object DateTimeUtils { * @return The rebased number of days since the epoch in Julian calendar. */ def rebaseGregorianToJulianDays(days: Int): Int = { - val index = util.Arrays.binarySearch(gregToJulDay, days) - val diff = if (index >= 0) gregToJulDiff(index) else gregToJulDiff(-index - 2) - val rebased = days + diff - rebased + rebaseDays(gregToJulDay, gregToJulDiff, days) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 2d8de07c9a44..aff19ad57da2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -38,6 +38,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { test("nanoseconds truncation") { val tf = TimestampFormatter.getFractionFormatter(DateTimeUtils.defaultTimeZone.toZoneId) + def checkStringToTimestamp(originalTime: String, expectedParsedTime: String): Unit = { val parsedTimestampOp = DateTimeUtils.stringToTimestamp( UTF8String.fromString(originalTime), defaultZoneId) @@ -455,6 +456,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { assert(toJavaTimestamp(fromUTCTime(fromJavaTimestamp(Timestamp.valueOf(utc)), tz)).toString === expected) } + for (tz <- ALL_TIMEZONES) { withDefaultTimeZone(tz) { test("2011-12-25 09:00:00.123456", "UTC", "2011-12-25 09:00:00.123456") @@ -510,10 +512,10 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { test("truncTimestamp") { def testTrunc( - level: Int, - expected: String, - inputTS: SQLTimestamp, - zoneId: ZoneId = defaultZoneId): Unit = { + 
level: Int, + expected: String, + inputTS: SQLTimestamp, + zoneId: ZoneId = defaultZoneId): Unit = { val truncated = DateTimeUtils.truncTimestamp(inputTS, level, zoneId) val expectedTS = toTimestamp(expected, defaultZoneId) @@ -699,6 +701,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { val millisLocal = millisUtc + timeZone.getOffset(millisUtc) Math.floor(millisLocal.toDouble / MILLIS_PER_DAY).toInt } + private def fromJavaDateLegacy(date: Date): Int = { millisToDaysLegacy(date.getTime, defaultTimeZone()) } @@ -788,4 +791,37 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { days += 1 } } + + test("optimization of days rebasing - Julian to Gregorian") { + def refRebaseJulianToGregorianDays(days: Int): Int = { + val utcCal = new Calendar.Builder() + // `gregory` is a hybrid calendar that supports both + // the Julian and Gregorian calendar systems + .setCalendarType("gregory") + .setTimeZone(TimeZoneUTC) + .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) + .build() + val localDate = LocalDate.of( + utcCal.get(Calendar.YEAR), + utcCal.get(Calendar.MONTH) + 1, + // The number of days will be added later to handle non-existing + // Julian dates in Proleptic Gregorian calendar. + // For example, 1000-02-29 exists in Julian calendar because 1000 + // is a leap year but it is not a leap year in Gregorian calendar. 
+ 1) + .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1) + Math.toIntExact(localDate.toEpochDay) + } + + val start = rebaseGregorianToJulianDays( + localDateToDays(LocalDate.of(1, 1, 1))) + val end = rebaseGregorianToJulianDays( + localDateToDays(LocalDate.of(2030, 1, 1))) + + var days = start + while (days < end) { + assert(rebaseJulianToGregorianDays(days) === refRebaseJulianToGregorianDays(days)) + days += 1 + } + } } From 3aa88bca1fb0ccb10124ffa8f78c428a6ced752b Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 29 Mar 2020 22:29:38 +0300 Subject: [PATCH 04/13] Add benchmark for dates --- .../benchmark/DateTimeRebaseBenchmark.scala | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala new file mode 100644 index 000000000000..029b23ebe785 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneOffset} + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.catalyst.util.DateTimeConstants.SECONDS_PER_DAY +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for rebasing of date and timestamp in read/write. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/DateTimeRebaseBenchmark-results.txt". + * }}} + */ +object DateTimeRebaseBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + + private def genTs(cardinality: Int, start: LocalDateTime, end: LocalDateTime): DataFrame = { + val startSec = start.toEpochSecond(ZoneOffset.UTC) + val endSec = end.toEpochSecond(ZoneOffset.UTC) + spark.range(0, cardinality, 1, 1) + .select((($"id" % (endSec - startSec)) + startSec).as("seconds")) + .select($"seconds".cast("timestamp").as("ts")) + } + + private def genTsAfter1582(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(1582, 10, 15, 0, 0, 0) + val end = LocalDateTime.of(3000, 1, 1, 0, 0, 0) + genTs(cardinality, start, end) + } + + private def genTsBefore1582(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(10, 1, 1, 0, 0, 0) + val end = LocalDateTime.of(1580, 1, 1, 0, 0, 0) + genTs(cardinality, start, end) + } + + private def genDate(cardinality: Int, start: LocalDate, end: LocalDate): DataFrame = { + val startSec = LocalDateTime.of(start, LocalTime.MIDNIGHT).toEpochSecond(ZoneOffset.UTC) + val endSec = LocalDateTime.of(end, LocalTime.MIDNIGHT).toEpochSecond(ZoneOffset.UTC) + spark.range(0, cardinality * 
SECONDS_PER_DAY, SECONDS_PER_DAY, 1) + .select((($"id" % (endSec - startSec)) + startSec).as("seconds")) + .select($"seconds".cast("timestamp").as("ts")) + .select($"ts".cast("date").as("date")) + } + + private def genDateAfter1582(cardinality: Int): DataFrame = { + val start = LocalDate.of(1582, 10, 15) + val end = LocalDate.of(3000, 1, 1) + genDate(cardinality, start, end) + } + + private def genDateBefore1582(cardinality: Int): DataFrame = { + val start = LocalDate.of(10, 1, 1) + val end = LocalDate.of(1580, 1, 1) + genDate(cardinality, start, end) + } + + private def genDF(cardinality: Int, dateTime: String, after1582: Boolean): DataFrame = { + (dateTime, after1582) match { + case ("date", true) => genDateAfter1582(cardinality) + case ("date", false) => genDateBefore1582(cardinality) + case ("timestamp", true) => genTsAfter1582(cardinality) + case ("timestamp", false) => genTsBefore1582(cardinality) + case _ => throw new IllegalArgumentException( + s"cardinality = $cardinality dateTime = $dateTime after1582 = $after1582") + } + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + withTempPath { path => + runBenchmark("Parquet read/write") { + val rowsNum = 100000000 + Seq("date" /* , "timestamp" */ ).foreach { dateTime => + val benchmark = new Benchmark(s"Save ${dateTime}s to parquet", rowsNum, output = output) + benchmark.addCase("after 1582, noop", 1) { _ => + genDF(rowsNum, dateTime, after1582 = true).noop() + } + benchmark.addCase("before 1582, noop", 1) { _ => + genDF(rowsNum, dateTime, after1582 = false).noop() + } + + def save(after1582: Boolean, rebase: Boolean): Unit = { + val period = if (after1582) "after" else "before" + val rebaseFlag = if (rebase) "on" else "off" + val caseName = s"$period 1582, rebase $rebaseFlag" + benchmark.addCase(caseName, 1) { _ => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> rebase.toString) { + val df = genDF(rowsNum, dateTime, after1582) + val pathToWrite = path.getAbsolutePath + 
s"/${dateTime}_${period}_1582_$rebaseFlag" + df.write + .mode("overwrite") + .format("parquet") + .save(pathToWrite) + } + } + } + + Seq(true, false).foreach { after1582 => + Seq(false, true).foreach { rebase => + save(after1582, rebase) + } + } + benchmark.run() + + val benchmark2 = new Benchmark( + s"Load ${dateTime}s from parquet", rowsNum, output = output) + + def load(after1582: Boolean, vec: Boolean, rebase: Boolean): Unit = { + val period = if (after1582) "after" else "before" + val rebaseFlag = if (rebase) "on" else "off" + val vecFlag = if (vec) "on" else "off" + val caseName = s"$period 1582, vec $vecFlag, rebase $rebaseFlag" + benchmark2.addCase(caseName, 3) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString, + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> rebase.toString) { + val pathToRead = path.getAbsolutePath + s"/${dateTime}_${period}_1582_$rebaseFlag" + spark.read.format("parquet").load(pathToRead).noop() + } + } + } + + Seq(true, false).foreach { after1582 => + Seq(false, true).foreach { vec => + Seq(false, true).foreach { rebase => + load(after1582, vec, rebase) + } + } + } + + benchmark2.run() + } + } + } + } +} From 2dc5be4496726e2b3168cbffb68a73d79e5e012c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 29 Mar 2020 23:04:46 +0300 Subject: [PATCH 05/13] Linear search --- .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 0a31d3be6408..b6dabd92b564 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1035,9 +1035,11 @@ object DateTimeUtils { } private def rebaseDays(daysArr: Array[Int], diffArr: Array[Int], days: Int): Int = 
{ - val index = util.Arrays.binarySearch(daysArr, days) - val diff = if (index >= 0) diffArr(index) else diffArr(-index - 2) - val rebased = days + diff + var i = daysArr.length - 1 + while (i >= 0 && days < daysArr(i)) { + i -= 1 + } + val rebased = days + diffArr(if (i < 0) 0 else i) rebased } From 65f222e03396e43f5629ac4a53853617980ea9a0 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 29 Mar 2020 23:05:52 +0300 Subject: [PATCH 06/13] Avoid unnecessary changes in DateTimeUtilsSuite --- .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index aff19ad57da2..268a978b0dfd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -38,7 +38,6 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { test("nanoseconds truncation") { val tf = TimestampFormatter.getFractionFormatter(DateTimeUtils.defaultTimeZone.toZoneId) - def checkStringToTimestamp(originalTime: String, expectedParsedTime: String): Unit = { val parsedTimestampOp = DateTimeUtils.stringToTimestamp( UTF8String.fromString(originalTime), defaultZoneId) @@ -456,7 +455,6 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { assert(toJavaTimestamp(fromUTCTime(fromJavaTimestamp(Timestamp.valueOf(utc)), tz)).toString === expected) } - for (tz <- ALL_TIMEZONES) { withDefaultTimeZone(tz) { test("2011-12-25 09:00:00.123456", "UTC", "2011-12-25 09:00:00.123456") @@ -512,10 +510,10 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { test("truncTimestamp") { def testTrunc( - level: Int, - expected: String, - inputTS: SQLTimestamp, - zoneId: 
ZoneId = defaultZoneId): Unit = { + level: Int, + expected: String, + inputTS: SQLTimestamp, + zoneId: ZoneId = defaultZoneId): Unit = { val truncated = DateTimeUtils.truncTimestamp(inputTS, level, zoneId) val expectedTS = toTimestamp(expected, defaultZoneId) @@ -701,7 +699,6 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { val millisLocal = millisUtc + timeZone.getOffset(millisUtc) Math.floor(millisLocal.toDouble / MILLIS_PER_DAY).toInt } - private def fromJavaDateLegacy(date: Date): Int = { millisToDaysLegacy(date.getTime, defaultTimeZone()) } From 89d35fd2d2d85c1a09be95f73583b6120a5d6f40 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 30 Mar 2020 12:24:38 +0300 Subject: [PATCH 07/13] Remove an unused import --- .../scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index b6dabd92b564..5c3352a1c27c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -21,7 +21,6 @@ import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.time._ import java.time.temporal.{ChronoField, ChronoUnit, IsoFields} -import java.util import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit._ From 2b6d25d5d640713a7dd428ae85258bb8c4505609 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 30 Mar 2020 13:15:42 +0300 Subject: [PATCH 08/13] Refactoring `rebaseDays`, and add comments --- .../sql/catalyst/util/DateTimeUtils.scala | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 5c3352a1c27c..670fd2c6a2d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1033,12 +1033,28 @@ object DateTimeUtils { instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant) } - private def rebaseDays(daysArr: Array[Int], diffArr: Array[Int], days: Int): Int = { - var i = daysArr.length - 1 - while (i >= 0 && days < daysArr(i)) { + /** + * Rebases days since the epoch from an original to a target calendar, for instance + * from a hybrid (Julian + Gregorian) to Proleptic Gregorian calendar. + * + * It finds the latest switch day which is not after `days`, and adds the difference + * in days associated with the switch day to the given `days`. The function is based + * on linear search which starts from the most recent switch days. This allows performing + * fewer comparisons for modern dates. + * + * @param switchDays The days when difference in days between original and target + * calendar was changed. + * @param diffs The differences in days between calendars. + * @param days The number of days since the epoch 1970-01-01 to be rebased to the + * target calendar. 
+ * @return The rebased day + */ + private def rebaseDays(switchDays: Array[Int], diffs: Array[Int], days: Int): Int = { + var i = switchDays.length - 1 + while (i >= 0 && days < switchDays(i)) { i -= 1 } - val rebased = days + diffArr(if (i < 0) 0 else i) + val rebased = days + diffs(if (i < 0) 0 else i) rebased } From fd88c5692bbd34fe55066ecfb893ba4e533aa4d1 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 30 Mar 2020 13:30:09 +0300 Subject: [PATCH 09/13] Refactoring and comments --- .../sql/catalyst/util/DateTimeUtils.scala | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 670fd2c6a2d5..e5fd4f74ede6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1058,11 +1058,15 @@ object DateTimeUtils { rebased } - private val julToGregDay = Array( + // The differences in days between Julian and Proleptic Gregorian dates. + // The diff at the index `i` is applicable for all days in the date interval: + // [julianGregDiffSwitchDay(i), julianGregDiffSwitchDay(i+1)) + private val julianGregDiffs = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0) + // The sorted days when difference in days between Julian and Proleptic + // Gregorian calendars was changed. 
+ private val julianGregDiffSwitchDay = Array( -719164, -682945, -646420, -609895, -536845, -500320, -463795, -390745, -354220, -317695, -244645, -208120, -171595, -141427) - private val julToGregDiff = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0) - /** * Converts the given number of days since the epoch day 1970-01-01 to * a local date in Julian calendar, interprets the result as a local @@ -1073,14 +1077,18 @@ object DateTimeUtils { * @return The rebased number of days in Gregorian calendar. */ def rebaseJulianToGregorianDays(days: Int): Int = { - rebaseDays(julToGregDay, julToGregDiff, days) + rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days) } - private val gregToJulDay = Array( + // The differences in days between Proleptic Gregorian and Julian dates. + // The diff at the index `i` is applicable for all days in the date interval: + // [gregJulianDiffSwitchDay(i), gregJulianDiffSwitchDay(i+1)) + private val grepJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0) + // The sorted days when difference in days between Proleptic + // Gregorian and Julian was changed. + private val gregJulianDiffSwitchDay = Array( -719162, -682944, -646420, -609896, -536847, -500323, -463799, -390750, -354226, -317702, -244653, -208129, -171605, -141427) - private val gregToJulDiff = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0) - /** * Rebasing days since the epoch to store the same number of days * as by Spark 2.4 and earlier versions. Spark 3.0 switched to @@ -1098,6 +1106,6 @@ object DateTimeUtils { * @return The rebased number of days since the epoch in Julian calendar. 
*/ def rebaseGregorianToJulianDays(days: Int): Int = { - rebaseDays(gregToJulDay, gregToJulDiff, days) + rebaseDays(gregJulianDiffSwitchDay, grepJulianDiffs, days) } } From 0152a1c690e7024d73e6b307b217154b96974a4a Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 30 Mar 2020 14:32:19 +0000 Subject: [PATCH 10/13] Address Wenchen's review comment --- .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index e5fd4f74ede6..5316bc3fec65 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1062,11 +1062,12 @@ object DateTimeUtils { // The diff at the index `i` is applicable for all days in the date interval: // [julianGregDiffSwitchDay(i), julianGregDiffSwitchDay(i+1)) private val julianGregDiffs = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0) - // The sorted days when difference in days between Julian and Proleptic - // Gregorian calendars was changed. + // The sorted days in Julian calendar when difference in days between Julian and + // Proleptic Gregorian calendars was changed. 
private val julianGregDiffSwitchDay = Array( -719164, -682945, -646420, -609895, -536845, -500320, -463795, -390745, -354220, -317695, -244645, -208120, -171595, -141427) + /** * Converts the given number of days since the epoch day 1970-01-01 to * a local date in Julian calendar, interprets the result as a local @@ -1084,11 +1085,12 @@ object DateTimeUtils { // The diff at the index `i` is applicable for all days in the date interval: // [gregJulianDiffSwitchDay(i), gregJulianDiffSwitchDay(i+1)) private val grepJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0) - // The sorted days when difference in days between Proleptic - // Gregorian and Julian was changed. + // The sorted days in Proleptic Gregorian calendar when difference in days between + // Proleptic Gregorian and Julian was changed. private val gregJulianDiffSwitchDay = Array( -719162, -682944, -646420, -609896, -536847, -500323, -463799, -390750, -354226, -317702, -244653, -208129, -171605, -141427) + /** * Rebasing days since the epoch to store the same number of days * as by Spark 2.4 and earlier versions. 
Spark 3.0 switched to From 08443d90d9bae67e889ef36399de11ed838efae4 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 30 Mar 2020 14:32:42 +0000 Subject: [PATCH 11/13] Re-gen DateTimeBenchmark results on JDK 11 --- .../DateTimeRebaseBenchmark-jdk11-results.txt | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index 52522f8f88c7..4fed51113912 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -2,52 +2,52 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 9272 9272 0 10.8 92.7 1.0X -before 1582, noop 9142 9142 0 10.9 91.4 1.0X -after 1582, rebase off 21841 21841 0 4.6 218.4 0.4X -after 1582, rebase on 58245 58245 0 1.7 582.4 0.2X -before 1582, rebase off 19813 19813 0 5.0 198.1 0.5X -before 1582, rebase on 63737 63737 0 1.6 637.4 0.1X +after 1582, noop 9304 9304 0 10.7 93.0 1.0X +before 1582, noop 9187 9187 0 10.9 91.9 1.0X +after 1582, rebase off 22054 22054 0 4.5 220.5 0.4X +after 1582, rebase on 20361 20361 0 4.9 203.6 0.5X +before 1582, rebase off 20286 20286 0 4.9 202.9 0.5X +before 1582, rebase on 22230 22230 0 4.5 222.3 0.4X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 
11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 13004 13063 67 7.7 130.0 1.0X -after 1582, vec off, rebase on 36224 36253 26 2.8 362.2 0.4X -after 1582, vec on, rebase off 3596 3654 54 27.8 36.0 3.6X -after 1582, vec on, rebase on 26144 26253 112 3.8 261.4 0.5X -before 1582, vec off, rebase off 12872 12914 51 7.8 128.7 1.0X -before 1582, vec off, rebase on 37762 37904 153 2.6 377.6 0.3X -before 1582, vec on, rebase off 3522 3592 94 28.4 35.2 3.7X -before 1582, vec on, rebase on 27580 27615 59 3.6 275.8 0.5X +after 1582, vec off, rebase off 12773 12866 129 7.8 127.7 1.0X +after 1582, vec off, rebase on 13063 13086 39 7.7 130.6 1.0X +after 1582, vec on, rebase off 3678 3719 61 27.2 36.8 3.5X +after 1582, vec on, rebase on 5078 5121 52 19.7 50.8 2.5X +before 1582, vec off, rebase off 12942 12972 42 7.7 129.4 1.0X +before 1582, vec off, rebase on 13866 13904 58 7.2 138.7 0.9X +before 1582, vec on, rebase off 3678 3711 43 27.2 36.8 3.5X +before 1582, vec on, rebase on 5621 5657 44 17.8 56.2 2.3X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 3113 3113 0 32.1 31.1 1.0X -before 1582, noop 3078 3078 0 32.5 30.8 1.0X -after 1582, rebase off 15749 15749 0 6.3 157.5 0.2X -after 1582, rebase on 69106 69106 0 1.4 691.1 0.0X -before 1582, rebase off 15967 15967 0 6.3 159.7 0.2X 
-before 1582, rebase on 76843 76843 0 1.3 768.4 0.0X +after 1582, noop 2983 2983 0 33.5 29.8 1.0X +before 1582, noop 2979 2979 0 33.6 29.8 1.0X +after 1582, rebase off 17452 17452 0 5.7 174.5 0.2X +after 1582, rebase on 70193 70193 0 1.4 701.9 0.0X +before 1582, rebase off 17784 17784 0 5.6 177.8 0.2X +before 1582, rebase on 83498 83498 0 1.2 835.0 0.0X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 15070 15172 94 6.6 150.7 1.0X -after 1582, vec off, rebase on 43748 43867 157 2.3 437.5 0.3X -after 1582, vec on, rebase off 4805 4859 60 20.8 48.1 3.1X -after 1582, vec on, rebase on 33960 34027 61 2.9 339.6 0.4X -before 1582, vec off, rebase off 15037 15071 52 6.7 150.4 1.0X -before 1582, vec off, rebase on 44590 44749 156 2.2 445.9 0.3X -before 1582, vec on, rebase off 4831 4852 30 20.7 48.3 3.1X -before 1582, vec on, rebase on 35460 35481 18 2.8 354.6 0.4X +after 1582, vec off, rebase off 15114 15151 32 6.6 151.1 1.0X +after 1582, vec off, rebase on 45804 45912 126 2.2 458.0 0.3X +after 1582, vec on, rebase off 4900 4947 56 20.4 49.0 3.1X +after 1582, vec on, rebase on 34599 34650 45 2.9 346.0 0.4X +before 1582, vec off, rebase off 15093 15174 70 6.6 150.9 1.0X +before 1582, vec off, rebase on 47367 47472 121 2.1 473.7 0.3X +before 1582, vec on, rebase off 4884 4952 80 20.5 48.8 3.1X +before 1582, vec on, rebase on 35831 35883 59 2.8 358.3 0.4X From db5badb9c9731068e5ce23ed780ddd1612dc7cef Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 30 Mar 2020 15:38:54 +0000 Subject: [PATCH 12/13] Re-gen DateTimeBenchmark results on JDK 8 --- 
.../DateTimeRebaseBenchmark-results.txt | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index c9320cfe660f..ee486276653f 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,52 +2,52 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 9472 9472 0 10.6 94.7 1.0X -before 1582, noop 9226 9226 0 10.8 92.3 1.0X -after 1582, rebase off 21201 21201 0 4.7 212.0 0.4X -after 1582, rebase on 56471 56471 0 1.8 564.7 0.2X -before 1582, rebase off 20179 20179 0 5.0 201.8 0.5X -before 1582, rebase on 65717 65717 0 1.5 657.2 0.1X +after 1582, noop 9582 9582 0 10.4 95.8 1.0X +before 1582, noop 9473 9473 0 10.6 94.7 1.0X +after 1582, rebase off 21431 21431 0 4.7 214.3 0.4X +after 1582, rebase on 22156 22156 0 4.5 221.6 0.4X +before 1582, rebase off 21399 21399 0 4.7 214.0 0.4X +before 1582, rebase on 22927 22927 0 4.4 229.3 0.4X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12294 12434 205 8.1 122.9 1.0X -after 1582, vec off, rebase on 36959 36967 12 2.7 369.6 0.3X -after 1582, vec on, rebase off 3644 3691 49 27.4 36.4 3.4X -after 1582, vec on, rebase on 26764 26852 92 3.7 267.6 0.5X -before 1582, vec off, rebase off 12830 12917 85 7.8 128.3 1.0X -before 1582, vec off, rebase on 38897 39053 229 2.6 389.0 0.3X -before 1582, vec on, rebase off 3638 3693 85 27.5 36.4 3.4X -before 1582, vec on, rebase on 28956 29007 44 3.5 289.6 0.4X +after 1582, vec off, rebase off 12637 12736 111 7.9 126.4 1.0X +after 1582, vec off, rebase on 13463 13531 61 7.4 134.6 0.9X +after 1582, vec on, rebase off 3693 3703 8 27.1 36.9 3.4X +after 1582, vec on, rebase on 5242 5252 9 19.1 52.4 2.4X +before 1582, vec off, rebase off 13055 13169 126 7.7 130.5 1.0X +before 1582, vec off, rebase on 14067 14270 185 7.1 140.7 0.9X +before 1582, vec on, rebase off 3697 3702 7 27.1 37.0 3.4X +before 1582, vec on, rebase on 6058 6097 34 16.5 60.6 2.1X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2952 2952 0 33.9 29.5 1.0X -before 1582, noop 2880 2880 0 34.7 28.8 1.0X -after 1582, rebase off 15928 15928 0 6.3 159.3 0.2X -after 1582, rebase on 82816 82816 0 1.2 828.2 0.0X -before 1582, rebase off 15988 15988 0 6.3 159.9 0.2X -before 1582, rebase on 92636 92636 0 1.1 926.4 0.0X +after 1582, noop 2713 2713 0 36.9 27.1 1.0X +before 1582, noop 2715 2715 0 36.8 27.2 1.0X +after 1582, rebase off 16768 16768 0 6.0 167.7 0.2X 
+after 1582, rebase on 82811 82811 0 1.2 828.1 0.0X +before 1582, rebase off 17052 17052 0 5.9 170.5 0.2X +before 1582, rebase on 95134 95134 0 1.1 951.3 0.0X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 14863 14917 78 6.7 148.6 1.0X -after 1582, vec off, rebase on 54819 54939 140 1.8 548.2 0.3X -after 1582, vec on, rebase off 4905 4941 32 20.4 49.0 3.0X -after 1582, vec on, rebase on 44914 45008 124 2.2 449.1 0.3X -before 1582, vec off, rebase off 14928 14970 48 6.7 149.3 1.0X -before 1582, vec off, rebase on 59752 59996 245 1.7 597.5 0.2X -before 1582, vec on, rebase off 4892 4916 33 20.4 48.9 3.0X -before 1582, vec on, rebase on 46854 46977 198 2.1 468.5 0.3X +after 1582, vec off, rebase off 15200 15321 194 6.6 152.0 1.0X +after 1582, vec off, rebase on 63160 63337 177 1.6 631.6 0.2X +after 1582, vec on, rebase off 4891 4928 43 20.4 48.9 3.1X +after 1582, vec on, rebase on 45474 45484 10 2.2 454.7 0.3X +before 1582, vec off, rebase off 15203 15330 110 6.6 152.0 1.0X +before 1582, vec off, rebase on 65588 65664 73 1.5 655.9 0.2X +before 1582, vec on, rebase off 4844 4916 105 20.6 48.4 3.1X +before 1582, vec on, rebase on 47815 47943 162 2.1 478.2 0.3X From b8fa18ee1968fb8b9aca84daa67f2419c16dca95 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 30 Mar 2020 19:21:26 +0300 Subject: [PATCH 13/13] Add comments --- .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 5316bc3fec65..04994a1b3fbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1064,6 +1064,9 @@ object DateTimeUtils { private val julianGregDiffs = Array(2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, 0) // The sorted days in Julian calendar when difference in days between Julian and // Proleptic Gregorian calendars was changed. + // The starting point is the `0001-01-01` (-719164 days since the epoch in + // Julian calendar). All dates before the starting point have the same difference + // of 2 days in Julian and Proleptic Gregorian calendars. private val julianGregDiffSwitchDay = Array( -719164, -682945, -646420, -609895, -536845, -500320, -463795, -390745, -354220, -317695, -244645, -208120, -171595, -141427) @@ -1087,6 +1090,9 @@ object DateTimeUtils { private val grepJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0) // The sorted days in Proleptic Gregorian calendar when difference in days between // Proleptic Gregorian and Julian was changed. + // The starting point is the `0001-01-01` (-719162 days since the epoch in + // Proleptic Gregorian calendar). All dates before the starting point have the same + // difference of -2 days in Proleptic Gregorian and Julian calendars. private val gregJulianDiffSwitchDay = Array( -719162, -682944, -646420, -609896, -536847, -500323, -463799, -390750, -354226, -317702, -244653, -208129, -171605, -141427)