From de8dafb3b4e6610a4c3adee6bc8549cdcfe55d51 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Sep 2019 14:13:16 +0500 Subject: [PATCH 1/8] Optimize date_format for foldable fmt --- .../expressions/datetimeExpressions.scala | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 9d43701f03056..54dd7fb73969f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -589,22 +589,43 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType) - override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = + var formatter: Option[TimestampFormatter] = None + + override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { + if (formatter.isEmpty && right.foldable) { + val format = right.eval().toString + formatter = Some(TimestampFormatter( + format, + DateTimeUtils.getZoneId(timeZoneId), + Locale.US)) + } copy(timeZoneId = Option(timeZoneId)) + } override protected def nullSafeEval(timestamp: Any, format: Any): Any = { - val df = TimestampFormatter(format.toString, zoneId) - UTF8String.fromString(df.format(timestamp.asInstanceOf[Long])) + val tf = if (formatter.isEmpty) { + TimestampFormatter(format.toString, zoneId, Locale.US) + } else { + formatter.get + } + UTF8String.fromString(tf.format(timestamp.asInstanceOf[Long])) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tf = TimestampFormatter.getClass.getName.stripSuffix("$") - val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) - val locale = ctx.addReferenceObj("locale", Locale.US) - defineCodeGen(ctx, ev, (timestamp, format) => { - s"""UTF8String.fromString($tf$$.MODULE$$.apply($format.toString(), $zid, $locale) + formatter.map { tf => + val timestampFormatter = ctx.addReferenceObj("timestampFormatter", tf) + defineCodeGen(ctx, ev, (timestamp, _) => { + s"""UTF8String.fromString($timestampFormatter.format($timestamp))""" + }) + }.getOrElse { + val tf = TimestampFormatter.getClass.getName.stripSuffix("$") + val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) + val locale = ctx.addReferenceObj("locale", Locale.US) + defineCodeGen(ctx, ev, (timestamp, format) => { + s"""UTF8String.fromString($tf$$.MODULE$$.apply($format.toString(), $zid, $locale) .format($timestamp))""" - }) + }) + } } override def prettyName: String = "date_format" From 5ab324fbd7d54cc1e5fbd89a173478f7280143a1 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Sep 2019 14:33:38 +0500 Subject: [PATCH 2/8] Re-gen results of DateTimeBenchmark --- sql/core/benchmarks/DateTimeBenchmark-results.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index 1a58b05a2abba..1dc040247282f 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -168,8 +168,8 @@ Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz format date: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -format date wholestage off 7180 / 7181 1.4 718.0 1.0X -format date wholestage on 7051 / 7194 1.4 705.1 1.0X +format date wholestage off 6642 / 6666 1.4 664.2 1.0X +format date wholestage on 6556 / 6565 1.5 655.6 1.0X ================================================================================================ From 85a290e632195f8d7639a9f20eb0a3e1212aa28f Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Sep 2019 21:05:17 +0500 Subject: [PATCH 3/8] Bug fix: copy formatter --- .../expressions/datetimeExpressions.scala | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 54dd7fb73969f..43813d462dd16 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -580,7 +580,11 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa """, since = "1.5.0") // scalastyle:on line.size.limit -case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Option[String] = None) +case class DateFormatClass( + left: Expression, + right: Expression, + timeZoneId: Option[String] = None, + formatter: Option[TimestampFormatter] = None) extends BinaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes { def this(left: Expression, right: Expression) = this(left, right, None) @@ -589,17 +593,15 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType) - var formatter: Option[TimestampFormatter] = None - override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { - if (formatter.isEmpty && right.foldable) { + val tf = if (formatter.isEmpty && right.foldable) { val format = right.eval().toString - formatter = Some(TimestampFormatter( + Some(TimestampFormatter( format, DateTimeUtils.getZoneId(timeZoneId), Locale.US)) - } - copy(timeZoneId = Option(timeZoneId)) + } else None + copy(formatter = tf, timeZoneId = Option(timeZoneId)) } override protected def nullSafeEval(timestamp: Any, format: Any): Any = { From ee2a7d24b75126485b5e2659e4162aba98b8ed89 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Sep 2019 21:07:09 +0500 Subject: [PATCH 4/8] Re-gen results of DateTimeBenchmark --- sql/core/benchmarks/DateTimeBenchmark-results.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index 1dc040247282f..7d562544dd498 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -168,8 +168,8 @@ Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz format date: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -format date wholestage off 6642 / 6666 1.4 664.2 1.0X -format date wholestage on 6556 / 6565 1.5 655.6 1.0X +format date wholestage off 4787 / 4839 2.1 478.7 1.0X +format date wholestage on 4736 / 4802 2.1 473.6 1.0X ================================================================================================ From 3292accbe06294c5b69a144ec6cab828b12a7a4a Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 14 Sep 2019 09:52:29 +0500 Subject: [PATCH 5/8] Use default locale --- .../sql/catalyst/expressions/datetimeExpressions.scala | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 43813d462dd16..9379bc6779f3a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -596,17 +596,14 @@ case class DateFormatClass( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { val tf = if (formatter.isEmpty && right.foldable) { val format = right.eval().toString - Some(TimestampFormatter( - format, - DateTimeUtils.getZoneId(timeZoneId), - Locale.US)) + Some(TimestampFormatter(format, DateTimeUtils.getZoneId(timeZoneId))) } else None copy(formatter = tf, timeZoneId = Option(timeZoneId)) } override protected def nullSafeEval(timestamp: Any, format: Any): Any = { val tf = if (formatter.isEmpty) { - TimestampFormatter(format.toString, zoneId, Locale.US) + TimestampFormatter(format.toString, zoneId) } else { formatter.get } @@ -622,9 +619,8 @@ case class DateFormatClass( }.getOrElse { val tf = TimestampFormatter.getClass.getName.stripSuffix("$") val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) - val locale = ctx.addReferenceObj("locale", Locale.US) defineCodeGen(ctx, ev, (timestamp, format) => { - s"""UTF8String.fromString($tf$$.MODULE$$.apply($format.toString(), $zid, $locale) + s"""UTF8String.fromString($tf$$.MODULE$$.apply($format.toString(), $zid) .format($timestamp))""" }) } From 3b5a8c2ff4bd0516e7759619fa8b4f92be840d9c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 14 Sep 2019 10:28:59 +0500 Subject: [PATCH 6/8] Skip DateTimeUtils --- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 9379bc6779f3a..295cc83ba086a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -596,7 +596,7 @@ case class DateFormatClass( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { val tf = if (formatter.isEmpty && right.foldable) { val format = right.eval().toString - Some(TimestampFormatter(format, DateTimeUtils.getZoneId(timeZoneId))) + Some(TimestampFormatter(format, getZoneId(timeZoneId))) } else None copy(formatter = tf, timeZoneId = Option(timeZoneId)) } From 0242852793ba23d603c9ca0c1a18a61648abbc1c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 16 Sep 2019 09:44:26 +0500 Subject: [PATCH 7/8] Use @transient private lazy val formatter --- .../expressions/datetimeExpressions.scala | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 295cc83ba086a..a2ae0921c1552 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -580,11 +580,7 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa """, since = "1.5.0") // scalastyle:on line.size.limit -case class DateFormatClass( - left: Expression, - right: Expression, - timeZoneId: Option[String] = None, - formatter: Option[TimestampFormatter] = None) +case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Option[String] = None) extends BinaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes { def this(left: Expression, right: Expression) = this(left, right, None) @@ -593,12 +589,14 @@ case class DateFormatClass( override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType) - override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = { - val tf = if (formatter.isEmpty && right.foldable) { + override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = + copy(timeZoneId = Option(timeZoneId)) + + @transient private lazy val formatter: Option[TimestampFormatter] = { + if (right.foldable) { val format = right.eval().toString - Some(TimestampFormatter(format, getZoneId(timeZoneId))) + Some(TimestampFormatter(format, zoneId)) } else None - copy(formatter = tf, timeZoneId = Option(timeZoneId)) } override protected def nullSafeEval(timestamp: Any, format: Any): Any = { From a9fd8e43d6ec60819fd85134bc3e11e9d4cc0357 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 16 Sep 2019 10:05:30 +0500 Subject: [PATCH 8/8] Handle null from right.eval() --- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index a2ae0921c1552..f3a716404cef6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -594,8 +594,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti @transient private lazy val formatter: Option[TimestampFormatter] = { if (right.foldable) { - val format = right.eval().toString - Some(TimestampFormatter(format, zoneId)) + Option(right.eval()).map(format => TimestampFormatter(format.toString, zoneId)) } else None }