
Commit 3643565

MaxGekk authored and cloud-fan committed
[SPARK-31710][SQL][FOLLOWUP] Replace CAST by TIMESTAMP_SECONDS in benchmarks
### What changes were proposed in this pull request?

Replace `CAST(... AS TIMESTAMP)` by `TIMESTAMP_SECONDS` in the following benchmarks:

- ExtractBenchmark
- DateTimeBenchmark
- FilterPushdownBenchmark
- InExpressionBenchmark

### Why are the changes needed?

The benchmarks fail w/o the changes:

```
[info] Running benchmark: datetime +/- interval
[info] Running case: date + interval(m)
[error] Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve 'CAST(`id` AS TIMESTAMP)' due to data type mismatch: cannot cast bigint to timestamp,you can enable the casting by setting spark.sql.legacy.allowCastNumericToTimestamp to true,but we strongly recommend using function TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS instead.; line 1 pos 5;
[error] 'Project [(cast(cast(id#0L as timestamp) as date) + 1 months) AS (CAST(CAST(id AS TIMESTAMP) AS DATE) + INTERVAL '1 months')#2]
[error] +- Range (0, 10000000, step=1, splits=Some(1))
```

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

By running the affected benchmarks.

Closes apache#28843 from MaxGekk/GuoPhilipse-31710-fix-compatibility-followup.

Authored-by: Max Gekk <[email protected]>

Signed-off-by: Wenchen Fan <[email protected]>
1 parent 6e9ff72 commit 3643565
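
For context, the substitution is mechanical: wherever a benchmark turned the `id` column of `spark.range` into a timestamp via a cast, it now calls `timestamp_seconds`, which interprets its numeric argument as seconds since the Unix epoch. A minimal spark-shell sketch of the difference (assuming a build where the legacy numeric-to-timestamp cast is disabled, as in the error above):

```scala
import org.apache.spark.sql.functions.{col, timestamp_seconds}

val df = spark.range(3) // a single bigint column named `id`

// Fails at analysis time unless spark.sql.legacy.allowCastNumericToTimestamp=true:
// df.selectExpr("cast(id as timestamp)")

// The replacement: `id` is interpreted as seconds since 1970-01-01 00:00:00 UTC.
df.select(timestamp_seconds(col("id"))).show()
df.selectExpr("timestamp_seconds(id)").show() // the SQL spelling used in the benchmarks
```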

4 files changed: 23 additions & 23 deletions

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala

Lines changed: 9 additions & 9 deletions

```diff
@@ -54,7 +54,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
 
   private def run(cardinality: Int, func: String): Unit = {
     codegenBenchmark(s"$func of timestamp", cardinality) {
-      doBenchmark(cardinality, s"$func(cast(id as timestamp))")
+      doBenchmark(cardinality, s"$func(timestamp_seconds(id))")
     }
   }
 
@@ -64,7 +64,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
     val N = 10000000
     runBenchmark("datetime +/- interval") {
       val benchmark = new Benchmark("datetime +/- interval", N, output = output)
-      val ts = "cast(id as timestamp)"
+      val ts = "timestamp_seconds(id)"
       val dt = s"cast($ts as date)"
       benchmark.addCase("date + interval(m)") { _ =>
         doBenchmark(N, s"$dt + interval 1 month")
@@ -105,7 +105,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
       benchmark.run()
     }
     runBenchmark("Extract components") {
-      run(N, "cast to timestamp", "cast(id as timestamp)")
+      run(N, "cast to timestamp", "timestamp_seconds(id)")
       run(N, "year")
       run(N, "quarter")
       run(N, "month")
@@ -124,7 +124,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
       run(N, "current_timestamp", "current_timestamp")
     }
     runBenchmark("Date arithmetic") {
-      val dateExpr = "cast(cast(id as timestamp) as date)"
+      val dateExpr = "cast(timestamp_seconds(id) as date)"
       run(N, "cast to date", dateExpr)
       run(N, "last_day", s"last_day($dateExpr)")
       run(N, "next_day", s"next_day($dateExpr, 'TU')")
@@ -133,31 +133,31 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
       run(N, "add_months", s"add_months($dateExpr, 10)")
     }
     runBenchmark("Formatting dates") {
-      val dateExpr = "cast(cast(id as timestamp) as date)"
+      val dateExpr = "cast(timestamp_seconds(id) as date)"
       run(N, "format date", s"date_format($dateExpr, 'MMM yyyy')")
     }
     runBenchmark("Formatting timestamps") {
       run(N, "from_unixtime", "from_unixtime(id, 'yyyy-MM-dd HH:mm:ss.SSSSSS')")
     }
     runBenchmark("Convert timestamps") {
-      val timestampExpr = "cast(id as timestamp)"
+      val timestampExpr = "timestamp_seconds(id)"
       run(N, "from_utc_timestamp", s"from_utc_timestamp($timestampExpr, 'CET')")
       run(N, "to_utc_timestamp", s"to_utc_timestamp($timestampExpr, 'CET')")
     }
     runBenchmark("Intervals") {
-      val (start, end) = ("cast(id as timestamp)", "cast((id+8640000) as timestamp)")
+      val (start, end) = ("timestamp_seconds(id)", "timestamp_seconds(id+8640000)")
       run(N, "cast interval", start, end)
       run(N, "datediff", s"datediff($start, $end)")
       run(N, "months_between", s"months_between($start, $end)")
       run(1000000, "window", s"window($start, 100, 10, 1)")
     }
     runBenchmark("Truncation") {
-      val timestampExpr = "cast(id as timestamp)"
+      val timestampExpr = "timestamp_seconds(id)"
       Seq("YEAR", "YYYY", "YY", "MON", "MONTH", "MM", "DAY", "DD", "HOUR", "MINUTE",
         "SECOND", "WEEK", "QUARTER").foreach { level =>
         run(N, s"date_trunc $level", s"date_trunc('$level', $timestampExpr)")
       }
-      val dateExpr = "cast(cast(id as timestamp) as date)"
+      val dateExpr = "cast(timestamp_seconds(id) as date)"
      Seq("year", "yyyy", "yy", "mon", "month", "mm").foreach { level =>
        run(N, s"trunc $level", s"trunc('$level', $dateExpr)")
      }
```
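
One replacement worth a second look is the `Intervals` case: `cast((id+8640000) as timestamp)` becomes `timestamp_seconds(id+8640000)`. The two agree because both read the number as epoch seconds; a quick sanity check (hypothetical spark-shell session, assuming a UTC session time zone):

```scala
spark.conf.set("spark.sql.session.timeZone", "UTC")
// 8640000 = 100 * 86400, i.e. exactly 100 days of seconds,
// so this should print 1970-04-11 00:00:00.
spark.sql("SELECT timestamp_seconds(8640000)").show(false)
```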

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala

Lines changed: 4 additions & 4 deletions

```diff
@@ -59,10 +59,10 @@ object ExtractBenchmark extends SqlBasedBenchmark {
   }
 
   private def castExpr(from: String): String = from match {
-    case "timestamp" => "cast(id as timestamp)"
-    case "date" => "cast(cast(id as timestamp) as date)"
-    case "interval" => "(cast(cast(id as timestamp) as date) - date'0001-01-01') + " +
-      "(cast(id as timestamp) - timestamp'1000-01-01 01:02:03.123456')"
+    case "timestamp" => "timestamp_seconds(id)"
+    case "date" => "cast(timestamp_seconds(id) as date)"
+    case "interval" => "(cast(timestamp_seconds(id) as date) - date'0001-01-01') + " +
+      "(timestamp_seconds(id) - timestamp'1000-01-01 01:02:03.123456')"
     case other => throw new IllegalArgumentException(
       s"Unsupported column type $other. Valid column types are 'timestamp' and 'date'")
   }
```
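
The `interval` branch builds an interval-valued expression by subtracting typed literals: `date - date` and `timestamp - timestamp` each yield an interval in Spark 3.x, and the two terms are then added. A hedged illustration of that shape, with constants substituted for the `id` column:

```scala
// Same structure as castExpr("interval"), on literals instead of id:
spark.sql(
  """SELECT (date'1970-01-02' - date'0001-01-01') +
    |       (timestamp_seconds(0) - timestamp'1000-01-01 01:02:03.123456')
    |""".stripMargin).show(false)
```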

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala

Lines changed: 5 additions & 5 deletions

```diff
@@ -24,7 +24,7 @@ import scala.util.Random
 import org.apache.spark.SparkConf
 import org.apache.spark.benchmark.Benchmark
 import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.functions.monotonically_increasing_id
+import org.apache.spark.sql.functions.{monotonically_increasing_id, timestamp_seconds}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
 import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType, TimestampType}
@@ -332,11 +332,11 @@ object FilterPushdownBenchmark extends SqlBasedBenchmark {
     withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> fileType) {
       val columns = (1 to width).map(i => s"CAST(id AS string) c$i")
       val df = spark.range(numRows).selectExpr(columns: _*)
-        .withColumn("value", monotonically_increasing_id().cast(TimestampType))
+        .withColumn("value", timestamp_seconds(monotonically_increasing_id()))
       withTempTable("orcTable", "parquetTable") {
         saveAsTable(df, dir)
 
-        Seq(s"value = CAST($mid AS timestamp)").foreach { whereExpr =>
+        Seq(s"value = timestamp_seconds($mid)").foreach { whereExpr =>
           val title = s"Select 1 timestamp stored as $fileType row ($whereExpr)"
             .replace("value AND value", "value")
           filterPushDownBenchmark(numRows, title, whereExpr)
@@ -348,8 +348,8 @@
         filterPushDownBenchmark(
           numRows,
           s"Select $percent% timestamp stored as $fileType rows " +
-            s"(value < CAST(${numRows * percent / 100} AS timestamp))",
-          s"value < CAST(${numRows * percent / 100} as timestamp)",
+            s"(value < timestamp_seconds(${numRows * percent / 100}))",
+          s"value < timestamp_seconds(${numRows * percent / 100})",
           selectExpr
         )
       }
```
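
The DataFrame-side change mirrors the SQL one: the `Column` cast `.cast(TimestampType)` is replaced by the `timestamp_seconds` function newly imported from `org.apache.spark.sql.functions`. A small sketch of the changed column definition, with the surrounding benchmark scaffolding omitted:

```scala
import org.apache.spark.sql.functions.{monotonically_increasing_id, timestamp_seconds}

// `value` is still TimestampType, but is now built without the
// numeric-to-timestamp cast that fails analysis by default:
val df = spark.range(10)
  .withColumn("value", timestamp_seconds(monotonically_increasing_id()))
df.printSchema()
```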

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala

Lines changed: 5 additions & 5 deletions

```diff
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.benchmark
 
 import org.apache.spark.benchmark.Benchmark
 import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.{array, struct}
+import org.apache.spark.sql.functions.{array, struct, timestamp_seconds}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
@@ -128,15 +128,15 @@ object InExpressionBenchmark extends SqlBasedBenchmark {
 
   private def runTimestampBenchmark(numItems: Int, numRows: Long, minNumIters: Int): Unit = {
     val name = s"$numItems timestamps"
-    val values = (1 to numItems).map(m => s"CAST('1970-01-01 01:00:00.$m' AS timestamp)")
-    val df = spark.range(0, numRows).select($"id".cast(TimestampType))
+    val values = (1 to numItems).map(m => s"timestamp'1970-01-01 01:00:00.$m'")
+    val df = spark.range(0, numRows).select(timestamp_seconds($"id").as("id"))
     runBenchmark(name, df, values, numRows, minNumIters)
   }
 
   private def runDateBenchmark(numItems: Int, numRows: Long, minNumIters: Int): Unit = {
     val name = s"$numItems dates"
-    val values = (1 to numItems).map(n => 1970 + n).map(y => s"CAST('$y-01-01' AS date)")
-    val df = spark.range(0, numRows).select($"id".cast(TimestampType).cast(DateType))
+    val values = (1 to numItems).map(n => 1970 + n).map(y => s"date'$y-01-01'")
+    val df = spark.range(0, numRows).select(timestamp_seconds($"id").cast(DateType).as("id"))
     runBenchmark(name, df, values, numRows, minNumIters)
   }
 
```