Skip to content

Commit 5f9a7fe

Browse files
AngersZhuuuu authored and cloud-fan committed
[SPARK-33428][SQL] Conv UDF use BigInt to avoid Long value overflow
### What changes were proposed in this pull request? Using a Long value to store the encoded value will overflow and return an unexpected result; use BigInt to replace the Long value and make the logic simpler. ### Why are the changes needed? Fix a value overflow issue ### Does this PR introduce _any_ user-facing change? People can use the `conv` function to convert values bigger than LONG.MAX_VALUE ### How was this patch tested? Added UT #### BenchMark ``` /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.spark.sql.execution.benchmark import scala.util.Random import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.functions._ object ConvFuncBenchMark extends SqlBasedBenchmark { val charset = Array[String]("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z") def constructString(from: Int, length: Int): String = { val chars = charset.slice(0, from) (0 to length).map(x => { val v = Random.nextInt(from) chars(v) }).mkString("") } private def doBenchmark(cardinality: Long, length: Int, from: Int, toBase: Int): Unit = { spark.range(cardinality) .withColumn("str", lit(constructString(from, length))) .select(conv(col("str"), from, toBase)) .noop() } /** * Main process of the whole benchmark. * Implementations of this method are supposed to use the wrapper method `runBenchmark` * for each benchmark scenario. */ override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { val N = 1000000L val benchmark = new Benchmark("conv", N, output = output) benchmark.addCase("length 10 from 2 to 16") { _ => doBenchmark(N, 10, 2, 16) } benchmark.addCase("length 10 from 2 to 10") { _ => doBenchmark(N, 10, 2, 10) } benchmark.addCase("length 10 from 10 to 16") { _ => doBenchmark(N, 10, 10, 16) } benchmark.addCase("length 10 from 10 to 36") { _ => doBenchmark(N, 10, 10, 36) } benchmark.addCase("length 10 from 16 to 10") { _ => doBenchmark(N, 10, 10, 10) } benchmark.addCase("length 10 from 16 to 36") { _ => doBenchmark(N, 10, 16, 36) } benchmark.addCase("length 10 from 36 to 10") { _ => doBenchmark(N, 10, 36, 10) } benchmark.addCase("length 10 from 36 to 16") { _ => doBenchmark(N, 10, 36, 16) } // benchmark.addCase("length 20 from 10 to 16") { _ => doBenchmark(N, 20, 10, 16) } benchmark.addCase("length 20 from 10 to 36") { _ => doBenchmark(N, 20, 10, 36) } benchmark.addCase("length 30 from 10 to 16") { _ => doBenchmark(N, 30, 
10, 16) } benchmark.addCase("length 30 from 10 to 36") { _ => doBenchmark(N, 30, 10, 36) } // benchmark.addCase("length 20 from 16 to 10") { _ => doBenchmark(N, 20, 16, 10) } benchmark.addCase("length 20 from 16 to 36") { _ => doBenchmark(N, 20, 16, 36) } benchmark.addCase("length 30 from 16 to 10") { _ => doBenchmark(N, 30, 16, 10) } benchmark.addCase("length 30 from 16 to 36") { _ => doBenchmark(N, 30, 16, 36) } benchmark.run() } } ``` Result with patch : ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_191-b12 on Mac OS X 10.14.6 Intel(R) Core(TM) i5-8259U CPU 2.30GHz conv: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ length 10 from 2 to 16 54 73 18 18.7 53.6 1.0X length 10 from 2 to 10 43 47 5 23.5 42.5 1.3X length 10 from 10 to 16 39 47 12 25.5 39.2 1.4X length 10 from 10 to 36 38 42 3 26.5 37.7 1.4X length 10 from 16 to 10 39 41 3 25.7 38.9 1.4X length 10 from 16 to 36 36 41 4 27.6 36.3 1.5X length 10 from 36 to 10 38 40 2 26.3 38.0 1.4X length 10 from 36 to 16 37 39 2 26.8 37.2 1.4X length 20 from 10 to 16 36 39 2 27.4 36.5 1.5X length 20 from 10 to 36 37 39 2 27.2 36.8 1.5X length 30 from 10 to 16 37 39 2 27.0 37.0 1.4X length 30 from 10 to 36 36 38 2 27.5 36.3 1.5X length 20 from 16 to 10 35 38 2 28.3 35.4 1.5X length 20 from 16 to 36 34 38 3 29.2 34.3 1.6X length 30 from 16 to 10 38 40 2 26.3 38.1 1.4X length 30 from 16 to 36 37 38 1 27.2 36.8 1.5X ``` Result without patch: ``` Java HotSpot(TM) 64-Bit Server VM 1.8.0_191-b12 on Mac OS X 10.14.6 Intel(R) Core(TM) i5-8259U CPU 2.30GHz conv: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ length 10 from 2 to 16 66 101 29 15.1 66.1 1.0X length 10 from 2 to 10 50 55 5 20.2 49.5 1.3X length 10 from 10 to 16 46 51 5 21.8 45.9 1.4X length 
10 from 10 to 36 43 48 4 23.4 42.7 1.5X length 10 from 16 to 10 44 47 4 22.9 43.7 1.5X length 10 from 16 to 36 40 44 2 24.7 40.5 1.6X length 10 from 36 to 10 40 44 4 25.0 40.1 1.6X length 10 from 36 to 16 41 43 2 24.3 41.2 1.6X length 20 from 10 to 16 39 41 2 25.7 38.9 1.7X length 20 from 10 to 36 40 42 2 24.9 40.2 1.6X length 30 from 10 to 16 39 40 1 25.9 38.6 1.7X length 30 from 10 to 36 40 41 1 25.0 40.0 1.7X length 20 from 16 to 10 40 41 1 25.1 39.8 1.7X length 20 from 16 to 36 40 42 2 25.2 39.7 1.7X length 30 from 16 to 10 39 42 2 25.6 39.0 1.7X length 30 from 16 to 36 39 40 2 25.7 38.8 1.7X ``` Closes apache#30350 from AngersZhuuuu/SPARK-33428. Authored-by: angerszhu <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent bf2c88c commit 5f9a7fe

File tree

5 files changed

+23
-57
lines changed

5 files changed

+23
-57
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala

Lines changed: 13 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -21,64 +21,37 @@ import org.apache.spark.unsafe.types.UTF8String
2121

2222
object NumberConverter {
2323

24-
/**
25-
* Divide x by m as if x is an unsigned 64-bit integer. Examples:
26-
* unsignedLongDiv(-1, 2) == Long.MAX_VALUE unsignedLongDiv(6, 3) == 2
27-
* unsignedLongDiv(0, 5) == 0
28-
*
29-
* @param x is treated as unsigned
30-
* @param m is treated as signed
31-
*/
32-
private def unsignedLongDiv(x: Long, m: Int): Long = {
33-
if (x >= 0) {
34-
x / m
35-
} else {
36-
// Let uval be the value of the unsigned long with the same bits as x
37-
// Two's complement => x = uval - 2*MAX - 2
38-
// => uval = x + 2*MAX + 2
39-
// Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c
40-
x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m
41-
}
42-
}
43-
4424
/**
4525
* Decode v into value[].
4626
*
47-
* @param v is treated as an unsigned 64-bit integer
27+
* @param v is treated as an BigInt
4828
* @param radix must be between MIN_RADIX and MAX_RADIX
4929
*/
50-
private def decode(v: Long, radix: Int, value: Array[Byte]): Unit = {
30+
private def decode(v: BigInt, radix: Int, value: Array[Byte]): Unit = {
5131
var tmpV = v
5232
java.util.Arrays.fill(value, 0.asInstanceOf[Byte])
5333
var i = value.length - 1
5434
while (tmpV != 0) {
55-
val q = unsignedLongDiv(tmpV, radix)
56-
value(i) = (tmpV - q * radix).asInstanceOf[Byte]
35+
val q = tmpV / radix
36+
value(i) = (tmpV - q * radix).byteValue
5737
tmpV = q
5838
i -= 1
5939
}
6040
}
6141

6242
/**
63-
* Convert value[] into a long. On overflow, return -1 (as mySQL does). If a
64-
* negative digit is found, ignore the suffix starting there.
43+
* Convert value[] into a BigInt. If a negative digit is found,
44+
* ignore the suffix starting there.
6545
*
6646
* @param radix must be between MIN_RADIX and MAX_RADIX
6747
* @param fromPos is the first element that should be considered
6848
* @return the result should be treated as an unsigned 64-bit integer.
6949
*/
70-
private def encode(radix: Int, fromPos: Int, value: Array[Byte]): Long = {
71-
var v: Long = 0L
72-
val bound = unsignedLongDiv(-1 - radix, radix) // Possible overflow once
50+
private def encode(radix: Int, fromPos: Int, value: Array[Byte]): BigInt = {
51+
var v: BigInt = BigInt(0)
7352
var i = fromPos
7453
while (i < value.length && value(i) >= 0) {
75-
if (v >= bound) {
76-
// Check for overflow
77-
if (unsignedLongDiv(-1 - value(i), radix) < v) {
78-
return -1
79-
}
80-
}
81-
v = v * radix + value(i)
54+
v = (v * radix) + BigInt(value(i))
8255
i += 1
8356
}
8457
v
@@ -129,7 +102,7 @@ object NumberConverter {
129102
return null
130103
}
131104

132-
var (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)
105+
val (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)
133106

134107
// Copy the digits in the right side of the array
135108
val temp = new Array[Byte](64)
@@ -140,19 +113,8 @@ object NumberConverter {
140113
}
141114
char2byte(fromBase, temp.length - n.length + first, temp)
142115

143-
// Do the conversion by going through a 64 bit integer
144-
var v = encode(fromBase, temp.length - n.length + first, temp)
145-
if (negative && toBase > 0) {
146-
if (v < 0) {
147-
v = -1
148-
} else {
149-
v = -v
150-
}
151-
}
152-
if (toBase < 0 && v < 0) {
153-
v = -v
154-
negative = true
155-
}
116+
// Do the conversion by going through a BigInt
117+
val v: BigInt = encode(fromBase, temp.length - n.length + first, temp)
156118
decode(v, Math.abs(toBase), temp)
157119

158120
// Find the first non-zero digit or the last digits if all are zero.
@@ -163,7 +125,7 @@ object NumberConverter {
163125
byte2char(Math.abs(toBase), firstNonZeroPos, temp)
164126

165127
var resultStartPos = firstNonZeroPos
166-
if (negative && toBase < 0) {
128+
if (negative) {
167129
resultStartPos = firstNonZeroPos - 1
168130
temp(resultStartPos) = '-'
169131
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathExpressionsSuite.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ class MathExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
158158
test("conv") {
159159
checkEvaluation(Conv(Literal("3"), Literal(10), Literal(2)), "11")
160160
checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(-16)), "-F")
161-
checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(16)), "FFFFFFFFFFFFFFF1")
161+
checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(16)), "-F")
162162
checkEvaluation(Conv(Literal("big"), Literal(36), Literal(16)), "3A48")
163163
checkEvaluation(Conv(Literal.create(null, StringType), Literal(36), Literal(16)), null)
164164
checkEvaluation(Conv(Literal("3"), Literal.create(null, IntegerType), Literal(16)), null)
@@ -168,10 +168,12 @@ class MathExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
168168
checkEvaluation(
169169
Conv(Literal(""), Literal(10), Literal(16)), null)
170170
checkEvaluation(
171-
Conv(Literal("9223372036854775807"), Literal(36), Literal(16)), "FFFFFFFFFFFFFFFF")
171+
Conv(Literal("9223372036854775807"), Literal(36), Literal(16)), "12DDAC15F246BAF8C0D551AC7")
172172
// If there is an invalid digit in the number, the longest valid prefix should be converted.
173173
checkEvaluation(
174174
Conv(Literal("11abc"), Literal(10), Literal(16)), "B")
175+
checkEvaluation(Conv(Literal("c8dcdfb41711fc9a1f17928001d7fd61"), Literal(16), Literal(10)),
176+
"266992441711411603393340504520074460513")
175177
}
176178

177179
test("e") {

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ class NumberConverterSuite extends SparkFunSuite {
3434
test("convert") {
3535
checkConv("3", 10, 2, "11")
3636
checkConv("-15", 10, -16, "-F")
37-
checkConv("-15", 10, 16, "FFFFFFFFFFFFFFF1")
37+
checkConv("-15", 10, 16, "-F")
3838
checkConv("big", 36, 16, "3A48")
39-
checkConv("9223372036854775807", 36, 16, "FFFFFFFFFFFFFFFF")
39+
checkConv("9223372036854775807", 36, 16, "12DDAC15F246BAF8C0D551AC7")
4040
checkConv("11abc", 10, 16, "B")
4141
}
4242

sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession {
200200
checkAnswer(df.selectExpr("""conv("100", 2, 10)"""), Row("4"))
201201
checkAnswer(df.selectExpr("""conv("-10", 16, -10)"""), Row("-16"))
202202
checkAnswer(
203-
df.selectExpr("""conv("9223372036854775807", 36, -16)"""), Row("-1")) // for overflow
203+
df.selectExpr("""conv("9223372036854775807", 36, -16)"""), Row("12DDAC15F246BAF8C0D551AC7"))
204204
}
205205

206206
test("floor") {

sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,9 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
525525
"udf_xpath_short",
526526
"udf_xpath_string",
527527

528+
// [SPARK-33428][SQL] CONV UDF use BigInt to avoid Long value overflow
529+
"udf_conv",
530+
528531
// These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
529532
"alter_rename_partition1",
530533
"date_1",
@@ -1003,7 +1006,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
10031006
"udf_concat_insert1",
10041007
"udf_concat_insert2",
10051008
"udf_concat_ws",
1006-
"udf_conv",
10071009
"udf_cos",
10081010
"udf_count",
10091011
"udf_date_add",

0 commit comments

Comments
 (0)