From 0c90de182f99c0d462c5579877cba4c2a042dede Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 29 May 2024 11:45:48 +0200 Subject: [PATCH 1/3] Initial commit --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 09ec501311ade..ac23962f41ed3 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2240,8 +2240,8 @@ case class Levenshtein( } override def inputTypes: Seq[AbstractDataType] = threshold match { - case Some(_) => Seq(StringType, StringType, IntegerType) - case _ => Seq(StringType, StringType) + case Some(_) => Seq(StringTypeAnyCollation, StringTypeAnyCollation, IntegerType) + case _ => Seq(StringTypeAnyCollation, StringTypeAnyCollation) } override def children: Seq[Expression] = threshold match { From 11008bb6610651d154e09a5936d0fa2554d0aff9 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 10 Jun 2024 08:11:41 +0200 Subject: [PATCH 2/3] Tests --- .../sql/CollationStringExpressionsSuite.scala | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index db02946e3dfe5..d51b6f598ff64 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -30,6 +30,8 @@ class CollationStringExpressionsSuite with SharedSparkSession with ExpressionEvalHelper { + private val testSuppCollations = Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI") + test("Support ConcatWs string expression with collation") { // Supported collations case class ConcatWsTestCase[R](s: String, a: Array[String], c: String, result: R) @@ -645,6 +647,28 @@ class CollationStringExpressionsSuite }) } + test("Levenshtein string expression with collation") { + // Supported collations + case class LevenshteinTestCase( + left: String, right: String, collationName: String, threshold: Option[Int], result: Int + ) + val testCases = Seq( + LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY", None, result = 4), + LevenshteinTestCase("kitten", "sitTing", "UTF8_BINARY_LCASE", None, result = 4), + LevenshteinTestCase("kitten", "sitTing", "UNICODE", Some(3), result = -1), + LevenshteinTestCase("kitten", "sitTing", "UNICODE_CI", Some(3), result = -1) + ) + testCases.foreach(t => { + withSQLConf(SQLConf.DEFAULT_COLLATION.key -> t.collationName) { + val th = if (t.threshold.isDefined) s", ${t.threshold.get}" else "" + val query = s"select levenshtein('${t.left}', '${t.right}'$th)" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) + } + }) + } + test("Support Left/Right/Substr with collation") { case class SubstringTestCase( method: String, From 5d4d06ab15fe4923a373db1170e4c203b7a7fb83 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 10 Jun 2024 08:19:33 +0200 Subject: [PATCH 3/3] Update CollationStringExpressionsSuite.scala --- .../org/apache/spark/sql/CollationStringExpressionsSuite.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index d51b6f598ff64..31be149b9c9cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -30,8 +30,6 @@ class CollationStringExpressionsSuite with SharedSparkSession with ExpressionEvalHelper { - private val testSuppCollations = Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI") - test("Support ConcatWs string expression with collation") { // Supported collations case class ConcatWsTestCase[R](s: String, a: Array[String], c: String, result: R)