From 0209fc4f60f1e14ef962e76ea352c27701f2ae3f Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 22:44:42 +0200 Subject: [PATCH 01/14] Initial commit --- .../sql/catalyst/util/CollationSupport.java | 48 ++++++++++++++----- .../expressions/stringExpressions.scala | 4 +- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bea3dc08b4489..42bbab65cda0a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -534,8 +534,10 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString); + } else { + return execLowercase(srcString); // TODO: ICU implementation } } public static UTF8String exec( @@ -545,8 +547,10 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); + } else { + return execLowercase(srcString, trimString); // TODO: ICU implementation } } public static String genCode( @@ -556,8 +560,10 @@ public static String genCode( String expr = "CollationSupport.StringTrim.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s)", srcString); - } { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s)", srcString); + } else { + return String.format(expr 
+ "Lowercase(%s)", srcString); // TODO: ICU implementation } } public static String genCode( @@ -568,8 +574,10 @@ public static String genCode( String expr = "CollationSupport.StringTrim.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation } } public static UTF8String execBinary( @@ -599,8 +607,10 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString); + } else { + return execLowercase(srcString); // TODO: ICU implementation } } public static UTF8String exec( @@ -610,8 +620,10 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); + } else { + return execLowercase(srcString, trimString); // TODO: ICU implementation } } public static String genCode( @@ -621,8 +633,10 @@ public static String genCode( String expr = "CollationSupport.StringTrimLeft.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s)", srcString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s)", srcString); + } else { + return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation } } public static String genCode( @@ -633,8 +647,10 @@ public static String genCode( String expr = 
"CollationSupport.StringTrimLeft.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation } } public static UTF8String execBinary( @@ -664,8 +680,10 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString); + } else { + return execLowercase(srcString); // TODO: ICU implementation } } public static UTF8String exec( @@ -675,8 +693,10 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); + } else { + return execLowercase(srcString, trimString); // TODO: ICU implementation } } public static String genCode( @@ -686,8 +706,10 @@ public static String genCode( String expr = "CollationSupport.StringTrimRight.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s)", srcString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s)", srcString); + } else { + return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation } } public static String genCode( @@ -698,8 +720,10 @@ public static String genCode( String expr = "CollationSupport.StringTrimRight.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", srcString, trimString); - } else { + } else if 
(collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation } } public static UTF8String execBinary( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 09ec501311ade..4527fd3867deb 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LO import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, CollationSupport, GenericArrayData, TypeUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation, StringTypeBinaryLcase} +import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.array.ByteArrayMethods @@ -1021,7 +1021,7 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { override def children: Seq[Expression] = srcStr +: trimStr.toSeq override def dataType: DataType = srcStr.dataType - override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeBinaryLcase) + override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeAnyCollation) final lazy val collationId: Int = srcStr.dataType.asInstanceOf[StringType].collationId From 1f6881d101ac695806f75b9f548c4a577bea3d05 Mon Sep 17 00:00:00 2001 From: Uros Bojanic 
<157381213+uros-db@users.noreply.github.com> Date: Tue, 28 May 2024 20:10:52 +0200 Subject: [PATCH 02/14] Small fixes --- .../spark/sql/catalyst/util/CollationSupport.java | 9 ++++++--- .../sql/CollationStringExpressionsSuite.scala | 14 -------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 42bbab65cda0a..93fcd0eee37d0 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -577,7 +577,8 @@ public static String genCode( } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } else { - return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation + // TODO: ICU implementation + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } } public static UTF8String execBinary( @@ -650,7 +651,8 @@ public static String genCode( } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } else { - return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation + // TODO: ICU implementation + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } } public static UTF8String execBinary( @@ -723,7 +725,8 @@ public static String genCode( } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } else { - return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation + // TODO: ICU implementation + return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } } public static UTF8String 
execBinary( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 9cc123b708aff..61f54c7bbfc46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -945,20 +945,6 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } - test("StringTrim* functions - unsupported collation types") { - List("TRIM", "LTRIM", "RTRIM").foreach(func => { - val collationMismatch = intercept[AnalysisException] { - sql("SELECT " + func + "(COLLATE('x', 'UNICODE_CI'), COLLATE('xxaaaxx', 'UNICODE_CI'))") - } - assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - - val collationMismatch = intercept[AnalysisException] { - sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UNICODE_CI'), COLLATE('x', 'UNICODE_CI'))") - } - assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - } - // TODO: Add more tests for other string expressions } From 5039ddf4e817404a8ac2088d30b037d8a3048d7f Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 31 May 2024 01:07:18 +0200 Subject: [PATCH 03/14] Fix trim logic --- .../util/CollationAwareUTF8String.java | 154 +++++++++--------- .../sql/catalyst/util/CollationFactory.java | 13 ++ .../sql/catalyst/util/CollationSupport.java | 132 +++++---------- .../apache/spark/unsafe/types/UTF8String.java | 87 ++++++++++ .../unsafe/types/CollationSupportSuite.java | 84 ++++++++-- .../spark/unsafe/types/UTF8StringSuite.java | 107 ++++++++++++ .../expressions/stringExpressions.scala | 12 +- .../sql/CollationStringExpressionsSuite.scala | 36 ++++ 8 files changed, 435 insertions(+), 190 deletions(-) diff --git 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 0d0094d8d0a03..10fa90f9fb6f6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -28,6 +28,8 @@ import static org.apache.spark.unsafe.Platform.copyMemory; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.Map; /** @@ -524,57 +526,64 @@ public static Map getCollationAwareDict(UTF8String string, public static UTF8String lowercaseTrim( final UTF8String srcString, final UTF8String trimString) { + return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString); + } + + public static UTF8String trim( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return trimRight(trimLeft(srcString, trimString, collationId), trimString, collationId); + } + + public static UTF8String lowercaseTrimLeft( + final UTF8String srcString, + final UTF8String trimString) { // Matching UTF8String behavior for null `trimString`. 
if (trimString == null) { return null; } - UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString); - return lowercaseTrimRight(leftTrimmed, trimString); + HashSet trimChars = new HashSet<>(); + Iterator trimIter = trimString.codePointIterator(); + while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next())); + + int searchIndex = 0; + Iterator srcIter = srcString.codePointIterator(); + while (srcIter.hasNext()) { + if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) break; + ++searchIndex; + } + + return srcString.substring(searchIndex, srcString.numChars()); } - public static UTF8String lowercaseTrimLeft( + public static UTF8String trimLeft( final UTF8String srcString, - final UTF8String trimString) { + final UTF8String trimString, + final int collationId) { // Matching UTF8String behavior for null `trimString`. if (trimString == null) { return null; } - // The searching byte position in the srcString. - int searchIdx = 0; - // The byte position of a first non-matching character in the srcString. - int trimByteIdx = 0; - // Number of bytes in srcString. - int numBytes = srcString.numBytes(); - // Convert trimString to lowercase, so it can be searched properly. - UTF8String lowercaseTrimString = trimString.toLowerCase(); - - while (searchIdx < numBytes) { - UTF8String searchChar = srcString.copyUTF8String( - searchIdx, - searchIdx + UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1); - int searchCharBytes = searchChar.numBytes(); - - // Try to find the matching for the searchChar in the trimString. - if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { - trimByteIdx += searchCharBytes; - searchIdx += searchCharBytes; - } else { - // No matching, exit the search. - break; - } + // Create a set of collation keys for all characters of the trim string, for fast lookup. 
+ String trim = trimString.toString(); + HashSet trimChars = new HashSet<>(); + for (int i = 0; i < trim.length(); i++) { + trimChars.add(CollationFactory.getCollationKey(String.valueOf(trim.charAt(i)), collationId)); } - if (searchIdx == 0) { - // Nothing trimmed - return original string (not converted to lowercase). - return srcString; + // Iterate over srcString from the left and find the first character that is not in trimChars. + String input = srcString.toString(); + int i = 0; + while (i < input.length()) { + String key = CollationFactory.getCollationKey(String.valueOf(input.charAt(i)), collationId); + if (!trimChars.contains(key)) break; + ++i; } - if (trimByteIdx >= numBytes) { - // Everything trimmed. - return UTF8String.EMPTY_UTF8; - } - return srcString.copyUTF8String(trimByteIdx, numBytes - 1); + // Return the substring from that position to the end of the string. + return UTF8String.fromString(input.substring(i, srcString.numChars())); } public static UTF8String lowercaseTrimRight( @@ -585,53 +594,48 @@ public static UTF8String lowercaseTrimRight( return null; } - // Number of bytes iterated from the srcString. - int byteIdx = 0; - // Number of characters iterated from the srcString. - int numChars = 0; - // Number of bytes in srcString. - int numBytes = srcString.numBytes(); - // Array of character length for the srcString. - int[] stringCharLen = new int[numBytes]; - // Array of the first byte position for each character in the srcString. - int[] stringCharPos = new int[numBytes]; - // Convert trimString to lowercase, so it can be searched properly. - UTF8String lowercaseTrimString = trimString.toLowerCase(); - - // Build the position and length array. 
- while (byteIdx < numBytes) { - stringCharPos[numChars] = byteIdx; - stringCharLen[numChars] = UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx)); - byteIdx += stringCharLen[numChars]; - numChars++; - } - - // Index trimEnd points to the first no matching byte position from the right side of - // the source string. - int trimByteIdx = numBytes - 1; + HashSet trimChars = new HashSet<>(); + Iterator trimIter = trimString.codePointIterator(); + while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next())); - while (numChars > 0) { - UTF8String searchChar = srcString.copyUTF8String( - stringCharPos[numChars - 1], - stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1); - - if(lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { - trimByteIdx -= stringCharLen[numChars - 1]; - numChars--; - } else { + int searchIndex = srcString.numChars(); + Iterator srcIter = srcString.reverseCodePointIterator(); + while (srcIter.hasNext()) { + if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) { break; } + --searchIndex; } - if (trimByteIdx == numBytes - 1) { - // Nothing trimmed. - return srcString; + return srcString.substring(0, searchIndex); + } + + public static UTF8String trimRight( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; } - if (trimByteIdx < 0) { - // Everything trimmed. - return UTF8String.EMPTY_UTF8; + + // Create a set of collation keys for all characters of the trim string, for fast lookup. + String trim = trimString.toString(); + HashSet trimChars = new HashSet<>(); + for (int i = 0; i < trim.length(); i++) { + trimChars.add(CollationFactory.getCollationKey(String.valueOf(trim.charAt(i)), collationId)); + } + + // Iterate over srcString from the right and find the first character that is not in trimChars. 
+ String input = srcString.toString(); + int i = input.length() - 1; + while (i >= 0) { + String key = CollationFactory.getCollationKey(String.valueOf(input.charAt(i)), collationId); + if (!trimChars.contains(key)) break; + --i; } - return srcString.copyUTF8String(0, trimByteIdx); + // Return the substring from the start of the string until that position. + return UTF8String.fromString(input.substring(0, i + 1)); } // TODO: Add more collation-aware UTF8String operations here. diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index fce12510afaf5..89fc240cab27a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -23,6 +23,7 @@ import java.util.function.BiFunction; import java.util.function.ToLongFunction; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.StringSearch; import com.ibm.icu.util.ULocale; @@ -805,6 +806,18 @@ public static String[] getICULocaleNames() { return Collation.CollationSpecICU.ICULocaleNames; } + public static String getCollationKey(String input, int collationId) { + Collation collation = fetchCollation(collationId); + if (collation.supportsBinaryEquality) { + return input; + } else if (collation.supportsLowercaseEquality) { + return input.toLowerCase(); + } else { + CollationKey collationKey = collation.collator.getCollationKey(input); + return Arrays.toString(collationKey.toByteArray()); + } + } + public static UTF8String getCollationKey(UTF8String input, int collationId) { Collation collation = fetchCollation(collationId); if (collation.supportsBinaryEquality) { diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java 
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index e1d9b89f3ae2e..8b376f5a4d020 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -528,17 +528,8 @@ public static UTF8String execICU(final UTF8String source, Map di } public static class StringTrim { - public static UTF8String exec( - final UTF8String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return execBinary(srcString); - } else if (collation.supportsLowercaseEquality) { - return execLowercase(srcString); - } else { - return execLowercase(srcString); // TODO: ICU implementation - } + public static UTF8String exec(final UTF8String srcString) { + return execBinary(srcString); } public static UTF8String exec( final UTF8String srcString, @@ -550,21 +541,11 @@ public static UTF8String exec( } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); } else { - return execLowercase(srcString, trimString); // TODO: ICU implementation + return execICU(srcString, trimString, collationId); } } - public static String genCode( - final String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.StringTrim.exec"; - if (collation.supportsBinaryEquality) { - return String.format(expr + "Binary(%s)", srcString); - } else if (collation.supportsLowercaseEquality) { - return String.format(expr + "Lowercase(%s)", srcString); - } else { - return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation - } + public static String genCode(final String srcString) { + return String.format("CollationSupport.StringTrim.execBinary(%s)", srcString); } public static String genCode( 
final String srcString, @@ -577,8 +558,7 @@ public static String genCode( } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } else { - // TODO: ICU implementation - return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId); } } public static UTF8String execBinary( @@ -590,29 +570,22 @@ public static UTF8String execBinary( final UTF8String trimString) { return srcString.trim(trimString); } - public static UTF8String execLowercase( - final UTF8String srcString) { - return srcString.trim(); - } public static UTF8String execLowercase( final UTF8String srcString, final UTF8String trimString) { return CollationAwareUTF8String.lowercaseTrim(srcString, trimString); } + public static UTF8String execICU( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return CollationAwareUTF8String.trim(srcString, trimString, collationId); + } } public static class StringTrimLeft { - public static UTF8String exec( - final UTF8String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return execBinary(srcString); - } else if (collation.supportsLowercaseEquality) { - return execLowercase(srcString); - } else { - return execLowercase(srcString); // TODO: ICU implementation - } + public static UTF8String exec(final UTF8String srcString) { + return execBinary(srcString); } public static UTF8String exec( final UTF8String srcString, @@ -624,21 +597,11 @@ public static UTF8String exec( } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); } else { - return execLowercase(srcString, trimString); // TODO: ICU implementation + return execICU(srcString, trimString, collationId); } } - public static String genCode( - final String 
srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.StringTrimLeft.exec"; - if (collation.supportsBinaryEquality) { - return String.format(expr + "Binary(%s)", srcString); - } else if (collation.supportsLowercaseEquality) { - return String.format(expr + "Lowercase(%s)", srcString); - } else { - return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation - } + public static String genCode(final String srcString) { + return String.format("CollationSupport.StringTrimLeft.execBinary(%s)", srcString); } public static String genCode( final String srcString, @@ -651,12 +614,10 @@ public static String genCode( } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } else { - // TODO: ICU implementation - return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId); } } - public static UTF8String execBinary( - final UTF8String srcString) { + public static UTF8String execBinary(final UTF8String srcString) { return srcString.trimLeft(); } public static UTF8String execBinary( @@ -664,29 +625,22 @@ public static UTF8String execBinary( final UTF8String trimString) { return srcString.trimLeft(trimString); } - public static UTF8String execLowercase( - final UTF8String srcString) { - return srcString.trimLeft(); - } public static UTF8String execLowercase( final UTF8String srcString, final UTF8String trimString) { return CollationAwareUTF8String.lowercaseTrimLeft(srcString, trimString); } + public static UTF8String execICU( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return CollationAwareUTF8String.trimLeft(srcString, trimString, collationId); + } } public static class StringTrimRight { - public static UTF8String exec( - final UTF8String 
srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return execBinary(srcString); - } else if (collation.supportsLowercaseEquality) { - return execLowercase(srcString); - } else { - return execLowercase(srcString); // TODO: ICU implementation - } + public static UTF8String exec(final UTF8String srcString) { + return execBinary(srcString); } public static UTF8String exec( final UTF8String srcString, @@ -698,21 +652,11 @@ public static UTF8String exec( } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); } else { - return execLowercase(srcString, trimString); // TODO: ICU implementation + return execICU(srcString, trimString, collationId); } } - public static String genCode( - final String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.StringTrimRight.exec"; - if (collation.supportsBinaryEquality) { - return String.format(expr + "Binary(%s)", srcString); - } else if (collation.supportsLowercaseEquality) { - return String.format(expr + "Lowercase(%s)", srcString); - } else { - return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation - } + public static String genCode(final String srcString) { + return String.format("CollationSupport.StringTrimRight.execBinary(%s)", srcString); } public static String genCode( final String srcString, @@ -725,12 +669,10 @@ public static String genCode( } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); } else { - // TODO: ICU implementation - return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId); } } - public static UTF8String execBinary( - final UTF8String srcString) { + 
public static UTF8String execBinary(final UTF8String srcString) { return srcString.trimRight(); } public static UTF8String execBinary( @@ -738,15 +680,17 @@ public static UTF8String execBinary( final UTF8String trimString) { return srcString.trimRight(trimString); } - public static UTF8String execLowercase( - final UTF8String srcString) { - return srcString.trimRight(); - } public static UTF8String execLowercase( final UTF8String srcString, final UTF8String trimString) { return CollationAwareUTF8String.lowercaseTrimRight(srcString, trimString); } + public static UTF8String execICU( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return CollationAwareUTF8String.trimRight(srcString, trimString, collationId); + } } // TODO: Add more collation-aware string expressions. diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 03286e0635287..334421adbeb4d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.regex.Pattern; @@ -270,6 +271,92 @@ public byte[] getBytes() { } } + /** + * Returns the code point starting from the byte at position `index`. 
+ */ + public int codePointFrom(int index) { + if (index < 0 || index >= numBytes) { + throw new IndexOutOfBoundsException(); + } + byte b = getByte(index); + int numBytes = numBytesForFirstByte(b); + return switch (numBytes) { + case 1 -> + b & 0x7F; + case 2 -> + ((b & 0x1F) << 6) | (getByte(index + 1) & 0x3F); + case 3 -> + ((b & 0x0F) << 12) | ((getByte(index + 1) & 0x3F) << 6) | + (getByte(index + 2) & 0x3F); + case 4 -> + ((b & 0x07) << 18) | ((getByte(index + 1) & 0x3F) << 12) | + ((getByte(index + 2) & 0x3F) << 6) | (getByte(index + 3) & 0x3F); + default -> + throw new IllegalArgumentException("Invalid UTF-8 sequence"); + }; + } + + public int getChar(int index) { + if (index < 0 || index >= numChars()) { + throw new IndexOutOfBoundsException(); + } + int charCount = 0, byteCount = 0; + while (charCount < index) { + byteCount += numBytesForFirstByte(getByte(byteCount)); + charCount += 1; + } + return codePointFrom(byteCount); + } + + public Iterator codePointIterator() { + return new CodePointIterator(); + } + private class CodePointIterator implements Iterator { + private int byteIndex = 0; + + @Override + public boolean hasNext() { + return byteIndex < numBytes; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new IndexOutOfBoundsException(); + } + int codePoint = codePointFrom(byteIndex); + byteIndex += numBytesForFirstByte(getByte(byteIndex)); + return codePoint; + } + } + + public Iterator reverseCodePointIterator() { + return new ReverseCodePointIterator(); + } + private class ReverseCodePointIterator implements Iterator { + private int byteIndex = numBytes - 1; + + @Override + public boolean hasNext() { + return byteIndex >= 0; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new IndexOutOfBoundsException(); + } + while (byteIndex > 0 && isContinuationByte(getByte(byteIndex))) { + --byteIndex; + } + return codePointFrom(byteIndex--); + } + + private boolean isContinuationByte(byte b) { + return (b & 0xC0) 
== 0x80; + } + } + /** * Returns a substring of this. * @param start the position of first code point diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index eb18d7665b092..557074373fa7e 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -890,7 +890,7 @@ private void assertStringTrim( if (trimString == null) { result = CollationSupport.StringTrim.exec( - UTF8String.fromString(sourceString), collationId).toString(); + UTF8String.fromString(sourceString)).toString(); } else { result = CollationSupport.StringTrim.exec( UTF8String @@ -911,7 +911,7 @@ private void assertStringTrimLeft( if (trimString == null) { result = CollationSupport.StringTrimLeft.exec( - UTF8String.fromString(sourceString), collationId).toString(); + UTF8String.fromString(sourceString)).toString(); } else { result = CollationSupport.StringTrimLeft.exec( UTF8String @@ -932,7 +932,7 @@ private void assertStringTrimRight( if (trimString == null) { result = CollationSupport.StringTrimRight.exec( - UTF8String.fromString(sourceString), collationId).toString(); + UTF8String.fromString(sourceString)).toString(); } else { result = CollationSupport.StringTrimRight.exec( UTF8String @@ -945,20 +945,29 @@ private void assertStringTrimRight( @Test public void testStringTrim() throws SparkException { + // UTF8_BINARY +// assertStringTrim("UTF8_BINARY", null, null, ""); + assertStringTrim("UTF8_BINARY", "", "", ""); + assertStringTrim("UTF8_BINARY", "", "xyz", ""); + assertStringTrim("UTF8_BINARY", "asd", "", "asd"); assertStringTrim("UTF8_BINARY", "asd", null, "asd"); assertStringTrim("UTF8_BINARY", " asd ", null, "asd"); assertStringTrim("UTF8_BINARY", " a世a ", null, "a世a"); assertStringTrim("UTF8_BINARY", "asd", "x", "asd"); 
assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd"); assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a"); - + assertStringTrimLeft("UTF8_BINARY", "", "", ""); + assertStringTrimLeft("UTF8_BINARY", "", "xyz", ""); + assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd"); assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd"); assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd "); assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a "); assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd"); assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx"); assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax"); - + assertStringTrimRight("UTF8_BINARY", "", "", ""); + assertStringTrimRight("UTF8_BINARY", "", "xyz", ""); + assertStringTrimRight("UTF8_BINARY", "asd", "", "asd"); assertStringTrimRight("UTF8_BINARY", "asd", null, "asd"); assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd"); assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a"); @@ -966,20 +975,28 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a"); + // UTF8_BINARY_LCASE + assertStringTrim("UTF8_BINARY_LCASE", "", "", ""); + assertStringTrim("UTF8_BINARY_LCASE", "", "xyz", ""); + assertStringTrim("UTF8_BINARY_LCASE", "asd", "", "asd"); assertStringTrim("UTF8_BINARY_LCASE", "asd", null, "asd"); assertStringTrim("UTF8_BINARY_LCASE", " asd ", null, "asd"); assertStringTrim("UTF8_BINARY_LCASE", " a世a ", null, "a世a"); assertStringTrim("UTF8_BINARY_LCASE", "asd", "x", "asd"); assertStringTrim("UTF8_BINARY_LCASE", "xxasdxx", "x", "asd"); assertStringTrim("UTF8_BINARY_LCASE", "xa世ax", "x", "a世a"); - + assertStringTrimLeft("UTF8_BINARY_LCASE", "", "", ""); + assertStringTrimLeft("UTF8_BINARY_LCASE", "", "xyz", ""); + assertStringTrimLeft("UTF8_BINARY_LCASE", "asd", "", "asd"); assertStringTrimLeft("UTF8_BINARY_LCASE", "asd", null, "asd"); 
assertStringTrimLeft("UTF8_BINARY_LCASE", " asd ", null, "asd "); assertStringTrimLeft("UTF8_BINARY_LCASE", " a世a ", null, "a世a "); assertStringTrimLeft("UTF8_BINARY_LCASE", "asd", "x", "asd"); assertStringTrimLeft("UTF8_BINARY_LCASE", "xxasdxx", "x", "asdxx"); assertStringTrimLeft("UTF8_BINARY_LCASE", "xa世ax", "x", "a世ax"); - + assertStringTrimRight("UTF8_BINARY_LCASE", "", "", ""); + assertStringTrimRight("UTF8_BINARY_LCASE", "", "xyz", ""); + assertStringTrimRight("UTF8_BINARY_LCASE", "asd", "", "asd"); assertStringTrimRight("UTF8_BINARY_LCASE", "asd", null, "asd"); assertStringTrimRight("UTF8_BINARY_LCASE", " asd ", null, " asd"); assertStringTrimRight("UTF8_BINARY_LCASE", " a世a ", null, " a世a"); @@ -987,20 +1004,28 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UTF8_BINARY_LCASE", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UTF8_BINARY_LCASE", "xa世ax", "x", "xa世a"); - assertStringTrim("UTF8_BINARY_LCASE", "asd", null, "asd"); - assertStringTrim("UTF8_BINARY_LCASE", " asd ", null, "asd"); - assertStringTrim("UTF8_BINARY_LCASE", " a世a ", null, "a世a"); - assertStringTrim("UTF8_BINARY_LCASE", "asd", "x", "asd"); - assertStringTrim("UTF8_BINARY_LCASE", "xxasdxx", "x", "asd"); - assertStringTrim("UTF8_BINARY_LCASE", "xa世ax", "x", "a世a"); - + // UNICODE + assertStringTrim("UNICODE", "", "", ""); + assertStringTrim("UNICODE", "", "xyz", ""); + assertStringTrim("UNICODE", "asd", "", "asd"); + assertStringTrim("UNICODE", "asd", null, "asd"); + assertStringTrim("UNICODE", " asd ", null, "asd"); + assertStringTrim("UNICODE", " a世a ", null, "a世a"); + assertStringTrim("UNICODE", "asd", "x", "asd"); + assertStringTrim("UNICODE", "xxasdxx", "x", "asd"); + assertStringTrim("UNICODE", "xa世ax", "x", "a世a"); + assertStringTrimLeft("UNICODE", "", "", ""); + assertStringTrimLeft("UNICODE", "", "xyz", ""); + assertStringTrimLeft("UNICODE", "asd", "", "asd"); assertStringTrimLeft("UNICODE", "asd", null, "asd"); assertStringTrimLeft("UNICODE", " 
asd ", null, "asd "); assertStringTrimLeft("UNICODE", " a世a ", null, "a世a "); assertStringTrimLeft("UNICODE", "asd", "x", "asd"); assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx"); assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax"); - + assertStringTrimRight("UNICODE", "", "", ""); + assertStringTrimRight("UNICODE", "", "xyz", ""); + assertStringTrimRight("UNICODE", "asd", "", "asd"); assertStringTrimRight("UNICODE", "asd", null, "asd"); assertStringTrimRight("UNICODE", " asd ", null, " asd"); assertStringTrimRight("UNICODE", " a世a ", null, " a世a"); @@ -1008,6 +1033,35 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a"); + // UNICODE_CI + assertStringTrim("UNICODE_CI", "", "", ""); + assertStringTrim("UNICODE_CI", "", "xyz", ""); + assertStringTrim("UNICODE_CI", "asd", "", "asd"); + assertStringTrim("UNICODE_CI", "asd", null, "asd"); + assertStringTrim("UNICODE_CI", " asd ", null, "asd"); + assertStringTrim("UNICODE_CI", " a世a ", null, "a世a"); + assertStringTrim("UNICODE_CI", "asd", "x", "asd"); + assertStringTrim("UNICODE_CI", "xxasdxx", "x", "asd"); + assertStringTrim("UNICODE_CI", "xa世ax", "x", "a世a"); + assertStringTrimLeft("UNICODE_CI", "", "", ""); + assertStringTrimLeft("UNICODE_CI", "", "xyz", ""); + assertStringTrimLeft("UNICODE_CI", "asd", "", "asd"); + assertStringTrimLeft("UNICODE_CI", "asd", null, "asd"); + assertStringTrimLeft("UNICODE_CI", " asd ", null, "asd "); + assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a "); + assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd"); + assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx"); + assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax"); + assertStringTrimRight("UNICODE_CI", "", "", ""); + assertStringTrimRight("UNICODE_CI", "", "xyz", ""); + assertStringTrimRight("UNICODE_CI", "asd", "", "asd"); + assertStringTrimRight("UNICODE_CI", "asd", null, "asd"); + 
assertStringTrimRight("UNICODE_CI", " asd ", null, " asd"); + assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a"); + assertStringTrimRight("UNICODE_CI", "asd", "x", "asd"); + assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd"); + assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a"); + // Test cases where trimString has more than one character assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX"); assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa"); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 0188297fd05a2..7f5f9d359193b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -902,4 +902,111 @@ public void skipWrongFirstByte() { assertEquals(1, fromBytes(c).numChars()); } } + + @Test + public void UTF8StringCodePoints() { + String s = "aéह 日å!"; + UTF8String s0 = fromString(s); + for (int i = 0; i < s.length(); ++i) { + assertEquals(s.codePointAt(i), s0.getChar(i)); + } + + UTF8String s1 = fromBytes(new byte[] {0x41, (byte) 0xC3, (byte) 0xB1, (byte) 0xE2, + (byte) 0x82, (byte) 0xAC, (byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88}); + // numBytesForFirstByte + assertEquals(1, UTF8String.numBytesForFirstByte(s1.getByte(0))); + assertEquals(2, UTF8String.numBytesForFirstByte(s1.getByte(1))); + assertEquals(3, UTF8String.numBytesForFirstByte(s1.getByte(3))); + assertEquals(4, UTF8String.numBytesForFirstByte(s1.getByte(6))); + // getByte + assertEquals((byte) 0x41, s1.getByte(0)); + assertEquals((byte) 0xC3, s1.getByte(1)); + assertEquals((byte) 0xE2, s1.getByte(3)); + assertEquals((byte) 0xF0, s1.getByte(6)); + // codePointFrom + assertEquals(0x41, s1.codePointFrom(0)); + assertEquals(0xF1, s1.codePointFrom(1)); + assertEquals(0x20AC, s1.codePointFrom(3)); + 
assertEquals(0x10348, s1.codePointFrom(6)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.codePointFrom(99)); + // getChar + assertEquals(0x41, s1.getChar(0)); + assertEquals(0xF1, s1.getChar(1)); + assertEquals(0x20AC, s1.getChar(2)); + assertEquals(0x10348, s1.getChar(3)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s1.getChar(99)); + + UTF8String s2 = fromString("Añ€𐍈"); + // numBytesForFirstByte + assertEquals(1, UTF8String.numBytesForFirstByte(s2.getByte(0))); + assertEquals(2, UTF8String.numBytesForFirstByte(s2.getByte(1))); + assertEquals(3, UTF8String.numBytesForFirstByte(s2.getByte(3))); + assertEquals(4, UTF8String.numBytesForFirstByte(s2.getByte(6))); + // getByte + assertEquals((byte) 0x41, s2.getByte(0)); + assertEquals((byte) 0xC3, s2.getByte(1)); + assertEquals((byte) 0xE2, s2.getByte(3)); + assertEquals((byte) 0xF0, s2.getByte(6)); + // codePointFrom + assertEquals(0x41, s2.codePointFrom(0)); + assertEquals(0xF1, s2.codePointFrom(1)); + assertEquals(0x20AC, s2.codePointFrom(3)); + assertEquals(0x10348, s2.codePointFrom(6)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.codePointFrom(99)); + // getChar + assertEquals(0x41, s2.getChar(0)); + assertEquals(0xF1, s2.getChar(1)); + assertEquals(0x20AC, s2.getChar(2)); + assertEquals(0x10348, s2.getChar(3)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s2.getChar(99)); + + UTF8String s3 = EMPTY_UTF8; + // codePointFrom + assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(0)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(99)); + // getChar + 
assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(0)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(-1)); + assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(99)); + } + + private void testCodePointIterator(String str) { + UTF8String s = fromString(str); + Iterator it = s.codePointIterator(); + for (int i = 0; i < str.length(); ++i) { + assertTrue(it.hasNext()); + assertEquals(str.charAt(i), (int) it.next()); + } + assertFalse(it.hasNext()); + } + @Test + public void codePointIterator() { + testCodePointIterator(""); + testCodePointIterator("abc"); + testCodePointIterator("a!2&^R"); + testCodePointIterator("aéह 日å!"); + } + + private void testReverseCodePointIterator(String str) { + UTF8String s = fromString(str); + Iterator it = s.reverseCodePointIterator(); + for (int i = str.length() - 1; i >= 0 ; --i) { + assertTrue(it.hasNext()); + assertEquals(str.charAt(i), (int) it.next()); + } + assertFalse(it.hasNext()); + } + @Test + public void reverseCodePointIterator() { + testReverseCodePointIterator(""); + testReverseCodePointIterator("abc"); + testReverseCodePointIterator("a!2&^R"); + testReverseCodePointIterator("aéह 日å!"); + } + } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 4527fd3867deb..e19f3b63c1520 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1049,11 +1049,11 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { if (evals.length == 1) { val stringTrimCode: String = this match { case _: StringTrim => - CollationSupport.StringTrim.genCode(srcString.value, collationId) + CollationSupport.StringTrim.genCode(srcString.value) case _: StringTrimLeft => - 
CollationSupport.StringTrimLeft.genCode(srcString.value, collationId) + CollationSupport.StringTrimLeft.genCode(srcString.value) case _: StringTrimRight => - CollationSupport.StringTrimRight.genCode(srcString.value, collationId) + CollationSupport.StringTrimRight.genCode(srcString.value) } ev.copy(code = code""" |${srcString.code} @@ -1179,7 +1179,7 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None) override protected def direction: String = "BOTH" override def doEval(srcString: UTF8String): UTF8String = - CollationSupport.StringTrim.exec(srcString, collationId) + CollationSupport.StringTrim.exec(srcString) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = CollationSupport.StringTrim.exec(srcString, trimString, collationId) @@ -1286,7 +1286,7 @@ case class StringTrimLeft(srcStr: Expression, trimStr: Option[Expression] = None override protected def direction: String = "LEADING" override def doEval(srcString: UTF8String): UTF8String = - CollationSupport.StringTrimLeft.exec(srcString, collationId) + CollationSupport.StringTrimLeft.exec(srcString) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = CollationSupport.StringTrimLeft.exec(srcString, trimString, collationId) @@ -1346,7 +1346,7 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non override protected def direction: String = "TRAILING" override def doEval(srcString: UTF8String): UTF8String = - CollationSupport.StringTrimRight.exec(srcString, collationId) + CollationSupport.StringTrimRight.exec(srcString) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = CollationSupport.StringTrimRight.exec(srcString, trimString, collationId) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 61f54c7bbfc46..ce1c09c97f217 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -802,7 +802,43 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("xxxx") { + checkEvaluation( + StringTrim(Literal.create(null, StringType), Literal.create(null, StringType)), null) + } + test("StringTrim* functions - unit tests for both paths (codegen and eval)") { + def evalStringTrim(src: Any, trim: Any, result: String): Unit = { + Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").foreach { collation => + val dt: DataType = StringType(collation) + checkEvaluation(StringTrim(Literal.create(src, dt), Literal.create(trim, dt)), result) + checkEvaluation(StringTrimLeft(Literal.create(src, dt), Literal.create(trim, dt)), result) + checkEvaluation(StringTrimRight(Literal.create(src, dt), Literal.create(trim, dt)), result) + } + } + // General edge cases and basic tests. + evalStringTrim(null, null, null) + evalStringTrim(null, "", null) + evalStringTrim(null, "a", null) + evalStringTrim("", null, null) + evalStringTrim("a", null, null) + evalStringTrim("", "", "") + evalStringTrim("", " ", "") + evalStringTrim("", "a", "") + evalStringTrim("", "aaa", "") + evalStringTrim(" ", "", " ") + evalStringTrim("a", "", "a") + evalStringTrim("aaa", "", "aaa") + evalStringTrim(" ", " ", "") + evalStringTrim(" ", " ", "") + evalStringTrim(" ", " ", "") + evalStringTrim(" ", " ", "") + evalStringTrim("a", "aaa", "") + evalStringTrim("aaa", "a", "") + evalStringTrim("aaa", "aaa", "") + evalStringTrim("abc", "cba", "") + evalStringTrim("cba", "abc", "") + // Without trimString param. 
checkEvaluation(StringTrim(Literal.create( " asd ", StringType("UTF8_BINARY"))), "asd") checkEvaluation( From 361c7b1fd40ef71662fa5c3c8b7b206c4eeefacc Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 31 May 2024 01:52:50 +0200 Subject: [PATCH 04/14] Lint fixes --- .../org/apache/spark/sql/catalyst/util/CollationFactory.java | 1 - .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 1 - .../java/org/apache/spark/unsafe/types/UTF8StringSuite.java | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 89fc240cab27a..39d0057d737ad 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -23,7 +23,6 @@ import java.util.function.BiFunction; import java.util.function.ToLongFunction; -import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.StringSearch; import com.ibm.icu.util.ULocale; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 557074373fa7e..5ed6c9424c8b3 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -946,7 +946,6 @@ private void assertStringTrimRight( @Test public void testStringTrim() throws SparkException { // UTF8_BINARY -// assertStringTrim("UTF8_BINARY", null, null, ""); assertStringTrim("UTF8_BINARY", "", "", ""); assertStringTrim("UTF8_BINARY", "", "xyz", ""); assertStringTrim("UTF8_BINARY", "asd", "", "asd"); diff --git 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 7f5f9d359193b..d07d795bb0c98 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -904,7 +904,7 @@ public void skipWrongFirstByte() { } @Test - public void UTF8StringCodePoints() { + public void utf8StringCodePoints() { String s = "aéह 日å!"; UTF8String s0 = fromString(s); for (int i = 0; i < s.length(); ++i) { From 1af57bffa30ce1663898645ef2569ad755dc8ee7 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:20:53 +0200 Subject: [PATCH 05/14] Update CollationSupportSuite.java --- .../unsafe/types/CollationSupportSuite.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index a738ac5b76375..3fabc18c9d13b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -1311,14 +1311,22 @@ public void testStringTrim() throws SparkException { assertStringTrimLeft("UTF8_BINARY_LCASE", "ddsXXXaa", "asd", "XXXaa"); assertStringTrimRight("UTF8_BINARY_LCASE", "ddsXXXaa", "asd", "ddsXXX"); + assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX"); + assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa"); + assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX"); + // Test cases specific to collation type // uppercase trim, lowercase src assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); assertStringTrim("UTF8_BINARY_LCASE", "asd", "A", "sd"); + assertStringTrim("UNICODE", "asd", "A", "asd"); + 
assertStringTrim("UNICODE_CI", "asd", "A", "sd"); // lowercase trim, uppercase src assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); assertStringTrim("UTF8_BINARY_LCASE", "ASD", "a", "SD"); + assertStringTrim("UNICODE", "ASD", "a", "ASD"); + assertStringTrim("UNICODE_CI", "ASD", "a", "SD"); // uppercase and lowercase chars of different byte-length (utf8) assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); @@ -1329,6 +1337,10 @@ public void testStringTrim() throws SparkException { assertStringTrimLeft("UTF8_BINARY_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); assertStringTrimRight("UTF8_BINARY_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); @@ -1337,6 +1349,10 @@ public void testStringTrim() throws SparkException { assertStringTrimLeft("UTF8_BINARY_LCASE", "ßaaaß", "ẞ", "aaaß"); assertStringTrimRight("UTF8_BINARY_LCASE", "ßaaaß", "ẞ", "ßaaa"); + assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + // different byte-length (utf8) chars trimmed assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ"); @@ -1345,6 +1361,10 @@ public void testStringTrim() throws SparkException { assertStringTrim("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + + assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); } // TODO: Test more collation-aware string 
expressions. From 292acf895b685fd32b75f0cc782f4878ce2f28f4 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 8 Jul 2024 00:19:40 +0200 Subject: [PATCH 06/14] Fix test --- .../org/apache/spark/sql/CollationStringExpressionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 15bb2bd8a9504..1856545c90fe3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -920,7 +920,7 @@ class CollationStringExpressionsSuite test("StringTrim* functions - unit tests for both paths (codegen and eval)") { def evalStringTrim(src: Any, trim: Any, result: String): Unit = { - Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").foreach { collation => + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation => val dt: DataType = StringType(collation) checkEvaluation(StringTrim(Literal.create(src, dt), Literal.create(trim, dt)), result) checkEvaluation(StringTrimLeft(Literal.create(src, dt), Literal.create(trim, dt)), result) From 097a8468c1a41d218ad9a9424acd096aa5ccc22d Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 8 Jul 2024 05:03:43 +0200 Subject: [PATCH 07/14] Refactor trim --- .../util/CollationAwareUTF8String.java | 274 +++++++-- .../unsafe/types/CollationSupportSuite.java | 573 ++++++++++++++++-- 2 files changed, 727 insertions(+), 120 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 7a4b2288ab135..73b76320247e0 100644 --- 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -18,6 +18,8 @@ import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.StringSearch; import com.ibm.icu.util.ULocale; @@ -26,7 +28,10 @@ import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; import static org.apache.spark.unsafe.Platform.copyMemory; +import static org.apache.spark.unsafe.types.UTF8String.CodePointIteratorType; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -426,19 +431,48 @@ private static UTF8String toLowerCaseSlow(final UTF8String target, final int col * @param codePoint The code point to convert to lowercase. * @param sb The StringBuilder to append the lowercase character to. */ - private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { - if (codePoint == 0x0130) { + private static void appendLowercaseCodePoint(final int codePoint, final StringBuilder sb) { + int lowercaseCodePoint = getLowercaseCodePoint(codePoint); + if (lowercaseCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) { // Latin capital letter I with dot above is mapped to 2 lowercase characters. sb.appendCodePoint(0x0069); sb.appendCodePoint(0x0307); + } else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(lowercaseCodePoint); + } + } + + /** + * `CODE_POINT_COMBINED_LOWERCASE_I_DOT` is an internal representation of the combined lowercase + * code point for ASCII lowercase letter i with an additional combining dot character (U+0307). 
+ * This integer value is not a valid code point itself, but rather an artificial code point + * marker used to represent the two lowercase characters that are the result of converting the + * uppercase Turkish dotted letter I with a combining dot character (U+0130) to lowercase. + */ + private static final int CODE_POINT_LOWERCASE_I = 0x69; + private static final int CODE_POINT_COMBINING_DOT = 0x307; + private static final int CODE_POINT_COMBINED_LOWERCASE_I_DOT = + CODE_POINT_LOWERCASE_I << 16 | CODE_POINT_COMBINING_DOT; + + /** + * Returns the lowercase version of the provided code point, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and + * context-sensitive case mappings (i.e. characters that map to different characters based on + * the position in the string relative to other characters in lowercase). + */ + private static int getLowercaseCodePoint(final int codePoint) { + if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + return CODE_POINT_COMBINED_LOWERCASE_I_DOT; } else if (codePoint == 0x03C2) { // Greek final and non-final capital letter sigma should be mapped the same. - sb.appendCodePoint(0x03C3); + return 0x03C3; } else { // All other characters should follow context-unaware ICU single-code point case mapping. - sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); + return UCharacter.toLowerCase(codePoint); } } @@ -446,7 +480,7 @@ else if (codePoint == 0x03C2) { * Converts an entire string to lowercase using ICU rules, code point by code point, with * special handling for one-to-many case mappings (i.e. characters that map to multiple * characters in lowercase). Also, this method omits information about context-sensitive case - * mappings using special handling in the `lowercaseCodePoint` method. + * mappings using special handling in the `appendLowercaseCodePoint` method.
* * @param target The target string to convert to lowercase. * @return The string converted to lowercase in a context-unaware manner. @@ -460,7 +494,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) { String targetString = target.toValidString(); StringBuilder sb = new StringBuilder(); for (int i = 0; i < targetString.length(); ++i) { - lowercaseCodePoint(targetString.codePointAt(i), sb); + appendLowercaseCodePoint(targetString.codePointAt(i), sb); } return UTF8String.fromString(sb.toString()); } @@ -691,12 +725,32 @@ public static Map getCollationAwareDict(UTF8String string, return collationAwareDict; } + /** + * Trims the `srcString` string from both ends of the string using the specified `trimString` + * characters, with respect to the UTF8_LCASE collation. String trimming is performed by + * first trimming the left side of the string, and then trimming the right side of the string. + * The method returns the trimmed string. If the `trimString` is null, the method returns null. + * + * @param srcString the input string to be trimmed from both ends of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for UTF8_LCASE collation) + */ public static UTF8String lowercaseTrim( final UTF8String srcString, final UTF8String trimString) { return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString); } + /** + * Trims the `srcString` string from both ends of the string using the specified `trimString` + * characters, with respect to all ICU collations in Spark. String trimming is performed by + * first trimming the left side of the string, and then trimming the right side of the string. + * The method returns the trimmed string. If the `trimString` is null, the method returns null. 
+ * + * @param srcString the input string to be trimmed from both ends of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for ICU collations) + */ public static UTF8String trim( final UTF8String srcString, final UTF8String trimString, @@ -704,106 +758,218 @@ public static UTF8String trim( return trimRight(trimLeft(srcString, trimString, collationId), trimString, collationId); } + /** + * Trims the `srcString` string from the left side using the specified `trimString` characters, + * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash + * set of lowercased code points in `trimString`, and then iterates over the `srcString` from + * the left side, until reaching a character whose lowercased code point is not in the hash set. + * Finally, the method returns the substring from that position to the end of `srcString`. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. + * + * @param srcString the input string to be trimmed from the left end of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for UTF8_LCASE collation) + */ public static UTF8String lowercaseTrimLeft( final UTF8String srcString, final UTF8String trimString) { - // Matching UTF8String behavior for null `trimString`. + // Matching the default UTF8String behavior for null `trimString`. if (trimString == null) { return null; } + // Create a hash set of lowercased code points for all characters of `trimString`. HashSet trimChars = new HashSet<>(); Iterator trimIter = trimString.codePointIterator(); - while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next())); + while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next())); - int searchIndex = 0; + // Iterate over `srcString` from the left to find the first character that is not in the set. 
+ int searchIndex = 0, codePoint; Iterator srcIter = srcString.codePointIterator(); while (srcIter.hasNext()) { - if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) break; - ++searchIndex; + codePoint = getLowercaseCodePoint(srcIter.next()); + // Special handling for Turkish dotted uppercase letter I. + if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() && + trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) { + int nextCodePoint = getLowercaseCodePoint(srcIter.next()); + if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint)) + || nextCodePoint == CODE_POINT_COMBINING_DOT) searchIndex += 2; + else { + if (trimChars.contains(codePoint)) ++searchIndex; + break; + } + } else if (trimChars.contains(codePoint)) ++searchIndex; + else break; } - return srcString.substring(searchIndex, srcString.numChars()); + // Return the substring from that position to the end of the string. + return searchIndex == 0 ? srcString : srcString.substring(searchIndex, srcString.numChars()); } + /** + * Trims the `srcString` string from the left side using the specified `trimString` characters, + * with respect to ICU collations. For these collations, the method iterates over `srcString` + * from left to right, and repeatedly skips the longest possible substring that matches any + * character in `trimString`, until reaching a character that is not found in `trimString`. + * Finally, the method returns the substring from that position to the end of `srcString`. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. + * + * @param srcString the input string to be trimmed from the left end of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for ICU collations) + */ public static UTF8String trimLeft( final UTF8String srcString, final UTF8String trimString, final int collationId) { - // Matching UTF8String behavior for null `trimString`. 
- if (trimString == null) { - return null; - } + // Short-circuit for base cases. + if (trimString == null) return null; + if (srcString.numBytes() == 0) return srcString; - // Create a set of collation keys for all characters of the trim string, for fast lookup. - String trim = trimString.toString(); - HashSet trimChars = new HashSet<>(); - for (int i = 0; i < trim.length(); i++) { - trimChars.add(CollationFactory.getCollationKey(String.valueOf(trim.charAt(i)), collationId)); + // Create an array of Strings, one for each code point of `trimString`. + int trimCharIndex = 0; + String[] trimChars = new String[trimString.numChars()]; + Iterator trimIter = trimString.codePointIterator( + CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID); + while (trimIter.hasNext()) { + trimChars[trimCharIndex++] = new String(Character.toChars(trimIter.next().intValue())); } // Iterate over srcString from the left and find the first character that is not in trimChars. - String input = srcString.toString(); - int i = 0; - while (i < input.length()) { - String key = CollationFactory.getCollationKey(String.valueOf(input.charAt(i)), collationId); - if (!trimChars.contains(key)) break; - ++i; + String src = srcString.toValidString(); + CharacterIterator target = new StringCharacterIterator(src); + Collator collator = CollationFactory.fetchCollation(collationId).collator; + int charIndex = 0, longestMatchLen; + while (charIndex < src.length()) { + longestMatchLen = 0; + for (String trim : trimChars) { + StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator); + stringSearch.setIndex(charIndex); + int matchIndex = stringSearch.next(); + if (matchIndex == charIndex) { + int matchLen = stringSearch.getMatchLength(); + if (matchLen > longestMatchLen) { + longestMatchLen = matchLen; + } + } + } + if (longestMatchLen == 0) break; + else charIndex += longestMatchLen; } - // Return the substring from that position to the end of the string.
- return UTF8String.fromString(input.substring(i, srcString.numChars())); + + // Return the substring from the calculated position until the end of the string. + return UTF8String.fromString(src.substring(charIndex)); } + /** + * Trims the `srcString` string from the right side using the specified `trimString` characters, + * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash + * set of lowercased code points in `trimString`, and then iterates over the `srcString` from + * the right side, until reaching a character whose lowercased code point is not in the hash set. + * Finally, the method returns the substring from the start of `srcString` until that position. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. + * + * @param srcString the input string to be trimmed from the right end of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for UTF8_LCASE collation) + */ public static UTF8String lowercaseTrimRight( final UTF8String srcString, final UTF8String trimString) { - // Matching UTF8String behavior for null `trimString`. + // Matching the default UTF8String behavior for null `trimString`. if (trimString == null) { return null; } + // Create a hash set of lowercased code points for all characters of `trimString`. HashSet trimChars = new HashSet<>(); Iterator trimIter = trimString.codePointIterator(); - while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next())); + while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next())); - int searchIndex = srcString.numChars(); + // Iterate over `srcString` from the right to find the first character that is not in the set. 
+ int searchIndex = srcString.numChars(), codePoint; Iterator srcIter = srcString.reverseCodePointIterator(); while (srcIter.hasNext()) { - if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) { - break; - } - --searchIndex; + codePoint = getLowercaseCodePoint(srcIter.next()); + // Special handling for Turkish dotted uppercase letter I. + if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() && + trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) { + int nextCodePoint = getLowercaseCodePoint(srcIter.next()); + if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint)) + || nextCodePoint == CODE_POINT_LOWERCASE_I) searchIndex -= 2; + else { + if (trimChars.contains(codePoint)) --searchIndex; + break; + } + } else if (trimChars.contains(codePoint)) --searchIndex; + else break; } - return srcString.substring(0, searchIndex); + // Return the substring from the start of the string to the calculated position. + return searchIndex == srcString.numChars() ? srcString : srcString.substring(0, searchIndex); } + /** + * Trims the `srcString` string from the right side using the specified `trimString` characters, + * with respect to ICU collations. For these collations, the method iterates over `srcString` + * from right to left, and repeatedly skips the longest possible substring that matches any + * character in `trimString`, until reaching a character that is not found in `trimString`. + * Finally, the method returns the substring from the start of `srcString` until that position. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. 
+ * + * @param srcString the input string to be trimmed from the right end of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for ICU collations) + */ public static UTF8String trimRight( final UTF8String srcString, final UTF8String trimString, final int collationId) { - // Matching UTF8String behavior for null `trimString`. - if (trimString == null) { - return null; - } + // Short-circuit for base cases. + if (trimString == null) return null; + if (srcString.numBytes() == 0) return srcString; - // Create a set of collation keys for all characters of the trim string, for fast lookup. - String trim = trimString.toString(); - HashSet trimChars = new HashSet<>(); - for (int i = 0; i < trim.length(); i++) { - trimChars.add(CollationFactory.getCollationKey(String.valueOf(trim.charAt(i)), collationId)); + // Create an array of Strings, one for each code point of `trimString`. + int trimCharIndex = 0; + String[] trimChars = new String[trimString.numChars()]; + Iterator trimIter = trimString.codePointIterator( + CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID); + while (trimIter.hasNext()) { + trimChars[trimCharIndex++] = new String(Character.toChars(trimIter.next().intValue())); } - // Iterate over srcString from the right and find the first character that is not in trimChars. - String input = srcString.toString(); - int i = input.length() - 1; - while (i >= 0) { - String key = CollationFactory.getCollationKey(String.valueOf(input.charAt(i)), collationId); - if (!trimChars.contains(key)) break; - --i; + // Iterate over srcString from the right and find the first character that is not in trimChars.
+ String src = srcString.toValidString(); + CharacterIterator target = new StringCharacterIterator(src); + Collator collator = CollationFactory.fetchCollation(collationId).collator; + int charIndex = src.length(), longestMatchLen; + while (charIndex >= 0) { + longestMatchLen = 0; + for (String trim : trimChars) { + StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator); + // Note: stringSearch.previous() is NOT consistent with stringSearch.next()! + // Example: StringSearch("İ", "i\\u0307İi\\u0307İi\\u0307İ", "UNICODE_CI") + // stringSearch.next() gives: [0, 2, 3, 5, 6, 8]. + // stringSearch.previous() gives: [8, 6, 3, 0]. + stringSearch.setIndex(Math.max(charIndex - 3, 0)); + int matchIndex = stringSearch.next(); + int matchLen = stringSearch.getMatchLength(); + while (matchIndex != StringSearch.DONE && matchIndex < charIndex - matchLen) { + matchIndex = stringSearch.next(); + matchLen = stringSearch.getMatchLength(); + } + if (matchIndex == charIndex - matchLen) { + if (matchLen > longestMatchLen) { + longestMatchLen = matchLen; + } + } + } + if (longestMatchLen == 0) break; + else charIndex -= longestMatchLen; } + // Return the substring from the start of the string until that position. - return UTF8String.fromString(input.substring(0, i + 1)); + return UTF8String.fromString(src.substring(0, charIndex)); } // TODO: Add more collation-aware UTF8String operations here. 
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index c632e51338d0c..42a5e5f3a315d 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -1224,20 +1224,34 @@ private void assertStringTrim( String sourceString, String trimString, String expectedResultString) throws SparkException { + // Prepare the input and expected result. int collationId = CollationFactory.collationNameToId(collation); - String result; + UTF8String src = UTF8String.fromString(sourceString); + UTF8String trim = UTF8String.fromString(trimString); + UTF8String resultTrimLeftRight, resultTrimRightLeft; + String resultTrim; if (trimString == null) { - result = CollationSupport.StringTrim.exec( - UTF8String.fromString(sourceString)).toString(); + // Trim string is ASCII space. + resultTrim = CollationSupport.StringTrim.exec(src).toString(); + UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src); + resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft); + UTF8String trimRight = CollationSupport.StringTrimRight.exec(src); + resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight); } else { - result = CollationSupport.StringTrim.exec( - UTF8String - .fromString(sourceString), UTF8String.fromString(trimString), collationId) - .toString(); + // Trim string is specified. 
+ resultTrim = CollationSupport.StringTrim.exec(src, trim, collationId).toString(); + UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src, trim, collationId); + resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft, trim, collationId); + UTF8String trimRight = CollationSupport.StringTrimRight.exec(src, trim, collationId); + resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight, trim, collationId); } - assertEquals(expectedResultString, result); + // Test that StringTrim result is as expected. + assertEquals(expectedResultString, resultTrim); + // Test that the order of the trims is not important. + assertEquals(resultTrimLeftRight.toString(), resultTrim); + assertEquals(resultTrimRightLeft.toString(), resultTrim); } private void assertStringTrimLeft( @@ -1245,19 +1259,21 @@ private void assertStringTrimLeft( String sourceString, String trimString, String expectedResultString) throws SparkException { + // Prepare the input and expected result. int collationId = CollationFactory.collationNameToId(collation); + UTF8String src = UTF8String.fromString(sourceString); + UTF8String trim = UTF8String.fromString(trimString); String result; if (trimString == null) { - result = CollationSupport.StringTrimLeft.exec( - UTF8String.fromString(sourceString)).toString(); + // Trim string is ASCII space. + result = CollationSupport.StringTrimLeft.exec(src).toString(); } else { - result = CollationSupport.StringTrimLeft.exec( - UTF8String - .fromString(sourceString), UTF8String.fromString(trimString), collationId) - .toString(); + // Trim string is specified. + result = CollationSupport.StringTrimLeft.exec(src, trim, collationId).toString(); } + // Test that StringTrimLeft result is as expected. assertEquals(expectedResultString, result); } @@ -1266,25 +1282,27 @@ private void assertStringTrimRight( String sourceString, String trimString, String expectedResultString) throws SparkException { + // Prepare the input and expected result. 
int collationId = CollationFactory.collationNameToId(collation); + UTF8String src = UTF8String.fromString(sourceString); + UTF8String trim = UTF8String.fromString(trimString); String result; if (trimString == null) { - result = CollationSupport.StringTrimRight.exec( - UTF8String.fromString(sourceString)).toString(); + // Trim string is ASCII space. + result = CollationSupport.StringTrimRight.exec(src).toString(); } else { - result = CollationSupport.StringTrimRight.exec( - UTF8String - .fromString(sourceString), UTF8String.fromString(trimString), collationId) - .toString(); + // Trim string is specified. + result = CollationSupport.StringTrimRight.exec(src, trim, collationId).toString(); } + // Test that StringTrimRight result is as expected. assertEquals(expectedResultString, result); } @Test public void testStringTrim() throws SparkException { - // UTF8_BINARY + // Basic tests - UTF8_BINARY. assertStringTrim("UTF8_BINARY", "", "", ""); assertStringTrim("UTF8_BINARY", "", "xyz", ""); assertStringTrim("UTF8_BINARY", "asd", "", "asd"); @@ -1312,8 +1330,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd"); assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a"); - - // UTF8_LCASE + // Basic tests - UTF8_LCASE. assertStringTrim("UTF8_LCASE", "", "", ""); assertStringTrim("UTF8_LCASE", "", "xyz", ""); assertStringTrim("UTF8_LCASE", "asd", "", "asd"); @@ -1341,8 +1358,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd"); assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a"); - - // UNICODE + // Basic tests - UNICODE. 
assertStringTrim("UNICODE", "", "", ""); assertStringTrim("UNICODE", "", "xyz", ""); assertStringTrim("UNICODE", "asd", "", "asd"); @@ -1370,8 +1386,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UNICODE", "asd", "x", "asd"); assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a"); - - // UNICODE_CI + // Basic tests - UNICODE_CI. assertStringTrim("UNICODE_CI", "", "", ""); assertStringTrim("UNICODE_CI", "", "xyz", ""); assertStringTrim("UNICODE_CI", "asd", "", "asd"); @@ -1400,69 +1415,495 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a"); - // Test cases where trimString has more than one character + // Case variation - UTF8_BINARY. + assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX"); + assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa"); assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX"); - - assertStringTrim("UTF8_LCASE", "ddsXXXaa", "asd", "XXX"); - assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "asd", "XXXaa"); - assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "asd", "ddsXXX"); - + // Case variation - UTF8_LCASE. + assertStringTrim("UTF8_LCASE", "asd", "A", "sd"); + assertStringTrim("UTF8_LCASE", "ASD", "a", "SD"); + assertStringTrim("UTF8_LCASE", "ddsXXXaa", "ASD", "XXX"); + assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa"); + assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX"); + // Case variation - UNICODE. 
+ assertStringTrim("UNICODE", "asd", "A", "asd"); + assertStringTrim("UNICODE", "ASD", "a", "ASD"); assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX"); assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa"); assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX"); - - // Test cases specific to collation type - // uppercase trim, lowercase src - assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); - assertStringTrim("UTF8_LCASE", "asd", "A", "sd"); - assertStringTrim("UNICODE", "asd", "A", "asd"); + // Case variation - UNICODE_CI. assertStringTrim("UNICODE_CI", "asd", "A", "sd"); - - // lowercase trim, uppercase src - assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); - assertStringTrim("UTF8_LCASE", "ASD", "a", "SD"); - assertStringTrim("UNICODE", "ASD", "a", "ASD"); assertStringTrim("UNICODE_CI", "ASD", "a", "SD"); + assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX"); + assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa"); + assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX"); - // uppercase and lowercase chars of different byte-length (utf8) + // Case-variable character length - UTF8_BINARY. 
assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); - - assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa"); - assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); - assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); - - assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); - - assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa"); - assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß"); - assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa"); - - assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - - // different byte-length (utf8) chars trimmed assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - + // Case-variable character length - UTF8_LCASE. + assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa"); + assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa"); + assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß"); + assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa"); assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - + // Case-variable character length - UNICODE. 
+ assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + // Case-variable character length - UNICODE_CI. + assertStringTrim("UNICODE_CI", "ẞaaaẞ", "ß", "aaa"); + assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrim("UNICODE_CI", "ßaaaß", "ẞ", "aaa"); + assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß"); + assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa"); + assertStringTrim("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + + // One-to-many case mapping - UTF8_BINARY. 
+ assertStringTrim("UTF8_BINARY", "i", "i", ""); + assertStringTrim("UTF8_BINARY", "iii", "I", "iii"); + assertStringTrim("UTF8_BINARY", "I", "iii", "I"); + assertStringTrim("UTF8_BINARY", "ixi", "i", "x"); + assertStringTrim("UTF8_BINARY", "i", "İ", "i"); + assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); + assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307"); + assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i"); + assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307i", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); + assertStringTrim("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ"); + assertStringTrim("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307"); + assertStringTrim("UTF8_BINARY", "İ", "İ", ""); + assertStringTrim("UTF8_BINARY", "IXi", "İ", "IXi"); + assertStringTrim("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrim("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrim("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); + assertStringTrim("UTF8_BINARY", "İ", "i", "İ"); + assertStringTrim("UTF8_BINARY", "İ", "\u0307", "İ"); + assertStringTrim("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrim("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); + assertStringTrim("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi"); + assertStringTrimLeft("UTF8_BINARY", "i", "i", ""); + assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii"); + assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I"); + assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi"); + assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UTF8_BINARY", 
"i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ"); + assertStringTrimLeft("UTF8_BINARY", "İ", "İ", ""); + assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi"); + assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); + assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ"); + assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ"); + assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimRight("UTF8_BINARY", "i", "i", ""); + assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii"); + assertStringTrimRight("UTF8_BINARY", "I", "iii", "I"); + assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix"); + assertStringTrimRight("UTF8_BINARY", "i", "İ", "i"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ"); + 
assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307"); + assertStringTrimRight("UTF8_BINARY", "İ", "İ", ""); + assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi"); + assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); + assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ"); + assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ"); + assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi"); + // One-to-many case mapping - UTF8_LCASE. + assertStringTrim("UTF8_LCASE", "i", "i", ""); + assertStringTrim("UTF8_LCASE", "iii", "I", ""); + assertStringTrim("UTF8_LCASE", "I", "iii", ""); + assertStringTrim("UTF8_LCASE", "ixi", "i", "x"); + assertStringTrim("UTF8_LCASE", "i", "İ", "i"); + assertStringTrim("UTF8_LCASE", "i\u0307", "İ", ""); + assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307"); + assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i"); + assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307i", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307i", "İ", "i"); + assertStringTrim("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ"); + assertStringTrim("UTF8_LCASE", "i\u0307İ", "İ", ""); + assertStringTrim("UTF8_LCASE", "İ", "İ", ""); + assertStringTrim("UTF8_LCASE", "IXi", "İ", "IXi"); + assertStringTrim("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307"); + assertStringTrim("UTF8_LCASE", "i\u0307x", "IXİ", ""); + assertStringTrim("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); + assertStringTrim("UTF8_LCASE", "İ", "i", "İ"); + assertStringTrim("UTF8_LCASE", "İ", "\u0307", "İ"); + 
assertStringTrim("UTF8_LCASE", "Ixİ", "i\u0307", "xİ"); + assertStringTrim("UTF8_LCASE", "IXİ", "ix\u0307", "İ"); + assertStringTrim("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); + assertStringTrimLeft("UTF8_LCASE", "i", "i", ""); + assertStringTrimLeft("UTF8_LCASE", "iii", "I", ""); + assertStringTrimLeft("UTF8_LCASE", "I", "iii", ""); + assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi"); + assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", ""); + assertStringTrimLeft("UTF8_LCASE", "İ", "İ", ""); + assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi"); + assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ"); + assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ"); + assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ"); + assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ"); + assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); + assertStringTrimRight("UTF8_LCASE", "i", "i", ""); + assertStringTrimRight("UTF8_LCASE", "iii", "I", ""); + assertStringTrimRight("UTF8_LCASE", "I", "iii", ""); + assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix"); + assertStringTrimRight("UTF8_LCASE", "i", "İ", "i"); + 
assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i"); + assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", ""); + assertStringTrimRight("UTF8_LCASE", "İ", "İ", ""); + assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi"); + assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ"); + assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ"); + assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); + // One-to-many case mapping - UNICODE. 
+ assertStringTrim("UNICODE", "i", "i", ""); + assertStringTrim("UNICODE", "iii", "I", "iii"); + assertStringTrim("UNICODE", "I", "iii", "I"); + assertStringTrim("UNICODE", "ixi", "i", "x"); + assertStringTrim("UNICODE", "i", "İ", "i"); + assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307", "i", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrim("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrim("UNICODE", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrim("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrim("UNICODE", "i\u0307İ", "İ", "i\u0307"); + assertStringTrim("UNICODE", "İ", "İ", ""); + assertStringTrim("UNICODE", "IXi", "İ", "IXi"); + assertStringTrim("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrim("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrim("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307"); + assertStringTrim("UNICODE", "İ", "i", "İ"); + assertStringTrim("UNICODE", "İ", "\u0307", "İ"); + assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrim("UNICODE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrim("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimLeft("UNICODE", "i", "i", ""); + assertStringTrimLeft("UNICODE", "iii", "I", "iii"); + assertStringTrimLeft("UNICODE", "I", "iii", "I"); + assertStringTrimLeft("UNICODE", "ixi", "i", "xi"); + assertStringTrimLeft("UNICODE", "i", "İ", "i"); + assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE", 
"i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i"); + assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ"); + assertStringTrimLeft("UNICODE", "İ", "İ", ""); + assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi"); + assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x"); + assertStringTrimLeft("UNICODE", "İ", "i", "İ"); + assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ"); + assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimRight("UNICODE", "i", "i", ""); + assertStringTrimRight("UNICODE", "iii", "I", "iii"); + assertStringTrimRight("UNICODE", "I", "iii", "I"); + assertStringTrimRight("UNICODE", "ixi", "i", "ix"); + assertStringTrimRight("UNICODE", "i", "İ", "i"); + assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UNICODE", 
"i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307"); + assertStringTrimRight("UNICODE", "İ", "İ", ""); + assertStringTrimRight("UNICODE", "IXi", "İ", "IXi"); + assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307"); + assertStringTrimRight("UNICODE", "İ", "i", "İ"); + assertStringTrimRight("UNICODE", "İ", "\u0307", "İ"); + assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); + // One-to-many case mapping - UNICODE_CI. + assertStringTrim("UNICODE_CI", "i", "i", ""); + assertStringTrim("UNICODE_CI", "iii", "I", ""); + assertStringTrim("UNICODE_CI", "I", "iii", ""); + assertStringTrim("UNICODE_CI", "ixi", "i", "x"); + assertStringTrim("UNICODE_CI", "i", "İ", "i"); + assertStringTrim("UNICODE_CI", "i\u0307", "İ", ""); + assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307i", "İ", "i"); + assertStringTrim("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrim("UNICODE_CI", "i\u0307İ", "İ", ""); + assertStringTrim("UNICODE_CI", "İ", "İ", ""); + assertStringTrim("UNICODE_CI", "IXi", "İ", "IXi"); + assertStringTrim("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307x", "IXİ", ""); + assertStringTrim("UNICODE_CI", "i\u0307x", 
"I\u0307xİ", ""); + assertStringTrim("UNICODE_CI", "İ", "i", "İ"); + assertStringTrim("UNICODE_CI", "İ", "\u0307", "İ"); + assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "Ixİ", "i\u0307", "xİ"); + assertStringTrim("UNICODE_CI", "IXİ", "ix\u0307", "İ"); + assertStringTrim("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i", "i", ""); + assertStringTrimLeft("UNICODE_CI", "iii", "I", ""); + assertStringTrimLeft("UNICODE_CI", "I", "iii", ""); + assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi"); + assertStringTrimLeft("UNICODE_CI", "i", "İ", "i"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "İ", ""); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307i"); + assertStringTrimLeft("UNICODE_CI", "i\u0307i", "İ", "i"); + assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "İ", ""); + assertStringTrimLeft("UNICODE_CI", "İ", "İ", ""); + assertStringTrimLeft("UNICODE_CI", "IXi", "İ", "IXi"); + assertStringTrimLeft("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307x", "IXİ", ""); + assertStringTrimLeft("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimLeft("UNICODE_CI", "İ", "i", "İ"); + assertStringTrimLeft("UNICODE_CI", "İ", "\u0307", "İ"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "Ixİ", "i\u0307", "xİ"); + assertStringTrimLeft("UNICODE_CI", "IXİ", "ix\u0307", "İ"); + assertStringTrimLeft("UNICODE_CI", "xi\u0307", 
"\u0307IX", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i", "i", ""); + assertStringTrimRight("UNICODE_CI", "iii", "I", ""); + assertStringTrimRight("UNICODE_CI", "I", "iii", ""); + assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix"); + assertStringTrimRight("UNICODE_CI", "i", "İ", "i"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", ""); + assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight("UNICODE_CI", "i\u0307İ", "İ", ""); + assertStringTrimRight("UNICODE_CI", "İ", "İ", ""); + assertStringTrimRight("UNICODE_CI", "IXi", "İ", "IXi"); + assertStringTrimRight("UNICODE_CI", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307x", "IXİ", ""); + assertStringTrimRight("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimRight("UNICODE_CI", "İ", "i", "İ"); + assertStringTrimRight("UNICODE_CI", "İ", "\u0307", "İ"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307"); + + // Conditional case mapping - UTF8_BINARY. 
+ assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς"); + assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x"); + assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς"); + assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x"); + assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ"); + assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ"); + assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς"); + assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς"); + assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς"); + assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ"); + assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ"); + assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UTF8_BINARY", "ςxς", "σ", "ςxς"); + assertStringTrimRight("UTF8_BINARY", "ςxς", "ς", "ςx"); + assertStringTrimRight("UTF8_BINARY", "ςxς", "Σ", "ςxς"); + assertStringTrimRight("UTF8_BINARY", "σxσ", "σ", "σx"); + assertStringTrimRight("UTF8_BINARY", "σxσ", "ς", "σxσ"); + assertStringTrimRight("UTF8_BINARY", "σxσ", "Σ", "σxσ"); + assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx"); + // Conditional case mapping - UTF8_LCASE. 
+ assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x"); + assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x"); + assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x"); + assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x"); + assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x"); + assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x"); + assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x"); + assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x"); + assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς"); + assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς"); + assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς"); + assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ"); + assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ"); + assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ"); + assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ"); + assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UTF8_LCASE", "ςxς", "σ", "ςx"); + assertStringTrimRight("UTF8_LCASE", "ςxς", "ς", "ςx"); + assertStringTrimRight("UTF8_LCASE", "ςxς", "Σ", "ςx"); + assertStringTrimRight("UTF8_LCASE", "σxσ", "σ", "σx"); + assertStringTrimRight("UTF8_LCASE", "σxσ", "ς", "σx"); + assertStringTrimRight("UTF8_LCASE", "σxσ", "Σ", "σx"); + assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx"); + assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx"); + assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx"); + // Conditional case mapping - UNICODE. 
+ assertStringTrim("UNICODE", "ςxς", "σ", "ςxς"); + assertStringTrim("UNICODE", "ςxς", "ς", "x"); + assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς"); + assertStringTrim("UNICODE", "σxσ", "σ", "x"); + assertStringTrim("UNICODE", "σxσ", "ς", "σxσ"); + assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ"); + assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς"); + assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς"); + assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς"); + assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ"); + assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ"); + assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UNICODE", "ςxς", "σ", "ςxς"); + assertStringTrimRight("UNICODE", "ςxς", "ς", "ςx"); + assertStringTrimRight("UNICODE", "ςxς", "Σ", "ςxς"); + assertStringTrimRight("UNICODE", "σxσ", "σ", "σx"); + assertStringTrimRight("UNICODE", "σxσ", "ς", "σxσ"); + assertStringTrimRight("UNICODE", "σxσ", "Σ", "σxσ"); + assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx"); + // Conditional case mapping - UNICODE_CI. 
+ assertStringTrim("UNICODE_CI", "ςxς", "σ", "x"); + assertStringTrim("UNICODE_CI", "ςxς", "ς", "x"); + assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x"); + assertStringTrim("UNICODE_CI", "σxσ", "σ", "x"); + assertStringTrim("UNICODE_CI", "σxσ", "ς", "x"); + assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x"); + assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x"); + assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x"); + assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς"); + assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς"); + assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς"); + assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ"); + assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ"); + assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ"); + assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ"); + assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UNICODE_CI", "ςxς", "σ", "ςx"); + assertStringTrimRight("UNICODE_CI", "ςxς", "ς", "ςx"); + assertStringTrimRight("UNICODE_CI", "ςxς", "Σ", "ςx"); + assertStringTrimRight("UNICODE_CI", "σxσ", "σ", "σx"); + assertStringTrimRight("UNICODE_CI", "σxσ", "ς", "σx"); + assertStringTrimRight("UNICODE_CI", "σxσ", "Σ", "σx"); + assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx"); + assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx"); + assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx"); } // TODO: Test more collation-aware string expressions. 
From 176b148972300286186ab6a2dcf915522f1bf1d7 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 8 Jul 2024 05:10:48 +0200 Subject: [PATCH 08/14] Add tests --- .../unsafe/types/CollationSupportSuite.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 42a5e5f3a315d..48897156342f8 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -1904,6 +1904,23 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx"); assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx"); assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx"); + + // Unicode normalization - UTF8_BINARY. + assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimRight("UTF8_BINARY", "åβγδa\u030A", "å", "åβγδa\u030A"); + // Unicode normalization - UTF8_LCASE. + assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimRight("UTF8_LCASE", "åβγδa\u030A", "Å", "åβγδa\u030A"); + // Unicode normalization - UNICODE. + assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ"); + assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimRight("UNICODE", "åβγδa\u030A", "å", "åβγδ"); + // Unicode normalization - UNICODE_CI. 
+ assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ"); + assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimRight("UNICODE_CI", "åβγδa\u030A", "Å", "åβγδ"); } // TODO: Test more collation-aware string expressions. From 598f52216f8ca43dbb2a3c75f40cfc3d3aba4f70 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 8 Jul 2024 05:14:24 +0200 Subject: [PATCH 09/14] Remove unused code --- .../spark/sql/catalyst/util/CollationFactory.java | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index c4bc0eda81511..f13f66e384e0f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -813,18 +813,6 @@ public static String[] getICULocaleNames() { return Collation.CollationSpecICU.ICULocaleNames; } - public static String getCollationKey(String input, int collationId) { - Collation collation = fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return input; - } else if (collation.supportsLowercaseEquality) { - return input.toLowerCase(); - } else { - CollationKey collationKey = collation.collator.getCollationKey(input); - return Arrays.toString(collationKey.toByteArray()); - } - } - public static UTF8String getCollationKey(UTF8String input, int collationId) { Collation collation = fetchCollation(collationId); if (collation.supportsBinaryEquality) { From 12cc084baccd82e243a893cb9d0e0440ceb4ff5a Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:40:48 +0200 Subject: [PATCH 10/14] Fix java lint --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++++-- 1 file changed, 6 
insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 73b76320247e0..427f8c023fef0 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -798,7 +798,9 @@ public static UTF8String lowercaseTrimLeft( if (trimChars.contains(codePoint)) ++searchIndex; break; } - } else if (trimChars.contains(codePoint)) ++searchIndex; + } else if (trimChars.contains(codePoint)) { + ++searchIndex; + } else break; } @@ -901,7 +903,9 @@ public static UTF8String lowercaseTrimRight( if (trimChars.contains(codePoint)) --searchIndex; break; } - } else if (trimChars.contains(codePoint)) --searchIndex; + } else if (trimChars.contains(codePoint)) { + --searchIndex; + } else break; } From 469c325be8f1e03216cdc1a3ca5c419e2bc6ac67 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 9 Jul 2024 20:28:00 +0200 Subject: [PATCH 11/14] Fix --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 427f8c023fef0..6fa07256ed9c4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -467,7 +467,9 @@ private static int getLowercaseCodePoint(final int codePoint) { return CODE_POINT_COMBINED_LOWERCASE_I_DOT; } else if (codePoint == 0x03C2) { - // Greek final and non-final capital letter sigma should be mapped the same. 
+ // Greek final and non-final letter sigma should be mapped the same. This is achieved by + // mapping Greek small final sigma (U+03C2) to Greek small non-final sigma (U+03C3). Capital + // letter sigma (U+03A3) is mapped to small non-final sigma (U+03C3) in the `else` branch. return 0x03C3; } else { From 06575fc5a40adf4277993489ed9e9a5db19033ae Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 12 Jul 2024 07:54:08 +0200 Subject: [PATCH 12/14] Fix comments --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 48897156342f8..1a5c585791c5b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -1792,7 +1792,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ"); assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307"); - // Conditional case mapping - UTF8_BINARY. + // Greek sigmas - UTF8_BINARY. assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς"); assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x"); assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς"); @@ -1820,7 +1820,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx"); - // Conditional case mapping - UTF8_LCASE. + // Greek sigmas - UTF8_LCASE. 
assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x"); assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x"); assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x"); @@ -1848,7 +1848,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx"); assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx"); assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx"); - // Conditional case mapping - UNICODE. + // Greek sigmas - UNICODE. assertStringTrim("UNICODE", "ςxς", "σ", "ςxς"); assertStringTrim("UNICODE", "ςxς", "ς", "x"); assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς"); @@ -1876,7 +1876,7 @@ public void testStringTrim() throws SparkException { assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx"); - // Conditional case mapping - UNICODE_CI. + // Greek sigmas - UNICODE_CI. assertStringTrim("UNICODE_CI", "ςxς", "σ", "x"); assertStringTrim("UNICODE_CI", "ςxς", "ς", "x"); assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x"); From 8c5787d6fc079775ada3624d0b1c3e82e505182b Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:42:50 +0200 Subject: [PATCH 13/14] Fixes --- .../util/CollationAwareUTF8String.java | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 06c06a8c62a26..b9868ca665a65 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -866,6 +866,7 @@ public static UTF8String lowercaseTrim( * * @param srcString the input string to be trimmed from both ends of the string * @param trimString the trim string 
characters to trim + * @param collationId the collation ID to use for string trimming * @return the trimmed string (for ICU collations) */ public static UTF8String trim( @@ -910,7 +911,9 @@ public static UTF8String lowercaseTrimLeft( trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) { int nextCodePoint = getLowercaseCodePoint(srcIter.next()); if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint)) - || nextCodePoint == CODE_POINT_COMBINING_DOT) searchIndex += 2; + || nextCodePoint == CODE_POINT_COMBINING_DOT) { + searchIndex += 2; + } else { if (trimChars.contains(codePoint)) ++searchIndex; break; @@ -918,7 +921,9 @@ public static UTF8String lowercaseTrimLeft( } else if (trimChars.contains(codePoint)) { ++searchIndex; } - else break; + else { + break; + } } // Return the substring from that position to the end of the string. @@ -935,6 +940,7 @@ public static UTF8String lowercaseTrimLeft( * * @param srcString the input string to be trimmed from the left end of the string * @param trimString the trim string characters to trim + * @param collationId the collation ID to use for string trimming * @return the trimmed string (for ICU collations) */ public static UTF8String trimLeft( @@ -946,12 +952,12 @@ public static UTF8String trimLeft( if (srcString.numBytes() == 0) return srcString; // Create an array of Strings for all characters of `trimString`. - int trimCharIndex = 0; - String[] trimChars = new String[trimString.numChars()]; + Map<Integer, String> trimChars = new HashMap<>(); Iterator<Integer> trimIter = trimString.codePointIterator( CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID); while (trimIter.hasNext()) { - trimChars[trimCharIndex++] = String.valueOf((char) trimIter.next().intValue()); + int codePoint = trimIter.next(); + trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint)); } // Iterate over srcString from the left and find the first character that is not in trimChars. 
@@ -961,7 +967,7 @@ public static UTF8String trimLeft( int charIndex = 0, longestMatchLen; while (charIndex < src.length()) { longestMatchLen = 0; - for (String trim : trimChars) { + for (String trim : trimChars.values()) { StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator); stringSearch.setIndex(charIndex); int matchIndex = stringSearch.next(); @@ -1015,7 +1021,9 @@ public static UTF8String lowercaseTrimRight( trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) { int nextCodePoint = getLowercaseCodePoint(srcIter.next()); if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint)) - || nextCodePoint == CODE_POINT_LOWERCASE_I) searchIndex -= 2; + || nextCodePoint == CODE_POINT_LOWERCASE_I) { + searchIndex -= 2; + } else { if (trimChars.contains(codePoint)) --searchIndex; break; @@ -1023,7 +1031,9 @@ public static UTF8String lowercaseTrimRight( } else if (trimChars.contains(codePoint)) { --searchIndex; } - else break; + else { + break; + } } // Return the substring from the start of the string to the calculated position. @@ -1040,6 +1050,7 @@ public static UTF8String lowercaseTrimRight( * * @param srcString the input string to be trimmed from the right end of the string * @param trimString the trim string characters to trim + * @param collationId the collation ID to use for string trimming * @return the trimmed string (for ICU collations) */ public static UTF8String trimRight( @@ -1051,12 +1062,12 @@ public static UTF8String trimRight( if (srcString.numBytes() == 0) return srcString; // Create an array of Strings for all characters of `trimString`. 
- int trimCharIndex = 0; - String[] trimChars = new String[trimString.numChars()]; + Map<Integer, String> trimChars = new HashMap<>(); Iterator<Integer> trimIter = trimString.codePointIterator( CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID); while (trimIter.hasNext()) { - trimChars[trimCharIndex++] = String.valueOf((char) trimIter.next().intValue()); + int codePoint = trimIter.next(); + trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint)); } // Iterate over srcString from the left and find the first character that is not in trimChars. @@ -1066,12 +1077,14 @@ public static UTF8String trimRight( int charIndex = src.length(), longestMatchLen; while (charIndex >= 0) { longestMatchLen = 0; - for (String trim : trimChars) { + for (String trim : trimChars.values()) { StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator); // Note: stringSearch.previous() is NOT consistent with stringSearch.next()! // Example: StringSearch("İ", "i\\u0307İi\\u0307İi\\u0307İ", "UNICODE_CI") // stringSearch.next() gives: [0, 2, 3, 5, 6, 8]. // stringSearch.previous() gives: [8, 6, 3, 0]. + // Since 1 character can map to at most 3 characters in Unicode, we can begin the search + // from character position: `charIndex` - 3, and use `next()` to find the longest match. 
stringSearch.setIndex(Math.max(charIndex - 3, 0)); int matchIndex = stringSearch.next(); int matchLen = stringSearch.getMatchLength(); From d15d92aa9f0b15c879a531756f3f474264bec462 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Sun, 14 Jul 2024 22:51:41 +0200 Subject: [PATCH 14/14] Fix tests --- .../apache/spark/sql/CollationStringExpressionsSuite.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 5387b2d435350..815a8bc595294 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -874,11 +874,6 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } - test("xxxx") { - checkEvaluation( - StringTrim(Literal.create(null, StringType), Literal.create(null, StringType)), null) - } - test("StringTrim* functions - unit tests for both paths (codegen and eval)") { def evalStringTrim(src: Any, trim: Any, result: String): Unit = { Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>