From a5759879338233055aeb9ac38fcecc06ff93bad5 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 22 May 2024 10:03:15 +0200 Subject: [PATCH 01/14] Initial commit --- .../util/CollationAwareUTF8String.java | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e652..0f4a59f33b682 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -141,18 +141,84 @@ public static String toUpperCase(final String target, final int collationId) { return UCharacter.toUpperCase(locale, target); } + private static int uppercaseCodePoint(final int codePoint, final StringBuilder sb, final int i, + final String target) { + // Latin small letter i with an additional dot is represented using 2 characters. + if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) { + sb.append("İ"); + return 1; + } + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toTitleCase(codePoint)); + return 0; + } + + public static String toUpperCase(final String target) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < target.length(); ++i) { + int codePoint = target.codePointAt(i); + // Latin small letter i with an additional dot above (represented using 2 characters). + if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) { + sb.append("İ"); + ++i; + } + // All other characters should follow context-unaware ICU single-code point case mapping. 
+ else { + sb.appendCodePoint(UCharacter.toUpperCase(codePoint)); + } + } + return sb.toString(); + } + public static String toLowerCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); return UCharacter.toLowerCase(locale, target); } + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + if (codePoint == 0x0130) { + sb.append("i̇"); + } + // Greek final and non-final capital letter sigma should be mapped the same. + else if (codePoint == 0x03C2) { + sb.append("σ"); + } + // All other characters should follow context-unaware ICU single-code point case mapping. + else { + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); + } + } + + public static String toLowerCase(final String target) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < target.length(); ++i) { + int codePoint = target.codePointAt(i); + lowercaseCodePoint(codePoint, sb); + } + return sb.toString(); + } + public static String toTitleCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale)); } + public static String toTitleCase(final String target) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < target.length(); ++i) { + int codePoint = target.codePointAt(i); + if (i == 0 || Character.isWhitespace(target.codePointBefore(i))) { + i += uppercaseCodePoint(codePoint, sb, i, target); + } else { + lowercaseCodePoint(codePoint, sb); + } + } + return sb.toString(); + } + public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { if (match.contains(UTF8String.fromString(","))) { return 0; From a7241002255293e322a0ed246b5c7ea347f23c21 Mon Sep 17 
00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 22 May 2024 10:03:18 +0200 Subject: [PATCH 02/14] Tests --- .../unsafe/types/CollationSupportSuite.java | 169 +++++++++++++++++- 1 file changed, 168 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7fc3c4e349c3b..e1dd96f6cb8f6 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -17,15 +17,181 @@ package org.apache.spark.unsafe.types; import org.apache.spark.SparkException; +import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.sql.catalyst.util.CollationSupport; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * Collation-aware UTF8String comparison. 
+ */ + + private void assertLowercase(String target, String expected, String collationName) + throws SparkException { + if (collationName.equals("UTF8_BINARY")) { + UTF8String targetUTF8 = UTF8String.fromString(target); + UTF8String expectedUTF8 = UTF8String.fromString(expected); + assertEquals(expectedUTF8, targetUTF8.toLowerCase()); + } else if (collationName.equals("UTF8_BINARY_LCASE")) { + assertEquals(expected, CollationAwareUTF8String.toLowerCase(target)); + } else { + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId)); + } + } + + @Test + public void testLowercase() throws SparkException { + // Edge cases + assertLowercase("", "", "UTF8_BINARY"); + assertLowercase("", "", "UTF8_BINARY_LCASE"); + assertLowercase("", "", "UNICODE"); + assertLowercase("", "", "UNICODE_CI"); + // Basic tests + assertLowercase("abcd", "abcd", "UTF8_BINARY"); + assertLowercase("AbCd", "abcd", "UTF8_BINARY"); + assertLowercase("abcd", "abcd", "UTF8_BINARY_LCASE"); + assertLowercase("aBcD", "abcd", "UTF8_BINARY_LCASE"); + assertLowercase("abcd", "abcd", "UNICODE"); + assertLowercase("aBCd", "abcd", "UNICODE"); + assertLowercase("abcd", "abcd", "UNICODE_CI"); + assertLowercase("AbcD", "abcd", "UNICODE_CI"); + // Accent variation + assertLowercase("AbĆd", "abćd", "UTF8_BINARY"); + assertLowercase("aBcΔ", "abcδ", "UTF8_BINARY_LCASE"); + assertLowercase("ÄbcD", "äbcd", "UNICODE"); + assertLowercase("aB́Cd", "ab́cd", "UNICODE_CI"); + // Case-variable character length + assertLowercase("İoDiNe", "i̇odine", "UTF8_BINARY"); + assertLowercase("Abi̇o12", "abi̇o12", "UTF8_BINARY"); + assertLowercase("İodInE", "i̇odine", "UTF8_BINARY_LCASE"); + assertLowercase("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE"); + assertLowercase("İoDinE", "i̇odine", "UNICODE"); + assertLowercase("abi̇O12", "abi̇o12", "UNICODE"); + assertLowercase("İodINe", "i̇odine", "UNICODE_CI"); + assertLowercase("ABi̇o12", 
"abi̇o12", "UNICODE_CI"); + // Conditional case mapping + assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY"); + assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI + assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE"); + assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI"); + } + + private void assertUppercase(String target, String expected, String collationName) + throws SparkException { + if (collationName.equals("UTF8_BINARY")) { + UTF8String targetUTF8 = UTF8String.fromString(target); + UTF8String expectedUTF8 = UTF8String.fromString(expected); + assertEquals(expectedUTF8, targetUTF8.toUpperCase()); + } else if (collationName.equals("UTF8_BINARY_LCASE")) { + assertEquals(expected, CollationAwareUTF8String.toUpperCase(target)); + } else { + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationAwareUTF8String.toUpperCase(target, collationId)); + } + } + + @Test + public void testUppercase() throws SparkException { + // Edge cases + assertUppercase("", "", "UTF8_BINARY"); + assertUppercase("", "", "UTF8_BINARY_LCASE"); + assertUppercase("", "", "UNICODE"); + assertUppercase("", "", "UNICODE_CI"); + // Basic tests + assertUppercase("abcd", "ABCD", "UTF8_BINARY"); + assertUppercase("AbCd", "ABCD", "UTF8_BINARY"); + assertUppercase("abcd", "ABCD", "UTF8_BINARY_LCASE"); + assertUppercase("aBcD", "ABCD", "UTF8_BINARY_LCASE"); + assertUppercase("abcd", "ABCD", "UNICODE"); + assertUppercase("aBCd", "ABCD", "UNICODE"); + assertUppercase("abcd", "ABCD", "UNICODE_CI"); + assertUppercase("AbcD", "ABCD", "UNICODE_CI"); + // Accent variation + assertUppercase("aBćD", "ABĆD", "UTF8_BINARY"); + assertUppercase("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE"); + assertUppercase("äBCd", "ÄBCD", "UNICODE"); + assertUppercase("Ab́cD", "AB́CD", "UNICODE_CI"); + // Case-variable character length + assertUppercase("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY"); + assertUppercase("Abi\u0307o12", 
"ABI\u0307O12", "UTF8_BINARY"); + assertUppercase("i̇odInE", "İODINE", "UTF8_BINARY_LCASE"); + assertUppercase("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE"); + assertUppercase("i̇oDinE", "I\u0307ODINE", "UNICODE"); + assertUppercase("abi̇O12", "ABI\u0307O12", "UNICODE"); + assertUppercase("i̇odINe", "I\u0307ODINE", "UNICODE_CI"); + assertUppercase("ABi̇o12", "ABI\u0307O12", "UNICODE_CI"); + // Conditional case mapping + assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); + assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); + assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); + assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); + assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); + assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); + assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); + assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); + } + + private void assertTitlecase(String target, String expected, String collationName) + throws SparkException { + if (collationName.equals("UTF8_BINARY")) { + UTF8String targetUTF8 = UTF8String.fromString(target); + UTF8String expectedUTF8 = UTF8String.fromString(expected); + assertEquals(expectedUTF8, targetUTF8.toTitleCase()); + } else if (collationName.equals("UTF8_BINARY_LCASE")) { + assertEquals(expected, CollationAwareUTF8String.toTitleCase(target)); + } else { + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationAwareUTF8String.toTitleCase(target, collationId)); + } + } + + @Test + public void testTitlecase() throws SparkException { + // Edge cases + assertTitlecase("", "", "UTF8_BINARY"); + assertTitlecase("", "", "UTF8_BINARY_LCASE"); + assertTitlecase("", "", "UNICODE"); + assertTitlecase("", "", "UNICODE_CI"); + // Basic tests + assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY"); + assertTitlecase("Ab Cd", "Ab Cd", "UTF8_BINARY"); + assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY_LCASE"); + 
assertTitlecase("aB cD", "Ab Cd", "UTF8_BINARY_LCASE"); + assertTitlecase("ab cd", "Ab Cd", "UNICODE"); + assertTitlecase("aB Cd", "Ab Cd", "UNICODE"); + assertTitlecase("ab cd", "Ab Cd", "UNICODE_CI"); + assertTitlecase("Ab cD", "Ab Cd", "UNICODE_CI"); + // Accent variation + assertTitlecase("aB ćD", "AB ĆD", "UTF8_BINARY"); + assertTitlecase("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE"); + assertTitlecase("äB Cd", "Äb Cd", "UNICODE"); + assertTitlecase("A b́cD", "A B́cd", "UNICODE_CI"); + // Case-variable character length + assertTitlecase("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY"); + assertTitlecase("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY"); + assertTitlecase("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE"); + assertTitlecase("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE"); + assertTitlecase("i̇oDinE", "I\u0307odine", "UNICODE"); + assertTitlecase("abi̇O12", "Abi̇o12", "UNICODE"); + assertTitlecase("i̇odINe", "I\u0307odine", "UNICODE_CI"); + assertTitlecase("ABi̇o12", "Abi\u0307o12", "UNICODE_CI"); + // Conditional case mapping + assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY"); + assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY"); + assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY_LCASE"); + assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY_LCASE"); + assertTitlecase("a ς c", "A Σ C", "UNICODE"); + assertTitlecase("a σ c", "A Σ C", "UNICODE"); + assertTitlecase("a ς c", "A Σ C", "UNICODE_CI"); + assertTitlecase("a σ c", "A Σ C", "UNICODE_CI"); + } + /** * Collation-aware string expressions. */ @@ -1008,3 +1174,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. 
} +// checkstyle.on: AvoidEscapedUnicodeCharacters From d919e4e68c033681345675c101b3ab64d59a0b06 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 22 May 2024 13:23:10 +0200 Subject: [PATCH 03/14] Comparison and hash --- .../util/CollationAwareUTF8String.java | 5 ++ .../sql/catalyst/util/CollationFactory.java | 4 +- .../unsafe/types/CollationSupportSuite.java | 65 +++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 0f4a59f33b682..ac314f9d30921 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -34,6 +34,11 @@ * Utility class for collation-aware UTF8String operations. 
*/ public class CollationAwareUTF8String { + + public static int compareLowerCase(final UTF8String left, final UTF8String right) { + return toLowerCase(left.toString()).compareTo(toLowerCase(right.toString())); + } + public static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 0133c3feb611a..9f9773eaeace3 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -232,9 +232,9 @@ public CollationIdentifier identifier() { "UTF8_BINARY_LCASE", PROVIDER_SPARK, null, - UTF8String::compareLowerCase, + CollationAwareUTF8String::compareLowerCase, "1.0", - (s) -> (long)s.toLowerCase().hashCode(), + (s) -> (long)CollationAwareUTF8String.toLowerCase(s.toString()).hashCode(), false, false, true); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index e1dd96f6cb8f6..fdbb45faf3dfc 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -31,6 +31,71 @@ public class CollationSupportSuite { * Collation-aware UTF8String comparison. 
*/ + private void assertCompare(String s1, String s2, String collationName, int expected) + throws SparkException { + UTF8String l = UTF8String.fromString(s1); + UTF8String r = UTF8String.fromString(s2); + int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); + assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { + // Edge cases + assertCompare("", "", "UTF8_BINARY", 0); + assertCompare("a", "", "UTF8_BINARY", 1); + assertCompare("", "a", "UTF8_BINARY", -1); + assertCompare("", "", "UTF8_BINARY_LCASE", 0); + assertCompare("a", "", "UTF8_BINARY_LCASE", 1); + assertCompare("", "a", "UTF8_BINARY_LCASE", -1); + assertCompare("", "", "UNICODE", 0); + assertCompare("a", "", "UNICODE", 1); + assertCompare("", "a", "UNICODE", -1); + assertCompare("", "", "UNICODE_CI", 0); + assertCompare("a", "", "UNICODE_CI", 1); + assertCompare("", "a", "UNICODE_CI", -1); + // Basic tests + assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1); + assertCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); + assertCompare("AbcD", "aBCd", "UNICODE", 1); + assertCompare("abcd", "ABCD", "UNICODE_CI", 0); + // Accent variation + assertCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); + assertCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); + assertCompare("äBCd", "ÄBCD", "UNICODE", -1); + assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + // Case-variable character length + assertCompare("i\u0307", "İ", "UTF8_BINARY", -1); + assertCompare("İ", "i\u0307", "UTF8_BINARY", 1); + assertCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); + assertCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); + assertCompare("i\u0307", "İ", "UNICODE", -1); + assertCompare("İ", "i\u0307", "UNICODE", 1); + assertCompare("i\u0307", "İ", "UNICODE_CI", 0); + assertCompare("İ", "i\u0307", "UNICODE_CI", 0); + assertCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 
0); + assertCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); + assertCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); + assertCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); + assertCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + // Conditional case mapping + assertCompare("ς", "σ", "UTF8_BINARY", -1); + assertCompare("ς", "Σ", "UTF8_BINARY", 1); + assertCompare("σ", "Σ", "UTF8_BINARY", 1); + assertCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); + assertCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); + assertCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); + assertCompare("ς", "σ", "UNICODE", 1); + assertCompare("ς", "Σ", "UNICODE", 1); + assertCompare("σ", "Σ", "UNICODE", -1); + assertCompare("ς", "σ", "UNICODE_CI", 0); + assertCompare("ς", "Σ", "UNICODE_CI", 0); + assertCompare("σ", "Σ", "UNICODE_CI", 0); + } + private void assertLowercase(String target, String expected, String collationName) throws SparkException { if (collationName.equals("UTF8_BINARY")) { From fb48bdc9bff72bb9863eed2e1d4ff06cf73b7c0a Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 23 May 2024 17:00:28 +0200 Subject: [PATCH 04/14] Add doc comments --- .../util/CollationAwareUTF8String.java | 60 +++- .../sql/catalyst/util/CollationFactory.java | 2 +- .../unsafe/types/CollationSupportSuite.java | 296 +++++++++--------- 3 files changed, 205 insertions(+), 153 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ac314f9d30921..66632457b328b 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -35,8 +35,18 @@ */ 
public class CollationAwareUTF8String { + /** + * Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default + * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this + * method uses code points to compare the strings in a case-insensitive manner using ICU rules, + * as well as handling special rules for conditional case mappings (see: lowerCaseCodePoints). + * + * @param left The first UTF8String to compare. + * @param right The second UTF8String to compare. + * @return An integer representing the comparison result. + */ public static int compareLowerCase(final UTF8String left, final UTF8String right) { - return toLowerCase(left.toString()).compareTo(toLowerCase(right.toString())); + return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString())); } public static UTF8String replace(final UTF8String src, final UTF8String search, @@ -146,6 +156,16 @@ public static String toUpperCase(final String target, final int collationId) { return UCharacter.toUpperCase(locale, target); } + /** + * Converts a single code point to uppercase using ICU rules, with special handling for + * conditional case mappings (i.e. characters that map to multiple characters in uppercase). + * + * @param codePoint The code point to convert to uppercase. + * @param sb The StringBuilder to append the uppercase character to. + * @param i The index of the code point in the target string. + * @param target The target string to convert to uppercase. + * @return The number of additional characters consumed after the code point itself (0 or 1). + */ private static int uppercaseCodePoint(final int codePoint, final StringBuilder sb, final int i, final String target) { // Latin small letter i with an additional dot is represented using 2 characters. 
@@ -153,12 +173,21 @@ private static int uppercaseCodePoint(final int codePoint, final StringBuilder s sb.append("İ"); return 1; } + // TODO: Add special handling for other chars that map to multiple characters in uppercase. // All other characters should follow context-unaware ICU single-code point case mapping. sb.appendCodePoint(UCharacter.toTitleCase(codePoint)); return 0; } - public static String toUpperCase(final String target) { + /** + * Converts an entire string to uppercase using ICU rules, code point by code point, with + * special handling for conditional case mappings (i.e. characters that map to multiple + * characters in uppercase). This method omits information about context-sensitive case mappings. + * + * @param target The target string to convert to uppercase. + * @return The string converted to uppercase in a context-unaware manner. + */ + public static String upperCaseCodePoints(final String target) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < target.length(); ++i) { int codePoint = target.codePointAt(i); @@ -181,6 +210,13 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * conditional case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { // Latin capital letter I with dot above is mapped to 2 lowercase characters. if (codePoint == 0x0130) { @@ -196,7 +232,15 @@ else if (codePoint == 0x03C2) { } } - public static String toLowerCase(final String target) { + /** + * Converts an entire string to lowercase using ICU rules, code point by code point, with + * special handling for conditional case mappings (i.e. 
characters that map to multiple + * characters in lowercase). This method omits information about context-sensitive case mappings. + * + * @param target The target string to convert to lowercase. + * @return The string converted to lowercase in a context-unaware manner. + */ + public static String lowerCaseCodePoints(final String target) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < target.length(); ++i) { int codePoint = target.codePointAt(i); @@ -211,7 +255,15 @@ public static String toTitleCase(final String target, final int collationId) { return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale)); } - public static String toTitleCase(final String target) { + /** + * Converts an entire string to titlecase using ICU rules, code point by code point, with + * special handling for conditional case mappings (i.e. characters that map to multiple + * characters in lowercase). This method omits information about context-sensitive case mappings. + * + * @param target The target string to convert to titlecase. + * @return The string converted to titlecase in a context-unaware manner. 
+ */ + public static String titleCaseCodePoints(final String target) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < target.length(); ++i) { int codePoint = target.codePointAt(i); diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 9f9773eaeace3..5c9313875fb1c 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -234,7 +234,7 @@ public CollationIdentifier identifier() { null, CollationAwareUTF8String::compareLowerCase, "1.0", - (s) -> (long)CollationAwareUTF8String.toLowerCase(s.toString()).hashCode(), + (s) -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), false, false, true); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index fdbb45faf3dfc..c9a7e18e1cdbe 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -31,7 +31,7 @@ public class CollationSupportSuite { * Collation-aware UTF8String comparison. 
*/ - private void assertCompare(String s1, String s2, String collationName, int expected) + private void assertStringCompare(String s1, String s2, String collationName, int expected) throws SparkException { UTF8String l = UTF8String.fromString(s1); UTF8String r = UTF8String.fromString(s2); @@ -42,68 +42,68 @@ private void assertCompare(String s1, String s2, String collationName, int expec @Test public void testCompare() throws SparkException { // Edge cases - assertCompare("", "", "UTF8_BINARY", 0); - assertCompare("a", "", "UTF8_BINARY", 1); - assertCompare("", "a", "UTF8_BINARY", -1); - assertCompare("", "", "UTF8_BINARY_LCASE", 0); - assertCompare("a", "", "UTF8_BINARY_LCASE", 1); - assertCompare("", "a", "UTF8_BINARY_LCASE", -1); - assertCompare("", "", "UNICODE", 0); - assertCompare("a", "", "UNICODE", 1); - assertCompare("", "a", "UNICODE", -1); - assertCompare("", "", "UNICODE_CI", 0); - assertCompare("a", "", "UNICODE_CI", 1); - assertCompare("", "a", "UNICODE_CI", -1); + assertStringCompare("", "", "UTF8_BINARY", 0); + assertStringCompare("a", "", "UTF8_BINARY", 1); + assertStringCompare("", "a", "UTF8_BINARY", -1); + assertStringCompare("", "", "UTF8_BINARY_LCASE", 0); + assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1); + assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1); + assertStringCompare("", "", "UNICODE", 0); + assertStringCompare("a", "", "UNICODE", 1); + assertStringCompare("", "a", "UNICODE", -1); + assertStringCompare("", "", "UNICODE_CI", 0); + assertStringCompare("a", "", "UNICODE_CI", 1); + assertStringCompare("", "a", "UNICODE_CI", -1); // Basic tests - assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1); - assertCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); - assertCompare("AbcD", "aBCd", "UNICODE", 1); - assertCompare("abcd", "ABCD", "UNICODE_CI", 0); + assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); + assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); + assertStringCompare("AbcD", "aBCd", "UNICODE", 1); + 
assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); // Accent variation - assertCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); - assertCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); - assertCompare("äBCd", "ÄBCD", "UNICODE", -1); - assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); + assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); + assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); // Case-variable character length - assertCompare("i\u0307", "İ", "UTF8_BINARY", -1); - assertCompare("İ", "i\u0307", "UTF8_BINARY", 1); - assertCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); - assertCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); - assertCompare("i\u0307", "İ", "UNICODE", -1); - assertCompare("İ", "i\u0307", "UNICODE", 1); - assertCompare("i\u0307", "İ", "UNICODE_CI", 0); - assertCompare("İ", "i\u0307", "UNICODE_CI", 0); - assertCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); - assertCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); - assertCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); - assertCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); - assertCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); - assertCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); - assertCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); - assertCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); + assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307", "İ", "UNICODE", -1); + assertStringCompare("İ", "i\u0307", "UNICODE", 1); + assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); + assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + 
assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); // Conditional case mapping - assertCompare("ς", "σ", "UTF8_BINARY", -1); - assertCompare("ς", "Σ", "UTF8_BINARY", 1); - assertCompare("σ", "Σ", "UTF8_BINARY", 1); - assertCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); - assertCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); - assertCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); - assertCompare("ς", "σ", "UNICODE", 1); - assertCompare("ς", "Σ", "UNICODE", 1); - assertCompare("σ", "Σ", "UNICODE", -1); - assertCompare("ς", "σ", "UNICODE_CI", 0); - assertCompare("ς", "Σ", "UNICODE_CI", 0); - assertCompare("σ", "Σ", "UNICODE_CI", 0); + assertStringCompare("ς", "σ", "UTF8_BINARY", -1); + assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); + assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); + assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "σ", "UNICODE", 1); + assertStringCompare("ς", "Σ", "UNICODE", 1); + assertStringCompare("σ", "Σ", "UNICODE", -1); + assertStringCompare("ς", "σ", "UNICODE_CI", 0); + assertStringCompare("ς", "Σ", "UNICODE_CI", 0); + assertStringCompare("σ", "Σ", "UNICODE_CI", 0); } - private void assertLowercase(String target, String expected, String collationName) + private void assertLcaseCompare(String target, String expected, String collationName) throws SparkException { if (collationName.equals("UTF8_BINARY")) { UTF8String targetUTF8 = UTF8String.fromString(target); UTF8String expectedUTF8 = 
UTF8String.fromString(expected); assertEquals(expectedUTF8, targetUTF8.toLowerCase()); } else if (collationName.equals("UTF8_BINARY_LCASE")) { - assertEquals(expected, CollationAwareUTF8String.toLowerCase(target)); + assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); } else { int collationId = CollationFactory.collationNameToId(collationName); assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId)); @@ -111,50 +111,50 @@ private void assertLowercase(String target, String expected, String collationNam } @Test - public void testLowercase() throws SparkException { + public void testLcaseCompare() throws SparkException { // Edge cases - assertLowercase("", "", "UTF8_BINARY"); - assertLowercase("", "", "UTF8_BINARY_LCASE"); - assertLowercase("", "", "UNICODE"); - assertLowercase("", "", "UNICODE_CI"); + assertLcaseCompare("", "", "UTF8_BINARY"); + assertLcaseCompare("", "", "UTF8_BINARY_LCASE"); + assertLcaseCompare("", "", "UNICODE"); + assertLcaseCompare("", "", "UNICODE_CI"); // Basic tests - assertLowercase("abcd", "abcd", "UTF8_BINARY"); - assertLowercase("AbCd", "abcd", "UTF8_BINARY"); - assertLowercase("abcd", "abcd", "UTF8_BINARY_LCASE"); - assertLowercase("aBcD", "abcd", "UTF8_BINARY_LCASE"); - assertLowercase("abcd", "abcd", "UNICODE"); - assertLowercase("aBCd", "abcd", "UNICODE"); - assertLowercase("abcd", "abcd", "UNICODE_CI"); - assertLowercase("AbcD", "abcd", "UNICODE_CI"); + assertLcaseCompare("abcd", "abcd", "UTF8_BINARY"); + assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY"); + assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE"); + assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE"); + assertLcaseCompare("abcd", "abcd", "UNICODE"); + assertLcaseCompare("aBCd", "abcd", "UNICODE"); + assertLcaseCompare("abcd", "abcd", "UNICODE_CI"); + assertLcaseCompare("AbcD", "abcd", "UNICODE_CI"); // Accent variation - assertLowercase("AbĆd", "abćd", "UTF8_BINARY"); - assertLowercase("aBcΔ", "abcδ", 
"UTF8_BINARY_LCASE"); - assertLowercase("ÄbcD", "äbcd", "UNICODE"); - assertLowercase("aB́Cd", "ab́cd", "UNICODE_CI"); + assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY"); + assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE"); + assertLcaseCompare("ÄbcD", "äbcd", "UNICODE"); + assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI"); // Case-variable character length - assertLowercase("İoDiNe", "i̇odine", "UTF8_BINARY"); - assertLowercase("Abi̇o12", "abi̇o12", "UTF8_BINARY"); - assertLowercase("İodInE", "i̇odine", "UTF8_BINARY_LCASE"); - assertLowercase("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE"); - assertLowercase("İoDinE", "i̇odine", "UNICODE"); - assertLowercase("abi̇O12", "abi̇o12", "UNICODE"); - assertLowercase("İodINe", "i̇odine", "UNICODE_CI"); - assertLowercase("ABi̇o12", "abi̇o12", "UNICODE_CI"); + assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY"); + assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY"); + assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE"); + assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE"); + assertLcaseCompare("İoDinE", "i̇odine", "UNICODE"); + assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE"); + assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI"); + assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI"); // Conditional case mapping - assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY"); - assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI - assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE"); - assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI"); + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY"); + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE"); + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI"); } - private void assertUppercase(String target, String expected, String collationName) + private void assertUcaseCompare(String target, String 
expected, String collationName) throws SparkException { if (collationName.equals("UTF8_BINARY")) { UTF8String targetUTF8 = UTF8String.fromString(target); UTF8String expectedUTF8 = UTF8String.fromString(expected); assertEquals(expectedUTF8, targetUTF8.toUpperCase()); } else if (collationName.equals("UTF8_BINARY_LCASE")) { - assertEquals(expected, CollationAwareUTF8String.toUpperCase(target)); + assertEquals(expected, CollationAwareUTF8String.upperCaseCodePoints(target)); } else { int collationId = CollationFactory.collationNameToId(collationName); assertEquals(expected, CollationAwareUTF8String.toUpperCase(target, collationId)); @@ -164,52 +164,52 @@ private void assertUppercase(String target, String expected, String collationNam @Test public void testUppercase() throws SparkException { // Edge cases - assertUppercase("", "", "UTF8_BINARY"); - assertUppercase("", "", "UTF8_BINARY_LCASE"); - assertUppercase("", "", "UNICODE"); - assertUppercase("", "", "UNICODE_CI"); + assertUcaseCompare("", "", "UTF8_BINARY"); + assertUcaseCompare("", "", "UTF8_BINARY_LCASE"); + assertUcaseCompare("", "", "UNICODE"); + assertUcaseCompare("", "", "UNICODE_CI"); // Basic tests - assertUppercase("abcd", "ABCD", "UTF8_BINARY"); - assertUppercase("AbCd", "ABCD", "UTF8_BINARY"); - assertUppercase("abcd", "ABCD", "UTF8_BINARY_LCASE"); - assertUppercase("aBcD", "ABCD", "UTF8_BINARY_LCASE"); - assertUppercase("abcd", "ABCD", "UNICODE"); - assertUppercase("aBCd", "ABCD", "UNICODE"); - assertUppercase("abcd", "ABCD", "UNICODE_CI"); - assertUppercase("AbcD", "ABCD", "UNICODE_CI"); + assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY"); + assertUcaseCompare("AbCd", "ABCD", "UTF8_BINARY"); + assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY_LCASE"); + assertUcaseCompare("aBcD", "ABCD", "UTF8_BINARY_LCASE"); + assertUcaseCompare("abcd", "ABCD", "UNICODE"); + assertUcaseCompare("aBCd", "ABCD", "UNICODE"); + assertUcaseCompare("abcd", "ABCD", "UNICODE_CI"); + assertUcaseCompare("AbcD", "ABCD", 
"UNICODE_CI"); // Accent variation - assertUppercase("aBćD", "ABĆD", "UTF8_BINARY"); - assertUppercase("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE"); - assertUppercase("äBCd", "ÄBCD", "UNICODE"); - assertUppercase("Ab́cD", "AB́CD", "UNICODE_CI"); + assertUcaseCompare("aBćD", "ABĆD", "UTF8_BINARY"); + assertUcaseCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE"); + assertUcaseCompare("äBCd", "ÄBCD", "UNICODE"); + assertUcaseCompare("Ab́cD", "AB́CD", "UNICODE_CI"); // Case-variable character length - assertUppercase("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY"); - assertUppercase("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY"); - assertUppercase("i̇odInE", "İODINE", "UTF8_BINARY_LCASE"); - assertUppercase("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE"); - assertUppercase("i̇oDinE", "I\u0307ODINE", "UNICODE"); - assertUppercase("abi̇O12", "ABI\u0307O12", "UNICODE"); - assertUppercase("i̇odINe", "I\u0307ODINE", "UNICODE_CI"); - assertUppercase("ABi̇o12", "ABI\u0307O12", "UNICODE_CI"); + assertUcaseCompare("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY"); + assertUcaseCompare("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY"); + assertUcaseCompare("i̇odInE", "İODINE", "UTF8_BINARY_LCASE"); + assertUcaseCompare("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE"); + assertUcaseCompare("i̇oDinE", "I\u0307ODINE", "UNICODE"); + assertUcaseCompare("abi̇O12", "ABI\u0307O12", "UNICODE"); + assertUcaseCompare("i̇odINe", "I\u0307ODINE", "UNICODE_CI"); + assertUcaseCompare("ABi̇o12", "ABI\u0307O12", "UNICODE_CI"); // Conditional case mapping - assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); - assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); - assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); - assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); - assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); - assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); - assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); - assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", 
"UNICODE_CI"); + assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); + assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); + assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); + assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); + assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); + assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); + assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); + assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); } - private void assertTitlecase(String target, String expected, String collationName) + private void assertTcaseCompare(String target, String expected, String collationName) throws SparkException { if (collationName.equals("UTF8_BINARY")) { UTF8String targetUTF8 = UTF8String.fromString(target); UTF8String expectedUTF8 = UTF8String.fromString(expected); assertEquals(expectedUTF8, targetUTF8.toTitleCase()); } else if (collationName.equals("UTF8_BINARY_LCASE")) { - assertEquals(expected, CollationAwareUTF8String.toTitleCase(target)); + assertEquals(expected, CollationAwareUTF8String.titleCaseCodePoints(target)); } else { int collationId = CollationFactory.collationNameToId(collationName); assertEquals(expected, CollationAwareUTF8String.toTitleCase(target, collationId)); @@ -219,42 +219,42 @@ private void assertTitlecase(String target, String expected, String collationNam @Test public void testTitlecase() throws SparkException { // Edge cases - assertTitlecase("", "", "UTF8_BINARY"); - assertTitlecase("", "", "UTF8_BINARY_LCASE"); - assertTitlecase("", "", "UNICODE"); - assertTitlecase("", "", "UNICODE_CI"); + assertTcaseCompare("", "", "UTF8_BINARY"); + assertTcaseCompare("", "", "UTF8_BINARY_LCASE"); + assertTcaseCompare("", "", "UNICODE"); + assertTcaseCompare("", "", "UNICODE_CI"); // Basic tests - assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY"); - assertTitlecase("Ab Cd", "Ab Cd", "UTF8_BINARY"); - assertTitlecase("ab cd", "Ab Cd", 
"UTF8_BINARY_LCASE"); - assertTitlecase("aB cD", "Ab Cd", "UTF8_BINARY_LCASE"); - assertTitlecase("ab cd", "Ab Cd", "UNICODE"); - assertTitlecase("aB Cd", "Ab Cd", "UNICODE"); - assertTitlecase("ab cd", "Ab Cd", "UNICODE_CI"); - assertTitlecase("Ab cD", "Ab Cd", "UNICODE_CI"); + assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY"); + assertTcaseCompare("Ab Cd", "Ab Cd", "UTF8_BINARY"); + assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY_LCASE"); + assertTcaseCompare("aB cD", "Ab Cd", "UTF8_BINARY_LCASE"); + assertTcaseCompare("ab cd", "Ab Cd", "UNICODE"); + assertTcaseCompare("aB Cd", "Ab Cd", "UNICODE"); + assertTcaseCompare("ab cd", "Ab Cd", "UNICODE_CI"); + assertTcaseCompare("Ab cD", "Ab Cd", "UNICODE_CI"); // Accent variation - assertTitlecase("aB ćD", "AB ĆD", "UTF8_BINARY"); - assertTitlecase("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE"); - assertTitlecase("äB Cd", "Äb Cd", "UNICODE"); - assertTitlecase("A b́cD", "A B́cd", "UNICODE_CI"); + assertTcaseCompare("aB ćD", "AB ĆD", "UTF8_BINARY"); + assertTcaseCompare("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE"); + assertTcaseCompare("äB Cd", "Äb Cd", "UNICODE"); + assertTcaseCompare("A b́cD", "A B́cd", "UNICODE_CI"); // Case-variable character length - assertTitlecase("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY"); - assertTitlecase("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY"); - assertTitlecase("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE"); - assertTitlecase("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE"); - assertTitlecase("i̇oDinE", "I\u0307odine", "UNICODE"); - assertTitlecase("abi̇O12", "Abi̇o12", "UNICODE"); - assertTitlecase("i̇odINe", "I\u0307odine", "UNICODE_CI"); - assertTitlecase("ABi̇o12", "Abi\u0307o12", "UNICODE_CI"); + assertTcaseCompare("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY"); + assertTcaseCompare("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY"); + assertTcaseCompare("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE"); + assertTcaseCompare("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE"); + 
assertTcaseCompare("i̇oDinE", "I\u0307odine", "UNICODE"); + assertTcaseCompare("abi̇O12", "Abi̇o12", "UNICODE"); + assertTcaseCompare("i̇odINe", "I\u0307odine", "UNICODE_CI"); + assertTcaseCompare("ABi̇o12", "Abi\u0307o12", "UNICODE_CI"); // Conditional case mapping - assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY"); - assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY"); - assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY_LCASE"); - assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY_LCASE"); - assertTitlecase("a ς c", "A Σ C", "UNICODE"); - assertTitlecase("a σ c", "A Σ C", "UNICODE"); - assertTitlecase("a ς c", "A Σ C", "UNICODE_CI"); - assertTitlecase("a σ c", "A Σ C", "UNICODE_CI"); + assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY"); + assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY"); + assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY_LCASE"); + assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY_LCASE"); + assertTcaseCompare("a ς c", "A Σ C", "UNICODE"); + assertTcaseCompare("a σ c", "A Σ C", "UNICODE"); + assertTcaseCompare("a ς c", "A Σ C", "UNICODE_CI"); + assertTcaseCompare("a σ c", "A Σ C", "UNICODE_CI"); } /** From f3e23a1ebc8d1f2d411c3f144ff05c400130949b Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 23 May 2024 17:22:37 +0200 Subject: [PATCH 05/14] Remove UCASE and TCASE code points --- .../util/CollationAwareUTF8String.java | 69 ----------- .../unsafe/types/CollationSupportSuite.java | 110 ------------------ 2 files changed, 179 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 66632457b328b..d75bcd9656a4d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -156,54 +156,6 @@ public 
static String toUpperCase(final String target, final int collationId) { return UCharacter.toUpperCase(locale, target); } - /** - * Converts a single code point to uppercase using ICU rules, with special handling for - * conditional case mappings (i.e. characters that map to multiple characters in uppercase). - * - * @param codePoint The code point to convert to uppercase. - * @param sb The StringBuilder to append the uppercase character to. - * @param i The index of the code point in the target string. - * @param target The target string to convert to uppercase. - * @return The number of characters consumed by the code point. - */ - private static int uppercaseCodePoint(final int codePoint, final StringBuilder sb, final int i, - final String target) { - // Latin small letter i with an additional dot is represented using 2 characters. - if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) { - sb.append("İ"); - return 1; - } - // TODO: Add special handling for other chars that map to multiple characters in uppercase. - // All other characters should follow context-unaware ICU single-code point case mapping. - sb.appendCodePoint(UCharacter.toTitleCase(codePoint)); - return 0; - } - - /** - * Converts an entire string to uppercase using ICU rules, code point by code point, with - * special handling for conditional case mappings (i.e. characters that map to multiple - * characters in uppercase). This method omits information about context-sensitive case mappings. - * - * @param target The target string to convert to uppercase. - * @return The string converted to uppercase in a context-unaware manner. - */ - public static String upperCaseCodePoints(final String target) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < target.length(); ++i) { - int codePoint = target.codePointAt(i); - // Latin small letter i with an additional dot above (represented using 2 characters). 
- if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) { - sb.append("İ"); - ++i; - } - // All other characters should follow context-unaware ICU single-code point case mapping. - else { - sb.appendCodePoint(UCharacter.toUpperCase(codePoint)); - } - } - return sb.toString(); - } - public static String toLowerCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); @@ -255,27 +207,6 @@ public static String toTitleCase(final String target, final int collationId) { return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale)); } - /** - * Converts an entire string to titlecase using ICU rules, code point by code point, with - * special handling for conditional case mappings (i.e. characters that map to multiple - * characters in lowercase). This method omits information about context-sensitive case mappings. - * - * @param target The target string to convert to lowercase. - * @return The string converted to lowercase in a context-unaware manner. 
- */ - public static String titleCaseCodePoints(final String target) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < target.length(); ++i) { - int codePoint = target.codePointAt(i); - if (i == 0 || Character.isWhitespace(target.codePointBefore(i))) { - i += uppercaseCodePoint(codePoint, sb, i, target); - } else { - lowercaseCodePoint(codePoint, sb); - } - } - return sb.toString(); - } - public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { if (match.contains(UTF8String.fromString(","))) { return 0; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index c9a7e18e1cdbe..c079427ce5f6a 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -147,116 +147,6 @@ public void testLcaseCompare() throws SparkException { assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI"); } - private void assertUcaseCompare(String target, String expected, String collationName) - throws SparkException { - if (collationName.equals("UTF8_BINARY")) { - UTF8String targetUTF8 = UTF8String.fromString(target); - UTF8String expectedUTF8 = UTF8String.fromString(expected); - assertEquals(expectedUTF8, targetUTF8.toUpperCase()); - } else if (collationName.equals("UTF8_BINARY_LCASE")) { - assertEquals(expected, CollationAwareUTF8String.upperCaseCodePoints(target)); - } else { - int collationId = CollationFactory.collationNameToId(collationName); - assertEquals(expected, CollationAwareUTF8String.toUpperCase(target, collationId)); - } - } - - @Test - public void testUppercase() throws SparkException { - // Edge cases - assertUcaseCompare("", "", "UTF8_BINARY"); - assertUcaseCompare("", "", "UTF8_BINARY_LCASE"); - assertUcaseCompare("", "", "UNICODE"); - assertUcaseCompare("", 
"", "UNICODE_CI"); - // Basic tests - assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY"); - assertUcaseCompare("AbCd", "ABCD", "UTF8_BINARY"); - assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY_LCASE"); - assertUcaseCompare("aBcD", "ABCD", "UTF8_BINARY_LCASE"); - assertUcaseCompare("abcd", "ABCD", "UNICODE"); - assertUcaseCompare("aBCd", "ABCD", "UNICODE"); - assertUcaseCompare("abcd", "ABCD", "UNICODE_CI"); - assertUcaseCompare("AbcD", "ABCD", "UNICODE_CI"); - // Accent variation - assertUcaseCompare("aBćD", "ABĆD", "UTF8_BINARY"); - assertUcaseCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE"); - assertUcaseCompare("äBCd", "ÄBCD", "UNICODE"); - assertUcaseCompare("Ab́cD", "AB́CD", "UNICODE_CI"); - // Case-variable character length - assertUcaseCompare("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY"); - assertUcaseCompare("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY"); - assertUcaseCompare("i̇odInE", "İODINE", "UTF8_BINARY_LCASE"); - assertUcaseCompare("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE"); - assertUcaseCompare("i̇oDinE", "I\u0307ODINE", "UNICODE"); - assertUcaseCompare("abi̇O12", "ABI\u0307O12", "UNICODE"); - assertUcaseCompare("i̇odINe", "I\u0307ODINE", "UNICODE_CI"); - assertUcaseCompare("ABi̇o12", "ABI\u0307O12", "UNICODE_CI"); - // Conditional case mapping - assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); - assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY"); - assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); - assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE"); - assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); - assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE"); - assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); - assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI"); - } - - private void assertTcaseCompare(String target, String expected, String collationName) - throws SparkException { - if (collationName.equals("UTF8_BINARY")) { - UTF8String targetUTF8 = 
UTF8String.fromString(target); - UTF8String expectedUTF8 = UTF8String.fromString(expected); - assertEquals(expectedUTF8, targetUTF8.toTitleCase()); - } else if (collationName.equals("UTF8_BINARY_LCASE")) { - assertEquals(expected, CollationAwareUTF8String.titleCaseCodePoints(target)); - } else { - int collationId = CollationFactory.collationNameToId(collationName); - assertEquals(expected, CollationAwareUTF8String.toTitleCase(target, collationId)); - } - } - - @Test - public void testTitlecase() throws SparkException { - // Edge cases - assertTcaseCompare("", "", "UTF8_BINARY"); - assertTcaseCompare("", "", "UTF8_BINARY_LCASE"); - assertTcaseCompare("", "", "UNICODE"); - assertTcaseCompare("", "", "UNICODE_CI"); - // Basic tests - assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY"); - assertTcaseCompare("Ab Cd", "Ab Cd", "UTF8_BINARY"); - assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY_LCASE"); - assertTcaseCompare("aB cD", "Ab Cd", "UTF8_BINARY_LCASE"); - assertTcaseCompare("ab cd", "Ab Cd", "UNICODE"); - assertTcaseCompare("aB Cd", "Ab Cd", "UNICODE"); - assertTcaseCompare("ab cd", "Ab Cd", "UNICODE_CI"); - assertTcaseCompare("Ab cD", "Ab Cd", "UNICODE_CI"); - // Accent variation - assertTcaseCompare("aB ćD", "AB ĆD", "UTF8_BINARY"); - assertTcaseCompare("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE"); - assertTcaseCompare("äB Cd", "Äb Cd", "UNICODE"); - assertTcaseCompare("A b́cD", "A B́cd", "UNICODE_CI"); - // Case-variable character length - assertTcaseCompare("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY"); - assertTcaseCompare("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY"); - assertTcaseCompare("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE"); - assertTcaseCompare("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE"); - assertTcaseCompare("i̇oDinE", "I\u0307odine", "UNICODE"); - assertTcaseCompare("abi̇O12", "Abi̇o12", "UNICODE"); - assertTcaseCompare("i̇odINe", "I\u0307odine", "UNICODE_CI"); - assertTcaseCompare("ABi̇o12", "Abi\u0307o12", "UNICODE_CI"); - // Conditional case 
mapping - assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY"); - assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY"); - assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY_LCASE"); - assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY_LCASE"); - assertTcaseCompare("a ς c", "A Σ C", "UNICODE"); - assertTcaseCompare("a σ c", "A Σ C", "UNICODE"); - assertTcaseCompare("a ς c", "A Σ C", "UNICODE_CI"); - assertTcaseCompare("a σ c", "A Σ C", "UNICODE_CI"); - } - /** * Collation-aware string expressions. */ From f6e2dd2357453725572e2ee2a47011d7684936f2 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 14:04:53 +0200 Subject: [PATCH 06/14] Update doc comments --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index d75bcd9656a4d..9e8a2eb586b16 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -39,7 +39,7 @@ public class CollationAwareUTF8String { * Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this * method uses code points to compare the strings in a case-insensitive manner using ICU rules, - * as well as handling special rules for conditional case mappings (see: lowerCaseCodePoints). + * as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints). * * @param left The first UTF8String to compare. * @param right The second UTF8String to compare. 
@@ -164,7 +164,7 @@ public static String toLowerCase(final String target, final int collationId) { /** * Converts a single code point to lowercase using ICU rules, with special handling for - * conditional case mappings (i.e. characters that map to multiple characters in lowercase). + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). * * @param codePoint The code point to convert to lowercase. * @param sb The StringBuilder to append the lowercase character to. @@ -186,7 +186,7 @@ else if (codePoint == 0x03C2) { /** * Converts an entire string to lowercase using ICU rules, code point by code point, with - * special handling for conditional case mappings (i.e. characters that map to multiple + * special handling for one-to-many case mappings (i.e. characters that map to multiple * characters in lowercase). This method omits information about context-sensitive case mappings. * * @param target The target string to convert to lowercase. From 569a67df816d8e126afba4e4fb01d260a5bfce5e Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 14:12:21 +0200 Subject: [PATCH 07/14] Small fixes --- .../catalyst/util/CollationAwareUTF8String.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 9e8a2eb586b16..ee79d7380a0fa 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -170,16 +170,17 @@ public static String toLowerCase(final String target, final int collationId) { * @param sb The StringBuilder to append the lowercase character to. 
*/ private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { - // Latin capital letter I with dot above is mapped to 2 lowercase characters. if (codePoint == 0x0130) { - sb.append("i̇"); + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); } - // Greek final and non-final capital letter sigma should be mapped the same. else if (codePoint == 0x03C2) { - sb.append("σ"); + // Greek final and non-final capital letter sigma should be mapped the same. + sb.appendCodePoint(0x03C3); } - // All other characters should follow context-unaware ICU single-code point case mapping. else { + // All other characters should follow context-unaware ICU single-code point case mapping. sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); } } @@ -193,10 +194,9 @@ else if (codePoint == 0x03C2) { * @return The string converted to lowercase in a context-unaware manner. */ public static String lowerCaseCodePoints(final String target) { - StringBuilder sb = new StringBuilder(); + StringBuilder sb = new StringBuilder(); for (int i = 0; i < target.length(); ++i) { - int codePoint = target.codePointAt(i); - lowercaseCodePoint(codePoint, sb); + lowercaseCodePoint(target.codePointAt(i), sb); } return sb.toString(); } From 021e53ccac4e2f58919b63ac11657f3b3359fcdc Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 10:25:42 +0200 Subject: [PATCH 08/14] Small fixes --- .../util/CollationAwareUTF8String.java | 2 +- .../sql/catalyst/util/CollationFactory.java | 2 +- .../unsafe/types/CollationSupportSuite.java | 56 ++++++------------- 3 files changed, 20 insertions(+), 40 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee79d7380a0fa..7cdf998fa4ec9 100644 --- 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -45,7 +45,7 @@ public class CollationAwareUTF8String { * @param right The second UTF8String to compare. * @return An integer representing the comparison result. */ - public static int compareLowerCase(final UTF8String left, final UTF8String right) { + public static int lowercaseCompare(final UTF8String left, final UTF8String right) { return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString())); } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 5c9313875fb1c..e445a8c228910 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -232,7 +232,7 @@ public CollationIdentifier identifier() { "UTF8_BINARY_LCASE", PROVIDER_SPARK, null, - CollationAwareUTF8String::compareLowerCase, + CollationAwareUTF8String::lowercaseCompare, "1.0", (s) -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), false, diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index c079427ce5f6a..008d1c47fb28e 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -96,55 +96,35 @@ public void testCompare() throws SparkException { assertStringCompare("σ", "Σ", "UNICODE_CI", 0); } - private void assertLcaseCompare(String target, String expected, String collationName) - throws SparkException { - if 
(collationName.equals("UTF8_BINARY")) { - UTF8String targetUTF8 = UTF8String.fromString(target); - UTF8String expectedUTF8 = UTF8String.fromString(expected); - assertEquals(expectedUTF8, targetUTF8.toLowerCase()); - } else if (collationName.equals("UTF8_BINARY_LCASE")) { + private void assertLowerCaseCodePoints(String target, String expected, Boolean useCodePoints) { + if (useCodePoints) { assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); } else { - int collationId = CollationFactory.collationNameToId(collationName); - assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId)); + assertEquals(UTF8String.fromString(expected), UTF8String.fromString(target).toLowerCase()); } } @Test - public void testLcaseCompare() throws SparkException { + public void testLowerCaseCodePoints() { // Edge cases - assertLcaseCompare("", "", "UTF8_BINARY"); - assertLcaseCompare("", "", "UTF8_BINARY_LCASE"); - assertLcaseCompare("", "", "UNICODE"); - assertLcaseCompare("", "", "UNICODE_CI"); + assertLowerCaseCodePoints("", "", false); + assertLowerCaseCodePoints("", "", true); // Basic tests - assertLcaseCompare("abcd", "abcd", "UTF8_BINARY"); - assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY"); - assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE"); - assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE"); - assertLcaseCompare("abcd", "abcd", "UNICODE"); - assertLcaseCompare("aBCd", "abcd", "UNICODE"); - assertLcaseCompare("abcd", "abcd", "UNICODE_CI"); - assertLcaseCompare("AbcD", "abcd", "UNICODE_CI"); + assertLowerCaseCodePoints("abcd", "abcd", false); + assertLowerCaseCodePoints("AbCd", "abcd", false); + assertLowerCaseCodePoints("abcd", "abcd", true); + assertLowerCaseCodePoints("aBcD", "abcd", true); // Accent variation - assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY"); - assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE"); - assertLcaseCompare("ÄbcD", "äbcd", "UNICODE"); - assertLcaseCompare("aB́Cd", "ab́cd", 
"UNICODE_CI"); + assertLowerCaseCodePoints("AbĆd", "abćd", false); + assertLowerCaseCodePoints("aBcΔ", "abcδ", true); // Case-variable character length - assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY"); - assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY"); - assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE"); - assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE"); - assertLcaseCompare("İoDinE", "i̇odine", "UNICODE"); - assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE"); - assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI"); - assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI"); + assertLowerCaseCodePoints("İoDiNe", "i̇odine", false); + assertLowerCaseCodePoints("Abi̇o12", "abi̇o12", false); + assertLowerCaseCodePoints("İodInE", "i̇odine", true); + assertLowerCaseCodePoints("aBi̇o12", "abi̇o12", true); // Conditional case mapping - assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY"); - assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI - assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE"); - assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI"); + assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", false); + assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", true); } /** From 220091c06647bd91a2f16fd90ebcefd7fa3dc0cf Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 12:59:14 +0200 Subject: [PATCH 09/14] Surrogate pair tests --- .../unsafe/types/CollationSupportSuite.java | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 008d1c47fb28e..2d0dcd275bf9d 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -96,35 +96,48 @@ public void testCompare() throws SparkException { assertStringCompare("σ", "Σ", "UNICODE_CI", 0); } - private void assertLowerCaseCodePoints(String target, String expected, Boolean useCodePoints) { + private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, Boolean useCodePoints) { if (useCodePoints) { - assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); + assertEquals(expected.toString(), CollationAwareUTF8String.lowerCaseCodePoints(target.toString())); } else { - assertEquals(UTF8String.fromString(expected), UTF8String.fromString(target).toLowerCase()); + assertEquals(expected, target.toLowerCase()); } } @Test public void testLowerCaseCodePoints() { // Edge cases - assertLowerCaseCodePoints("", "", false); - assertLowerCaseCodePoints("", "", true); + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false); + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true); // Basic tests - assertLowerCaseCodePoints("abcd", "abcd", false); - assertLowerCaseCodePoints("AbCd", "abcd", false); - assertLowerCaseCodePoints("abcd", "abcd", true); - assertLowerCaseCodePoints("aBcD", "abcd", true); + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true); + assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true); // Accent variation - assertLowerCaseCodePoints("AbĆd", "abćd", false); - assertLowerCaseCodePoints("aBcΔ", "abcδ", true); + assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), 
UTF8String.fromString("abcδ"), true); // Case-variable character length - assertLowerCaseCodePoints("İoDiNe", "i̇odine", false); - assertLowerCaseCodePoints("Abi̇o12", "abi̇o12", false); - assertLowerCaseCodePoints("İodInE", "i̇odine", true); - assertLowerCaseCodePoints("aBi̇o12", "abi̇o12", true); + assertLowerCaseCodePoints( + UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true); + assertLowerCaseCodePoints( + UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true); // Conditional case mapping - assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", false); - assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", true); + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true); + // Surrogate pairs are treated as invalid UTF8 sequences + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\ufffd\ufffd"), false); + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\ufffd\ufffd"), true); } /** From 0d4da587ef900fed8b1a6d3d503646ffe5227651 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 15:13:00 +0200 Subject: [PATCH 10/14] Fix Java lint --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 2d0dcd275bf9d..e02af65f86b10 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -96,9 +96,11 @@ public void testCompare() throws SparkException { assertStringCompare("σ", "Σ", "UNICODE_CI", 0); } - private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, Boolean useCodePoints) { + private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, + Boolean useCodePoints) { if (useCodePoints) { - assertEquals(expected.toString(), CollationAwareUTF8String.lowerCaseCodePoints(target.toString())); + assertEquals(expected.toString(), + CollationAwareUTF8String.lowerCaseCodePoints(target.toString())); } else { assertEquals(expected, target.toLowerCase()); } From 5ac1e20901bb5c05efa0734605fedc332ce9eb64 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 28 May 2024 19:57:51 +0200 Subject: [PATCH 11/14] Update CollationFactory.java --- .../org/apache/spark/sql/catalyst/util/CollationFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 5fa8c3346e141..78d93cd957c57 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -414,7 +414,7 @@ protected Collation buildCollation() { null, CollationAwareUTF8String::lowercaseCompare, "1.0", - (s) -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), + s -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), /* supportsBinaryEquality = */ 
false, /* supportsBinaryOrdering = */ false, /* supportsLowercaseEquality = */ true); From 1348f9ce80c535539e8bf5ee5f0036792790ad0e Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 28 May 2024 19:58:20 +0200 Subject: [PATCH 12/14] Update CollationFactory.java --- .../org/apache/spark/sql/catalyst/util/CollationFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 78d93cd957c57..0b520d87143b2 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -414,7 +414,7 @@ protected Collation buildCollation() { null, CollationAwareUTF8String::lowercaseCompare, "1.0", - s -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), + s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), /* supportsBinaryEquality = */ false, /* supportsBinaryOrdering = */ false, /* supportsLowercaseEquality = */ true); From 494add7f7251afecbcb862035cfff329c80b3772 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:48:08 +0200 Subject: [PATCH 13/14] Fixes --- .../util/CollationAwareUTF8String.java | 36 ++++++++++- .../sql/catalyst/util/CollationFactory.java | 2 +- .../apache/spark/unsafe/types/UTF8String.java | 30 +-------- .../unsafe/types/CollationSupportSuite.java | 63 ++++++++++++++----- .../spark/unsafe/types/UTF8StringSuite.java | 23 ------- 5 files changed, 86 insertions(+), 68 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 
7b1b21bfca322..84baf18154417 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -193,7 +193,41 @@ private static int lowercaseRFind( * @param right The second UTF8String to compare. * @return An integer representing the comparison result. */ - public static int lowercaseCompare(final UTF8String left, final UTF8String right) { + public static int compareLowerCase(final UTF8String left, final UTF8String right) { + // Only if both strings are ASCII, we can use faster comparison (no string allocations). + if (left.isFullAscii() && right.isFullAscii()) { + return compareLowerCaseAscii(left, right); + } + return compareLowerCaseSlow(left, right); + } + + /** + * Fast version of the `compareLowerCase` method, used when both arguments are ASCII strings. + * + * @param left The first ASCII UTF8String to compare. + * @param right The second ASCII UTF8String to compare. + * @return An integer representing the comparison result. + */ + private static int compareLowerCaseAscii(final UTF8String left, final UTF8String right) { + int leftBytes = left.numBytes(), rightBytes = right.numBytes(); + for (int curr = 0; curr < leftBytes && curr < rightBytes; curr++) { + int lowerLeftByte = Character.toLowerCase(left.getByte(curr)); + int lowerRightByte = Character.toLowerCase(right.getByte(curr)); + if (lowerLeftByte != lowerRightByte) { + return lowerLeftByte - lowerRightByte; + } + } + return leftBytes - rightBytes; + } + + /** + * Slow version of the `compareLowerCase` method, used when both arguments are non-ASCII strings. + * + * @param left The first non-ASCII UTF8String to compare. + * @param right The second non-ASCII UTF8String to compare. + * @return An integer representing the comparison result. 
+ */ + private static int compareLowerCaseSlow(final UTF8String left, final UTF8String right) { return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString())); } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index c2a6887ba27f8..c734826648871 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -412,7 +412,7 @@ protected Collation buildCollation() { "UTF8_BINARY_LCASE", PROVIDER_SPARK, null, - CollationAwareUTF8String::lowercaseCompare, + CollationAwareUTF8String::compareLowerCase, "1.0", s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(), /* supportsBinaryEquality = */ false, diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index e28dfa910b59e..c0fa2719e4fe6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -388,34 +388,6 @@ private UTF8String toUpperCaseSlow() { return fromString(toString().toUpperCase()); } - /** - * Optimized lowercase comparison for UTF8_BINARY_LCASE collation - * a.compareLowerCase(b) is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()) - */ - public int compareLowerCase(UTF8String other) { - int curr; - for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) { - byte left, right; - if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) { - return compareLowerCaseSuffixSlow(other, curr); - } - int lowerLeft = Character.toLowerCase(left); - int lowerRight = Character.toLowerCase(right); - if (lowerLeft != lowerRight) { - return lowerLeft - 
lowerRight; - } - } - return numBytes - other.numBytes; - } - - private int compareLowerCaseSuffixSlow(UTF8String other, int pref) { - UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref, - numBytes - pref); - UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset + pref, - other.numBytes - pref); - return suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow()); - } - /** * Returns the lower case of this string */ @@ -427,7 +399,7 @@ public UTF8String toLowerCase() { return isFullAscii() ? toLowerCaseAscii() : toLowerCaseSlow(); } - private boolean isFullAscii() { + public boolean isFullAscii() { for (var i = 0; i < numBytes; i++) { if (getByte(i) < 0) { return false; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 99ffbaf7afb49..25d9836e05361 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -27,6 +27,15 @@ // checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * A list containing some of the supported collations in Spark. Use this list to iterate over + * all the important collation groups (binary, lowercase, icu) for complete unit test coverage. + * Note: this list may come in handy when the Spark function result is the same regardless of + * the specified collations (as often seen in some pass-through Spark expressions). + */ + private final String[] testSupportedCollations = + {"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"}; + /** * Collation-aware UTF8String comparison. 
*/ @@ -41,20 +50,46 @@ private void assertStringCompare(String s1, String s2, String collationName, int @Test public void testCompare() throws SparkException { - // Edge cases - assertStringCompare("", "", "UTF8_BINARY", 0); - assertStringCompare("a", "", "UTF8_BINARY", 1); - assertStringCompare("", "a", "UTF8_BINARY", -1); - assertStringCompare("", "", "UTF8_BINARY_LCASE", 0); - assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1); - assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1); - assertStringCompare("", "", "UNICODE", 0); - assertStringCompare("a", "", "UNICODE", 1); - assertStringCompare("", "a", "UNICODE", -1); - assertStringCompare("", "", "UNICODE_CI", 0); - assertStringCompare("a", "", "UNICODE_CI", 1); - assertStringCompare("", "a", "UNICODE_CI", -1); - // Basic tests + for (String collationName: testSupportedCollations) { + // Edge cases + assertStringCompare("", "", collationName, 0); + assertStringCompare("a", "", collationName, 1); + assertStringCompare("", "a", collationName, -1); + // Basic tests + assertStringCompare("a", "a", collationName, 0); + assertStringCompare("a", "b", collationName, -1); + assertStringCompare("b", "a", collationName, 1); + assertStringCompare("A", "A", collationName, 0); + assertStringCompare("A", "B", collationName, -1); + assertStringCompare("B", "A", collationName, 1); + assertStringCompare("aa", "a", collationName, 1); + assertStringCompare("b", "bb", collationName, -1); + assertStringCompare("abc", "a", collationName, 1); + assertStringCompare("abc", "b", collationName, -1); + assertStringCompare("abc", "ab", collationName, 1); + assertStringCompare("abc", "abc", collationName, 0); + // ASCII strings + assertStringCompare("aaaa", "aaa", collationName, 1); + assertStringCompare("hello", "world", collationName, -1); + assertStringCompare("Spark", "Spark", collationName, 0); + // Non-ASCII strings + assertStringCompare("ü", "ü", collationName, 0); + assertStringCompare("ü", "", collationName, 1); + 
assertStringCompare("", "ü", collationName, -1); + assertStringCompare("äü", "äü", collationName, 0); + assertStringCompare("äxx", "äx", collationName, 1); + assertStringCompare("a", "ä", collationName, -1); + } + // Non-ASCII strings + assertStringCompare("äü", "bü", "UTF8_BINARY", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY", -1); + assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1); + assertStringCompare("äü", "bü", "UNICODE", -1); + assertStringCompare("bxx", "bü", "UNICODE", 1); + assertStringCompare("äü", "bü", "UNICODE_CI", -1); + assertStringCompare("bxx", "bü", "UNICODE_CI", 1); + // Case variation assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); assertStringCompare("AbcD", "aBCd", "UNICODE", 1); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 0188297fd05a2..d3fe361fce37b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -107,29 +107,6 @@ public void binaryCompareTo() { assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0); } - @Test - public void lowercaseComparison() { - // SPARK-47693: Test optimized lowercase comparison of UTF8String instances - // ASCII - assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0); - assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0); - assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0); - assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0); - assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0); - assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0); - 
assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0); - assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0); - assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0); - // non-ASCII - assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0); - assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0); - assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0); - assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0); - assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0); - assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0); - assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) < 0); - } - protected static void testUpperandLower(String upper, String lower) { UTF8String us = fromString(upper); UTF8String ls = fromString(lower); From 9081478465d111cfe064cb57c16993721e53f5f9 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 6 Jun 2024 08:44:11 +0200 Subject: [PATCH 14/14] Small fixes --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 84baf18154417..c778726f12fd2 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -346,7 +346,9 @@ public static String toLowerCase(final String target, final int collationId) { /** * Converts a single code point to lowercase using ICU rules, with special handling for - * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). + * one-to-many case mappings (i.e. 
characters that map to multiple characters in lowercase) and + * context-sensitive case mappings (i.e. characters that map to different characters based on + * string context - e.g. the position in the string relative to other characters). * * @param codePoint The code point to convert to lowercase. * @param sb The StringBuilder to append the lowercase character to. @@ -370,7 +372,8 @@ else if (codePoint == 0x03C2) { /** * Converts an entire string to lowercase using ICU rules, code point by code point, with * special handling for one-to-many case mappings (i.e. characters that map to multiple - * characters in lowercase). This method omits information about context-sensitive case mappings. + * characters in lowercase). Also, this method omits information about context-sensitive case + * mappings using special handling in the `lowercaseCodePoint` method. * * @param target The target string to convert to lowercase. * @return The string converted to lowercase in a context-unaware manner.