From 949eb7c00fe96e23e03cc3abbb725355cb6bc382 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 23 May 2024 19:59:20 +0200 Subject: [PATCH 1/8] Initial commit --- .../util/CollationAwareUTF8String.java | 70 ++++++++++++++++++- .../sql/catalyst/util/CollationSupport.java | 54 ++++++++------ 2 files changed, 102 insertions(+), 22 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e65..6878c8e7fb92 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -135,22 +135,90 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String return buf.build(); } + /** + * Convert the input string to uppercase using the ICU root locale rules. + * + * @param target the input string + * @return the uppercase string + */ + public static UTF8String toUpperCase(final UTF8String target) { + return UTF8String.fromString(toUpperCase(target.toString())); + } + public static String toUpperCase(final String target) { + return UCharacter.toUpperCase(target); + } + + /** + * Convert the input string to uppercase using the specified ICU collation rules. 
+ * + * @param target the input string + * @return the uppercase string + */ + public static UTF8String toUpperCase(final UTF8String target, final int collationId) { + return UTF8String.fromString(toUpperCase(target.toString(), collationId)); + } public static String toUpperCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); return UCharacter.toUpperCase(locale, target); } + /** + * Convert the input string to lowercase using the ICU root locale rules. + * + * @param target the input string + * @return the lowercase string + */ + public static UTF8String toLowerCase(final UTF8String target) { + return UTF8String.fromString(toLowerCase(target.toString())); + } + public static String toLowerCase(final String target) { + return UCharacter.toLowerCase(target); + } + + /** + * Convert the input string to lowercase using the specified ICU collation rules. + * + * @param target the input string + * @return the lowercase string + */ + public static UTF8String toLowerCase(final UTF8String target, final int collationId) { + return UTF8String.fromString(toLowerCase(target.toString(), collationId)); + } public static String toLowerCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); return UCharacter.toLowerCase(locale, target); } + /** + * Convert the input string to lowercase using the ICU root locale rules. 
+ * + * @param target the input string + * @return the lowercase string + */ + public static UTF8String toTitleCase(final UTF8String target) { + return UTF8String.fromString(toTitleCase(target.toString())); + } + public static String toTitleCase(final String target) { + BreakIterator wordIterator = BreakIterator.getWordInstance(); + return UCharacter.toTitleCase(target, wordIterator); + } + + /** + * Convert the input string to lowercase using the specified ICU collation rules. + * + * @param target the input string + * @return the lowercase string + */ + public static UTF8String toTitleCase(final UTF8String target, final int collationId) { + return UTF8String.fromString(toTitleCase(target.toString(), collationId)); + } public static String toTitleCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); - return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale)); + BreakIterator wordIterator = BreakIterator.getWordInstance(locale); + return UCharacter.toTitleCase(locale, target, wordIterator); } public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bea3dc08b448..ceec9d636b83 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -208,34 +208,43 @@ public static boolean execICU(final UTF8String l, final UTF8String r, public static class Upper { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { 
+ if (collation.supportsBinaryEquality) { return execUTF8(v); - } else { + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); + } else { return execICU(v, collationId); } } public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.Upper.exec"; - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { + if (collation.supportsBinaryEquality) { return String.format(expr + "UTF8(%s)", v); - } else { + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); + } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } public static UTF8String execUTF8(final UTF8String v) { return v.toUpperCase(); } + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toUpperCase(v); + } public static UTF8String execICU(final UTF8String v, final int collationId) { - return UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId)); + return CollationAwareUTF8String.toUpperCase(v, collationId); } } public static class Lower { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { + if (collation.supportsBinaryEquality) { return execUTF8(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); } else { return execICU(v, collationId); } @@ -243,52 +252,55 @@ public static UTF8String exec(final UTF8String v, final int collationId) { public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.Lower.exec"; - if (collation.supportsBinaryEquality || 
collation.supportsLowercaseEquality) { + if (collation.supportsBinaryEquality) { return String.format(expr + "UTF8(%s)", v); - } else { + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); + } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } public static UTF8String execUTF8(final UTF8String v) { return v.toLowerCase(); } + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toLowerCase(v); + } public static UTF8String execICU(final UTF8String v, final int collationId) { - return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId)); + return CollationAwareUTF8String.toLowerCase(v, collationId); } } public static class InitCap { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { + if (collation.supportsBinaryEquality) { return execUTF8(v); + } else if (collation.supportsLowercaseEquality) { + return execLowercase(v); } else { return execICU(v, collationId); } } - public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.InitCap.exec"; - if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { + if (collation.supportsBinaryEquality) { return String.format(expr + "UTF8(%s)", v); + } else if (collation.supportsLowercaseEquality) { + return String.format(expr + "Lowercase(%s)", v); } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } - public static UTF8String execUTF8(final UTF8String v) { return v.toLowerCase().toTitleCase(); } - + public static UTF8String execLowercase(final UTF8String v) { + return CollationAwareUTF8String.toTitleCase(v); + } public static 
UTF8String execICU(final UTF8String v, final int collationId) { - return UTF8String.fromString( - CollationAwareUTF8String.toTitleCase( - CollationAwareUTF8String.toLowerCase( - v.toString(), - collationId - ), - collationId)); + return CollationAwareUTF8String.toTitleCase(v, collationId); } } From 182c2c5c63ec4608ec604ab677494ec9f02e0611 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 23 May 2024 19:59:25 +0200 Subject: [PATCH 2/8] Tests --- .../spark/unsafe/types/CollationSupportSuite.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7fc3c4e349c3..8d9812085c38 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { /** @@ -526,10 +526,10 @@ public void testInitCap() throws SparkException { assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); // Case-variable character length - assertInitCap("İo", "UTF8_BINARY", "İo"); - assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "İo"); - assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); + assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); + assertInitCap("İo", "UNICODE", "I\u0307o"); + assertInitCap("İo", "UNICODE_CI", "İo"); } private void assertStringInstr(String string, String substring, String collationName, @@ -1008,3 +1008,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. 
} +// checkstyle.on: AvoidEscapedUnicodeCharacters From d9a0b11cc87621834b017570a0b9cb9c23107bd5 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 08:53:27 +0200 Subject: [PATCH 3/8] Update doc comments --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 6878c8e7fb92..7ef79ed86fea 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -192,10 +192,10 @@ public static String toLowerCase(final String target, final int collationId) { } /** - * Convert the input string to lowercase using the ICU root locale rules. + * Convert the input string to titlecase using the ICU root locale rules. * * @param target the input string - * @return the lowercase string + * @return the titlecase string */ public static UTF8String toTitleCase(final UTF8String target) { return UTF8String.fromString(toTitleCase(target.toString())); @@ -206,10 +206,10 @@ public static String toTitleCase(final String target) { } /** - * Convert the input string to lowercase using the specified ICU collation rules. + * Convert the input string to titlecase using the specified ICU collation rules. 
* * @param target the input string - * @return the lowercase string + * @return the titlecase string */ public static UTF8String toTitleCase(final UTF8String target, final int collationId) { return UTF8String.fromString(toTitleCase(target.toString(), collationId)); From df12951eff07b1c99cd159aa002b81fd375396fe Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 09:03:46 +0200 Subject: [PATCH 4/8] Remove InitCap --- .../util/CollationAwareUTF8String.java | 26 +------------------ .../sql/catalyst/util/CollationSupport.java | 19 +++++++------- .../unsafe/types/CollationSupportSuite.java | 24 ++++++++--------- 3 files changed, 22 insertions(+), 47 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 7ef79ed86fea..1da201c1a1dd 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -191,34 +191,10 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } - /** - * Convert the input string to titlecase using the ICU root locale rules. - * - * @param target the input string - * @return the titlecase string - */ - public static UTF8String toTitleCase(final UTF8String target) { - return UTF8String.fromString(toTitleCase(target.toString())); - } - public static String toTitleCase(final String target) { - BreakIterator wordIterator = BreakIterator.getWordInstance(); - return UCharacter.toTitleCase(target, wordIterator); - } - - /** - * Convert the input string to titlecase using the specified ICU collation rules. 
- * - * @param target the input string - * @return the titlecase string - */ - public static UTF8String toTitleCase(final UTF8String target, final int collationId) { - return UTF8String.fromString(toTitleCase(target.toString(), collationId)); - } public static String toTitleCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); - BreakIterator wordIterator = BreakIterator.getWordInstance(locale); - return UCharacter.toTitleCase(locale, target, wordIterator); + return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale)); } public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index ceec9d636b83..893c9908a6e6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -274,10 +274,8 @@ public static UTF8String execICU(final UTF8String v, final int collationId) { public static class InitCap { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { return execUTF8(v); - } else if (collation.supportsLowercaseEquality) { - return execLowercase(v); } else { return execICU(v, collationId); } @@ -285,10 +283,8 @@ public static UTF8String exec(final UTF8String v, final int collationId) { public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = 
"CollationSupport.InitCap.exec"; - if (collation.supportsBinaryEquality) { + if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) { return String.format(expr + "UTF8(%s)", v); - } else if (collation.supportsLowercaseEquality) { - return String.format(expr + "Lowercase(%s)", v); } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } @@ -296,11 +292,14 @@ public static String genCode(final String v, final int collationId) { public static UTF8String execUTF8(final UTF8String v) { return v.toLowerCase().toTitleCase(); } - public static UTF8String execLowercase(final UTF8String v) { - return CollationAwareUTF8String.toTitleCase(v); - } public static UTF8String execICU(final UTF8String v, final int collationId) { - return CollationAwareUTF8String.toTitleCase(v, collationId); + return UTF8String.fromString( + CollationAwareUTF8String.toTitleCase( + CollationAwareUTF8String.toLowerCase( + v.toString(), + collationId + ), + collationId)); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 8d9812085c38..583b6ebb0be3 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -426,10 +426,10 @@ public void testUpper() throws SparkException { assertUpper("ab世De", "UNICODE_CI", "AB世DE"); assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE"); // Case-variable character length - assertUpper("i̇o", "UTF8_BINARY","İO"); - assertUpper("i̇o", "UTF8_BINARY_LCASE","İO"); - assertUpper("i̇o", "UNICODE","İO"); - assertUpper("i̇o", "UNICODE_CI","İO"); + assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O"); + assertUpper("i\u0307o", "UTF8_BINARY_LCASE","I\u0307O"); + assertUpper("i\u0307o", "UNICODE","I\u0307O"); + assertUpper("i\u0307o", "UNICODE_CI","I\u0307O"); } private void 
assertLower(String target, String collationName, String expected) @@ -476,10 +476,10 @@ public void testLower() throws SparkException { assertLower("aB世De", "UNICODE_CI", "ab世de"); assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe"); // Case-variable character length - assertLower("İo", "UTF8_BINARY","i̇o"); - assertLower("İo", "UTF8_BINARY_LCASE","i̇o"); - assertLower("İo", "UNICODE","i̇o"); - assertLower("İo", "UNICODE_CI","i̇o"); + assertLower("İo", "UTF8_BINARY","i\u0307o"); + assertLower("İo", "UTF8_BINARY_LCASE","i\u0307o"); + assertLower("İo", "UNICODE","i\u0307o"); + assertLower("İo", "UNICODE_CI","i\u0307o"); } private void assertInitCap(String target, String collationName, String expected) @@ -526,10 +526,10 @@ public void testInitCap() throws SparkException { assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De"); assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe"); // Case-variable character length - assertInitCap("İo", "UTF8_BINARY", "I\u0307o"); - assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); - assertInitCap("İo", "UNICODE", "I\u0307o"); - assertInitCap("İo", "UNICODE_CI", "İo"); + assertInitCap("İo", "UTF8_BINARY", "İo"); + assertInitCap("İo", "UTF8_BINARY_LCASE", "İo"); + assertInitCap("İo", "UNICODE", "İo"); + assertInitCap("İo", "UNICODE_CI", "İo"); } private void assertStringInstr(String string, String substring, String collationName, From 3e70e6d559e53db918f9c37365b7b2f15c12e5ce Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 09:05:59 +0200 Subject: [PATCH 5/8] Undo unnecessary changes --- .../org/apache/spark/sql/catalyst/util/CollationSupport.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 893c9908a6e6..ab82bae5ca83 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java 
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -280,6 +280,7 @@ public static UTF8String exec(final UTF8String v, final int collationId) { return execICU(v, collationId); } } + public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.InitCap.exec"; @@ -289,9 +290,11 @@ public static String genCode(final String v, final int collationId) { return String.format(expr + "ICU(%s, %d)", v, collationId); } } + public static UTF8String execUTF8(final UTF8String v) { return v.toLowerCase().toTitleCase(); } + public static UTF8String execICU(final UTF8String v, final int collationId) { return UTF8String.fromString( CollationAwareUTF8String.toTitleCase( From f5a3939ecf1303cb6c092041a18cb1474aafc0f8 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 09:50:16 +0200 Subject: [PATCH 6/8] Correct naming --- .../spark/sql/catalyst/util/CollationSupport.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index ab82bae5ca83..e6e67c2c3662 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -209,7 +209,7 @@ public static class Upper { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { - return execUTF8(v); + return execBinary(v); } else if (collation.supportsLowercaseEquality) { return execLowercase(v); } else { @@ -220,14 +220,14 @@ public static String genCode(final 
String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.Upper.exec"; if (collation.supportsBinaryEquality) { - return String.format(expr + "UTF8(%s)", v); + return String.format(expr + "Binary(%s)", v); } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s)", v); } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } - public static UTF8String execUTF8(final UTF8String v) { + public static UTF8String execBinary(final UTF8String v) { return v.toUpperCase(); } public static UTF8String execLowercase(final UTF8String v) { @@ -242,7 +242,7 @@ public static class Lower { public static UTF8String exec(final UTF8String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { - return execUTF8(v); + return execBinary(v); } else if (collation.supportsLowercaseEquality) { return execLowercase(v); } else { @@ -253,14 +253,14 @@ public static String genCode(final String v, final int collationId) { CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); String expr = "CollationSupport.Lower.exec"; if (collation.supportsBinaryEquality) { - return String.format(expr + "UTF8(%s)", v); + return String.format(expr + "Binary(%s)", v); } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s)", v); } else { return String.format(expr + "ICU(%s, %d)", v, collationId); } } - public static UTF8String execUTF8(final UTF8String v) { + public static UTF8String execBinary(final UTF8String v) { return v.toLowerCase(); } public static UTF8String execLowercase(final UTF8String v) { From a2d16c05811874530a5b0244ae50eb03732bf82d Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:58:06 +0200 Subject: [PATCH 7/8] Small fixes --- 
.../spark/sql/catalyst/util/CollationAwareUTF8String.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index a6c5aac32a99..7b00ee95e1e0 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -293,6 +293,7 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String public static UTF8String toUpperCase(final UTF8String target) { return UTF8String.fromString(toUpperCase(target.toString())); } + public static String toUpperCase(final String target) { return UCharacter.toUpperCase(target); } @@ -306,6 +307,7 @@ public static String toUpperCase(final String target) { public static UTF8String toUpperCase(final UTF8String target, final int collationId) { return UTF8String.fromString(toUpperCase(target.toString(), collationId)); } + public static String toUpperCase(final String target, final int collationId) { ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); From 36467858e3cda9223d49e2406865eea0911020ff Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 7 Jun 2024 12:24:48 +0200 Subject: [PATCH 8/8] Add tests --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 712690c0d0c1..493b6e84118b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -498,6 +498,10 @@ public void testUpper() throws SparkException { assertUpper("i\u0307o", "UTF8_BINARY_LCASE","I\u0307O"); assertUpper("i\u0307o", "UNICODE","I\u0307O"); assertUpper("i\u0307o", "UNICODE_CI","I\u0307O"); + assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342"); + assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY_LCASE","SS FI FFI FF ST \u0399\u0308\u0342"); + assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342"); + assertUpper("ß fi ffi ff st ῗ", "UNICODE_CI","SS FI FFI FF ST \u0399\u0308\u0342"); } private void assertLower(String target, String collationName, String expected)