diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 056b202bc398..595d13f9c239 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -332,12 +332,60 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
     return buf.build();
   }
 
+  /**
+   * Convert the input string to uppercase using the ICU root locale rules.
+   *
+   * @param target the input string
+   * @return the uppercase string
+   */
+  public static UTF8String toUpperCase(final UTF8String target) {
+    return UTF8String.fromString(toUpperCase(target.toString()));
+  }
+
+  public static String toUpperCase(final String target) {
+    return UCharacter.toUpperCase(ULocale.ROOT, target);
+  }
+
+  /**
+   * Convert the input string to uppercase using the specified ICU collation rules.
+   *
+   * @param target the input string
+   * @param collationId the ID of the collation whose locale selects the case mapping rules
+   * @return the uppercase string
+   */
+  public static UTF8String toUpperCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+  }
+
   public static String toUpperCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
     return UCharacter.toUpperCase(locale, target);
   }
 
+  /**
+   * Convert the input string to lowercase using the ICU root locale rules.
+   *
+   * @param target the input string
+   * @return the lowercase string
+   */
+  public static UTF8String toLowerCase(final UTF8String target) {
+    return UTF8String.fromString(toLowerCase(target.toString()));
+  }
+  public static String toLowerCase(final String target) {
+    return UCharacter.toLowerCase(ULocale.ROOT, target);
+  }
+
+  /**
+   * Convert the input string to lowercase using the specified ICU collation rules.
+   *
+   * @param target the input string
+   * @param collationId the ID of the collation whose locale selects the case mapping rules
+   * @return the lowercase string
+   */
+  public static UTF8String toLowerCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+  }
   public static String toLowerCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index d5bcc61bac2a..5995a47e878c 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -208,34 +208,43 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
   public static class Upper {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
+      } else {
         return execICU(v, collationId);
       }
     }
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.Upper.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
+      } else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toUpperCase();
     }
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toUpperCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId));
+      return CollationAwareUTF8String.toUpperCase(v, collationId);
     }
   }
 
   public static class Lower {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
       } else {
         return execICU(v, collationId);
       }
@@ -243,17 +252,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) {
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.Lower.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
+      } else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase();
     }
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toLowerCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId));
+      return CollationAwareUTF8String.toLowerCase(v, collationId);
     }
   }
 
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index fefa5b52a0c2..3524339af77b 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -645,10 +645,14 @@ public void testUpper() throws SparkException {
     assertUpper("ab世De", "UNICODE_CI", "AB世DE");
     assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE");
     // Case-variable character length
-    assertUpper("i̇o", "UTF8_BINARY","İO");
-    assertUpper("i̇o", "UTF8_BINARY_LCASE","İO");
-    assertUpper("i̇o", "UNICODE","İO");
-    assertUpper("i̇o", "UNICODE_CI","İO");
+    assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O");
+    assertUpper("i\u0307o", "UTF8_BINARY_LCASE","I\u0307O");
+    assertUpper("i\u0307o", "UNICODE","I\u0307O");
+    assertUpper("i\u0307o", "UNICODE_CI","I\u0307O");
+    assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342");
+    assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY_LCASE","SS FI FFI FF ST \u0399\u0308\u0342");
+    assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342");
+    assertUpper("ß fi ffi ff st ῗ", "UNICODE_CI","SS FI FFI FF ST \u0399\u0308\u0342");
   }
 
   private void assertLower(String target, String collationName, String expected)
@@ -695,10 +699,10 @@ public void testLower() throws SparkException {
     assertLower("aB世De", "UNICODE_CI", "ab世de");
     assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe");
     // Case-variable character length
-    assertLower("İo", "UTF8_BINARY","i̇o");
-    assertLower("İo", "UTF8_BINARY_LCASE","i̇o");
-    assertLower("İo", "UNICODE","i̇o");
-    assertLower("İo", "UNICODE_CI","i̇o");
+    assertLower("İo", "UTF8_BINARY","i\u0307o");
+    assertLower("İo", "UTF8_BINARY_LCASE","i\u0307o");
+    assertLower("İo", "UNICODE","i\u0307o");
+    assertLower("İo", "UNICODE_CI","i\u0307o");
   }
 
   private void assertInitCap(String target, String collationName, String expected)