From f226c65fca8ed7043694b62cbffe7bac6bd2a31c Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 15 May 2024 08:45:39 +0200 Subject: [PATCH 1/5] Fix and tests --- .../util/CollationAwareUTF8String.java | 199 +++++++++++++++--- .../sql/catalyst/util/CollationSupport.java | 2 +- .../unsafe/types/CollationSupportSuite.java | 49 ++++- 3 files changed, 214 insertions(+), 36 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e65..ef71eb47d310 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -34,6 +34,143 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + /** + * The constant value to indicate that the match is not found + * when searching for a pattern string in a target string. + */ + private static final int MATCH_NOT_FOUND = -1; + + /** + * Returns whether the target string starts with the specified prefix, + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the prefix is already lowercased prior to method call to avoid the + * overhead of calling .toLowerCase() multiple times on the same prefix string. 
+ * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE + */ + public static boolean lowercaseMatchFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that starts with + * the specified prefix, with respect to the UTF8_BINARY_LCASE collation. + * The method assumes that the prefix is already lowercased. The method only + * considers the part of target string that starts from the specified position. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + public static int lowercaseMatchLengthFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int len = 0; len <= target.numChars() - startPos; ++len) { + if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the first occurrence of the pattern string + * in the target string from the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the pattern string is already lowercased prior to method call. 
+ * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + */ + public static int lowercaseFind( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + for (int i = startPos; i <= target.numChars(); ++i) { + if (lowercaseMatchFrom(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns whether the target string ends with the specified suffix, + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the suffix is already lowercased prior to method call to avoid the + * overhead of calling .toLowerCase() multiple times on the same suffix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return whether the target string ends with the specified suffix in lowercase + */ + public static boolean lowercaseMatchUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that ends with + * the specified suffix, with respect to the UTF8_BINARY_LCASE collation. + * The method assumes that the suffix is already lowercased. The method only + * considers the part of target string that ends at the specified position. 
+ * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + public static int lowercaseMatchLengthUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int len = 0; len <= endPos; ++len) { + if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the last occurrence of the pattern string + * in the target string until the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the pattern string is already lowercased prior to method call. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return the position of the last occurrence of pattern in target, if not found, -1 returned. + */ + public static int lowercaseRFind( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + for (int i = endPos; i >= 0; --i) { + if (lowercaseMatchUntil(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + public static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String @@ -183,6 +320,22 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + /** + * Returns the position of the first occurrence of the pattern string + * in the target string from the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. 
+ * + * @param target the string to be searched in + * @param pattern the string to be searched for + * @param start the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + */ + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; + return lowercaseFind(target, pattern.toLowerCase(), start); + } + public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) { @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + if (matchLength == 0) return UTF8String.EMPTY_UTF8; + return string.substring(0, matchLength); } else { - int idx = string.numBytes() - delimiter.numBytes() + 1; + // search right to left (note: the end code point is exclusive) + int matchLength = string.numChars() + 1; count = -count; while (count > 0) { - idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1); - if (idx >= 0) { - 
count--; - } else { - // can not find enough delim - return string; - } + matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - if (idx + delimiter.numBytes() == string.numBytes()) { - return UTF8String.EMPTY_UTF8; - } - int size = string.numBytes() - delimiter.numBytes() - idx; - byte[] bytes = new byte[size]; - copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(), - bytes, BYTE_ARRAY_OFFSET, size); - return UTF8String.fromBytes(bytes); + if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8; + return string.substring(matchLength, string.numChars()); } } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bea3dc08b448..678b00649a32 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring return string.indexOf(substring, 0); } public static int execLowercase(final UTF8String string, final UTF8String substring) { - return string.toLowerCase().indexOf(substring.toLowerCase(), 0); + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0); } public static int execICU(final UTF8String string, final UTF8String substring, final int collationId) { diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7fc3c4e349c3..6f4b757851cc 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { /** @@ -567,8 +567,26 @@ public void testStringInstr() throws SparkException { assertStringInstr("aaads", "dS", "UNICODE_CI", 4); assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3); - assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); + assertStringInstr("i̇", "i", "UNICODE_CI", 0); + assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0); + assertStringInstr("i̇", "İ", "UNICODE_CI", 1); + assertStringInstr("İ", "i", "UNICODE_CI", 0); + assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); + assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); + assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3); + assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); + assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); + assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI + assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI + assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1); + assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0); + assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1); + assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1); + assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3); + assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5); + assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7); } private void assertFindInSet(String word, String set, String collationName, @@ -798,6 +816,30 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", 
"i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); + assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); + assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, 
"UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); } private void assertStringTrim( @@ -1008,3 +1050,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. } +// checkstyle.on: AvoidEscapedUnicodeCharacters From a2e1307b7d118fd28624a57db80c4a3167383ecb Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 15 May 2024 08:57:00 +0200 Subject: [PATCH 2/5] Fix tests --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 6f4b757851cc..2d80c50ba084 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -577,8 +577,8 @@ public void testStringInstr() throws SparkException { assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); - assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI - assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI + assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI + assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1); assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0); assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1); @@ -828,8 +828,8 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", 
"İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); - assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); - assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); + assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab"); assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12"); assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); From 168dde3986edd38bd9de9066cfac552224347d30 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 17 May 2024 10:25:16 +0200 Subject: [PATCH 3/5] Small fixes --- .../util/CollationAwareUTF8String.java | 131 ++++++++++-------- .../unsafe/types/CollationSupportSuite.java | 4 + 2 files changed, 75 insertions(+), 60 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ef71eb47d310..c48a5f942b7d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -36,16 +36,17 @@ public class CollationAwareUTF8String { /** - * The constant value to indicate that the match is not found - * when searching for a pattern string in a target string. + * The constant value to indicate that the match is not found when searching for a pattern + * string in a target string. */ private static final int MATCH_NOT_FOUND = -1; /** - * Returns whether the target string starts with the specified prefix, - * with respect to the UTF8_BINARY_LCASE collation. 
The method assumes - * that the prefix is already lowercased prior to method call to avoid the - * overhead of calling .toLowerCase() multiple times on the same prefix string. + * Returns whether the target string starts with the specified prefix, starting from the + * specified position (0-based index referring to character position in UTF8String), with respect + * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased + * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the + * same prefix string. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for @@ -53,27 +54,30 @@ public class CollationAwareUTF8String { * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE */ public static boolean lowercaseMatchFrom( - final UTF8String target, - final UTF8String lowercasePattern, - int startPos) { + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; } /** - * Returns the length of the substring of the target string that starts with - * the specified prefix, with respect to the UTF8_BINARY_LCASE collation. - * The method assumes that the prefix is already lowercased. The method only - * considers the part of target string that starts from the specified position. + * Returns the length of the substring of the target string that starts with the specified + * prefix, starting from the specified position (0-based index referring to character position + * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * prefix is already lowercased. The method only considers the part of target string that + * starts from the specified (inclusive) position (that is, the method does not look at UTF8 + * characters of the target string before position `startPos`).
If the prefix is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for - * @param startPos the end position for searching (in the target string) - * @return length of the target substring that ends with the specified suffix in lowercase + * @param startPos the start position for searching (in the target string) + * @return length of the target substring that ends with the specified prefix in lowercase */ public static int lowercaseMatchLengthFrom( - final UTF8String target, - final UTF8String lowercasePattern, - int startPos) { + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { assert startPos >= 0; for (int len = 0; len <= target.numChars() - startPos; ++len) { if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { @@ -84,20 +88,22 @@ public static int lowercaseMatchLengthFrom( } /** - * Returns the position of the first occurrence of the pattern string - * in the target string from the specified position (0-based index), - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the pattern string is already lowercased prior to method call. + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param startPos the start position for searching (in the target string) - * @return the position of the first occurrence of pattern in target, if not found, -1 returned. 
+ * @return the position of the first occurrence of pattern in target */ public static int lowercaseFind( - final UTF8String target, - final UTF8String lowercasePattern, - int startPos) { + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; for (int i = startPos; i <= target.numChars(); ++i) { if (lowercaseMatchFrom(target, lowercasePattern, i)) { return i; @@ -107,10 +113,11 @@ public static int lowercaseFind( } /** - * Returns whether the target string ends with the specified suffix, - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the suffix is already lowercased prior to method call to avoid the - * overhead of calling .toLowerCase() multiple times on the same suffix string. + * Returns whether the target string ends with the specified suffix, ending at the specified + * position (0-based index referring to character position in UTF8String), with respect to the + * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior + * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same + * suffix string. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for @@ -118,17 +125,20 @@ public static int lowercaseFind( * @return whether the target string ends with the specified suffix in lowercase */ public static boolean lowercaseMatchUntil( - final UTF8String target, - final UTF8String lowercasePattern, - int endPos) { + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; } /** - * Returns the length of the substring of the target string that ends with - * the specified suffix, with respect to the UTF8_BINARY_LCASE collation. - * The method assumes that the suffix is already lowercased. 
The method only - * considers the part of target string that ends at the specified position. + * Returns the length of the substring of the target string that ends with the specified + * suffix, ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * suffix is already lowercased. The method only considers the part of target string that ends + * at the specified (non-inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). If the suffix is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for @@ -136,9 +146,9 @@ public static boolean lowercaseMatchUntil( * @return length of the target substring that ends with the specified suffix in lowercase */ public static int lowercaseMatchLengthUntil( - final UTF8String target, - final UTF8String lowercasePattern, - int endPos) { + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { assert endPos <= target.numChars(); for (int len = 0; len <= endPos; ++len) { if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { @@ -149,20 +159,22 @@ public static int lowercaseMatchLengthUntil( } /** - * Returns the position of the last occurrence of the pattern string - * in the target string until the specified position (0-based index), - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the pattern string is already lowercased prior to method call. + * Returns the position of the last occurrence of the pattern string in the target string, + * ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. 
If the pattern is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param endPos the end position for searching (in the target string) - * @return the position of the last occurrence of pattern in target, if not found, -1 returned. + * @return the position of the last occurrence of pattern in target */ public static int lowercaseRFind( - final UTF8String target, - final UTF8String lowercasePattern, - int endPos) { + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); for (int i = endPos; i >= 0; --i) { if (lowercaseMatchUntil(target, lowercasePattern, i)) { return i; @@ -321,14 +333,15 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co } /** - * Returns the position of the first occurrence of the pattern string - * in the target string from the specified position (0-based index), - * with respect to the UTF8_BINARY_LCASE collation. + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param pattern the string to be searched for * @param start the start position for searching (in the target string) - * @return the position of the first occurrence of pattern in target, if not found, -1 returned. 
+ * @return the position of the first occurrence of pattern in target */ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, final int start) { @@ -434,25 +447,23 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - // search left to right (note: the start code point is inclusive) + // Search left to right (note: the start code point is inclusive). int matchLength = -1; while (count > 0) { matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); - if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter - else return string; // cannot find enough delimiters in the string + if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter. + else return string; // Cannot find enough delimiters in the string. } - if (matchLength == 0) return UTF8String.EMPTY_UTF8; return string.substring(0, matchLength); } else { - // search right to left (note: the end code point is exclusive) + // Search right to left (note: the end code point is exclusive). int matchLength = string.numChars() + 1; count = -count; while (count > 0) { matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); - if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter - else return string; // cannot find enough delimiters in the string + if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter. + else return string; // Cannot find enough delimiters in the string. 
} - if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8; return string.substring(matchLength, string.numChars()); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 2d80c50ba084..a88e2eff0253 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -585,7 +585,9 @@ public void testStringInstr() throws SparkException { assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1); assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3); assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3); + assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_BINARY_LCASE", 3); assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5); + assertStringInstr("abİoi̇o", "\u0307o", "UTF8_BINARY_LCASE", 6); assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7); } @@ -836,10 +838,12 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o"); assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o"); } private void assertStringTrim( From 
a26124cccbf7ba5d0b1dc6a5b7f7d6529731f59e Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 20 May 2024 15:24:14 +0200 Subject: [PATCH 4/5] Centralize indexOf behaviour for empty substring --- .../sql/catalyst/util/CollationAwareUTF8String.java | 4 ++-- .../org/apache/spark/unsafe/types/UTF8String.java | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index c48a5f942b7d..f9f7a14ca080 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -345,14 +345,14 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co */ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, final int start) { - if (pattern.numChars() == 0) return 0; + if (pattern.numChars() == 0) return target.indexOfEmpty(start); return lowercaseFind(target, pattern.toLowerCase(), start); } public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) { - return 0; + return target.indexOfEmpty(start); } StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 20b26b6ebc5a..d6f664133418 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -891,6 +891,17 @@ public UTF8String repeat(int times) { return UTF8String.fromBytes(newBytes); } + 
/** + * Returns the (default) position of the first occurrence of an empty substr in the current + * string from the specified position (0-based index). + * + * @param start the start position of the current string for searching + * @return the position of the first occurrence of the empty substr (now, always 0) + */ + public int indexOfEmpty(int start) { + return 0; // TODO: Fix this behaviour (SPARK-48284) + } + /** * Returns the position of the first occurrence of substr in * current string from the specified position (0-based index). @@ -901,7 +912,7 @@ public UTF8String repeat(int times) { */ public int indexOf(UTF8String v, int start) { if (v.numBytes() == 0) { - return 0; + return indexOfEmpty(start); } // locate to the start position. From b571084aa56582a9d65a150bcc4de7be86dc2b03 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 15:12:46 +0200 Subject: [PATCH 5/5] Update new method access --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index f9f7a14ca080..7d8e9f81ff18 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -74,7 +74,7 @@ public static boolean lowercaseMatchFrom( * @param startPos the start position for searching (in the target string) * @return length of the target substring that ends with the specified prefix in lowercase */ - public static int lowercaseMatchLengthFrom( + private static int lowercaseMatchLengthFrom( final UTF8String target, final UTF8String lowercasePattern, int startPos) { @@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom( * 
@param startPos the start position for searching (in the target string) * @return the position of the first occurrence of pattern in target */ - public static int lowercaseFind( + private static int lowercaseFind( final UTF8String target, final UTF8String lowercasePattern, int startPos) { @@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil( * @param endPos the end position for searching (in the target string) * @return length of the target substring that ends with the specified suffix in lowercase */ - public static int lowercaseMatchLengthUntil( + private static int lowercaseMatchLengthUntil( final UTF8String target, final UTF8String lowercasePattern, int endPos) { @@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil( * @param endPos the end position for searching (in the target string) * @return the position of the last occurrence of pattern in target */ - public static int lowercaseRFind( + private static int lowercaseRFind( final UTF8String target, final UTF8String lowercasePattern, int endPos) {