-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-48221][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (Contains, StartsWith, EndsWith, StringLocate) #46511
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4981a14
b5794fc
dd261bf
31b757b
35dd21a
0baa713
37e44ad
6f0ceb5
aa91b15
f72998c
86bf00b
77904fa
5b47499
aceeba8
3945137
30774a3
4923181
68bada3
8f59422
4b9b83e
eb602e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,155 @@ | |
| * Utility class for collation-aware UTF8String operations. | ||
| */ | ||
| public class CollationAwareUTF8String { | ||
|
|
||
| /** | ||
| * The constant value to indicate that the match is not found when searching for a pattern | ||
| * string in a target string. | ||
| */ | ||
| private static final int MATCH_NOT_FOUND = -1; | ||
|
|
||
| /** | ||
| * Returns whether the target string starts with the specified prefix, starting from the | ||
| * specified position (0-based index referring to character position in UTF8String), with respect | ||
| * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased | ||
| * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the | ||
| * same prefix string. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param lowercasePattern the string to be searched for | ||
| * @param startPos the start position for searching (in the target string) | ||
| * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE | ||
| */ | ||
| public static boolean lowercaseMatchFrom( | ||
| final UTF8String target, | ||
| final UTF8String lowercasePattern, | ||
| int startPos) { | ||
| return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the length of the substring of the target string that starts with the specified | ||
| * prefix, starting from the specified position (0-based index referring to character position | ||
| * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the | ||
| * prefix is already lowercased. The method only considers the part of target string that | ||
| * starts from the specified (inclusive) position (that is, the method does not look at UTF8 | ||
| * characters of the target string at or after position `endPos`). If the prefix is not found, | ||
| * MATCH_NOT_FOUND is returned. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param lowercasePattern the string to be searched for | ||
| * @param startPos the start position for searching (in the target string) | ||
| * @return length of the target substring that starts with the specified prefix in lowercase | ||
| */ | ||
| private static int lowercaseMatchLengthFrom( | ||
| final UTF8String target, | ||
| final UTF8String lowercasePattern, | ||
| int startPos) { | ||
| assert startPos >= 0; | ||
| for (int len = 0; len <= target.numChars() - startPos; ++len) { | ||
| if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { | ||
| return len; | ||
| } | ||
| } | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return MATCH_NOT_FOUND; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the position of the first occurrence of the pattern string in the target string, | ||
| * starting from the specified position (0-based index referring to character position in | ||
| * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the | ||
| * pattern string is already lowercased prior to call. If the pattern is not found, | ||
| * MATCH_NOT_FOUND is returned. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param lowercasePattern the string to be searched for | ||
| * @param startPos the start position for searching (in the target string) | ||
| * @return the position of the first occurrence of pattern in target | ||
| */ | ||
| private static int lowercaseFind( | ||
| final UTF8String target, | ||
| final UTF8String lowercasePattern, | ||
| int startPos) { | ||
| assert startPos >= 0; | ||
| for (int i = startPos; i <= target.numChars(); ++i) { | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (lowercaseMatchFrom(target, lowercasePattern, i)) { | ||
| return i; | ||
| } | ||
| } | ||
| return MATCH_NOT_FOUND; | ||
| } | ||
|
|
||
| /** | ||
| * Returns whether the target string ends with the specified suffix, ending at the specified | ||
| * position (0-based index referring to character position in UTF8String), with respect to the | ||
| * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior | ||
| * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same | ||
| * suffix string. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param lowercasePattern the string to be searched for | ||
| * @param endPos the end position for searching (in the target string) | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| * @return whether the target string ends with the specified suffix in lowercase | ||
| */ | ||
| public static boolean lowercaseMatchUntil( | ||
| final UTF8String target, | ||
| final UTF8String lowercasePattern, | ||
| int endPos) { | ||
| return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the length of the substring of the target string that ends with the specified | ||
| * suffix, ending at the specified position (0-based index referring to character position in | ||
| * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the | ||
| * suffix is already lowercased. The method only considers the part of target string that ends | ||
| * at the specified (non-inclusive) position (that is, the method does not look at UTF8 | ||
| * characters of the target string at or after position `endPos`). If the suffix is not found, | ||
| * MATCH_NOT_FOUND is returned. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param lowercasePattern the string to be searched for | ||
| * @param endPos the end position for searching (in the target string) | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| * @return length of the target substring that ends with the specified suffix in lowercase | ||
| */ | ||
| private static int lowercaseMatchLengthUntil( | ||
| final UTF8String target, | ||
| final UTF8String lowercasePattern, | ||
| int endPos) { | ||
| assert endPos <= target.numChars(); | ||
| for (int len = 0; len <= endPos; ++len) { | ||
| if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a way to do this comparison with only one memory copy? Right now it's two:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know the details very well. Is it the same if we lower case the entire target string first, and then call
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
no, it's not - consider the example in the PR description: however, we can see that these is no
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that said, we are focusing on correctness for now, and are aware of possible performance regression for string searching in UTF8_BINARY_LCASE - we intend to work on perf optimization in a subsequent PR |
||
| return len; | ||
| } | ||
| } | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return MATCH_NOT_FOUND; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the position of the last occurrence of the pattern string in the target string, | ||
| * ending at the specified position (0-based index referring to character position in | ||
| * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the | ||
| * pattern string is already lowercased prior to call. If the pattern is not found, | ||
| * MATCH_NOT_FOUND is returned. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param lowercasePattern the string to be searched for | ||
| * @param endPos the end position for searching (in the target string) | ||
| * @return the position of the last occurrence of pattern in target | ||
| */ | ||
| private static int lowercaseRFind( | ||
| final UTF8String target, | ||
| final UTF8String lowercasePattern, | ||
| int endPos) { | ||
| assert endPos <= target.numChars(); | ||
| for (int i = endPos; i >= 0; --i) { | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (lowercaseMatchUntil(target, lowercasePattern, i)) { | ||
| return i; | ||
| } | ||
| } | ||
| return MATCH_NOT_FOUND; | ||
| } | ||
|
|
||
| public static UTF8String replace(final UTF8String src, final UTF8String search, | ||
| final UTF8String replace, final int collationId) { | ||
| // This collation aware implementation is based on existing implementation on UTF8String | ||
|
|
@@ -183,6 +332,23 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co | |
| return 0; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the position of the first occurrence of the pattern string in the target string, | ||
| * starting from the specified position (0-based index referring to character position in | ||
| * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found, | ||
| * MATCH_NOT_FOUND is returned. | ||
| * | ||
| * @param target the string to be searched in | ||
| * @param pattern the string to be searched for | ||
| * @param start the start position for searching (in the target string) | ||
| * @return the position of the first occurrence of pattern in target | ||
| */ | ||
| public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| final int start) { | ||
| if (pattern.numChars() == 0) return 0; | ||
uros-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return lowercaseFind(target, pattern.toLowerCase(), start); | ||
| } | ||
|
|
||
| public static int indexOf(final UTF8String target, final UTF8String pattern, | ||
| final int start, final int collationId) { | ||
| if (pattern.numBytes() == 0) { | ||
|
|
@@ -467,4 +633,7 @@ public static UTF8String lowercaseTrimRight( | |
| } | ||
| return srcString.copyUTF8String(0, trimByteIdx); | ||
| } | ||
|
|
||
| // TODO: Add more collation-aware UTF8String operations here. | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,7 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { | |
| return l.contains(r); | ||
| } | ||
| public static boolean execLowercase(final UTF8String l, final UTF8String r) { | ||
| return l.containsInLowerCase(r); | ||
| return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; | ||
| } | ||
| public static boolean execICU(final UTF8String l, final UTF8String r, | ||
| final int collationId) { | ||
|
|
@@ -156,7 +156,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { | |
| return l.startsWith(r); | ||
| } | ||
| public static boolean execLowercase(final UTF8String l, final UTF8String r) { | ||
| return l.startsWithInLowerCase(r); | ||
| return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0); | ||
| } | ||
| public static boolean execICU(final UTF8String l, final UTF8String r, | ||
| final int collationId) { | ||
|
|
@@ -193,7 +193,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { | |
| return l.endsWith(r); | ||
| } | ||
| public static boolean execLowercase(final UTF8String l, final UTF8String r) { | ||
| return l.endsWithInLowerCase(r); | ||
| return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars()); | ||
| } | ||
| public static boolean execICU(final UTF8String l, final UTF8String r, | ||
| final int collationId) { | ||
|
|
@@ -430,7 +430,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring | |
| } | ||
| public static int execLowercase(final UTF8String string, final UTF8String substring, | ||
| final int start) { | ||
| return string.toLowerCase().indexOf(substring.toLowerCase(), start); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to confirm, the previous implementation here is correct, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, unfortunately it's not - while it works fine for ASCII, it actually gives wrong results in some special cases featuring conditional case mapping, when a character has a lowercase equivalent that consists of multiple characters, or is found at a particular place in the string (context-awareness)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so as part of this PR, we actually changed the core definition of string-searching in UTF8_BINARY_LCASE, i.e. what it means for one substring (pattern) to be found in another string (target) under UTF8_BINARY_LCASE in the old implementation, and this is all due to the fact that |
||
| return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start); | ||
| } | ||
| public static int execICU(final UTF8String string, final UTF8String substring, final int start, | ||
| final int collationId) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.