Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,155 @@
* Utility class for collation-aware UTF8String operations.
*/
public class CollationAwareUTF8String {

/**
* The constant value to indicate that the match is not found when searching for a pattern
* string in a target string.
*/
private static final int MATCH_NOT_FOUND = -1;

/**
 * Checks whether the target string, read from the given character position (0-based index
 * referring to character position in UTF8String), begins with the given prefix with respect to
 * the UTF8_BINARY_LCASE collation. Callers are expected to pass a prefix that has already been
 * lowercased, so that .toLowerCase() is not applied to the same prefix over and over again.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) prefix to be searched for
 * @param startPos the character position in the target string at which the prefix must begin
 * @return true if a substring of the target that begins at `startPos` lowercases to the prefix
 */
public static boolean lowercaseMatchFrom(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  // Any length other than the MATCH_NOT_FOUND sentinel means a matching substring exists.
  final int matchLength = lowercaseMatchLengthFrom(target, lowercasePattern, startPos);
  return matchLength != MATCH_NOT_FOUND;
}

/**
 * Returns the length (in characters) of the shortest substring of the target string that begins
 * at the specified position (0-based index referring to character position in UTF8String) and
 * whose lowercase form equals the specified prefix, with respect to the UTF8_BINARY_LCASE
 * collation. The method assumes that the prefix is already lowercased. The method only considers
 * the part of the target string that starts at the specified (inclusive) position (that is, the
 * method does not look at UTF8 characters of the target string before position `startPos`). If
 * the prefix is not found, MATCH_NOT_FOUND is returned.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param startPos the start position for searching (in the target string), must be >= 0
 * @return length of the target substring that starts with the specified prefix in lowercase,
 *         or MATCH_NOT_FOUND if there is no such substring
 */
private static int lowercaseMatchLengthFrom(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  assert startPos >= 0;
  // The target is not modified here, so the longest candidate length is computed once instead
  // of calling numChars() again on every iteration.
  final int maxLength = target.numChars() - startPos;
  // Each candidate substring is lowercased on its own: the matched length is not necessarily
  // the pattern length, because a character's lowercase form may span several characters.
  for (int len = 0; len <= maxLength; ++len) {
    if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
      return len;
    }
  }
  return MATCH_NOT_FOUND;
}

/**
 * Returns the position of the first occurrence of the pattern string in the target string,
 * starting from the specified position (0-based index referring to character position in
 * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
 * pattern string is already lowercased prior to call. If the pattern is not found,
 * MATCH_NOT_FOUND is returned.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param startPos the start position for searching (in the target string), must be >= 0
 * @return the position of the first occurrence of pattern in target, or MATCH_NOT_FOUND
 */
private static int lowercaseFind(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  assert startPos >= 0;
  // The target is not modified here, so its character count is computed once instead of being
  // re-evaluated in the loop condition.
  final int targetLength = target.numChars();
  // The bound is inclusive: the position just past the last character is also tried.
  for (int i = startPos; i <= targetLength; ++i) {
    if (lowercaseMatchFrom(target, lowercasePattern, i)) {
      return i;
    }
  }
  return MATCH_NOT_FOUND;
}

/**
 * Checks whether the part of the target string that ends at the given character position
 * (0-based index referring to character position in UTF8String) ends with the given suffix,
 * with respect to the UTF8_BINARY_LCASE collation. Callers are expected to pass a suffix that
 * has already been lowercased, so that .toLowerCase() is not applied to the same suffix over
 * and over again.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) suffix to be searched for
 * @param endPos the character position in the target string at which the suffix must end
 * @return true if a substring of the target that ends at `endPos` lowercases to the suffix
 */
public static boolean lowercaseMatchUntil(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  // Any length other than the MATCH_NOT_FOUND sentinel means a matching substring exists.
  final int matchLength = lowercaseMatchLengthUntil(target, lowercasePattern, endPos);
  return matchLength != MATCH_NOT_FOUND;
}

/**
* Returns the length of the substring of the target string that ends with the specified
* suffix, ending at the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* suffix is already lowercased. The method only considers the part of target string that ends
* at the specified (non-inclusive) position (that is, the method does not look at UTF8
* characters of the target string at or after position `endPos`). If the suffix is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return length of the target substring that ends with the specified suffix in lowercase
*/
private static int lowercaseMatchLengthUntil(
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
assert endPos <= target.numChars();
for (int len = 0; len <= endPos; ++len) {
if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a way to do this comparison with only one memory copy? Right now it's two: substring and toLowerCase

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know the details very well. Is it the same if we lower case the entire target string first, and then call substring repeatedly?

Copy link
Contributor Author

@uros-db uros-db May 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it the same if we lower case the entire target string first

no, it's not - consider the example in the PR description: contains("İ", "i") where haystack = "İ" & needle = "i"; then we have: lower(haystack) = lower("İ") = "i\u0307" would produce substr(lower("İ"), 1, 1) = "i" = needle

however, we can see that there is no substr(haystack, start, len) such that lower(substr(haystack, start, len)) == needle - so these 2 behaviours are not actually equivalent (as we first assumed), and the reason for this discrepancy lies in conditional one-to-many case mapping (certain characters have lowercase equivalents consisting of multiple characters)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that said, we are focusing on correctness for now, and are aware of possible performance regression for string searching in UTF8_BINARY_LCASE - we intend to work on perf optimization in a subsequent PR

return len;
}
}
return MATCH_NOT_FOUND;
}

/**
 * Searches backwards for the last occurrence of the pattern string in the target string, with
 * respect to the UTF8_BINARY_LCASE collation, looking only at matches that end at or before the
 * specified position (0-based index referring to character position in UTF8String). The method
 * assumes that the pattern string is already lowercased prior to call. The value returned is
 * the largest position `i` (with `i` not greater than `endPos`) such that a substring of the
 * target ending at `i` (exclusive) lowercases to the pattern. If the pattern is not found,
 * MATCH_NOT_FOUND is returned.
 *
 * @param target the string to be searched in
 * @param lowercasePattern the (already lowercased) string to be searched for
 * @param endPos the end position for searching (in the target string), at most numChars()
 * @return the end position of the last occurrence of pattern in target, or MATCH_NOT_FOUND
 */
private static int lowercaseRFind(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  assert endPos <= target.numChars();
  // Walk the candidate end positions from right to left; position 0 is tried as well.
  int candidateEnd = endPos;
  while (candidateEnd >= 0) {
    if (lowercaseMatchUntil(target, lowercasePattern, candidateEnd)) {
      return candidateEnd;
    }
    --candidateEnd;
  }
  return MATCH_NOT_FOUND;
}

public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -183,6 +332,23 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
return 0;
}

/**
 * Finds the first occurrence of the pattern string in the target string, searching from the
 * specified position (0-based index referring to character position in UTF8String), with
 * respect to the UTF8_BINARY_LCASE collation. The pattern is lowercased here, once, before the
 * search. An empty pattern yields 0 regardless of `start`. If the pattern is not found,
 * MATCH_NOT_FOUND is returned.
 *
 * @param target the string to be searched in
 * @param pattern the string to be searched for (not required to be lowercased)
 * @param start the start position for searching (in the target string)
 * @return the position of the first occurrence of pattern in target
 */
public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
    final int start) {
  if (pattern.numChars() == 0) {
    return 0;
  }
  final UTF8String lowercasePattern = pattern.toLowerCase();
  return lowercaseFind(target, lowercasePattern, start);
}

public static int indexOf(final UTF8String target, final UTF8String pattern,
final int start, final int collationId) {
if (pattern.numBytes() == 0) {
Expand Down Expand Up @@ -467,4 +633,7 @@ public static UTF8String lowercaseTrimRight(
}
return srcString.copyUTF8String(0, trimByteIdx);
}

// TODO: Add more collation-aware UTF8String operations here.

}
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.contains(r);
}
/**
 * Returns whether `l` contains `r` with respect to the UTF8_BINARY_LCASE collation, i.e. whether
 * CollationAwareUTF8String.lowercaseIndexOf finds `r` in `l` when searching from position 0.
 *
 * @param l the string to be searched in
 * @param r the string to be searched for
 * @return true if the reported index is non-negative
 */
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
  return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
Expand Down Expand Up @@ -156,7 +156,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.startsWith(r);
}
/**
 * Returns whether `l` starts with `r` with respect to the UTF8_BINARY_LCASE collation. The
 * prefix is lowercased once here, as CollationAwareUTF8String.lowercaseMatchFrom expects an
 * already lowercased pattern.
 *
 * @param l the string to be searched in
 * @param r the prefix to be searched for
 * @return true if a substring of `l` starting at position 0 lowercases to the lowercased `r`
 */
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
  return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0);
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
Expand Down Expand Up @@ -193,7 +193,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.endsWith(r);
}
/**
 * Returns whether `l` ends with `r` with respect to the UTF8_BINARY_LCASE collation. The suffix
 * is lowercased once here, as CollationAwareUTF8String.lowercaseMatchUntil expects an already
 * lowercased pattern; the match must end at the last character of `l`.
 *
 * @param l the string to be searched in
 * @param r the suffix to be searched for
 * @return true if a substring of `l` ending at l.numChars() lowercases to the lowercased `r`
 */
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
  return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars());
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
Expand Down Expand Up @@ -430,7 +430,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
}
public static int execLowercase(final UTF8String string, final UTF8String substring,
final int start) {
return string.toLowerCase().indexOf(substring.toLowerCase(), start);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to confirm, the previous implementation here is correct, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, unfortunately it's not - while it works fine for ASCII, it actually gives wrong results in some special cases featuring conditional case mapping, when a character has a lowercase equivalent that consists of multiple characters, or is found at a particular place in the string (context-awareness)

Copy link
Contributor Author

@uros-db uros-db May 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so as part of this PR, we actually changed the core definition of string-searching in UTF8_BINARY_LCASE, i.e. what it means for one substring (pattern) to be found in another string (target) under UTF8_BINARY_LCASE

in the old implementation, contains("İ", "i") would return true - however, this behaviour is incorrect because it relies on the fact that substr(lower("İ"), 1, 1) == "i" (incorrect, old implementation), instead of lower(substr("İ", 1, 1)) != "i" (correct, new implementation)

and this is all due to the fact that lower("İ") = "i\u0307" (1 uppercase character -> 2 lowercase characters)

return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start);
}
public static int execICU(final UTF8String string, final UTF8String substring, final int start,
final int collationId) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) {
return false;
}

/**
 * Tests whether `this` contains `substring` when both are compared in lowercase.
 *
 * When both strings consist of ASCII characters only, the comparison is done byte by byte
 * without lowercasing either string as a whole. Otherwise both strings are lowercased in full
 * and a regular `contains` is performed on the results.
 *
 * NOTE: lowercasing the whole string before searching is not the same as lowercasing each
 * candidate substring: when one character lowercases to several characters (for example 'İ'
 * lowercases to 'i' followed by U+0307), the full-lowercase path can report a match that no
 * substring of the original string lowercases to.
 *
 * @param substring the string to look for
 * @return true if a lowercase match is found, or if `substring` is empty
 */
public boolean containsInLowerCase(final UTF8String substring) {
  final int needleBytes = substring.numBytes;
  if (needleBytes == 0) {
    return true;
  }

  // Non-ASCII input on either side goes through full lowercasing; the ASCII check is done up
  // front so that the loop below never has to allocate lowercased copies
  // (e.g. in case of `containsInLowerCase("1大1大1大...", "11")`).
  if (!substring.isFullAscii()) {
    return toLowerCase().contains(substring.toLowerCaseSlow());
  }
  if (!isFullAscii()) {
    return toLowerCaseSlow().contains(substring.toLowerCaseAscii());
  }

  // Last byte offset at which the needle could still fit; negative if the needle is longer.
  final int lastStart = numBytes - needleBytes;
  if (lastStart < 0) {
    return false;
  }

  final int firstLower = Character.toLowerCase(substring.getByte(0));
  int start = 0;
  while (start <= lastStart) {
    // Only compare the full needle when the first byte already matches in lowercase.
    if (Character.toLowerCase(getByte(start)) == firstLower) {
      final UTF8String tail = UTF8String.fromAddress(base, offset + start, numBytes - start);
      if (tail.matchAtInLowerCaseAscii(substring, 0)) {
        return true;
      }
    }
    start++;
  }
  return false;
}

/**
* Returns the byte at position `i`.
*/
Expand All @@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
}

/**
 * Compares the bytes of `s` against the bytes of `this` starting at byte position `pos`,
 * lowercasing each byte with Character.toLowerCase before comparing.
 *
 * @param s the string whose bytes are expected at `pos`
 * @param pos the byte position in `this` at which the comparison starts
 * @return true if every byte matches in lowercase; false if `pos` is negative, if `s` does not
 *         fit into `this` at `pos`, or if any byte differs
 */
private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) {
  final boolean fits = pos >= 0 && s.numBytes + pos <= numBytes;
  if (!fits) {
    return false;
  }

  // Advance while the lowercased bytes agree; reaching the end of `s` means a full match.
  int idx = 0;
  while (idx < s.numBytes
      && Character.toLowerCase(getByte(pos + idx)) == Character.toLowerCase(s.getByte(idx))) {
    idx++;
  }
  return idx == s.numBytes;
}

/**
 * Returns whether `prefix` matches `this` at byte position 0, as decided by `matchAt`.
 *
 * @param prefix the prefix to test for
 * @return the result of matching `prefix` at the beginning of this string
 */
public boolean startsWith(final UTF8String prefix) {
  final int prefixStart = 0;
  return matchAt(prefix, prefixStart);
}

/**
 * Tests whether `prefix` is a prefix of `this` when both are compared in lowercase.
 *
 * When the prefix and the leading bytes of `this` it would be compared against are ASCII only,
 * the comparison is done byte by byte without lowercasing either string as a whole. Otherwise
 * both strings are lowercased in full and a regular `startsWith` is performed on the results.
 *
 * @param prefix the prefix to test for
 * @return true if a lowercase match is found, or if `prefix` is empty
 */
public boolean startsWithInLowerCase(final UTF8String prefix) {
  // Lengths cannot be compared for an early exit before lowercasing, because a single
  // character may expand into several characters in lowercase.
  if (prefix.numBytes == 0) {
    return true;
  }
  if (numBytes == 0) {
    return false;
  }

  if (!prefix.isFullAscii()) {
    return toLowerCase().startsWith(prefix.toLowerCaseSlow());
  }

  // Only the leading bytes that would take part in the comparison need the ASCII check.
  final UTF8String head;
  if (prefix.numBytes >= numBytes) {
    head = this;
  } else {
    head = UTF8String.fromAddress(base, offset, prefix.numBytes);
  }
  if (!head.isFullAscii()) {
    return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii());
  }

  // Pure ASCII on both sides: a prefix longer than this string can never match.
  return numBytes >= prefix.numBytes && matchAtInLowerCaseAscii(prefix, 0);
}

/**
 * Returns whether `suffix` matches `this` at the byte position where a suffix of that byte
 * length would begin, as decided by `matchAt`.
 *
 * @param suffix the suffix to test for
 * @return the result of matching `suffix` at position numBytes - suffix.numBytes
 */
public boolean endsWith(final UTF8String suffix) {
  final int suffixStart = numBytes - suffix.numBytes;
  return matchAt(suffix, suffixStart);
}

/**
 * Tests whether `suffix` is a suffix of `this` when both are compared in lowercase.
 *
 * When the suffix and the trailing bytes of `this` it would be compared against are ASCII only,
 * the comparison is done byte by byte without lowercasing either string as a whole. Otherwise
 * both strings are lowercased in full and a regular `endsWith` is performed on the results.
 *
 * @param suffix the suffix to test for
 * @return true if a lowercase match is found, or if `suffix` is empty
 */
public boolean endsWithInLowerCase(final UTF8String suffix) {
  // Lengths cannot be compared for an early exit before lowercasing, because a single
  // character may expand into several characters in lowercase.
  if (suffix.numBytes == 0) {
    return true;
  }
  if (numBytes == 0) {
    return false;
  }

  if (!suffix.isFullAscii()) {
    return toLowerCase().endsWith(suffix.toLowerCaseSlow());
  }

  // Only the trailing bytes that would take part in the comparison need the ASCII check.
  final UTF8String tail;
  if (suffix.numBytes >= numBytes) {
    tail = this;
  } else {
    tail = UTF8String.fromAddress(
      base, offset + numBytes - suffix.numBytes, suffix.numBytes);
  }
  if (!tail.isFullAscii()) {
    return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii());
  }

  // Pure ASCII on both sides: a suffix longer than this string can never match.
  return numBytes >= suffix.numBytes
    && matchAtInLowerCaseAscii(suffix, numBytes - suffix.numBytes);
}

/**
* Returns the upper case of this string
*/
Expand Down
Loading