From 4981a14cfb130b34936c699082a74afb2d124f41 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 9 May 2024 17:31:07 +0200 Subject: [PATCH 01/19] Fix string search in contains/startsWith/endsWith --- .../sql/catalyst/util/CollationSupport.java | 31 ++++- .../apache/spark/unsafe/types/UTF8String.java | 118 ------------------ .../unsafe/types/CollationSupportSuite.java | 78 +++++++----- .../spark/unsafe/types/UTF8StringSuite.java | 105 ---------------- 4 files changed, 78 insertions(+), 254 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index b77671cee90b..bfaa1e2d0b55 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -126,7 +126,19 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.containsInLowerCase(r); + if (r.numBytes() == 0) return true; + if (l.numChars() < r.numChars()) return false; + int lenHaystack = l.numChars(), lenNeedle = r.numChars(); + final UTF8String firstLower = r.substring(0, 1).toLowerCase(); + final UTF8String needle = r.toLowerCase(); + for (var i = 0; i <= (lenHaystack - lenNeedle); i++) { + if (l.substring(i, i + 1).toLowerCase().equals(firstLower)) { + if (CollationAwareUTF8String.lowercaseMatchAt(l, needle, i, lenNeedle)) { + return true; + } + } + } + return false; } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -164,7 +176,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.startsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.startsWithInLowerCase(r); + if (r.numBytes() == 0) return true; + if (l.numChars() < r.numChars()) return false; + return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), 0, r.numChars()); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -201,7 +215,10 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.endsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.endsWithInLowerCase(r); + if (r.numBytes() == 0) return true; + if (l.numChars() < r.numChars()) return false; + return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), + l.numChars() - r.numChars(), r.numChars()); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -572,6 +589,14 @@ public static UTF8String collationAwareRegex(final UTF8String regex, final int c private static class CollationAwareUTF8String { + private static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r, + int pos, int len) { + if (len + pos > l.numChars() || pos < 0) { + return false; + } + return l.substring(pos, pos + len).toLowerCase().equals(r); + } + private static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 2a5d14580353..1427e53323d3 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) { return false; } - /** - * Returns whether `this` contains `substring` in a lowercase unicode-aware manner - * - * This function is written in a way which avoids excessive allocations in case if we work with - * bare ASCII-character strings. - */ - public boolean containsInLowerCase(final UTF8String substring) { - if (substring.numBytes == 0) { - return true; - } - - // Both `this` and the `substring` are checked for non-ASCII characters, otherwise we would - // have to use `startsWithLowerCase(...)` in a loop, and it would frequently allocate - // (e.g. in case of `containsInLowerCase("1大1大1大...", "11")`) - if (!substring.isFullAscii()) { - return toLowerCase().contains(substring.toLowerCaseSlow()); - } - if (!isFullAscii()) { - return toLowerCaseSlow().contains(substring.toLowerCaseAscii()); - } - - if (numBytes < substring.numBytes) { - return false; - } - - final var firstLower = Character.toLowerCase(substring.getByte(0)); - for (var i = 0; i <= (numBytes - substring.numBytes); i++) { - if (Character.toLowerCase(getByte(i)) == firstLower) { - final var rest = UTF8String.fromAddress(base, offset + i, numBytes - i); - if (rest.matchAtInLowerCaseAscii(substring, 0)) { - return true; - } - } - } - - return false; - } - /** * Returns the byte at position `i`. */ @@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) { return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes); } - private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) { - if (s.numBytes + pos > numBytes || pos < 0) { - return false; - } - - for (var i = 0; i < s.numBytes; i++) { - if (Character.toLowerCase(getByte(pos + i)) != Character.toLowerCase(s.getByte(i))) { - return false; - } - } - - return true; - } - public boolean startsWith(final UTF8String prefix) { return matchAt(prefix, 0); } - /** - * Checks whether `prefix` is a prefix of `this` in a lowercase unicode-aware manner - * - * This function is written in a way which avoids excessive allocations in case if we work with - * bare ASCII-character strings. - */ - public boolean startsWithInLowerCase(final UTF8String prefix) { - // No way to match sizes of strings for early return, since single grapheme can be expanded - // into several independent ones in lowercase - if (prefix.numBytes == 0) { - return true; - } - if (numBytes == 0) { - return false; - } - - if (!prefix.isFullAscii()) { - return toLowerCase().startsWith(prefix.toLowerCaseSlow()); - } - - final var part = prefix.numBytes >= numBytes ? this : UTF8String.fromAddress( - base, offset, prefix.numBytes); - if (!part.isFullAscii()) { - return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii()); - } - - if (numBytes < prefix.numBytes) { - return false; - } - - return matchAtInLowerCaseAscii(prefix, 0); - } - public boolean endsWith(final UTF8String suffix) { return matchAt(suffix, numBytes - suffix.numBytes); } - /** - * Checks whether `suffix` is a suffix of `this` in a lowercase unicode-aware manner - * - * This function is written in a way which avoids excessive allocations in case if we work with - * bare ASCII-character strings. - */ - public boolean endsWithInLowerCase(final UTF8String suffix) { - // No way to match sizes of strings for early return, since single grapheme can be expanded - // into several independent ones in lowercase - if (suffix.numBytes == 0) { - return true; - } - if (numBytes == 0) { - return false; - } - - if (!suffix.isFullAscii()) { - return toLowerCase().endsWith(suffix.toLowerCaseSlow()); - } - - final var part = suffix.numBytes >= numBytes ? this : UTF8String.fromAddress( - base, offset + numBytes - suffix.numBytes, suffix.numBytes); - if (!part.isFullAscii()) { - return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii()); - } - - if (numBytes < suffix.numBytes) { - return false; - } - - return matchAtInLowerCaseAscii(suffix, numBytes - suffix.numBytes); - } - /** * Returns the upper case of this string */ diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 2f05b9ad88c9..a23b10ad20fb 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -102,20 +102,22 @@ public void testContains() throws SparkException { assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); // Case-variable character length - assertContains("abİo12", "i̇o", "UNICODE_CI", true); - assertContains("abi̇o12", "İo", "UNICODE_CI", true); - assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false); - // Characters with the same binary lowercase representation - assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "io", "UNICODE_CI", false); + assertContains("adi̇os", "Io", "UNICODE_CI", false); + assertContains("adi̇os", "i̇o", "UNICODE_CI", true); + assertContains("adi̇os", "İo", "UNICODE_CI", true); + assertContains("adİos", "io", "UNICODE_CI", false); + assertContains("adİos", "Io", "UNICODE_CI", false); + assertContains("adİos", "i̇o", "UNICODE_CI", true); + assertContains("adİos", "İo", "UNICODE_CI", true); + assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true); + assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("adİos", "io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("adİos", "İo", "UTF8_BINARY_LCASE", true); } private void assertStartsWith( @@ -191,13 +193,6 @@ public void testStartsWith() throws SparkException { assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true); assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - // Case-variable character length - assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); - assertStartsWith("i̇onic", "İo", "UNICODE_CI", true); - assertStartsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertStartsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertStartsWith("İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", true); - assertStartsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false); // Characters with the same binary lowercase representation assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); @@ -205,6 +200,23 @@ public void testStartsWith() throws SparkException { assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); + // Case-variable character length + assertStartsWith("i̇onic", "io", "UNICODE_CI", false); + assertStartsWith("i̇onic", "Io", "UNICODE_CI", false); + assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true); + assertStartsWith("i̇onic", "İo", "UNICODE_CI", true); + assertStartsWith("İonic", "io", "UNICODE_CI", false); + assertStartsWith("İonic", "Io", "UNICODE_CI", false); + assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); + assertStartsWith("İonic", "İo", "UNICODE_CI", true); + assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false); + assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false); + assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true); + assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertStartsWith("İonic", "io", "UTF8_BINARY_LCASE", false); + assertStartsWith("İonic", "Io", "UTF8_BINARY_LCASE", false); + assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertStartsWith("İonic", "İo", "UTF8_BINARY_LCASE", true); } private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected) @@ -279,13 +291,6 @@ public void testEndsWith() throws SparkException { assertEndsWith("ab世De", "AB世dE", "UNICODE_CI", true); assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); - // Case-variable character length - assertEndsWith("The İo", "i̇o", "UNICODE_CI", true); - assertEndsWith("The i̇o", "İo", "UNICODE_CI", true); - assertEndsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertEndsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertEndsWith("The İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true); - assertEndsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false); // Characters with the same binary lowercase representation assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true); @@ -293,6 +298,23 @@ public void testEndsWith() throws SparkException { assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false); + // Case-variable character length + assertEndsWith("the i̇o", "io", "UNICODE_CI", false); + assertEndsWith("the i̇o", "Io", "UNICODE_CI", false); + assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true); + assertEndsWith("the i̇o", "İo", "UNICODE_CI", true); + assertEndsWith("the İo", "io", "UNICODE_CI", false); + assertEndsWith("the İo", "Io", "UNICODE_CI", false); + assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); + assertEndsWith("the İo", "İo", "UNICODE_CI", true); + assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); + assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertEndsWith("the İo", "io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the İo", "Io", "UTF8_BINARY_LCASE", false); + assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertEndsWith("the İo", "İo", "UTF8_BINARY_LCASE", true); } private void assertStringSplitSQL(String str, String delimiter, String collationName, diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 711e31fd6881..934b93c9345b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -215,43 +215,6 @@ public void contains() { assertFalse(fromString("大千世界").contains(fromString("大千世界好"))); } - @Test - public void containsInLowerCase() { - // Corner cases - assertTrue(EMPTY_UTF8.containsInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("a").containsInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("A").containsInLowerCase(fromString("a"))); - assertTrue(fromString("a").containsInLowerCase(fromString("A"))); - assertFalse(EMPTY_UTF8.containsInLowerCase(fromString("a"))); - // ASCII - assertTrue(fromString("hello").containsInLowerCase(fromString("ello"))); - assertFalse(fromString("hello").containsInLowerCase(fromString("vello"))); - assertFalse(fromString("hello").containsInLowerCase(fromString("hellooo"))); - // Unicode - assertTrue(fromString("大千世界").containsInLowerCase(fromString("千世界"))); - assertFalse(fromString("大千世界").containsInLowerCase(fromString("世千"))); - assertFalse(fromString("大千世界").containsInLowerCase(fromString("大千世界好"))); - // ASCII lowercase - assertTrue(fromString("HeLlO").containsInLowerCase(fromString("ElL"))); - assertFalse(fromString("HeLlO").containsInLowerCase(fromString("ElLoO"))); - // Unicode lowercase - assertTrue(fromString("ЯбЛоКо").containsInLowerCase(fromString("БлОк"))); - assertFalse(fromString("ЯбЛоКо").containsInLowerCase(fromString("лОкБ"))); - // Characters with the same binary lowercase representation - assertTrue(fromString("The Kelvin.").containsInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The Kelvin.").containsInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The KKelvin.").containsInLowerCase(fromString("KKelvin"))); - assertTrue(fromString("2 Kelvin.").containsInLowerCase(fromString("2 Kelvin"))); - assertTrue(fromString("2 Kelvin.").containsInLowerCase(fromString("2 Kelvin"))); - assertFalse(fromString("The KKelvin.").containsInLowerCase(fromString("KKelvin,"))); - // Characters with longer binary lowercase representation - assertTrue(fromString("the İodine").containsInLowerCase(fromString("the i̇odine"))); - assertTrue(fromString("the i̇odine").containsInLowerCase(fromString("the İodine"))); - assertTrue(fromString("The İodiNe").containsInLowerCase(fromString(" i̇oDin"))); - assertTrue(fromString("İodiNe").containsInLowerCase(fromString("i̇oDin"))); - assertFalse(fromString("İodiNe").containsInLowerCase(fromString(" i̇oDin"))); - } - @Test public void startsWith() { assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8)); @@ -263,40 +226,6 @@ public void startsWith() { assertFalse(fromString("大千世界").startsWith(fromString("大千世界好"))); } - @Test - public void startsWithInLowerCase() { - // Corner cases - assertTrue(EMPTY_UTF8.startsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("a").startsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("A").startsWithInLowerCase(fromString("a"))); - assertTrue(fromString("a").startsWithInLowerCase(fromString("A"))); - assertFalse(EMPTY_UTF8.startsWithInLowerCase(fromString("a"))); - // ASCII - assertTrue(fromString("hello").startsWithInLowerCase(fromString("hell"))); - assertFalse(fromString("hello").startsWithInLowerCase(fromString("ell"))); - // Unicode - assertTrue(fromString("大千世界").startsWithInLowerCase(fromString("大千"))); - assertFalse(fromString("大千世界").startsWithInLowerCase(fromString("世千"))); - // ASCII lowercase - assertTrue(fromString("HeLlO").startsWithInLowerCase(fromString("hElL"))); - assertFalse(fromString("HeLlO").startsWithInLowerCase(fromString("ElL"))); - // Unicode lowercase - assertTrue(fromString("ЯбЛоКо").startsWithInLowerCase(fromString("яБлОк"))); - assertFalse(fromString("ЯбЛоКо").startsWithInLowerCase(fromString("БлОк"))); - // Characters with the same binary lowercase representation - assertTrue(fromString("Kelvin.").startsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("Kelvin.").startsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("KKelvin.").startsWithInLowerCase(fromString("KKelvin"))); - assertTrue(fromString("2 Kelvin.").startsWithInLowerCase(fromString("2 Kelvin"))); - assertTrue(fromString("2 Kelvin.").startsWithInLowerCase(fromString("2 Kelvin"))); - assertFalse(fromString("KKelvin.").startsWithInLowerCase(fromString("KKelvin,"))); - // Characters with longer binary lowercase representation - assertTrue(fromString("the İodine").startsWithInLowerCase(fromString("the i̇odine"))); - assertTrue(fromString("the i̇odine").startsWithInLowerCase(fromString("the İodine"))); - assertTrue(fromString("İodiNe").startsWithInLowerCase(fromString("i̇oDin"))); - assertFalse(fromString("The İodiNe").startsWithInLowerCase(fromString("i̇oDin"))); - } - @Test public void endsWith() { assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8)); @@ -308,40 +237,6 @@ public void endsWith() { assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头"))); } - @Test - public void endsWithInLowerCase() { - // Corner cases - assertTrue(EMPTY_UTF8.endsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("a").endsWithInLowerCase(EMPTY_UTF8)); - assertTrue(fromString("A").endsWithInLowerCase(fromString("a"))); - assertTrue(fromString("a").endsWithInLowerCase(fromString("A"))); - assertFalse(EMPTY_UTF8.endsWithInLowerCase(fromString("a"))); - // ASCII - assertTrue(fromString("hello").endsWithInLowerCase(fromString("ello"))); - assertFalse(fromString("hello").endsWithInLowerCase(fromString("hell"))); - // Unicode - assertTrue(fromString("大千世界").endsWithInLowerCase(fromString("世界"))); - assertFalse(fromString("大千世界").endsWithInLowerCase(fromString("大千"))); - // ASCII lowercase - assertTrue(fromString("HeLlO").endsWithInLowerCase(fromString("ElLo"))); - assertFalse(fromString("HeLlO").endsWithInLowerCase(fromString("hElL"))); - // Unicode lowercase - assertTrue(fromString("ЯбЛоКо").endsWithInLowerCase(fromString("БлОкО"))); - assertFalse(fromString("ЯбЛоКо").endsWithInLowerCase(fromString("яБлОк"))); - // Characters with the same binary lowercase representation - assertTrue(fromString("The Kelvin").endsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The Kelvin").endsWithInLowerCase(fromString("Kelvin"))); - assertTrue(fromString("The KKelvin").endsWithInLowerCase(fromString("KKelvin"))); - assertTrue(fromString("The 2 Kelvin").endsWithInLowerCase(fromString("2 Kelvin"))); - assertTrue(fromString("The 2 Kelvin").endsWithInLowerCase(fromString("2 Kelvin"))); - assertFalse(fromString("The KKelvin").endsWithInLowerCase(fromString("KKelvin,"))); - // Characters with longer binary lowercase representation - assertTrue(fromString("the İodine").endsWithInLowerCase(fromString("the i̇odine"))); - assertTrue(fromString("the i̇odine").endsWithInLowerCase(fromString("the İodine"))); - assertTrue(fromString("The İodiNe").endsWithInLowerCase(fromString("i̇oDine"))); - assertFalse(fromString("The İodiNe").endsWithInLowerCase(fromString("i̇oDin"))); - } - @Test public void substring() { assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0)); From b5794fc07388adfeac7b988bc5ecfb79d992412e Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 9 May 2024 17:55:27 +0200 Subject: [PATCH 02/19] Add tests --- .../unsafe/types/CollationSupportSuite.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index a23b10ad20fb..5396214bbcc6 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -102,6 +102,9 @@ public void testContains() throws SparkException { assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); // Case-variable character length + assertContains("i̇", "i", "UNICODE_CI", false); + assertContains("i̇", "İ", "UNICODE_CI", true); + assertContains("İ", "i", "UNICODE_CI", false); assertContains("adi̇os", "io", "UNICODE_CI", false); assertContains("adi̇os", "Io", "UNICODE_CI", false); assertContains("adi̇os", "i̇o", "UNICODE_CI", true); @@ -110,6 +113,9 @@ public void testContains() throws SparkException { assertContains("adİos", "Io", "UNICODE_CI", false); assertContains("adİos", "i̇o", "UNICODE_CI", true); assertContains("adİos", "İo", "UNICODE_CI", true); + assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI + assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("İ", "i", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true); @@ -201,6 +207,9 @@ public void testStartsWith() throws SparkException { assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertStartsWith("KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); // Case-variable character length + assertStartsWith("i̇", "i", "UNICODE_CI", false); + assertStartsWith("i̇", "İ", "UNICODE_CI", true); + assertStartsWith("İ", "i", "UNICODE_CI", false); assertStartsWith("i̇onic", "io", "UNICODE_CI", false); assertStartsWith("i̇onic", "Io", "UNICODE_CI", false); assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true); @@ -209,6 +218,9 @@ public void testStartsWith() throws SparkException { assertStartsWith("İonic", "Io", "UNICODE_CI", false); assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); assertStartsWith("İonic", "İo", "UNICODE_CI", true); + assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI + assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true); @@ -299,6 +311,9 @@ public void testEndsWith() throws SparkException { assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false); // Case-variable character length + assertEndsWith("i̇", "i", "UNICODE_CI", false); + assertEndsWith("i̇", "İ", "UNICODE_CI", true); + assertEndsWith("İ", "i", "UNICODE_CI", false); assertEndsWith("the i̇o", "io", "UNICODE_CI", false); assertEndsWith("the i̇o", "Io", "UNICODE_CI", false); assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true); @@ -307,6 +322,9 @@ public void testEndsWith() throws SparkException { assertEndsWith("the İo", "Io", "UNICODE_CI", false); assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); assertEndsWith("the İo", "İo", "UNICODE_CI", true); + assertEndsWith("i̇", "i", "UTF8_BINARY_LCASE", false); + assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); From dd261bfada6814a7a349ee622f4a754cef49a08f Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 10 May 2024 06:15:31 +0200 Subject: [PATCH 03/19] Fix locate/position --- .../sql/catalyst/util/CollationSupport.java | 34 +++++++++++-------- .../unsafe/types/CollationSupportSuite.java | 10 ++++++ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bfaa1e2d0b55..8a5ce7b1feb7 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -126,19 +126,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - if (r.numBytes() == 0) return true; + if (r.numChars() == 0) return true; if (l.numChars() < r.numChars()) return false; - int lenHaystack = l.numChars(), lenNeedle = r.numChars(); - final UTF8String firstLower = r.substring(0, 1).toLowerCase(); - final UTF8String needle = r.toLowerCase(); - for (var i = 0; i <= (lenHaystack - lenNeedle); i++) { - if (l.substring(i, i + 1).toLowerCase().equals(firstLower)) { - if (CollationAwareUTF8String.lowercaseMatchAt(l, needle, i, lenNeedle)) { - return true; - } - } - } - return false; + return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -379,7 +369,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring return string.indexOf(substring, 0); } public static int execLowercase(final UTF8String string, final UTF8String substring) { - return string.toLowerCase().indexOf(substring.toLowerCase(), 0); + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0); } public static int execICU(final UTF8String string, final UTF8String substring, final int collationId) { @@ -455,7 +445,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring } public static int execLowercase(final UTF8String string, final UTF8String substring, final int start) { - return string.toLowerCase().indexOf(substring.toLowerCase(), start); + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start); } public static int execICU(final UTF8String string, final UTF8String substring, final int start, final int collationId) { @@ -746,6 +736,22 @@ private static int findInSet(final UTF8String match, final UTF8String set, int c return 0; } + private static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; + int lenHaystack = target.numChars(), lenNeedle = pattern.numChars(); + final UTF8String firstLower = pattern.substring(0, 1).toLowerCase(); + final UTF8String needle = pattern.toLowerCase(); + for (int i = start; i <= (lenHaystack - lenNeedle); i++) { + if (target.substring(i, i + 1).toLowerCase().equals(firstLower)) { + if (CollationAwareUTF8String.lowercaseMatchAt(target, needle, i, lenNeedle)) { + return i; + } + } + } + return -1; + } + private static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) { diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 5396214bbcc6..362678f2ddce 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -749,6 +749,16 @@ public void testLocate() throws SparkException { assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); // Case-variable character length + assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2); + assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // note: different from UTF8_BINARY + assertLocate("i", "i̇", 1, "UNICODE_CI", 0); + assertLocate("i̇", "i", 1, "UNICODE_CI", 0); + assertLocate("İ", "i̇", 1, "UNICODE_CI", 1); + assertLocate("İ", "i", 1, "UNICODE_CI", 0); + assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // note: different from UNICODE_CI + assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0); + assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 0); // note: different from UNICODE_CI + assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0); assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1); assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); From 35dd21a517c00fc4e1e21c8a22c0d3ab2e642550 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 10 May 2024 07:05:16 +0200 Subject: [PATCH 04/19] Fix CollationAwareUTF8String.java --- .../util/CollationAwareUTF8String.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e65..81f1fb03ada6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -34,6 +34,15 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r, + int pos, int len) { + if (len + pos > l.numChars() || pos < 0) { + return false; + } + return l.substring(pos, pos + len).toLowerCase().equals(r); + } + public static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String @@ -183,6 +192,22 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; + int lenHaystack = target.numChars(), lenNeedle = pattern.numChars(); + final UTF8String firstLower = pattern.substring(0, 1).toLowerCase(); + final UTF8String needle = pattern.toLowerCase(); + for (int i = start; i <= (lenHaystack - lenNeedle); i++) { + if (target.substring(i, i + 1).toLowerCase().equals(firstLower)) { + if (CollationAwareUTF8String.lowercaseMatchAt(target, needle, i, lenNeedle)) { + return i; + } + } + } + return -1; + } + public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) { @@ -467,4 +492,7 @@ public static UTF8String lowercaseTrimRight( } return srcString.copyUTF8String(0, trimByteIdx); } + + // TODO: Add more collation-aware UTF8String operations here. + } From 0baa71381906fc7efc1a88c89139d5ab7b2025b8 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 10 May 2024 15:08:41 +0200 Subject: [PATCH 05/19] Fix UTF8_BINARY_LCASE string search --- .../util/CollationAwareUTF8String.java | 29 ++++++---- .../sql/catalyst/util/CollationSupport.java | 10 ++-- .../unsafe/types/CollationSupportSuite.java | 54 +++++++++++++------ 3 files changed, 59 insertions(+), 34 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 81f1fb03ada6..9f598e9a6715 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -35,12 +35,24 @@ */ public class CollationAwareUTF8String { - public static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r, - int pos, int len) { - if (len + pos > l.numChars() || pos < 0) { - return false; + public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { + if (pos < 0) return false; + for (int len = 0; len <= l.numChars() - pos; len++) { + if (l.substring(pos, pos + len).toLowerCase().equals(r)) { + return true; + } + } + return false; + } + + public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) { + if (pos > l.numChars()) return false; + for (int len = 1; len <= pos; len++) { + if (l.substring(pos - len, pos).toLowerCase().equals(r)) { + return true; + } } - return l.substring(pos, pos + len).toLowerCase().equals(r); + return false; } public static UTF8String replace(final UTF8String src, final UTF8String search, @@ -196,13 +208,10 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat final int start) { if (pattern.numChars() == 0) return 0; int lenHaystack = target.numChars(), lenNeedle = pattern.numChars(); - final UTF8String firstLower = pattern.substring(0, 1).toLowerCase(); final UTF8String needle = pattern.toLowerCase(); for (int i = start; i <= (lenHaystack - lenNeedle); i++) { - if (target.substring(i, i + 1).toLowerCase().equals(firstLower)) { - if (CollationAwareUTF8String.lowercaseMatchAt(target, needle, i, lenNeedle)) { - return i; - } + if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) { + return i; } } return -1; diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 1710d877ddfd..fa21ed3ebc65 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -118,8 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - if (r.numChars() == 0) return true; - if (l.numChars() < r.numChars()) return false; + if (r.numBytes() == 0) return true; return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; } public static boolean execICU(final UTF8String l, final UTF8String r, @@ -159,8 +158,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { } public static boolean execLowercase(final UTF8String l, final UTF8String r) { if (r.numBytes() == 0) return true; - if (l.numChars() < r.numChars()) return false; - return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), 0, r.numChars()); + return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { @@ -198,9 +196,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { } public static boolean execLowercase(final UTF8String l, final UTF8String r) { if (r.numBytes() == 0) return true; - if (l.numChars() < r.numChars()) return false; - return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), - l.numChars() - r.numChars(), r.numChars()); + return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars()); } public static boolean execICU(final UTF8String l, final UTF8String r, final int collationId) { diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 713e47b419ca..89eba451137e 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -103,6 +103,7 @@ public void testContains() throws SparkException { assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); // Case-variable character length assertContains("i̇", "i", "UNICODE_CI", false); + assertContains("i̇", "̇", "UNICODE_CI", false); assertContains("i̇", "İ", "UNICODE_CI", true); assertContains("İ", "i", "UNICODE_CI", false); assertContains("adi̇os", "io", "UNICODE_CI", false); @@ -113,16 +114,17 @@ public void testContains() throws SparkException { assertContains("adİos", "Io", "UNICODE_CI", false); assertContains("adİos", "i̇o", "UNICODE_CI", true); assertContains("adİos", "İo", "UNICODE_CI", true); - assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI - assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertContains("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true); assertContains("İ", "i", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true); - assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", true); assertContains("adİos", "io", "UTF8_BINARY_LCASE", false); assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false); - assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", true); assertContains("adİos", "İo", "UTF8_BINARY_LCASE", true); } @@ -210,6 +212,10 @@ public void testStartsWith() throws SparkException { assertStartsWith("i̇", "i", "UNICODE_CI", false); assertStartsWith("i̇", "İ", "UNICODE_CI", true); assertStartsWith("İ", "i", "UNICODE_CI", false); + assertStartsWith("İİİ", "i̇i̇", "UNICODE_CI", true); + assertStartsWith("İİİ", "i̇i", "UNICODE_CI", false); + assertStartsWith("İi̇İ", "i̇İ", "UNICODE_CI", true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UNICODE_CI", false); assertStartsWith("i̇onic", "io", "UNICODE_CI", false); assertStartsWith("i̇onic", "Io", "UNICODE_CI", false); assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true); @@ -218,16 +224,20 @@ public void testStartsWith() throws SparkException { assertStartsWith("İonic", "Io", "UNICODE_CI", false); assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); assertStartsWith("İonic", "İo", "UNICODE_CI", true); - assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI - assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false); + assertStartsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); + assertStartsWith("İİİ", "i̇i", "UTF8_BINARY_LCASE", false); + assertStartsWith("İi̇İ", "i̇İ", "UTF8_BINARY_LCASE", true); + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true); - assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", true); assertStartsWith("İonic", "io", "UTF8_BINARY_LCASE", false); assertStartsWith("İonic", "Io", "UTF8_BINARY_LCASE", false); - assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", true); assertStartsWith("İonic", "İo", "UTF8_BINARY_LCASE", true); } @@ -311,9 +321,13 @@ public void testEndsWith() throws SparkException { assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false); // Case-variable character length - assertEndsWith("i̇", "i", "UNICODE_CI", false); + assertEndsWith("i̇", "̇", "UNICODE_CI", false); assertEndsWith("i̇", "İ", "UNICODE_CI", true); assertEndsWith("İ", "i", "UNICODE_CI", false); + assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true); + assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false); + assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true); + assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UNICODE_CI", false); assertEndsWith("the i̇o", "io", "UNICODE_CI", false); assertEndsWith("the i̇o", "Io", "UNICODE_CI", false); assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true); @@ -322,16 +336,20 @@ public void testEndsWith() throws SparkException { assertEndsWith("the İo", "Io", "UNICODE_CI", false); assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); assertEndsWith("the İo", "İo", "UNICODE_CI", true); - assertEndsWith("i̇", "i", "UTF8_BINARY_LCASE", false); - assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertEndsWith("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false); + assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); + assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false); + assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true); + assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); - assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", true); assertEndsWith("the İo", "io", "UTF8_BINARY_LCASE", false); assertEndsWith("the İo", "Io", "UTF8_BINARY_LCASE", false); - assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", true); assertEndsWith("the İo", "İo", "UTF8_BINARY_LCASE", true); } @@ -750,21 +768,23 @@ public void testLocate() throws SparkException { assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); // Case-variable character length assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2); - assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // note: different from UTF8_BINARY + assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY assertLocate("i", "i̇", 1, "UNICODE_CI", 0); + assertLocate("̇", "i̇", 1, "UNICODE_CI", 0); assertLocate("i̇", "i", 1, "UNICODE_CI", 0); assertLocate("İ", "i̇", 1, "UNICODE_CI", 1); assertLocate("İ", "i", 1, "UNICODE_CI", 0); - assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // note: different from UNICODE_CI + assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI + assertLocate("̇", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0); - assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 0); // note: different from UNICODE_CI + assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1); assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0); assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1); assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); assertLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1); assertLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3); - assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); // 12 instead of 11 + assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); } private void assertSubstringIndex(String string, String delimiter, Integer count, From 37e44ad282cb0eb80b1223584923552fd48cb047 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 14 May 2024 11:17:32 +0200 Subject: [PATCH 06/19] Fix tests --- .../unsafe/types/CollationSupportSuite.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 89eba451137e..3980bff59887 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -103,7 +103,7 @@ public void testContains() throws SparkException { assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); // Case-variable character length assertContains("i̇", "i", "UNICODE_CI", false); - assertContains("i̇", "̇", "UNICODE_CI", false); + assertContains("i̇", "\u0307", "UNICODE_CI", false); assertContains("i̇", "İ", "UNICODE_CI", true); assertContains("İ", "i", "UNICODE_CI", false); assertContains("adi̇os", "io", "UNICODE_CI", false); @@ -115,7 +115,7 @@ public void testContains() throws SparkException { assertContains("adİos", "i̇o", "UNICODE_CI", true); assertContains("adİos", "İo", "UNICODE_CI", true); assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI - assertContains("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true); assertContains("İ", "i", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); @@ -321,13 +321,13 @@ public void testEndsWith() throws SparkException { assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true); assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false); // Case-variable character length - assertEndsWith("i̇", "̇", "UNICODE_CI", false); + assertEndsWith("i̇", "\u0307", "UNICODE_CI", false); assertEndsWith("i̇", "İ", "UNICODE_CI", true); assertEndsWith("İ", "i", "UNICODE_CI", false); assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true); assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false); assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true); - assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UNICODE_CI", false); + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UNICODE_CI", false); assertEndsWith("the i̇o", "io", "UNICODE_CI", false); assertEndsWith("the i̇o", "Io", "UNICODE_CI", false); assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true); @@ -336,13 +336,13 @@ public void testEndsWith() throws SparkException { assertEndsWith("the İo", "Io", "UNICODE_CI", false); assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); assertEndsWith("the İo", "İo", "UNICODE_CI", true); - assertEndsWith("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false); assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false); assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true); - assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); @@ -767,15 +767,15 @@ public void testLocate() throws SparkException { assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); // Case-variable character length - assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2); - assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY + assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2); + assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY assertLocate("i", "i̇", 1, "UNICODE_CI", 0); - assertLocate("̇", "i̇", 1, "UNICODE_CI", 0); + assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0); assertLocate("i̇", "i", 1, "UNICODE_CI", 0); assertLocate("İ", "i̇", 1, "UNICODE_CI", 1); assertLocate("İ", "i", 1, "UNICODE_CI", 0); assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI - assertLocate("̇", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI + assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0); assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1); assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0); From 6f0ceb5e01ef73f449445402272f254924222891 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 14 May 2024 12:08:26 +0200 Subject: [PATCH 07/19] Minor code fixes --- .../util/CollationAwareUTF8String.java | 28 +++++++++++++------ .../sql/catalyst/util/CollationSupport.java | 3 -- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 9f598e9a6715..fb7f24de8c95 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -35,20 +35,30 @@ */ public class CollationAwareUTF8String { - public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { - if (pos < 0) return false; - for (int len = 0; len <= l.numChars() - pos; len++) { - if (l.substring(pos, pos + len).toLowerCase().equals(r)) { + public static boolean lowercaseMatchFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string, + // this method assumes that the pattern string is already lowercased prior to method call. + assert startPos >= 0; + for (int len = 0; len <= target.numChars() - startPos; ++len) { + if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { return true; } } return false; } - public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) { - if (pos > l.numChars()) return false; - for (int len = 1; len <= pos; len++) { - if (l.substring(pos - len, pos).toLowerCase().equals(r)) { + public static boolean lowercaseMatchUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string, + // this method assumes that the pattern string is already lowercased prior to method call. + assert endPos <= target.numChars(); + for (int len = 0; len <= endPos; ++len) { + if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { return true; } } @@ -209,7 +219,7 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat if (pattern.numChars() == 0) return 0; int lenHaystack = target.numChars(), lenNeedle = pattern.numChars(); final UTF8String needle = pattern.toLowerCase(); - for (int i = start; i <= (lenHaystack - lenNeedle); i++) { + for (int i = start; i <= (lenHaystack - lenNeedle); ++i) { if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) { return i; } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index fa21ed3ebc65..d5bcc61bac2a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -118,7 +118,6 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - if (r.numBytes() == 0) return true; return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0; } public static boolean execICU(final UTF8String l, final UTF8String r, @@ -157,7 +156,6 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.startsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - if (r.numBytes() == 0) return true; return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0); } public static boolean execICU(final UTF8String l, final UTF8String r, @@ -195,7 +193,6 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.endsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - if (r.numBytes() == 0) return true; return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars()); } public static boolean execICU(final UTF8String l, final UTF8String r, From aa91b15576ccb6eda6c1a60b91c8b2481c74cbc3 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 14 May 2024 12:21:38 +0200 Subject: [PATCH 08/19] Add doc comments --- .../util/CollationAwareUTF8String.java | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index fb7f24de8c95..be13be64044f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -35,12 +35,21 @@ */ public class CollationAwareUTF8String { + /** + * Returns whether the target string starts with the specified prefix, + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the prefix is already lowercased prior to method call to avoid the + * overhead of calling .toLowerCase() multiple times on the same prefix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE + */ public static boolean lowercaseMatchFrom( final UTF8String target, final UTF8String lowercasePattern, int startPos) { - // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string, - // this method assumes that the pattern string is already lowercased prior to method call. assert startPos >= 0; for (int len = 0; len <= target.numChars() - startPos; ++len) { if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { @@ -50,6 +59,17 @@ public static boolean lowercaseMatchFrom( return false; } + /** + * Returns whether the target string ends with the specified suffix, + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the suffix is already lowercased prior to method call to avoid the + * overhead of calling .toLowerCase() multiple times on the same suffix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return whether the target string ends with the specified suffix in UTF8_BINARY_LCASE + */ public static boolean lowercaseMatchUntil( final UTF8String target, final UTF8String lowercasePattern, @@ -214,6 +234,16 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + /** + * Returns the position of the first occurrence of the pattern string + * in the target string from the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. + * + * @param target the string to be searched in + * @param pattern the string to be searched for + * @param start the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + */ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, final int start) { if (pattern.numChars() == 0) return 0; From f72998cc040a2a069714f0dc094b116f6f1cded5 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 14 May 2024 13:41:26 +0200 Subject: [PATCH 09/19] Add back some tests --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 3980bff59887..f09f7a7f3ff2 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -101,6 +101,13 @@ public void testContains() throws SparkException { assertContains("ab世De", "AB世dE", "UNICODE_CI", true); assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); + // Characters with the same binary lowercase representation + assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); + assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); + assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true); + assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); + assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); + assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); // Case-variable character length assertContains("i̇", "i", "UNICODE_CI", false); assertContains("i̇", "\u0307", "UNICODE_CI", false); From 86bf00bccb847867e0b291a9ab87b66ba930cc5c Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 14 May 2024 20:52:18 +0200 Subject: [PATCH 10/19] Add support for StringInStr & SubstringIndex --- .../util/CollationAwareUTF8String.java | 159 ++++++++++++------ .../unsafe/types/CollationSupportSuite.java | 26 ++- 2 files changed, 133 insertions(+), 52 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index be13be64044f..e2834d7abf06 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -35,6 +35,12 @@ */ public class CollationAwareUTF8String { + /** + * The constant value to indicate that the match is not found + * when searching for a pattern string in a target string. + */ + private static final int MATCH_NOT_FOUND = -1; + /** * Returns whether the target string starts with the specified prefix, * with respect to the UTF8_BINARY_LCASE collation. The method assumes @@ -50,13 +56,53 @@ public static boolean lowercaseMatchFrom( final UTF8String target, final UTF8String lowercasePattern, int startPos) { + return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that starts with + * the specified prefix, with respect to the UTF8_BINARY_LCASE collation. + * The method assumes that the prefix is already lowercased. The method only + * considers the part of target string that starts from the specified position. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + public static int lowercaseMatchLengthFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { assert startPos >= 0; for (int len = 0; len <= target.numChars() - startPos; ++len) { if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { - return true; + return len; } } - return false; + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the first occurrence of the pattern string + * in the target string from the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the pattern string is already lowercased prior to method call. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + */ + public static int lowercaseFind( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + for (int i = startPos; i <= target.numChars(); ++i) { + if (lowercaseMatchFrom(target, lowercasePattern, i)) + return i; + } + return MATCH_NOT_FOUND; } /** @@ -68,21 +114,59 @@ public static boolean lowercaseMatchFrom( * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param endPos the end position for searching (in the target string) - * @return whether the target string ends with the specified suffix in UTF8_BINARY_LCASE + * @return whether the target string ends with the specified suffix in lowercase */ public static boolean lowercaseMatchUntil( - final UTF8String target, - final UTF8String lowercasePattern, - int endPos) { - // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string, - // this method assumes that the pattern string is already lowercased prior to method call. + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that ends with + * the specified suffix, with respect to the UTF8_BINARY_LCASE collation. + * The method assumes that the suffix is already lowercased. The method only + * considers the part of target string that ends at the specified position. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + public static int lowercaseMatchLengthUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { assert endPos <= target.numChars(); for (int len = 0; len <= endPos; ++len) { if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { - return true; + return len; } } - return false; + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the last occurrence of the pattern string + * in the target string until the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. The method assumes + * that the pattern string is already lowercased prior to method call. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return the position of the last occurrence of pattern in target, if not found, -1 returned. + */ + public static int lowercaseRFind( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + for (int i = endPos; i >= 0; --i) { + if (lowercaseMatchUntil(target, lowercasePattern, i)) + return i; + } + return MATCH_NOT_FOUND; } public static UTF8String replace(final UTF8String src, final UTF8String search, @@ -247,14 +331,7 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, final int start) { if (pattern.numChars() == 0) return 0; - int lenHaystack = target.numChars(), lenNeedle = pattern.numChars(); - final UTF8String needle = pattern.toLowerCase(); - for (int i = start; i <= (lenHaystack - lenNeedle); ++i) { - if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) { - return i; - } - } - return -1; + return lowercaseFind(target, pattern.toLowerCase(), start); } public static int indexOf(final UTF8String target, final UTF8String pattern, @@ -352,47 +429,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + if (matchLength == 0) return UTF8String.EMPTY_UTF8; + return string.substring(0, matchLength); } else { - int idx = string.numBytes() - delimiter.numBytes() + 1; + // search right to left (note: the end code point is exclusive) + int matchLength = string.numChars() + 1; count = -count; while (count > 0) { - idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } + matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - if (idx + delimiter.numBytes() == string.numBytes()) { - return UTF8String.EMPTY_UTF8; - } - int size = string.numBytes() - delimiter.numBytes() - idx; - byte[] bytes = new byte[size]; - copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(), - bytes, BYTE_ARRAY_OFFSET, size); - return UTF8String.fromBytes(bytes); + if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8; + return string.substring(matchLength, string.numChars()); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index f09f7a7f3ff2..6586beac34b1 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -632,8 +632,18 @@ public void testStringInstr() throws SparkException { assertStringInstr("aaads", "dS", "UNICODE_CI", 4); assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3); - assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); + assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); + assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); + assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3); + assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); + assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); + assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1); + assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1); + assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3); + assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5); + assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7); } private void assertFindInSet(String word, String set, String collationName, @@ -875,6 +885,18 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); } private void assertStringTrim( From 77904faf2f28dd07b841266039c38949c1fcae54 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 14 May 2024 22:36:33 +0200 Subject: [PATCH 11/19] Unicode java linter checks --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 6586beac34b1..552bc11809d3 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { /** @@ -1107,3 +1107,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. } +// checkstyle.on: AvoidEscapedUnicodeCharacters From 5b474992c5a6216825aea0902d4e332d51669994 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 15 May 2024 06:37:12 +0200 Subject: [PATCH 12/19] Java style fix --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index e2834d7abf06..598f334af813 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -99,8 +99,9 @@ public static int lowercaseFind( final UTF8String lowercasePattern, int startPos) { for (int i = startPos; i <= target.numChars(); ++i) { - if (lowercaseMatchFrom(target, lowercasePattern, i)) + if (lowercaseMatchFrom(target, lowercasePattern, i)) { return i; + } } return MATCH_NOT_FOUND; } @@ -163,8 +164,9 @@ public static int lowercaseRFind( final UTF8String lowercasePattern, int endPos) { for (int i = endPos; i >= 0; --i) { - if (lowercaseMatchUntil(target, lowercasePattern, i)) + if (lowercaseMatchUntil(target, lowercasePattern, i)) { return i; + } } return MATCH_NOT_FOUND; } From aceeba885c369bbc62af8279f8b6b03a80e31e3a Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 15 May 2024 12:42:33 +0200 Subject: [PATCH 13/19] Undo some changes --- .../util/CollationAwareUTF8String.java | 48 +++++++++++++------ .../sql/catalyst/util/CollationSupport.java | 2 +- .../unsafe/types/CollationSupportSuite.java | 26 +--------- 3 files changed, 36 insertions(+), 40 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 598f334af813..f1648042d587 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -426,34 +426,52 @@ public static UTF8String subStringIndex(final UTF8String string, final UTF8Strin } public static UTF8String lowercaseSubStringIndex(final UTF8String string, - final UTF8String delimiter, int count) { + final UTF8String delimiter, int count) { if (delimiter.numBytes() == 0 || count == 0) { return UTF8String.EMPTY_UTF8; } + UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - // search left to right (note: the start code point is inclusive) - int matchLength = -1; + int idx = -1; while (count > 0) { - matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); - if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter - else return string; // cannot find enough delimiters in the string + idx = lowercaseString.find(lowercaseDelimiter, idx + 1); + if (idx >= 0) { + count--; + } else { + // can not find enough delim + return string; + } + } + if (idx == 0) { + return UTF8String.EMPTY_UTF8; } - if (matchLength == 0) return UTF8String.EMPTY_UTF8; - return string.substring(0, matchLength); + byte[] bytes = new byte[idx]; + copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); + return UTF8String.fromBytes(bytes); + } else { - // search right to left (note: the end code point is exclusive) - int matchLength = string.numChars() + 1; + int idx = string.numBytes() - delimiter.numBytes() + 1; count = -count; while (count > 0) { - matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); - if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter - else return string; // cannot find enough delimiters in the string + idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1); + if (idx >= 0) { + count--; + } else { + // can not find enough delim + return string; + } } - if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8; - return string.substring(matchLength, string.numChars()); + if (idx + delimiter.numBytes() == string.numBytes()) { + return UTF8String.EMPTY_UTF8; + } + int size = string.numBytes() - delimiter.numBytes() - idx; + byte[] bytes = new byte[size]; + copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(), + bytes, BYTE_ARRAY_OFFSET, size); + return UTF8String.fromBytes(bytes); } } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index d5bcc61bac2a..8f7aed30464c 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring return string.indexOf(substring, 0); } public static int execLowercase(final UTF8String string, final UTF8String substring) { - return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0); + return string.toLowerCase().indexOf(substring.toLowerCase(), 0); } public static int execICU(final UTF8String string, final UTF8String substring, final int collationId) { diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 552bc11809d3..ed1e15fab883 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -632,18 +632,8 @@ public void testStringInstr() throws SparkException { assertStringInstr("aaads", "dS", "UNICODE_CI", 4); assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); - assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); - assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3); - assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); - assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); - assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); - assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1); - assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1); - assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3); - assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3); - assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5); - assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7); + assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3); + assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); } private void assertFindInSet(String word, String set, String collationName, @@ -885,18 +875,6 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); - assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); } private void assertStringTrim( From 3945137e872e3d9b73d4f19c45fb55b4dbb003de Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 15 May 2024 12:44:51 +0200 Subject: [PATCH 14/19] Style fixes --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index f1648042d587..e1efa32039ab 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -426,7 +426,7 @@ public static UTF8String subStringIndex(final UTF8String string, final UTF8Strin } public static UTF8String lowercaseSubStringIndex(final UTF8String string, - final UTF8String delimiter, int count) { + final UTF8String delimiter, int count) { if (delimiter.numBytes() == 0 || count == 0) { return UTF8String.EMPTY_UTF8; } @@ -470,7 +470,7 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, int size = string.numBytes() - delimiter.numBytes() - idx; byte[] bytes = new byte[size]; copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(), - bytes, BYTE_ARRAY_OFFSET, size); + bytes, BYTE_ARRAY_OFFSET, size); return UTF8String.fromBytes(bytes); } } From 30774a37d01b40062125d71c0d38ed046cf8db88 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 15 May 2024 12:47:59 +0200 Subject: [PATCH 15/19] Update comments --- .../unsafe/types/CollationSupportSuite.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index ed1e15fab883..5843edbb5ecf 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -121,8 +121,8 @@ public void testContains() throws SparkException { assertContains("adİos", "Io", "UNICODE_CI", false); assertContains("adİos", "i̇o", "UNICODE_CI", true); assertContains("adİos", "İo", "UNICODE_CI", true); - assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI - assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true); assertContains("İ", "i", "UTF8_BINARY_LCASE", false); assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); @@ -231,13 +231,13 @@ public void testStartsWith() throws SparkException { assertStartsWith("İonic", "Io", "UNICODE_CI", false); assertStartsWith("İonic", "i̇o", "UNICODE_CI", true); assertStartsWith("İonic", "İo", "UNICODE_CI", true); - assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false); assertStartsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); assertStartsWith("İİİ", "i̇i", "UTF8_BINARY_LCASE", false); assertStartsWith("İi̇İ", "i̇İ", "UTF8_BINARY_LCASE", true); - assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false); assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true); @@ -343,13 +343,13 @@ public void testEndsWith() throws SparkException { assertEndsWith("the İo", "Io", "UNICODE_CI", false); assertEndsWith("the İo", "i̇o", "UNICODE_CI", true); assertEndsWith("the İo", "İo", "UNICODE_CI", true); - assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false); assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false); assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true); - assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI + assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); @@ -775,14 +775,14 @@ public void testLocate() throws SparkException { assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); // Case-variable character length assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2); - assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY + assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // != UTF8_BINARY assertLocate("i", "i̇", 1, "UNICODE_CI", 0); assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0); assertLocate("i̇", "i", 1, "UNICODE_CI", 0); assertLocate("İ", "i̇", 1, "UNICODE_CI", 1); assertLocate("İ", "i", 1, "UNICODE_CI", 0); - assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI - assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI + assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // != UNICODE_CI + assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // != UNICODE_CI assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0); assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1); assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0); From 68bada3a79962760c150a5c68a25c078adf968cc Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 17 May 2024 09:46:33 +0200 Subject: [PATCH 16/19] Small fixes --- .../util/CollationAwareUTF8String.java | 75 +++++++++++-------- .../unsafe/types/CollationSupportSuite.java | 5 +- 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index e1efa32039ab..678aef49bceb 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -36,16 +36,17 @@ public class CollationAwareUTF8String { /** - * The constant value to indicate that the match is not found - * when searching for a pattern string in a target string. + * The constant value to indicate that the match is not found when searching for a pattern + * string in a target string. */ private static final int MATCH_NOT_FOUND = -1; /** - * Returns whether the target string starts with the specified prefix, - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the prefix is already lowercased prior to method call to avoid the - * overhead of calling .toLowerCase() multiple times on the same prefix string. + * Returns whether the target string starts with the specified prefix, starting from the + * specified position (0-based index referring to character position in UTF8String), with respect + * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased + * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the + * same prefix string. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for @@ -60,14 +61,17 @@ public static boolean lowercaseMatchFrom( } /** - * Returns the length of the substring of the target string that starts with - * the specified prefix, with respect to the UTF8_BINARY_LCASE collation. - * The method assumes that the prefix is already lowercased. The method only - * considers the part of target string that starts from the specified position. + * Returns the length of the substring of the target string that starts with the specified + * prefix, starting from the specified position (0-based index referring to character position + * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * prefix is already lowercased. The method only considers the part of target string that + * starts from the specified (inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). If the prefix is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for - * @param startPos the end position for searching (in the target string) + * @param startPos the start position for searching (in the target string) * @return length of the target substring that ends with the specified suffix in lowercase */ public static int lowercaseMatchLengthFrom( @@ -84,20 +88,22 @@ public static int lowercaseMatchLengthFrom( } /** - * Returns the position of the first occurrence of the pattern string - * in the target string from the specified position (0-based index), - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the pattern string is already lowercased prior to method call. + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the prefix is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param startPos the start position for searching (in the target string) - * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + * @return the position of the first occurrence of pattern in target */ public static int lowercaseFind( final UTF8String target, final UTF8String lowercasePattern, int startPos) { + assert startPos >= 0; for (int i = startPos; i <= target.numChars(); ++i) { if (lowercaseMatchFrom(target, lowercasePattern, i)) { return i; @@ -107,10 +113,11 @@ public static int lowercaseFind( } /** - * Returns whether the target string ends with the specified suffix, - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the suffix is already lowercased prior to method call to avoid the - * overhead of calling .toLowerCase() multiple times on the same suffix string. + * Returns whether the target string ends with the specified suffix, ending at the specified + * position (0-based index referring to character position in UTF8String), with respect to the + * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior + * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same + * suffix string. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for @@ -125,10 +132,13 @@ public static boolean lowercaseMatchUntil( } /** - * Returns the length of the substring of the target string that ends with - * the specified suffix, with respect to the UTF8_BINARY_LCASE collation. - * The method assumes that the suffix is already lowercased. The method only - * considers the part of target string that ends at the specified position. + * Returns the length of the substring of the target string that ends with the specified + * suffix, ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * suffix is already lowercased. The method only considers the part of target string that ends + * at the specified (non-inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). If the suffix is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for @@ -149,20 +159,22 @@ public static int lowercaseMatchLengthUntil( } /** - * Returns the position of the last occurrence of the pattern string - * in the target string until the specified position (0-based index), - * with respect to the UTF8_BINARY_LCASE collation. The method assumes - * that the pattern string is already lowercased prior to method call. + * Returns the position of the last occurrence of the pattern string in the target string, + * ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the suffix is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param endPos the end position for searching (in the target string) - * @return the position of the last occurrence of pattern in target, if not found, -1 returned. + * @return the position of the last occurrence of pattern in target */ public static int lowercaseRFind( final UTF8String target, final UTF8String lowercasePattern, int endPos) { + assert endPos <= target.numChars(); for (int i = endPos; i >= 0; --i) { if (lowercaseMatchUntil(target, lowercasePattern, i)) { return i; @@ -321,9 +333,8 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co } /** - * Returns the position of the first occurrence of the pattern string - * in the target string from the specified position (0-based index), - * with respect to the UTF8_BINARY_LCASE collation. + * Returns the position of the first occurrence of the pattern string in the target string from + * the specified position (0-based index), with respect to the UTF8_BINARY_LCASE collation. * * @param target the string to be searched in * @param pattern the string to be searched for diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 5843edbb5ecf..eb18d7665b09 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -122,6 +122,8 @@ public void testContains() throws SparkException { assertContains("adİos", "i̇o", "UNICODE_CI", true); assertContains("adİos", "İo", "UNICODE_CI", true); assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertContains("İ", "\u0307", "UTF8_BINARY_LCASE", false); + assertContains("İ", "i", "UTF8_BINARY_LCASE", false); assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true); assertContains("İ", "i", "UTF8_BINARY_LCASE", false); @@ -345,11 +347,12 @@ public void testEndsWith() throws SparkException { assertEndsWith("the İo", "İo", "UNICODE_CI", true); assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true); - assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false); + assertEndsWith("İ", "\u0307", "UTF8_BINARY_LCASE", false); assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true); assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false); assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true); assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // != UNICODE_CI + assertEndsWith("i̇İi̇i̇", "\u0307İİ", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false); assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true); From 8f59422c639396ac61d9a281c72625545491058d Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 17 May 2024 10:28:55 +0200 Subject: [PATCH 17/19] Small fixes --- .../catalyst/util/CollationAwareUTF8String.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 678aef49bceb..9ae5d68a88cd 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -72,7 +72,7 @@ public static boolean lowercaseMatchFrom( * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param startPos the start position for searching (in the target string) - * @return length of the target substring that ends with the specified suffix in lowercase + * @return length of the target substring that ends with the specified prefix in lowercase */ public static int lowercaseMatchLengthFrom( final UTF8String target, @@ -91,7 +91,7 @@ public static int lowercaseMatchLengthFrom( * Returns the position of the first occurrence of the pattern string in the target string, * starting from the specified position (0-based index referring to character position in * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the - * pattern string is already lowercased prior to call. If the prefix is not found, + * pattern string is already lowercased prior to call. If the pattern is not found, * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in @@ -162,7 +162,7 @@ public static int lowercaseMatchLengthUntil( * Returns the position of the last occurrence of the pattern string in the target string, * ending at the specified position (0-based index referring to character position in * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the - * pattern string is already lowercased prior to call. If the suffix is not found, + * pattern string is already lowercased prior to call. If the pattern is not found, * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in @@ -333,13 +333,15 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co } /** - * Returns the position of the first occurrence of the pattern string in the target string from - * the specified position (0-based index), with respect to the UTF8_BINARY_LCASE collation. + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found, + * MATCH_NOT_FOUND is returned. * * @param target the string to be searched in * @param pattern the string to be searched for * @param start the start position for searching (in the target string) - * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + * @return the position of the first occurrence of pattern in target */ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, final int start) { From 4b9b83e54640b7a4435e2ad3430f2e4edca90e35 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 15:11:53 +0200 Subject: [PATCH 18/19] Update new method access --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 9ae5d68a88cd..027030c8d97f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -74,7 +74,7 @@ public static boolean lowercaseMatchFrom( * @param startPos the start position for searching (in the target string) * @return length of the target substring that ends with the specified prefix in lowercase */ - public static int lowercaseMatchLengthFrom( + private static int lowercaseMatchLengthFrom( final UTF8String target, final UTF8String lowercasePattern, int startPos) { @@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom( * @param startPos the start position for searching (in the target string) * @return the position of the first occurrence of pattern in target */ - public static int lowercaseFind( + private static int lowercaseFind( final UTF8String target, final UTF8String lowercasePattern, int startPos) { @@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil( * @param endPos the end position for searching (in the target string) * @return length of the target substring that ends with the specified suffix in lowercase */ - public static int lowercaseMatchLengthUntil( + private static int lowercaseMatchLengthUntil( final UTF8String target, final UTF8String lowercasePattern, int endPos) { @@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil( * @param endPos the end position for searching (in the target string) * @return the position of the last occurrence of pattern in target */ - public static int lowercaseRFind( + private static int lowercaseRFind( final UTF8String target, final UTF8String lowercasePattern, int endPos) { From eb602e8a2f61909df5d2e4042c7daa72456fb509 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 14:30:11 +0200 Subject: [PATCH 19/19] Update doc comment --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 027030c8d97f..0d0094d8d0a0 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -72,7 +72,7 @@ public static boolean lowercaseMatchFrom( * @param target the string to be searched in * @param lowercasePattern the string to be searched for * @param startPos the start position for searching (in the target string) - * @return length of the target substring that ends with the specified prefix in lowercase + * @return length of the target substring that starts with the specified prefix in lowercase */ private static int lowercaseMatchLengthFrom( final UTF8String target,