From 4981a14cfb130b34936c699082a74afb2d124f41 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Thu, 9 May 2024 17:31:07 +0200
Subject: [PATCH 01/19] Fix string search in contains/startsWith/endsWith

---
 .../sql/catalyst/util/CollationSupport.java   |  31 ++++-
 .../apache/spark/unsafe/types/UTF8String.java | 118 ------------------
 .../unsafe/types/CollationSupportSuite.java   |  78 +++++++-----
 .../spark/unsafe/types/UTF8StringSuite.java   | 105 ----------------
 4 files changed, 78 insertions(+), 254 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index b77671cee90b..bfaa1e2d0b55 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -126,7 +126,19 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.contains(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.containsInLowerCase(r);
+      if (r.numBytes() == 0) return true;
+      if (l.numChars() < r.numChars()) return false;
+      int lenHaystack = l.numChars(), lenNeedle = r.numChars();
+      final UTF8String firstLower = r.substring(0, 1).toLowerCase();
+      final UTF8String needle = r.toLowerCase();
+      for (var i = 0; i <= (lenHaystack - lenNeedle); i++) {
+        if (l.substring(i, i + 1).toLowerCase().equals(firstLower)) {
+          if (CollationAwareUTF8String.lowercaseMatchAt(l, needle, i, lenNeedle)) {
+            return true;
+          }
+        }
+      }
+      return false;
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -164,7 +176,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.startsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.startsWithInLowerCase(r);
+      if (r.numBytes() == 0) return true;
+      if (l.numChars() < r.numChars()) return false;
+      return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), 0, r.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -201,7 +215,10 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.endsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.endsWithInLowerCase(r);
+      if (r.numBytes() == 0) return true;
+      if (l.numChars() < r.numChars()) return false;
+      return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(),
+        l.numChars() - r.numChars(), r.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -572,6 +589,14 @@ public static UTF8String collationAwareRegex(final UTF8String regex, final int c
 
   private static class CollationAwareUTF8String {
 
+    private static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r,
+        int pos, int len) {
+      if (len + pos > l.numChars() || pos < 0) {
+        return false;
+      }
+      return l.substring(pos, pos + len).toLowerCase().equals(r);
+    }
+
     private static UTF8String replace(final UTF8String src, final UTF8String search,
         final UTF8String replace, final int collationId) {
       // This collation aware implementation is based on existing implementation on UTF8String
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 2a5d14580353..1427e53323d3 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) {
     return false;
   }
 
-  /**
-   * Returns whether `this` contains `substring` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean containsInLowerCase(final UTF8String substring) {
-    if (substring.numBytes == 0) {
-      return true;
-    }
-
-    // Both `this` and the `substring` are checked for non-ASCII characters, otherwise we would
-    // have to use `startsWithLowerCase(...)` in a loop, and it would frequently allocate
-    // (e.g. in case of `containsInLowerCase("1大1大1大...", "11")`)
-    if (!substring.isFullAscii()) {
-      return toLowerCase().contains(substring.toLowerCaseSlow());
-    }
-    if (!isFullAscii()) {
-      return toLowerCaseSlow().contains(substring.toLowerCaseAscii());
-    }
-
-    if (numBytes < substring.numBytes) {
-      return false;
-    }
-
-    final var firstLower = Character.toLowerCase(substring.getByte(0));
-    for (var i = 0; i <= (numBytes - substring.numBytes); i++) {
-      if (Character.toLowerCase(getByte(i)) == firstLower) {
-        final var rest = UTF8String.fromAddress(base, offset + i, numBytes - i);
-        if (rest.matchAtInLowerCaseAscii(substring, 0)) {
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
   /**
    * Returns the byte at position `i`.
    */
@@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
     return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
   }
 
-  private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) {
-    if (s.numBytes + pos > numBytes || pos < 0) {
-      return false;
-    }
-
-    for (var i = 0; i < s.numBytes; i++) {
-      if (Character.toLowerCase(getByte(pos + i)) != Character.toLowerCase(s.getByte(i))) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
   public boolean startsWith(final UTF8String prefix) {
     return matchAt(prefix, 0);
   }
 
-  /**
-   * Checks whether `prefix` is a prefix of `this` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean startsWithInLowerCase(final UTF8String prefix) {
-    // No way to match sizes of strings for early return, since single grapheme can be expanded
-    // into several independent ones in lowercase
-    if (prefix.numBytes == 0) {
-      return true;
-    }
-    if (numBytes == 0) {
-      return false;
-    }
-
-    if (!prefix.isFullAscii()) {
-      return toLowerCase().startsWith(prefix.toLowerCaseSlow());
-    }
-
-    final var part = prefix.numBytes >= numBytes ? this : UTF8String.fromAddress(
-      base, offset, prefix.numBytes);
-    if (!part.isFullAscii()) {
-      return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii());
-    }
-
-    if (numBytes < prefix.numBytes) {
-      return false;
-    }
-
-    return matchAtInLowerCaseAscii(prefix, 0);
-  }
-
   public boolean endsWith(final UTF8String suffix) {
     return matchAt(suffix, numBytes - suffix.numBytes);
   }
 
-  /**
-   * Checks whether `suffix` is a suffix of `this` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean endsWithInLowerCase(final UTF8String suffix) {
-    // No way to match sizes of strings for early return, since single grapheme can be expanded
-    // into several independent ones in lowercase
-    if (suffix.numBytes == 0) {
-      return true;
-    }
-    if (numBytes == 0) {
-      return false;
-    }
-
-    if (!suffix.isFullAscii()) {
-      return toLowerCase().endsWith(suffix.toLowerCaseSlow());
-    }
-
-    final var part = suffix.numBytes >= numBytes ? this : UTF8String.fromAddress(
-      base, offset + numBytes - suffix.numBytes, suffix.numBytes);
-    if (!part.isFullAscii()) {
-      return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii());
-    }
-
-    if (numBytes < suffix.numBytes) {
-      return false;
-    }
-
-    return matchAtInLowerCaseAscii(suffix, numBytes - suffix.numBytes);
-  }
-
   /**
    * Returns the upper case of this string
    */
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 2f05b9ad88c9..a23b10ad20fb 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -102,20 +102,22 @@ public void testContains() throws SparkException {
     assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
     // Case-variable character length
-    assertContains("abİo12", "i̇o", "UNICODE_CI", true);
-    assertContains("abi̇o12", "İo", "UNICODE_CI", true);
-    assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true);
-    assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true);
-    assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false);
-    // Characters with the same binary lowercase representation
-    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "io", "UNICODE_CI", false);
+    assertContains("adi̇os", "Io", "UNICODE_CI", false);
+    assertContains("adi̇os", "i̇o", "UNICODE_CI", true);
+    assertContains("adi̇os", "İo", "UNICODE_CI", true);
+    assertContains("adİos", "io", "UNICODE_CI", false);
+    assertContains("adİos", "Io", "UNICODE_CI", false);
+    assertContains("adİos", "i̇o", "UNICODE_CI", true);
+    assertContains("adİos", "İo", "UNICODE_CI", true);
+    assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("adİos", "io", "UTF8_BINARY_LCASE", false);
+    assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false);
+    assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("adİos", "İo", "UTF8_BINARY_LCASE", true);
   }
 
   private void assertStartsWith(
@@ -191,13 +193,6 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true);
     assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
-    // Case-variable character length
-    assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
-    assertStartsWith("i̇onic", "İo", "UNICODE_CI", true);
-    assertStartsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false);
     // Characters with the same binary lowercase representation
     assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
@@ -205,6 +200,23 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    // Case-variable character length
+    assertStartsWith("i̇onic", "io", "UNICODE_CI", false);
+    assertStartsWith("i̇onic", "Io", "UNICODE_CI", false);
+    assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true);
+    assertStartsWith("i̇onic", "İo", "UNICODE_CI", true);
+    assertStartsWith("İonic", "io", "UNICODE_CI", false);
+    assertStartsWith("İonic", "Io", "UNICODE_CI", false);
+    assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
+    assertStartsWith("İonic", "İo", "UNICODE_CI", true);
+    assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İonic", "io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("İonic", "Io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İonic", "İo", "UTF8_BINARY_LCASE", true);
   }
 
   private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected)
@@ -279,13 +291,6 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("ab世De", "AB世dE", "UNICODE_CI", true);
     assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
-    // Case-variable character length
-    assertEndsWith("The İo", "i̇o", "UNICODE_CI", true);
-    assertEndsWith("The i̇o", "İo", "UNICODE_CI", true);
-    assertEndsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("The İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false);
     // Characters with the same binary lowercase representation
     assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true);
@@ -293,6 +298,23 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    // Case-variable character length
+    assertEndsWith("the i̇o", "io", "UNICODE_CI", false);
+    assertEndsWith("the i̇o", "Io", "UNICODE_CI", false);
+    assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true);
+    assertEndsWith("the i̇o", "İo", "UNICODE_CI", true);
+    assertEndsWith("the İo", "io", "UNICODE_CI", false);
+    assertEndsWith("the İo", "Io", "UNICODE_CI", false);
+    assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
+    assertEndsWith("the İo", "İo", "UNICODE_CI", true);
+    assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("the İo", "io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the İo", "Io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("the İo", "İo", "UTF8_BINARY_LCASE", true);
   }
 
   private void assertStringSplitSQL(String str, String delimiter, String collationName,
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 711e31fd6881..934b93c9345b 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -215,43 +215,6 @@ public void contains() {
     assertFalse(fromString("大千世界").contains(fromString("大千世界好")));
   }
 
-  @Test
-  public void containsInLowerCase() {
-    // Corner cases
-    assertTrue(EMPTY_UTF8.containsInLowerCase(EMPTY_UTF8));
-    assertTrue(fromString("a").containsInLowerCase(EMPTY_UTF8));
-    assertTrue(fromString("A").containsInLowerCase(fromString("a")));
-    assertTrue(fromString("a").containsInLowerCase(fromString("A")));
-    assertFalse(EMPTY_UTF8.containsInLowerCase(fromString("a")));
-    // ASCII
-    assertTrue(fromString("hello").containsInLowerCase(fromString("ello")));
-    assertFalse(fromString("hello").containsInLowerCase(fromString("vello")));
-    assertFalse(fromString("hello").containsInLowerCase(fromString("hellooo")));
-    // Unicode
-    assertTrue(fromString("大千世界").containsInLowerCase(fromString("千世界")));
-    assertFalse(fromString("大千世界").containsInLowerCase(fromString("世千")));
-    assertFalse(fromString("大千世界").containsInLowerCase(fromString("大千世界好")));
-    // ASCII lowercase
-    assertTrue(fromString("HeLlO").containsInLowerCase(fromString("ElL")));
-    assertFalse(fromString("HeLlO").containsInLowerCase(fromString("ElLoO")));
-    // Unicode lowercase
-    assertTrue(fromString("ЯбЛоКо").containsInLowerCase(fromString("БлОк")));
-    assertFalse(fromString("ЯбЛоКо").containsInLowerCase(fromString("лОкБ")));
-    // Characters with the same binary lowercase representation
-    assertTrue(fromString("The Kelvin.").containsInLowerCase(fromString("Kelvin")));
-    assertTrue(fromString("The Kelvin.").containsInLowerCase(fromString("Kelvin")));
-    assertTrue(fromString("The KKelvin.").containsInLowerCase(fromString("KKelvin")));
-    assertTrue(fromString("2 Kelvin.").containsInLowerCase(fromString("2 Kelvin")));
-    assertTrue(fromString("2 Kelvin.").containsInLowerCase(fromString("2 Kelvin")));
-    assertFalse(fromString("The KKelvin.").containsInLowerCase(fromString("KKelvin,")));
-    // Characters with longer binary lowercase representation
-    assertTrue(fromString("the İodine").containsInLowerCase(fromString("the i̇odine")));
-    assertTrue(fromString("the i̇odine").containsInLowerCase(fromString("the İodine")));
-    assertTrue(fromString("The İodiNe").containsInLowerCase(fromString(" i̇oDin")));
-    assertTrue(fromString("İodiNe").containsInLowerCase(fromString("i̇oDin")));
-    assertFalse(fromString("İodiNe").containsInLowerCase(fromString(" i̇oDin")));
-  }
-
   @Test
   public void startsWith() {
     assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8));
@@ -263,40 +226,6 @@ public void startsWith() {
     assertFalse(fromString("大千世界").startsWith(fromString("大千世界好")));
   }
 
-  @Test
-  public void startsWithInLowerCase() {
-    // Corner cases
-    assertTrue(EMPTY_UTF8.startsWithInLowerCase(EMPTY_UTF8));
-    assertTrue(fromString("a").startsWithInLowerCase(EMPTY_UTF8));
-    assertTrue(fromString("A").startsWithInLowerCase(fromString("a")));
-    assertTrue(fromString("a").startsWithInLowerCase(fromString("A")));
-    assertFalse(EMPTY_UTF8.startsWithInLowerCase(fromString("a")));
-    // ASCII
-    assertTrue(fromString("hello").startsWithInLowerCase(fromString("hell")));
-    assertFalse(fromString("hello").startsWithInLowerCase(fromString("ell")));
-    // Unicode
-    assertTrue(fromString("大千世界").startsWithInLowerCase(fromString("大千")));
-    assertFalse(fromString("大千世界").startsWithInLowerCase(fromString("世千")));
-    // ASCII lowercase
-    assertTrue(fromString("HeLlO").startsWithInLowerCase(fromString("hElL")));
-    assertFalse(fromString("HeLlO").startsWithInLowerCase(fromString("ElL")));
-    // Unicode lowercase
-    assertTrue(fromString("ЯбЛоКо").startsWithInLowerCase(fromString("яБлОк")));
-    assertFalse(fromString("ЯбЛоКо").startsWithInLowerCase(fromString("БлОк")));
-    // Characters with the same binary lowercase representation
-    assertTrue(fromString("Kelvin.").startsWithInLowerCase(fromString("Kelvin")));
-    assertTrue(fromString("Kelvin.").startsWithInLowerCase(fromString("Kelvin")));
-    assertTrue(fromString("KKelvin.").startsWithInLowerCase(fromString("KKelvin")));
-    assertTrue(fromString("2 Kelvin.").startsWithInLowerCase(fromString("2 Kelvin")));
-    assertTrue(fromString("2 Kelvin.").startsWithInLowerCase(fromString("2 Kelvin")));
-    assertFalse(fromString("KKelvin.").startsWithInLowerCase(fromString("KKelvin,")));
-    // Characters with longer binary lowercase representation
-    assertTrue(fromString("the İodine").startsWithInLowerCase(fromString("the i̇odine")));
-    assertTrue(fromString("the i̇odine").startsWithInLowerCase(fromString("the İodine")));
-    assertTrue(fromString("İodiNe").startsWithInLowerCase(fromString("i̇oDin")));
-    assertFalse(fromString("The İodiNe").startsWithInLowerCase(fromString("i̇oDin")));
-  }
-
   @Test
   public void endsWith() {
     assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8));
@@ -308,40 +237,6 @@ public void endsWith() {
     assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头")));
   }
 
-  @Test
-  public void endsWithInLowerCase() {
-    // Corner cases
-    assertTrue(EMPTY_UTF8.endsWithInLowerCase(EMPTY_UTF8));
-    assertTrue(fromString("a").endsWithInLowerCase(EMPTY_UTF8));
-    assertTrue(fromString("A").endsWithInLowerCase(fromString("a")));
-    assertTrue(fromString("a").endsWithInLowerCase(fromString("A")));
-    assertFalse(EMPTY_UTF8.endsWithInLowerCase(fromString("a")));
-    // ASCII
-    assertTrue(fromString("hello").endsWithInLowerCase(fromString("ello")));
-    assertFalse(fromString("hello").endsWithInLowerCase(fromString("hell")));
-    // Unicode
-    assertTrue(fromString("大千世界").endsWithInLowerCase(fromString("世界")));
-    assertFalse(fromString("大千世界").endsWithInLowerCase(fromString("大千")));
-    // ASCII lowercase
-    assertTrue(fromString("HeLlO").endsWithInLowerCase(fromString("ElLo")));
-    assertFalse(fromString("HeLlO").endsWithInLowerCase(fromString("hElL")));
-    // Unicode lowercase
-    assertTrue(fromString("ЯбЛоКо").endsWithInLowerCase(fromString("БлОкО")));
-    assertFalse(fromString("ЯбЛоКо").endsWithInLowerCase(fromString("яБлОк")));
-    // Characters with the same binary lowercase representation
-    assertTrue(fromString("The Kelvin").endsWithInLowerCase(fromString("Kelvin")));
-    assertTrue(fromString("The Kelvin").endsWithInLowerCase(fromString("Kelvin")));
-    assertTrue(fromString("The KKelvin").endsWithInLowerCase(fromString("KKelvin")));
-    assertTrue(fromString("The 2 Kelvin").endsWithInLowerCase(fromString("2 Kelvin")));
-    assertTrue(fromString("The 2 Kelvin").endsWithInLowerCase(fromString("2 Kelvin")));
-    assertFalse(fromString("The KKelvin").endsWithInLowerCase(fromString("KKelvin,")));
-    // Characters with longer binary lowercase representation
-    assertTrue(fromString("the İodine").endsWithInLowerCase(fromString("the i̇odine")));
-    assertTrue(fromString("the i̇odine").endsWithInLowerCase(fromString("the İodine")));
-    assertTrue(fromString("The İodiNe").endsWithInLowerCase(fromString("i̇oDine")));
-    assertFalse(fromString("The İodiNe").endsWithInLowerCase(fromString("i̇oDin")));
-  }
-
   @Test
   public void substring() {
     assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0));

From b5794fc07388adfeac7b988bc5ecfb79d992412e Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Thu, 9 May 2024 17:55:27 +0200
Subject: [PATCH 02/19] Add tests

---
 .../unsafe/types/CollationSupportSuite.java    | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index a23b10ad20fb..5396214bbcc6 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -102,6 +102,9 @@ public void testContains() throws SparkException {
     assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
     // Case-variable character length
+    assertContains("i̇", "i", "UNICODE_CI", false);
+    assertContains("i̇", "İ", "UNICODE_CI", true);
+    assertContains("İ", "i", "UNICODE_CI", false);
     assertContains("adi̇os", "io", "UNICODE_CI", false);
     assertContains("adi̇os", "Io", "UNICODE_CI", false);
     assertContains("adi̇os", "i̇o", "UNICODE_CI", true);
@@ -110,6 +113,9 @@ public void testContains() throws SparkException {
     assertContains("adİos", "Io", "UNICODE_CI", false);
     assertContains("adİos", "i̇o", "UNICODE_CI", true);
     assertContains("adİos", "İo", "UNICODE_CI", true);
+    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI
+    assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true);
@@ -201,6 +207,9 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
     // Case-variable character length
+    assertStartsWith("i̇", "i", "UNICODE_CI", false);
+    assertStartsWith("i̇", "İ", "UNICODE_CI", true);
+    assertStartsWith("İ", "i", "UNICODE_CI", false);
     assertStartsWith("i̇onic", "io", "UNICODE_CI", false);
     assertStartsWith("i̇onic", "Io", "UNICODE_CI", false);
     assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true);
@@ -209,6 +218,9 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("İonic", "Io", "UNICODE_CI", false);
     assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
     assertStartsWith("İonic", "İo", "UNICODE_CI", true);
+    assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI
+    assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true);
@@ -299,6 +311,9 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false);
     // Case-variable character length
+    assertEndsWith("i̇", "i", "UNICODE_CI", false);
+    assertEndsWith("i̇", "İ", "UNICODE_CI", true);
+    assertEndsWith("İ", "i", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "io", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "Io", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true);
@@ -307,6 +322,9 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("the İo", "Io", "UNICODE_CI", false);
     assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
     assertEndsWith("the İo", "İo", "UNICODE_CI", true);
+    assertEndsWith("i̇", "i", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);

From dd261bfada6814a7a349ee622f4a754cef49a08f Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 10 May 2024 06:15:31 +0200
Subject: [PATCH 03/19] Fix locate/position

---
 .../sql/catalyst/util/CollationSupport.java   | 34 +++++++++++--------
 .../unsafe/types/CollationSupportSuite.java   | 10 ++++++
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index bfaa1e2d0b55..8a5ce7b1feb7 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -126,19 +126,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.contains(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      if (r.numBytes() == 0) return true;
+      if (r.numChars() == 0) return true;
       if (l.numChars() < r.numChars()) return false;
-      int lenHaystack = l.numChars(), lenNeedle = r.numChars();
-      final UTF8String firstLower = r.substring(0, 1).toLowerCase();
-      final UTF8String needle = r.toLowerCase();
-      for (var i = 0; i <= (lenHaystack - lenNeedle); i++) {
-        if (l.substring(i, i + 1).toLowerCase().equals(firstLower)) {
-          if (CollationAwareUTF8String.lowercaseMatchAt(l, needle, i, lenNeedle)) {
-            return true;
-          }
-        }
-      }
-      return false;
+      return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -379,7 +369,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
       return string.indexOf(substring, 0);
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
     }
     public static int execICU(final UTF8String string, final UTF8String substring,
         final int collationId) {
@@ -455,7 +445,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring,
         final int start) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), start);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start);
     }
     public static int execICU(final UTF8String string, final UTF8String substring, final int start,
                               final int collationId) {
@@ -746,6 +736,22 @@ private static int findInSet(final UTF8String match, final UTF8String set, int c
       return 0;
     }
 
+    private static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
+        final int start) {
+      if (pattern.numChars() == 0) return 0;
+      int lenHaystack = target.numChars(), lenNeedle = pattern.numChars();
+      final UTF8String firstLower = pattern.substring(0, 1).toLowerCase();
+      final UTF8String needle = pattern.toLowerCase();
+      for (int i = start; i <= (lenHaystack - lenNeedle); i++) {
+        if (target.substring(i, i + 1).toLowerCase().equals(firstLower)) {
+          if (CollationAwareUTF8String.lowercaseMatchAt(target, needle, i, lenNeedle)) {
+            return i;
+          }
+        }
+      }
+      return -1;
+    }
+
     private static int indexOf(final UTF8String target, final UTF8String pattern,
         final int start, final int collationId) {
       if (pattern.numBytes() == 0) {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 5396214bbcc6..362678f2ddce 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -749,6 +749,16 @@ public void testLocate() throws SparkException {
     assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9);
     assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
     // Case-variable character length
+    assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2);
+    assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // note: different from UTF8_BINARY
+    assertLocate("i", "i̇", 1, "UNICODE_CI", 0);
+    assertLocate("i̇", "i", 1, "UNICODE_CI", 0);
+    assertLocate("İ", "i̇", 1, "UNICODE_CI", 1);
+    assertLocate("İ", "i", 1, "UNICODE_CI", 0);
+    assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // note: different from UNICODE_CI
+    assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0);
+    assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 0); // note: different from UNICODE_CI
+    assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0);
     assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1);
     assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3);
     assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11);

From 35dd21a517c00fc4e1e21c8a22c0d3ab2e642550 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 10 May 2024 07:05:16 +0200
Subject: [PATCH 04/19] Fix CollationAwareUTF8String.java

---
 .../util/CollationAwareUTF8String.java        | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ee0d611d7e65..81f1fb03ada6 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -34,6 +34,15 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  public static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r,
+      int pos, int len) {
+    if (len + pos > l.numChars() || pos < 0) {
+      return false;
+    }
+    return l.substring(pos, pos + len).toLowerCase().equals(r);
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -183,6 +192,22 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
     return 0;
   }
 
+  public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
+      final int start) {
+    if (pattern.numChars() == 0) return 0;
+    int lenHaystack = target.numChars(), lenNeedle = pattern.numChars();
+    final UTF8String firstLower = pattern.substring(0, 1).toLowerCase();
+    final UTF8String needle = pattern.toLowerCase();
+    for (int i = start; i <= (lenHaystack - lenNeedle); i++) {
+      if (target.substring(i, i + 1).toLowerCase().equals(firstLower)) {
+        if (CollationAwareUTF8String.lowercaseMatchAt(target, needle, i, lenNeedle)) {
+          return i;
+        }
+      }
+    }
+    return -1;
+  }
+
   public static int indexOf(final UTF8String target, final UTF8String pattern,
       final int start, final int collationId) {
     if (pattern.numBytes() == 0) {
@@ -467,4 +492,7 @@ public static UTF8String lowercaseTrimRight(
     }
     return srcString.copyUTF8String(0, trimByteIdx);
   }
+
+  // TODO: Add more collation-aware UTF8String operations here.
+
 }

From 0baa71381906fc7efc1a88c89139d5ab7b2025b8 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 10 May 2024 15:08:41 +0200
Subject: [PATCH 05/19] Fix UTF8_BINARY_LCASE string search

---
 .../util/CollationAwareUTF8String.java        | 29 ++++++----
 .../sql/catalyst/util/CollationSupport.java   | 10 ++--
 .../unsafe/types/CollationSupportSuite.java   | 54 +++++++++++++------
 3 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 81f1fb03ada6..9f598e9a6715 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -35,12 +35,24 @@
  */
 public class CollationAwareUTF8String {
 
-  public static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r,
-      int pos, int len) {
-    if (len + pos > l.numChars() || pos < 0) {
-      return false;
+  public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) {
+    if (pos < 0) return false;
+    for (int len = 0; len <= l.numChars() - pos; len++) {
+      if (l.substring(pos, pos + len).toLowerCase().equals(r)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) {
+    if (pos > l.numChars()) return false;
+    for (int len = 1; len <= pos; len++) {
+      if (l.substring(pos - len, pos).toLowerCase().equals(r)) {
+        return true;
+      }
     }
-    return l.substring(pos, pos + len).toLowerCase().equals(r);
+    return false;
   }
 
   public static UTF8String replace(final UTF8String src, final UTF8String search,
@@ -196,13 +208,10 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat
       final int start) {
     if (pattern.numChars() == 0) return 0;
     int lenHaystack = target.numChars(), lenNeedle = pattern.numChars();
-    final UTF8String firstLower = pattern.substring(0, 1).toLowerCase();
     final UTF8String needle = pattern.toLowerCase();
     for (int i = start; i <= (lenHaystack - lenNeedle); i++) {
-      if (target.substring(i, i + 1).toLowerCase().equals(firstLower)) {
-        if (CollationAwareUTF8String.lowercaseMatchAt(target, needle, i, lenNeedle)) {
-          return i;
-        }
+      if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) {
+        return i;
       }
     }
     return -1;
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index 1710d877ddfd..fa21ed3ebc65 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -118,8 +118,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.contains(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      if (r.numChars() == 0) return true;
-      if (l.numChars() < r.numChars()) return false;
+      if (r.numBytes() == 0) return true;
       return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
@@ -159,8 +158,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
       if (r.numBytes() == 0) return true;
-      if (l.numChars() < r.numChars()) return false;
-      return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), 0, r.numChars());
+      return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0);
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -198,9 +196,7 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
       if (r.numBytes() == 0) return true;
-      if (l.numChars() < r.numChars()) return false;
-      return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(),
-        l.numChars() - r.numChars(), r.numChars());
+      return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 713e47b419ca..89eba451137e 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -103,6 +103,7 @@ public void testContains() throws SparkException {
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
     // Case-variable character length
     assertContains("i̇", "i", "UNICODE_CI", false);
+    assertContains("i̇", "̇", "UNICODE_CI", false);
     assertContains("i̇", "İ", "UNICODE_CI", true);
     assertContains("İ", "i", "UNICODE_CI", false);
     assertContains("adi̇os", "io", "UNICODE_CI", false);
@@ -113,16 +114,17 @@ public void testContains() throws SparkException {
     assertContains("adİos", "Io", "UNICODE_CI", false);
     assertContains("adİos", "i̇o", "UNICODE_CI", true);
     assertContains("adİos", "İo", "UNICODE_CI", true);
-    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI
-    assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertContains("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true);
-    assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", true);
     assertContains("adİos", "io", "UTF8_BINARY_LCASE", false);
     assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false);
-    assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", true);
     assertContains("adİos", "İo", "UTF8_BINARY_LCASE", true);
   }
 
@@ -210,6 +212,10 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("i̇", "i", "UNICODE_CI", false);
     assertStartsWith("i̇", "İ", "UNICODE_CI", true);
     assertStartsWith("İ", "i", "UNICODE_CI", false);
+    assertStartsWith("İİİ", "i̇i̇", "UNICODE_CI", true);
+    assertStartsWith("İİİ", "i̇i", "UNICODE_CI", false);
+    assertStartsWith("İi̇İ", "i̇İ", "UNICODE_CI", true);
+    assertStartsWith("i̇İi̇i̇", "İi̇İi", "UNICODE_CI", false);
     assertStartsWith("i̇onic", "io", "UNICODE_CI", false);
     assertStartsWith("i̇onic", "Io", "UNICODE_CI", false);
     assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true);
@@ -218,16 +224,20 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("İonic", "Io", "UNICODE_CI", false);
     assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
     assertStartsWith("İonic", "İo", "UNICODE_CI", true);
-    assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI
-    assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true);
+    assertStartsWith("İİİ", "i̇i", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("İi̇İ", "i̇İ", "UTF8_BINARY_LCASE", true);
+    assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
     assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", true);
     assertStartsWith("İonic", "io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("İonic", "Io", "UTF8_BINARY_LCASE", false);
-    assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", true);
     assertStartsWith("İonic", "İo", "UTF8_BINARY_LCASE", true);
   }
 
@@ -311,9 +321,13 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false);
     // Case-variable character length
-    assertEndsWith("i̇", "i", "UNICODE_CI", false);
+    assertEndsWith("i̇", "̇", "UNICODE_CI", false);
     assertEndsWith("i̇", "İ", "UNICODE_CI", true);
     assertEndsWith("İ", "i", "UNICODE_CI", false);
+    assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true);
+    assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false);
+    assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true);
+    assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "io", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "Io", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true);
@@ -322,16 +336,20 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("the İo", "Io", "UNICODE_CI", false);
     assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
     assertEndsWith("the İo", "İo", "UNICODE_CI", true);
-    assertEndsWith("i̇", "i", "UTF8_BINARY_LCASE", false);
-    assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true);
+    assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true);
+    assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
     assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", true);
     assertEndsWith("the İo", "io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the İo", "Io", "UTF8_BINARY_LCASE", false);
-    assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", true);
     assertEndsWith("the İo", "İo", "UTF8_BINARY_LCASE", true);
   }
 
@@ -750,21 +768,23 @@ public void testLocate() throws SparkException {
     assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
     // Case-variable character length
     assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2);
-    assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // note: different from UTF8_BINARY
+    assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY
     assertLocate("i", "i̇", 1, "UNICODE_CI", 0);
+    assertLocate("̇", "i̇", 1, "UNICODE_CI", 0);
     assertLocate("i̇", "i", 1, "UNICODE_CI", 0);
     assertLocate("İ", "i̇", 1, "UNICODE_CI", 1);
     assertLocate("İ", "i", 1, "UNICODE_CI", 0);
-    assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // note: different from UNICODE_CI
+    assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI
+    assertLocate("̇", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI
     assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0);
-    assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 0); // note: different from UNICODE_CI
+    assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1);
     assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0);
     assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1);
     assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3);
     assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11);
     assertLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1);
     assertLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3);
-    assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); // 12 instead of 11
+    assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12);
   }
 
   private void assertSubstringIndex(String string, String delimiter, Integer count,

From 37e44ad282cb0eb80b1223584923552fd48cb047 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 11:17:32 +0200
Subject: [PATCH 06/19] Fix tests

---
 .../unsafe/types/CollationSupportSuite.java   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 89eba451137e..3980bff59887 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -103,7 +103,7 @@ public void testContains() throws SparkException {
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
     // Case-variable character length
     assertContains("i̇", "i", "UNICODE_CI", false);
-    assertContains("i̇", "̇", "UNICODE_CI", false);
+    assertContains("i̇", "\u0307", "UNICODE_CI", false);
     assertContains("i̇", "İ", "UNICODE_CI", true);
     assertContains("İ", "i", "UNICODE_CI", false);
     assertContains("adi̇os", "io", "UNICODE_CI", false);
@@ -115,7 +115,7 @@ public void testContains() throws SparkException {
     assertContains("adİos", "i̇o", "UNICODE_CI", true);
     assertContains("adİos", "İo", "UNICODE_CI", true);
     assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
-    assertContains("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
     assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
@@ -321,13 +321,13 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false);
     // Case-variable character length
-    assertEndsWith("i̇", "̇", "UNICODE_CI", false);
+    assertEndsWith("i̇", "\u0307", "UNICODE_CI", false);
     assertEndsWith("i̇", "İ", "UNICODE_CI", true);
     assertEndsWith("İ", "i", "UNICODE_CI", false);
     assertEndsWith("İİİ", "i̇i̇", "UNICODE_CI", true);
     assertEndsWith("İİİ", "ii̇", "UNICODE_CI", false);
     assertEndsWith("İi̇İ", "İi̇", "UNICODE_CI", true);
-    assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UNICODE_CI", false);
+    assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "io", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "Io", "UNICODE_CI", false);
     assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true);
@@ -336,13 +336,13 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("the İo", "Io", "UNICODE_CI", false);
     assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
     assertEndsWith("the İo", "İo", "UNICODE_CI", true);
-    assertEndsWith("i̇", "̇", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
     assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false);
     assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true);
     assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false);
     assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("i̇İi̇i̇", "̇İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
     assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);
@@ -767,15 +767,15 @@ public void testLocate() throws SparkException {
     assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9);
     assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
     // Case-variable character length
-    assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2);
-    assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY
+    assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2);
+    assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY
     assertLocate("i", "i̇", 1, "UNICODE_CI", 0);
-    assertLocate("̇", "i̇", 1, "UNICODE_CI", 0);
+    assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0);
     assertLocate("i̇", "i", 1, "UNICODE_CI", 0);
     assertLocate("İ", "i̇", 1, "UNICODE_CI", 1);
     assertLocate("İ", "i", 1, "UNICODE_CI", 0);
     assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI
-    assertLocate("̇", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI
+    assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI
     assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0);
     assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1);
     assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0);

From 6f0ceb5e01ef73f449445402272f254924222891 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 12:08:26 +0200
Subject: [PATCH 07/19] Minor code fixes

---
 .../util/CollationAwareUTF8String.java        | 28 +++++++++++++------
 .../sql/catalyst/util/CollationSupport.java   |  3 --
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 9f598e9a6715..fb7f24de8c95 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -35,20 +35,30 @@
  */
 public class CollationAwareUTF8String {
 
-  public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) {
-    if (pos < 0) return false;
-    for (int len = 0; len <= l.numChars() - pos; len++) {
-      if (l.substring(pos, pos + len).toLowerCase().equals(r)) {
+  public static boolean lowercaseMatchFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string,
+    // this method assumes that the pattern string is already lowercased prior to method call.
+    assert startPos >= 0;
+    for (int len = 0; len <= target.numChars() - startPos; ++len) {
+      if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
         return true;
       }
     }
     return false;
   }
 
-  public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) {
-    if (pos > l.numChars()) return false;
-    for (int len = 1; len <= pos; len++) {
-      if (l.substring(pos - len, pos).toLowerCase().equals(r)) {
+  public static boolean lowercaseMatchUntil(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int endPos) {
+    // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string,
+    // this method assumes that the pattern string is already lowercased prior to method call.
+    assert endPos <= target.numChars();
+    for (int len = 0; len <= endPos; ++len) {
+      if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
         return true;
       }
     }
@@ -209,7 +219,7 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat
     if (pattern.numChars() == 0) return 0;
     int lenHaystack = target.numChars(), lenNeedle = pattern.numChars();
     final UTF8String needle = pattern.toLowerCase();
-    for (int i = start; i <= (lenHaystack - lenNeedle); i++) {
+    for (int i = start; i <= (lenHaystack - lenNeedle); ++i) {
       if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) {
         return i;
       }
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index fa21ed3ebc65..d5bcc61bac2a 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -118,7 +118,6 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.contains(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      if (r.numBytes() == 0) return true;
       return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
@@ -157,7 +156,6 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.startsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      if (r.numBytes() == 0) return true;
       return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0);
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
@@ -195,7 +193,6 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.endsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      if (r.numBytes() == 0) return true;
       return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,

From aa91b15576ccb6eda6c1a60b91c8b2481c74cbc3 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 12:21:38 +0200
Subject: [PATCH 08/19] Add doc comments

---
 .../util/CollationAwareUTF8String.java        | 34 +++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index fb7f24de8c95..be13be64044f 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -35,12 +35,21 @@
  */
 public class CollationAwareUTF8String {
 
+  /**
+   * Returns whether the target string starts with the specified prefix,
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the prefix is already lowercased prior to method call to avoid the
+   * overhead of calling .toLowerCase() multiple times on the same prefix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
+   */
   public static boolean lowercaseMatchFrom(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
-    // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string,
-    // this method assumes that the pattern string is already lowercased prior to method call.
     assert startPos >= 0;
     for (int len = 0; len <= target.numChars() - startPos; ++len) {
       if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
@@ -50,6 +59,17 @@ public static boolean lowercaseMatchFrom(
     return false;
   }
 
+  /**
+   * Returns whether the target string ends with the specified suffix,
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the suffix is already lowercased prior to method call to avoid the
+   * overhead of calling .toLowerCase() multiple times on the same suffix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return whether the target string ends with the specified suffix in UTF8_BINARY_LCASE
+   */
   public static boolean lowercaseMatchUntil(
           final UTF8String target,
           final UTF8String lowercasePattern,
@@ -214,6 +234,16 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
     return 0;
   }
 
+  /**
+   * Returns the position of the first occurrence of the pattern string
+   * in the target string from the specified position (0-based index),
+   * with respect to the UTF8_BINARY_LCASE collation.
+   *
+   * @param target the string to be searched in
+   * @param pattern the string to be searched for
+   * @param start the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   */
   public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
       final int start) {
     if (pattern.numChars() == 0) return 0;

From f72998cc040a2a069714f0dc094b116f6f1cded5 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 13:41:26 +0200
Subject: [PATCH 09/19] Add back some tests

---
 .../apache/spark/unsafe/types/CollationSupportSuite.java   | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 3980bff59887..f09f7a7f3ff2 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -101,6 +101,13 @@ public void testContains() throws SparkException {
     assertContains("ab世De", "AB世dE", "UNICODE_CI", true);
     assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
+    // Characters with the same binary lowercase representation
+    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
+    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
+    assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true);
+    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
+    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
+    assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
     // Case-variable character length
     assertContains("i̇", "i", "UNICODE_CI", false);
     assertContains("i̇", "\u0307", "UNICODE_CI", false);

From 86bf00bccb847867e0b291a9ab87b66ba930cc5c Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 20:52:18 +0200
Subject: [PATCH 10/19] Add support for StringInStr & SubstringIndex

---
 .../util/CollationAwareUTF8String.java        | 159 ++++++++++++------
 .../unsafe/types/CollationSupportSuite.java   |  26 ++-
 2 files changed, 133 insertions(+), 52 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index be13be64044f..e2834d7abf06 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -35,6 +35,12 @@
  */
 public class CollationAwareUTF8String {
 
+  /**
+   * The constant value to indicate that the match is not found
+   * when searching for a pattern string in a target string.
+   */
+  private static final int MATCH_NOT_FOUND = -1;
+
   /**
    * Returns whether the target string starts with the specified prefix,
    * with respect to the UTF8_BINARY_LCASE collation. The method assumes
@@ -50,13 +56,53 @@ public static boolean lowercaseMatchFrom(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
+    return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the substring of the target string that starts with
+   * the specified prefix, with respect to the UTF8_BINARY_LCASE collation.
+   * The method assumes that the prefix is already lowercased. The method only
+   * considers the part of target string that starts from the specified position.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the end position for searching (in the target string)
+   * @return length of the target substring that ends with the specified suffix in lowercase
+   */
+  public static int lowercaseMatchLengthFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
     assert startPos >= 0;
     for (int len = 0; len <= target.numChars() - startPos; ++len) {
       if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
-        return true;
+        return len;
       }
     }
-    return false;
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the first occurrence of the pattern string
+   * in the target string from the specified position (0-based index),
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the pattern string is already lowercased prior to method call.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   */
+  public static int lowercaseFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    for (int i = startPos; i <= target.numChars(); ++i) {
+      if (lowercaseMatchFrom(target, lowercasePattern, i))
+        return i;
+    }
+    return MATCH_NOT_FOUND;
   }
 
   /**
@@ -68,21 +114,59 @@ public static boolean lowercaseMatchFrom(
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param endPos the end position for searching (in the target string)
-   * @return whether the target string ends with the specified suffix in UTF8_BINARY_LCASE
+   * @return whether the target string ends with the specified suffix in lowercase
    */
   public static boolean lowercaseMatchUntil(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int endPos) {
-    // To avoid the overhead of calling .toLowerCase() multiple times on the same pattern string,
-    // this method assumes that the pattern string is already lowercased prior to method call.
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the substring of the target string that ends with
+   * the specified suffix, with respect to the UTF8_BINARY_LCASE collation.
+   * The method assumes that the suffix is already lowercased. The method only
+   * considers the part of target string that ends at the specified position.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return length of the target substring that ends with the specified suffix in lowercase
+   */
+  public static int lowercaseMatchLengthUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
     assert endPos <= target.numChars();
     for (int len = 0; len <= endPos; ++len) {
       if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
-        return true;
+        return len;
       }
     }
-    return false;
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the last occurrence of the pattern string
+   * in the target string until the specified position (0-based index),
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the pattern string is already lowercased prior to method call.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return the position of the last occurrence of pattern in target, if not found, -1 returned.
+   */
+  public static int lowercaseRFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    for (int i = endPos; i >= 0; --i) {
+      if (lowercaseMatchUntil(target, lowercasePattern, i))
+        return i;
+    }
+    return MATCH_NOT_FOUND;
   }
 
   public static UTF8String replace(final UTF8String src, final UTF8String search,
@@ -247,14 +331,7 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
   public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
       final int start) {
     if (pattern.numChars() == 0) return 0;
-    int lenHaystack = target.numChars(), lenNeedle = pattern.numChars();
-    final UTF8String needle = pattern.toLowerCase();
-    for (int i = start; i <= (lenHaystack - lenNeedle); ++i) {
-      if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) {
-        return i;
-      }
-    }
-    return -1;
+    return lowercaseFind(target, pattern.toLowerCase(), start);
   }
 
   public static int indexOf(final UTF8String target, final UTF8String pattern,
@@ -352,47 +429,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
       return UTF8String.EMPTY_UTF8;
     }
 
-    UTF8String lowercaseString = string.toLowerCase();
     UTF8String lowercaseDelimiter = delimiter.toLowerCase();
 
     if (count > 0) {
-      int idx = -1;
+      // search left to right (note: the start code point is inclusive)
+      int matchLength = -1;
       while (count > 0) {
-        idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
-        if (idx >= 0) {
-          count--;
-        } else {
-          // can not find enough delim
-          return string;
-        }
-      }
-      if (idx == 0) {
-        return UTF8String.EMPTY_UTF8;
+        matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
+        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+        else return string; // cannot find enough delimiters in the string
       }
-      byte[] bytes = new byte[idx];
-      copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
-      return UTF8String.fromBytes(bytes);
-
+      if (matchLength == 0) return UTF8String.EMPTY_UTF8;
+      return string.substring(0, matchLength);
     } else {
-      int idx = string.numBytes() - delimiter.numBytes() + 1;
+      // search right to left (note: the end code point is exclusive)
+      int matchLength = string.numChars() + 1;
       count = -count;
       while (count > 0) {
-        idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
-        if (idx >= 0) {
-          count--;
-        } else {
-          // can not find enough delim
-          return string;
-        }
+        matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
+        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+        else return string; // cannot find enough delimiters in the string
       }
-      if (idx + delimiter.numBytes() == string.numBytes()) {
-        return UTF8String.EMPTY_UTF8;
-      }
-      int size = string.numBytes() - delimiter.numBytes() - idx;
-      byte[] bytes = new byte[size];
-      copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
-        bytes, BYTE_ARRAY_OFFSET, size);
-      return UTF8String.fromBytes(bytes);
+      if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8;
+      return string.substring(matchLength, string.numChars());
     }
   }
 
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index f09f7a7f3ff2..6586beac34b1 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -632,8 +632,18 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
     assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
     assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
-    assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
-    assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
+    assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
+    assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
+    assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
+    assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
+    assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
+    assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
+    assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
+    assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
   }
 
   private void assertFindInSet(String word, String set, String collationName,
@@ -875,6 +885,18 @@ public void testSubstringIndex() throws SparkException {
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
   }
 
   private void assertStringTrim(

From 77904faf2f28dd07b841266039c38949c1fcae54 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 14 May 2024 22:36:33 +0200
Subject: [PATCH 11/19] Unicode java linter checks

---
 .../org/apache/spark/unsafe/types/CollationSupportSuite.java   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 6586beac34b1..552bc11809d3 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -23,7 +23,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
   /**
@@ -1107,3 +1107,4 @@ public void testStringTrim() throws SparkException {
   // TODO: Test other collation-aware expressions.
 
 }
+// checkstyle.on: AvoidEscapedUnicodeCharacters

From 5b474992c5a6216825aea0902d4e332d51669994 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 15 May 2024 06:37:12 +0200
Subject: [PATCH 12/19] Java style fix

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index e2834d7abf06..598f334af813 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -99,8 +99,9 @@ public static int lowercaseFind(
       final UTF8String lowercasePattern,
       int startPos) {
     for (int i = startPos; i <= target.numChars(); ++i) {
-      if (lowercaseMatchFrom(target, lowercasePattern, i))
+      if (lowercaseMatchFrom(target, lowercasePattern, i)) {
         return i;
+      }
     }
     return MATCH_NOT_FOUND;
   }
@@ -163,8 +164,9 @@ public static int lowercaseRFind(
       final UTF8String lowercasePattern,
       int endPos) {
     for (int i = endPos; i >= 0; --i) {
-      if (lowercaseMatchUntil(target, lowercasePattern, i))
+      if (lowercaseMatchUntil(target, lowercasePattern, i)) {
         return i;
+      }
     }
     return MATCH_NOT_FOUND;
   }

From aceeba885c369bbc62af8279f8b6b03a80e31e3a Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 15 May 2024 12:42:33 +0200
Subject: [PATCH 13/19] Undo some changes

---
 .../util/CollationAwareUTF8String.java        | 48 +++++++++++++------
 .../sql/catalyst/util/CollationSupport.java   |  2 +-
 .../unsafe/types/CollationSupportSuite.java   | 26 +---------
 3 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 598f334af813..f1648042d587 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -426,34 +426,52 @@ public static UTF8String subStringIndex(final UTF8String string, final UTF8Strin
   }
 
   public static UTF8String lowercaseSubStringIndex(final UTF8String string,
-      final UTF8String delimiter, int count) {
+                                                   final UTF8String delimiter, int count) {
     if (delimiter.numBytes() == 0 || count == 0) {
       return UTF8String.EMPTY_UTF8;
     }
 
+    UTF8String lowercaseString = string.toLowerCase();
     UTF8String lowercaseDelimiter = delimiter.toLowerCase();
 
     if (count > 0) {
-      // search left to right (note: the start code point is inclusive)
-      int matchLength = -1;
+      int idx = -1;
       while (count > 0) {
-        matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
-        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
-        else return string; // cannot find enough delimiters in the string
+        idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
+        if (idx >= 0) {
+          count--;
+        } else {
+          // can not find enough delim
+          return string;
+        }
+      }
+      if (idx == 0) {
+        return UTF8String.EMPTY_UTF8;
       }
-      if (matchLength == 0) return UTF8String.EMPTY_UTF8;
-      return string.substring(0, matchLength);
+      byte[] bytes = new byte[idx];
+      copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
+      return UTF8String.fromBytes(bytes);
+
     } else {
-      // search right to left (note: the end code point is exclusive)
-      int matchLength = string.numChars() + 1;
+      int idx = string.numBytes() - delimiter.numBytes() + 1;
       count = -count;
       while (count > 0) {
-        matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
-        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
-        else return string; // cannot find enough delimiters in the string
+        idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
+        if (idx >= 0) {
+          count--;
+        } else {
+          // can not find enough delim
+          return string;
+        }
       }
-      if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8;
-      return string.substring(matchLength, string.numChars());
+      if (idx + delimiter.numBytes() == string.numBytes()) {
+        return UTF8String.EMPTY_UTF8;
+      }
+      int size = string.numBytes() - delimiter.numBytes() - idx;
+      byte[] bytes = new byte[size];
+      copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
+              bytes, BYTE_ARRAY_OFFSET, size);
+      return UTF8String.fromBytes(bytes);
     }
   }
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index d5bcc61bac2a..8f7aed30464c 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
       return string.indexOf(substring, 0);
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring) {
-      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
+      return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
     }
     public static int execICU(final UTF8String string, final UTF8String substring,
         final int collationId) {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 552bc11809d3..ed1e15fab883 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -632,18 +632,8 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
     assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
     assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
-    assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
-    assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
-    assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
-    assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
-    assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
-    assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
-    assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
-    assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
-    assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
-    assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
-    assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
-    assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
+    assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
+    assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
   }
 
   private void assertFindInSet(String word, String set, String collationName,
@@ -885,18 +875,6 @@ public void testSubstringIndex() throws SparkException {
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
-    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
-    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
-    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
-    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
-    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
-    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
-    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
-    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
-    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
-    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
-    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
-    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
   }
 
   private void assertStringTrim(

From 3945137e872e3d9b73d4f19c45fb55b4dbb003de Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 15 May 2024 12:44:51 +0200
Subject: [PATCH 14/19] Style fixes

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index f1648042d587..e1efa32039ab 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -426,7 +426,7 @@ public static UTF8String subStringIndex(final UTF8String string, final UTF8Strin
   }
 
   public static UTF8String lowercaseSubStringIndex(final UTF8String string,
-                                                   final UTF8String delimiter, int count) {
+      final UTF8String delimiter, int count) {
     if (delimiter.numBytes() == 0 || count == 0) {
       return UTF8String.EMPTY_UTF8;
     }
@@ -470,7 +470,7 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
       int size = string.numBytes() - delimiter.numBytes() - idx;
       byte[] bytes = new byte[size];
       copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
-              bytes, BYTE_ARRAY_OFFSET, size);
+        bytes, BYTE_ARRAY_OFFSET, size);
       return UTF8String.fromBytes(bytes);
     }
   }

From 30774a37d01b40062125d71c0d38ed046cf8db88 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 15 May 2024 12:47:59 +0200
Subject: [PATCH 15/19] Update comments

---
 .../unsafe/types/CollationSupportSuite.java    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index ed1e15fab883..5843edbb5ecf 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -121,8 +121,8 @@ public void testContains() throws SparkException {
     assertContains("adİos", "Io", "UNICODE_CI", false);
     assertContains("adİos", "i̇o", "UNICODE_CI", true);
     assertContains("adİos", "İo", "UNICODE_CI", true);
-    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
-    assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
+    assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
     assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
@@ -231,13 +231,13 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("İonic", "Io", "UNICODE_CI", false);
     assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
     assertStartsWith("İonic", "İo", "UNICODE_CI", true);
-    assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false);
     assertStartsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true);
     assertStartsWith("İİİ", "i̇i", "UTF8_BINARY_LCASE", false);
     assertStartsWith("İi̇İ", "i̇İ", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertStartsWith("i̇İi̇i̇", "İi̇İi", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false);
     assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true);
@@ -343,13 +343,13 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("the İo", "Io", "UNICODE_CI", false);
     assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
     assertEndsWith("the İo", "İo", "UNICODE_CI", true);
-    assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false);
     assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true);
     assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false);
     assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // different from UNICODE_CI
+    assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);
@@ -775,14 +775,14 @@ public void testLocate() throws SparkException {
     assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
     // Case-variable character length
     assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2);
-    assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // different from UTF8_BINARY
+    assertLocate("\u0307", "İ", 1, "UTF8_BINARY_LCASE", 0); // != UTF8_BINARY
     assertLocate("i", "i̇", 1, "UNICODE_CI", 0);
     assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0);
     assertLocate("i̇", "i", 1, "UNICODE_CI", 0);
     assertLocate("İ", "i̇", 1, "UNICODE_CI", 1);
     assertLocate("İ", "i", 1, "UNICODE_CI", 0);
-    assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI
-    assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI
+    assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // != UNICODE_CI
+    assertLocate("\u0307", "i̇", 1, "UTF8_BINARY_LCASE", 2); // != UNICODE_CI
     assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0);
     assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 1);
     assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0);

From 68bada3a79962760c150a5c68a25c078adf968cc Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 17 May 2024 09:46:33 +0200
Subject: [PATCH 16/19] Small fixes

---
 .../util/CollationAwareUTF8String.java        | 75 +++++++++++--------
 .../unsafe/types/CollationSupportSuite.java   |  5 +-
 2 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index e1efa32039ab..678aef49bceb 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -36,16 +36,17 @@
 public class CollationAwareUTF8String {
 
   /**
-   * The constant value to indicate that the match is not found
-   * when searching for a pattern string in a target string.
+   * The constant value to indicate that the match is not found when searching for a pattern
+   * string in a target string.
    */
   private static final int MATCH_NOT_FOUND = -1;
 
   /**
-   * Returns whether the target string starts with the specified prefix,
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the prefix is already lowercased prior to method call to avoid the
-   * overhead of calling .toLowerCase() multiple times on the same prefix string.
+   * Returns whether the target string starts with the specified prefix, starting from the
+   * specified position (0-based index referring to character position in UTF8String), with respect
+   * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
+   * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
+   * same prefix string.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
@@ -60,14 +61,17 @@ public static boolean lowercaseMatchFrom(
   }
 
   /**
-   * Returns the length of the substring of the target string that starts with
-   * the specified prefix, with respect to the UTF8_BINARY_LCASE collation.
-   * The method assumes that the prefix is already lowercased. The method only
-   * considers the part of target string that starts from the specified position.
+   * Returns the length of the substring of the target string that starts with the specified
+   * prefix, starting from the specified position (0-based index referring to character position
+   * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * prefix is already lowercased. The method only considers the part of target string that
+   * starts from the specified (inclusive) position (that is, the method does not look at UTF8
+   * characters of the target string at or after position `endPos`). If the prefix is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
-   * @param startPos the end position for searching (in the target string)
+   * @param startPos the start position for searching (in the target string)
    * @return length of the target substring that ends with the specified suffix in lowercase
    */
   public static int lowercaseMatchLengthFrom(
@@ -84,20 +88,22 @@ public static int lowercaseMatchLengthFrom(
   }
 
   /**
-   * Returns the position of the first occurrence of the pattern string
-   * in the target string from the specified position (0-based index),
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the pattern string is already lowercased prior to method call.
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the prefix is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param startPos the start position for searching (in the target string)
-   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   * @return the position of the first occurrence of pattern in target
    */
   public static int lowercaseFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
+    assert startPos >= 0;
     for (int i = startPos; i <= target.numChars(); ++i) {
       if (lowercaseMatchFrom(target, lowercasePattern, i)) {
         return i;
@@ -107,10 +113,11 @@ public static int lowercaseFind(
   }
 
   /**
-   * Returns whether the target string ends with the specified suffix,
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the suffix is already lowercased prior to method call to avoid the
-   * overhead of calling .toLowerCase() multiple times on the same suffix string.
+   * Returns whether the target string ends with the specified suffix, ending at the specified
+   * position (0-based index referring to character position in UTF8String), with respect to the
+   * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
+   * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
+   * suffix string.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
@@ -125,10 +132,13 @@ public static boolean lowercaseMatchUntil(
   }
 
   /**
-   * Returns the length of the substring of the target string that ends with
-   * the specified suffix, with respect to the UTF8_BINARY_LCASE collation.
-   * The method assumes that the suffix is already lowercased. The method only
-   * considers the part of target string that ends at the specified position.
+   * Returns the length of the substring of the target string that ends with the specified
+   * suffix, ending at the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * suffix is already lowercased. The method only considers the part of target string that ends
+   * at the specified (non-inclusive) position (that is, the method does not look at UTF8
+   * characters of the target string at or after position `endPos`). If the suffix is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
@@ -149,20 +159,22 @@ public static int lowercaseMatchLengthUntil(
   }
 
   /**
-   * Returns the position of the last occurrence of the pattern string
-   * in the target string until the specified position (0-based index),
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the pattern string is already lowercased prior to method call.
+   * Returns the position of the last occurrence of the pattern string in the target string,
+   * ending at the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the suffix is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param endPos the end position for searching (in the target string)
-   * @return the position of the last occurrence of pattern in target, if not found, -1 returned.
+   * @return the position of the last occurrence of pattern in target
    */
   public static int lowercaseRFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {
+    assert endPos <= target.numChars();
     for (int i = endPos; i >= 0; --i) {
       if (lowercaseMatchUntil(target, lowercasePattern, i)) {
         return i;
@@ -321,9 +333,8 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
   }
 
   /**
-   * Returns the position of the first occurrence of the pattern string
-   * in the target string from the specified position (0-based index),
-   * with respect to the UTF8_BINARY_LCASE collation.
+   * Returns the position of the first occurrence of the pattern string in the target string from
+   * the specified position (0-based index), with respect to the UTF8_BINARY_LCASE collation.
    *
    * @param target the string to be searched in
    * @param pattern the string to be searched for
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 5843edbb5ecf..eb18d7665b09 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -122,6 +122,8 @@ public void testContains() throws SparkException {
     assertContains("adİos", "i̇o", "UNICODE_CI", true);
     assertContains("adİos", "İo", "UNICODE_CI", true);
     assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
+    assertContains("İ", "\u0307", "UTF8_BINARY_LCASE", false);
+    assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
     assertContains("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertContains("i̇", "İ", "UTF8_BINARY_LCASE", true);
     assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
@@ -345,11 +347,12 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("the İo", "İo", "UNICODE_CI", true);
     assertEndsWith("i̇", "\u0307", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
     assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("İ", "\u0307", "UTF8_BINARY_LCASE", false);
     assertEndsWith("İİİ", "i̇i̇", "UTF8_BINARY_LCASE", true);
     assertEndsWith("İİİ", "ii̇", "UTF8_BINARY_LCASE", false);
     assertEndsWith("İi̇İ", "İi̇", "UTF8_BINARY_LCASE", true);
     assertEndsWith("i̇İi̇i̇", "\u0307İi̇İ", "UTF8_BINARY_LCASE", true); // != UNICODE_CI
+    assertEndsWith("i̇İi̇i̇", "\u0307İİ", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
     assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);

From 8f59422c639396ac61d9a281c72625545491058d Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 17 May 2024 10:28:55 +0200
Subject: [PATCH 17/19] Small fixes

---
 .../catalyst/util/CollationAwareUTF8String.java    | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 678aef49bceb..9ae5d68a88cd 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -72,7 +72,7 @@ public static boolean lowercaseMatchFrom(
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param startPos the start position for searching (in the target string)
-   * @return length of the target substring that ends with the specified suffix in lowercase
+   * @return length of the target substring that ends with the specified prefix in lowercase
    */
   public static int lowercaseMatchLengthFrom(
       final UTF8String target,
@@ -91,7 +91,7 @@ public static int lowercaseMatchLengthFrom(
    * Returns the position of the first occurrence of the pattern string in the target string,
    * starting from the specified position (0-based index referring to character position in
    * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
-   * pattern string is already lowercased prior to call. If the prefix is not found,
+   * pattern string is already lowercased prior to call. If the pattern is not found,
    * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
@@ -162,7 +162,7 @@ public static int lowercaseMatchLengthUntil(
    * Returns the position of the last occurrence of the pattern string in the target string,
    * ending at the specified position (0-based index referring to character position in
    * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
-   * pattern string is already lowercased prior to call. If the suffix is not found,
+   * pattern string is already lowercased prior to call. If the pattern is not found,
    * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
@@ -333,13 +333,15 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
   }
 
   /**
-   * Returns the position of the first occurrence of the pattern string in the target string from
-   * the specified position (0-based index), with respect to the UTF8_BINARY_LCASE collation.
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param pattern the string to be searched for
    * @param start the start position for searching (in the target string)
-   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   * @return the position of the first occurrence of pattern in target
    */
   public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
       final int start) {

From 4b9b83e54640b7a4435e2ad3430f2e4edca90e35 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 24 May 2024 15:11:53 +0200
Subject: [PATCH 18/19] Update new method access

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 9ae5d68a88cd..027030c8d97f 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -74,7 +74,7 @@ public static boolean lowercaseMatchFrom(
    * @param startPos the start position for searching (in the target string)
    * @return length of the target substring that ends with the specified prefix in lowercase
    */
-  public static int lowercaseMatchLengthFrom(
+  private static int lowercaseMatchLengthFrom(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
@@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom(
    * @param startPos the start position for searching (in the target string)
    * @return the position of the first occurrence of pattern in target
    */
-  public static int lowercaseFind(
+  private static int lowercaseFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
@@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil(
    * @param endPos the end position for searching (in the target string)
    * @return length of the target substring that ends with the specified suffix in lowercase
    */
-  public static int lowercaseMatchLengthUntil(
+  private static int lowercaseMatchLengthUntil(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {
@@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil(
    * @param endPos the end position for searching (in the target string)
    * @return the position of the last occurrence of pattern in target
    */
-  public static int lowercaseRFind(
+  private static int lowercaseRFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {

From eb602e8a2f61909df5d2e4042c7daa72456fb509 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 14:30:11 +0200
Subject: [PATCH 19/19] Update doc comment

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 027030c8d97f..0d0094d8d0a0 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -72,7 +72,7 @@ public static boolean lowercaseMatchFrom(
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param startPos the start position for searching (in the target string)
-   * @return length of the target substring that ends with the specified prefix in lowercase
+   * @return length of the target substring that starts with the specified prefix in lowercase
    */
   private static int lowercaseMatchLengthFrom(
       final UTF8String target,