From f226c65fca8ed7043694b62cbffe7bac6bd2a31c Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 15 May 2024 08:45:39 +0200
Subject: [PATCH 1/5] Fix and tests

---
 .../util/CollationAwareUTF8String.java        | 199 +++++++++++++++---
 .../sql/catalyst/util/CollationSupport.java   |   2 +-
 .../unsafe/types/CollationSupportSuite.java   |  49 ++++-
 3 files changed, 214 insertions(+), 36 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ee0d611d7e65..ef71eb47d310 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -34,6 +34,143 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  /**
+   * The constant value to indicate that the match is not found
+   * when searching for a pattern string in a target string.
+   */
+  private static final int MATCH_NOT_FOUND = -1;
+
+  /**
+   * Returns whether the target string starts with the specified prefix,
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the prefix is already lowercased prior to method call to avoid the
+   * overhead of calling .toLowerCase() multiple times on the same prefix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
+   */
+  public static boolean lowercaseMatchFrom(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int startPos) {
+    return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the substring of the target string that starts with
+   * the specified prefix, with respect to the UTF8_BINARY_LCASE collation.
+   * The method assumes that the prefix is already lowercased. The method only
+   * considers the part of target string that starts from the specified position.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the end position for searching (in the target string)
+   * @return length of the target substring that ends with the specified suffix in lowercase
+   */
+  public static int lowercaseMatchLengthFrom(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int startPos) {
+    assert startPos >= 0;
+    for (int len = 0; len <= target.numChars() - startPos; ++len) {
+      if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the first occurrence of the pattern string
+   * in the target string from the specified position (0-based index),
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the pattern string is already lowercased prior to method call.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   */
+  public static int lowercaseFind(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int startPos) {
+    for (int i = startPos; i <= target.numChars(); ++i) {
+      if (lowercaseMatchFrom(target, lowercasePattern, i)) {
+        return i;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns whether the target string ends with the specified suffix,
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the suffix is already lowercased prior to method call to avoid the
+   * overhead of calling .toLowerCase() multiple times on the same suffix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return whether the target string ends with the specified suffix in lowercase
+   */
+  public static boolean lowercaseMatchUntil(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int endPos) {
+    return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the substring of the target string that ends with
+   * the specified suffix, with respect to the UTF8_BINARY_LCASE collation.
+   * The method assumes that the suffix is already lowercased. The method only
+   * considers the part of target string that ends at the specified position.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return length of the target substring that ends with the specified suffix in lowercase
+   */
+  public static int lowercaseMatchLengthUntil(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int endPos) {
+    assert endPos <= target.numChars();
+    for (int len = 0; len <= endPos; ++len) {
+      if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the last occurrence of the pattern string
+   * in the target string until the specified position (0-based index),
+   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
+   * that the pattern string is already lowercased prior to method call.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return the position of the last occurrence of pattern in target, if not found, -1 returned.
+   */
+  public static int lowercaseRFind(
+          final UTF8String target,
+          final UTF8String lowercasePattern,
+          int endPos) {
+    for (int i = endPos; i >= 0; --i) {
+      if (lowercaseMatchUntil(target, lowercasePattern, i)) {
+        return i;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -183,6 +320,22 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
     return 0;
   }
 
+  /**
+   * Returns the position of the first occurrence of the pattern string
+   * in the target string from the specified position (0-based index),
+   * with respect to the UTF8_BINARY_LCASE collation.
+   *
+   * @param target the string to be searched in
+   * @param pattern the string to be searched for
+   * @param start the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   */
+  public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
+      final int start) {
+    if (pattern.numChars() == 0) return 0;
+    return lowercaseFind(target, pattern.toLowerCase(), start);
+  }
+
   public static int indexOf(final UTF8String target, final UTF8String pattern,
       final int start, final int collationId) {
     if (pattern.numBytes() == 0) {
@@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
       return UTF8String.EMPTY_UTF8;
     }
 
-    UTF8String lowercaseString = string.toLowerCase();
     UTF8String lowercaseDelimiter = delimiter.toLowerCase();
 
     if (count > 0) {
-      int idx = -1;
+      // search left to right (note: the start code point is inclusive)
+      int matchLength = -1;
       while (count > 0) {
-        idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
-        if (idx >= 0) {
-          count--;
-        } else {
-          // can not find enough delim
-          return string;
-        }
-      }
-      if (idx == 0) {
-        return UTF8String.EMPTY_UTF8;
+        matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
+        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+        else return string; // cannot find enough delimiters in the string
       }
-      byte[] bytes = new byte[idx];
-      copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
-      return UTF8String.fromBytes(bytes);
-
+      if (matchLength == 0) return UTF8String.EMPTY_UTF8;
+      return string.substring(0, matchLength);
     } else {
-      int idx = string.numBytes() - delimiter.numBytes() + 1;
+      // search right to left (note: the end code point is exclusive)
+      int matchLength = string.numChars() + 1;
       count = -count;
       while (count > 0) {
-        idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
-        if (idx >= 0) {
-          count--;
-        } else {
-          // can not find enough delim
-          return string;
-        }
+        matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
+        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+        else return string; // cannot find enough delimiters in the string
       }
-      if (idx + delimiter.numBytes() == string.numBytes()) {
-        return UTF8String.EMPTY_UTF8;
-      }
-      int size = string.numBytes() - delimiter.numBytes() - idx;
-      byte[] bytes = new byte[size];
-      copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
-        bytes, BYTE_ARRAY_OFFSET, size);
-      return UTF8String.fromBytes(bytes);
+      if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8;
+      return string.substring(matchLength, string.numChars());
     }
   }
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index bea3dc08b448..678b00649a32 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
       return string.indexOf(substring, 0);
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
     }
     public static int execICU(final UTF8String string, final UTF8String substring,
         final int collationId) {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 7fc3c4e349c3..6f4b757851cc 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -23,7 +23,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
   /**
@@ -567,8 +567,26 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
     assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
     assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
-    assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
-    assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
+    assertStringInstr("i̇", "i", "UNICODE_CI", 0);
+    assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0);
+    assertStringInstr("i̇", "İ", "UNICODE_CI", 1);
+    assertStringInstr("İ", "i", "UNICODE_CI", 0);
+    assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
+    assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
+    assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
+    assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
+    assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
+    assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
+    assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI
+    assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI
+    assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0);
+    assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
+    assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
   }
 
   private void assertFindInSet(String word, String set, String collationName,
@@ -798,6 +816,30 @@ public void testSubstringIndex() throws SparkException {
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12");
+    assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12");
+    assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab");
+    assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+    assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab");
+    assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi");
+    assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab");
+    assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
   }
 
   private void assertStringTrim(
@@ -1008,3 +1050,4 @@ public void testStringTrim() throws SparkException {
   // TODO: Test other collation-aware expressions.
 
 }
+// checkstyle.on: AvoidEscapedUnicodeCharacters

From a2e1307b7d118fd28624a57db80c4a3167383ecb Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 15 May 2024 08:57:00 +0200
Subject: [PATCH 2/5] Fix tests

---
 .../apache/spark/unsafe/types/CollationSupportSuite.java  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 6f4b757851cc..2d80c50ba084 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -577,8 +577,8 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
     assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
     assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
-    assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // different from UNICODE_CI
-    assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // different from UNICODE_CI
+    assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI
+    assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI
     assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1);
     assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0);
     assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
@@ -828,8 +828,8 @@ public void testSubstringIndex() throws SparkException {
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
-    assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab");
-    assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi");
+    assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI
+    assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI
     assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab");
     assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12");
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");

From 168dde3986edd38bd9de9066cfac552224347d30 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 17 May 2024 10:25:16 +0200
Subject: [PATCH 3/5] Small fixes

---
 .../util/CollationAwareUTF8String.java        | 131 ++++++++++--------
 .../unsafe/types/CollationSupportSuite.java   |   4 +
 2 files changed, 75 insertions(+), 60 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ef71eb47d310..c48a5f942b7d 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -36,16 +36,17 @@
 public class CollationAwareUTF8String {
 
   /**
-   * The constant value to indicate that the match is not found
-   * when searching for a pattern string in a target string.
+   * The constant value to indicate that the match is not found when searching for a pattern
+   * string in a target string.
    */
   private static final int MATCH_NOT_FOUND = -1;
 
   /**
-   * Returns whether the target string starts with the specified prefix,
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the prefix is already lowercased prior to method call to avoid the
-   * overhead of calling .toLowerCase() multiple times on the same prefix string.
+   * Returns whether the target string starts with the specified prefix, starting from the
+   * specified position (0-based index referring to character position in UTF8String), with respect
+   * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
+   * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
+   * same prefix string.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
@@ -53,27 +54,30 @@ public class CollationAwareUTF8String {
    * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
    */
   public static boolean lowercaseMatchFrom(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int startPos) {
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
     return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND;
   }
 
   /**
-   * Returns the length of the substring of the target string that starts with
-   * the specified prefix, with respect to the UTF8_BINARY_LCASE collation.
-   * The method assumes that the prefix is already lowercased. The method only
-   * considers the part of target string that starts from the specified position.
+   * Returns the length of the substring of the target string that starts with the specified
+   * prefix, starting from the specified position (0-based index referring to character position
+   * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * prefix is already lowercased. The method only considers the part of target string that
+   * starts from the specified (inclusive) position (that is, the method does not look at UTF8
+   * characters of the target string before position `startPos`). If the prefix is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
-   * @param startPos the end position for searching (in the target string)
-   * @return length of the target substring that ends with the specified suffix in lowercase
+   * @param startPos the start position for searching (in the target string)
+   * @return length of the target substring that starts with the specified prefix in lowercase
    */
   public static int lowercaseMatchLengthFrom(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int startPos) {
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
     assert startPos >= 0;
     for (int len = 0; len <= target.numChars() - startPos; ++len) {
       if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
@@ -84,20 +88,22 @@ public static int lowercaseMatchLengthFrom(
   }
 
   /**
-   * Returns the position of the first occurrence of the pattern string
-   * in the target string from the specified position (0-based index),
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the pattern string is already lowercased prior to method call.
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param startPos the start position for searching (in the target string)
-   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   * @return the position of the first occurrence of pattern in target
    */
   public static int lowercaseFind(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int startPos) {
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
     for (int i = startPos; i <= target.numChars(); ++i) {
       if (lowercaseMatchFrom(target, lowercasePattern, i)) {
         return i;
@@ -107,10 +113,11 @@ public static int lowercaseFind(
   }
 
   /**
-   * Returns whether the target string ends with the specified suffix,
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the suffix is already lowercased prior to method call to avoid the
-   * overhead of calling .toLowerCase() multiple times on the same suffix string.
+   * Returns whether the target string ends with the specified suffix, ending at the specified
+   * position (0-based index referring to character position in UTF8String), with respect to the
+   * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
+   * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
+   * suffix string.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
@@ -118,17 +125,20 @@ public static int lowercaseFind(
    * @return whether the target string ends with the specified suffix in lowercase
    */
   public static boolean lowercaseMatchUntil(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int endPos) {
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
     return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND;
   }
 
   /**
-   * Returns the length of the substring of the target string that ends with
-   * the specified suffix, with respect to the UTF8_BINARY_LCASE collation.
-   * The method assumes that the suffix is already lowercased. The method only
-   * considers the part of target string that ends at the specified position.
+   * Returns the length of the substring of the target string that ends with the specified
+   * suffix, ending at the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * suffix is already lowercased. The method only considers the part of target string that ends
+   * at the specified (non-inclusive) position (that is, the method does not look at UTF8
+   * characters of the target string at or after position `endPos`). If the suffix is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
@@ -136,9 +146,9 @@ public static boolean lowercaseMatchUntil(
    * @return length of the target substring that ends with the specified suffix in lowercase
    */
   public static int lowercaseMatchLengthUntil(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int endPos) {
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
     assert endPos <= target.numChars();
     for (int len = 0; len <= endPos; ++len) {
       if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
@@ -149,20 +159,22 @@ public static int lowercaseMatchLengthUntil(
   }
 
   /**
-   * Returns the position of the last occurrence of the pattern string
-   * in the target string until the specified position (0-based index),
-   * with respect to the UTF8_BINARY_LCASE collation. The method assumes
-   * that the pattern string is already lowercased prior to method call.
+   * Returns the position of the last occurrence of the pattern string in the target string,
+   * ending at the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param endPos the end position for searching (in the target string)
-   * @return the position of the last occurrence of pattern in target, if not found, -1 returned.
+   * @return the (exclusive) end position of the last occurrence of pattern in target
    */
   public static int lowercaseRFind(
-          final UTF8String target,
-          final UTF8String lowercasePattern,
-          int endPos) {
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
     for (int i = endPos; i >= 0; --i) {
       if (lowercaseMatchUntil(target, lowercasePattern, i)) {
         return i;
@@ -321,14 +333,15 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
   }
 
   /**
-   * Returns the position of the first occurrence of the pattern string
-   * in the target string from the specified position (0-based index),
-   * with respect to the UTF8_BINARY_LCASE collation.
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in
    * @param pattern the string to be searched for
    * @param start the start position for searching (in the target string)
-   * @return the position of the first occurrence of pattern in target, if not found, -1 returned.
+   * @return the position of the first occurrence of pattern in target
    */
   public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
       final int start) {
@@ -434,25 +447,23 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
     UTF8String lowercaseDelimiter = delimiter.toLowerCase();
 
     if (count > 0) {
-      // search left to right (note: the start code point is inclusive)
+      // Search left to right (note: the start code point is inclusive).
       int matchLength = -1;
       while (count > 0) {
         matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
-        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
-        else return string; // cannot find enough delimiters in the string
+        if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter.
+        else return string; // Cannot find enough delimiters in the string.
       }
-      if (matchLength == 0) return UTF8String.EMPTY_UTF8;
       return string.substring(0, matchLength);
     } else {
-      // search right to left (note: the end code point is exclusive)
+      // Search right to left (note: the end code point is exclusive).
       int matchLength = string.numChars() + 1;
       count = -count;
       while (count > 0) {
         matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
-        if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
-        else return string; // cannot find enough delimiters in the string
+        if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter.
+        else return string; // Cannot find enough delimiters in the string.
       }
-      if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8;
       return string.substring(matchLength, string.numChars());
     }
   }
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 2d80c50ba084..a88e2eff0253 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -585,7 +585,9 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
     assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
     assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_BINARY_LCASE", 3);
     assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
+    assertStringInstr("abİoi̇o", "\u0307o", "UTF8_BINARY_LCASE", 6);
     assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
   }
 
@@ -836,10 +838,12 @@ public void testSubstringIndex() throws SparkException {
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o");
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
+    assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o");
   }
 
   private void assertStringTrim(

From a26124cccbf7ba5d0b1dc6a5b7f7d6529731f59e Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 20 May 2024 15:24:14 +0200
Subject: [PATCH 4/5] Centralize indexOf behaviour for empty substring

---
 .../sql/catalyst/util/CollationAwareUTF8String.java |  4 ++--
 .../org/apache/spark/unsafe/types/UTF8String.java   | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index c48a5f942b7d..f9f7a14ca080 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -345,14 +345,14 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
    */
   public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
       final int start) {
-    if (pattern.numChars() == 0) return 0;
+    if (pattern.numChars() == 0) return target.indexOfEmpty(start); // Same as UTF8String.indexOf.
     return lowercaseFind(target, pattern.toLowerCase(), start);
   }
 
   public static int indexOf(final UTF8String target, final UTF8String pattern,
       final int start, final int collationId) {
     if (pattern.numBytes() == 0) {
-      return 0;
+      return target.indexOfEmpty(start);
     }
 
     StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 20b26b6ebc5a..d6f664133418 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -891,6 +891,17 @@ public UTF8String repeat(int times) {
     return UTF8String.fromBytes(newBytes);
   }
 
+  /**
+   * Returns the position reported for an empty substr searched in the current string from the
+   * specified position (0-based index). Called by indexOf so empty-substr handling is shared.
+   *
+   * @param start the start position of the current string for searching (currently ignored)
+   * @return the position of the first occurrence of the empty substr (now, always 0)
+   */
+  public int indexOfEmpty(int start) {
+    return 0; // TODO: Fix this behaviour (SPARK-48284)
+  }
+
   /**
    * Returns the position of the first occurrence of substr in
    * current string from the specified position (0-based index).
@@ -901,7 +912,7 @@ public UTF8String repeat(int times) {
    */
   public int indexOf(UTF8String v, int start) {
     if (v.numBytes() == 0) {
-      return 0;
+      return indexOfEmpty(start);
     }
 
     // locate to the start position.

From b571084aa56582a9d65a150bcc4de7be86dc2b03 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 24 May 2024 15:12:46 +0200
Subject: [PATCH 5/5] Update new method access

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index f9f7a14ca080..7d8e9f81ff18 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -74,7 +74,7 @@ public static boolean lowercaseMatchFrom(
    * @param startPos the start position for searching (in the target string)
    * @return length of the target substring that ends with the specified prefix in lowercase
    */
-  public static int lowercaseMatchLengthFrom(
+  private static int lowercaseMatchLengthFrom(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
@@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom(
    * @param startPos the start position for searching (in the target string)
    * @return the position of the first occurrence of pattern in target
    */
-  public static int lowercaseFind(
+  private static int lowercaseFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
@@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil(
    * @param endPos the end position for searching (in the target string)
    * @return length of the target substring that ends with the specified suffix in lowercase
    */
-  public static int lowercaseMatchLengthUntil(
+  private static int lowercaseMatchLengthUntil(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {
@@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil(
    * @param endPos the end position for searching (in the target string)
    * @return the position of the last occurrence of pattern in target
    */
-  public static int lowercaseRFind(
+  private static int lowercaseRFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {