From a5759879338233055aeb9ac38fcecc06ff93bad5 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 22 May 2024 10:03:15 +0200
Subject: [PATCH 01/14] Initial commit

---
 .../util/CollationAwareUTF8String.java        | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ee0d611d7e652..0f4a59f33b682 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -141,18 +141,90 @@ public static String toUpperCase(final String target, final int collationId) {
     return UCharacter.toUpperCase(locale, target);
   }
 
+  private static int uppercaseCodePoint(final int codePoint, final StringBuilder sb, final int i,
+      final String target) {
+    // Latin small letter i with an additional dot is represented using 2 characters.
+    if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) {
+      sb.append("İ");
+      return 1;
+    }
+    // All other characters should follow context-unaware ICU single-code point case mapping.
+    sb.appendCodePoint(UCharacter.toTitleCase(codePoint));
+    return 0;
+  }
+
+  public static String toUpperCase(final String target) {
+      StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < target.length(); ++i) {
+      int codePoint = target.codePointAt(i);
+      // Latin small letter i with an additional dot above (represented using 2 characters).
+      if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) {
+        sb.append("İ");
+        ++i;
+      }
+      // All other characters should follow context-unaware ICU single-code point case mapping.
+      else {
+        sb.appendCodePoint(UCharacter.toUpperCase(codePoint));
+        // A supplementary code point occupies two chars; step over its low surrogate as well.
+        i += Character.charCount(codePoint) - 1;
+      }
+    }
+    return sb.toString();
+  }
+
   public static String toLowerCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
     return UCharacter.toLowerCase(locale, target);
   }
 
+  private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
+    // Latin capital letter I with dot above is mapped to 2 lowercase characters.
+    if (codePoint == 0x0130) {
+      sb.append("i̇");
+    }
+    // Greek final and non-final capital letter sigma should be mapped the same.
+    else if (codePoint == 0x03C2) {
+      sb.append("σ");
+    }
+    // All other characters should follow context-unaware ICU single-code point case mapping.
+    else {
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }
+  }
+
+  public static String toLowerCase(final String target) {
+      StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < target.length(); ++i) {
+      int codePoint = target.codePointAt(i);
+      lowercaseCodePoint(codePoint, sb);
+      // A supplementary code point occupies two chars; step over its low surrogate as well.
+      i += Character.charCount(codePoint) - 1;
+    }
+    return sb.toString();
+  }
+
   public static String toTitleCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
     return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale));
   }
 
+  public static String toTitleCase(final String target) {
+      StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < target.length(); ++i) {
+      int codePoint = target.codePointAt(i);
+      if (i == 0 || Character.isWhitespace(target.codePointBefore(i))) {
+        i += uppercaseCodePoint(codePoint, sb, i, target);
+      } else {
+        lowercaseCodePoint(codePoint, sb);
+      }
+      // A supplementary code point occupies two chars; step over its low surrogate as well.
+      i += Character.charCount(codePoint) - 1;
+    }
+    return sb.toString();
+  }
+
   public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
     if (match.contains(UTF8String.fromString(","))) {
       return 0;

From a7241002255293e322a0ed246b5c7ea347f23c21 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 22 May 2024 10:03:18 +0200
Subject: [PATCH 02/14] Tests

---
 .../unsafe/types/CollationSupportSuite.java   | 169 +++++++++++++++++-
 1 file changed, 168 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 7fc3c4e349c3b..e1dd96f6cb8f6 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -17,15 +17,181 @@
 package org.apache.spark.unsafe.types;
 
 import org.apache.spark.SparkException;
+import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.sql.catalyst.util.CollationSupport;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertLowercase(String target, String expected, String collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId));
+    }
+  }
+
+  @Test
+  public void testLowercase() throws SparkException {
+    // Edge cases
+    assertLowercase("", "", "UTF8_BINARY");
+    assertLowercase("", "", "UTF8_BINARY_LCASE");
+    assertLowercase("", "", "UNICODE");
+    assertLowercase("", "", "UNICODE_CI");
+    // Basic tests
+    assertLowercase("abcd", "abcd", "UTF8_BINARY");
+    assertLowercase("AbCd", "abcd", "UTF8_BINARY");
+    assertLowercase("abcd", "abcd", "UTF8_BINARY_LCASE");
+    assertLowercase("aBcD", "abcd", "UTF8_BINARY_LCASE");
+    assertLowercase("abcd", "abcd", "UNICODE");
+    assertLowercase("aBCd", "abcd", "UNICODE");
+    assertLowercase("abcd", "abcd", "UNICODE_CI");
+    assertLowercase("AbcD", "abcd", "UNICODE_CI");
+    // Accent variation
+    assertLowercase("AbĆd", "abćd", "UTF8_BINARY");
+    assertLowercase("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
+    assertLowercase("ÄbcD", "äbcd", "UNICODE");
+    assertLowercase("aB́Cd", "ab́cd", "UNICODE_CI");
+    // Case-variable character length
+    assertLowercase("İoDiNe", "i̇odine", "UTF8_BINARY");
+    assertLowercase("Abi̇o12", "abi̇o12", "UTF8_BINARY");
+    assertLowercase("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
+    assertLowercase("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
+    assertLowercase("İoDinE", "i̇odine", "UNICODE");
+    assertLowercase("abi̇O12", "abi̇o12", "UNICODE");
+    assertLowercase("İodINe", "i̇odine", "UNICODE_CI");
+    assertLowercase("ABi̇o12", "abi̇o12", "UNICODE_CI");
+    // Conditional case mapping
+    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
+    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI
+    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
+    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");
+  }
+
+  private void assertUppercase(String target, String expected, String collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toUpperCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, CollationAwareUTF8String.toUpperCase(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toUpperCase(target, collationId));
+    }
+  }
+
+  @Test
+  public void testUppercase() throws SparkException {
+    // Edge cases
+    assertUppercase("", "", "UTF8_BINARY");
+    assertUppercase("", "", "UTF8_BINARY_LCASE");
+    assertUppercase("", "", "UNICODE");
+    assertUppercase("", "", "UNICODE_CI");
+    // Basic tests
+    assertUppercase("abcd", "ABCD", "UTF8_BINARY");
+    assertUppercase("AbCd", "ABCD", "UTF8_BINARY");
+    assertUppercase("abcd", "ABCD", "UTF8_BINARY_LCASE");
+    assertUppercase("aBcD", "ABCD", "UTF8_BINARY_LCASE");
+    assertUppercase("abcd", "ABCD", "UNICODE");
+    assertUppercase("aBCd", "ABCD", "UNICODE");
+    assertUppercase("abcd", "ABCD", "UNICODE_CI");
+    assertUppercase("AbcD", "ABCD", "UNICODE_CI");
+    // Accent variation
+    assertUppercase("aBćD", "ABĆD", "UTF8_BINARY");
+    assertUppercase("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE");
+    assertUppercase("äBCd", "ÄBCD", "UNICODE");
+    assertUppercase("Ab́cD", "AB́CD", "UNICODE_CI");
+    // Case-variable character length
+    assertUppercase("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY");
+    assertUppercase("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY");
+    assertUppercase("i̇odInE", "İODINE", "UTF8_BINARY_LCASE");
+    assertUppercase("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE");
+    assertUppercase("i̇oDinE", "I\u0307ODINE", "UNICODE");
+    assertUppercase("abi̇O12", "ABI\u0307O12", "UNICODE");
+    assertUppercase("i̇odINe", "I\u0307ODINE", "UNICODE_CI");
+    assertUppercase("ABi̇o12", "ABI\u0307O12", "UNICODE_CI");
+    // Conditional case mapping
+    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
+    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
+    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
+    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
+    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
+    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
+    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
+    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
+  }
+
+  private void assertTitlecase(String target, String expected, String collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toTitleCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, CollationAwareUTF8String.toTitleCase(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toTitleCase(target, collationId));
+    }
+  }
+
+  @Test
+  public void testTitlecase() throws SparkException {
+    // Edge cases
+    assertTitlecase("", "", "UTF8_BINARY");
+    assertTitlecase("", "", "UTF8_BINARY_LCASE");
+    assertTitlecase("", "", "UNICODE");
+    assertTitlecase("", "", "UNICODE_CI");
+    // Basic tests
+    assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY");
+    assertTitlecase("Ab Cd", "Ab Cd", "UTF8_BINARY");
+    assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY_LCASE");
+    assertTitlecase("aB cD", "Ab Cd", "UTF8_BINARY_LCASE");
+    assertTitlecase("ab cd", "Ab Cd", "UNICODE");
+    assertTitlecase("aB Cd", "Ab Cd", "UNICODE");
+    assertTitlecase("ab cd", "Ab Cd", "UNICODE_CI");
+    assertTitlecase("Ab cD", "Ab Cd", "UNICODE_CI");
+    // Accent variation
+    assertTitlecase("aB ćD", "AB ĆD", "UTF8_BINARY");
+    assertTitlecase("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE");
+    assertTitlecase("äB Cd", "Äb Cd", "UNICODE");
+    assertTitlecase("A b́cD", "A B́cd", "UNICODE_CI");
+    // Case-variable character length
+    assertTitlecase("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY");
+    assertTitlecase("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY");
+    assertTitlecase("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE");
+    assertTitlecase("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE");
+    assertTitlecase("i̇oDinE", "I\u0307odine", "UNICODE");
+    assertTitlecase("abi̇O12", "Abi̇o12", "UNICODE");
+    assertTitlecase("i̇odINe", "I\u0307odine", "UNICODE_CI");
+    assertTitlecase("ABi̇o12", "Abi\u0307o12", "UNICODE_CI");
+    // Conditional case mapping
+    assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY");
+    assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY");
+    assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY_LCASE");
+    assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY_LCASE");
+    assertTitlecase("a ς c", "A Σ C", "UNICODE");
+    assertTitlecase("a σ c", "A Σ C", "UNICODE");
+    assertTitlecase("a ς c", "A Σ C", "UNICODE_CI");
+    assertTitlecase("a σ c", "A Σ C", "UNICODE_CI");
+  }
+
   /**
    * Collation-aware string expressions.
    */
@@ -1008,3 +1174,4 @@ public void testStringTrim() throws SparkException {
   // TODO: Test other collation-aware expressions.
 
 }
+// checkstyle.on: AvoidEscapedUnicodeCharacters

From d919e4e68c033681345675c101b3ab64d59a0b06 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 22 May 2024 13:23:10 +0200
Subject: [PATCH 03/14] Comparison and hash

---
 .../util/CollationAwareUTF8String.java        |  5 ++
 .../sql/catalyst/util/CollationFactory.java   |  4 +-
 .../unsafe/types/CollationSupportSuite.java   | 65 +++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 0f4a59f33b682..ac314f9d30921 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -34,6 +34,11 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  public static int compareLowerCase(final UTF8String left, final UTF8String right) {
+    return toLowerCase(left.toString()).compareTo(toLowerCase(right.toString()));
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 0133c3feb611a..9f9773eaeace3 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -232,9 +232,9 @@ public CollationIdentifier identifier() {
       "UTF8_BINARY_LCASE",
       PROVIDER_SPARK,
       null,
-      UTF8String::compareLowerCase,
+      CollationAwareUTF8String::compareLowerCase,
       "1.0",
-      (s) -> (long)s.toLowerCase().hashCode(),
+      (s) -> (long)CollationAwareUTF8String.toLowerCase(s.toString()).hashCode(),
       false,
       false,
       true);
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index e1dd96f6cb8f6..fdbb45faf3dfc 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -31,6 +31,71 @@ public class CollationSupportSuite {
    * Collation-aware UTF8String comparison.
    */
 
+  private void assertCompare(String s1, String s2, String collationName, int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    // Edge cases
+    assertCompare("", "", "UTF8_BINARY", 0);
+    assertCompare("a", "", "UTF8_BINARY", 1);
+    assertCompare("", "a", "UTF8_BINARY", -1);
+    assertCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertCompare("", "", "UNICODE", 0);
+    assertCompare("a", "", "UNICODE", 1);
+    assertCompare("", "a", "UNICODE", -1);
+    assertCompare("", "", "UNICODE_CI", 0);
+    assertCompare("a", "", "UNICODE_CI", 1);
+    assertCompare("", "a", "UNICODE_CI", -1);
+    // Basic tests
+    assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertCompare("i\u0307", "İ", "UNICODE", -1);
+    assertCompare("İ", "i\u0307", "UNICODE", 1);
+    assertCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertCompare("ς", "σ", "UNICODE", 1);
+    assertCompare("ς", "Σ", "UNICODE", 1);
+    assertCompare("σ", "Σ", "UNICODE", -1);
+    assertCompare("ς", "σ", "UNICODE_CI", 0);
+    assertCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
   private void assertLowercase(String target, String expected, String collationName)
       throws SparkException {
     if (collationName.equals("UTF8_BINARY")) {

From fb48bdc9bff72bb9863eed2e1d4ff06cf73b7c0a Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Thu, 23 May 2024 17:00:28 +0200
Subject: [PATCH 04/14] Add doc comments

---
 .../util/CollationAwareUTF8String.java        |  60 +++-
 .../sql/catalyst/util/CollationFactory.java   |   2 +-
 .../unsafe/types/CollationSupportSuite.java   | 296 +++++++++---------
 3 files changed, 205 insertions(+), 153 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ac314f9d30921..66632457b328b 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -35,8 +35,18 @@
  */
 public class CollationAwareUTF8String {
 
+  /**
+   * Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default
+   * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
+   * method lowercases both inputs one code point at a time using ICU rules and the special
+   * conditional case mappings (see: lowerCaseCodePoints), then applies String.compareTo.
+   *
+   * @param left The first UTF8String to compare.
+   * @param right The second UTF8String to compare.
+   * @return An integer representing the comparison result.
+   */
   public static int compareLowerCase(final UTF8String left, final UTF8String right) {
-    return toLowerCase(left.toString()).compareTo(toLowerCase(right.toString()));
+    return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString()));
   }
 
   public static UTF8String replace(final UTF8String src, final UTF8String search,
@@ -146,6 +156,16 @@ public static String toUpperCase(final String target, final int collationId) {
     return UCharacter.toUpperCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to uppercase using ICU's single-code point titlecase mapping,
+   * with special handling for conditional case mappings (currently only "i" + U+0307 -> "İ").
+   *
+   * @param codePoint The code point to convert to uppercase.
+   * @param sb The StringBuilder to append the uppercase character to.
+   * @param i The index of the code point in the target string.
+   * @param target The target string to convert to uppercase.
+   * @return The number of extra chars consumed after index i (1 for "i" + U+0307, else 0).
+   */
   private static int uppercaseCodePoint(final int codePoint, final StringBuilder sb, final int i,
       final String target) {
     // Latin small letter i with an additional dot is represented using 2 characters.
@@ -153,12 +173,21 @@ private static int uppercaseCodePoint(final int codePoint, final StringBuilder s
       sb.append("İ");
       return 1;
     }
+    // TODO: Add special handling for other chars that map to multiple characters in uppercase.
     // All other characters should follow context-unaware ICU single-code point case mapping.
     sb.appendCodePoint(UCharacter.toTitleCase(codePoint));
     return 0;
   }
 
-  public static String toUpperCase(final String target) {
+  /**
+   * Converts an entire string to uppercase using ICU rules, code point by code point, with
+   * special handling for conditional case mappings (i.e. characters that map to multiple
+   * characters in uppercase). This method omits information about context-sensitive case mappings.
+   *
+   * @param target The target string to convert to uppercase.
+   * @return The string converted to uppercase in a context-unaware manner.
+   */
+  public static String upperCaseCodePoints(final String target) {
       StringBuilder sb = new StringBuilder();
     for (int i = 0; i < target.length(); ++i) {
       int codePoint = target.codePointAt(i);
@@ -181,6 +210,13 @@ public static String toLowerCase(final String target, final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special handling for
+   * conditional case mappings (i.e. characters that map to multiple characters in lowercase).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
   private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
     // Latin capital letter I with dot above is mapped to 2 lowercase characters.
     if (codePoint == 0x0130) {
@@ -196,7 +232,15 @@ else if (codePoint == 0x03C2) {
     }
   }
 
-  public static String toLowerCase(final String target) {
+  /**
+   * Converts an entire string to lowercase using ICU rules, code point by code point, with
+   * special handling for conditional case mappings (i.e. characters that map to multiple
+   * characters in lowercase). This method omits information about context-sensitive case mappings.
+   *
+   * @param target The target string to convert to lowercase.
+   * @return The string converted to lowercase in a context-unaware manner.
+   */
+  public static String lowerCaseCodePoints(final String target) {
       StringBuilder sb = new StringBuilder();
     for (int i = 0; i < target.length(); ++i) {
       int codePoint = target.codePointAt(i);
@@ -211,7 +255,15 @@ public static String toTitleCase(final String target, final int collationId) {
     return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale));
   }
 
-  public static String toTitleCase(final String target) {
+  /**
+   * Converts an entire string to titlecase using ICU rules, code point by code point, with
+   * special handling for conditional case mappings (i.e. characters that map to multiple
+   * characters in titlecase). This method omits information about context-sensitive case mappings.
+   *
+   * @param target The target string to convert to titlecase.
+   * @return The string converted to titlecase in a context-unaware manner.
+   */
+  public static String titleCaseCodePoints(final String target) {
       StringBuilder sb = new StringBuilder();
     for (int i = 0; i < target.length(); ++i) {
       int codePoint = target.codePointAt(i);
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 9f9773eaeace3..5c9313875fb1c 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -234,7 +234,7 @@ public CollationIdentifier identifier() {
       null,
       CollationAwareUTF8String::compareLowerCase,
       "1.0",
-      (s) -> (long)CollationAwareUTF8String.toLowerCase(s.toString()).hashCode(),
+      (s) -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
       false,
       false,
       true);
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index fdbb45faf3dfc..c9a7e18e1cdbe 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -31,7 +31,7 @@ public class CollationSupportSuite {
    * Collation-aware UTF8String comparison.
    */
 
-  private void assertCompare(String s1, String s2, String collationName, int expected)
+  private void assertStringCompare(String s1, String s2, String collationName, int expected)
           throws SparkException {
     UTF8String l = UTF8String.fromString(s1);
     UTF8String r = UTF8String.fromString(s2);
@@ -42,68 +42,68 @@ private void assertCompare(String s1, String s2, String collationName, int expec
   @Test
   public void testCompare() throws SparkException {
     // Edge cases
-    assertCompare("", "", "UTF8_BINARY", 0);
-    assertCompare("a", "", "UTF8_BINARY", 1);
-    assertCompare("", "a", "UTF8_BINARY", -1);
-    assertCompare("", "", "UTF8_BINARY_LCASE", 0);
-    assertCompare("a", "", "UTF8_BINARY_LCASE", 1);
-    assertCompare("", "a", "UTF8_BINARY_LCASE", -1);
-    assertCompare("", "", "UNICODE", 0);
-    assertCompare("a", "", "UNICODE", 1);
-    assertCompare("", "a", "UNICODE", -1);
-    assertCompare("", "", "UNICODE_CI", 0);
-    assertCompare("a", "", "UNICODE_CI", 1);
-    assertCompare("", "a", "UNICODE_CI", -1);
+    assertStringCompare("", "", "UTF8_BINARY", 0);
+    assertStringCompare("a", "", "UTF8_BINARY", 1);
+    assertStringCompare("", "a", "UTF8_BINARY", -1);
+    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("", "", "UNICODE", 0);
+    assertStringCompare("a", "", "UNICODE", 1);
+    assertStringCompare("", "a", "UNICODE", -1);
+    assertStringCompare("", "", "UNICODE_CI", 0);
+    assertStringCompare("a", "", "UNICODE_CI", 1);
+    assertStringCompare("", "a", "UNICODE_CI", -1);
     // Basic tests
-    assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
-    assertCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
-    assertCompare("AbcD", "aBCd", "UNICODE", 1);
-    assertCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
     // Accent variation
-    assertCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
-    assertCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("äBCd", "ÄBCD", "UNICODE", -1);
-    assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
     // Case-variable character length
-    assertCompare("i\u0307", "İ", "UTF8_BINARY", -1);
-    assertCompare("İ", "i\u0307", "UTF8_BINARY", 1);
-    assertCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
-    assertCompare("i\u0307", "İ", "UNICODE", -1);
-    assertCompare("İ", "i\u0307", "UNICODE", 1);
-    assertCompare("i\u0307", "İ", "UNICODE_CI", 0);
-    assertCompare("İ", "i\u0307", "UNICODE_CI", 0);
-    assertCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
-    assertCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
-    assertCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
-    assertCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
-    assertCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
-    assertCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
     // Conditional case mapping
-    assertCompare("ς", "σ", "UTF8_BINARY", -1);
-    assertCompare("ς", "Σ", "UTF8_BINARY", 1);
-    assertCompare("σ", "Σ", "UTF8_BINARY", 1);
-    assertCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
-    assertCompare("ς", "σ", "UNICODE", 1);
-    assertCompare("ς", "Σ", "UNICODE", 1);
-    assertCompare("σ", "Σ", "UNICODE", -1);
-    assertCompare("ς", "σ", "UNICODE_CI", 0);
-    assertCompare("ς", "Σ", "UNICODE_CI", 0);
-    assertCompare("σ", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
   }
 
-  private void assertLowercase(String target, String expected, String collationName)
+  private void assertLcaseCompare(String target, String expected, String collationName)
       throws SparkException {
     if (collationName.equals("UTF8_BINARY")) {
       UTF8String targetUTF8 = UTF8String.fromString(target);
       UTF8String expectedUTF8 = UTF8String.fromString(expected);
       assertEquals(expectedUTF8, targetUTF8.toLowerCase());
     } else if (collationName.equals("UTF8_BINARY_LCASE")) {
-      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target));
+      assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target));
     } else {
       int collationId = CollationFactory.collationNameToId(collationName);
       assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId));
@@ -111,50 +111,50 @@ private void assertLowercase(String target, String expected, String collationNam
   }
 
   @Test
-  public void testLowercase() throws SparkException {
+  public void testLcaseCompare() throws SparkException {
     // Edge cases
-    assertLowercase("", "", "UTF8_BINARY");
-    assertLowercase("", "", "UTF8_BINARY_LCASE");
-    assertLowercase("", "", "UNICODE");
-    assertLowercase("", "", "UNICODE_CI");
+    assertLcaseCompare("", "", "UTF8_BINARY");
+    assertLcaseCompare("", "", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("", "", "UNICODE");
+    assertLcaseCompare("", "", "UNICODE_CI");
     // Basic tests
-    assertLowercase("abcd", "abcd", "UTF8_BINARY");
-    assertLowercase("AbCd", "abcd", "UTF8_BINARY");
-    assertLowercase("abcd", "abcd", "UTF8_BINARY_LCASE");
-    assertLowercase("aBcD", "abcd", "UTF8_BINARY_LCASE");
-    assertLowercase("abcd", "abcd", "UNICODE");
-    assertLowercase("aBCd", "abcd", "UNICODE");
-    assertLowercase("abcd", "abcd", "UNICODE_CI");
-    assertLowercase("AbcD", "abcd", "UNICODE_CI");
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE");
+    assertLcaseCompare("aBCd", "abcd", "UNICODE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE_CI");
+    assertLcaseCompare("AbcD", "abcd", "UNICODE_CI");
     // Accent variation
-    assertLowercase("AbĆd", "abćd", "UTF8_BINARY");
-    assertLowercase("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
-    assertLowercase("ÄbcD", "äbcd", "UNICODE");
-    assertLowercase("aB́Cd", "ab́cd", "UNICODE_CI");
+    assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY");
+    assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("ÄbcD", "äbcd", "UNICODE");
+    assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI");
     // Case-variable character length
-    assertLowercase("İoDiNe", "i̇odine", "UTF8_BINARY");
-    assertLowercase("Abi̇o12", "abi̇o12", "UTF8_BINARY");
-    assertLowercase("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
-    assertLowercase("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
-    assertLowercase("İoDinE", "i̇odine", "UNICODE");
-    assertLowercase("abi̇O12", "abi̇o12", "UNICODE");
-    assertLowercase("İodINe", "i̇odine", "UNICODE_CI");
-    assertLowercase("ABi̇o12", "abi̇o12", "UNICODE_CI");
+    assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY");
+    assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY");
+    assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("İoDinE", "i̇odine", "UNICODE");
+    assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE");
+    assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI");
+    assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI");
     // Conditional case mapping
-    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
-    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI
-    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
-    assertLowercase("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");
   }
 
-  private void assertUppercase(String target, String expected, String collationName)
+  private void assertUcaseCompare(String target, String expected, String collationName)
       throws SparkException {
     if (collationName.equals("UTF8_BINARY")) {
       UTF8String targetUTF8 = UTF8String.fromString(target);
       UTF8String expectedUTF8 = UTF8String.fromString(expected);
       assertEquals(expectedUTF8, targetUTF8.toUpperCase());
     } else if (collationName.equals("UTF8_BINARY_LCASE")) {
-      assertEquals(expected, CollationAwareUTF8String.toUpperCase(target));
+      assertEquals(expected, CollationAwareUTF8String.upperCaseCodePoints(target));
     } else {
       int collationId = CollationFactory.collationNameToId(collationName);
       assertEquals(expected, CollationAwareUTF8String.toUpperCase(target, collationId));
@@ -164,52 +164,52 @@ private void assertUppercase(String target, String expected, String collationNam
   @Test
   public void testUppercase() throws SparkException {
     // Edge cases
-    assertUppercase("", "", "UTF8_BINARY");
-    assertUppercase("", "", "UTF8_BINARY_LCASE");
-    assertUppercase("", "", "UNICODE");
-    assertUppercase("", "", "UNICODE_CI");
+    assertUcaseCompare("", "", "UTF8_BINARY");
+    assertUcaseCompare("", "", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("", "", "UNICODE");
+    assertUcaseCompare("", "", "UNICODE_CI");
     // Basic tests
-    assertUppercase("abcd", "ABCD", "UTF8_BINARY");
-    assertUppercase("AbCd", "ABCD", "UTF8_BINARY");
-    assertUppercase("abcd", "ABCD", "UTF8_BINARY_LCASE");
-    assertUppercase("aBcD", "ABCD", "UTF8_BINARY_LCASE");
-    assertUppercase("abcd", "ABCD", "UNICODE");
-    assertUppercase("aBCd", "ABCD", "UNICODE");
-    assertUppercase("abcd", "ABCD", "UNICODE_CI");
-    assertUppercase("AbcD", "ABCD", "UNICODE_CI");
+    assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY");
+    assertUcaseCompare("AbCd", "ABCD", "UTF8_BINARY");
+    assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("aBcD", "ABCD", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("abcd", "ABCD", "UNICODE");
+    assertUcaseCompare("aBCd", "ABCD", "UNICODE");
+    assertUcaseCompare("abcd", "ABCD", "UNICODE_CI");
+    assertUcaseCompare("AbcD", "ABCD", "UNICODE_CI");
     // Accent variation
-    assertUppercase("aBćD", "ABĆD", "UTF8_BINARY");
-    assertUppercase("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE");
-    assertUppercase("äBCd", "ÄBCD", "UNICODE");
-    assertUppercase("Ab́cD", "AB́CD", "UNICODE_CI");
+    assertUcaseCompare("aBćD", "ABĆD", "UTF8_BINARY");
+    assertUcaseCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("äBCd", "ÄBCD", "UNICODE");
+    assertUcaseCompare("Ab́cD", "AB́CD", "UNICODE_CI");
     // Case-variable character length
-    assertUppercase("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY");
-    assertUppercase("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY");
-    assertUppercase("i̇odInE", "İODINE", "UTF8_BINARY_LCASE");
-    assertUppercase("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE");
-    assertUppercase("i̇oDinE", "I\u0307ODINE", "UNICODE");
-    assertUppercase("abi̇O12", "ABI\u0307O12", "UNICODE");
-    assertUppercase("i̇odINe", "I\u0307ODINE", "UNICODE_CI");
-    assertUppercase("ABi̇o12", "ABI\u0307O12", "UNICODE_CI");
+    assertUcaseCompare("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY");
+    assertUcaseCompare("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY");
+    assertUcaseCompare("i̇odInE", "İODINE", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("i̇oDinE", "I\u0307ODINE", "UNICODE");
+    assertUcaseCompare("abi̇O12", "ABI\u0307O12", "UNICODE");
+    assertUcaseCompare("i̇odINe", "I\u0307ODINE", "UNICODE_CI");
+    assertUcaseCompare("ABi̇o12", "ABI\u0307O12", "UNICODE_CI");
     // Conditional case mapping
-    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
-    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
-    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
-    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
-    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
-    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
-    assertUppercase("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
-    assertUppercase("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
+    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
+    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
+    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
+    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
+    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
+    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
+    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
   }
 
-  private void assertTitlecase(String target, String expected, String collationName)
+  private void assertTcaseCompare(String target, String expected, String collationName)
       throws SparkException {
     if (collationName.equals("UTF8_BINARY")) {
       UTF8String targetUTF8 = UTF8String.fromString(target);
       UTF8String expectedUTF8 = UTF8String.fromString(expected);
       assertEquals(expectedUTF8, targetUTF8.toTitleCase());
     } else if (collationName.equals("UTF8_BINARY_LCASE")) {
-      assertEquals(expected, CollationAwareUTF8String.toTitleCase(target));
+      assertEquals(expected, CollationAwareUTF8String.titleCaseCodePoints(target));
     } else {
       int collationId = CollationFactory.collationNameToId(collationName);
       assertEquals(expected, CollationAwareUTF8String.toTitleCase(target, collationId));
@@ -219,42 +219,42 @@ private void assertTitlecase(String target, String expected, String collationNam
   @Test
   public void testTitlecase() throws SparkException {
     // Edge cases
-    assertTitlecase("", "", "UTF8_BINARY");
-    assertTitlecase("", "", "UTF8_BINARY_LCASE");
-    assertTitlecase("", "", "UNICODE");
-    assertTitlecase("", "", "UNICODE_CI");
+    assertTcaseCompare("", "", "UTF8_BINARY");
+    assertTcaseCompare("", "", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("", "", "UNICODE");
+    assertTcaseCompare("", "", "UNICODE_CI");
     // Basic tests
-    assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY");
-    assertTitlecase("Ab Cd", "Ab Cd", "UTF8_BINARY");
-    assertTitlecase("ab cd", "Ab Cd", "UTF8_BINARY_LCASE");
-    assertTitlecase("aB cD", "Ab Cd", "UTF8_BINARY_LCASE");
-    assertTitlecase("ab cd", "Ab Cd", "UNICODE");
-    assertTitlecase("aB Cd", "Ab Cd", "UNICODE");
-    assertTitlecase("ab cd", "Ab Cd", "UNICODE_CI");
-    assertTitlecase("Ab cD", "Ab Cd", "UNICODE_CI");
+    assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY");
+    assertTcaseCompare("Ab Cd", "Ab Cd", "UTF8_BINARY");
+    assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("aB cD", "Ab Cd", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("ab cd", "Ab Cd", "UNICODE");
+    assertTcaseCompare("aB Cd", "Ab Cd", "UNICODE");
+    assertTcaseCompare("ab cd", "Ab Cd", "UNICODE_CI");
+    assertTcaseCompare("Ab cD", "Ab Cd", "UNICODE_CI");
     // Accent variation
-    assertTitlecase("aB ćD", "AB ĆD", "UTF8_BINARY");
-    assertTitlecase("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE");
-    assertTitlecase("äB Cd", "Äb Cd", "UNICODE");
-    assertTitlecase("A b́cD", "A B́cd", "UNICODE_CI");
+    assertTcaseCompare("aB ćD", "AB ĆD", "UTF8_BINARY");
+    assertTcaseCompare("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("äB Cd", "Äb Cd", "UNICODE");
+    assertTcaseCompare("A b́cD", "A B́cd", "UNICODE_CI");
     // Case-variable character length
-    assertTitlecase("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY");
-    assertTitlecase("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY");
-    assertTitlecase("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE");
-    assertTitlecase("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE");
-    assertTitlecase("i̇oDinE", "I\u0307odine", "UNICODE");
-    assertTitlecase("abi̇O12", "Abi̇o12", "UNICODE");
-    assertTitlecase("i̇odINe", "I\u0307odine", "UNICODE_CI");
-    assertTitlecase("ABi̇o12", "Abi\u0307o12", "UNICODE_CI");
+    assertTcaseCompare("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY");
+    assertTcaseCompare("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY");
+    assertTcaseCompare("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("i̇oDinE", "I\u0307odine", "UNICODE");
+    assertTcaseCompare("abi̇O12", "Abi̇o12", "UNICODE");
+    assertTcaseCompare("i̇odINe", "I\u0307odine", "UNICODE_CI");
+    assertTcaseCompare("ABi̇o12", "Abi\u0307o12", "UNICODE_CI");
     // Conditional case mapping
-    assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY");
-    assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY");
-    assertTitlecase("a ς c", "A Σ C", "UTF8_BINARY_LCASE");
-    assertTitlecase("a σ c", "A Σ C", "UTF8_BINARY_LCASE");
-    assertTitlecase("a ς c", "A Σ C", "UNICODE");
-    assertTitlecase("a σ c", "A Σ C", "UNICODE");
-    assertTitlecase("a ς c", "A Σ C", "UNICODE_CI");
-    assertTitlecase("a σ c", "A Σ C", "UNICODE_CI");
+    assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY");
+    assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY");
+    assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY_LCASE");
+    assertTcaseCompare("a ς c", "A Σ C", "UNICODE");
+    assertTcaseCompare("a σ c", "A Σ C", "UNICODE");
+    assertTcaseCompare("a ς c", "A Σ C", "UNICODE_CI");
+    assertTcaseCompare("a σ c", "A Σ C", "UNICODE_CI");
   }
 
   /**

From f3e23a1ebc8d1f2d411c3f144ff05c400130949b Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Thu, 23 May 2024 17:22:37 +0200
Subject: [PATCH 05/14] Remove UCASE and TCASE code points

---
 .../util/CollationAwareUTF8String.java        |  69 -----------
 .../unsafe/types/CollationSupportSuite.java   | 110 ------------------
 2 files changed, 179 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 66632457b328b..d75bcd9656a4d 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -156,54 +156,6 @@ public static String toUpperCase(final String target, final int collationId) {
     return UCharacter.toUpperCase(locale, target);
   }
 
-  /**
-   * Converts a single code point to uppercase using ICU rules, with special handling for
-   * conditional case mappings (i.e. characters that map to multiple characters in uppercase).
-   *
-   * @param codePoint The code point to convert to uppercase.
-   * @param sb The StringBuilder to append the uppercase character to.
-   * @param i The index of the code point in the target string.
-   * @param target The target string to convert to uppercase.
-   * @return The number of characters consumed by the code point.
-   */
-  private static int uppercaseCodePoint(final int codePoint, final StringBuilder sb, final int i,
-      final String target) {
-    // Latin small letter i with an additional dot is represented using 2 characters.
-    if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) {
-      sb.append("İ");
-      return 1;
-    }
-    // TODO: Add special handling for other chars that map to multiple characters in uppercase.
-    // All other characters should follow context-unaware ICU single-code point case mapping.
-    sb.appendCodePoint(UCharacter.toTitleCase(codePoint));
-    return 0;
-  }
-
-  /**
-   * Converts an entire string to uppercase using ICU rules, code point by code point, with
-   * special handling for conditional case mappings (i.e. characters that map to multiple
-   * characters in uppercase). This method omits information about context-sensitive case mappings.
-   *
-   * @param target The target string to convert to uppercase.
-   * @return The string converted to uppercase in a context-unaware manner.
-   */
-  public static String upperCaseCodePoints(final String target) {
-      StringBuilder sb = new StringBuilder();
-    for (int i = 0; i < target.length(); ++i) {
-      int codePoint = target.codePointAt(i);
-      // Latin small letter i with an additional dot above (represented using 2 characters).
-      if (codePoint == 0x0069 && i + 1 < target.length() && target.codePointAt(i + 1) == 0x0307) {
-        sb.append("İ");
-        ++i;
-      }
-      // All other characters should follow context-unaware ICU single-code point case mapping.
-      else {
-        sb.appendCodePoint(UCharacter.toUpperCase(codePoint));
-      }
-    }
-    return sb.toString();
-  }
-
   public static String toLowerCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
@@ -255,27 +207,6 @@ public static String toTitleCase(final String target, final int collationId) {
     return UCharacter.toTitleCase(locale, target, BreakIterator.getWordInstance(locale));
   }
 
-  /**
-   * Converts an entire string to titlecase using ICU rules, code point by code point, with
-   * special handling for conditional case mappings (i.e. characters that map to multiple
-   * characters in lowercase). This method omits information about context-sensitive case mappings.
-   *
-   * @param target The target string to convert to lowercase.
-   * @return The string converted to lowercase in a context-unaware manner.
-   */
-  public static String titleCaseCodePoints(final String target) {
-      StringBuilder sb = new StringBuilder();
-    for (int i = 0; i < target.length(); ++i) {
-      int codePoint = target.codePointAt(i);
-      if (i == 0 || Character.isWhitespace(target.codePointBefore(i))) {
-        i += uppercaseCodePoint(codePoint, sb, i, target);
-      } else {
-        lowercaseCodePoint(codePoint, sb);
-      }
-    }
-    return sb.toString();
-  }
-
   public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
     if (match.contains(UTF8String.fromString(","))) {
       return 0;
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index c9a7e18e1cdbe..c079427ce5f6a 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -147,116 +147,6 @@ public void testLcaseCompare() throws SparkException {
     assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");
   }
 
-  private void assertUcaseCompare(String target, String expected, String collationName)
-      throws SparkException {
-    if (collationName.equals("UTF8_BINARY")) {
-      UTF8String targetUTF8 = UTF8String.fromString(target);
-      UTF8String expectedUTF8 = UTF8String.fromString(expected);
-      assertEquals(expectedUTF8, targetUTF8.toUpperCase());
-    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
-      assertEquals(expected, CollationAwareUTF8String.upperCaseCodePoints(target));
-    } else {
-      int collationId = CollationFactory.collationNameToId(collationName);
-      assertEquals(expected, CollationAwareUTF8String.toUpperCase(target, collationId));
-    }
-  }
-
-  @Test
-  public void testUppercase() throws SparkException {
-    // Edge cases
-    assertUcaseCompare("", "", "UTF8_BINARY");
-    assertUcaseCompare("", "", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("", "", "UNICODE");
-    assertUcaseCompare("", "", "UNICODE_CI");
-    // Basic tests
-    assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY");
-    assertUcaseCompare("AbCd", "ABCD", "UTF8_BINARY");
-    assertUcaseCompare("abcd", "ABCD", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("aBcD", "ABCD", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("abcd", "ABCD", "UNICODE");
-    assertUcaseCompare("aBCd", "ABCD", "UNICODE");
-    assertUcaseCompare("abcd", "ABCD", "UNICODE_CI");
-    assertUcaseCompare("AbcD", "ABCD", "UNICODE_CI");
-    // Accent variation
-    assertUcaseCompare("aBćD", "ABĆD", "UTF8_BINARY");
-    assertUcaseCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("äBCd", "ÄBCD", "UNICODE");
-    assertUcaseCompare("Ab́cD", "AB́CD", "UNICODE_CI");
-    // Case-variable character length
-    assertUcaseCompare("i\u0307oDiNe", "I\u0307ODINE", "UTF8_BINARY");
-    assertUcaseCompare("Abi\u0307o12", "ABI\u0307O12", "UTF8_BINARY");
-    assertUcaseCompare("i̇odInE", "İODINE", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("aBi̇o12", "ABİO12", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("i̇oDinE", "I\u0307ODINE", "UNICODE");
-    assertUcaseCompare("abi̇O12", "ABI\u0307O12", "UNICODE");
-    assertUcaseCompare("i̇odINe", "I\u0307ODINE", "UNICODE_CI");
-    assertUcaseCompare("ABi̇o12", "ABI\u0307O12", "UNICODE_CI");
-    // Conditional case mapping
-    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
-    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY");
-    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UTF8_BINARY_LCASE");
-    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
-    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE");
-    assertUcaseCompare("θαλασσινος", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
-    assertUcaseCompare("θαλασσινοσ", "ΘΑΛΑΣΣΙΝΟΣ", "UNICODE_CI");
-  }
-
-  private void assertTcaseCompare(String target, String expected, String collationName)
-      throws SparkException {
-    if (collationName.equals("UTF8_BINARY")) {
-      UTF8String targetUTF8 = UTF8String.fromString(target);
-      UTF8String expectedUTF8 = UTF8String.fromString(expected);
-      assertEquals(expectedUTF8, targetUTF8.toTitleCase());
-    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
-      assertEquals(expected, CollationAwareUTF8String.titleCaseCodePoints(target));
-    } else {
-      int collationId = CollationFactory.collationNameToId(collationName);
-      assertEquals(expected, CollationAwareUTF8String.toTitleCase(target, collationId));
-    }
-  }
-
-  @Test
-  public void testTitlecase() throws SparkException {
-    // Edge cases
-    assertTcaseCompare("", "", "UTF8_BINARY");
-    assertTcaseCompare("", "", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("", "", "UNICODE");
-    assertTcaseCompare("", "", "UNICODE_CI");
-    // Basic tests
-    assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY");
-    assertTcaseCompare("Ab Cd", "Ab Cd", "UTF8_BINARY");
-    assertTcaseCompare("ab cd", "Ab Cd", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("aB cD", "Ab Cd", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("ab cd", "Ab Cd", "UNICODE");
-    assertTcaseCompare("aB Cd", "Ab Cd", "UNICODE");
-    assertTcaseCompare("ab cd", "Ab Cd", "UNICODE_CI");
-    assertTcaseCompare("Ab cD", "Ab Cd", "UNICODE_CI");
-    // Accent variation
-    assertTcaseCompare("aB ćD", "AB ĆD", "UTF8_BINARY");
-    assertTcaseCompare("AbC δ", "Abc Δ", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("äB Cd", "Äb Cd", "UNICODE");
-    assertTcaseCompare("A b́cD", "A B́cd", "UNICODE_CI");
-    // Case-variable character length
-    assertTcaseCompare("i\u0307oDiNe", "I\u0307oDiNe", "UTF8_BINARY");
-    assertTcaseCompare("Abi\u0307o12", "Abi\u0307o12", "UTF8_BINARY");
-    assertTcaseCompare("i̇od i̇nE", "İod İne", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("aBi̇o12", "Abi\u0307o12", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("i̇oDinE", "I\u0307odine", "UNICODE");
-    assertTcaseCompare("abi̇O12", "Abi̇o12", "UNICODE");
-    assertTcaseCompare("i̇odINe", "I\u0307odine", "UNICODE_CI");
-    assertTcaseCompare("ABi̇o12", "Abi\u0307o12", "UNICODE_CI");
-    // Conditional case mapping
-    assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY");
-    assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY");
-    assertTcaseCompare("a ς c", "A Σ C", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("a σ c", "A Σ C", "UTF8_BINARY_LCASE");
-    assertTcaseCompare("a ς c", "A Σ C", "UNICODE");
-    assertTcaseCompare("a σ c", "A Σ C", "UNICODE");
-    assertTcaseCompare("a ς c", "A Σ C", "UNICODE_CI");
-    assertTcaseCompare("a σ c", "A Σ C", "UNICODE_CI");
-  }
-
   /**
    * Collation-aware string expressions.
    */

From f6e2dd2357453725572e2ee2a47011d7684936f2 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 24 May 2024 14:04:53 +0200
Subject: [PATCH 06/14] Update doc comments

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index d75bcd9656a4d..9e8a2eb586b16 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -39,7 +39,7 @@ public class CollationAwareUTF8String {
    * Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default
    * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
    * method uses code points to compare the strings in a case-insensitive manner using ICU rules,
-   * as well as handling special rules for conditional case mappings (see: lowerCaseCodePoints).
+   * as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints).
    *
    * @param left The first UTF8String to compare.
    * @param right The second UTF8String to compare.
@@ -164,7 +164,7 @@ public static String toLowerCase(final String target, final int collationId) {
 
   /**
    * Converts a single code point to lowercase using ICU rules, with special handling for
-   * conditional case mappings (i.e. characters that map to multiple characters in lowercase).
+   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase).
    *
    * @param codePoint The code point to convert to lowercase.
    * @param sb The StringBuilder to append the lowercase character to.
@@ -186,7 +186,7 @@ else if (codePoint == 0x03C2) {
 
   /**
    * Converts an entire string to lowercase using ICU rules, code point by code point, with
-   * special handling for conditional case mappings (i.e. characters that map to multiple
+   * special handling for one-to-many case mappings (i.e. characters that map to multiple
    * characters in lowercase). This method omits information about context-sensitive case mappings.
    *
    * @param target The target string to convert to lowercase.

From 569a67df816d8e126afba4e4fb01d260a5bfce5e Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 24 May 2024 14:12:21 +0200
Subject: [PATCH 07/14] Small fixes

---
 .../catalyst/util/CollationAwareUTF8String.java  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 9e8a2eb586b16..ee79d7380a0fa 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -170,16 +170,17 @@ public static String toLowerCase(final String target, final int collationId) {
    * @param sb The StringBuilder to append the lowercase character to.
    */
   private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
-    // Latin capital letter I with dot above is mapped to 2 lowercase characters.
     if (codePoint == 0x0130) {
-      sb.append("i̇");
+      // Latin capital letter I with dot above is mapped to 2 lowercase characters.
+      sb.appendCodePoint(0x0069);
+      sb.appendCodePoint(0x0307);
     }
-    // Greek final and non-final capital letter sigma should be mapped the same.
     else if (codePoint == 0x03C2) {
-      sb.append("σ");
+      // Greek small final sigma (U+03C2) is mapped like small non-final sigma (U+03C3).
+      sb.appendCodePoint(0x03C3);
     }
-    // All other characters should follow context-unaware ICU single-code point case mapping.
     else {
+      // All other characters should follow context-unaware ICU single-code point case mapping.
       sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
     }
   }
@@ -193,10 +194,9 @@ else if (codePoint == 0x03C2) {
    * @return The string converted to lowercase in a context-unaware manner.
    */
   public static String lowerCaseCodePoints(final String target) {
-      StringBuilder sb = new StringBuilder();
+    StringBuilder sb = new StringBuilder();
     for (int i = 0; i < target.length(); ++i) {
-      int codePoint = target.codePointAt(i);
-      lowercaseCodePoint(codePoint, sb);
+      lowercaseCodePoint(target.codePointAt(i), sb);
     }
     return sb.toString();
   }

From 021e53ccac4e2f58919b63ac11657f3b3359fcdc Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 10:25:42 +0200
Subject: [PATCH 08/14] Small fixes

---
 .../util/CollationAwareUTF8String.java        |  2 +-
 .../sql/catalyst/util/CollationFactory.java   |  2 +-
 .../unsafe/types/CollationSupportSuite.java   | 56 ++++++-------------
 3 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ee79d7380a0fa..7cdf998fa4ec9 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -45,7 +45,7 @@ public class CollationAwareUTF8String {
    * @param right The second UTF8String to compare.
    * @return An integer representing the comparison result.
    */
-  public static int compareLowerCase(final UTF8String left, final UTF8String right) {
+  public static int lowercaseCompare(final UTF8String left, final UTF8String right) {
     return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString()));
   }
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 5c9313875fb1c..e445a8c228910 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -232,7 +232,7 @@ public CollationIdentifier identifier() {
       "UTF8_BINARY_LCASE",
       PROVIDER_SPARK,
       null,
-      CollationAwareUTF8String::compareLowerCase,
+      CollationAwareUTF8String::lowercaseCompare,
       "1.0",
       (s) -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
       false,
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index c079427ce5f6a..008d1c47fb28e 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -96,55 +96,35 @@ public void testCompare() throws SparkException {
     assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
   }
 
-  private void assertLcaseCompare(String target, String expected, String collationName)
-      throws SparkException {
-    if (collationName.equals("UTF8_BINARY")) {
-      UTF8String targetUTF8 = UTF8String.fromString(target);
-      UTF8String expectedUTF8 = UTF8String.fromString(expected);
-      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
-    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+  private void assertLowerCaseCodePoints(String target, String expected, Boolean useCodePoints) {
+    if (useCodePoints) {
       assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target));
     } else {
-      int collationId = CollationFactory.collationNameToId(collationName);
-      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId));
+      assertEquals(UTF8String.fromString(expected), UTF8String.fromString(target).toLowerCase());
     }
   }
 
   @Test
-  public void testLcaseCompare() throws SparkException {
+  public void testLowerCaseCodePoints() {
     // Edge cases
-    assertLcaseCompare("", "", "UTF8_BINARY");
-    assertLcaseCompare("", "", "UTF8_BINARY_LCASE");
-    assertLcaseCompare("", "", "UNICODE");
-    assertLcaseCompare("", "", "UNICODE_CI");
+    assertLowerCaseCodePoints("", "", false);
+    assertLowerCaseCodePoints("", "", true);
     // Basic tests
-    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY");
-    assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY");
-    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE");
-    assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE");
-    assertLcaseCompare("abcd", "abcd", "UNICODE");
-    assertLcaseCompare("aBCd", "abcd", "UNICODE");
-    assertLcaseCompare("abcd", "abcd", "UNICODE_CI");
-    assertLcaseCompare("AbcD", "abcd", "UNICODE_CI");
+    assertLowerCaseCodePoints("abcd", "abcd", false);
+    assertLowerCaseCodePoints("AbCd", "abcd", false);
+    assertLowerCaseCodePoints("abcd", "abcd", true);
+    assertLowerCaseCodePoints("aBcD", "abcd", true);
     // Accent variation
-    assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY");
-    assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
-    assertLcaseCompare("ÄbcD", "äbcd", "UNICODE");
-    assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI");
+    assertLowerCaseCodePoints("AbĆd", "abćd", false);
+    assertLowerCaseCodePoints("aBcΔ", "abcδ", true);
     // Case-variable character length
-    assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY");
-    assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY");
-    assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
-    assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
-    assertLcaseCompare("İoDinE", "i̇odine", "UNICODE");
-    assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE");
-    assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI");
-    assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI");
+    assertLowerCaseCodePoints("İoDiNe", "i̇odine", false);
+    assertLowerCaseCodePoints("Abi̇o12", "abi̇o12", false);
+    assertLowerCaseCodePoints("İodInE", "i̇odine", true);
+    assertLowerCaseCodePoints("aBi̇o12", "abi̇o12", true);
     // Conditional case mapping
-    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
-    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI
-    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
-    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");
+    assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", false);
+    assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", true);
   }
 
   /**

From 220091c06647bd91a2f16fd90ebcefd7fa3dc0cf Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 12:59:14 +0200
Subject: [PATCH 09/14] Surrogate pair tests

---
 .../unsafe/types/CollationSupportSuite.java   | 47 ++++++++++++-------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 008d1c47fb28e..2d0dcd275bf9d 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -96,35 +96,48 @@ public void testCompare() throws SparkException {
     assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
   }
 
-  private void assertLowerCaseCodePoints(String target, String expected, Boolean useCodePoints) {
+  private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, Boolean useCodePoints) {
     if (useCodePoints) {
-      assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target));
+      assertEquals(expected.toString(), CollationAwareUTF8String.lowerCaseCodePoints(target.toString()));
     } else {
-      assertEquals(UTF8String.fromString(expected), UTF8String.fromString(target).toLowerCase());
+      assertEquals(expected, target.toLowerCase());
     }
   }
 
   @Test
   public void testLowerCaseCodePoints() {
     // Edge cases
-    assertLowerCaseCodePoints("", "", false);
-    assertLowerCaseCodePoints("", "", true);
+    assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false);
+    assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true);
     // Basic tests
-    assertLowerCaseCodePoints("abcd", "abcd", false);
-    assertLowerCaseCodePoints("AbCd", "abcd", false);
-    assertLowerCaseCodePoints("abcd", "abcd", true);
-    assertLowerCaseCodePoints("aBcD", "abcd", true);
+    assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true);
+    assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true);
     // Accent variation
-    assertLowerCaseCodePoints("AbĆd", "abćd", false);
-    assertLowerCaseCodePoints("aBcΔ", "abcδ", true);
+    assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), UTF8String.fromString("abcδ"), true);
     // Case-variable character length
-    assertLowerCaseCodePoints("İoDiNe", "i̇odine", false);
-    assertLowerCaseCodePoints("Abi̇o12", "abi̇o12", false);
-    assertLowerCaseCodePoints("İodInE", "i̇odine", true);
-    assertLowerCaseCodePoints("aBi̇o12", "abi̇o12", true);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true);
     // Conditional case mapping
-    assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", false);
-    assertLowerCaseCodePoints("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", true);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true);
+    // Surrogate pairs are treated as invalid UTF8 sequences
+    assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
+      {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}),
+      UTF8String.fromString("\ufffd\ufffd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
+      {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}),
+      UTF8String.fromString("\ufffd\ufffd"), true);
   }
 
   /**

From 0d4da587ef900fed8b1a6d3d503646ffe5227651 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 15:13:00 +0200
Subject: [PATCH 10/14] Fix Java lint

---
 .../apache/spark/unsafe/types/CollationSupportSuite.java    | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 2d0dcd275bf9d..e02af65f86b10 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -96,9 +96,11 @@ public void testCompare() throws SparkException {
     assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
   }
 
-  private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, Boolean useCodePoints) {
+  private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected,
+      Boolean useCodePoints) {
     if (useCodePoints) {
-      assertEquals(expected.toString(), CollationAwareUTF8String.lowerCaseCodePoints(target.toString()));
+      assertEquals(expected.toString(),
+        CollationAwareUTF8String.lowerCaseCodePoints(target.toString()));
     } else {
       assertEquals(expected, target.toLowerCase());
     }

From 5ac1e20901bb5c05efa0734605fedc332ce9eb64 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 28 May 2024 19:57:51 +0200
Subject: [PATCH 11/14] Update CollationFactory.java

---
 .../org/apache/spark/sql/catalyst/util/CollationFactory.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 5fa8c3346e141..78d93cd957c57 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -414,7 +414,7 @@ protected Collation buildCollation() {
             null,
             CollationAwareUTF8String::lowercaseCompare,
             "1.0",
-            (s) -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
+            s -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
             /* supportsBinaryEquality = */ false,
             /* supportsBinaryOrdering = */ false,
             /* supportsLowercaseEquality = */ true);

From 1348f9ce80c535539e8bf5ee5f0036792790ad0e Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 28 May 2024 19:58:20 +0200
Subject: [PATCH 12/14] Update CollationFactory.java

---
 .../org/apache/spark/sql/catalyst/util/CollationFactory.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 78d93cd957c57..0b520d87143b2 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -414,7 +414,7 @@ protected Collation buildCollation() {
             null,
             CollationAwareUTF8String::lowercaseCompare,
             "1.0",
-            s -> (long)CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
+            s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
             /* supportsBinaryEquality = */ false,
             /* supportsBinaryOrdering = */ false,
             /* supportsLowercaseEquality = */ true);

From 494add7f7251afecbcb862035cfff329c80b3772 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:48:08 +0200
Subject: [PATCH 13/14] Fixes

---
 .../util/CollationAwareUTF8String.java        | 36 ++++++++++-
 .../sql/catalyst/util/CollationFactory.java   |  2 +-
 .../apache/spark/unsafe/types/UTF8String.java | 30 +--------
 .../unsafe/types/CollationSupportSuite.java   | 63 ++++++++++++++-----
 .../spark/unsafe/types/UTF8StringSuite.java   | 23 -------
 5 files changed, 86 insertions(+), 68 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 7b1b21bfca322..84baf18154417 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -193,7 +193,41 @@ private static int lowercaseRFind(
    * @param right The second UTF8String to compare.
    * @return An integer representing the comparison result.
    */
-  public static int lowercaseCompare(final UTF8String left, final UTF8String right) {
+  public static int compareLowerCase(final UTF8String left, final UTF8String right) {
+    // Only if both strings are ASCII, we can use faster comparison (no string allocations).
+    if (left.isFullAscii() && right.isFullAscii()) {
+      return compareLowerCaseAscii(left, right);
+    }
+    return compareLowerCaseSlow(left, right);
+  }
+
+  /**
+   * Fast version of the `compareLowerCase` method, used when both arguments are ASCII strings.
+   *
+   * @param left The first ASCII UTF8String to compare.
+   * @param right The second ASCII UTF8String to compare.
+   * @return An integer representing the comparison result.
+   */
+  private static int compareLowerCaseAscii(final UTF8String left, final UTF8String right) {
+    int leftBytes = left.numBytes(), rightBytes = right.numBytes();
+    for (int curr = 0; curr < leftBytes && curr < rightBytes; curr++) {
+      int lowerLeftByte = Character.toLowerCase(left.getByte(curr));
+      int lowerRightByte = Character.toLowerCase(right.getByte(curr));
+      if (lowerLeftByte != lowerRightByte) {
+        return lowerLeftByte - lowerRightByte;
+      }
+    }
+    return leftBytes - rightBytes;
+  }
+
+  /**
+   * Slow version of the `compareLowerCase` method, used when at least one argument is non-ASCII.
+   *
+   * @param left The first UTF8String to compare (may contain non-ASCII characters).
+   * @param right The second UTF8String to compare (may contain non-ASCII characters).
+   * @return An integer representing the comparison result.
+   */
+  private static int compareLowerCaseSlow(final UTF8String left, final UTF8String right) {
     return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString()));
   }
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index c2a6887ba27f8..c734826648871 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -412,7 +412,7 @@ protected Collation buildCollation() {
             "UTF8_BINARY_LCASE",
             PROVIDER_SPARK,
             null,
-            CollationAwareUTF8String::lowercaseCompare,
+            CollationAwareUTF8String::compareLowerCase,
             "1.0",
             s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
             /* supportsBinaryEquality = */ false,
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index e28dfa910b59e..c0fa2719e4fe6 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -388,34 +388,6 @@ private UTF8String toUpperCaseSlow() {
     return fromString(toString().toUpperCase());
   }
 
-  /**
-   * Optimized lowercase comparison for UTF8_BINARY_LCASE collation
-   * a.compareLowerCase(b) is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase())
-   */
-  public int compareLowerCase(UTF8String other) {
-    int curr;
-    for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) {
-      byte left, right;
-      if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) {
-        return compareLowerCaseSuffixSlow(other, curr);
-      }
-      int lowerLeft = Character.toLowerCase(left);
-      int lowerRight = Character.toLowerCase(right);
-      if (lowerLeft != lowerRight) {
-        return lowerLeft - lowerRight;
-      }
-    }
-    return numBytes - other.numBytes;
-  }
-
-  private int compareLowerCaseSuffixSlow(UTF8String other, int pref) {
-    UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref,
-      numBytes - pref);
-    UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset + pref,
-      other.numBytes - pref);
-    return suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow());
-  }
-
   /**
    * Returns the lower case of this string
    */
@@ -427,7 +399,7 @@ public UTF8String toLowerCase() {
     return isFullAscii() ? toLowerCaseAscii() : toLowerCaseSlow();
   }
 
-  private boolean isFullAscii() {
+  public boolean isFullAscii() {
     for (var i = 0; i < numBytes; i++) {
       if (getByte(i) < 0) {
         return false;
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 99ffbaf7afb49..25d9836e05361 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -27,6 +27,15 @@
 // checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * A list containing some of the supported collations in Spark. Use this list to iterate over
+   * all the important collation groups (binary, lowercase, icu) for complete unit test coverage.
+   * Note: this list may come in handy when the Spark function result is the same regardless of
+   * the specified collations (as often seen in some pass-through Spark expressions).
+   */
+  private final String[] testSupportedCollations =
+    {"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"};
+
   /**
    * Collation-aware UTF8String comparison.
    */
@@ -41,20 +50,46 @@ private void assertStringCompare(String s1, String s2, String collationName, int
 
   @Test
   public void testCompare() throws SparkException {
-    // Edge cases
-    assertStringCompare("", "", "UTF8_BINARY", 0);
-    assertStringCompare("a", "", "UTF8_BINARY", 1);
-    assertStringCompare("", "a", "UTF8_BINARY", -1);
-    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
-    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
-    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
-    assertStringCompare("", "", "UNICODE", 0);
-    assertStringCompare("a", "", "UNICODE", 1);
-    assertStringCompare("", "a", "UNICODE", -1);
-    assertStringCompare("", "", "UNICODE_CI", 0);
-    assertStringCompare("a", "", "UNICODE_CI", 1);
-    assertStringCompare("", "a", "UNICODE_CI", -1);
-    // Basic tests
+    for (String collationName: testSupportedCollations) {
+      // Edge cases
+      assertStringCompare("", "", collationName, 0);
+      assertStringCompare("a", "", collationName, 1);
+      assertStringCompare("", "a", collationName, -1);
+      // Basic tests
+      assertStringCompare("a", "a", collationName, 0);
+      assertStringCompare("a", "b", collationName, -1);
+      assertStringCompare("b", "a", collationName, 1);
+      assertStringCompare("A", "A", collationName, 0);
+      assertStringCompare("A", "B", collationName, -1);
+      assertStringCompare("B", "A", collationName, 1);
+      assertStringCompare("aa", "a", collationName, 1);
+      assertStringCompare("b", "bb", collationName, -1);
+      assertStringCompare("abc", "a", collationName, 1);
+      assertStringCompare("abc", "b", collationName, -1);
+      assertStringCompare("abc", "ab", collationName, 1);
+      assertStringCompare("abc", "abc", collationName, 0);
+      // ASCII strings
+      assertStringCompare("aaaa", "aaa", collationName, 1);
+      assertStringCompare("hello", "world", collationName, -1);
+      assertStringCompare("Spark", "Spark", collationName, 0);
+      // Non-ASCII strings
+      assertStringCompare("ü", "ü", collationName, 0);
+      assertStringCompare("ü", "", collationName, 1);
+      assertStringCompare("", "ü", collationName, -1);
+      assertStringCompare("äü", "äü", collationName, 0);
+      assertStringCompare("äxx", "äx", collationName, 1);
+      assertStringCompare("a", "ä", collationName, -1);
+    }
+    // Non-ASCII strings
+    assertStringCompare("äü", "bü", "UTF8_BINARY", 1);
+    assertStringCompare("bxx", "bü", "UTF8_BINARY", -1);
+    assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("äü", "bü", "UNICODE", -1);
+    assertStringCompare("bxx", "bü", "UNICODE", 1);
+    assertStringCompare("äü", "bü", "UNICODE_CI", -1);
+    assertStringCompare("bxx", "bü", "UNICODE_CI", 1);
+    // Case variation
     assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
     assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
     assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 0188297fd05a2..d3fe361fce37b 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -107,29 +107,6 @@ public void binaryCompareTo() {
     assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0);
   }
 
-  @Test
-  public void lowercaseComparison() {
-    // SPARK-47693: Test optimized lowercase comparison of UTF8String instances
-    // ASCII
-    assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0);
-    assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0);
-    assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0);
-    assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0);
-    assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0);
-    assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0);
-    assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0);
-    assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0);
-    assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0);
-    // non-ASCII
-    assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0);
-    assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0);
-    assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0);
-    assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0);
-    assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0);
-    assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0);
-    assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) < 0);
-  }
-
   protected static void testUpperandLower(String upper, String lower) {
     UTF8String us = fromString(upper);
     UTF8String ls = fromString(lower);

From 9081478465d111cfe064cb57c16993721e53f5f9 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Thu, 6 Jun 2024 08:44:11 +0200
Subject: [PATCH 14/14] Small fixes

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java  | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 84baf18154417..c778726f12fd2 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -346,7 +346,9 @@ public static String toLowerCase(final String target, final int collationId) {
 
   /**
    * Converts a single code point to lowercase using ICU rules, with special handling for
-   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase).
+   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
+   * context-sensitive case mappings (i.e. characters that map to different characters based on
+   * string context - e.g. the position in the string relative to other characters).
    *
    * @param codePoint The code point to convert to lowercase.
    * @param sb The StringBuilder to append the lowercase character to.
@@ -370,7 +372,8 @@ else if (codePoint == 0x03C2) {
   /**
    * Converts an entire string to lowercase using ICU rules, code point by code point, with
    * special handling for one-to-many case mappings (i.e. characters that map to multiple
-   * characters in lowercase). This method omits information about context-sensitive case mappings.
+   * characters in lowercase). Also, this method ignores context-sensitive case mappings, relying
+   * on the special handling in the `lowercaseCodePoint` method.
    *
    * @param target The target string to convert to lowercase.
    * @return The string converted to lowercase in a context-unaware manner.