From 0209fc4f60f1e14ef962e76ea352c27701f2ae3f Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 22:44:42 +0200
Subject: [PATCH 01/14] Initial commit

---
 .../sql/catalyst/util/CollationSupport.java   | 48 ++++++++++++++-----
 .../expressions/stringExpressions.scala       |  4 +-
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index bea3dc08b4489..42bbab65cda0a 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -534,8 +534,10 @@ public static UTF8String exec(
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(srcString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString);
+      } else {
+        return execLowercase(srcString); // TODO: ICU implementation
       }
     }
     public static UTF8String exec(
@@ -545,8 +547,10 @@ public static UTF8String exec(
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(srcString, trimString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString, trimString);
+      } else {
+        return execLowercase(srcString, trimString); // TODO: ICU implementation
       }
     }
     public static String genCode(
@@ -556,8 +560,10 @@ public static String genCode(
       String expr = "CollationSupport.StringTrim.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s)", srcString);
-      } {
+      } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s)", srcString);
+      } else {
+        return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation
       }
     }
     public static String genCode(
@@ -568,8 +574,10 @@ public static String genCode(
       String expr = "CollationSupport.StringTrim.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s, %s)", srcString, trimString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
+      } else {
+        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation
       }
     }
     public static UTF8String execBinary(
@@ -599,8 +607,10 @@ public static UTF8String exec(
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(srcString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString);
+      } else {
+        return execLowercase(srcString); // TODO: ICU implementation
       }
     }
     public static UTF8String exec(
@@ -610,8 +620,10 @@ public static UTF8String exec(
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(srcString, trimString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString, trimString);
+      } else {
+        return execLowercase(srcString, trimString); // TODO: ICU implementation
       }
     }
     public static String genCode(
@@ -621,8 +633,10 @@ public static String genCode(
       String expr = "CollationSupport.StringTrimLeft.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s)", srcString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s)", srcString);
+      } else {
+        return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation
       }
     }
     public static String genCode(
@@ -633,8 +647,10 @@ public static String genCode(
       String expr = "CollationSupport.StringTrimLeft.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s, %s)", srcString, trimString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
+      } else {
+        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation
       }
     }
     public static UTF8String execBinary(
@@ -664,8 +680,10 @@ public static UTF8String exec(
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(srcString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString);
+      } else {
+        return execLowercase(srcString); // TODO: ICU implementation
       }
     }
     public static UTF8String exec(
@@ -675,8 +693,10 @@ public static UTF8String exec(
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(srcString, trimString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString, trimString);
+      } else {
+        return execLowercase(srcString, trimString); // TODO: ICU implementation
       }
     }
     public static String genCode(
@@ -686,8 +706,10 @@ public static String genCode(
       String expr = "CollationSupport.StringTrimRight.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s)", srcString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s)", srcString);
+      } else {
+        return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation
       }
     }
     public static String genCode(
@@ -698,8 +720,10 @@ public static String genCode(
       String expr = "CollationSupport.StringTrimRight.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s, %s)", srcString, trimString);
-      } else {
+      } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
+      } else {
+        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation
       }
     }
     public static UTF8String execBinary(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 09ec501311ade..4527fd3867deb 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LO
 import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, CollationSupport, GenericArrayData, TypeUtils}
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation, StringTypeBinaryLcase}
+import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.UTF8StringBuilder
 import org.apache.spark.unsafe.array.ByteArrayMethods
@@ -1021,7 +1021,7 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = srcStr +: trimStr.toSeq
   override def dataType: DataType = srcStr.dataType
-  override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeBinaryLcase)
+  override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeAnyCollation)
 
   final lazy val collationId: Int = srcStr.dataType.asInstanceOf[StringType].collationId
 

From 1f6881d101ac695806f75b9f548c4a577bea3d05 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 28 May 2024 20:10:52 +0200
Subject: [PATCH 02/14] Small fixes

---
 .../spark/sql/catalyst/util/CollationSupport.java  |  9 ++++++---
 .../sql/CollationStringExpressionsSuite.scala      | 14 --------------
 2 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index 42bbab65cda0a..93fcd0eee37d0 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -577,7 +577,8 @@ public static String genCode(
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       } else {
-        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation
+        // TODO: ICU implementation
+        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       }
     }
     public static UTF8String execBinary(
@@ -650,7 +651,8 @@ public static String genCode(
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       } else {
-        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation
+        // TODO: ICU implementation
+        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       }
     }
     public static UTF8String execBinary(
@@ -723,7 +725,8 @@ public static String genCode(
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       } else {
-        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); // TODO: ICU implementation
+        // TODO: ICU implementation
+        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       }
     }
     public static UTF8String execBinary(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 9cc123b708aff..61f54c7bbfc46 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -945,20 +945,6 @@ class CollationStringExpressionsSuite
     assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
-  test("StringTrim* functions - unsupported collation types") {
-    List("TRIM", "LTRIM", "RTRIM").foreach(func => {
-      val collationMismatch = intercept[AnalysisException] {
-        sql("SELECT " + func + "(COLLATE('x', 'UNICODE_CI'), COLLATE('xxaaaxx', 'UNICODE_CI'))")
-      }
-      assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE")
-    })
-
-    val collationMismatch = intercept[AnalysisException] {
-      sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UNICODE_CI'), COLLATE('x', 'UNICODE_CI'))")
-    }
-    assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE")
-  }
-
   // TODO: Add more tests for other string expressions
 
 }

From 5039ddf4e817404a8ac2088d30b037d8a3048d7f Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 31 May 2024 01:07:18 +0200
Subject: [PATCH 03/14] Fix trim logic

---
 .../util/CollationAwareUTF8String.java        | 154 +++++++++---------
 .../sql/catalyst/util/CollationFactory.java   |  13 ++
 .../sql/catalyst/util/CollationSupport.java   | 132 +++++----------
 .../apache/spark/unsafe/types/UTF8String.java |  87 ++++++++++
 .../unsafe/types/CollationSupportSuite.java   |  84 ++++++++--
 .../spark/unsafe/types/UTF8StringSuite.java   | 107 ++++++++++++
 .../expressions/stringExpressions.scala       |  12 +-
 .../sql/CollationStringExpressionsSuite.scala |  36 ++++
 8 files changed, 435 insertions(+), 190 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 0d0094d8d0a03..10fa90f9fb6f6 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -28,6 +28,8 @@
 import static org.apache.spark.unsafe.Platform.copyMemory;
 
 import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
 
 /**
@@ -524,57 +526,64 @@ public static Map<String, String> getCollationAwareDict(UTF8String string,
   public static UTF8String lowercaseTrim(
       final UTF8String srcString,
       final UTF8String trimString) {
+    return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString);
+  }
+
+  public static UTF8String trim(
+      final UTF8String srcString,
+      final UTF8String trimString,
+      final int collationId) {
+    return trimRight(trimLeft(srcString, trimString, collationId), trimString, collationId);
+  }
+
+  public static UTF8String lowercaseTrimLeft(
+      final UTF8String srcString,
+      final UTF8String trimString) {
     // Matching UTF8String behavior for null `trimString`.
     if (trimString == null) {
       return null;
     }
 
-    UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString);
-    return lowercaseTrimRight(leftTrimmed, trimString);
+    HashSet<Integer> trimChars = new HashSet<>();
+    Iterator<Integer> trimIter = trimString.codePointIterator();
+    while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next()));
+
+    int searchIndex = 0;
+    Iterator<Integer> srcIter = srcString.codePointIterator();
+    while (srcIter.hasNext()) {
+      if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) break;
+      ++searchIndex;
+    }
+
+    return srcString.substring(searchIndex, srcString.numChars());
   }
 
-  public static UTF8String lowercaseTrimLeft(
+  public static UTF8String trimLeft(
       final UTF8String srcString,
-      final UTF8String trimString) {
+      final UTF8String trimString,
+      final int collationId) {
     // Matching UTF8String behavior for null `trimString`.
     if (trimString == null) {
       return null;
     }
 
-    // The searching byte position in the srcString.
-    int searchIdx = 0;
-    // The byte position of a first non-matching character in the srcString.
-    int trimByteIdx = 0;
-    // Number of bytes in srcString.
-    int numBytes = srcString.numBytes();
-    // Convert trimString to lowercase, so it can be searched properly.
-    UTF8String lowercaseTrimString = trimString.toLowerCase();
-
-    while (searchIdx < numBytes) {
-      UTF8String searchChar = srcString.copyUTF8String(
-        searchIdx,
-        searchIdx + UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1);
-      int searchCharBytes = searchChar.numBytes();
-
-      // Try to find the matching for the searchChar in the trimString.
-      if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) {
-        trimByteIdx += searchCharBytes;
-        searchIdx += searchCharBytes;
-      } else {
-        // No matching, exit the search.
-        break;
-      }
+    // Create a set of collation keys for all code points of the trim string, for fast lookup.
+    String trim = trimString.toString();
+    HashSet<String> trimChars = new HashSet<>();
+    for (int codePoint : trim.codePoints().toArray()) {
+      trimChars.add(CollationFactory.getCollationKey(Character.toString(codePoint), collationId));
     }
 
-    if (searchIdx == 0) {
-      // Nothing trimmed - return original string (not converted to lowercase).
-      return srcString;
+    // Scan srcString from the left, one whole code point at a time (never splitting a surrogate pair).
+    String input = srcString.toString();
+    int i = 0;
+    while (i < input.length()) {
+      String current = Character.toString(input.codePointAt(i));
+      if (!trimChars.contains(CollationFactory.getCollationKey(current, collationId))) break;
+      i += current.length();
     }
-    if (trimByteIdx >= numBytes) {
-      // Everything trimmed.
-      return UTF8String.EMPTY_UTF8;
-    }
-    return srcString.copyUTF8String(trimByteIdx, numBytes - 1);
+    // `i` is a UTF-16 index into `input`; keep everything from it to the end of the string.
+    return UTF8String.fromString(input.substring(i));
   }
 
   public static UTF8String lowercaseTrimRight(
@@ -585,53 +594,48 @@ public static UTF8String lowercaseTrimRight(
       return null;
     }
 
-    // Number of bytes iterated from the srcString.
-    int byteIdx = 0;
-    // Number of characters iterated from the srcString.
-    int numChars = 0;
-    // Number of bytes in srcString.
-    int numBytes = srcString.numBytes();
-    // Array of character length for the srcString.
-    int[] stringCharLen = new int[numBytes];
-    // Array of the first byte position for each character in the srcString.
-    int[] stringCharPos = new int[numBytes];
-    // Convert trimString to lowercase, so it can be searched properly.
-    UTF8String lowercaseTrimString = trimString.toLowerCase();
-
-    // Build the position and length array.
-    while (byteIdx < numBytes) {
-      stringCharPos[numChars] = byteIdx;
-      stringCharLen[numChars] = UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx));
-      byteIdx += stringCharLen[numChars];
-      numChars++;
-    }
-
-    // Index trimEnd points to the first no matching byte position from the right side of
-    //  the source string.
-    int trimByteIdx = numBytes - 1;
+    HashSet<Integer> trimChars = new HashSet<>();
+    Iterator<Integer> trimIter = trimString.codePointIterator();
+    while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next()));
 
-    while (numChars > 0) {
-      UTF8String searchChar = srcString.copyUTF8String(
-        stringCharPos[numChars - 1],
-        stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
-
-      if(lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) {
-        trimByteIdx -= stringCharLen[numChars - 1];
-        numChars--;
-      } else {
+    int searchIndex = srcString.numChars();
+    Iterator<Integer> srcIter = srcString.reverseCodePointIterator();
+    while (srcIter.hasNext()) {
+      if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) {
         break;
       }
+      --searchIndex;
     }
 
-    if (trimByteIdx == numBytes - 1) {
-      // Nothing trimmed.
-      return srcString;
+    return srcString.substring(0, searchIndex);
+  }
+
+  public static UTF8String trimRight(
+      final UTF8String srcString,
+      final UTF8String trimString,
+      final int collationId) {
+    // Matching UTF8String behavior for null `trimString`.
+    if (trimString == null) {
+      return null;
     }
-    if (trimByteIdx < 0) {
-      // Everything trimmed.
-      return UTF8String.EMPTY_UTF8;
+
+    // Create a set of collation keys for all code points of the trim string, for fast lookup.
+    String trim = trimString.toString();
+    HashSet<String> trimChars = new HashSet<>();
+    for (int codePoint : trim.codePoints().toArray()) {
+      trimChars.add(CollationFactory.getCollationKey(Character.toString(codePoint), collationId));
+    }
+
+    // Scan srcString from the right, one whole code point at a time (never splitting a surrogate pair).
+    String input = srcString.toString();
+    int end = input.length();
+    while (end > 0) {
+      String current = Character.toString(input.codePointBefore(end));
+      if (!trimChars.contains(CollationFactory.getCollationKey(current, collationId))) break;
+      end -= current.length();
     }
-    return srcString.copyUTF8String(0, trimByteIdx);
+    // `end` is an exclusive UTF-16 index; keep everything from the start of the string up to it.
+    return UTF8String.fromString(input.substring(0, end));
   }
 
   // TODO: Add more collation-aware UTF8String operations here.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index fce12510afaf5..89fc240cab27a 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -23,6 +23,7 @@
 import java.util.function.BiFunction;
 import java.util.function.ToLongFunction;
 
+import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.text.StringSearch;
 import com.ibm.icu.util.ULocale;
@@ -805,6 +806,18 @@ public static String[] getICULocaleNames() {
     return Collation.CollationSpecICU.ICULocaleNames;
   }
 
+  public static String getCollationKey(String input, int collationId) {
+    Collation collation = fetchCollation(collationId);
+    if (collation.supportsBinaryEquality) {
+      return input;
+    } else if (collation.supportsLowercaseEquality) {
+      return input.toLowerCase();
+    } else {
+      CollationKey collationKey = collation.collator.getCollationKey(input);
+      return Arrays.toString(collationKey.toByteArray());
+    }
+  }
+
   public static UTF8String getCollationKey(UTF8String input, int collationId) {
     Collation collation = fetchCollation(collationId);
     if (collation.supportsBinaryEquality) {
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index e1d9b89f3ae2e..8b376f5a4d020 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -528,17 +528,8 @@ public static UTF8String execICU(final UTF8String source, Map<String, String> di
   }
 
   public static class StringTrim {
-    public static UTF8String exec(
-        final UTF8String srcString,
-        final int collationId) {
-      CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality) {
-        return execBinary(srcString);
-      } else if (collation.supportsLowercaseEquality) {
-        return execLowercase(srcString);
-      } else {
-        return execLowercase(srcString); // TODO: ICU implementation
-      }
+    public static UTF8String exec(final UTF8String srcString) {
+      return execBinary(srcString);
     }
     public static UTF8String exec(
         final UTF8String srcString,
@@ -550,21 +541,11 @@ public static UTF8String exec(
       } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString, trimString);
       } else {
-        return execLowercase(srcString, trimString); // TODO: ICU implementation
+        return execICU(srcString, trimString, collationId);
       }
     }
-    public static String genCode(
-        final String srcString,
-        final int collationId) {
-      CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      String expr = "CollationSupport.StringTrim.exec";
-      if (collation.supportsBinaryEquality) {
-        return String.format(expr + "Binary(%s)", srcString);
-      } else if (collation.supportsLowercaseEquality) {
-        return String.format(expr + "Lowercase(%s)", srcString);
-      } else {
-        return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation
-      }
+    public static String genCode(final String srcString) {
+      return String.format("CollationSupport.StringTrim.execBinary(%s)", srcString);
     }
     public static String genCode(
         final String srcString,
@@ -577,8 +558,7 @@ public static String genCode(
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       } else {
-        // TODO: ICU implementation
-        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
+        return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId);
       }
     }
     public static UTF8String execBinary(
@@ -590,29 +570,22 @@ public static UTF8String execBinary(
         final UTF8String trimString) {
       return srcString.trim(trimString);
     }
-    public static UTF8String execLowercase(
-        final UTF8String srcString) {
-      return srcString.trim();
-    }
     public static UTF8String execLowercase(
         final UTF8String srcString,
         final UTF8String trimString) {
       return CollationAwareUTF8String.lowercaseTrim(srcString, trimString);
     }
+    public static UTF8String execICU(
+        final UTF8String srcString,
+        final UTF8String trimString,
+        final int collationId) {
+      return CollationAwareUTF8String.trim(srcString, trimString, collationId);
+    }
   }
 
   public static class StringTrimLeft {
-    public static UTF8String exec(
-        final UTF8String srcString,
-        final int collationId) {
-      CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality) {
-        return execBinary(srcString);
-      } else if (collation.supportsLowercaseEquality) {
-        return execLowercase(srcString);
-      } else {
-        return execLowercase(srcString); // TODO: ICU implementation
-      }
+    public static UTF8String exec(final UTF8String srcString) {
+      return execBinary(srcString);
     }
     public static UTF8String exec(
         final UTF8String srcString,
@@ -624,21 +597,11 @@ public static UTF8String exec(
       } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString, trimString);
       } else {
-        return execLowercase(srcString, trimString); // TODO: ICU implementation
+        return execICU(srcString, trimString, collationId);
       }
     }
-    public static String genCode(
-        final String srcString,
-        final int collationId) {
-      CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      String expr = "CollationSupport.StringTrimLeft.exec";
-      if (collation.supportsBinaryEquality) {
-        return String.format(expr + "Binary(%s)", srcString);
-      } else if (collation.supportsLowercaseEquality) {
-        return String.format(expr + "Lowercase(%s)", srcString);
-      } else {
-        return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation
-      }
+    public static String genCode(final String srcString) {
+      return String.format("CollationSupport.StringTrimLeft.execBinary(%s)", srcString);
     }
     public static String genCode(
         final String srcString,
@@ -651,12 +614,10 @@ public static String genCode(
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       } else {
-        // TODO: ICU implementation
-        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
+        return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId);
       }
     }
-    public static UTF8String execBinary(
-        final UTF8String srcString) {
+    public static UTF8String execBinary(final UTF8String srcString) {
       return srcString.trimLeft();
     }
     public static UTF8String execBinary(
@@ -664,29 +625,22 @@ public static UTF8String execBinary(
         final UTF8String trimString) {
       return srcString.trimLeft(trimString);
     }
-    public static UTF8String execLowercase(
-        final UTF8String srcString) {
-      return srcString.trimLeft();
-    }
     public static UTF8String execLowercase(
         final UTF8String srcString,
         final UTF8String trimString) {
       return CollationAwareUTF8String.lowercaseTrimLeft(srcString, trimString);
     }
+    public static UTF8String execICU(
+        final UTF8String srcString,
+        final UTF8String trimString,
+        final int collationId) {
+      return CollationAwareUTF8String.trimLeft(srcString, trimString, collationId);
+    }
   }
 
   public static class StringTrimRight {
-    public static UTF8String exec(
-        final UTF8String srcString,
-        final int collationId) {
-      CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality) {
-        return execBinary(srcString);
-      } else if (collation.supportsLowercaseEquality) {
-        return execLowercase(srcString);
-      } else {
-        return execLowercase(srcString); // TODO: ICU implementation
-      }
+    public static UTF8String exec(final UTF8String srcString) {
+      return execBinary(srcString);
     }
     public static UTF8String exec(
         final UTF8String srcString,
@@ -698,21 +652,11 @@ public static UTF8String exec(
       } else if (collation.supportsLowercaseEquality) {
         return execLowercase(srcString, trimString);
       } else {
-        return execLowercase(srcString, trimString); // TODO: ICU implementation
+        return execICU(srcString, trimString, collationId);
       }
     }
-    public static String genCode(
-        final String srcString,
-        final int collationId) {
-      CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      String expr = "CollationSupport.StringTrimRight.exec";
-      if (collation.supportsBinaryEquality) {
-        return String.format(expr + "Binary(%s)", srcString);
-      } else if (collation.supportsLowercaseEquality) {
-        return String.format(expr + "Lowercase(%s)", srcString);
-      } else {
-        return String.format(expr + "Lowercase(%s)", srcString); // TODO: ICU implementation
-      }
+    public static String genCode(final String srcString) {
+      return String.format("CollationSupport.StringTrimRight.execBinary(%s)", srcString);
     }
     public static String genCode(
         final String srcString,
@@ -725,12 +669,10 @@ public static String genCode(
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
       } else {
-        // TODO: ICU implementation
-        return String.format(expr + "Lowercase(%s, %s)", srcString, trimString);
+        return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId);
       }
     }
-    public static UTF8String execBinary(
-        final UTF8String srcString) {
+    public static UTF8String execBinary(final UTF8String srcString) {
       return srcString.trimRight();
     }
     public static UTF8String execBinary(
@@ -738,15 +680,17 @@ public static UTF8String execBinary(
         final UTF8String trimString) {
       return srcString.trimRight(trimString);
     }
-    public static UTF8String execLowercase(
-        final UTF8String srcString) {
-      return srcString.trimRight();
-    }
     public static UTF8String execLowercase(
         final UTF8String srcString,
         final UTF8String trimString) {
       return CollationAwareUTF8String.lowercaseTrimRight(srcString, trimString);
     }
+    public static UTF8String execICU(
+        final UTF8String srcString,
+        final UTF8String trimString,
+        final int collationId) {
+      return CollationAwareUTF8String.trimRight(srcString, trimString, collationId);
+    }
   }
 
   // TODO: Add more collation-aware string expressions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 03286e0635287..334421adbeb4d 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -22,6 +22,7 @@
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.regex.Pattern;
 
@@ -270,6 +271,92 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Decodes the UTF-8 sequence starting at byte position `index` and returns its code point.
+   * Only `index` itself is range-checked; continuation bytes are read via `getByte` as-is.
+   * @throws IndexOutOfBoundsException if `index` is negative or not less than `numBytes`
+   * @throws IllegalArgumentException if `numBytesForFirstByte` reports a length outside 1..4
+   */
+  public int codePointFrom(int index) {
+    if (index < 0 || index >= numBytes) {
+      throw new IndexOutOfBoundsException();
+    }
+    byte b = getByte(index);
+    // Named differently from the `numBytes` field checked above to avoid shadowing it.
+    int sequenceLength = numBytesForFirstByte(b);
+    return switch (sequenceLength) {
+      case 1 -> b & 0x7F;
+      case 2 -> ((b & 0x1F) << 6) | (getByte(index + 1) & 0x3F);
+      case 3 -> ((b & 0x0F) << 12) | ((getByte(index + 1) & 0x3F) << 6) |
+        (getByte(index + 2) & 0x3F);
+      case 4 -> ((b & 0x07) << 18) | ((getByte(index + 1) & 0x3F) << 12) |
+        ((getByte(index + 2) & 0x3F) << 6) | (getByte(index + 3) & 0x3F);
+      default -> throw new IllegalArgumentException("Invalid UTF-8 sequence");
+    };
+  }
+
+  /** Returns the code point at position `index`, counted in code points rather than bytes. */
+  public int getChar(int index) {
+    if (index < 0 || index >= numChars()) {
+      throw new IndexOutOfBoundsException();
+    }
+    int byteOffset = 0;
+    for (int skipped = 0; skipped < index; ++skipped) {
+      byteOffset += numBytesForFirstByte(getByte(byteOffset));
+    }
+    return codePointFrom(byteOffset);
+  }
+
+  /** Returns an iterator over the code points of this string, from first to last. */
+  public Iterator<Integer> codePointIterator() {
+    return new CodePointIterator();
+  }
+  private class CodePointIterator implements Iterator<Integer> {
+    private int nextByte = 0; // byte offset of the next code point to decode
+    @Override
+    public boolean hasNext() {
+      return nextByte < numBytes;
+    }
+
+    @Override
+    public Integer next() {
+      if (nextByte >= numBytes) {
+        throw new IndexOutOfBoundsException();
+      }
+      final int decoded = codePointFrom(nextByte);
+      nextByte += numBytesForFirstByte(getByte(nextByte));
+      return decoded;
+    }
+  }
+
+  /** Returns an iterator over the code points of this string, from last to first. */
+  public Iterator<Integer> reverseCodePointIterator() {
+    return new ReverseCodePointIterator();
+  }
+  private class ReverseCodePointIterator implements Iterator<Integer> {
+    private int cursor = numBytes - 1; // byte offset where the next backward scan starts
+
+    @Override
+    public boolean hasNext() {
+      return cursor >= 0;
+    }
+
+    @Override
+    public Integer next() {
+      if (cursor < 0) {
+        throw new IndexOutOfBoundsException();
+      }
+      // Step back over continuation bytes (bit pattern 10xxxxxx) to reach the leading byte.
+      while (cursor > 0 && (getByte(cursor) & 0xC0) == 0x80) {
+        cursor -= 1;
+      }
+      final int leadingByte = cursor;
+      cursor = leadingByte - 1;
+      return codePointFrom(leadingByte);
+    }
+  }
+
   /**
    * Returns a substring of this.
    * @param start the position of first code point
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index eb18d7665b092..557074373fa7e 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -890,7 +890,7 @@ private void assertStringTrim(
 
     if (trimString == null) {
       result = CollationSupport.StringTrim.exec(
-        UTF8String.fromString(sourceString), collationId).toString();
+        UTF8String.fromString(sourceString)).toString();
     } else {
       result = CollationSupport.StringTrim.exec(
         UTF8String
@@ -911,7 +911,7 @@ private void assertStringTrimLeft(
 
     if (trimString == null) {
       result = CollationSupport.StringTrimLeft.exec(
-        UTF8String.fromString(sourceString), collationId).toString();
+        UTF8String.fromString(sourceString)).toString();
     } else {
       result = CollationSupport.StringTrimLeft.exec(
         UTF8String
@@ -932,7 +932,7 @@ private void assertStringTrimRight(
 
     if (trimString == null) {
       result = CollationSupport.StringTrimRight.exec(
-        UTF8String.fromString(sourceString), collationId).toString();
+        UTF8String.fromString(sourceString)).toString();
     } else {
       result = CollationSupport.StringTrimRight.exec(
         UTF8String
@@ -945,20 +945,29 @@ private void assertStringTrimRight(
 
   @Test
   public void testStringTrim() throws SparkException {
+    // UTF8_BINARY
+//    assertStringTrim("UTF8_BINARY", null, null, "");
+    assertStringTrim("UTF8_BINARY", "", "", "");
+    assertStringTrim("UTF8_BINARY", "", "xyz", "");
+    assertStringTrim("UTF8_BINARY", "asd", "", "asd");
     assertStringTrim("UTF8_BINARY", "asd", null, "asd");
     assertStringTrim("UTF8_BINARY", "  asd  ", null, "asd");
     assertStringTrim("UTF8_BINARY", " a世a ", null, "a世a");
     assertStringTrim("UTF8_BINARY", "asd", "x", "asd");
     assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd");
     assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a");
-
+    assertStringTrimLeft("UTF8_BINARY", "", "", "");
+    assertStringTrimLeft("UTF8_BINARY", "", "xyz", "");
+    assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd");
     assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd");
     assertStringTrimLeft("UTF8_BINARY", "  asd  ", null, "asd  ");
     assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a ");
     assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd");
     assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx");
     assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax");
-
+    assertStringTrimRight("UTF8_BINARY", "", "", "");
+    assertStringTrimRight("UTF8_BINARY", "", "xyz", "");
+    assertStringTrimRight("UTF8_BINARY", "asd", "", "asd");
     assertStringTrimRight("UTF8_BINARY", "asd", null, "asd");
     assertStringTrimRight("UTF8_BINARY", "  asd  ", null, "  asd");
     assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a");
@@ -966,20 +975,28 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a");
 
+    // UTF8_BINARY_LCASE
+    assertStringTrim("UTF8_BINARY_LCASE", "", "", "");
+    assertStringTrim("UTF8_BINARY_LCASE", "", "xyz", "");
+    assertStringTrim("UTF8_BINARY_LCASE", "asd", "", "asd");
     assertStringTrim("UTF8_BINARY_LCASE", "asd", null, "asd");
     assertStringTrim("UTF8_BINARY_LCASE", "  asd  ", null, "asd");
     assertStringTrim("UTF8_BINARY_LCASE", " a世a ", null, "a世a");
     assertStringTrim("UTF8_BINARY_LCASE", "asd", "x", "asd");
     assertStringTrim("UTF8_BINARY_LCASE", "xxasdxx", "x", "asd");
     assertStringTrim("UTF8_BINARY_LCASE", "xa世ax", "x", "a世a");
-
+    assertStringTrimLeft("UTF8_BINARY_LCASE", "", "", "");
+    assertStringTrimLeft("UTF8_BINARY_LCASE", "", "xyz", "");
+    assertStringTrimLeft("UTF8_BINARY_LCASE", "asd", "", "asd");
     assertStringTrimLeft("UTF8_BINARY_LCASE", "asd", null, "asd");
     assertStringTrimLeft("UTF8_BINARY_LCASE", "  asd  ", null, "asd  ");
     assertStringTrimLeft("UTF8_BINARY_LCASE", " a世a ", null, "a世a ");
     assertStringTrimLeft("UTF8_BINARY_LCASE", "asd", "x", "asd");
     assertStringTrimLeft("UTF8_BINARY_LCASE", "xxasdxx", "x", "asdxx");
     assertStringTrimLeft("UTF8_BINARY_LCASE", "xa世ax", "x", "a世ax");
-
+    assertStringTrimRight("UTF8_BINARY_LCASE", "", "", "");
+    assertStringTrimRight("UTF8_BINARY_LCASE", "", "xyz", "");
+    assertStringTrimRight("UTF8_BINARY_LCASE", "asd", "", "asd");
     assertStringTrimRight("UTF8_BINARY_LCASE", "asd", null, "asd");
     assertStringTrimRight("UTF8_BINARY_LCASE", "  asd  ", null, "  asd");
     assertStringTrimRight("UTF8_BINARY_LCASE", " a世a ", null, " a世a");
@@ -987,20 +1004,28 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UTF8_BINARY_LCASE", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UTF8_BINARY_LCASE", "xa世ax", "x", "xa世a");
 
-    assertStringTrim("UTF8_BINARY_LCASE", "asd", null, "asd");
-    assertStringTrim("UTF8_BINARY_LCASE", "  asd  ", null, "asd");
-    assertStringTrim("UTF8_BINARY_LCASE", " a世a ", null, "a世a");
-    assertStringTrim("UTF8_BINARY_LCASE", "asd", "x", "asd");
-    assertStringTrim("UTF8_BINARY_LCASE", "xxasdxx", "x", "asd");
-    assertStringTrim("UTF8_BINARY_LCASE", "xa世ax", "x", "a世a");
-
+    // UNICODE
+    assertStringTrim("UNICODE", "", "", "");
+    assertStringTrim("UNICODE", "", "xyz", "");
+    assertStringTrim("UNICODE", "asd", "", "asd");
+    assertStringTrim("UNICODE", "asd", null, "asd");
+    assertStringTrim("UNICODE", "  asd  ", null, "asd");
+    assertStringTrim("UNICODE", " a世a ", null, "a世a");
+    assertStringTrim("UNICODE", "asd", "x", "asd");
+    assertStringTrim("UNICODE", "xxasdxx", "x", "asd");
+    assertStringTrim("UNICODE", "xa世ax", "x", "a世a");
+    assertStringTrimLeft("UNICODE", "", "", "");
+    assertStringTrimLeft("UNICODE", "", "xyz", "");
+    assertStringTrimLeft("UNICODE", "asd", "", "asd");
     assertStringTrimLeft("UNICODE", "asd", null, "asd");
     assertStringTrimLeft("UNICODE", "  asd  ", null, "asd  ");
     assertStringTrimLeft("UNICODE", " a世a ", null, "a世a ");
     assertStringTrimLeft("UNICODE", "asd", "x", "asd");
     assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx");
     assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax");
-
+    assertStringTrimRight("UNICODE", "", "", "");
+    assertStringTrimRight("UNICODE", "", "xyz", "");
+    assertStringTrimRight("UNICODE", "asd", "", "asd");
     assertStringTrimRight("UNICODE", "asd", null, "asd");
     assertStringTrimRight("UNICODE", "  asd  ", null, "  asd");
     assertStringTrimRight("UNICODE", " a世a ", null, " a世a");
@@ -1008,6 +1033,35 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a");
 
+    // UNICODE_CI
+    assertStringTrim("UNICODE_CI", "", "", "");
+    assertStringTrim("UNICODE_CI", "", "xyz", "");
+    assertStringTrim("UNICODE_CI", "asd", "", "asd");
+    assertStringTrim("UNICODE_CI", "asd", null, "asd");
+    assertStringTrim("UNICODE_CI", "  asd  ", null, "asd");
+    assertStringTrim("UNICODE_CI", " a世a ", null, "a世a");
+    assertStringTrim("UNICODE_CI", "asd", "x", "asd");
+    assertStringTrim("UNICODE_CI", "xxasdxx", "x", "asd");
+    assertStringTrim("UNICODE_CI", "xa世ax", "x", "a世a");
+    assertStringTrimLeft("UNICODE_CI", "", "", "");
+    assertStringTrimLeft("UNICODE_CI", "", "xyz", "");
+    assertStringTrimLeft("UNICODE_CI", "asd", "", "asd");
+    assertStringTrimLeft("UNICODE_CI", "asd", null, "asd");
+    assertStringTrimLeft("UNICODE_CI", "  asd  ", null, "asd  ");
+    assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a ");
+    assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd");
+    assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx");
+    assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax");
+    assertStringTrimRight("UNICODE_CI", "", "", "");
+    assertStringTrimRight("UNICODE_CI", "", "xyz", "");
+    assertStringTrimRight("UNICODE_CI", "asd", "", "asd");
+    assertStringTrimRight("UNICODE_CI", "asd", null, "asd");
+    assertStringTrimRight("UNICODE_CI", "  asd  ", null, "  asd");
+    assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a");
+    assertStringTrimRight("UNICODE_CI", "asd", "x", "asd");
+    assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd");
+    assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a");
+
     // Test cases where trimString has more than one character
     assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX");
     assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa");
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 0188297fd05a2..7f5f9d359193b 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -902,4 +902,111 @@ public void skipWrongFirstByte() {
       assertEquals(1, fromBytes(c).numChars());
     }
   }
+
+  @Test
+  public void UTF8StringCodePoints() {
+    String s = "aéह 日å!";
+    UTF8String s0 = fromString(s);
+    for (int i = 0; i < s.length(); ++i) {
+      assertEquals(s.codePointAt(i), s0.getChar(i));
+    }
+
+    UTF8String s1 = fromBytes(new byte[] {0x41, (byte) 0xC3, (byte) 0xB1, (byte) 0xE2,
+      (byte) 0x82, (byte) 0xAC, (byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88});
+    // numBytesForFirstByte
+    assertEquals(1, UTF8String.numBytesForFirstByte(s1.getByte(0)));
+    assertEquals(2, UTF8String.numBytesForFirstByte(s1.getByte(1)));
+    assertEquals(3, UTF8String.numBytesForFirstByte(s1.getByte(3)));
+    assertEquals(4, UTF8String.numBytesForFirstByte(s1.getByte(6)));
+    // getByte
+    assertEquals((byte) 0x41, s1.getByte(0));
+    assertEquals((byte) 0xC3, s1.getByte(1));
+    assertEquals((byte) 0xE2, s1.getByte(3));
+    assertEquals((byte) 0xF0, s1.getByte(6));
+    // codePointFrom
+    assertEquals(0x41, s1.codePointFrom(0));
+    assertEquals(0xF1, s1.codePointFrom(1));
+    assertEquals(0x20AC, s1.codePointFrom(3));
+    assertEquals(0x10348, s1.codePointFrom(6));
+    assertThrows(IndexOutOfBoundsException.class, () -> s1.codePointFrom(-1));
+    assertThrows(IndexOutOfBoundsException.class, () -> s1.codePointFrom(99));
+    // getChar
+    assertEquals(0x41, s1.getChar(0));
+    assertEquals(0xF1, s1.getChar(1));
+    assertEquals(0x20AC, s1.getChar(2));
+    assertEquals(0x10348, s1.getChar(3));
+    assertThrows(IndexOutOfBoundsException.class, () -> s1.getChar(-1));
+    assertThrows(IndexOutOfBoundsException.class, () -> s1.getChar(99));
+
+    UTF8String s2 = fromString("Añ€𐍈");
+    // numBytesForFirstByte
+    assertEquals(1, UTF8String.numBytesForFirstByte(s2.getByte(0)));
+    assertEquals(2, UTF8String.numBytesForFirstByte(s2.getByte(1)));
+    assertEquals(3, UTF8String.numBytesForFirstByte(s2.getByte(3)));
+    assertEquals(4, UTF8String.numBytesForFirstByte(s2.getByte(6)));
+    // getByte
+    assertEquals((byte) 0x41, s2.getByte(0));
+    assertEquals((byte) 0xC3, s2.getByte(1));
+    assertEquals((byte) 0xE2, s2.getByte(3));
+    assertEquals((byte) 0xF0, s2.getByte(6));
+    // codePointFrom
+    assertEquals(0x41, s2.codePointFrom(0));
+    assertEquals(0xF1, s2.codePointFrom(1));
+    assertEquals(0x20AC, s2.codePointFrom(3));
+    assertEquals(0x10348, s2.codePointFrom(6));
+    assertThrows(IndexOutOfBoundsException.class, () -> s2.codePointFrom(-1));
+    assertThrows(IndexOutOfBoundsException.class, () -> s2.codePointFrom(99));
+    // getChar
+    assertEquals(0x41, s2.getChar(0));
+    assertEquals(0xF1, s2.getChar(1));
+    assertEquals(0x20AC, s2.getChar(2));
+    assertEquals(0x10348, s2.getChar(3));
+    assertThrows(IndexOutOfBoundsException.class, () -> s2.getChar(-1));
+    assertThrows(IndexOutOfBoundsException.class, () -> s2.getChar(99));
+
+    UTF8String s3 = EMPTY_UTF8;
+    // codePointFrom
+    assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(0));
+    assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(-1));
+    assertThrows(IndexOutOfBoundsException.class, () -> s3.codePointFrom(99));
+    // getChar
+    assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(0));
+    assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(-1));
+    assertThrows(IndexOutOfBoundsException.class, () -> s3.getChar(99));
+  }
+
+  private void testCodePointIterator(String str) {
+    // Compare against Java code points (not UTF-16 units) so supplementary characters work.
+    Iterator<Integer> it = fromString(str).codePointIterator();
+    for (int expected : str.codePoints().toArray()) {
+      assertTrue(it.hasNext());
+      assertEquals(expected, (int) it.next());
+    }
+    assertFalse(it.hasNext());
+  }
+  @Test
+  public void codePointIterator() {
+    testCodePointIterator("");
+    testCodePointIterator("abc");
+    testCodePointIterator("a!2&^R");
+    testCodePointIterator("aéह 日å!");
+  }
+
+  private void testReverseCodePointIterator(String str) {
+    Iterator<Integer> it = fromString(str).reverseCodePointIterator();
+    int[] expected = str.codePoints().toArray(); // code points, not UTF-16 units
+    for (int i = expected.length - 1; i >= 0; --i) {
+      assertTrue(it.hasNext());
+      assertEquals(expected[i], (int) it.next());
+    }
+    assertFalse(it.hasNext());
+  }
+  @Test
+  public void reverseCodePointIterator() {
+    testReverseCodePointIterator("");
+    testReverseCodePointIterator("abc");
+    testReverseCodePointIterator("a!2&^R");
+    testReverseCodePointIterator("aéह 日å!");
+  }
+
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 4527fd3867deb..e19f3b63c1520 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1049,11 +1049,11 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes {
     if (evals.length == 1) {
       val stringTrimCode: String = this match {
         case _: StringTrim =>
-          CollationSupport.StringTrim.genCode(srcString.value, collationId)
+          CollationSupport.StringTrim.genCode(srcString.value)
         case _: StringTrimLeft =>
-          CollationSupport.StringTrimLeft.genCode(srcString.value, collationId)
+          CollationSupport.StringTrimLeft.genCode(srcString.value)
         case _: StringTrimRight =>
-          CollationSupport.StringTrimRight.genCode(srcString.value, collationId)
+          CollationSupport.StringTrimRight.genCode(srcString.value)
       }
       ev.copy(code = code"""
          |${srcString.code}
@@ -1179,7 +1179,7 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None)
   override protected def direction: String = "BOTH"
 
   override def doEval(srcString: UTF8String): UTF8String =
-    CollationSupport.StringTrim.exec(srcString, collationId)
+    CollationSupport.StringTrim.exec(srcString)
 
   override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String =
     CollationSupport.StringTrim.exec(srcString, trimString, collationId)
@@ -1286,7 +1286,7 @@ case class StringTrimLeft(srcStr: Expression, trimStr: Option[Expression] = None
   override protected def direction: String = "LEADING"
 
   override def doEval(srcString: UTF8String): UTF8String =
-    CollationSupport.StringTrimLeft.exec(srcString, collationId)
+    CollationSupport.StringTrimLeft.exec(srcString)
 
   override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String =
     CollationSupport.StringTrimLeft.exec(srcString, trimString, collationId)
@@ -1346,7 +1346,7 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non
   override protected def direction: String = "TRAILING"
 
   override def doEval(srcString: UTF8String): UTF8String =
-    CollationSupport.StringTrimRight.exec(srcString, collationId)
+    CollationSupport.StringTrimRight.exec(srcString)
 
   override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String =
     CollationSupport.StringTrimRight.exec(srcString, trimString, collationId)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 61f54c7bbfc46..ce1c09c97f217 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -802,7 +802,43 @@ class CollationStringExpressionsSuite
     assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("StringTrim - null source and null trim string evaluate to null") {
+    checkEvaluation(
+      StringTrim(Literal.create(null, StringType), Literal.create(null, StringType)), null)
+  }
+
   test("StringTrim* functions - unit tests for both paths (codegen and eval)") {
+    def evalStringTrim(src: Any, trim: Any, result: String): Unit = {
+      Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>
+        val dt: DataType = StringType(collation)
+        checkEvaluation(StringTrim(Literal.create(src, dt), Literal.create(trim, dt)), result)
+        checkEvaluation(StringTrimLeft(Literal.create(src, dt), Literal.create(trim, dt)), result)
+        checkEvaluation(StringTrimRight(Literal.create(src, dt), Literal.create(trim, dt)), result)
+      }
+    }
+    // General edge cases and basic tests.
+    evalStringTrim(null, null, null)
+    evalStringTrim(null, "", null)
+    evalStringTrim(null, "a", null)
+    evalStringTrim("", null, null)
+    evalStringTrim("a", null, null)
+    evalStringTrim("", "", "")
+    evalStringTrim("", " ", "")
+    evalStringTrim("", "a", "")
+    evalStringTrim("", "aaa", "")
+    evalStringTrim(" ", "", " ")
+    evalStringTrim("a", "", "a")
+    evalStringTrim("aaa", "", "aaa")
+    evalStringTrim(" ", " ", "")
+    evalStringTrim(" ", "   ", "")
+    evalStringTrim("   ", " ", "")
+    evalStringTrim("   ", "   ", "")
+    evalStringTrim("a", "aaa", "")
+    evalStringTrim("aaa", "a", "")
+    evalStringTrim("aaa", "aaa", "")
+    evalStringTrim("abc", "cba", "")
+    evalStringTrim("cba", "abc", "")
+
     // Without trimString param.
     checkEvaluation(StringTrim(Literal.create( "  asd  ", StringType("UTF8_BINARY"))), "asd")
     checkEvaluation(

From 361c7b1fd40ef71662fa5c3c8b7b206c4eeefacc Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 31 May 2024 01:52:50 +0200
Subject: [PATCH 04/14] Lint fixes

---
 .../org/apache/spark/sql/catalyst/util/CollationFactory.java    | 1 -
 .../org/apache/spark/unsafe/types/CollationSupportSuite.java    | 1 -
 .../java/org/apache/spark/unsafe/types/UTF8StringSuite.java     | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 89fc240cab27a..39d0057d737ad 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -23,7 +23,6 @@
 import java.util.function.BiFunction;
 import java.util.function.ToLongFunction;
 
-import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.text.StringSearch;
 import com.ibm.icu.util.ULocale;
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 557074373fa7e..5ed6c9424c8b3 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -946,7 +946,6 @@ private void assertStringTrimRight(
   @Test
   public void testStringTrim() throws SparkException {
     // UTF8_BINARY
-//    assertStringTrim("UTF8_BINARY", null, null, "");
     assertStringTrim("UTF8_BINARY", "", "", "");
     assertStringTrim("UTF8_BINARY", "", "xyz", "");
     assertStringTrim("UTF8_BINARY", "asd", "", "asd");
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 7f5f9d359193b..d07d795bb0c98 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -904,7 +904,7 @@ public void skipWrongFirstByte() {
   }
 
   @Test
-  public void UTF8StringCodePoints() {
+  public void utf8StringCodePoints() {
     String s = "aéह 日å!";
     UTF8String s0 = fromString(s);
     for (int i = 0; i < s.length(); ++i) {

From 1af57bffa30ce1663898645ef2569ad755dc8ee7 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 10 Jun 2024 20:20:53 +0200
Subject: [PATCH 05/14] Update CollationSupportSuite.java

---
 .../unsafe/types/CollationSupportSuite.java   | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index a738ac5b76375..3fabc18c9d13b 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -1311,14 +1311,22 @@ public void testStringTrim() throws SparkException {
     assertStringTrimLeft("UTF8_BINARY_LCASE", "ddsXXXaa", "asd", "XXXaa");
     assertStringTrimRight("UTF8_BINARY_LCASE", "ddsXXXaa", "asd", "ddsXXX");
 
+    assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX");
+    assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa");
+    assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX");
+
     // Test cases specific to collation type
     // uppercase trim, lowercase src
     assertStringTrim("UTF8_BINARY", "asd", "A", "asd");
     assertStringTrim("UTF8_BINARY_LCASE", "asd", "A", "sd");
+    assertStringTrim("UNICODE", "asd", "A", "asd");
+    assertStringTrim("UNICODE_CI", "asd", "A", "sd");
 
     // lowercase trim, uppercase src
     assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD");
     assertStringTrim("UTF8_BINARY_LCASE", "ASD", "a", "SD");
+    assertStringTrim("UNICODE", "ASD", "a", "ASD");
+    assertStringTrim("UNICODE_CI", "ASD", "a", "SD");
 
     // uppercase and lowercase chars of different byte-length (utf8)
     assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
@@ -1329,6 +1337,10 @@ public void testStringTrim() throws SparkException {
     assertStringTrimLeft("UTF8_BINARY_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
     assertStringTrimRight("UTF8_BINARY_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
 
+    assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+    assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+    assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+
     assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
     assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
     assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
@@ -1337,6 +1349,10 @@ public void testStringTrim() throws SparkException {
     assertStringTrimLeft("UTF8_BINARY_LCASE", "ßaaaß", "ẞ", "aaaß");
     assertStringTrimRight("UTF8_BINARY_LCASE", "ßaaaß", "ẞ", "ßaaa");
 
+    assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+    assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+    assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+
     // different byte-length (utf8) chars trimmed
     assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa");
     assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ");
@@ -1345,6 +1361,10 @@ public void testStringTrim() throws SparkException {
     assertStringTrim("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "aaa");
     assertStringTrimLeft("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
     assertStringTrimRight("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+
+    assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa");
+    assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+    assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
   }
 
   // TODO: Test more collation-aware string expressions.

From 292acf895b685fd32b75f0cc782f4878ce2f28f4 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 8 Jul 2024 00:19:40 +0200
Subject: [PATCH 06/14] Fix test

---
 .../org/apache/spark/sql/CollationStringExpressionsSuite.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 15bb2bd8a9504..1856545c90fe3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -920,7 +920,7 @@ class CollationStringExpressionsSuite
 
   test("StringTrim* functions - unit tests for both paths (codegen and eval)") {
     def evalStringTrim(src: Any, trim: Any, result: String): Unit = {
-      Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>
+      Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>
         val dt: DataType = StringType(collation)
         checkEvaluation(StringTrim(Literal.create(src, dt), Literal.create(trim, dt)), result)
         checkEvaluation(StringTrimLeft(Literal.create(src, dt), Literal.create(trim, dt)), result)

From 097a8468c1a41d218ad9a9424acd096aa5ccc22d Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 8 Jul 2024 05:03:43 +0200
Subject: [PATCH 07/14] Refactor trim

---
 .../util/CollationAwareUTF8String.java        | 274 +++++++--
 .../unsafe/types/CollationSupportSuite.java   | 573 ++++++++++++++++--
 2 files changed, 727 insertions(+), 120 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 7a4b2288ab135..73b76320247e0 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -18,6 +18,8 @@
 
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.text.StringSearch;
 import com.ibm.icu.util.ULocale;
 
@@ -26,7 +28,10 @@
 
 import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET;
 import static org.apache.spark.unsafe.Platform.copyMemory;
+import static org.apache.spark.unsafe.types.UTF8String.CodePointIteratorType;
 
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -426,19 +431,48 @@ private static UTF8String toLowerCaseSlow(final UTF8String target, final int col
    * @param codePoint The code point to convert to lowercase.
    * @param sb The StringBuilder to append the lowercase character to.
    */
-  private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
-    if (codePoint == 0x0130) {
+  private static void appendLowercaseCodePoint(final int codePoint, final StringBuilder sb) {
+    int lowercaseCodePoint = getLowercaseCodePoint(codePoint);
+    if (lowercaseCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
       // Latin capital letter I with dot above is mapped to 2 lowercase characters.
       sb.appendCodePoint(0x0069);
       sb.appendCodePoint(0x0307);
+    } else {
+      // All other characters should follow context-unaware ICU single-code point case mapping.
+      sb.appendCodePoint(lowercaseCodePoint);
+    }
+  }
+
+  /**
+   * `CODE_POINT_COMBINED_LOWERCASE_I_DOT` is an internal representation of the combined lowercase
+   * code point for ASCII lowercase letter i with an additional combining dot character (U+0307).
+   * This integer value is not a valid code point itself, but rather an artificial code point
+   * marker used to represent the two lowercase characters that are the result of converting the
+   * Turkish dotted uppercase letter I (U+0130, capital I with dot above) to lowercase.
+   */
+  private static final int CODE_POINT_LOWERCASE_I = 0x69;
+  private static final int CODE_POINT_COMBINING_DOT = 0x307;
+  private static final int CODE_POINT_COMBINED_LOWERCASE_I_DOT =
+    CODE_POINT_LOWERCASE_I << 16 | CODE_POINT_COMBINING_DOT;
+
+  /**
+   * Returns the lowercase version of the provided code point, with special handling for
+   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
+   * context-sensitive case mappings (i.e. characters that map to different characters based on
+   * the position in the string relative to other characters in lowercase).
+   */
+  private static int getLowercaseCodePoint(final int codePoint) {
+    if (codePoint == 0x0130) {
+      // Latin capital letter I with dot above is mapped to 2 lowercase characters.
+      return CODE_POINT_COMBINED_LOWERCASE_I_DOT;
     }
     else if (codePoint == 0x03C2) {
       // Greek final and non-final capital letter sigma should be mapped the same.
-      sb.appendCodePoint(0x03C3);
+      return 0x03C3;
     }
     else {
       // All other characters should follow context-unaware ICU single-code point case mapping.
-      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+      return UCharacter.toLowerCase(codePoint);
     }
   }
 
@@ -446,7 +480,7 @@ else if (codePoint == 0x03C2) {
    * Converts an entire string to lowercase using ICU rules, code point by code point, with
    * special handling for one-to-many case mappings (i.e. characters that map to multiple
    * characters in lowercase). Also, this method omits information about context-sensitive case
-   * mappings using special handling in the `lowercaseCodePoint` method.
+   * mappings using special handling in the `appendLowercaseCodePoint` method.
    *
    * @param target The target string to convert to lowercase.
    * @return The string converted to lowercase in a context-unaware manner.
@@ -460,7 +494,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) {
     String targetString = target.toValidString();
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < targetString.length(); ++i) {
-      lowercaseCodePoint(targetString.codePointAt(i), sb);
+      appendLowercaseCodePoint(targetString.codePointAt(i), sb);
     }
     return UTF8String.fromString(sb.toString());
   }
@@ -691,12 +725,32 @@ public static Map<String, String> getCollationAwareDict(UTF8String string,
     return collationAwareDict;
   }
 
+  /**
+   * Trims the `srcString` string from both ends of the string using the specified `trimString`
+   * characters, with respect to the UTF8_LCASE collation. String trimming is performed by
+   * first trimming the left side of the string, and then trimming the right side of the string.
+   * The method returns the trimmed string. If the `trimString` is null, the method returns null.
+   *
+   * @param srcString the input string to be trimmed from both ends of the string
+   * @param trimString the trim string characters to trim
+   * @return the trimmed string (for UTF8_LCASE collation)
+   */
   public static UTF8String lowercaseTrim(
       final UTF8String srcString,
       final UTF8String trimString) {
     return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString);
   }
 
+  /**
+   * Trims the `srcString` string from both ends of the string using the specified `trimString`
+   * characters, with respect to all ICU collations in Spark. String trimming is performed by
+   * first trimming the left side of the string, and then trimming the right side of the string.
+   * The method returns the trimmed string. If the `trimString` is null, the method returns null.
+   *
+   * @param srcString the input string to be trimmed from both ends of the string
+   * @param trimString the trim string characters to trim
+   * @return the trimmed string (for ICU collations)
+   */
   public static UTF8String trim(
       final UTF8String srcString,
       final UTF8String trimString,
@@ -704,106 +758,218 @@ public static UTF8String trim(
     return trimRight(trimLeft(srcString, trimString, collationId), trimString, collationId);
   }
 
+  /**
+   * Trims the `srcString` string from the left side using the specified `trimString` characters,
+   * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash
+   * set of lowercased code points in `trimString`, and then iterates over the `srcString` from
+   * the left side, until reaching a character whose lowercased code point is not in the hash set.
+   * Finally, the method returns the substring from that position to the end of `srcString`.
+   * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
+   *
+   * @param srcString the input string to be trimmed from the left end of the string
+   * @param trimString the trim string characters to trim
+   * @return the trimmed string (for UTF8_LCASE collation)
+   */
   public static UTF8String lowercaseTrimLeft(
       final UTF8String srcString,
       final UTF8String trimString) {
-    // Matching UTF8String behavior for null `trimString`.
+    // Matching the default UTF8String behavior for null `trimString`.
     if (trimString == null) {
       return null;
     }
 
+    // Create a hash set of lowercased code points for all characters of `trimString`.
     HashSet<Integer> trimChars = new HashSet<>();
     Iterator<Integer> trimIter = trimString.codePointIterator();
-    while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next()));
+    while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next()));
 
-    int searchIndex = 0;
+    // Iterate over `srcString` from the left to find the first character that is not in the set.
+    int searchIndex = 0, codePoint;
     Iterator<Integer> srcIter = srcString.codePointIterator();
     while (srcIter.hasNext()) {
-      if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) break;
-      ++searchIndex;
+      codePoint = getLowercaseCodePoint(srcIter.next());
+      // Special handling for Turkish dotted uppercase letter I.
+      if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() &&
+          trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
+        int nextCodePoint = getLowercaseCodePoint(srcIter.next());
+        if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
+          || nextCodePoint == CODE_POINT_COMBINING_DOT) searchIndex += 2;
+        else {
+          if (trimChars.contains(codePoint)) ++searchIndex;
+          break;
+        }
+      } else if (trimChars.contains(codePoint)) ++searchIndex;
+      else break;
     }
 
-    return srcString.substring(searchIndex, srcString.numChars());
+    // Return the substring from that position to the end of the string.
+    return searchIndex == 0 ? srcString : srcString.substring(searchIndex, srcString.numChars());
   }
 
+  /**
+   * Trims the `srcString` string from the left side using the specified `trimString` characters,
+   * with respect to ICU collations. For these collations, the method iterates over `srcString`
+   * from left to right, and repeatedly skips the longest possible substring that matches any
+   * character in `trimString`, until reaching a character that is not found in `trimString`.
+   * Finally, the method returns the substring from that position to the end of `srcString`.
+   * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
+   *
+   * @param srcString the input string to be trimmed from the left end of the string
+   * @param trimString the trim string characters to trim
+   * @return the trimmed string (for ICU collations)
+   */
   public static UTF8String trimLeft(
       final UTF8String srcString,
       final UTF8String trimString,
       final int collationId) {
-    // Matching UTF8String behavior for null `trimString`.
-    if (trimString == null) {
-      return null;
-    }
+    // Short-circuit for base cases.
+    if (trimString == null) return null;
+    if (srcString.numBytes() == 0) return srcString;
 
-    // Create a set of collation keys for all characters of the trim string, for fast lookup.
-    String trim = trimString.toString();
-    HashSet<String> trimChars = new HashSet<>();
-    for (int i = 0; i < trim.length(); i++) {
-      trimChars.add(CollationFactory.getCollationKey(String.valueOf(trim.charAt(i)), collationId));
+    // Create an array of Strings for all characters of `trimString`.
+    int trimCharIndex = 0;
+    String[] trimChars = new String[trimString.numChars()];
+    Iterator<Integer> trimIter = trimString.codePointIterator(
+      CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+    while (trimIter.hasNext()) {
+      trimChars[trimCharIndex++] = String.valueOf((char) trimIter.next().intValue());
     }
 
     // Iterate over srcString from the left and find the first character that is not in trimChars.
-    String input = srcString.toString();
-    int i = 0;
-    while (i < input.length()) {
-      String key = CollationFactory.getCollationKey(String.valueOf(input.charAt(i)), collationId);
-      if (!trimChars.contains(key)) break;
-      ++i;
+    String src = srcString.toValidString();
+    CharacterIterator target = new StringCharacterIterator(src);
+    Collator collator = CollationFactory.fetchCollation(collationId).collator;
+    int charIndex = 0, longestMatchLen;
+    while (charIndex < src.length()) {
+      longestMatchLen = 0;
+      for (String trim : trimChars) {
+        StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator);
+        stringSearch.setIndex(charIndex);
+        int matchIndex = stringSearch.next();
+        if (matchIndex == charIndex) {
+          int matchLen = stringSearch.getMatchLength();
+          if (matchLen > longestMatchLen) {
+            longestMatchLen = matchLen;
+          }
+        }
+      }
+      if (longestMatchLen == 0) break;
+      else charIndex += longestMatchLen;
     }
-    // Return the substring from that position to the end of the string.
-    return UTF8String.fromString(input.substring(i, srcString.numChars()));
+
+    // Return the substring from the calculated position until the end of the string.
+    return UTF8String.fromString(src.substring(charIndex));
   }
 
+  /**
+   * Trims the `srcString` string from the right side using the specified `trimString` characters,
+   * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash
+   * set of lowercased code points in `trimString`, and then iterates over the `srcString` from
+   * the right side, until reaching a character whose lowercased code point is not in the hash set.
+   * Finally, the method returns the substring from the start of `srcString` until that position.
+   * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
+   *
+   * @param srcString the input string to be trimmed from the right end of the string
+   * @param trimString the trim string characters to trim
+   * @return the trimmed string (for UTF8_LCASE collation)
+   */
   public static UTF8String lowercaseTrimRight(
       final UTF8String srcString,
       final UTF8String trimString) {
-    // Matching UTF8String behavior for null `trimString`.
+    // Matching the default UTF8String behavior for null `trimString`.
     if (trimString == null) {
       return null;
     }
 
+    // Create a hash set of lowercased code points for all characters of `trimString`.
     HashSet<Integer> trimChars = new HashSet<>();
     Iterator<Integer> trimIter = trimString.codePointIterator();
-    while (trimIter.hasNext()) trimChars.add(UCharacter.toLowerCase(trimIter.next()));
+    while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next()));
 
-    int searchIndex = srcString.numChars();
+    // Iterate over `srcString` from the right to find the first character that is not in the set.
+    int searchIndex = srcString.numChars(), codePoint;
     Iterator<Integer> srcIter = srcString.reverseCodePointIterator();
     while (srcIter.hasNext()) {
-      if (!trimChars.contains(UCharacter.toLowerCase(srcIter.next()))) {
-        break;
-      }
-      --searchIndex;
+      codePoint = getLowercaseCodePoint(srcIter.next());
+      // Special handling for Turkish dotted uppercase letter I.
+      if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() &&
+          trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
+        int nextCodePoint = getLowercaseCodePoint(srcIter.next());
+        if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
+          || nextCodePoint == CODE_POINT_LOWERCASE_I) searchIndex -= 2;
+        else {
+          if (trimChars.contains(codePoint)) --searchIndex;
+          break;
+        }
+      } else if (trimChars.contains(codePoint)) --searchIndex;
+      else break;
     }
 
-    return srcString.substring(0, searchIndex);
+    // Return the substring from the start of the string to the calculated position.
+    return searchIndex == srcString.numChars() ? srcString : srcString.substring(0, searchIndex);
   }
 
+  /**
+   * Trims the `srcString` string from the right side using the specified `trimString` characters,
+   * with respect to ICU collations. For these collations, the method iterates over `srcString`
+   * from right to left, and repeatedly skips the longest possible substring that matches any
+   * character in `trimString`, until reaching a character that is not found in `trimString`.
+   * Finally, the method returns the substring from the start of `srcString` until that position.
+   * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned.
+   *
+   * @param srcString the input string to be trimmed from the right end of the string
+   * @param trimString the trim string characters to trim
+   * @return the trimmed string (for ICU collations)
+   */
   public static UTF8String trimRight(
       final UTF8String srcString,
       final UTF8String trimString,
       final int collationId) {
-    // Matching UTF8String behavior for null `trimString`.
-    if (trimString == null) {
-      return null;
-    }
+    // Short-circuit for base cases.
+    if (trimString == null) return null;
+    if (srcString.numBytes() == 0) return srcString;
 
-    // Create a set of collation keys for all characters of the trim string, for fast lookup.
-    String trim = trimString.toString();
-    HashSet<String> trimChars = new HashSet<>();
-    for (int i = 0; i < trim.length(); i++) {
-      trimChars.add(CollationFactory.getCollationKey(String.valueOf(trim.charAt(i)), collationId));
+    // Create an array of Strings for all characters of `trimString`.
+    int trimCharIndex = 0;
+    String[] trimChars = new String[trimString.numChars()];
+    Iterator<Integer> trimIter = trimString.codePointIterator(
+      CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+    while (trimIter.hasNext()) {
+      trimChars[trimCharIndex++] = String.valueOf((char) trimIter.next().intValue());
     }
 
-    // Iterate over srcString from the right and find the first character that is not in trimChars.
-    String input = srcString.toString();
-    int i = input.length() - 1;
-    while (i >= 0) {
-      String key = CollationFactory.getCollationKey(String.valueOf(input.charAt(i)), collationId);
-      if (!trimChars.contains(key)) break;
-      --i;
+    // Iterate over srcString from the right and find the first character that is not in trimChars.
+    String src = srcString.toValidString();
+    CharacterIterator target = new StringCharacterIterator(src);
+    Collator collator = CollationFactory.fetchCollation(collationId).collator;
+    int charIndex = src.length(), longestMatchLen;
+    while (charIndex >= 0) {
+      longestMatchLen = 0;
+      for (String trim : trimChars) {
+        StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator);
+        // Note: stringSearch.previous() is NOT consistent with stringSearch.next()!
+        //  Example: StringSearch("İ", "i\\u0307İi\\u0307İi\\u0307İ", "UNICODE_CI")
+        //    stringSearch.next() gives: [0, 2, 3, 5, 6, 8].
+        //    stringSearch.previous() gives: [8, 6, 3, 0].
+        stringSearch.setIndex(Math.max(charIndex - 3, 0));
+        int matchIndex = stringSearch.next();
+        int matchLen = stringSearch.getMatchLength();
+        while (matchIndex != StringSearch.DONE && matchIndex < charIndex - matchLen) {
+          matchIndex = stringSearch.next();
+          matchLen = stringSearch.getMatchLength();
+        }
+        if (matchIndex == charIndex - matchLen) {
+          if (matchLen > longestMatchLen) {
+            longestMatchLen = matchLen;
+          }
+        }
+      }
+      if (longestMatchLen == 0) break;
+      else charIndex -= longestMatchLen;
     }
+
     // Return the substring from the start of the string until that position.
-    return UTF8String.fromString(input.substring(0, i + 1));
+    return UTF8String.fromString(src.substring(0, charIndex));
   }
 
   // TODO: Add more collation-aware UTF8String operations here.
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index c632e51338d0c..42a5e5f3a315d 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -1224,20 +1224,34 @@ private void assertStringTrim(
       String sourceString,
       String trimString,
       String expectedResultString) throws SparkException {
+    // Prepare the input and expected result.
     int collationId = CollationFactory.collationNameToId(collation);
-    String result;
+    UTF8String src = UTF8String.fromString(sourceString);
+    UTF8String trim = UTF8String.fromString(trimString);
+    UTF8String resultTrimLeftRight, resultTrimRightLeft;
+    String resultTrim;
 
     if (trimString == null) {
-      result = CollationSupport.StringTrim.exec(
-        UTF8String.fromString(sourceString)).toString();
+      // Trim string is ASCII space.
+      resultTrim = CollationSupport.StringTrim.exec(src).toString();
+      UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src);
+      resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft);
+      UTF8String trimRight = CollationSupport.StringTrimRight.exec(src);
+      resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight);
     } else {
-      result = CollationSupport.StringTrim.exec(
-        UTF8String
-          .fromString(sourceString), UTF8String.fromString(trimString), collationId)
-          .toString();
+      // Trim string is specified.
+      resultTrim = CollationSupport.StringTrim.exec(src, trim, collationId).toString();
+      UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src, trim, collationId);
+      resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft, trim, collationId);
+      UTF8String trimRight = CollationSupport.StringTrimRight.exec(src, trim, collationId);
+      resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight, trim, collationId);
     }
 
-    assertEquals(expectedResultString, result);
+    // Test that StringTrim result is as expected.
+    assertEquals(expectedResultString, resultTrim);
+    // Test that the order of the trims is not important.
+    assertEquals(resultTrimLeftRight.toString(), resultTrim);
+    assertEquals(resultTrimRightLeft.toString(), resultTrim);
   }
 
   private void assertStringTrimLeft(
@@ -1245,19 +1259,21 @@ private void assertStringTrimLeft(
       String sourceString,
       String trimString,
       String expectedResultString) throws SparkException {
+    // Prepare the input and expected result.
     int collationId = CollationFactory.collationNameToId(collation);
+    UTF8String src = UTF8String.fromString(sourceString);
+    UTF8String trim = UTF8String.fromString(trimString);
     String result;
 
     if (trimString == null) {
-      result = CollationSupport.StringTrimLeft.exec(
-        UTF8String.fromString(sourceString)).toString();
+      // Trim string is ASCII space.
+      result = CollationSupport.StringTrimLeft.exec(src).toString();
     } else {
-      result = CollationSupport.StringTrimLeft.exec(
-        UTF8String
-          .fromString(sourceString), UTF8String.fromString(trimString), collationId)
-          .toString();
+      // Trim string is specified.
+      result = CollationSupport.StringTrimLeft.exec(src, trim, collationId).toString();
     }
 
+    // Test that StringTrimLeft result is as expected.
     assertEquals(expectedResultString, result);
   }
 
@@ -1266,25 +1282,27 @@ private void assertStringTrimRight(
       String sourceString,
       String trimString,
       String expectedResultString) throws SparkException {
+    // Prepare the input and expected result.
     int collationId = CollationFactory.collationNameToId(collation);
+    UTF8String src = UTF8String.fromString(sourceString);
+    UTF8String trim = UTF8String.fromString(trimString);
     String result;
 
     if (trimString == null) {
-      result = CollationSupport.StringTrimRight.exec(
-        UTF8String.fromString(sourceString)).toString();
+      // Trim string is ASCII space.
+      result = CollationSupport.StringTrimRight.exec(src).toString();
     } else {
-      result = CollationSupport.StringTrimRight.exec(
-        UTF8String
-          .fromString(sourceString), UTF8String.fromString(trimString), collationId)
-          .toString();
+      // Trim string is specified.
+      result = CollationSupport.StringTrimRight.exec(src, trim, collationId).toString();
     }
 
+    // Test that StringTrimRight result is as expected.
     assertEquals(expectedResultString, result);
   }
 
   @Test
   public void testStringTrim() throws SparkException {
-    // UTF8_BINARY
+    // Basic tests - UTF8_BINARY.
     assertStringTrim("UTF8_BINARY", "", "", "");
     assertStringTrim("UTF8_BINARY", "", "xyz", "");
     assertStringTrim("UTF8_BINARY", "asd", "", "asd");
@@ -1312,8 +1330,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd");
     assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a");
-
-    // UTF8_LCASE
+    // Basic tests - UTF8_LCASE.
     assertStringTrim("UTF8_LCASE", "", "", "");
     assertStringTrim("UTF8_LCASE", "", "xyz", "");
     assertStringTrim("UTF8_LCASE", "asd", "", "asd");
@@ -1341,8 +1358,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd");
     assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a");
-
-    // UNICODE
+    // Basic tests - UNICODE.
     assertStringTrim("UNICODE", "", "", "");
     assertStringTrim("UNICODE", "", "xyz", "");
     assertStringTrim("UNICODE", "asd", "", "asd");
@@ -1370,8 +1386,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UNICODE", "asd", "x", "asd");
     assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a");
-
-    // UNICODE_CI
+    // Basic tests - UNICODE_CI.
     assertStringTrim("UNICODE_CI", "", "", "");
     assertStringTrim("UNICODE_CI", "", "xyz", "");
     assertStringTrim("UNICODE_CI", "asd", "", "asd");
@@ -1400,69 +1415,495 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd");
     assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a");
 
-    // Test cases where trimString has more than one character
+    // Case variation - UTF8_BINARY.
+    assertStringTrim("UTF8_BINARY", "asd", "A", "asd");
     assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX");
+    assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD");
     assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa");
     assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX");
-
-    assertStringTrim("UTF8_LCASE", "ddsXXXaa", "asd", "XXX");
-    assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "asd", "XXXaa");
-    assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "asd", "ddsXXX");
-
+    // Case variation - UTF8_LCASE.
+    assertStringTrim("UTF8_LCASE", "asd", "A", "sd");
+    assertStringTrim("UTF8_LCASE", "ASD", "a", "SD");
+    assertStringTrim("UTF8_LCASE", "ddsXXXaa", "ASD", "XXX");
+    assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa");
+    assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX");
+    // Case variation - UNICODE.
+    assertStringTrim("UNICODE", "asd", "A", "asd");
+    assertStringTrim("UNICODE", "ASD", "a", "ASD");
     assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX");
     assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa");
     assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX");
-
-    // Test cases specific to collation type
-    // uppercase trim, lowercase src
-    assertStringTrim("UTF8_BINARY", "asd", "A", "asd");
-    assertStringTrim("UTF8_LCASE", "asd", "A", "sd");
-    assertStringTrim("UNICODE", "asd", "A", "asd");
+    // Case variation - UNICODE_CI.
     assertStringTrim("UNICODE_CI", "asd", "A", "sd");
-
-    // lowercase trim, uppercase src
-    assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD");
-    assertStringTrim("UTF8_LCASE", "ASD", "a", "SD");
-    assertStringTrim("UNICODE", "ASD", "a", "ASD");
     assertStringTrim("UNICODE_CI", "ASD", "a", "SD");
+    assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX");
+    assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa");
+    assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX");
 
-    // uppercase and lowercase chars of different byte-length (utf8)
+    // Case-variable character length - UTF8_BINARY.
     assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
     assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
     assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
-
-    assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa");
-    assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
-    assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
-
-    assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
-    assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
-    assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
-
     assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
     assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
     assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
-
-    assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa");
-    assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß");
-    assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa");
-
-    assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
-    assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
-    assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
-
-    // different byte-length (utf8) chars trimmed
     assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa");
     assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ");
     assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa");
-
+    // Case-variable character length - UTF8_LCASE.
+    assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa");
+    assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
+    assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
+    assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa");
+    assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß");
+    assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa");
     assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa");
     assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
     assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
-
+    // Case-variable character length - UNICODE.
+    assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+    assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+    assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+    assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+    assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+    assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
     assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa");
     assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
     assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+    // Case-variable character length - UNICODE_CI.
+    assertStringTrim("UNICODE_CI", "ẞaaaẞ", "ß", "aaa");
+    assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ");
+    assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa");
+    assertStringTrim("UNICODE_CI", "ßaaaß", "ẞ", "aaa");
+    assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß");
+    assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa");
+    assertStringTrim("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaa");
+    assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+    assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+
+    // One-to-many case mapping - UTF8_BINARY.
+    assertStringTrim("UTF8_BINARY", "i", "i", "");
+    assertStringTrim("UTF8_BINARY", "iii", "I", "iii");
+    assertStringTrim("UTF8_BINARY", "I", "iii", "I");
+    assertStringTrim("UTF8_BINARY", "ixi", "i", "x");
+    assertStringTrim("UTF8_BINARY", "i", "İ", "i");
+    assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+    assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307");
+    assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i");
+    assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", "");
+    assertStringTrim("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+    assertStringTrim("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+    assertStringTrim("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+    assertStringTrim("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrim("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ");
+    assertStringTrim("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307");
+    assertStringTrim("UTF8_BINARY", "İ", "İ", "");
+    assertStringTrim("UTF8_BINARY", "IXi", "İ", "IXi");
+    assertStringTrim("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrim("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+    assertStringTrim("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+    assertStringTrim("UTF8_BINARY", "İ", "i", "İ");
+    assertStringTrim("UTF8_BINARY", "İ", "\u0307", "İ");
+    assertStringTrim("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrim("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrim("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
+    assertStringTrimLeft("UTF8_BINARY", "i", "i", "");
+    assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii");
+    assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I");
+    assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi");
+    assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i\u0307", "");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ");
+    assertStringTrimLeft("UTF8_BINARY", "İ", "İ", "");
+    assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi");
+    assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+    assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+    assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ");
+    assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ");
+    assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307");
+    assertStringTrimRight("UTF8_BINARY", "i", "i", "");
+    assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii");
+    assertStringTrimRight("UTF8_BINARY", "I", "iii", "I");
+    assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix");
+    assertStringTrimRight("UTF8_BINARY", "i", "İ", "i");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", "");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307");
+    assertStringTrimRight("UTF8_BINARY", "İ", "İ", "");
+    assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi");
+    assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+    assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+    assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ");
+    assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ");
+    assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
+    // One-to-many case mapping - UTF8_LCASE.
+    assertStringTrim("UTF8_LCASE", "i", "i", "");
+    assertStringTrim("UTF8_LCASE", "iii", "I", "");
+    assertStringTrim("UTF8_LCASE", "I", "iii", "");
+    assertStringTrim("UTF8_LCASE", "ixi", "i", "x");
+    assertStringTrim("UTF8_LCASE", "i", "İ", "i");
+    assertStringTrim("UTF8_LCASE", "i\u0307", "İ", "");
+    assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307");
+    assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i");
+    assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", "");
+    assertStringTrim("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+    assertStringTrim("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+    assertStringTrim("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+    assertStringTrim("UTF8_LCASE", "i\u0307i", "İ", "i");
+    assertStringTrim("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ");
+    assertStringTrim("UTF8_LCASE", "i\u0307İ", "İ", "");
+    assertStringTrim("UTF8_LCASE", "İ", "İ", "");
+    assertStringTrim("UTF8_LCASE", "IXi", "İ", "IXi");
+    assertStringTrim("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307");
+    assertStringTrim("UTF8_LCASE", "i\u0307x", "IXİ", "");
+    assertStringTrim("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+    assertStringTrim("UTF8_LCASE", "İ", "i", "İ");
+    assertStringTrim("UTF8_LCASE", "İ", "\u0307", "İ");
+    assertStringTrim("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
+    assertStringTrim("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
+    assertStringTrim("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+    assertStringTrimLeft("UTF8_LCASE", "i", "i", "");
+    assertStringTrimLeft("UTF8_LCASE", "iii", "I", "");
+    assertStringTrimLeft("UTF8_LCASE", "I", "iii", "");
+    assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi");
+    assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", "");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", "");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", "");
+    assertStringTrimLeft("UTF8_LCASE", "İ", "İ", "");
+    assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi");
+    assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", "");
+    assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+    assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ");
+    assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ");
+    assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
+    assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
+    assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+    assertStringTrimRight("UTF8_LCASE", "i", "i", "");
+    assertStringTrimRight("UTF8_LCASE", "iii", "I", "");
+    assertStringTrimRight("UTF8_LCASE", "I", "iii", "");
+    assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix");
+    assertStringTrimRight("UTF8_LCASE", "i", "İ", "i");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", "");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", "");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", "");
+    assertStringTrimRight("UTF8_LCASE", "İ", "İ", "");
+    assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi");
+    assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", "");
+    assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+    assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ");
+    assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ");
+    assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrimRight("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+    // One-to-many case mapping - UNICODE.
+    assertStringTrim("UNICODE", "i", "i", "");
+    assertStringTrim("UNICODE", "iii", "I", "iii");
+    assertStringTrim("UNICODE", "I", "iii", "I");
+    assertStringTrim("UNICODE", "ixi", "i", "x");
+    assertStringTrim("UNICODE", "i", "İ", "i");
+    assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307");
+    assertStringTrim("UNICODE", "i\u0307", "i", "i\u0307");
+    assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrim("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+    assertStringTrim("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+    assertStringTrim("UNICODE", "i\u0307i", "i\u0307", "i\u0307");
+    assertStringTrim("UNICODE", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrim("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrim("UNICODE", "i\u0307İ", "İ", "i\u0307");
+    assertStringTrim("UNICODE", "İ", "İ", "");
+    assertStringTrim("UNICODE", "IXi", "İ", "IXi");
+    assertStringTrim("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrim("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+    assertStringTrim("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307");
+    assertStringTrim("UNICODE", "İ", "i", "İ");
+    assertStringTrim("UNICODE", "İ", "\u0307", "İ");
+    assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrim("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrim("UNICODE", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrim("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
+    assertStringTrimLeft("UNICODE", "i", "i", "");
+    assertStringTrimLeft("UNICODE", "iii", "I", "iii");
+    assertStringTrimLeft("UNICODE", "I", "iii", "I");
+    assertStringTrimLeft("UNICODE", "ixi", "i", "xi");
+    assertStringTrimLeft("UNICODE", "i", "İ", "i");
+    assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i");
+    assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ");
+    assertStringTrimLeft("UNICODE", "İ", "İ", "");
+    assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi");
+    assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+    assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x");
+    assertStringTrimLeft("UNICODE", "İ", "i", "İ");
+    assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ");
+    assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
+    assertStringTrimRight("UNICODE", "i", "i", "");
+    assertStringTrimRight("UNICODE", "iii", "I", "iii");
+    assertStringTrimRight("UNICODE", "I", "iii", "I");
+    assertStringTrimRight("UNICODE", "ixi", "i", "ix");
+    assertStringTrimRight("UNICODE", "i", "İ", "i");
+    assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrimRight("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307");
+    assertStringTrimRight("UNICODE", "İ", "İ", "");
+    assertStringTrimRight("UNICODE", "IXi", "İ", "IXi");
+    assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+    assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307");
+    assertStringTrimRight("UNICODE", "İ", "i", "İ");
+    assertStringTrimRight("UNICODE", "İ", "\u0307", "İ");
+    assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
+    // One-to-many case mapping - UNICODE_CI.
+    assertStringTrim("UNICODE_CI", "i", "i", "");
+    assertStringTrim("UNICODE_CI", "iii", "I", "");
+    assertStringTrim("UNICODE_CI", "I", "iii", "");
+    assertStringTrim("UNICODE_CI", "ixi", "i", "x");
+    assertStringTrim("UNICODE_CI", "i", "İ", "i");
+    assertStringTrim("UNICODE_CI", "i\u0307", "İ", "");
+    assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307i", "İ", "i");
+    assertStringTrim("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrim("UNICODE_CI", "i\u0307İ", "İ", "");
+    assertStringTrim("UNICODE_CI", "İ", "İ", "");
+    assertStringTrim("UNICODE_CI", "IXi", "İ", "IXi");
+    assertStringTrim("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307");
+    assertStringTrim("UNICODE_CI", "i\u0307x", "IXİ", "");
+    assertStringTrim("UNICODE_CI", "i\u0307x", "I\u0307xİ", "");
+    assertStringTrim("UNICODE_CI", "İ", "i", "İ");
+    assertStringTrim("UNICODE_CI", "İ", "\u0307", "İ");
+    assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrim("UNICODE_CI", "Ixİ", "i\u0307", "xİ");
+    assertStringTrim("UNICODE_CI", "IXİ", "ix\u0307", "İ");
+    assertStringTrim("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i", "i", "");
+    assertStringTrimLeft("UNICODE_CI", "iii", "I", "");
+    assertStringTrimLeft("UNICODE_CI", "I", "iii", "");
+    assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi");
+    assertStringTrimLeft("UNICODE_CI", "i", "İ", "i");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307", "İ", "");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307i");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307i", "İ", "i");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "İ", "");
+    assertStringTrimLeft("UNICODE_CI", "İ", "İ", "");
+    assertStringTrimLeft("UNICODE_CI", "IXi", "İ", "IXi");
+    assertStringTrimLeft("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307x", "IXİ", "");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307x", "I\u0307xİ", "");
+    assertStringTrimLeft("UNICODE_CI", "İ", "i", "İ");
+    assertStringTrimLeft("UNICODE_CI", "İ", "\u0307", "İ");
+    assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimLeft("UNICODE_CI", "Ixİ", "i\u0307", "xİ");
+    assertStringTrimLeft("UNICODE_CI", "IXİ", "ix\u0307", "İ");
+    assertStringTrimLeft("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307");
+    assertStringTrimRight("UNICODE_CI", "i", "i", "");
+    assertStringTrimRight("UNICODE_CI", "iii", "I", "");
+    assertStringTrimRight("UNICODE_CI", "I", "iii", "");
+    assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix");
+    assertStringTrimRight("UNICODE_CI", "i", "İ", "i");
+    assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", "");
+    assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307i", "İ", "i\u0307i");
+    assertStringTrimRight("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ");
+    assertStringTrimRight("UNICODE_CI", "i\u0307İ", "İ", "");
+    assertStringTrimRight("UNICODE_CI", "İ", "İ", "");
+    assertStringTrimRight("UNICODE_CI", "IXi", "İ", "IXi");
+    assertStringTrimRight("UNICODE_CI", "ix\u0307", "Ixİ", "ix\u0307");
+    assertStringTrimRight("UNICODE_CI", "i\u0307x", "IXİ", "");
+    assertStringTrimRight("UNICODE_CI", "i\u0307x", "I\u0307xİ", "");
+    assertStringTrimRight("UNICODE_CI", "İ", "i", "İ");
+    assertStringTrimRight("UNICODE_CI", "İ", "\u0307", "İ");
+    assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+    assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ");
+    assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ");
+    assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307");
+
+    // Conditional case mapping - UTF8_BINARY.
+    assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς");
+    assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x");
+    assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+    assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x");
+    assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ");
+    assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+    assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+    assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+    assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x");
+    assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς");
+    assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς");
+    assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+    assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ");
+    assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ");
+    assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+    assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+    assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+    assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ");
+    assertStringTrimRight("UTF8_BINARY", "ςxς", "σ", "ςxς");
+    assertStringTrimRight("UTF8_BINARY", "ςxς", "ς", "ςx");
+    assertStringTrimRight("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+    assertStringTrimRight("UTF8_BINARY", "σxσ", "σ", "σx");
+    assertStringTrimRight("UTF8_BINARY", "σxσ", "ς", "σxσ");
+    assertStringTrimRight("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+    assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+    assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+    assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx");
+    // Conditional case mapping - UTF8_LCASE.
+    assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x");
+    assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x");
+    assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x");
+    assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x");
+    assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x");
+    assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x");
+    assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x");
+    assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x");
+    assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x");
+    assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς");
+    assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς");
+    assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς");
+    assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ");
+    assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ");
+    assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ");
+    assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ");
+    assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ");
+    assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ");
+    assertStringTrimRight("UTF8_LCASE", "ςxς", "σ", "ςx");
+    assertStringTrimRight("UTF8_LCASE", "ςxς", "ς", "ςx");
+    assertStringTrimRight("UTF8_LCASE", "ςxς", "Σ", "ςx");
+    assertStringTrimRight("UTF8_LCASE", "σxσ", "σ", "σx");
+    assertStringTrimRight("UTF8_LCASE", "σxσ", "ς", "σx");
+    assertStringTrimRight("UTF8_LCASE", "σxσ", "Σ", "σx");
+    assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx");
+    assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx");
+    assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx");
+    // Conditional case mapping - UNICODE.
+    assertStringTrim("UNICODE", "ςxς", "σ", "ςxς");
+    assertStringTrim("UNICODE", "ςxς", "ς", "x");
+    assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς");
+    assertStringTrim("UNICODE", "σxσ", "σ", "x");
+    assertStringTrim("UNICODE", "σxσ", "ς", "σxσ");
+    assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ");
+    assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+    assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+    assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x");
+    assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς");
+    assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς");
+    assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς");
+    assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ");
+    assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ");
+    assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ");
+    assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+    assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+    assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ");
+    assertStringTrimRight("UNICODE", "ςxς", "σ", "ςxς");
+    assertStringTrimRight("UNICODE", "ςxς", "ς", "ςx");
+    assertStringTrimRight("UNICODE", "ςxς", "Σ", "ςxς");
+    assertStringTrimRight("UNICODE", "σxσ", "σ", "σx");
+    assertStringTrimRight("UNICODE", "σxσ", "ς", "σxσ");
+    assertStringTrimRight("UNICODE", "σxσ", "Σ", "σxσ");
+    assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+    assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+    assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx");
+    // Conditional case mapping - UNICODE_CI.
+    assertStringTrim("UNICODE_CI", "ςxς", "σ", "x");
+    assertStringTrim("UNICODE_CI", "ςxς", "ς", "x");
+    assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x");
+    assertStringTrim("UNICODE_CI", "σxσ", "σ", "x");
+    assertStringTrim("UNICODE_CI", "σxσ", "ς", "x");
+    assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x");
+    assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x");
+    assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x");
+    assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x");
+    assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς");
+    assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς");
+    assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς");
+    assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ");
+    assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ");
+    assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ");
+    assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ");
+    assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ");
+    assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ");
+    assertStringTrimRight("UNICODE_CI", "ςxς", "σ", "ςx");
+    assertStringTrimRight("UNICODE_CI", "ςxς", "ς", "ςx");
+    assertStringTrimRight("UNICODE_CI", "ςxς", "Σ", "ςx");
+    assertStringTrimRight("UNICODE_CI", "σxσ", "σ", "σx");
+    assertStringTrimRight("UNICODE_CI", "σxσ", "ς", "σx");
+    assertStringTrimRight("UNICODE_CI", "σxσ", "Σ", "σx");
+    assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx");
+    assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx");
+    assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx");
   }
 
   // TODO: Test more collation-aware string expressions.

From 176b148972300286186ab6a2dcf915522f1bf1d7 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 8 Jul 2024 05:10:48 +0200
Subject: [PATCH 08/14] Add tests

---
 .../unsafe/types/CollationSupportSuite.java     | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 42a5e5f3a315d..48897156342f8 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -1904,6 +1904,23 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx");
     assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx");
     assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx");
+
+    // Unicode normalization - UTF8_BINARY.
+    assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+    assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+    assertStringTrimRight("UTF8_BINARY", "åβγδa\u030A", "å", "åβγδa\u030A");
+    // Unicode normalization - UTF8_LCASE.
+    assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
+    assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
+    assertStringTrimRight("UTF8_LCASE", "åβγδa\u030A", "Å", "åβγδa\u030A");
+    // Unicode normalization - UNICODE.
+    assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ");
+    assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A");
+    assertStringTrimRight("UNICODE", "åβγδa\u030A", "å", "åβγδ");
+    // Unicode normalization - UNICODE_CI.
+    assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ");
+    assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A");
+    assertStringTrimRight("UNICODE_CI", "åβγδa\u030A", "Å", "åβγδ");
   }
 
   // TODO: Test more collation-aware string expressions.

From 598f52216f8ca43dbb2a3c75f40cfc3d3aba4f70 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 8 Jul 2024 05:14:24 +0200
Subject: [PATCH 09/14] Remove unused code

---
 .../spark/sql/catalyst/util/CollationFactory.java    | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index c4bc0eda81511..f13f66e384e0f 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -813,18 +813,6 @@ public static String[] getICULocaleNames() {
     return Collation.CollationSpecICU.ICULocaleNames;
   }
 
-  public static String getCollationKey(String input, int collationId) {
-    Collation collation = fetchCollation(collationId);
-    if (collation.supportsBinaryEquality) {
-      return input;
-    } else if (collation.supportsLowercaseEquality) {
-      return input.toLowerCase();
-    } else {
-      CollationKey collationKey = collation.collator.getCollationKey(input);
-      return Arrays.toString(collationKey.toByteArray());
-    }
-  }
-
   public static UTF8String getCollationKey(UTF8String input, int collationId) {
     Collation collation = fetchCollation(collationId);
     if (collation.supportsBinaryEquality) {

From 12cc084baccd82e243a893cb9d0e0440ceb4ff5a Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 8 Jul 2024 09:40:48 +0200
Subject: [PATCH 10/14] Fix java lint

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 73b76320247e0..427f8c023fef0 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -798,7 +798,9 @@ public static UTF8String lowercaseTrimLeft(
           if (trimChars.contains(codePoint)) ++searchIndex;
           break;
         }
-      } else if (trimChars.contains(codePoint)) ++searchIndex;
+      } else if (trimChars.contains(codePoint)) {
+        ++searchIndex;
+      }
       else break;
     }
 
@@ -901,7 +903,9 @@ public static UTF8String lowercaseTrimRight(
           if (trimChars.contains(codePoint)) --searchIndex;
           break;
         }
-      } else if (trimChars.contains(codePoint)) --searchIndex;
+      } else if (trimChars.contains(codePoint)) {
+        --searchIndex;
+      }
       else break;
     }
 

From 469c325be8f1e03216cdc1a3ca5c419e2bc6ac67 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 9 Jul 2024 20:28:00 +0200
Subject: [PATCH 11/14] Clarify Greek sigma mapping comment in getLowercaseCodePoint

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 427f8c023fef0..6fa07256ed9c4 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -467,7 +467,9 @@ private static int getLowercaseCodePoint(final int codePoint) {
       return CODE_POINT_COMBINED_LOWERCASE_I_DOT;
     }
     else if (codePoint == 0x03C2) {
-      // Greek final and non-final capital letter sigma should be mapped the same.
+      // Greek final and non-final letter sigma should be mapped the same. This is achieved by
+      // mapping Greek small final sigma (U+03C2) to Greek small non-final sigma (U+03C3). Capital
+      // letter sigma (U+03A3) is mapped to small non-final sigma (U+03C3) in the `else` branch.
       return 0x03C3;
     }
     else {

From 06575fc5a40adf4277993489ed9e9a5db19033ae Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 12 Jul 2024 07:54:08 +0200
Subject: [PATCH 12/14] Fix comments

---
 .../apache/spark/unsafe/types/CollationSupportSuite.java  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 48897156342f8..1a5c585791c5b 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -1792,7 +1792,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ");
     assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307");
 
-    // Conditional case mapping - UTF8_BINARY.
+    // Greek sigmas - UTF8_BINARY.
     assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς");
     assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x");
     assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς");
@@ -1820,7 +1820,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
     assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
     assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx");
-    // Conditional case mapping - UTF8_LCASE.
+    // Greek sigmas - UTF8_LCASE.
     assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x");
     assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x");
     assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x");
@@ -1848,7 +1848,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx");
     assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx");
     assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx");
-    // Conditional case mapping - UNICODE.
+    // Greek sigmas - UNICODE.
     assertStringTrim("UNICODE", "ςxς", "σ", "ςxς");
     assertStringTrim("UNICODE", "ςxς", "ς", "x");
     assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς");
@@ -1876,7 +1876,7 @@ public void testStringTrim() throws SparkException {
     assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
     assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
     assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx");
-    // Conditional case mapping - UNICODE_CI.
+    // Greek sigmas - UNICODE_CI.
     assertStringTrim("UNICODE_CI", "ςxς", "σ", "x");
     assertStringTrim("UNICODE_CI", "ςxς", "ς", "x");
     assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x");

From 8c5787d6fc079775ada3624d0b1c3e82e505182b Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Sun, 14 Jul 2024 13:42:50 +0200
Subject: [PATCH 13/14] Dedupe ICU trim characters; add braces and collationId Javadoc

---
 .../util/CollationAwareUTF8String.java        | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 06c06a8c62a26..b9868ca665a65 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -866,6 +866,7 @@ public static UTF8String lowercaseTrim(
    *
    * @param srcString the input string to be trimmed from both ends of the string
    * @param trimString the trim string characters to trim
+   * @param collationId the collation ID to use for string trimming
    * @return the trimmed string (for ICU collations)
    */
   public static UTF8String trim(
@@ -910,7 +911,9 @@ public static UTF8String lowercaseTrimLeft(
           trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
         int nextCodePoint = getLowercaseCodePoint(srcIter.next());
         if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
-          || nextCodePoint == CODE_POINT_COMBINING_DOT) searchIndex += 2;
+          || nextCodePoint == CODE_POINT_COMBINING_DOT) {
+          searchIndex += 2;
+        }
         else {
           if (trimChars.contains(codePoint)) ++searchIndex;
           break;
@@ -918,7 +921,9 @@ public static UTF8String lowercaseTrimLeft(
       } else if (trimChars.contains(codePoint)) {
         ++searchIndex;
       }
-      else break;
+      else {
+        break;
+      }
     }
 
     // Return the substring from that position to the end of the string.
@@ -935,6 +940,7 @@ public static UTF8String lowercaseTrimLeft(
    *
    * @param srcString the input string to be trimmed from the left end of the string
    * @param trimString the trim string characters to trim
+   * @param collationId the collation ID to use for string trimming
    * @return the trimmed string (for ICU collations)
    */
   public static UTF8String trimLeft(
@@ -946,12 +952,12 @@ public static UTF8String trimLeft(
     if (srcString.numBytes() == 0) return srcString;
 
     // Create an array of Strings for all characters of `trimString`.
-    int trimCharIndex = 0;
-    String[] trimChars = new String[trimString.numChars()];
+    Map<Integer, String> trimChars = new HashMap<>();
     Iterator<Integer> trimIter = trimString.codePointIterator(
       CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
     while (trimIter.hasNext()) {
-      trimChars[trimCharIndex++] = String.valueOf((char) trimIter.next().intValue());
+      int codePoint = trimIter.next();
+      trimChars.putIfAbsent(codePoint, new String(Character.toChars(codePoint)));
     }
 
     // Iterate over srcString from the left and find the first character that is not in trimChars.
@@ -961,7 +967,7 @@ public static UTF8String trimLeft(
     int charIndex = 0, longestMatchLen;
     while (charIndex < src.length()) {
       longestMatchLen = 0;
-      for (String trim : trimChars) {
+      for (String trim : trimChars.values()) {
         StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator);
         stringSearch.setIndex(charIndex);
         int matchIndex = stringSearch.next();
@@ -1015,7 +1021,9 @@ public static UTF8String lowercaseTrimRight(
           trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
         int nextCodePoint = getLowercaseCodePoint(srcIter.next());
         if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint))
-          || nextCodePoint == CODE_POINT_LOWERCASE_I) searchIndex -= 2;
+          || nextCodePoint == CODE_POINT_LOWERCASE_I) {
+          searchIndex -= 2;
+        }
         else {
           if (trimChars.contains(codePoint)) --searchIndex;
           break;
@@ -1023,7 +1031,9 @@ public static UTF8String lowercaseTrimRight(
       } else if (trimChars.contains(codePoint)) {
         --searchIndex;
       }
-      else break;
+      else {
+        break;
+      }
     }
 
     // Return the substring from the start of the string to the calculated position.
@@ -1040,6 +1050,7 @@ public static UTF8String lowercaseTrimRight(
    *
    * @param srcString the input string to be trimmed from the right end of the string
    * @param trimString the trim string characters to trim
+   * @param collationId the collation ID to use for string trimming
    * @return the trimmed string (for ICU collations)
    */
   public static UTF8String trimRight(
@@ -1051,12 +1062,12 @@ public static UTF8String trimRight(
     if (srcString.numBytes() == 0) return srcString;
 
     // Create an array of Strings for all characters of `trimString`.
-    int trimCharIndex = 0;
-    String[] trimChars = new String[trimString.numChars()];
+    Map<Integer, String> trimChars = new HashMap<>();
     Iterator<Integer> trimIter = trimString.codePointIterator(
       CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
     while (trimIter.hasNext()) {
-      trimChars[trimCharIndex++] = String.valueOf((char) trimIter.next().intValue());
+      int codePoint = trimIter.next();
+      trimChars.putIfAbsent(codePoint, new String(Character.toChars(codePoint)));
     }
 
     // Iterate over srcString from the left and find the first character that is not in trimChars.
@@ -1066,12 +1077,14 @@ public static UTF8String trimRight(
     int charIndex = src.length(), longestMatchLen;
     while (charIndex >= 0) {
       longestMatchLen = 0;
-      for (String trim : trimChars) {
+      for (String trim : trimChars.values()) {
         StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator);
         // Note: stringSearch.previous() is NOT consistent with stringSearch.next()!
         //  Example: StringSearch("İ", "i\\u0307İi\\u0307İi\\u0307İ", "UNICODE_CI")
         //    stringSearch.next() gives: [0, 2, 3, 5, 6, 8].
         //    stringSearch.previous() gives: [8, 6, 3, 0].
+        // Since 1 character can map to at most 3 characters in Unicode, we can begin the search
+        // from character position: `charIndex` - 3, and use `next()` to find the longest match.
         stringSearch.setIndex(Math.max(charIndex - 3, 0));
         int matchIndex = stringSearch.next();
         int matchLen = stringSearch.getMatchLength();

From d15d92aa9f0b15c879a531756f3f474264bec462 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Sun, 14 Jul 2024 22:51:41 +0200
Subject: [PATCH 14/14] Remove leftover placeholder "xxxx" StringTrim test

---
 .../apache/spark/sql/CollationStringExpressionsSuite.scala   | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 5387b2d435350..815a8bc595294 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -874,11 +874,6 @@ class CollationStringExpressionsSuite
     assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
-  test("xxxx") {
-    checkEvaluation(
-      StringTrim(Literal.create(null, StringType), Literal.create(null, StringType)), null)
-  }
-
   test("StringTrim* functions - unit tests for both paths (codegen and eval)") {
     def evalStringTrim(src: Any, trim: Any, result: String): Unit = {
       Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>