[SPARK-48281][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (StringInStr, SubstringIndex)

uros-db · cloud-fan · commit 0461745f1616 · 2024-05-29T11:16:37.000-07:00
### What changes were proposed in this pull request?
String searching in UTF8_BINARY_LCASE now works at the character level rather than at the byte level. For example, `instr("İ", "i")` now returns 0, because there is no pair `(start, len)` such that `lowercase(substring("İ", start, len)) == "i"`.

### Why are the changes needed?
Fix functions that give unusable results due to one-to-many case mapping when performing string search under UTF8_BINARY_LCASE (see the example above).

### Does this PR introduce _any_ user-facing change?
Yes, the behaviour of the `instr` and `substring_index` expressions is changed for edge cases with one-to-many case mapping.

### How was this patch tested?
New unit tests in `CollationSupportSuite`.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46589 from uros-db/alter-lcase-vol2.

Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -345,14 +345,14 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
    */
   public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
       final int start) {
-    if (pattern.numChars() == 0) return 0;
+    if (pattern.numChars() == 0) return target.indexOfEmpty(start);
     return lowercaseFind(target, pattern.toLowerCase(), start);
   }
 
   public static int indexOf(final UTF8String target, final UTF8String pattern,
       final int start, final int collationId) {
     if (pattern.numBytes() == 0) {
-      return 0;
+      return target.indexOfEmpty(start);
     }
 
     StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
@@ -444,47 +444,27 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
       return UTF8String.EMPTY_UTF8;
     }
 
-    UTF8String lowercaseString = string.toLowerCase();
     UTF8String lowercaseDelimiter = delimiter.toLowerCase();
 
     if (count > 0) {
-      int idx = -1;
+      // Search left to right (note: the start code point is inclusive).
+      int matchLength = -1;
       while (count > 0) {
-        idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
-        if (idx >= 0) {
-          count--;
-        } else {
-          // can not find enough delim
-          return string;
-        }
-      }
-      if (idx == 0) {
-        return UTF8String.EMPTY_UTF8;
+        matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
+        if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter.
+        else return string; // Cannot find enough delimiters in the string.
       }
-      byte[] bytes = new byte[idx];
-      copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
-      return UTF8String.fromBytes(bytes);
-
+      return string.substring(0, matchLength);
     } else {
-      int idx = string.numBytes() - delimiter.numBytes() + 1;
+      // Search right to left (note: the end code point is exclusive).
+      int matchLength = string.numChars() + 1;
       count = -count;
       while (count > 0) {
-        idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
-        if (idx >= 0) {
-          count--;
-        } else {
-          // can not find enough delim
-          return string;
-        }
+        matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
+        if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter.
+        else return string; // Cannot find enough delimiters in the string.
       }
-      if (idx + delimiter.numBytes() == string.numBytes()) {
-        return UTF8String.EMPTY_UTF8;
-      }
-      int size = string.numBytes() - delimiter.numBytes() - idx;
-      byte[] bytes = new byte[size];
-      copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
-        bytes, BYTE_ARRAY_OFFSET, size);
-      return UTF8String.fromBytes(bytes);
+      return string.substring(matchLength, string.numChars());
     }
   }
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
       return string.indexOf(substring, 0);
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
     }
     public static int execICU(final UTF8String string, final UTF8String substring,
         final int collationId) {
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -773,6 +773,17 @@ public UTF8String repeat(int times) {
     return UTF8String.fromBytes(newBytes);
   }
 
+  /**
+   * Returns the (default) position of the first occurrence of an empty substr in the current
+   * string from the specified position (0-based index).
+   *
+   * @param start the start position of the current string for searching (currently ignored)
+   * @return the position of the first occurrence of the empty substr (for now, always 0)
+   */
+  public int indexOfEmpty(int start) {
+    return 0; // TODO: Fix this behaviour (SPARK-48284)
+  }
+
   /**
    * Returns the position of the first occurrence of substr in
    * current string from the specified position (0-based index).
@@ -783,7 +794,7 @@ public UTF8String repeat(int times) {
    */
   public int indexOf(UTF8String v, int start) {
     if (v.numBytes() == 0) {
-      return 0;
+      return indexOfEmpty(start);
     }
 
     // locate to the start position.
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -635,8 +635,28 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
     assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
     assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
-    assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
-    assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
+    assertStringInstr("i̇", "i", "UNICODE_CI", 0);
+    assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0);
+    assertStringInstr("i̇", "İ", "UNICODE_CI", 1);
+    assertStringInstr("İ", "i", "UNICODE_CI", 0);
+    assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
+    assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
+    assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
+    assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
+    assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
+    assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
+    assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI
+    assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI
+    assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0);
+    assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
+    assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_BINARY_LCASE", 3);
+    assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
+    assertStringInstr("abİoi̇o", "\u0307o", "UTF8_BINARY_LCASE", 6);
+    assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
   }
 
   private void assertFindInSet(String word, String set, String collationName,
@@ -878,6 +898,32 @@ public void testSubstringIndex() throws SparkException {
     assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
     assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12");
+    assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12");
+    assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab");
+    assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
+    assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI
+    assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI
+    assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab");
+    assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
+    assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
+    assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
+    assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o");
   }
 
   private void assertStringTrim(

Original file line number	Diff line number	Diff line change
`@@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring`
`354`	`354`	`return string.indexOf(substring, 0);`
`355`	`355`	`}`
`356`	`356`	`public static int execLowercase(final UTF8String string, final UTF8String substring) {`
`357`		`- return string.toLowerCase().indexOf(substring.toLowerCase(), 0);`
	`357`	`+ return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);`
`358`	`358`	`}`
`359`	`359`	`public static int execICU(final UTF8String string, final UTF8String substring,`
`360`	`360`	`final int collationId) {`