[SPARK-48748][SQL] Cache numChars in UTF8String

uros-db · yaooqinn · commit 0487d7857ecf · 2024-07-01T10:53:11.000+08:00
### What changes were proposed in this pull request? Cache the `numChars` value in a thread-safe way. ### Why are the changes needed? Faster access to the `numChars()` method, which currently requires a scan of the entire UTF8String on every call. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #47142 from uros-db/cache-numchars. Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Signed-off-by: Kent Yao <yao@apache.org>
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -59,6 +59,7 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
   private Object base;
   private long offset;
   private int numBytes;
+  private volatile int numChars = -1;
 
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
@@ -254,6 +255,16 @@ public int numBytes() {
    * Returns the number of code points in it.
    */
   public int numChars() {
+    if (numChars == -1) numChars = getNumChars(); // -1 (initial value): not cached yet
+    return numChars;
   }
+
+  /**
+   * Private helper method to calculate the number of code points in the UTF-8 string. Counting
+   * the code points is a linear time operation, as we need to scan the entire UTF-8 string.
+   * Hence, this method should generally only be called once for non-empty UTF-8 strings.
+   */
+  private int getNumChars() {
     int len = 0;
     for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByte(i))) {
       len += 1;