Fix `ncodeunits(::Char)` for malformed `Char`

Seelengrab · Seelengrab · commit 9c9b6f61e0fe · 2024-04-09T13:17:46.000+02:00
The previous implementation assumed that all `Char` are well-formed,
which is of course not guaranteed to be the case (and which is also
correctly handled by the existing implementation). On top of that,
the new implementation is even faster, since counting the number of
trailing zeros has hardware support on a wide range of architectures.
diff --git a/base/char.jl b/base/char.jl
@@ -63,18 +63,13 @@ to an output stream, or `ncodeunits(string(c))` but computed efficiently.
     using `ncodeunits(string(c))`.
 """
 function ncodeunits(c::Char)
-    # All Char are 4 byte wide, and since unicode encoding
-    # doesn't have null bytes (except for \0), we can just
-    # count non-zero bytes
-    char_data = reinterpret(UInt32, c)
-    mask = 0xff % UInt32
-    nbytes = !iszero(char_data & mask)
-    Base.Cartesian.@nexprs 3 i -> begin
-        m <<= 0x8
-        nbytes += !iszero(char_data & mask)
-    end
-    # We have to account for `\0`, which is encoded as all zeros
-    nbytes + iszero(uc)
+    u = reinterpret(UInt32, c)
+
+    # We care about how many trailing bytes are all zero
+    n_nonzero_bytes = sizeof(UInt32) - div(trailing_zeros(u), 0x8)
+
+    # Take care of '\0', which has an all-zero bitpattern
+    n_nonzero_bytes + iszero(u)
 end
 
 """