@@ -196,22 +196,115 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {
196196 return len ;
197197}
198198
/// Returns true if `input` is a well-formed UTF-8 byte sequence.
///
/// Rejected as malformed: bytes that can never start a sequence (stray
/// continuation bytes, 0xC0, 0xC1, 0xF5...0xFF), sequences cut short by the
/// end of the input, continuation bytes outside 0x80...0xBF, overlong 3- and
/// 4-byte encodings, surrogate halves (U+D800...U+DFFF) and code points above
/// U+10FFFF. The empty slice is valid.
pub fn utf8ValidateSlice(input: []const u8) bool {
    var remaining = input;

    const V_len = comptime std.simd.suggestVectorSize(usize) orelse 1;
    const V = @Vector(V_len, usize);
    const u8s_in_vector = @sizeOf(usize) * V_len;

    // 0x80 replicated into every byte of a usize. Because every byte of the
    // mask is identical, the check below does not depend on byte order.
    const high_bits: usize = @bitCast([1]u8{0x80} ** @sizeOf(usize));

    // Fast path. Check for and skip ASCII characters at the start of the
    // input, one vector-sized chunk at a time. OR-ing all lanes together
    // keeps the high bit of any non-ASCII byte somewhere in the result.
    while (remaining.len >= u8s_in_vector) {
        const chunk: V = @bitCast(remaining[0..u8s_in_vector].*);
        if ((@reduce(.Or, chunk) & high_bits) != 0) {
            // Found a non-ASCII byte; the scalar loop below takes over from
            // the start of this chunk.
            break;
        }
        remaining = remaining[u8s_in_vector..];
    }

    // Default lowest and highest continuation byte.
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // Each table entry packs two fields: the high nibble selects the range
    // accepted for the *second* byte of the sequence (see the switch below),
    // the low nibble is the total sequence length in bytes.
    const xx = 0xF1; // invalid starter byte
    const as = 0xF0; // ASCII; never consulted, ASCII is handled before the lookup
    const s1 = 0x02; // accept 0, size 2
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    const s4 = 0x23; // accept 2, size 3
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence, indexed by that
    // byte: 0x00...0x7F ASCII, 0x80...0xBF invalid, then 0xC0...0xFF below.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        const size = info & 7;
        if (i + size > n) {
            return false; // Sequence runs past the end of the input.
        }

        // Figure out the acceptable low and high bounds for the second byte,
        // starting with the defaults. The narrowed ranges are what exclude
        // overlong forms, surrogates and code points above U+10FFFF.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0, // after 0xE0
            2 => accept_hi = 0x9F, // after 0xED
            3 => accept_lo = 0x90, // after 0xF0
            4 => accept_hi = 0x8F, // after 0xF4
            else => unreachable, // `xx` was rejected above
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        // Any further bytes are plain continuation bytes.
        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable, // the table only holds sizes 2, 3 and 4 here
        }
    }

    return true;
}
217310
@@ -502,15 +595,44 @@ fn testUtf8ViewOk() !void {
502595 try testing .expect (it2 .nextCodepoint () == null );
503596}
504597
test "validate slice" {
    try comptime testValidateSlice();
    try testValidateSlice();

    // utf8ValidateSlice skips leading ASCII in chunks whose size depends on
    // the target's suggested vector size. Start the slice at every offset so
    // the byte that ends the ASCII run lands at every position relative to a
    // chunk boundary.
    const ascii_len = 550;

    // ASCII run followed by an invalid starter byte: always rejected.
    const bad = [_]u8{'a'} ** ascii_len ++ "\xc0";
    for (0..bad.len) |i| {
        try testing.expect(!utf8ValidateSlice(bad[i..]));
    }

    // ASCII run alone (down to the empty slice), and the same run followed by
    // a valid two-byte sequence: always accepted.
    const good = [_]u8{'a'} ** ascii_len ++ "\xdf\xbf";
    for (0..ascii_len + 1) |i| {
        try testing.expect(utf8ValidateSlice(good[i..ascii_len]));
        try testing.expect(utf8ValidateSlice(good[i..]));
    }
}
/// Checks utf8ValidateSlice against short well-formed and malformed inputs.
/// Called both at comptime and at runtime by the "validate slice" test.
fn testValidateSlice() !void {
    // Well-formed input.
    try testing.expect(utf8ValidateSlice(""));
    try testing.expect(utf8ValidateSlice("a"));
    try testing.expect(utf8ValidateSlice("abc"));
    try testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
    try testing.expect(utf8ValidateSlice("Ж"));
    try testing.expect(utf8ValidateSlice("ЖЖ"));
    try testing.expect(utf8ValidateSlice("брэд-ЛГТМ"));
    try testing.expect(utf8ValidateSlice("☺☻☹"));
    try testing.expect(utf8ValidateSlice("a\u{fffdb}"));
    try testing.expect(utf8ValidateSlice("\xf4\x8f\xbf\xbf")); // U+10FFFF

    // Edges of the restricted second-byte ranges, on the valid side.
    try testing.expect(utf8ValidateSlice("\xe0\xa0\x80")); // U+0800
    try testing.expect(utf8ValidateSlice("\xed\x9f\xbf")); // U+D7FF
    try testing.expect(utf8ValidateSlice("\xee\x80\x80")); // U+E000
    try testing.expect(utf8ValidateSlice("\xf0\x90\x80\x80")); // U+10000

    // Illegal starter bytes.
    try testing.expect(!utf8ValidateSlice("abc\xc0"));
    try testing.expect(!utf8ValidateSlice("abc\xc0abc"));
    try testing.expect(!utf8ValidateSlice("\xc0\x80"));
    try testing.expect(!utf8ValidateSlice("\x80"));
    try testing.expect(!utf8ValidateSlice("\x42\xfa"));
    try testing.expect(!utf8ValidateSlice("\x42\xfa\x43"));
    try testing.expect(!utf8ValidateSlice("\xf7\xbf\xbf\xbf"));
    try testing.expect(!utf8ValidateSlice("\xfb\xbf\xbf\xbf\xbf"));

    // Sequences cut short by the end of the input.
    try testing.expect(!utf8ValidateSlice("aa\xe2"));
    try testing.expect(!utf8ValidateSlice("\xf0\x90\x80"));

    // Bad trailing continuation byte.
    try testing.expect(!utf8ValidateSlice("\xe2\x82\x28"));

    // Overlong 3- and 4-byte encodings.
    try testing.expect(!utf8ValidateSlice("\xe0\x80\x80"));
    try testing.expect(!utf8ValidateSlice("\xe0\x9f\xbf"));
    try testing.expect(!utf8ValidateSlice("\xf0\x80\x80\x80"));
    try testing.expect(!utf8ValidateSlice("\xf0\x8f\xbf\xbf"));

    // Surrogate halves.
    try testing.expect(!utf8ValidateSlice("\xed\xa0\x80"));
    try testing.expect(!utf8ValidateSlice("\xed\xbf\xbf"));

    // Above U+10FFFF.
    try testing.expect(!utf8ValidateSlice("\xf4\x90\x80\x80"));
}
515637
516638test "valid utf8" {
0 commit comments