From 7b55ff2b40e0b5c5448c83e67e20c45e2463f06a Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Sat, 30 Sep 2023 08:27:41 +0800 Subject: [PATCH 1/5] Use Go's implementation for utf8ValidateSlice Ported Go's utf8.Valid function for improved performance, and added some test cases from Go's tests. --- lib/std/unicode.zig | 151 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 135 insertions(+), 16 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 678e570e1bc6..09d97c0447b4 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -196,22 +196,119 @@ pub fn utf8CountCodepoints(s: []const u8) !usize { return len; } -pub fn utf8ValidateSlice(s: []const u8) bool { +// Ported from go, which is licensed under a BSD-3 license. +// https://golang.org/LICENSE +// +// https://golang.org/src/unicode/utf8/utf8.go +/// Returns true if the input consists entirely of UTF-8 runes +pub fn utf8ValidateSlice(input: []const u8) bool { + var p = input; + + // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. + while (p.len >= 8) { + const first32 = @as(u32, p[0]) | @as(u32, p[1])<<8 | @as(u32, p[2])<<16 | @as(u32, p[3])<<24; + const second32 = @as(u32, p[4]) | @as(u32, p[5])<<8 | @as(u32, p[6])<<16 | @as(u32, p[7])<<24; + if ((first32|second32)&0x80808080 != 0) { + // Found a non ASCII byte + break; + } + p = p[8..]; + } + + const RuneSelf = 0x80; + const locb = 0b10000000; + const hicb = 0b10111111; + + // These names of these constants are chosen to give nice alignment in the + // table below. The first nibble is an index into acceptRanges or F for + // special one-byte cases. The second nibble is the Rune length or the + // Status for the special one-byte case. 
+ const xx = 0xF1; // invalid: size 1 + const as = 0xF0; // ASCII: size 1 + const s1 = 0x02; // accept 0, size 2 + const s2 = 0x13; // accept 1, size 3 + const s3 = 0x03; // accept 0, size 3 + const s4 = 0x23; // accept 2, size 3 + const s5 = 0x34; // accept 3, size 4 + const s6 = 0x04; // accept 0, size 4 + const s7 = 0x44; // accept 4, size 4 + + const first = [256]u8{ + // 1 2 3 4 5 6 7 8 9 A B C D E F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F + // 1 2 3 4 5 6 7 8 9 A B C D E F + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF + xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF + s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF + s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF + s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF + }; + + var n = p.len; var i: usize = 0; - while (i < s.len) { - if (utf8ByteSequenceLength(s[i])) |cp_len| { - if (i + cp_len > s.len) { - return false; - } + while (i < n) { + const pi = p[i]; + if (pi < RuneSelf) { + i += 1; + continue; + } - if (std.meta.isError(utf8Decode(s[i .. 
i + cp_len]))) { - return false; - } - i += cp_len; - } else |_| { + const x = first[pi]; + if (x == xx) { + return false; // Illegal starter byte. + } + + const size = x & 7; + if (i+size > n) { + return false; // Short or invalid. + } + + var accept_lo: u8 = locb; + var accept_hi: u8 = hicb; + switch (x>>4) { + 0 => {}, + 1 => accept_lo = 0xA0, + 2 => accept_hi = 0x9F, + 3 => accept_lo = 0x90, + 4 => accept_hi = 0x8F, + else => unreachable, + } + + const c1 = p[i+1]; + if (c1 < accept_lo or accept_hi < c1) { return false; } + if (size == 2) { + i += size; + continue; + } + + const c2 = p[i+2]; + if (c2 < locb or hicb < c2) { + return false; + } + if (size == 3) { + i += size; + continue; + } + + const c3 = p[i+3]; + if (c3 < locb or hicb < c3) { + return false; + } + i += size; } + return true; } @@ -502,15 +599,37 @@ fn testUtf8ViewOk() !void { try testing.expect(it2.nextCodepoint() == null); } -test "bad utf8 slice" { - try comptime testBadUtf8Slice(); - try testBadUtf8Slice(); +test "validate slice" { + try comptime testValidateSlice(); + try testValidateSlice(); } -fn testBadUtf8Slice() !void { +fn testValidateSlice() !void { try testing.expect(utf8ValidateSlice("abc")); + try testing.expect(utf8ValidateSlice("abc\xdf\xbf")); + try testing.expect(utf8ValidateSlice("")); + try testing.expect(utf8ValidateSlice("a")); + try testing.expect(utf8ValidateSlice("abc")); + try testing.expect(utf8ValidateSlice("Ж")); + try testing.expect(utf8ValidateSlice("ЖЖ")); + try testing.expect(utf8ValidateSlice("брэд-ЛГТМ")); + try testing.expect(utf8ValidateSlice("☺☻☹")); + try testing.expect(utf8ValidateSlice("a\u{fffdb}")); + try testing.expect(utf8ValidateSlice("\xf4\x8f\xbf\xbf")); + try testing.expect(utf8ValidateSlice("abc\xdf\xbf")); + try testing.expect(!utf8ValidateSlice("abc\xc0")); try testing.expect(!utf8ValidateSlice("abc\xc0abc")); - try testing.expect(utf8ValidateSlice("abc\xdf\xbf")); + try testing.expect(!utf8ValidateSlice("aa\xe2")); + try 
testing.expect(!utf8ValidateSlice("\x42\xfa")); + try testing.expect(!utf8ValidateSlice("\x42\xfa\x43")); + try testing.expect(!utf8ValidateSlice("abc\xc0")); + try testing.expect(!utf8ValidateSlice("abc\xc0abc")); + try testing.expect(!utf8ValidateSlice("\xf4\x90\x80\x80")); + try testing.expect(!utf8ValidateSlice("\xf7\xbf\xbf\xbf")); + try testing.expect(!utf8ValidateSlice("\xfb\xbf\xbf\xbf\xbf")); + try testing.expect(!utf8ValidateSlice("\xc0\x80")); + try testing.expect(!utf8ValidateSlice("\xed\xa0\x80")); + try testing.expect(!utf8ValidateSlice("\xed\xbf\xbf")); } test "valid utf8" { From 4f5f4d8910756443a57c38fb062a1777bc82669c Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Sat, 30 Sep 2023 09:59:48 +0800 Subject: [PATCH 2/5] zig fmt --- lib/std/unicode.zig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 09d97c0447b4..f42c7e9f1549 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -206,9 +206,9 @@ pub fn utf8ValidateSlice(input: []const u8) bool { // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. while (p.len >= 8) { - const first32 = @as(u32, p[0]) | @as(u32, p[1])<<8 | @as(u32, p[2])<<16 | @as(u32, p[3])<<24; - const second32 = @as(u32, p[4]) | @as(u32, p[5])<<8 | @as(u32, p[6])<<16 | @as(u32, p[7])<<24; - if ((first32|second32)&0x80808080 != 0) { + const first32 = @as(u32, p[0]) | @as(u32, p[1]) << 8 | @as(u32, p[2]) << 16 | @as(u32, p[3]) << 24; + const second32 = @as(u32, p[4]) | @as(u32, p[5]) << 8 | @as(u32, p[6]) << 16 | @as(u32, p[7]) << 24; + if ((first32 | second32) & 0x80808080 != 0) { // Found a non ASCII byte break; } @@ -269,13 +269,13 @@ pub fn utf8ValidateSlice(input: []const u8) bool { } const size = x & 7; - if (i+size > n) { + if (i + size > n) { return false; // Short or invalid. 
} var accept_lo: u8 = locb; var accept_hi: u8 = hicb; - switch (x>>4) { + switch (x >> 4) { 0 => {}, 1 => accept_lo = 0xA0, 2 => accept_hi = 0x9F, @@ -284,7 +284,7 @@ pub fn utf8ValidateSlice(input: []const u8) bool { else => unreachable, } - const c1 = p[i+1]; + const c1 = p[i + 1]; if (c1 < accept_lo or accept_hi < c1) { return false; } @@ -293,7 +293,7 @@ pub fn utf8ValidateSlice(input: []const u8) bool { continue; } - const c2 = p[i+2]; + const c2 = p[i + 2]; if (c2 < locb or hicb < c2) { return false; } @@ -302,7 +302,7 @@ pub fn utf8ValidateSlice(input: []const u8) bool { continue; } - const c3 = p[i+3]; + const c3 = p[i + 3]; if (c3 < locb or hicb < c3) { return false; } From c33f2fcbd848ee7e59c2f2ea76a24fecc33fbe1a Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Sat, 30 Sep 2023 14:02:55 +0800 Subject: [PATCH 3/5] Optimize implementation Previous version was slower than the existing implementation in ReleaseFast for long UTF8 strings. This version is now faster in all tested cases (short/long ascii/UTF8), in all release modes --- lib/std/unicode.zig | 72 +++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 41 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index f42c7e9f1549..e056f918e2e9 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -206,8 +206,8 @@ pub fn utf8ValidateSlice(input: []const u8) bool { // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. 
while (p.len >= 8) { - const first32 = @as(u32, p[0]) | @as(u32, p[1]) << 8 | @as(u32, p[2]) << 16 | @as(u32, p[3]) << 24; - const second32 = @as(u32, p[4]) | @as(u32, p[5]) << 8 | @as(u32, p[6]) << 16 | @as(u32, p[7]) << 24; + const first32 = mem.readIntLittle(u32, p[0..4]); + const second32 = mem.readIntLittle(u32, p[4..8]); if ((first32 | second32) & 0x80808080 != 0) { // Found a non ASCII byte break; @@ -215,7 +215,7 @@ pub fn utf8ValidateSlice(input: []const u8) bool { p = p[8..]; } - const RuneSelf = 0x80; + const rune_self = 0x80; const locb = 0b10000000; const hicb = 0b10111111; @@ -233,32 +233,19 @@ pub fn utf8ValidateSlice(input: []const u8) bool { const s6 = 0x04; // accept 0, size 4 const s7 = 0x44; // accept 4, size 4 - const first = [256]u8{ - // 1 2 3 4 5 6 7 8 9 A B C D E F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F - as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F - // 1 2 3 4 5 6 7 8 9 A B C D E F - xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F - xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F - xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF - xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF - xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF - s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF - s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, 
s3, s3, // 0xE0-0xEF - s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF + // information about the first byte in a UTF-8 sequence. + const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{ + xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, + s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, + s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, + s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, }; var n = p.len; var i: usize = 0; while (i < n) { const pi = p[i]; - if (pi < RuneSelf) { + if (pi < rune_self) { i += 1; continue; } @@ -288,25 +275,28 @@ pub fn utf8ValidateSlice(input: []const u8) bool { if (c1 < accept_lo or accept_hi < c1) { return false; } - if (size == 2) { - i += size; - continue; - } - - const c2 = p[i + 2]; - if (c2 < locb or hicb < c2) { - return false; - } - if (size == 3) { - i += size; - continue; - } - - const c3 = p[i + 3]; - if (c3 < locb or hicb < c3) { - return false; + switch (size) { + 2 => i += 2, + 3 => { + const c2 = p[i + 2]; + if (c2 < locb or hicb < c2) { + return false; + } + i += 3; + }, + 4 => { + const c2 = p[i + 2]; + if (c2 < locb or hicb < c2) { + return false; + } + const c3 = p[i + 3]; + if (c3 < locb or hicb < c3) { + return false; + } + i += 4; + }, + else => unreachable, } - i += size; } return true; From 5bb557d737c51573ccb63a68a9ccb9e7df45d542 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Thu, 5 Oct 2023 11:49:04 +0800 Subject: [PATCH 4/5] remove license statement and other Goisms --- lib/std/unicode.zig | 68 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index e056f918e2e9..c121d85bc919 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -196,33 +196,29 @@ pub fn utf8CountCodepoints(s: []const u8) !usize { return len; } -// Ported from go, which is licensed under a BSD-3 license. 
-// https://golang.org/LICENSE -// -// https://golang.org/src/unicode/utf8/utf8.go -/// Returns true if the input consists entirely of UTF-8 runes +/// Returns true if the input consists entirely of UTF-8 codepoints pub fn utf8ValidateSlice(input: []const u8) bool { - var p = input; + var remaining = input; // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. - while (p.len >= 8) { - const first32 = mem.readIntLittle(u32, p[0..4]); - const second32 = mem.readIntLittle(u32, p[4..8]); + while (remaining.len >= 8) { + const first32 = mem.readIntLittle(u32, remaining[0..4]); + const second32 = mem.readIntLittle(u32, remaining[4..8]); if ((first32 | second32) & 0x80808080 != 0) { // Found a non ASCII byte break; } - p = p[8..]; + remaining = remaining[8..]; } - const rune_self = 0x80; - const locb = 0b10000000; - const hicb = 0b10111111; + // default lowest and highest continuation byte + const lo_cb = 0b10000000; + const hi_cb = 0b10111111; - // These names of these constants are chosen to give nice alignment in the - // table below. The first nibble is an index into acceptRanges or F for - // special one-byte cases. The second nibble is the Rune length or the - // Status for the special one-byte case. + const min_non_ascii_codepoint = 0x80; + + // The first nibble is used to identify the continuation byte range to + // accept. The second nibble is the size. const xx = 0xF1; // invalid: size 1 const as = 0xF0; // ASCII: size 1 const s1 = 0x02; // accept 0, size 2 @@ -233,7 +229,7 @@ pub fn utf8ValidateSlice(input: []const u8) bool { const s6 = 0x04; // accept 0, size 4 const s7 = 0x44; // accept 4, size 4 - // information about the first byte in a UTF-8 sequence. + // Information about the first byte in a UTF-8 sequence. 
const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{ xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, @@ -241,28 +237,31 @@ pub fn utf8ValidateSlice(input: []const u8) bool { s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, }; - var n = p.len; + var n = remaining.len; var i: usize = 0; while (i < n) { - const pi = p[i]; - if (pi < rune_self) { + const first_byte = remaining[i]; + if (first_byte < min_non_ascii_codepoint) { i += 1; continue; } - const x = first[pi]; - if (x == xx) { + const info = first[first_byte]; + if (info == xx) { return false; // Illegal starter byte. } - const size = x & 7; + const size = info & 7; if (i + size > n) { return false; // Short or invalid. } - var accept_lo: u8 = locb; - var accept_hi: u8 = hicb; - switch (x >> 4) { + // Figure out the acceptable low and high continuation bytes, starting + // with our defaults. + var accept_lo: u8 = lo_cb; + var accept_hi: u8 = hi_cb; + + switch (info >> 4) { 0 => {}, 1 => accept_lo = 0xA0, 2 => accept_hi = 0x9F, @@ -271,26 +270,27 @@ pub fn utf8ValidateSlice(input: []const u8) bool { else => unreachable, } - const c1 = p[i + 1]; + const c1 = remaining[i + 1]; if (c1 < accept_lo or accept_hi < c1) { return false; } + switch (size) { 2 => i += 2, 3 => { - const c2 = p[i + 2]; - if (c2 < locb or hicb < c2) { + const c2 = remaining[i + 2]; + if (c2 < lo_cb or hi_cb < c2) { return false; } i += 3; }, 4 => { - const c2 = p[i + 2]; - if (c2 < locb or hicb < c2) { + const c2 = remaining[i + 2]; + if (c2 < lo_cb or hi_cb < c2) { return false; } - const c3 = p[i + 3]; - if (c3 < locb or hicb < c3) { + const c3 = remaining[i + 3]; + if (c3 < lo_cb or hi_cb < c3) { return false; } i += 4; From 5b3df76152e37da0bdf0ccbaaf6d7184ac12b33c Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Fri, 6 Oct 2023 13:02:29 +0800 Subject: [PATCH 5/5] use vector for ASCII fast-path --- lib/std/unicode.zig | 25 
+++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index c121d85bc919..5811d6f768a5 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -200,15 +200,21 @@ pub fn utf8CountCodepoints(s: []const u8) !usize { pub fn utf8ValidateSlice(input: []const u8) bool { var remaining = input; - // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. - while (remaining.len >= 8) { - const first32 = mem.readIntLittle(u32, remaining[0..4]); - const second32 = mem.readIntLittle(u32, remaining[4..8]); - if ((first32 | second32) & 0x80808080 != 0) { + const V_len = comptime std.simd.suggestVectorSize(usize) orelse 1; + const V = @Vector(V_len, usize); + const u8s_in_vector = @sizeOf(usize) * V_len; + + // Fast path. Check for and skip ASCII characters at the start of the input. + while (remaining.len >= u8s_in_vector) { + const chunk: V = @bitCast(remaining[0..u8s_in_vector].*); + const swapped = mem.littleToNative(V, chunk); + const reduced = @reduce(.Or, swapped); + const mask: usize = @bitCast([1]u8{0x80} ** @sizeOf(usize)); + if (reduced & mask != 0) { // Found a non ASCII byte break; } - remaining = remaining[8..]; + remaining = remaining[u8s_in_vector..]; } // default lowest and highest continuation byte @@ -592,6 +598,13 @@ fn testUtf8ViewOk() !void { test "validate slice" { try comptime testValidateSlice(); try testValidateSlice(); + + // The ASCII fast path skips input in chunks whose size varies with the + // recommended vector size. Let's make sure we're chunking correctly. + const str = [_]u8{'a'} ** 550 ++ "\xc0"; + for (0..str.len - 3) |i| { + try testing.expect(!utf8ValidateSlice(str[i..])); + } } fn testValidateSlice() !void { try testing.expect(utf8ValidateSlice("abc"));