@@ -6,7 +6,7 @@ const mem = std.mem;
66
77/// Returns how many bytes the UTF-8 representation would require
88/// for the given codepoint.
9- pub fn utf8CodepointSequenceLength (c : u32 ) ! u3 {
9+ pub fn utf8CodepointSequenceLength (c : u21 ) ! u3 {
1010 if (c < 0x80 ) return @as (u3 , 1 );
1111 if (c < 0x800 ) return @as (u3 , 2 );
1212 if (c < 0x10000 ) return @as (u3 , 3 );
@@ -18,19 +18,21 @@ pub fn utf8CodepointSequenceLength(c: u32) !u3 {
1818/// returns a number 1-4 indicating the total length of the codepoint in bytes.
1919/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    // Classify the lead byte by its value range rather than by counting
    // leading one bits. Anything not listed is either a continuation byte
    // (10xxxxxx) or has five or more leading ones, and cannot start a sequence.
    return switch (first_byte) {
        0b00000000...0b01111111 => 1, // 0xxxxxxx
        0b11000000...0b11011111 => 2, // 110xxxxx
        0b11100000...0b11101111 => 3, // 1110xxxx
        0b11110000...0b11110111 => 4, // 11110xxx
        else => error.Utf8InvalidStartByte,
    };
}
2729
2830/// Encodes the given codepoint into a UTF-8 byte sequence.
2931/// c: the codepoint.
3032/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
3133/// Errors: if c cannot be encoded in UTF-8.
3234/// Returns: the number of bytes written to out.
33- pub fn utf8Encode (c : u32 , out : []u8 ) ! u3 {
35+ pub fn utf8Encode (c : u21 , out : []u8 ) ! u3 {
3436 const length = try utf8CodepointSequenceLength (c );
3537 assert (out .len >= length );
3638 switch (length ) {
@@ -66,9 +68,9 @@ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error
6668/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
6769/// If you already know the length at comptime, you can call one of
6870/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
69- pub fn utf8Decode (bytes : []const u8 ) Utf8DecodeError ! u32 {
71+ pub fn utf8Decode (bytes : []const u8 ) Utf8DecodeError ! u21 {
7072 return switch (bytes .len ) {
71- 1 = > @as (u32 , bytes [0 ]),
73+ 1 = > @as (u21 , bytes [0 ]),
7274 2 = > utf8Decode2 (bytes ),
7375 3 = > utf8Decode3 (bytes ),
7476 4 = > utf8Decode4 (bytes ),
@@ -80,10 +82,10 @@ const Utf8Decode2Error = error{
8082 Utf8ExpectedContinuation ,
8183 Utf8OverlongEncoding ,
8284};
83- pub fn utf8Decode2 (bytes : []const u8 ) Utf8Decode2Error ! u32 {
85+ pub fn utf8Decode2 (bytes : []const u8 ) Utf8Decode2Error ! u21 {
8486 assert (bytes .len == 2 );
8587 assert (bytes [0 ] & 0b11100000 == 0b11000000 );
86- var value : u32 = bytes [0 ] & 0b00011111 ;
88+ var value : u21 = bytes [0 ] & 0b00011111 ;
8789
8890 if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
8991 value <<= 6 ;
@@ -99,10 +101,10 @@ const Utf8Decode3Error = error{
99101 Utf8OverlongEncoding ,
100102 Utf8EncodesSurrogateHalf ,
101103};
102- pub fn utf8Decode3 (bytes : []const u8 ) Utf8Decode3Error ! u32 {
104+ pub fn utf8Decode3 (bytes : []const u8 ) Utf8Decode3Error ! u21 {
103105 assert (bytes .len == 3 );
104106 assert (bytes [0 ] & 0b11110000 == 0b11100000 );
105- var value : u32 = bytes [0 ] & 0b00001111 ;
107+ var value : u21 = bytes [0 ] & 0b00001111 ;
106108
107109 if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
108110 value <<= 6 ;
@@ -123,10 +125,10 @@ const Utf8Decode4Error = error{
123125 Utf8OverlongEncoding ,
124126 Utf8CodepointTooLarge ,
125127};
126- pub fn utf8Decode4 (bytes : []const u8 ) Utf8Decode4Error ! u32 {
128+ pub fn utf8Decode4 (bytes : []const u8 ) Utf8Decode4Error ! u21 {
127129 assert (bytes .len == 4 );
128130 assert (bytes [0 ] & 0b11111000 == 0b11110000 );
129- var value : u32 = bytes [0 ] & 0b00000111 ;
131+ var value : u21 = bytes [0 ] & 0b00000111 ;
130132
131133 if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
132134 value <<= 6 ;
@@ -222,11 +224,11 @@ pub const Utf8Iterator = struct {
222224 return it .bytes [it .i - cp_len .. it .i ];
223225 }
224226
225- pub fn nextCodepoint (it : * Utf8Iterator ) ? u32 {
227+ pub fn nextCodepoint (it : * Utf8Iterator ) ? u21 {
226228 const slice = it .nextCodepointSlice () orelse return null ;
227229
228230 switch (slice .len ) {
229- 1 = > return @as (u32 , slice [0 ]),
231+ 1 = > return @as (u21 , slice [0 ]),
230232 2 = > return utf8Decode2 (slice ) catch unreachable ,
231233 3 = > return utf8Decode3 (slice ) catch unreachable ,
232234 4 = > return utf8Decode4 (slice ) catch unreachable ,
@@ -246,19 +248,19 @@ pub const Utf16LeIterator = struct {
246248 };
247249 }
248250
249- pub fn nextCodepoint (it : * Utf16LeIterator ) ! ? u32 {
251+ pub fn nextCodepoint (it : * Utf16LeIterator ) ! ? u21 {
250252 assert (it .i <= it .bytes .len );
251253 if (it .i == it .bytes .len ) return null ;
252- const c0 : u32 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
253- if (c0 & ~ @as (u32 , 0x03ff ) == 0xd800 ) {
254+ const c0 : u21 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
255+ if (c0 & ~ @as (u21 , 0x03ff ) == 0xd800 ) {
254256 // surrogate pair
255257 it .i += 2 ;
256258 if (it .i >= it .bytes .len ) return error .DanglingSurrogateHalf ;
257- const c1 : u32 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
258- if (c1 & ~ @as (u32 , 0x03ff ) != 0xdc00 ) return error .ExpectedSecondSurrogateHalf ;
259+ const c1 : u21 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
260+ if (c1 & ~ @as (u21 , 0x03ff ) != 0xdc00 ) return error .ExpectedSecondSurrogateHalf ;
259261 it .i += 2 ;
260262 return 0x10000 + (((c0 & 0x03ff ) << 10 ) | (c1 & 0x03ff ));
261- } else if (c0 & ~ @as (u32 , 0x03ff ) == 0xdc00 ) {
263+ } else if (c0 & ~ @as (u21 , 0x03ff ) == 0xdc00 ) {
262264 return error .UnexpectedSecondSurrogateHalf ;
263265 } else {
264266 it .i += 2 ;
@@ -302,10 +304,10 @@ fn testUtf8EncodeError() void {
302304 testErrorEncode (0xd800 , array [0.. ], error .Utf8CannotEncodeSurrogateHalf );
303305 testErrorEncode (0xdfff , array [0.. ], error .Utf8CannotEncodeSurrogateHalf );
304306 testErrorEncode (0x110000 , array [0.. ], error .CodepointTooLarge );
305- testErrorEncode (0xffffffff , array [0.. ], error .CodepointTooLarge );
307+ testErrorEncode (0x1fffff , array [0.. ], error .CodepointTooLarge );
306308}
307309
/// Test helper: encodes `codePoint` into `array` and checks that the attempt
/// fails with `expectedErr`.
fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void {
    const outcome = utf8Encode(codePoint, array);
    testing.expectError(expectedErr, outcome);
}
311313
@@ -453,11 +455,11 @@ fn testError(bytes: []const u8, expected_err: anyerror) void {
453455 testing .expectError (expected_err , testDecode (bytes ));
454456}
455457
/// Test helper: decodes `bytes` with `testDecode` and checks that the result
/// is `expected_codepoint`. A decode error is treated as unreachable.
fn testValid(bytes: []const u8, expected_codepoint: u21) void {
    const decoded = testDecode(bytes) catch unreachable;
    testing.expect(decoded == expected_codepoint);
}
459461
460- fn testDecode (bytes : []const u8 ) ! u32 {
462+ fn testDecode (bytes : []const u8 ) ! u21 {
461463 const length = try utf8ByteSequenceLength (bytes [0 ]);
462464 if (bytes .len < length ) return error .UnexpectedEof ;
463465 testing .expect (bytes .len == length );
@@ -555,9 +557,8 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16
555557 const short = @intCast (u16 , codepoint );
556558 try result .append (mem .nativeToLittle (u16 , short ));
557559 } else {
558- const short = @intCast (u16 , codepoint - 0x10000 );
559- const high = (short >> 10 ) + 0xD800 ;
560- const low = (short & 0x3FF ) + 0xDC00 ;
560+ const high = @intCast (u16 , (codepoint - 0x10000 ) >> 10 ) + 0xD800 ;
561+ const low = @intCast (u16 , codepoint & 0x3FF ) + 0xDC00 ;
561562 var out : [2 ]u16 = undefined ;
562563 out [0 ] = mem .nativeToLittle (u16 , high );
563564 out [1 ] = mem .nativeToLittle (u16 , low );
@@ -575,48 +576,50 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
575576 var dest_i : usize = 0 ;
576577 var src_i : usize = 0 ;
577578 while (src_i < utf8 .len ) {
578- const byte = utf8 [src_i ];
579- const n = @clz (u8 , ~ byte );
580- switch (n ) {
581- 0 = > {
582- utf16le [dest_i ] = byte ;
583- dest_i += 1 ;
584- src_i += 1 ;
585- continue ;
586- },
587- 2 , 3 , 4 = > {
588- const next_src_i = src_i + n ;
589- const codepoint = utf8Decode (utf8 [src_i .. next_src_i ]) catch return error .InvalidUtf8 ;
590- if (codepoint < 0x10000 ) {
591- const short = @intCast (u16 , codepoint );
592- utf16le [dest_i ] = mem .nativeToLittle (u16 , short );
593- dest_i += 1 ;
594- } else {
595- const short = @intCast (u16 , codepoint - 0x10000 );
596- const high = (short >> 10 ) + 0xD800 ;
597- const low = (short & 0x3FF ) + 0xDC00 ;
598- utf16le [dest_i ] = mem .nativeToLittle (u16 , high );
599- utf16le [dest_i + 1 ] = mem .nativeToLittle (u16 , low );
600- dest_i += 2 ;
601- }
602- src_i = next_src_i ;
603- },
604- else = > return error .InvalidUtf8 ,
579+ const n = utf8ByteSequenceLength (utf8 [src_i ]) catch return error .InvalidUtf8 ;
580+ const next_src_i = src_i + n ;
581+ const codepoint = utf8Decode (utf8 [src_i .. next_src_i ]) catch return error .InvalidUtf8 ;
582+ if (codepoint < 0x10000 ) {
583+ const short = @intCast (u16 , codepoint );
584+ utf16le [dest_i ] = mem .nativeToLittle (u16 , short );
585+ dest_i += 1 ;
586+ } else {
587+ const high = @intCast (u16 , (codepoint - 0x10000 ) >> 10 ) + 0xD800 ;
588+ const low = @intCast (u16 , codepoint & 0x3FF ) + 0xDC00 ;
589+ utf16le [dest_i ] = mem .nativeToLittle (u16 , high );
590+ utf16le [dest_i + 1 ] = mem .nativeToLittle (u16 , low );
591+ dest_i += 2 ;
605592 }
593+ src_i = next_src_i ;
606594 }
607595 return dest_i ;
608596}
609597
test "utf8ToUtf16Le" {
    // Both inputs are above U+FFFF, so each is expected to come out as a
    // surrogate pair: two u16 units, compared here as little-endian bytes.
    var units = [_]u16{ 0, 0 };
    {
        // U+10437 -> D801 DC37
        const written = try utf8ToUtf16Le(units[0..], "𐐷");
        testing.expectEqual(@as(usize, 2), written);
        testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", @sliceToBytes(units[0..]));
    }
    {
        // U+10FFFF -> DBFF DFFF
        const written = try utf8ToUtf16Le(units[0..], "\u{10FFFF}");
        testing.expectEqual(@as(usize, 2), written);
        testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", @sliceToBytes(units[0..]));
    }
}
616611
test "utf8ToUtf16LeWithNull" {
    // Each case uses its own fixed buffer and expects the surrogate pair
    // followed by one zero u16 (the trailing "\x00\x00").
    {
        // U+10437 -> D801 DC37
        var backing: [128]u8 = undefined;
        var fixed = std.heap.FixedBufferAllocator.init(backing[0..]);
        const encoded = try utf8ToUtf16LeWithNull(&fixed.allocator, "𐐷");
        testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc\x00\x00", @sliceToBytes(encoded[0..]));
    }
    {
        // U+10FFFF -> DBFF DFFF
        var backing: [128]u8 = undefined;
        var fixed = std.heap.FixedBufferAllocator.init(backing[0..]);
        const encoded = try utf8ToUtf16LeWithNull(&fixed.allocator, "\u{10FFFF}");
        testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf\x00\x00", @sliceToBytes(encoded[0..]));
    }
}
0 commit comments