Skip to content

Commit 8208db5

Browse files
committed
std.os.windows: Rewrite ReadConsoleWithUtf16ToUtf8Conversion() using fixed-size temporary buffer
1 parent fffbb92 commit 8208db5

File tree

1 file changed

+132
-107
lines changed

1 file changed

+132
-107
lines changed

lib/std/os/windows.zig

Lines changed: 132 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -564,125 +564,150 @@ fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) Rea
564564
error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
565565
else => return error.Unexpected,
566566
};
567-
// The temporary buffer can be huge, so keep it away from stack
568-
var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init();
569-
defer heap_allocator.deinit();
570-
const allocator: std.mem.Allocator = heap_allocator.allocator();
571-
var temp_buffer: []u8 = allocator.alloc(u8, buffer.len) catch @panic("Out of memory.");
572-
defer allocator.free(temp_buffer);
573-
567+
var temp_buffer: [1024]u8 = undefined;
574568
var bytes_read: DWORD = 0;
575569
var reached_end_of_line: bool = false;
576-
577-
// Try flushing leftover UTF-8 bytes first (one codepoint at most)
578-
if (handle_data.utf8_buffer.bytes_used != 0) {
579-
// LF will only appear at the first byte and there will be only one byte in the buffer
580-
if (handle_data.utf8_buffer.data[0] == 0x0A) {
581-
assert(handle_data.utf8_buffer.bytes_used == 1);
582-
reached_end_of_line = true;
583-
}
584-
// Is there enough space for all bytes in UTF-8 buffer?
585-
const has_enough_space: bool = buffer.len >= handle_data.utf8_buffer.bytes_used;
586-
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len;
587-
for (0..max_bytes_to_read) |index| {
588-
temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
589-
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
590-
handle_data.utf8_buffer.front_index +%= 1;
591-
}
592-
bytes_read += @truncate(max_bytes_to_read);
593-
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
594-
if (has_enough_space) {
595-
// UTF-8 buffer is now empty, we can safely reset front_index to zero
596-
handle_data.utf8_buffer.front_index = 0;
597-
} else {
598-
return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
599-
}
600-
// LF ends a console read immediately
601-
if (reached_end_of_line) {
602-
return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
603-
}
604-
}
605-
assert(handle_data.utf8_buffer.front_index == 0);
570+
const TruncateState = enum {
571+
do_not_truncate,
572+
truncate_after_SUB,
573+
truncate_all,
574+
};
575+
var truncate_state: TruncateState = .do_not_truncate;
606576
while (bytes_read < buffer.len) {
607-
// Read only one code unit each loop
608-
var utf16_code_unit: u16 = undefined;
609-
var utf16_code_units_read: DWORD = undefined;
610-
if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
611-
switch (kernel32.GetLastError()) {
612-
.INVALID_HANDLE => return error.NotOpenForReading,
613-
else => |err| return unexpectedError(err),
614-
}
577+
const remaining_buffer: []u8 = buffer[bytes_read..buffer.len];
578+
var has_enough_space_in_remaining_buffer: bool = undefined;
579+
var bytes_read_into_temp_buffer: DWORD = 0;
580+
var truncate_index: DWORD = undefined;
581+
// If a SUB character is encountered in a previous loop, truncate everything in this loop
582+
if (truncate_state == .truncate_after_SUB) {
583+
truncate_state = .truncate_all;
615584
}
616-
if (utf16_code_unit == 0x000D) {
617-
// CR should always be followed by an LF, so just discard it
618-
continue;
619-
} else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
620-
// When a high surrogate is encountered, store it into the UTF-16 buffer
621-
assert(handle_data.utf16_buffer.code_units_used == 0);
622-
handle_data.utf16_buffer.data[0] = utf16_code_unit;
623-
handle_data.utf16_buffer.code_units_used = 1;
624-
continue;
625-
} else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
626-
// When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
627-
if (!(utf16_code_units_read == 1 and
628-
handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
629-
unreachable;
585+
// Try flushing leftover UTF-8 bytes first (one codepoint at most)
586+
if (handle_data.utf8_buffer.bytes_used != 0) {
587+
if (handle_data.utf8_buffer.data[0] == 0x0A) {
588+
assert(handle_data.utf8_buffer.bytes_used == 1);
589+
reached_end_of_line = true;
590+
} else if (handle_data.utf8_buffer.data[0] == 0x1A) {
591+
assert(handle_data.utf8_buffer.bytes_used == 1);
592+
// Truncate after SUB character in this loop if we never truncated in previous loops
593+
if (truncate_state == .do_not_truncate) {
594+
truncate_state = .truncate_after_SUB;
595+
truncate_index = 1;
596+
}
597+
}
598+
// Is there enough space for all bytes in UTF-8 buffer?
599+
const has_enough_space: bool = remaining_buffer.len >= handle_data.utf8_buffer.bytes_used;
600+
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len;
601+
for (0..max_bytes_to_read) |index| {
602+
temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
603+
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
604+
handle_data.utf8_buffer.front_index +%= 1;
605+
}
606+
bytes_read_into_temp_buffer += @truncate(max_bytes_to_read);
607+
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
608+
if (has_enough_space) {
609+
// UTF-8 buffer is now empty, we can safely reset front_index to zero
610+
handle_data.utf8_buffer.front_index = 0;
611+
} else {
612+
switch (truncate_state) {
613+
.truncate_all => {},
614+
else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]),
615+
}
616+
bytes_read += bytes_read_into_temp_buffer;
617+
break;
630618
}
631-
handle_data.utf16_buffer.data[1] = utf16_code_unit;
632-
handle_data.utf16_buffer.code_units_used = 0;
633-
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
634-
assert(utf8_bytes == 4);
635-
handle_data.utf8_buffer.bytes_used = 4;
636-
} else {
637-
assert(handle_data.utf16_buffer.code_units_used == 0);
638-
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
639-
handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
640619
// LF ends a console read immediately
641-
if (handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A) {
642-
reached_end_of_line = true;
620+
if (reached_end_of_line) {
621+
switch (truncate_state) {
622+
.truncate_all => {},
623+
else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]),
624+
}
625+
bytes_read += bytes_read_into_temp_buffer;
626+
break;
643627
}
644628
}
645-
// Is there enough space for all bytes in UTF-8 buffer?
646-
const has_enough_space: bool = buffer.len >= bytes_read + handle_data.utf8_buffer.bytes_used;
647-
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len - bytes_read;
648-
for (0..max_bytes_to_read) |index| {
649-
temp_buffer[bytes_read + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
650-
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
651-
handle_data.utf8_buffer.front_index +%= 1;
652-
}
653-
bytes_read += @truncate(max_bytes_to_read);
654-
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
655-
if (has_enough_space) {
656-
// UTF-8 buffer is now empty, we can safely reset front_index to zero
657-
handle_data.utf8_buffer.front_index = 0;
658-
} else {
659-
break;
629+
assert(handle_data.utf8_buffer.front_index == 0);
630+
while (bytes_read_into_temp_buffer < temp_buffer.len) {
631+
// Read only one code unit each loop
632+
var utf16_code_unit: u16 = undefined;
633+
var utf16_code_units_read: DWORD = undefined;
634+
if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
635+
switch (kernel32.GetLastError()) {
636+
.INVALID_HANDLE => return error.NotOpenForReading,
637+
else => |err| return unexpectedError(err),
638+
}
639+
}
640+
if (utf16_code_unit == 0x000D) {
641+
// CR should always be followed by an LF, so just discard it
642+
continue;
643+
} else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
644+
// When a high surrogate is encountered, store it into the UTF-16 buffer
645+
assert(handle_data.utf16_buffer.code_units_used == 0);
646+
handle_data.utf16_buffer.data[0] = utf16_code_unit;
647+
handle_data.utf16_buffer.code_units_used = 1;
648+
continue;
649+
} else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
650+
// When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
651+
if (!(utf16_code_units_read == 1 and
652+
handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
653+
unreachable;
654+
}
655+
handle_data.utf16_buffer.data[1] = utf16_code_unit;
656+
handle_data.utf16_buffer.code_units_used = 0;
657+
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
658+
assert(utf8_bytes == 4);
659+
handle_data.utf8_buffer.bytes_used = 4;
660+
} else {
661+
assert(handle_data.utf16_buffer.code_units_used == 0);
662+
const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
663+
handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
664+
if (handle_data.utf8_buffer.bytes_used == 1) {
665+
if (handle_data.utf8_buffer.data[0] == 0x0A) {
666+
reached_end_of_line = true;
667+
} else if (handle_data.utf8_buffer.data[0] == 0x1A) {
668+
if (truncate_state == .do_not_truncate) {
669+
truncate_state = .truncate_after_SUB;
670+
truncate_index = bytes_read_into_temp_buffer + 1;
671+
}
672+
}
673+
}
674+
}
675+
// Is there enough space for all bytes in UTF-8 buffer?
676+
has_enough_space_in_remaining_buffer = remaining_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used;
677+
const has_enough_space: bool = has_enough_space_in_remaining_buffer and temp_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used;
678+
const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len - bytes_read_into_temp_buffer;
679+
for (0..max_bytes_to_read) |index| {
680+
temp_buffer[bytes_read_into_temp_buffer + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
681+
// Front index wraps around in the case of 4-byte sequence (non-BMP code point)
682+
handle_data.utf8_buffer.front_index +%= 1;
683+
}
684+
bytes_read_into_temp_buffer += @truncate(max_bytes_to_read);
685+
handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
686+
if (has_enough_space) {
687+
// UTF-8 buffer is now empty, we can safely reset front_index to zero
688+
handle_data.utf8_buffer.front_index = 0;
689+
} else {
690+
break;
691+
}
692+
// LF ends a console read immediately
693+
if (reached_end_of_line) {
694+
break;
695+
}
660696
}
661-
// LF ends a console read immediately
662-
if (reached_end_of_line) {
697+
// Copy to user-provided buffer
698+
const bytes_copied: DWORD = switch (truncate_state) {
699+
.do_not_truncate => bytes_read_into_temp_buffer,
700+
.truncate_after_SUB => truncate_index,
701+
.truncate_all => 0,
702+
};
703+
@memcpy(remaining_buffer[0..bytes_copied], temp_buffer[0..bytes_copied]);
704+
bytes_read += bytes_copied;
705+
// Early return conditions
706+
if (!has_enough_space_in_remaining_buffer or reached_end_of_line) {
663707
break;
664708
}
665709
}
666-
return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, true);
667-
}
668-
669-
fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD {
670-
if (truncate_after_SUB) {
671-
// Truncate everything after the SUB (Ctrl+Z) character
672-
var index: DWORD = 0;
673-
var reached_end_of_file: bool = false;
674-
while (index < bytes_read and !reached_end_of_file) {
675-
if (temp_buffer[index] == 0x1A) {
676-
reached_end_of_file = true;
677-
}
678-
buffer[index] = temp_buffer[index];
679-
index += 1;
680-
}
681-
return index;
682-
} else {
683-
std.mem.copy(u8, buffer, temp_buffer);
684-
return bytes_read;
685-
}
710+
return bytes_read;
686711
}
687712

688713
pub const WriteFileError = error{

0 commit comments

Comments
 (0)