From a7e2ef6d2a4b46d45f6275589209fc088d0a0e01 Mon Sep 17 00:00:00 2001 From: Prcuvu Date: Fri, 4 Aug 2023 00:00:00 +0000 Subject: [PATCH 1/3] =?UTF-8?q?std.os.windows:=20Support=20UTF-8=20?= =?UTF-8?q?=E2=86=94=20UTF-16=20conversion=20for=20Windows=20native=20cons?= =?UTF-8?q?ole=20I/O?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/std/os.zig | 7 +- lib/std/os/windows.zig | 362 +++++++++++++++++++++++++++++++- lib/std/os/windows/kernel32.zig | 3 + src/link.zig | 1 + 4 files changed, 360 insertions(+), 13 deletions(-) diff --git a/lib/std/os.zig b/lib/std/os.zig index f72e2aad42ad..cb8226292557 100644 --- a/lib/std/os.zig +++ b/lib/std/os.zig @@ -1068,6 +1068,10 @@ pub const WriteError = error{ /// Connection reset by peer. ConnectionResetByPeer, + + /// This error occurs when trying to write UTF-8 text to a Windows console, + /// and the UTF-8 to UTF-16 conversion fails. Windows-only. + InvalidUtf8, } || UnexpectedError; /// Write to a file descriptor. @@ -3239,8 +3243,7 @@ pub fn isatty(handle: fd_t) bool { if (isCygwinPty(handle)) return true; - var out: windows.DWORD = undefined; - return windows.kernel32.GetConsoleMode(handle, &out) != 0; + return windows.IsConsoleHandle(handle); } if (builtin.link_libc) { return system.isatty(handle) != 0; diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig index d40fee8db241..2bdec650b6c7 100644 --- a/lib/std/os/windows.zig +++ b/lib/std/os/windows.zig @@ -445,6 +445,9 @@ pub fn GetQueuedCompletionStatusEx( pub fn CloseHandle(hObject: HANDLE) void { assert(ntdll.NtClose(hObject) == .SUCCESS); + if (IsConsoleHandle(hObject)) { + _ = removeConsoleHandleData(hObject) catch {}; + } } pub fn FindClose(hFindFile: HANDLE) void { @@ -456,6 +459,7 @@ pub const ReadFileError = error{ NetNameDeleted, OperationAborted, Unexpected, + NotOpenForReading, }; /// If buffer's length exceeds what a Windows DWORD integer can hold, it will be broken into @@ -523,14 +527,31 @@ pub fn ReadFile(in_hFile: HANDLE, buffer: []u8, offset: ?u64, io_mode: std.io.Mo }; break :blk &overlapped_data; } else null; - if (kernel32.ReadFile(in_hFile, buffer.ptr, want_read_count, &amt_read, overlapped) == 0) { - switch (kernel32.GetLastError()) { - .IO_PENDING => unreachable, - .OPERATION_ABORTED => continue, - .BROKEN_PIPE => return 0, - .HANDLE_EOF => return 0, - .NETNAME_DELETED => return error.NetNameDeleted, - else => |err| return unexpectedError(err), + var console_mode: DWORD = undefined; + const is_console_handle: bool = kernel32.GetConsoleMode(in_hFile, &console_mode) != FALSE; + const is_cooked_mode: bool = (console_mode & ENABLE_LINE_INPUT) != 0; + // Implementation issue: + // There is no reliable way to implement perfectly platform-agnostic UTF-16 to UTF-8 + // conversion for raw mode, because it is impossible to know the number of pending + // code units stored in console input buffer, while in cooked mode we can rely on the + // terminating LF character. Without knowing that, ReadConsoleW() may accidentally pop + // out characters without blocking, or prompt for user input at unexpected timing. + // In the case of raw mode, redirect to kernel32.ReadFile() without conversion for now, + // just don't make things worse. + if (is_console_handle and is_cooked_mode) { + assert(offset == null); + amt_read = try ReadConsoleWithUtf16ToUtf8Conversion(in_hFile, buffer); + } else { + if (kernel32.ReadFile(in_hFile, buffer.ptr, want_read_count, &amt_read, overlapped) == 0) { + switch (kernel32.GetLastError()) { + .IO_PENDING => unreachable, + .OPERATION_ABORTED => continue, + .BROKEN_PIPE => return 0, + .HANDLE_EOF => return 0, + .NETNAME_DELETED => return error.NetNameDeleted, + .INVALID_HANDLE => return error.NotOpenForReading, + else => |err| return unexpectedError(err), + } } } return amt_read; @@ -538,6 +559,132 @@ pub fn ReadFile(in_hFile: HANDLE, buffer: []u8, offset: ?u64, io_mode: std.io.Mo } } +fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) ReadFileError!DWORD { + const handle_data: *ConsoleHandleData = getConsoleHandleData(hConsoleInput) catch |err| switch (err) { + error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."), + else => return error.Unexpected, + }; + // The temporary buffer can be huge, so keep it away from stack + var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init(); + defer heap_allocator.deinit(); + const allocator: std.mem.Allocator = heap_allocator.allocator(); + var temp_buffer: []u8 = allocator.alloc(u8, buffer.len) catch @panic("Out of memory."); + defer allocator.free(temp_buffer); + + var bytes_read: DWORD = 0; + var reached_end_of_line: bool = false; + + // Try flushing leftover UTF-8 bytes first (one codepoint at most) + if (handle_data.utf8_buffer.bytes_used != 0) { + // LF will only appear at the first byte and there will be only one byte in the buffer + if (handle_data.utf8_buffer.data[0] == 0x0A) { + assert(handle_data.utf8_buffer.bytes_used == 1); + reached_end_of_line = true; + } + // Is there enough space for all bytes in UTF-8 buffer? + const has_enough_space: bool = buffer.len >= handle_data.utf8_buffer.bytes_used; + const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len; + for (0..max_bytes_to_read) |index| { + temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index]; + // Front index wraps around in the case of 4-byte sequence (non-BMP code point) + handle_data.utf8_buffer.front_index +%= 1; + } + bytes_read += @truncate(max_bytes_to_read); + handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read); + if (has_enough_space) { + // UTF-8 buffer is now empty, we can safely reset front_index to zero + handle_data.utf8_buffer.front_index = 0; + } else { + return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false); + } + // LF ends a console read immediately + if (reached_end_of_line) { + return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false); + } + } + assert(handle_data.utf8_buffer.front_index == 0); + while (bytes_read < buffer.len) { + // Read only one code unit each loop + var utf16_code_unit: u16 = undefined; + var utf16_code_units_read: DWORD = undefined; + if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) { + switch (kernel32.GetLastError()) { + .INVALID_HANDLE => return error.NotOpenForReading, + else => |err| return unexpectedError(err), + } + } + if (utf16_code_unit == 0x000D) { + // CR should always be followed by an LF, so just discard it + continue; + } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) { + // When a high surrogate is encountered, store it into the UTF-16 buffer + assert(handle_data.utf16_buffer.code_units_used == 0); + handle_data.utf16_buffer.data[0] = utf16_code_unit; + handle_data.utf16_buffer.code_units_used = 1; + continue; + } else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) { + // When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8 + if (!(utf16_code_units_read == 1 and + handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) { + unreachable; + } + handle_data.utf16_buffer.data[1] = utf16_code_unit; + handle_data.utf16_buffer.code_units_used = 0; + const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected; + assert(utf8_bytes == 4); + handle_data.utf8_buffer.bytes_used = 4; + } else { + assert(handle_data.utf16_buffer.code_units_used == 0); + const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected; + handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes); + // LF ends a console read immediately + if (handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A) { + reached_end_of_line = true; + } + } + // Is there enough space for all bytes in UTF-8 buffer? + const has_enough_space: bool = buffer.len >= bytes_read + handle_data.utf8_buffer.bytes_used; + const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len - bytes_read; + for (0..max_bytes_to_read) |index| { + temp_buffer[bytes_read + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index]; + // Front index wraps around in the case of 4-byte sequence (non-BMP code point) + handle_data.utf8_buffer.front_index +%= 1; + } + bytes_read += @truncate(max_bytes_to_read); + handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read); + if (has_enough_space) { + // UTF-8 buffer is now empty, we can safely reset front_index to zero + handle_data.utf8_buffer.front_index = 0; + } else { + break; + } + // LF ends a console read immediately + if (reached_end_of_line) { + break; + } + } + return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, true); +} + +fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD { + if (truncate_after_SUB) { + // Truncate everything after the SUB (Ctrl+Z) character + var index: DWORD = 0; + var reached_end_of_file: bool = false; + while (index < bytes_read and !reached_end_of_file) { + if (temp_buffer[index] == 0x1A) { + reached_end_of_file = true; + } + buffer[index] = temp_buffer[index]; + index += 1; + } + return index; + } else { + std.mem.copy(u8, buffer, temp_buffer); + return bytes_read; + } +} + pub const WriteFileError = error{ SystemResources, OperationAborted, @@ -547,6 +694,9 @@ pub const WriteFileError = error{ /// a portion of the file. LockViolation, Unexpected, + /// This error occurs when trying to write UTF-8 text to a Windows console, + /// and the UTF-8 to UTF-16 conversion fails. + InvalidUtf8, }; pub fn WriteFile( @@ -617,21 +767,109 @@ pub fn WriteFile( break :blk &overlapped_data; } else null; const adjusted_len = math.cast(u32, bytes.len) orelse maxInt(u32); - if (kernel32.WriteFile(handle, bytes.ptr, adjusted_len, &bytes_written, overlapped) == 0) { + if (IsConsoleHandle(handle)) { + assert(offset == null); + bytes_written = try WriteConsoleWithUtf8ToUtf16Conversion(handle, bytes); + } else { + if (kernel32.WriteFile(handle, bytes.ptr, adjusted_len, &bytes_written, overlapped) == 0) { + switch (kernel32.GetLastError()) { + .INVALID_USER_BUFFER => return error.SystemResources, + .NOT_ENOUGH_MEMORY => return error.SystemResources, + .OPERATION_ABORTED => return error.OperationAborted, + .NOT_ENOUGH_QUOTA => return error.SystemResources, + .IO_PENDING => unreachable, + .BROKEN_PIPE => return error.BrokenPipe, + .INVALID_HANDLE => return error.NotOpenForWriting, + .LOCK_VIOLATION => return error.LockViolation, + else => |err| return unexpectedError(err), + } + } + } + return bytes_written; + } +} + +fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) WriteFileError!DWORD { + const handle_data: *ConsoleHandleData = getConsoleHandleData(handle) catch |err| switch (err) { + error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."), + else => return error.Unexpected, + }; + var bytes_written: DWORD = 0; + var byte_index: DWORD = 0; + while (byte_index < bytes.len) { + var utf16_buffer: [2]u16 = undefined; + var utf16_code_units: usize = undefined; + if (handle_data.utf8_buffer.bytes_used == 0) { + const utf8_byte_sequence_length: u3 = std.unicode.utf8ByteSequenceLength(bytes[byte_index]) catch return error.InvalidUtf8; + const bytes_available: usize = bytes.len - byte_index; + if (bytes_available < utf8_byte_sequence_length) { + for (0..bytes_available) |index| { + handle_data.utf8_buffer.data[index] = bytes[index]; + } + bytes_written += @truncate(bytes_available); + return bytes_written; + } else { + utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index..byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8; + byte_index += utf8_byte_sequence_length; + } + } else { + const utf8_byte_sequence_length: u3 = std.unicode.utf8ByteSequenceLength(handle_data.utf8_buffer.data[0]) catch return error.InvalidUtf8; + assert(utf8_byte_sequence_length > 1 and utf8_byte_sequence_length > handle_data.utf8_buffer.bytes_used); + const bytes_available: usize = bytes.len - byte_index; + const bytes_needed: u3 = utf8_byte_sequence_length - handle_data.utf8_buffer.bytes_used; + if (bytes_available < bytes_needed) { + assert(handle_data.utf8_buffer.bytes_used + bytes_available < utf8_byte_sequence_length); + for (0..bytes_available) |index| { + handle_data.utf8_buffer.data[handle_data.utf8_buffer.bytes_used + index] = bytes[index]; + } + bytes_written += @truncate(bytes_available); + return bytes_written; + } else { + for (0..bytes_needed) |index| { + handle_data.utf8_buffer.data[handle_data.utf8_buffer.bytes_used + index] = bytes[index]; + } + utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, handle_data.utf8_buffer.data[0..utf8_byte_sequence_length]) catch return error.InvalidUtf8; + byte_index += bytes_needed; + } + } + // Handle LF to CRLF conversion + switch (utf16_buffer[0]) { + 0x000D => { + handle_data.last_character_written_is_CR = true; + }, + 0x000A => { + if (handle_data.last_character_written_is_CR) { + handle_data.last_character_written_is_CR = false; + } else { + utf16_buffer = .{ 0x000D, 0x000A }; + utf16_code_units = 2; + } + }, + else => { + handle_data.last_character_written_is_CR = false; + }, + } + var utf16_code_units_written: DWORD = undefined; + if (kernel32.WriteConsoleW(handle, &utf16_buffer, @truncate(utf16_code_units), &utf16_code_units_written, null) == FALSE) { switch (kernel32.GetLastError()) { .INVALID_USER_BUFFER => return error.SystemResources, .NOT_ENOUGH_MEMORY => return error.SystemResources, .OPERATION_ABORTED => return error.OperationAborted, .NOT_ENOUGH_QUOTA => return error.SystemResources, .IO_PENDING => unreachable, - .BROKEN_PIPE => return error.BrokenPipe, + .BROKEN_PIPE => unreachable, .INVALID_HANDLE => return error.NotOpenForWriting, .LOCK_VIOLATION => return error.LockViolation, else => |err| return unexpectedError(err), } } - return bytes_written; + if (utf16_code_units_written < utf16_code_units) { + return bytes_written; + } else { + bytes_written = byte_index; + } } + return bytes_written; } pub const SetCurrentDirectoryError = error{ @@ -5240,3 +5478,105 @@ pub fn ProcessBaseAddress(handle: HANDLE) ProcessBaseAddressError!HMODULE { const ppeb: *const PEB = @ptrCast(@alignCast(peb_out.ptr)); return ppeb.ImageBaseAddress; } + +pub const ENABLE_PROCESSED_INPUT = 0x0001; +pub const ENABLE_LINE_INPUT = 0x0002; +pub const ENABLE_ECHO_INPUT = 0x0004; +pub const ENABLE_WINDOW_INPUT = 0x0008; +pub const ENABLE_MOUSE_INPUT = 0x0010; +pub const ENABLE_INSERT_MODE = 0x0020; +pub const ENABLE_QUICK_EDIT_MODE = 0x0040; +pub const ENABLE_EXTENDED_FLAGS = 0x0080; +pub const ENABLE_AUTO_POSITION = 0x0100; +pub const ENABLE_VIRTUAL_TERMINAL_INPUT = 0x0200; + +pub const CONSOLE_READCONSOLE_CONTROL = extern struct { + nLength: ULONG, + nInitialChars: ULONG, + dwCtrlWakeupMask: ULONG, + dwControlKeyState: ULONG, +}; + +pub const PCONSOLE_READCONSOLE_CONTROL = *CONSOLE_READCONSOLE_CONTROL; + +pub fn IsConsoleHandle(handle: HANDLE) bool { + var out: DWORD = undefined; + return kernel32.GetConsoleMode(handle, &out) != FALSE; +} + +// Non-public extra data associated with console handle, and its helper functions +const ConsoleHandleData = struct { + is_assigned: bool = false, + handle: ?HANDLE = null, + utf8_buffer: Utf8Buffer = .{}, + utf16_buffer: Utf16Buffer = .{}, + last_character_written_is_CR: bool = false, + + const Utf8Buffer = struct { + data: [4]u8 = .{ 0x00, 0x00, 0x00, 0x00 }, + bytes_used: u3 = 0, + front_index: u2 = 0, + }; + + const Utf16Buffer = struct { + data: [2]u16 = .{ 0x0000, 0x0000 }, + code_units_used: u2 = 0, + }; +}; + +const max_console_handle_data = 64; + +var console_handle_data_array: switch (builtin.os.tag) { + .windows => [max_console_handle_data]ConsoleHandleData, + else => void, +} = switch (builtin.os.tag) { + .windows => [_]ConsoleHandleData{.{}} ** max_console_handle_data, + else => void{}, +}; + +const ConsoleHandleDataError = error{ + DataNotFound, + ConsoleHandleLimitReached, +}; + +fn getConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleData { + if (builtin.os.tag == .windows) { + var found_unassigned: bool = false; + var first_unassigned_index: usize = undefined; + for (0..max_console_handle_data) |index| { + if (console_handle_data_array[index].is_assigned) { + if (console_handle_data_array[index].handle == handle) { + return &console_handle_data_array[index]; + } + } else if (!found_unassigned) { + found_unassigned = true; + first_unassigned_index = index; + } + } + if (found_unassigned) { + console_handle_data_array[first_unassigned_index].is_assigned = true; + console_handle_data_array[first_unassigned_index].handle = handle; + console_handle_data_array[first_unassigned_index].utf8_buffer.bytes_used = 0; + console_handle_data_array[first_unassigned_index].last_character_written_is_CR = false; + return &console_handle_data_array[first_unassigned_index]; + } else { + return error.ConsoleHandleLimitReached; + } + } else { + @compileError("Unsupported OS"); + } +} + +fn removeConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!usize { + if (builtin.os.tag == .windows) { + for (0..max_console_handle_data) |index| { + if (console_handle_data_array[index].is_assigned and console_handle_data_array[index].handle == handle) { + console_handle_data_array[index].is_assigned = false; + return index; + } + } + return error.DataNotFound; + } else { + @compileError("Unsupported OS"); + } +} diff --git a/lib/std/os/windows/kernel32.zig b/lib/std/os/windows/kernel32.zig index 942d7ddba791..2b13bb4399eb 100644 --- a/lib/std/os/windows/kernel32.zig +++ b/lib/std/os/windows/kernel32.zig @@ -174,6 +174,9 @@ pub extern "kernel32" fn FillConsoleOutputCharacterW(hConsoleOutput: HANDLE, cCh pub extern "kernel32" fn FillConsoleOutputAttribute(hConsoleOutput: HANDLE, wAttribute: WORD, nLength: DWORD, dwWriteCoord: COORD, lpNumberOfAttrsWritten: *DWORD) callconv(WINAPI) BOOL; pub extern "kernel32" fn SetConsoleCursorPosition(hConsoleOutput: HANDLE, dwCursorPosition: COORD) callconv(WINAPI) BOOL; +pub extern "kernel32" fn ReadConsoleW(hConsoleInput: HANDLE, lpBuffer: LPVOID, nNumberOfCharsToRead: DWORD, lpNumberOfCharsRead: *DWORD, pInputControl: ?LPVOID) callconv(WINAPI) BOOL; +pub extern "kernel32" fn WriteConsoleW(hConsoleOutput: HANDLE, lpBuffer: *const anyopaque, nNumberOfCharsToWrite: DWORD, lpNumberOfCharsWritten: ?*DWORD, lpReserved: ?LPVOID) callconv(WINAPI) BOOL; + pub extern "kernel32" fn GetCurrentDirectoryW(nBufferLength: DWORD, lpBuffer: ?[*]WCHAR) callconv(WINAPI) DWORD; pub extern "kernel32" fn GetCurrentThread() callconv(WINAPI) HANDLE; diff --git a/src/link.zig b/src/link.zig index dd94ed9a5664..dca869021161 100644 --- a/src/link.zig +++ b/src/link.zig @@ -543,6 +543,7 @@ pub const File = struct { DeviceBusy, InvalidArgument, HotSwapUnavailableOnHostOperatingSystem, + InvalidUtf8, }; /// Called from within the CodeGen to lower a local variable instantion as an unnamed From 94955689c5d61bcee242683a39dc7b120f0cc3d5 Mon Sep 17 00:00:00 2001 From: Prcuvu Date: Sun, 6 Aug 2023 00:00:00 +0000 Subject: [PATCH 2/3] std.os.windows: Rewrite `ReadConsoleWithUtf16ToUtf8Conversion()` using fixed-size temporary buffer --- lib/std/os/windows.zig | 240 ++++++++++++++++++++++------------------- 1 file changed, 132 insertions(+), 108 deletions(-) diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig index 2bdec650b6c7..463fabea1c5e 100644 --- a/lib/std/os/windows.zig +++ b/lib/std/os/windows.zig @@ -564,125 +564,149 @@ fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) Rea error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."), else => return error.Unexpected, }; - // The temporary buffer can be huge, so keep it away from stack - var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init(); - defer heap_allocator.deinit(); - const allocator: std.mem.Allocator = heap_allocator.allocator(); - var temp_buffer: []u8 = allocator.alloc(u8, buffer.len) catch @panic("Out of memory."); - defer allocator.free(temp_buffer); - + var temp_buffer: [1024]u8 = undefined; var bytes_read: DWORD = 0; var reached_end_of_line: bool = false; - - // Try flushing leftover UTF-8 bytes first (one codepoint at most) - if (handle_data.utf8_buffer.bytes_used != 0) { - // LF will only appear at the first byte and there will be only one byte in the buffer - if (handle_data.utf8_buffer.data[0] == 0x0A) { - assert(handle_data.utf8_buffer.bytes_used == 1); - reached_end_of_line = true; - } - // Is there enough space for all bytes in UTF-8 buffer? - const has_enough_space: bool = buffer.len >= handle_data.utf8_buffer.bytes_used; - const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len; - for (0..max_bytes_to_read) |index| { - temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index]; - // Front index wraps around in the case of 4-byte sequence (non-BMP code point) - handle_data.utf8_buffer.front_index +%= 1; - } - bytes_read += @truncate(max_bytes_to_read); - handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read); - if (has_enough_space) { - // UTF-8 buffer is now empty, we can safely reset front_index to zero - handle_data.utf8_buffer.front_index = 0; - } else { - return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false); - } - // LF ends a console read immediately - if (reached_end_of_line) { - return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false); - } - } - assert(handle_data.utf8_buffer.front_index == 0); + const TruncateState = enum { + do_not_truncate, + truncate_after_SUB, + truncate_all, + }; + var truncate_state: TruncateState = .do_not_truncate; while (bytes_read < buffer.len) { - // Read only one code unit each loop - var utf16_code_unit: u16 = undefined; - var utf16_code_units_read: DWORD = undefined; - if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) { - switch (kernel32.GetLastError()) { - .INVALID_HANDLE => return error.NotOpenForReading, - else => |err| return unexpectedError(err), - } + const remaining_buffer: []u8 = buffer[bytes_read..buffer.len]; + var has_enough_space_in_remaining_buffer: bool = undefined; + var bytes_read_into_temp_buffer: DWORD = 0; + var truncate_index: DWORD = undefined; + // If a SUB character is encountered in a previous loop, truncate everything in this loop + if (truncate_state == .truncate_after_SUB) { + truncate_state = .truncate_all; } - if (utf16_code_unit == 0x000D) { - // CR should always be followed by an LF, so just discard it - continue; - } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) { - // When a high surrogate is encountered, store it into the UTF-16 buffer - assert(handle_data.utf16_buffer.code_units_used == 0); - handle_data.utf16_buffer.data[0] = utf16_code_unit; - handle_data.utf16_buffer.code_units_used = 1; - continue; - } else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) { - // When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8 - if (!(utf16_code_units_read == 1 and - handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) { - unreachable; + // Try flushing leftover UTF-8 bytes first (one codepoint at most) + if (handle_data.utf8_buffer.bytes_used != 0) { + if (handle_data.utf8_buffer.data[0] == 0x0A) { + assert(handle_data.utf8_buffer.bytes_used == 1); + reached_end_of_line = true; + } else if (handle_data.utf8_buffer.data[0] == 0x1A) { + assert(handle_data.utf8_buffer.bytes_used == 1); + // Truncate after SUB character in this loop if we never truncated in previous loops + if (truncate_state == .do_not_truncate) { + truncate_state = .truncate_after_SUB; + truncate_index = 1; + } + } + // Is there enough space for all bytes in UTF-8 buffer? + const has_enough_space: bool = remaining_buffer.len >= handle_data.utf8_buffer.bytes_used; + const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len; + for (0..max_bytes_to_read) |index| { + temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index]; + // Front index wraps around in the case of 4-byte sequence (non-BMP code point) + handle_data.utf8_buffer.front_index +%= 1; + } + bytes_read_into_temp_buffer += @truncate(max_bytes_to_read); + handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read); + if (has_enough_space) { + // UTF-8 buffer is now empty, we can safely reset front_index to zero + handle_data.utf8_buffer.front_index = 0; + } else { + switch (truncate_state) { + .truncate_all => {}, + else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]), + } + bytes_read += bytes_read_into_temp_buffer; + break; } - handle_data.utf16_buffer.data[1] = utf16_code_unit; - handle_data.utf16_buffer.code_units_used = 0; - const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected; - assert(utf8_bytes == 4); - handle_data.utf8_buffer.bytes_used = 4; - } else { - assert(handle_data.utf16_buffer.code_units_used == 0); - const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected; - handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes); // LF ends a console read immediately - if (handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A) { - reached_end_of_line = true; + if (reached_end_of_line) { + switch (truncate_state) { + .truncate_all => {}, + else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]), + } + bytes_read += bytes_read_into_temp_buffer; + break; } } - // Is there enough space for all bytes in UTF-8 buffer? - const has_enough_space: bool = buffer.len >= bytes_read + handle_data.utf8_buffer.bytes_used; - const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len - bytes_read; - for (0..max_bytes_to_read) |index| { - temp_buffer[bytes_read + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index]; - // Front index wraps around in the case of 4-byte sequence (non-BMP code point) - handle_data.utf8_buffer.front_index +%= 1; - } - bytes_read += @truncate(max_bytes_to_read); - handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read); - if (has_enough_space) { - // UTF-8 buffer is now empty, we can safely reset front_index to zero - handle_data.utf8_buffer.front_index = 0; - } else { - break; + assert(handle_data.utf8_buffer.front_index == 0); + while (bytes_read_into_temp_buffer < temp_buffer.len) { + // Read only one code unit each loop + var utf16_code_unit: u16 = undefined; + var utf16_code_units_read: DWORD = undefined; + if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) { + switch (kernel32.GetLastError()) { + .INVALID_HANDLE => return error.NotOpenForReading, + else => |err| return unexpectedError(err), + } + } + if (utf16_code_unit == 0x000D) { + // CR should always be followed by an LF, so just discard it + continue; + } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) { + // When a high surrogate is encountered, store it into the UTF-16 buffer + assert(handle_data.utf16_buffer.code_units_used == 0); + handle_data.utf16_buffer.data[0] = utf16_code_unit; + handle_data.utf16_buffer.code_units_used = 1; + continue; + } else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) { + // When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8 + if (!(utf16_code_units_read == 1 and handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) { + unreachable; + } + handle_data.utf16_buffer.data[1] = utf16_code_unit; + handle_data.utf16_buffer.code_units_used = 0; + const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected; + assert(utf8_bytes == 4); + handle_data.utf8_buffer.bytes_used = 4; + } else { + assert(handle_data.utf16_buffer.code_units_used == 0); + const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected; + handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes); + if (handle_data.utf8_buffer.bytes_used == 1) { + if (handle_data.utf8_buffer.data[0] == 0x0A) { + reached_end_of_line = true; + } else if (handle_data.utf8_buffer.data[0] == 0x1A) { + if (truncate_state == .do_not_truncate) { + truncate_state = .truncate_after_SUB; + truncate_index = bytes_read_into_temp_buffer + 1; + } + } + } + } + // Is there enough space for all bytes in UTF-8 buffer? + has_enough_space_in_remaining_buffer = remaining_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used; + const has_enough_space: bool = has_enough_space_in_remaining_buffer and temp_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used; + const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len - bytes_read_into_temp_buffer; + for (0..max_bytes_to_read) |index| { + temp_buffer[bytes_read_into_temp_buffer + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index]; + // Front index wraps around in the case of 4-byte sequence (non-BMP code point) + handle_data.utf8_buffer.front_index +%= 1; + } + bytes_read_into_temp_buffer += @truncate(max_bytes_to_read); + handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read); + if (has_enough_space) { + // UTF-8 buffer is now empty, we can safely reset front_index to zero + handle_data.utf8_buffer.front_index = 0; + } else { + break; + } + // LF ends a console read immediately + if (reached_end_of_line) { + break; + } } - // LF ends a console read immediately - if (reached_end_of_line) { + // Copy to user-provided buffer + const bytes_copied: DWORD = switch (truncate_state) { + .do_not_truncate => bytes_read_into_temp_buffer, + .truncate_after_SUB => truncate_index, + .truncate_all => 0, + }; + @memcpy(remaining_buffer[0..bytes_copied], temp_buffer[0..bytes_copied]); + bytes_read += bytes_copied; + // Early return conditions + if (!has_enough_space_in_remaining_buffer or reached_end_of_line) { break; } } - return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, true); -} - -fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD { - if (truncate_after_SUB) { - // Truncate everything after the SUB (Ctrl+Z) character - var index: DWORD = 0; - var reached_end_of_file: bool = false; - while (index < bytes_read and !reached_end_of_file) { - if (temp_buffer[index] == 0x1A) { - reached_end_of_file = true; - } - buffer[index] = temp_buffer[index]; - index += 1; - } - return index; - } else { - std.mem.copy(u8, buffer, temp_buffer); - return bytes_read; - } + return bytes_read; } pub const WriteFileError = error{ @@ -809,7 +833,7 @@ fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) Writ bytes_written += @truncate(bytes_available); return bytes_written; } else { - utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index..byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8; + utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index .. byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8; byte_index += utf8_byte_sequence_length; } } else { From 045c169646f21331b26ad005f1441947844953ac Mon Sep 17 00:00:00 2001 From: Prcuvu Date: Mon, 9 Oct 2023 00:00:00 +0000 Subject: [PATCH 3/3] =?UTF-8?q?std.os.windows:=20Remove=20LF=20=E2=86=94?= =?UTF-8?q?=20CRLF=20conversion=20in=20Windows=20native=20console=20I/O?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/std/os/windows.zig | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig index 463fabea1c5e..af6a50d8859d 100644 --- a/lib/std/os/windows.zig +++ b/lib/std/os/windows.zig @@ -637,10 +637,7 @@ fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) Rea else => |err| return unexpectedError(err), } } - if (utf16_code_unit == 0x000D) { - // CR should always be followed by an LF, so just discard it - continue; - } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) { + if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) { // When a high surrogate is encountered, store it into the UTF-16 buffer assert(handle_data.utf16_buffer.code_units_used == 0); handle_data.utf16_buffer.data[0] = utf16_code_unit; @@ -856,23 +853,6 @@ fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) Writ byte_index += bytes_needed; } } - // Handle LF to CRLF conversion - switch (utf16_buffer[0]) { - 0x000D => { - handle_data.last_character_written_is_CR = true; - }, - 0x000A => { - if (handle_data.last_character_written_is_CR) { - handle_data.last_character_written_is_CR = false; - } else { - utf16_buffer = .{ 0x000D, 0x000A }; - utf16_code_units = 2; - } - }, - else => { - handle_data.last_character_written_is_CR = false; - }, - } var utf16_code_units_written: DWORD = undefined; if (kernel32.WriteConsoleW(handle, &utf16_buffer, @truncate(utf16_code_units), &utf16_code_units_written, null) == FALSE) { switch (kernel32.GetLastError()) { @@ -5534,7 +5514,6 @@ const ConsoleHandleData = struct { handle: ?HANDLE = null, utf8_buffer: Utf8Buffer = .{}, utf16_buffer: Utf16Buffer = .{}, - last_character_written_is_CR: bool = false, const Utf8Buffer = struct { data: [4]u8 = .{ 0x00, 0x00, 0x00, 0x00 }, @@ -5581,7 +5560,6 @@ fn getConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleDat console_handle_data_array[first_unassigned_index].is_assigned = true; console_handle_data_array[first_unassigned_index].handle = handle; console_handle_data_array[first_unassigned_index].utf8_buffer.bytes_used = 0; - console_handle_data_array[first_unassigned_index].last_character_written_is_CR = false; return &console_handle_data_array[first_unassigned_index]; } else { return error.ConsoleHandleLimitReached;