From a7e2ef6d2a4b46d45f6275589209fc088d0a0e01 Mon Sep 17 00:00:00 2001
From: Prcuvu <prcuvu@gmail.com>
Date: Fri, 4 Aug 2023 00:00:00 +0000
Subject: [PATCH 1/3] =?UTF-8?q?std.os.windows:=20Support=20UTF-8=20?=
 =?UTF-8?q?=E2=86=94=20UTF-16=20conversion=20for=20Windows=20native=20cons?=
 =?UTF-8?q?ole=20I/O?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/std/os.zig                  |   7 +-
 lib/std/os/windows.zig          | 362 +++++++++++++++++++++++++++++++-
 lib/std/os/windows/kernel32.zig |   3 +
 src/link.zig                    |   1 +
 4 files changed, 360 insertions(+), 13 deletions(-)

diff --git a/lib/std/os.zig b/lib/std/os.zig
index f72e2aad42ad..cb8226292557 100644
--- a/lib/std/os.zig
+++ b/lib/std/os.zig
@@ -1068,6 +1068,10 @@ pub const WriteError = error{
 
     /// Connection reset by peer.
     ConnectionResetByPeer,
+
+    /// This error occurs when trying to write UTF-8 text to a Windows console,
+    /// and the UTF-8 to UTF-16 conversion fails. Windows-only.
+    InvalidUtf8,
 } || UnexpectedError;
 
 /// Write to a file descriptor.
@@ -3239,8 +3243,7 @@ pub fn isatty(handle: fd_t) bool {
         if (isCygwinPty(handle))
             return true;
 
-        var out: windows.DWORD = undefined;
-        return windows.kernel32.GetConsoleMode(handle, &out) != 0;
+        return windows.IsConsoleHandle(handle);
     }
     if (builtin.link_libc) {
         return system.isatty(handle) != 0;
diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig
index d40fee8db241..2bdec650b6c7 100644
--- a/lib/std/os/windows.zig
+++ b/lib/std/os/windows.zig
@@ -445,6 +445,9 @@ pub fn GetQueuedCompletionStatusEx(
 
 pub fn CloseHandle(hObject: HANDLE) void {
     assert(ntdll.NtClose(hObject) == .SUCCESS);
+    if (IsConsoleHandle(hObject)) {
+        _ = removeConsoleHandleData(hObject) catch {};
+    }
 }
 
 pub fn FindClose(hFindFile: HANDLE) void {
@@ -456,6 +459,7 @@ pub const ReadFileError = error{
     NetNameDeleted,
     OperationAborted,
     Unexpected,
+    NotOpenForReading,
 };
 
 /// If buffer's length exceeds what a Windows DWORD integer can hold, it will be broken into
@@ -523,14 +527,31 @@ pub fn ReadFile(in_hFile: HANDLE, buffer: []u8, offset: ?u64, io_mode: std.io.Mo
                 };
                 break :blk &overlapped_data;
             } else null;
-            if (kernel32.ReadFile(in_hFile, buffer.ptr, want_read_count, &amt_read, overlapped) == 0) {
-                switch (kernel32.GetLastError()) {
-                    .IO_PENDING => unreachable,
-                    .OPERATION_ABORTED => continue,
-                    .BROKEN_PIPE => return 0,
-                    .HANDLE_EOF => return 0,
-                    .NETNAME_DELETED => return error.NetNameDeleted,
-                    else => |err| return unexpectedError(err),
+            var console_mode: DWORD = undefined;
+            const is_console_handle: bool = kernel32.GetConsoleMode(in_hFile, &console_mode) != FALSE;
+            const is_cooked_mode: bool = (console_mode & ENABLE_LINE_INPUT) != 0;
+            // Implementation issue:
+            // There is no reliable way to implement perfectly platform-agnostic UTF-16 to UTF-8
+            // conversion for raw mode, because it is impossible to know the number of pending
+            // code units stored in console input buffer, while in cooked mode we can rely on the
+            // terminating LF character. Without knowing that, ReadConsoleW() may accidentally pop
+            // out characters without blocking, or prompt for user input at unexpected timing.
+            // In the case of raw mode, redirect to kernel32.ReadFile() without conversion for now,
+            // just don't make things worse.
+            if (is_console_handle and is_cooked_mode) {
+                assert(offset == null);
+                amt_read = try ReadConsoleWithUtf16ToUtf8Conversion(in_hFile, buffer);
+            } else {
+                if (kernel32.ReadFile(in_hFile, buffer.ptr, want_read_count, &amt_read, overlapped) == 0) {
+                    switch (kernel32.GetLastError()) {
+                        .IO_PENDING => unreachable,
+                        .OPERATION_ABORTED => continue,
+                        .BROKEN_PIPE => return 0,
+                        .HANDLE_EOF => return 0,
+                        .NETNAME_DELETED => return error.NetNameDeleted,
+                        .INVALID_HANDLE => return error.NotOpenForReading,
+                        else => |err| return unexpectedError(err),
+                    }
                 }
             }
             return amt_read;
@@ -538,6 +559,132 @@ pub fn ReadFile(in_hFile: HANDLE, buffer: []u8, offset: ?u64, io_mode: std.io.Mo
     }
 }
 
+fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) ReadFileError!DWORD {
+    const handle_data: *ConsoleHandleData = getConsoleHandleData(hConsoleInput) catch |err| switch (err) {
+        error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
+        else => return error.Unexpected,
+    };
+    // The temporary buffer can be huge, so keep it away from stack
+    var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init();
+    defer heap_allocator.deinit();
+    const allocator: std.mem.Allocator = heap_allocator.allocator();
+    var temp_buffer: []u8 = allocator.alloc(u8, buffer.len) catch @panic("Out of memory.");
+    defer allocator.free(temp_buffer);
+
+    var bytes_read: DWORD = 0;
+    var reached_end_of_line: bool = false;
+
+    // Try flushing leftover UTF-8 bytes first (one codepoint at most)
+    if (handle_data.utf8_buffer.bytes_used != 0) {
+        // LF will only appear at the first byte and there will be only one byte in the buffer
+        if (handle_data.utf8_buffer.data[0] == 0x0A) {
+            assert(handle_data.utf8_buffer.bytes_used == 1);
+            reached_end_of_line = true;
+        }
+        // Is there enough space for all bytes in UTF-8 buffer?
+        const has_enough_space: bool = buffer.len >= handle_data.utf8_buffer.bytes_used;
+        const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len;
+        for (0..max_bytes_to_read) |index| {
+            temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
+            // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
+            handle_data.utf8_buffer.front_index +%= 1;
+        }
+        bytes_read += @truncate(max_bytes_to_read);
+        handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
+        if (has_enough_space) {
+            // UTF-8 buffer is now empty, we can safely reset front_index to zero
+            handle_data.utf8_buffer.front_index = 0;
+        } else {
+            return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
+        }
+        // LF ends a console read immediately
+        if (reached_end_of_line) {
+            return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
+        }
+    }
+    assert(handle_data.utf8_buffer.front_index == 0);
+    while (bytes_read < buffer.len) {
+        // Read only one code unit each loop
+        var utf16_code_unit: u16 = undefined;
+        var utf16_code_units_read: DWORD = undefined;
+        if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
+            switch (kernel32.GetLastError()) {
+                .INVALID_HANDLE => return error.NotOpenForReading,
+                else => |err| return unexpectedError(err),
+            }
+        }
+        if (utf16_code_unit == 0x000D) {
+            // CR should always be followed by an LF, so just discard it
+            continue;
+        } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
+            // When a high surrogate is encountered, store it into the UTF-16 buffer
+            assert(handle_data.utf16_buffer.code_units_used == 0);
+            handle_data.utf16_buffer.data[0] = utf16_code_unit;
+            handle_data.utf16_buffer.code_units_used = 1;
+            continue;
+        } else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
+            // When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
+            if (!(utf16_code_units_read == 1 and
+                handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
+                unreachable;
+            }
+            handle_data.utf16_buffer.data[1] = utf16_code_unit;
+            handle_data.utf16_buffer.code_units_used = 0;
+            const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
+            assert(utf8_bytes == 4);
+            handle_data.utf8_buffer.bytes_used = 4;
+        } else {
+            assert(handle_data.utf16_buffer.code_units_used == 0);
+            const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
+            handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
+            // LF ends a console read immediately
+            if (handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A) {
+                reached_end_of_line = true;
+            }
+        }
+        // Is there enough space for all bytes in UTF-8 buffer?
+        const has_enough_space: bool = buffer.len >= bytes_read + handle_data.utf8_buffer.bytes_used;
+        const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len - bytes_read;
+        for (0..max_bytes_to_read) |index| {
+            temp_buffer[bytes_read + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
+            // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
+            handle_data.utf8_buffer.front_index +%= 1;
+        }
+        bytes_read += @truncate(max_bytes_to_read);
+        handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
+        if (has_enough_space) {
+            // UTF-8 buffer is now empty, we can safely reset front_index to zero
+            handle_data.utf8_buffer.front_index = 0;
+        } else {
+            break;
+        }
+        // LF ends a console read immediately
+        if (reached_end_of_line) {
+            break;
+        }
+    }
+    return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, true);
+}
+
+fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD {
+    if (truncate_after_SUB) {
+        // Truncate everything after the SUB (Ctrl+Z) character
+        var index: DWORD = 0;
+        var reached_end_of_file: bool = false;
+        while (index < bytes_read and !reached_end_of_file) {
+            if (temp_buffer[index] == 0x1A) {
+                reached_end_of_file = true;
+            }
+            buffer[index] = temp_buffer[index];
+            index += 1;
+        }
+        return index;
+    } else {
+        std.mem.copy(u8, buffer, temp_buffer);
+        return bytes_read;
+    }
+}
+
 pub const WriteFileError = error{
     SystemResources,
     OperationAborted,
@@ -547,6 +694,9 @@ pub const WriteFileError = error{
     /// a portion of the file.
     LockViolation,
     Unexpected,
+    /// This error occurs when trying to write UTF-8 text to a Windows console,
+    /// and the UTF-8 to UTF-16 conversion fails.
+    InvalidUtf8,
 };
 
 pub fn WriteFile(
@@ -617,21 +767,109 @@ pub fn WriteFile(
             break :blk &overlapped_data;
         } else null;
         const adjusted_len = math.cast(u32, bytes.len) orelse maxInt(u32);
-        if (kernel32.WriteFile(handle, bytes.ptr, adjusted_len, &bytes_written, overlapped) == 0) {
+        if (IsConsoleHandle(handle)) {
+            assert(offset == null);
+            bytes_written = try WriteConsoleWithUtf8ToUtf16Conversion(handle, bytes);
+        } else {
+            if (kernel32.WriteFile(handle, bytes.ptr, adjusted_len, &bytes_written, overlapped) == 0) {
+                switch (kernel32.GetLastError()) {
+                    .INVALID_USER_BUFFER => return error.SystemResources,
+                    .NOT_ENOUGH_MEMORY => return error.SystemResources,
+                    .OPERATION_ABORTED => return error.OperationAborted,
+                    .NOT_ENOUGH_QUOTA => return error.SystemResources,
+                    .IO_PENDING => unreachable,
+                    .BROKEN_PIPE => return error.BrokenPipe,
+                    .INVALID_HANDLE => return error.NotOpenForWriting,
+                    .LOCK_VIOLATION => return error.LockViolation,
+                    else => |err| return unexpectedError(err),
+                }
+            }
+        }
+        return bytes_written;
+    }
+}
+
+fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) WriteFileError!DWORD {
+    const handle_data: *ConsoleHandleData = getConsoleHandleData(handle) catch |err| switch (err) {
+        error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
+        else => return error.Unexpected,
+    };
+    var bytes_written: DWORD = 0;
+    var byte_index: DWORD = 0;
+    while (byte_index < bytes.len) {
+        var utf16_buffer: [2]u16 = undefined;
+        var utf16_code_units: usize = undefined;
+        if (handle_data.utf8_buffer.bytes_used == 0) {
+            const utf8_byte_sequence_length: u3 = std.unicode.utf8ByteSequenceLength(bytes[byte_index]) catch return error.InvalidUtf8;
+            const bytes_available: usize = bytes.len - byte_index;
+            if (bytes_available < utf8_byte_sequence_length) {
+                for (0..bytes_available) |index| {
+                    handle_data.utf8_buffer.data[index] = bytes[index];
+                }
+                bytes_written += @truncate(bytes_available);
+                return bytes_written;
+            } else {
+                utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index..byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8;
+                byte_index += utf8_byte_sequence_length;
+            }
+        } else {
+            const utf8_byte_sequence_length: u3 = std.unicode.utf8ByteSequenceLength(handle_data.utf8_buffer.data[0]) catch return error.InvalidUtf8;
+            assert(utf8_byte_sequence_length > 1 and utf8_byte_sequence_length > handle_data.utf8_buffer.bytes_used);
+            const bytes_available: usize = bytes.len - byte_index;
+            const bytes_needed: u3 = utf8_byte_sequence_length - handle_data.utf8_buffer.bytes_used;
+            if (bytes_available < bytes_needed) {
+                assert(handle_data.utf8_buffer.bytes_used + bytes_available < utf8_byte_sequence_length);
+                for (0..bytes_available) |index| {
+                    handle_data.utf8_buffer.data[handle_data.utf8_buffer.bytes_used + index] = bytes[index];
+                }
+                bytes_written += @truncate(bytes_available);
+                return bytes_written;
+            } else {
+                for (0..bytes_needed) |index| {
+                    handle_data.utf8_buffer.data[handle_data.utf8_buffer.bytes_used + index] = bytes[index];
+                }
+                utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, handle_data.utf8_buffer.data[0..utf8_byte_sequence_length]) catch return error.InvalidUtf8;
+                byte_index += bytes_needed;
+            }
+        }
+        // Handle LF to CRLF conversion
+        switch (utf16_buffer[0]) {
+            0x000D => {
+                handle_data.last_character_written_is_CR = true;
+            },
+            0x000A => {
+                if (handle_data.last_character_written_is_CR) {
+                    handle_data.last_character_written_is_CR = false;
+                } else {
+                    utf16_buffer = .{ 0x000D, 0x000A };
+                    utf16_code_units = 2;
+                }
+            },
+            else => {
+                handle_data.last_character_written_is_CR = false;
+            },
+        }
+        var utf16_code_units_written: DWORD = undefined;
+        if (kernel32.WriteConsoleW(handle, &utf16_buffer, @truncate(utf16_code_units), &utf16_code_units_written, null) == FALSE) {
             switch (kernel32.GetLastError()) {
                 .INVALID_USER_BUFFER => return error.SystemResources,
                 .NOT_ENOUGH_MEMORY => return error.SystemResources,
                 .OPERATION_ABORTED => return error.OperationAborted,
                 .NOT_ENOUGH_QUOTA => return error.SystemResources,
                 .IO_PENDING => unreachable,
-                .BROKEN_PIPE => return error.BrokenPipe,
+                .BROKEN_PIPE => unreachable,
                 .INVALID_HANDLE => return error.NotOpenForWriting,
                 .LOCK_VIOLATION => return error.LockViolation,
                 else => |err| return unexpectedError(err),
             }
         }
-        return bytes_written;
+        if (utf16_code_units_written < utf16_code_units) {
+            return bytes_written;
+        } else {
+            bytes_written = byte_index;
+        }
     }
+    return bytes_written;
 }
 
 pub const SetCurrentDirectoryError = error{
@@ -5240,3 +5478,105 @@ pub fn ProcessBaseAddress(handle: HANDLE) ProcessBaseAddressError!HMODULE {
     const ppeb: *const PEB = @ptrCast(@alignCast(peb_out.ptr));
     return ppeb.ImageBaseAddress;
 }
+
+pub const ENABLE_PROCESSED_INPUT = 0x0001;
+pub const ENABLE_LINE_INPUT = 0x0002;
+pub const ENABLE_ECHO_INPUT = 0x0004;
+pub const ENABLE_WINDOW_INPUT = 0x0008;
+pub const ENABLE_MOUSE_INPUT = 0x0010;
+pub const ENABLE_INSERT_MODE = 0x0020;
+pub const ENABLE_QUICK_EDIT_MODE = 0x0040;
+pub const ENABLE_EXTENDED_FLAGS = 0x0080;
+pub const ENABLE_AUTO_POSITION = 0x0100;
+pub const ENABLE_VIRTUAL_TERMINAL_INPUT = 0x0200;
+
+pub const CONSOLE_READCONSOLE_CONTROL = extern struct {
+    nLength: ULONG,
+    nInitialChars: ULONG,
+    dwCtrlWakeupMask: ULONG,
+    dwControlKeyState: ULONG,
+};
+
+pub const PCONSOLE_READCONSOLE_CONTROL = *CONSOLE_READCONSOLE_CONTROL;
+
+pub fn IsConsoleHandle(handle: HANDLE) bool {
+    var out: DWORD = undefined;
+    return kernel32.GetConsoleMode(handle, &out) != FALSE;
+}
+
+// Non-public extra data associated with console handle, and its helper functions
+const ConsoleHandleData = struct {
+    is_assigned: bool = false,
+    handle: ?HANDLE = null,
+    utf8_buffer: Utf8Buffer = .{},
+    utf16_buffer: Utf16Buffer = .{},
+    last_character_written_is_CR: bool = false,
+
+    const Utf8Buffer = struct {
+        data: [4]u8 = .{ 0x00, 0x00, 0x00, 0x00 },
+        bytes_used: u3 = 0,
+        front_index: u2 = 0,
+    };
+
+    const Utf16Buffer = struct {
+        data: [2]u16 = .{ 0x0000, 0x0000 },
+        code_units_used: u2 = 0,
+    };
+};
+
+const max_console_handle_data = 64;
+
+var console_handle_data_array: switch (builtin.os.tag) {
+    .windows => [max_console_handle_data]ConsoleHandleData,
+    else => void,
+} = switch (builtin.os.tag) {
+    .windows => [_]ConsoleHandleData{.{}} ** max_console_handle_data,
+    else => void{},
+};
+
+const ConsoleHandleDataError = error{
+    DataNotFound,
+    ConsoleHandleLimitReached,
+};
+
+fn getConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleData {
+    if (builtin.os.tag == .windows) {
+        var found_unassigned: bool = false;
+        var first_unassigned_index: usize = undefined;
+        for (0..max_console_handle_data) |index| {
+            if (console_handle_data_array[index].is_assigned) {
+                if (console_handle_data_array[index].handle == handle) {
+                    return &console_handle_data_array[index];
+                }
+            } else if (!found_unassigned) {
+                found_unassigned = true;
+                first_unassigned_index = index;
+            }
+        }
+        if (found_unassigned) {
+            console_handle_data_array[first_unassigned_index].is_assigned = true;
+            console_handle_data_array[first_unassigned_index].handle = handle;
+            console_handle_data_array[first_unassigned_index].utf8_buffer.bytes_used = 0;
+            console_handle_data_array[first_unassigned_index].last_character_written_is_CR = false;
+            return &console_handle_data_array[first_unassigned_index];
+        } else {
+            return error.ConsoleHandleLimitReached;
+        }
+    } else {
+        @compileError("Unsupported OS");
+    }
+}
+
+fn removeConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!usize {
+    if (builtin.os.tag == .windows) {
+        for (0..max_console_handle_data) |index| {
+            if (console_handle_data_array[index].is_assigned and console_handle_data_array[index].handle == handle) {
+                console_handle_data_array[index].is_assigned = false;
+                return index;
+            }
+        }
+        return error.DataNotFound;
+    } else {
+        @compileError("Unsupported OS");
+    }
+}
diff --git a/lib/std/os/windows/kernel32.zig b/lib/std/os/windows/kernel32.zig
index 942d7ddba791..2b13bb4399eb 100644
--- a/lib/std/os/windows/kernel32.zig
+++ b/lib/std/os/windows/kernel32.zig
@@ -174,6 +174,9 @@ pub extern "kernel32" fn FillConsoleOutputCharacterW(hConsoleOutput: HANDLE, cCh
 pub extern "kernel32" fn FillConsoleOutputAttribute(hConsoleOutput: HANDLE, wAttribute: WORD, nLength: DWORD, dwWriteCoord: COORD, lpNumberOfAttrsWritten: *DWORD) callconv(WINAPI) BOOL;
 pub extern "kernel32" fn SetConsoleCursorPosition(hConsoleOutput: HANDLE, dwCursorPosition: COORD) callconv(WINAPI) BOOL;
 
+pub extern "kernel32" fn ReadConsoleW(hConsoleInput: HANDLE, lpBuffer: LPVOID, nNumberOfCharsToRead: DWORD, lpNumberOfCharsRead: *DWORD, pInputControl: ?LPVOID) callconv(WINAPI) BOOL;
+pub extern "kernel32" fn WriteConsoleW(hConsoleOutput: HANDLE, lpBuffer: *const anyopaque, nNumberOfCharsToWrite: DWORD, lpNumberOfCharsWritten: ?*DWORD, lpReserved: ?LPVOID) callconv(WINAPI) BOOL;
+
 pub extern "kernel32" fn GetCurrentDirectoryW(nBufferLength: DWORD, lpBuffer: ?[*]WCHAR) callconv(WINAPI) DWORD;
 
 pub extern "kernel32" fn GetCurrentThread() callconv(WINAPI) HANDLE;
diff --git a/src/link.zig b/src/link.zig
index dd94ed9a5664..dca869021161 100644
--- a/src/link.zig
+++ b/src/link.zig
@@ -543,6 +543,7 @@ pub const File = struct {
         DeviceBusy,
         InvalidArgument,
         HotSwapUnavailableOnHostOperatingSystem,
+        InvalidUtf8,
     };
 
     /// Called from within the CodeGen to lower a local variable instantion as an unnamed

From 94955689c5d61bcee242683a39dc7b120f0cc3d5 Mon Sep 17 00:00:00 2001
From: Prcuvu <prcuvu@gmail.com>
Date: Sun, 6 Aug 2023 00:00:00 +0000
Subject: [PATCH 2/3] std.os.windows: Rewrite
 `ReadConsoleWithUtf16ToUtf8Conversion()` using fixed-size temporary buffer

---
 lib/std/os/windows.zig | 240 ++++++++++++++++++++++-------------------
 1 file changed, 132 insertions(+), 108 deletions(-)

diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig
index 2bdec650b6c7..463fabea1c5e 100644
--- a/lib/std/os/windows.zig
+++ b/lib/std/os/windows.zig
@@ -564,125 +564,149 @@ fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) Rea
         error.ConsoleHandleLimitReached => @panic("Reached maximum number of 64 console handles."),
         else => return error.Unexpected,
     };
-    // The temporary buffer can be huge, so keep it away from stack
-    var heap_allocator: std.heap.HeapAllocator = std.heap.HeapAllocator.init();
-    defer heap_allocator.deinit();
-    const allocator: std.mem.Allocator = heap_allocator.allocator();
-    var temp_buffer: []u8 = allocator.alloc(u8, buffer.len) catch @panic("Out of memory.");
-    defer allocator.free(temp_buffer);
-
+    var temp_buffer: [1024]u8 = undefined;
     var bytes_read: DWORD = 0;
     var reached_end_of_line: bool = false;
-
-    // Try flushing leftover UTF-8 bytes first (one codepoint at most)
-    if (handle_data.utf8_buffer.bytes_used != 0) {
-        // LF will only appear at the first byte and there will be only one byte in the buffer
-        if (handle_data.utf8_buffer.data[0] == 0x0A) {
-            assert(handle_data.utf8_buffer.bytes_used == 1);
-            reached_end_of_line = true;
-        }
-        // Is there enough space for all bytes in UTF-8 buffer?
-        const has_enough_space: bool = buffer.len >= handle_data.utf8_buffer.bytes_used;
-        const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len;
-        for (0..max_bytes_to_read) |index| {
-            temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
-            // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
-            handle_data.utf8_buffer.front_index +%= 1;
-        }
-        bytes_read += @truncate(max_bytes_to_read);
-        handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
-        if (has_enough_space) {
-            // UTF-8 buffer is now empty, we can safely reset front_index to zero
-            handle_data.utf8_buffer.front_index = 0;
-        } else {
-            return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
-        }
-        // LF ends a console read immediately
-        if (reached_end_of_line) {
-            return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, false);
-        }
-    }
-    assert(handle_data.utf8_buffer.front_index == 0);
+    const TruncateState = enum {
+        do_not_truncate,
+        truncate_after_SUB,
+        truncate_all,
+    };
+    var truncate_state: TruncateState = .do_not_truncate;
     while (bytes_read < buffer.len) {
-        // Read only one code unit each loop
-        var utf16_code_unit: u16 = undefined;
-        var utf16_code_units_read: DWORD = undefined;
-        if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
-            switch (kernel32.GetLastError()) {
-                .INVALID_HANDLE => return error.NotOpenForReading,
-                else => |err| return unexpectedError(err),
-            }
+        const remaining_buffer: []u8 = buffer[bytes_read..buffer.len];
+        var has_enough_space_in_remaining_buffer: bool = undefined;
+        var bytes_read_into_temp_buffer: DWORD = 0;
+        var truncate_index: DWORD = undefined;
+        // If a SUB character is encountered in a previous loop, truncate everything in this loop
+        if (truncate_state == .truncate_after_SUB) {
+            truncate_state = .truncate_all;
         }
-        if (utf16_code_unit == 0x000D) {
-            // CR should always be followed by an LF, so just discard it
-            continue;
-        } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
-            // When a high surrogate is encountered, store it into the UTF-16 buffer
-            assert(handle_data.utf16_buffer.code_units_used == 0);
-            handle_data.utf16_buffer.data[0] = utf16_code_unit;
-            handle_data.utf16_buffer.code_units_used = 1;
-            continue;
-        } else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
-            // When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
-            if (!(utf16_code_units_read == 1 and
-                handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
-                unreachable;
+        // Try flushing leftover UTF-8 bytes first (one codepoint at most)
+        if (handle_data.utf8_buffer.bytes_used != 0) {
+            if (handle_data.utf8_buffer.data[0] == 0x0A) {
+                assert(handle_data.utf8_buffer.bytes_used == 1);
+                reached_end_of_line = true;
+            } else if (handle_data.utf8_buffer.data[0] == 0x1A) {
+                assert(handle_data.utf8_buffer.bytes_used == 1);
+                // Truncate after SUB character in this loop if we never truncated in previous loops
+                if (truncate_state == .do_not_truncate) {
+                    truncate_state = .truncate_after_SUB;
+                    truncate_index = 1;
+                }
+            }
+            // Is there enough space for all bytes in UTF-8 buffer?
+            const has_enough_space: bool = remaining_buffer.len >= handle_data.utf8_buffer.bytes_used;
+            const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len;
+            for (0..max_bytes_to_read) |index| {
+                temp_buffer[index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
+                // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
+                handle_data.utf8_buffer.front_index +%= 1;
+            }
+            bytes_read_into_temp_buffer += @truncate(max_bytes_to_read);
+            handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
+            if (has_enough_space) {
+                // UTF-8 buffer is now empty, we can safely reset front_index to zero
+                handle_data.utf8_buffer.front_index = 0;
+            } else {
+                switch (truncate_state) {
+                    .truncate_all => {},
+                    else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]),
+                }
+                bytes_read += bytes_read_into_temp_buffer;
+                break;
             }
-            handle_data.utf16_buffer.data[1] = utf16_code_unit;
-            handle_data.utf16_buffer.code_units_used = 0;
-            const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
-            assert(utf8_bytes == 4);
-            handle_data.utf8_buffer.bytes_used = 4;
-        } else {
-            assert(handle_data.utf16_buffer.code_units_used == 0);
-            const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
-            handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
             // LF ends a console read immediately
-            if (handle_data.utf8_buffer.bytes_used == 1 and handle_data.utf8_buffer.data[0] == 0x0A) {
-                reached_end_of_line = true;
+            if (reached_end_of_line) {
+                switch (truncate_state) {
+                    .truncate_all => {},
+                    else => @memcpy(remaining_buffer[0..bytes_read_into_temp_buffer], temp_buffer[0..bytes_read_into_temp_buffer]),
+                }
+                bytes_read += bytes_read_into_temp_buffer;
+                break;
             }
         }
-        // Is there enough space for all bytes in UTF-8 buffer?
-        const has_enough_space: bool = buffer.len >= bytes_read + handle_data.utf8_buffer.bytes_used;
-        const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else buffer.len - bytes_read;
-        for (0..max_bytes_to_read) |index| {
-            temp_buffer[bytes_read + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
-            // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
-            handle_data.utf8_buffer.front_index +%= 1;
-        }
-        bytes_read += @truncate(max_bytes_to_read);
-        handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
-        if (has_enough_space) {
-            // UTF-8 buffer is now empty, we can safely reset front_index to zero
-            handle_data.utf8_buffer.front_index = 0;
-        } else {
-            break;
+        assert(handle_data.utf8_buffer.front_index == 0);
+        while (bytes_read_into_temp_buffer < temp_buffer.len) {
+            // Read only one code unit each loop
+            var utf16_code_unit: u16 = undefined;
+            var utf16_code_units_read: DWORD = undefined;
+            if (kernel32.ReadConsoleW(hConsoleInput, &utf16_code_unit, 1, &utf16_code_units_read, null) == FALSE) {
+                switch (kernel32.GetLastError()) {
+                    .INVALID_HANDLE => return error.NotOpenForReading,
+                    else => |err| return unexpectedError(err),
+                }
+            }
+            if (utf16_code_unit == 0x000D) {
+                // CR should always be followed by an LF, so just discard it
+                continue;
+            } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
+                // When a high surrogate is encountered, store it into the UTF-16 buffer
+                assert(handle_data.utf16_buffer.code_units_used == 0);
+                handle_data.utf16_buffer.data[0] = utf16_code_unit;
+                handle_data.utf16_buffer.code_units_used = 1;
+                continue;
+            } else if (utf16_code_unit >= 0xDC00 and utf16_code_unit <= 0xDFFF) {
+                // When a low surrogate is encountered, assemble surrogate pair and convert to UTF-8
+                if (!(utf16_code_units_read == 1 and handle_data.utf16_buffer.data[0] >= 0xD800 and handle_data.utf16_buffer.data[0] <= 0xDBFF)) {
+                    unreachable;
+                }
+                handle_data.utf16_buffer.data[1] = utf16_code_unit;
+                handle_data.utf16_buffer.code_units_used = 0;
+                const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, &handle_data.utf16_buffer.data) catch return error.Unexpected;
+                assert(utf8_bytes == 4);
+                handle_data.utf8_buffer.bytes_used = 4;
+            } else {
+                assert(handle_data.utf16_buffer.code_units_used == 0);
+                const utf8_bytes: usize = std.unicode.utf16leToUtf8(&handle_data.utf8_buffer.data, @as(*[1]u16, &utf16_code_unit)) catch return error.Unexpected;
+                handle_data.utf8_buffer.bytes_used = @truncate(utf8_bytes);
+                if (handle_data.utf8_buffer.bytes_used == 1) {
+                    if (handle_data.utf8_buffer.data[0] == 0x0A) {
+                        reached_end_of_line = true;
+                    } else if (handle_data.utf8_buffer.data[0] == 0x1A) {
+                        if (truncate_state == .do_not_truncate) {
+                            truncate_state = .truncate_after_SUB;
+                            truncate_index = bytes_read_into_temp_buffer + 1;
+                        }
+                    }
+                }
+            }
+            // Is there enough space for all bytes in UTF-8 buffer?
+            has_enough_space_in_remaining_buffer = remaining_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used;
+            const has_enough_space: bool = has_enough_space_in_remaining_buffer and temp_buffer.len >= bytes_read_into_temp_buffer + handle_data.utf8_buffer.bytes_used;
+            const max_bytes_to_read: usize = if (has_enough_space) handle_data.utf8_buffer.bytes_used else remaining_buffer.len - bytes_read_into_temp_buffer;
+            for (0..max_bytes_to_read) |index| {
+                temp_buffer[bytes_read_into_temp_buffer + index] = handle_data.utf8_buffer.data[handle_data.utf8_buffer.front_index];
+                // Front index wraps around in the case of 4-byte sequence (non-BMP code point)
+                handle_data.utf8_buffer.front_index +%= 1;
+            }
+            bytes_read_into_temp_buffer += @truncate(max_bytes_to_read);
+            handle_data.utf8_buffer.bytes_used -= @truncate(max_bytes_to_read);
+            if (has_enough_space) {
+                // UTF-8 buffer is now empty, we can safely reset front_index to zero
+                handle_data.utf8_buffer.front_index = 0;
+            } else {
+                break;
+            }
+            // LF ends a console read immediately
+            if (reached_end_of_line) {
+                break;
+            }
         }
-        // LF ends a console read immediately
-        if (reached_end_of_line) {
+        // Copy to user-provided buffer
+        const bytes_copied: DWORD = switch (truncate_state) {
+            .do_not_truncate => bytes_read_into_temp_buffer,
+            .truncate_after_SUB => truncate_index,
+            .truncate_all => 0,
+        };
+        @memcpy(remaining_buffer[0..bytes_copied], temp_buffer[0..bytes_copied]);
+        bytes_read += bytes_copied;
+        // Early return conditions
+        if (!has_enough_space_in_remaining_buffer or reached_end_of_line) {
             break;
         }
     }
-    return ReadConsoleProcessUtf8Buffer(buffer, temp_buffer, bytes_read, true);
-}
-
-fn ReadConsoleProcessUtf8Buffer(buffer: []u8, temp_buffer: []u8, bytes_read: DWORD, comptime truncate_after_SUB: bool) DWORD {
-    if (truncate_after_SUB) {
-        // Truncate everything after the SUB (Ctrl+Z) character
-        var index: DWORD = 0;
-        var reached_end_of_file: bool = false;
-        while (index < bytes_read and !reached_end_of_file) {
-            if (temp_buffer[index] == 0x1A) {
-                reached_end_of_file = true;
-            }
-            buffer[index] = temp_buffer[index];
-            index += 1;
-        }
-        return index;
-    } else {
-        std.mem.copy(u8, buffer, temp_buffer);
-        return bytes_read;
-    }
+    return bytes_read;
 }
 
 pub const WriteFileError = error{
@@ -809,7 +833,7 @@ fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) Writ
                 bytes_written += @truncate(bytes_available);
                 return bytes_written;
             } else {
-                utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index..byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8;
+                utf16_code_units = std.unicode.utf8ToUtf16Le(&utf16_buffer, bytes[byte_index .. byte_index + utf8_byte_sequence_length]) catch return error.InvalidUtf8;
                 byte_index += utf8_byte_sequence_length;
             }
         } else {

From 045c169646f21331b26ad005f1441947844953ac Mon Sep 17 00:00:00 2001
From: Prcuvu <prcuvu@gmail.com>
Date: Mon, 9 Oct 2023 00:00:00 +0000
Subject: [PATCH 3/3] =?UTF-8?q?std.os.windows:=20Remove=20LF=20=E2=86=94?=
 =?UTF-8?q?=20CRLF=20conversion=20in=20Windows=20native=20console=20I/O?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/std/os/windows.zig | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig
index 463fabea1c5e..af6a50d8859d 100644
--- a/lib/std/os/windows.zig
+++ b/lib/std/os/windows.zig
@@ -637,10 +637,7 @@ fn ReadConsoleWithUtf16ToUtf8Conversion(hConsoleInput: HANDLE, buffer: []u8) Rea
                     else => |err| return unexpectedError(err),
                 }
             }
-            if (utf16_code_unit == 0x000D) {
-                // CR should always be followed by an LF, so just discard it
-                continue;
-            } else if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
+            if (utf16_code_unit >= 0xD800 and utf16_code_unit <= 0xDBFF) {
                 // When a high surrogate is encountered, store it into the UTF-16 buffer
                 assert(handle_data.utf16_buffer.code_units_used == 0);
                 handle_data.utf16_buffer.data[0] = utf16_code_unit;
@@ -856,23 +853,6 @@ fn WriteConsoleWithUtf8ToUtf16Conversion(handle: HANDLE, bytes: []const u8) Writ
                 byte_index += bytes_needed;
             }
         }
-        // Handle LF to CRLF conversion
-        switch (utf16_buffer[0]) {
-            0x000D => {
-                handle_data.last_character_written_is_CR = true;
-            },
-            0x000A => {
-                if (handle_data.last_character_written_is_CR) {
-                    handle_data.last_character_written_is_CR = false;
-                } else {
-                    utf16_buffer = .{ 0x000D, 0x000A };
-                    utf16_code_units = 2;
-                }
-            },
-            else => {
-                handle_data.last_character_written_is_CR = false;
-            },
-        }
         var utf16_code_units_written: DWORD = undefined;
         if (kernel32.WriteConsoleW(handle, &utf16_buffer, @truncate(utf16_code_units), &utf16_code_units_written, null) == FALSE) {
             switch (kernel32.GetLastError()) {
@@ -5534,7 +5514,6 @@ const ConsoleHandleData = struct {
     handle: ?HANDLE = null,
     utf8_buffer: Utf8Buffer = .{},
     utf16_buffer: Utf16Buffer = .{},
-    last_character_written_is_CR: bool = false,
 
     const Utf8Buffer = struct {
         data: [4]u8 = .{ 0x00, 0x00, 0x00, 0x00 },
@@ -5581,7 +5560,6 @@ fn getConsoleHandleData(handle: HANDLE) ConsoleHandleDataError!*ConsoleHandleDat
             console_handle_data_array[first_unassigned_index].is_assigned = true;
             console_handle_data_array[first_unassigned_index].handle = handle;
             console_handle_data_array[first_unassigned_index].utf8_buffer.bytes_used = 0;
-            console_handle_data_array[first_unassigned_index].last_character_written_is_CR = false;
             return &console_handle_data_array[first_unassigned_index];
         } else {
             return error.ConsoleHandleLimitReached;