From d8f47a57bc765636e5e008c39b733de54e3a1585 Mon Sep 17 00:00:00 2001 From: David Smith Date: Thu, 15 Feb 2024 15:39:52 -0800 Subject: [PATCH 1/5] Switch grapheme break property searching to Eytzinger binary search --- .../stubs/Unicode/Common/GraphemeData.h | 166 +++++++++--------- .../public/stubs/Unicode/UnicodeGrapheme.cpp | 38 ++-- .../GenGraphemeBreakProperty/main.swift | 36 +++- .../Sources/GenUtils/Emit.swift | 16 ++ 4 files changed, 143 insertions(+), 113 deletions(-) diff --git a/stdlib/public/stubs/Unicode/Common/GraphemeData.h b/stdlib/public/stubs/Unicode/Common/GraphemeData.h index 09557c7713ca2..c521a51fd684b 100644 --- a/stdlib/public/stubs/Unicode/Common/GraphemeData.h +++ b/stdlib/public/stubs/Unicode/Common/GraphemeData.h @@ -2,7 +2,7 @@ // // This source file is part of the Swift.org open source project // -// Copyright (c) 2021 - 2023 Apple Inc. and the Swift project authors +// Copyright (c) 2021-2024 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information @@ -18,89 +18,89 @@ #include "swift/shims/SwiftStdint.h" -#define GRAPHEME_BREAK_DATA_COUNT 637 +#define GRAPHEME_BREAK_DATA_COUNT 638 -static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[637] = { - 0x3E00000, 0x400007F, 0x800000A9, 0xAD, 0x800000AE, 0x2DE00300, 0x20C00483, 0x25800591, - 0x200005BF, 0x202005C1, 0x202005C4, 0x200005C7, 0x40A00600, 0x21400610, 0x61C, 0x2280064B, - 0x20000670, 0x20C006D6, 0x400006DD, 0x20A006DF, 0x202006E7, 0x206006EA, 0x4000070F, 0x20000711, - 0x23400730, 0x214007A6, 0x210007EB, 0x200007FD, 0x20600816, 0x2100081B, 0x20400825, 0x20800829, - 0x20400859, 0x40200890, 0x20E00898, 0x22E008CA, 0x400008E2, 0x23E008E3, 0x60000903, 0x2000093A, - 0x6000093B, 0x2000093C, 0x6040093E, 0x20E00941, 0x60600949, 0x2000094D, 0x6020094E, 0x20C00951, - 0x20200962, 0x20000981, 0x60200982, 0x200009BC, 0x200009BE, 0x602009BF, 0x206009C1, 0x602009C7, - 
0x602009CB, 0x200009CD, 0x200009D7, 0x202009E2, 0x200009FE, 0x20200A01, 0x60000A03, 0x20000A3C, - 0x60400A3E, 0x20200A41, 0x20200A47, 0x20400A4B, 0x20000A51, 0x20200A70, 0x20000A75, 0x20200A81, - 0x60000A83, 0x20000ABC, 0x60400ABE, 0x20800AC1, 0x20200AC7, 0x60000AC9, 0x60200ACB, 0x20000ACD, - 0x20200AE2, 0x20A00AFA, 0x20000B01, 0x60200B02, 0x20000B3C, 0x20200B3E, 0x60000B40, 0x20600B41, - 0x60200B47, 0x60200B4B, 0x20000B4D, 0x20400B55, 0x20200B62, 0x20000B82, 0x20000BBE, 0x60000BBF, - 0x20000BC0, 0x60200BC1, 0x60400BC6, 0x60400BCA, 0x20000BCD, 0x20000BD7, 0x20000C00, 0x60400C01, - 0x20000C04, 0x20000C3C, 0x20400C3E, 0x60600C41, 0x20400C46, 0x20600C4A, 0x20200C55, 0x20200C62, - 0x20000C81, 0x60200C82, 0x20000CBC, 0x60000CBE, 0x20000CBF, 0x60200CC0, 0x20000CC2, 0x60200CC3, - 0x20000CC6, 0x60200CC7, 0x60200CCA, 0x20200CCC, 0x20200CD5, 0x20200CE2, 0x60000CF3, 0x20200D00, - 0x60200D02, 0x20200D3B, 0x20000D3E, 0x60200D3F, 0x20600D41, 0x60400D46, 0x60400D4A, 0x20000D4D, - 0x40000D4E, 0x20000D57, 0x20200D62, 0x20000D81, 0x60200D82, 0x20000DCA, 0x20000DCF, 0x60200DD0, - 0x20400DD2, 0x20000DD6, 0x60C00DD8, 0x20000DDF, 0x60200DF2, 0x20000E31, 0x60000E33, 0x20C00E34, - 0x20E00E47, 0x20000EB1, 0x60000EB3, 0x21000EB4, 0x20C00EC8, 0x20200F18, 0x20000F35, 0x20000F37, - 0x20000F39, 0x60200F3E, 0x21A00F71, 0x60000F7F, 0x20800F80, 0x20200F86, 0x21400F8D, 0x24600F99, - 0x20000FC6, 0x2060102D, 0x60001031, 0x20A01032, 0x20201039, 0x6020103B, 0x2020103D, 0x60201056, - 0x20201058, 0x2040105E, 0x20601071, 0x20001082, 0x60001084, 0x20201085, 0x2000108D, 0x2000109D, - 0x2040135D, 0x20401712, 0x60001715, 0x20201732, 0x60001734, 0x20201752, 0x20201772, 0x202017B4, - 0x600017B6, 0x20C017B7, 0x60E017BE, 0x200017C6, 0x602017C7, 0x214017C9, 0x200017DD, 0x2040180B, - 0x180E, 0x2000180F, 0x20201885, 0x200018A9, 0x20401920, 0x60601923, 0x20201927, 0x60401929, - 0x60201930, 0x20001932, 0x60A01933, 0x20401939, 0x20201A17, 0x60201A19, 0x20001A1B, 0x60001A55, - 0x20001A56, 0x60001A57, 0x20C01A58, 
0x20001A60, 0x20001A62, 0x20E01A65, 0x60A01A6D, 0x21201A73, - 0x20001A7F, 0x23C01AB0, 0x20601B00, 0x60001B04, 0x20C01B34, 0x60001B3B, 0x20001B3C, 0x60801B3D, - 0x20001B42, 0x60201B43, 0x21001B6B, 0x20201B80, 0x60001B82, 0x60001BA1, 0x20601BA2, 0x60201BA6, - 0x20201BA8, 0x60001BAA, 0x20401BAB, 0x20001BE6, 0x60001BE7, 0x20201BE8, 0x60401BEA, 0x20001BED, - 0x60001BEE, 0x20401BEF, 0x60201BF2, 0x60E01C24, 0x20E01C2C, 0x60201C34, 0x20201C36, 0x20401CD0, - 0x21801CD4, 0x60001CE1, 0x20C01CE2, 0x20001CED, 0x20001CF4, 0x60001CF7, 0x20201CF8, 0x27E01DC0, - 0x200B, 0x2000200C, 0x20200E, 0xC02028, 0x8000203C, 0x80002049, 0x1E02060, 0x240020D0, 0x80002122, - 0x80002139, 0x80A02194, 0x802021A9, 0x8020231A, 0x80002328, 0x80002388, 0x800023CF, 0x814023E9, - 0x804023F8, 0x800024C2, 0x802025AA, 0x800025B6, 0x800025C0, 0x806025FB, 0x80A02600, 0x81602607, - 0x8E202614, 0x8EA02690, 0x81402708, 0x80002714, 0x80002716, 0x8000271D, 0x80002721, 0x80002728, - 0x80202733, 0x80002744, 0x80002747, 0x8000274C, 0x8000274E, 0x80402753, 0x80002757, 0x80802763, - 0x80402795, 0x800027A1, 0x800027B0, 0x800027BF, 0x80202934, 0x80402B05, 0x80202B1B, 0x80002B50, - 0x80002B55, 0x20402CEF, 0x20002D7F, 0x23E02DE0, 0x20A0302A, 0x80003030, 0x8000303D, 0x20203099, - 0x80003297, 0x80003299, 0x2060A66F, 0x2120A674, 0x2020A69E, 0x2020A6F0, 0x2000A802, 0x2000A806, - 0x2000A80B, 0x6020A823, 0x2020A825, 0x6000A827, 0x2000A82C, 0x6020A880, 0x61E0A8B4, 0x2020A8C4, - 0x2220A8E0, 0x2000A8FF, 0x20E0A926, 0x2140A947, 0x6020A952, 0x2040A980, 0x6000A983, 0x2000A9B3, - 0x6020A9B4, 0x2060A9B6, 0x6020A9BA, 0x2020A9BC, 0x6040A9BE, 0x2000A9E5, 0x20A0AA29, 0x6020AA2F, - 0x2020AA31, 0x6020AA33, 0x2020AA35, 0x2000AA43, 0x2000AA4C, 0x6000AA4D, 0x2000AA7C, 0x2000AAB0, - 0x2040AAB2, 0x2020AAB7, 0x2020AABE, 0x2000AAC1, 0x6000AAEB, 0x2020AAEC, 0x6020AAEE, 0x6000AAF5, - 0x2000AAF6, 0x6020ABE3, 0x2000ABE5, 0x6020ABE6, 0x2000ABE8, 0x6020ABE9, 0x6000ABEC, 0x2000ABED, - 0x2000FB1E, 0x21E0FE00, 0x21E0FE20, 0xFEFF, 0x2020FF9E, 0x160FFF0, 
0x200101FD, 0x200102E0, - 0x20810376, 0x20410A01, 0x20210A05, 0x20610A0C, 0x20410A38, 0x20010A3F, 0x20210AE5, 0x20610D24, - 0x20210EAB, 0x20410EFD, 0x21410F46, 0x20610F82, 0x60011000, 0x20011001, 0x60011002, 0x21C11038, - 0x20011070, 0x20211073, 0x2041107F, 0x60011082, 0x604110B0, 0x206110B3, 0x602110B7, 0x202110B9, - 0x400110BD, 0x200110C2, 0x400110CD, 0x20411100, 0x20811127, 0x6001112C, 0x20E1112D, 0x60211145, - 0x20011173, 0x20211180, 0x60011182, 0x604111B3, 0x210111B6, 0x602111BF, 0x402111C2, 0x206111C9, - 0x600111CE, 0x200111CF, 0x6041122C, 0x2041122F, 0x60211232, 0x20011234, 0x60011235, 0x20211236, - 0x2001123E, 0x20011241, 0x200112DF, 0x604112E0, 0x20E112E3, 0x20211300, 0x60211302, 0x2021133B, - 0x2001133E, 0x6001133F, 0x20011340, 0x60611341, 0x60211347, 0x6041134B, 0x20011357, 0x60211362, - 0x20C11366, 0x20811370, 0x60411435, 0x20E11438, 0x60211440, 0x20411442, 0x60011445, 0x20011446, - 0x2001145E, 0x200114B0, 0x602114B1, 0x20A114B3, 0x600114B9, 0x200114BA, 0x602114BB, 0x200114BD, - 0x600114BE, 0x202114BF, 0x600114C1, 0x202114C2, 0x200115AF, 0x602115B0, 0x206115B2, 0x606115B8, - 0x202115BC, 0x600115BE, 0x202115BF, 0x202115DC, 0x60411630, 0x20E11633, 0x6021163B, 0x2001163D, - 0x6001163E, 0x2021163F, 0x200116AB, 0x600116AC, 0x200116AD, 0x602116AE, 0x20A116B0, 0x600116B6, - 0x200116B7, 0x2041171D, 0x20611722, 0x60011726, 0x20811727, 0x6041182C, 0x2101182F, 0x60011838, - 0x20211839, 0x20011930, 0x60811931, 0x60211937, 0x2021193B, 0x6001193D, 0x2001193E, 0x4001193F, - 0x60011940, 0x40011941, 0x60011942, 0x20011943, 0x604119D1, 0x206119D4, 0x202119DA, 0x606119DC, - 0x200119E0, 0x600119E4, 0x21211A01, 0x20A11A33, 0x60011A39, 0x40011A3A, 0x20611A3B, 0x20011A47, - 0x20A11A51, 0x60211A57, 0x20411A59, 0x40A11A84, 0x21811A8A, 0x60011A97, 0x20211A98, 0x60011C2F, - 0x20C11C30, 0x20A11C38, 0x60011C3E, 0x20011C3F, 0x22A11C92, 0x60011CA9, 0x20C11CAA, 0x60011CB1, - 0x20211CB2, 0x60011CB4, 0x20211CB5, 0x20A11D31, 0x20011D3A, 0x20211D3C, 0x20C11D3F, 0x40011D46, - 0x20011D47, 
0x60811D8A, 0x20211D90, 0x60211D93, 0x20011D95, 0x60011D96, 0x20011D97, 0x20211EF3, - 0x60211EF5, 0x20211F00, 0x40011F02, 0x60011F03, 0x60211F34, 0x20811F36, 0x60211F3E, 0x20011F40, - 0x60011F41, 0x20011F42, 0x1E13430, 0x20013440, 0x21C13447, 0x20816AF0, 0x20C16B30, 0x20016F4F, - 0x66C16F51, 0x20616F8F, 0x20016FE4, 0x60216FF0, 0x2021BC9D, 0x61BCA0, 0x25A1CF00, 0x22C1CF30, - 0x2001D165, 0x6001D166, 0x2041D167, 0x6001D16D, 0x2081D16E, 0xE1D173, 0x20E1D17B, 0x20C1D185, - 0x2061D1AA, 0x2041D242, 0x26C1DA00, 0x2621DA3B, 0x2001DA75, 0x2001DA84, 0x2081DA9B, 0x21C1DAA1, - 0x20C1E000, 0x2201E008, 0x20C1E01B, 0x2021E023, 0x2081E026, 0x2001E08F, 0x20C1E130, 0x2001E2AE, - 0x2061E2EC, 0x2061E4EC, 0x20C1E8D0, 0x20C1E944, 0x9FE1F000, 0x8041F10D, 0x8001F12F, 0x80A1F16C, - 0x8021F17E, 0x8001F18E, 0x8121F191, 0x8701F1AD, 0x81C1F201, 0x8001F21A, 0x8001F22F, 0x8101F232, - 0x8061F23C, 0xB621F249, 0x2081F3FB, 0xA7A1F400, 0xA121F546, 0x8FE1F680, 0x8161F774, 0x8541F7D5, - 0x8061F80C, 0x80E1F848, 0x80A1F85A, 0x80E1F888, 0x8A21F8AE, 0x85C1F90C, 0x8121F93C, 0xB701F947, - 0x3EE0000, 0x2BEE0020, 0xFEE0080, 0x3DEE0100, +static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[638] = { + 0x0, 0x2020FF9E, 0x60201C34, 0x604119D1, 0x20200D00, 0x20A0302A, 0x60211347, 0x2021BC9D, + 0x20000A3C, 0x202017B4, 0x800025C0, 0x6040A9BE, 0x20811127, 0x60411630, 0x20011D3A, 0x9FE1F000, + 0x20800829, 0x60000BBF, 0x20000F37, 0x21201A73, 0x80002049, 0x8000274E, 0x2000A82C, 0x6000AAEB, + 0x60011000, 0x60211232, 0x600114B9, 0x20811727, 0x21811A8A, 0x60211F34, 0x2001DA75, 0xA121F546, + 0x2280064B, 0x20C00951, 0x20000ACD, 0x20200C62, 0x60200DD0, 0x60201056, 0x60401929, 0x60201BA6, + 0x60001CF7, 0x80002328, 0x80002716, 0x80202934, 0x2020A69E, 0x6020A952, 0x2000AA4C, 0x2000ABE8, + 0x20410A38, 0x604110B0, 0x210111B6, 0x20E112E3, 0x60211440, 0x200115AF, 0x200116AD, 0x2021193B, + 0x60011A39, 0x22A11C92, 0x20011D95, 0x21C13447, 0x2081D16E, 0x2081E026, 0x81C1F201, 0x8A21F8AE, + 0x25800591, 0x20000711, 0x2000093A, 
0x602009C7, 0x20200A81, 0x20600B41, 0x60400C01, 0x60200CC3, + 0x20000D4D, 0x20C00E34, 0x24600F99, 0x2000109D, 0x2040180B, 0x60001A55, 0x60801B3D, 0x20001BED, + 0x60001CE1, 0x2000200C, 0x80002139, 0x804023F8, 0x8E202614, 0x80202733, 0x80402795, 0x80002B55, + 0x80003297, 0x2000A80B, 0x2220A8E0, 0x6020A9B4, 0x2020AA31, 0x2040AAB2, 0x2000AAF6, 0x2000FB1E, + 0x20810376, 0x20210EAB, 0x20011070, 0x400110BD, 0x20011173, 0x600111CE, 0x2001123E, 0x2001133E, + 0x20C11366, 0x2001145E, 0x600114BE, 0x202115BC, 0x6001163E, 0x200116B7, 0x20211839, 0x60011940, + 0x200119E0, 0x20A11A51, 0x20C11C30, 0x20211CB2, 0x20011D47, 0x60211EF5, 0x60011F41, 0x66C16F51, + 0x2001D165, 0x2061D1AA, 0x20C1E000, 0x2061E2EC, 0x8021F17E, 0x8061F23C, 0x8061F80C, 0x3EE0000, + 0xAD, 0x200005C7, 0x20A006DF, 0x200007FD, 0x22E008CA, 0x20E00941, 0x200009BC, 0x202009E2, + 0x20400A4B, 0x20800AC1, 0x60200B02, 0x20400B55, 0x60400BCA, 0x60600C41, 0x60000CBE, 0x20200CCC, + 0x60200D3F, 0x20000D81, 0x20000DDF, 0x21000EB4, 0x60000F7F, 0x20A01032, 0x20001082, 0x20201732, + 0x200017C6, 0x200018A9, 0x20401939, 0x20001A60, 0x60001B04, 0x20201B80, 0x20001BE6, 0x60E01C24, + 0x20401CD0, 0x20001CED, 0x27E01DC0, 0xC02028, 0x240020D0, 0x802021A9, 0x800023CF, 0x802025AA, + 0x80A02600, 0x81402708, 0x80002721, 0x80002747, 0x80002757, 0x800027B0, 0x80202B1B, 0x20002D7F, + 0x8000303D, 0x2060A66F, 0x2000A802, 0x2020A825, 0x61E0A8B4, 0x20E0A926, 0x6000A983, 0x6020A9BA, + 0x20A0AA29, 0x2020AA35, 0x2000AA7C, 0x2020AABE, 0x6020AAEE, 0x2000ABE5, 0x6000ABEC, 0x21E0FE20, + 0x200101FD, 0x20210A05, 0x20210AE5, 0x21410F46, 0x60011002, 0x2041107F, 0x602110B7, 0x400110CD, + 0x20E1112D, 0x60011182, 0x402111C2, 0x6041122C, 0x60011235, 0x200112DF, 0x60211302, 0x20011340, + 0x20011357, 0x60411435, 0x60011445, 0x602114B1, 0x602114BB, 0x600114C1, 0x206115B2, 0x202115BF, + 0x6021163B, 0x200116AB, 0x20A116B0, 0x20611722, 0x2101182F, 0x60811931, 0x2001193E, 0x60011942, + 0x202119DA, 0x21211A01, 0x20611A3B, 0x20411A59, 0x20211A98, 0x60011C3E, 0x20C11CAA, 
0x20211CB5, + 0x20C11D3F, 0x20211D90, 0x20011D97, 0x40011F02, 0x60211F3E, 0x1E13430, 0x20C16B30, 0x20016FE4, + 0x25A1CF00, 0x2041D167, 0x20E1D17B, 0x26C1DA00, 0x2081DA9B, 0x20C1E01B, 0x20C1E130, 0x20C1E8D0, + 0x8001F12F, 0x8121F191, 0x8001F22F, 0x2081F3FB, 0x8161F774, 0x80A1F85A, 0x8121F93C, 0xFEE0080, + 0x400007F, 0x2DE00300, 0x202005C1, 0x21400610, 0x20C006D6, 0x206006EA, 0x214007A6, 0x2100081B, + 0x40200890, 0x23E008E3, 0x2000093C, 0x2000094D, 0x20000981, 0x602009BF, 0x200009CD, 0x20200A01, + 0x20200A41, 0x20200A70, 0x20000ABC, 0x60000AC9, 0x20A00AFA, 0x20200B3E, 0x60200B4B, 0x20000B82, + 0x60200BC1, 0x20000BD7, 0x20000C3C, 0x20600C4A, 0x60200C82, 0x60200CC0, 0x60200CC7, 0x20200CE2, + 0x20200D3B, 0x60400D46, 0x20000D57, 0x20000DCA, 0x20000DD6, 0x20000E31, 0x20000EB1, 0x20200F18, + 0x60200F3E, 0x20200F86, 0x2060102D, 0x6020103B, 0x2040105E, 0x20201085, 0x20401712, 0x20201752, + 0x20C017B7, 0x214017C9, 0x2000180F, 0x60601923, 0x20001932, 0x60201A19, 0x60001A57, 0x20E01A65, + 0x23C01AB0, 0x60001B3B, 0x60201B43, 0x60001BA1, 0x60001BAA, 0x20201BE8, 0x20401BEF, 0x20E01C2C, + 0x20201C36, 0x21801CD4, 0x20C01CE2, 0x20001CF4, 0x20201CF8, 0x200B, 0x20200E, 0x8000203C, + 0x1E02060, 0x80002122, 0x80A02194, 0x8020231A, 0x80002388, 0x814023E9, 0x800024C2, 0x800025B6, + 0x806025FB, 0x81602607, 0x8EA02690, 0x80002714, 0x8000271D, 0x80002728, 0x80002744, 0x8000274C, + 0x80402753, 0x80802763, 0x800027A1, 0x800027BF, 0x80402B05, 0x80002B50, 0x20402CEF, 0x23E02DE0, + 0x80003030, 0x20203099, 0x80003299, 0x2120A674, 0x2020A6F0, 0x2000A806, 0x6020A823, 0x6000A827, + 0x6020A880, 0x2020A8C4, 0x2000A8FF, 0x2140A947, 0x2040A980, 0x2000A9B3, 0x2060A9B6, 0x2020A9BC, + 0x2000A9E5, 0x6020AA2F, 0x6020AA33, 0x2000AA43, 0x6000AA4D, 0x2000AAB0, 0x2020AAB7, 0x2000AAC1, + 0x2020AAEC, 0x6000AAF5, 0x6020ABE3, 0x6020ABE6, 0x6020ABE9, 0x2000ABED, 0x21E0FE00, 0xFEFF, + 0x160FFF0, 0x200102E0, 0x20410A01, 0x20610A0C, 0x20010A3F, 0x20610D24, 0x20410EFD, 0x20610F82, + 0x20011001, 0x21C11038, 0x20211073, 
0x60011082, 0x206110B3, 0x202110B9, 0x200110C2, 0x20411100, + 0x6001112C, 0x60211145, 0x20211180, 0x604111B3, 0x602111BF, 0x206111C9, 0x200111CF, 0x2041122F, + 0x20011234, 0x20211236, 0x20011241, 0x604112E0, 0x20211300, 0x2021133B, 0x6001133F, 0x60611341, + 0x6041134B, 0x60211362, 0x20811370, 0x20E11438, 0x20411442, 0x20011446, 0x200114B0, 0x20A114B3, + 0x200114BA, 0x200114BD, 0x202114BF, 0x202114C2, 0x602115B0, 0x606115B8, 0x600115BE, 0x202115DC, + 0x20E11633, 0x2001163D, 0x2021163F, 0x600116AC, 0x602116AE, 0x600116B6, 0x2041171D, 0x60011726, + 0x6041182C, 0x60011838, 0x20011930, 0x60211937, 0x6001193D, 0x4001193F, 0x40011941, 0x20011943, + 0x206119D4, 0x606119DC, 0x600119E4, 0x20A11A33, 0x40011A3A, 0x20011A47, 0x60211A57, 0x40A11A84, + 0x60011A97, 0x60011C2F, 0x20A11C38, 0x20011C3F, 0x60011CA9, 0x60011CB1, 0x60011CB4, 0x20A11D31, + 0x20211D3C, 0x40011D46, 0x60811D8A, 0x60211D93, 0x60011D96, 0x20211EF3, 0x20211F00, 0x60011F03, + 0x20811F36, 0x20011F40, 0x20011F42, 0x20013440, 0x20816AF0, 0x20016F4F, 0x20616F8F, 0x60216FF0, + 0x61BCA0, 0x22C1CF30, 0x6001D166, 0x6001D16D, 0xE1D173, 0x20C1D185, 0x2041D242, 0x2621DA3B, + 0x2001DA84, 0x21C1DAA1, 0x2201E008, 0x2021E023, 0x2001E08F, 0x2001E2AE, 0x2061E4EC, 0x20C1E944, + 0x8041F10D, 0x80A1F16C, 0x8001F18E, 0x8701F1AD, 0x8001F21A, 0x8101F232, 0xB621F249, 0xA7A1F400, + 0x8FE1F680, 0x8541F7D5, 0x80E1F848, 0x80E1F888, 0x85C1F90C, 0xB701F947, 0x2BEE0020, 0x3DEE0100, + 0x3E00000, 0x800000A9, 0x800000AE, 0x20C00483, 0x200005BF, 0x202005C4, 0x40A00600, 0x61C, + 0x20000670, 0x400006DD, 0x202006E7, 0x4000070F, 0x23400730, 0x210007EB, 0x20600816, 0x20400825, + 0x20400859, 0x20E00898, 0x400008E2, 0x60000903, 0x6000093B, 0x6040093E, 0x60600949, 0x6020094E, + 0x20200962, 0x60200982, 0x200009BE, 0x206009C1, 0x602009CB, 0x200009D7, 0x200009FE, 0x60000A03, + 0x60400A3E, 0x20200A47, 0x20000A51, 0x20000A75, 0x60000A83, 0x60400ABE, 0x20200AC7, 0x60200ACB, + 0x20200AE2, 0x20000B01, 0x20000B3C, 0x60000B40, 0x60200B47, 0x20000B4D, 0x20200B62, 
0x20000BBE, + 0x20000BC0, 0x60400BC6, 0x20000BCD, 0x20000C00, 0x20000C04, 0x20400C3E, 0x20400C46, 0x20200C55, + 0x20000C81, 0x20000CBC, 0x20000CBF, 0x20000CC2, 0x20000CC6, 0x60200CCA, 0x20200CD5, 0x60000CF3, + 0x60200D02, 0x20000D3E, 0x20600D41, 0x60400D4A, 0x40000D4E, 0x20200D62, 0x60200D82, 0x20000DCF, + 0x20400DD2, 0x60C00DD8, 0x60200DF2, 0x60000E33, 0x20E00E47, 0x60000EB3, 0x20C00EC8, 0x20000F35, + 0x20000F39, 0x21A00F71, 0x20800F80, 0x21400F8D, 0x20000FC6, 0x60001031, 0x20201039, 0x2020103D, + 0x20201058, 0x20601071, 0x60001084, 0x2000108D, 0x2040135D, 0x60001715, 0x60001734, 0x20201772, + 0x600017B6, 0x60E017BE, 0x602017C7, 0x200017DD, 0x180E, 0x20201885, 0x20401920, 0x20201927, + 0x60201930, 0x60A01933, 0x20201A17, 0x20001A1B, 0x20001A56, 0x20C01A58, 0x20001A62, 0x60A01A6D, + 0x20001A7F, 0x20601B00, 0x20C01B34, 0x20001B3C, 0x20001B42, 0x21001B6B, 0x60001B82, 0x20601BA2, + 0x20201BA8, 0x20401BAB, 0x60001BE7, 0x60401BEA, 0x60001BEE, 0x60201BF2, }; static const __swift_uint16_t _swift_stdlib_linkingConsonant_ranks[165] = { diff --git a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp index bd7475ba9c614..07d101d4566e7 100644 --- a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp +++ b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp @@ -18,48 +18,40 @@ #include "swift/shims/UnicodeData.h" #include + SWIFT_RUNTIME_STDLIB_INTERNAL __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) { #if !SWIFT_STDLIB_ENABLE_UNICODE_DATA swift::swift_abortDisabledUnicodeSupport(); #else - auto low = 0; - auto high = GRAPHEME_BREAK_DATA_COUNT - 1; - - while (high >= low) { - auto idx = low + (high - low) / 2; - - auto entry = _swift_stdlib_graphemeBreakProperties[idx]; - + auto index = 1; //0th element is a dummy element + while (index < GRAPHEME_BREAK_DATA_COUNT) { + auto entry = _swift_stdlib_graphemeBreakProperties[index]; + // Shift the enum and range count out of the value. 
auto lower = (entry << 11) >> 11; - + // Shift the enum out first, then shift out the scalar value. auto upper = lower + ((entry << 3) >> 24); - + // Shift everything out. auto enumValue = (__swift_uint8_t)(entry >> 29); - + // Special case: extendedPictographic who used an extra bit for the range. if (enumValue == 5) { upper = lower + ((entry << 2) >> 23); } - + if (scalar >= lower && scalar <= upper) { return enumValue; } - - if (scalar > upper) { - low = idx + 1; - continue; - } - - if (scalar < lower) { - high = idx - 1; - continue; - } + + //If we want the left child of the current node in our virtual tree, + //that's at index * 2, if we want the right child it's at (index * 2) + 1 + //bsearch branches are inherently unpredictable so ideally we want it to + //generate a csel here + index = 2 * index + (scalar < lower ? 0 : 1); } - // If we made it out here, then our scalar was not found in the grapheme // array (this occurs when a scalar doesn't map to any grapheme break // property). Return the max value here to indicate .any. diff --git a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift index d8b2aae407ee2..64ba5f39264c5 100644 --- a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift +++ b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift @@ -48,6 +48,16 @@ extension Unicode { } } +struct GraphemeBreakEntry : Comparable { + static func < (lhs: GraphemeBreakEntry, rhs: GraphemeBreakEntry) -> Bool { + return lhs.index < rhs.index + } + + let index: Int + let range: ClosedRange + let property: Unicode.GraphemeBreakProperty +} + // Given a path to one of the Unicode data files, reads it and returns the // unflattened list of scalar & grapheme break property. // @@ -115,7 +125,7 @@ func getGraphemeBreakPropertyData( // Takes the flattened data and writes it as a static C array. 
func emit( - _ data: [(ClosedRange, Unicode.GraphemeBreakProperty)], + _ data: [GraphemeBreakEntry], into result: inout String ) { result += """ @@ -125,7 +135,9 @@ func emit( """ - formatCollection(data, into: &result) { (range, gbp) -> String in + formatCollection(data, into: &result) { (entry) -> String in + let range = entry.range + let gbp = entry.property // Our value uses the 21 bits to represent the scalar, 8 bits to represent // the range's count, and finally the last three bits to represent the // grapheme break property enum. @@ -168,13 +180,13 @@ func generateGraphemeBreakProperty() { var result = readFile("Input/GraphemeData.h") let baseData = getGraphemeBreakPropertyData( - for: "Data/GraphemeBreakProperty.txt" + for: "Data/15/GraphemeBreakProperty.txt" ) - let emojiData = getGraphemeBreakPropertyData(for: "Data/emoji-data.txt") + let emojiData = getGraphemeBreakPropertyData(for: "Data/15/emoji-data.txt") let flattened = flatten(baseData + emojiData) - var data: [(ClosedRange, Unicode.GraphemeBreakProperty)] = [] + var data: [GraphemeBreakEntry] = [] for (range, gbp) in flattened { guard range.count < 0x200 else { @@ -185,14 +197,24 @@ func generateGraphemeBreakProperty() { continue } - data.append((range, gbp)) + data.append(GraphemeBreakEntry( + index: data.count, + range: range, + property: gbp + )) } + data = eytzingerize(data, dummy: GraphemeBreakEntry( + index: 0, + range: 0...0, + property: .control + )) + emit(data, into: &result) // Handle the CLDR grapheme breaking rules: - let indicSyllabicCategory = readFile("Data/IndicSyllabicCategory.txt") + let indicSyllabicCategory = readFile("Data/15/IndicSyllabicCategory.txt") let consonants = getLinkingConsonant(from: indicSyllabicCategory) diff --git a/utils/gen-unicode-data/Sources/GenUtils/Emit.swift b/utils/gen-unicode-data/Sources/GenUtils/Emit.swift index 66c13daa13552..cb3d2f18e11a9 100644 --- a/utils/gen-unicode-data/Sources/GenUtils/Emit.swift +++ 
b/utils/gen-unicode-data/Sources/GenUtils/Emit.swift @@ -10,6 +10,22 @@ // //===----------------------------------------------------------------------===// +func _eytzingerize(_ collection: C, result: inout [C.Element], sourceIndex: Int, resultIndex: Int) -> Int where C.Element: Comparable, C.Index == Int { + var sourceIndex = sourceIndex + if resultIndex < result.count { + sourceIndex = _eytzingerize(collection, result: &result, sourceIndex: sourceIndex, resultIndex: 2 * resultIndex) + result[resultIndex] = collection[sourceIndex] + sourceIndex = _eytzingerize(collection, result: &result, sourceIndex: sourceIndex + 1, resultIndex: 2 * resultIndex + 1) + } + return sourceIndex +} + +public func eytzingerize(_ collection: C, dummy: C.Element) -> [C.Element] where C.Element: Comparable, C.Index == Int { + var result = Array(repeating: dummy, count: collection.count + 1) + _ = _eytzingerize(collection, result: &result, sourceIndex: 0, resultIndex: 1) + return result +} + public func emitCollection( _ collection: C, name: String, From ce4bd2e6b60068c926a0ae401468411da806d18a Mon Sep 17 00:00:00 2001 From: David Smith Date: Thu, 15 Feb 2024 23:15:30 -0800 Subject: [PATCH 2/5] Combine conditionals --- stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp index 07d101d4566e7..430df5f2170de 100644 --- a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp +++ b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp @@ -42,15 +42,15 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) upper = lower + ((entry << 2) >> 23); } - if (scalar >= lower && scalar <= upper) { - return enumValue; - } - //If we want the left child of the current node in our virtual tree, //that's at index * 2, if we want the right child it's at (index * 2) + 1 - //bsearch branches are inherently unpredictable so 
ideally we want it to - //generate a csel here - index = 2 * index + (scalar < lower ? 0 : 1); + if (scalar < lower) { + index = 2 * index; + } else if (scalar <= upper) { + return enumValue; + } else { + index = 2 * index + 1; + } } // If we made it out here, then our scalar was not found in the grapheme // array (this occurs when a scalar doesn't map to any grapheme break From 5c8ea26e3750961a8db4053d3557221f58b73fe1 Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 16 Feb 2024 09:25:30 -0800 Subject: [PATCH 3/5] A few more microoptimizations --- stdlib/public/core/StringGraphemeBreaking.swift | 16 +++++++++++----- stdlib/public/core/UnicodeBreakProperty.swift | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index e1a48b85d5cec..c5fcca4a0cfd0 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -669,8 +669,12 @@ extension _GraphemeBreakingState { } let x = Unicode._GraphemeBreakProperty(from: scalar1) - let y = Unicode._GraphemeBreakProperty(from: scalar2) - + + //GB4 handled here because we don't need to know `y` for this csae + if x == .control { + return true + } + // This variable and the defer statement help toggle the isInEmojiSequence // state variable to false after every decision of 'shouldBreak'. 
If we // happen to see a rhs .extend or .zwj, then it's a signal that we should @@ -684,6 +688,8 @@ extension _GraphemeBreakingState { self.isInEmojiSequence = enterEmojiSequence self.isInIndicSequence = enterIndicSequence } + + let y = Unicode._GraphemeBreakProperty(from: scalar2) switch (x, y) { @@ -692,9 +698,9 @@ extension _GraphemeBreakingState { case (.any, .any): return true - // GB4 - case (.control, _): - return true + // GB4 (handled above) + // case (.control, _): + // return true // GB5 case (_, .control): diff --git a/stdlib/public/core/UnicodeBreakProperty.swift b/stdlib/public/core/UnicodeBreakProperty.swift index 4aabfd0b1a095..c0f97694d6c51 100644 --- a/stdlib/public/core/UnicodeBreakProperty.swift +++ b/stdlib/public/core/UnicodeBreakProperty.swift @@ -28,6 +28,7 @@ extension Unicode { case v case zwj + @inline(__always) init(from scalar: Unicode.Scalar) { switch scalar.value { // Some fast paths for ascii characters... From 2ade969cd8bd5f54a470aefb006dc78f9d726460 Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 16 Feb 2024 11:00:47 -0800 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Michael Ilseman --- stdlib/public/core/StringGraphemeBreaking.swift | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index c5fcca4a0cfd0..ffeeb4339c645 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -670,7 +670,7 @@ extension _GraphemeBreakingState { let x = Unicode._GraphemeBreakProperty(from: scalar1) - //GB4 handled here because we don't need to know `y` for this csae + // GB4 handled here because we don't need to know `y` for this case if x == .control { return true } @@ -698,9 +698,7 @@ extension _GraphemeBreakingState { case (.any, .any): return true - // GB4 (handled above) - // case (.control, _): - // return true + // (GB4 is handled 
above) // GB5 case (_, .control): From b206358ff5d78852af0d9ca2170cbc421c349566 Mon Sep 17 00:00:00 2001 From: David Smith Date: Fri, 16 Feb 2024 11:09:50 -0800 Subject: [PATCH 5/5] Add a comment describing what I'm doing --- utils/gen-unicode-data/Sources/GenUtils/Emit.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/gen-unicode-data/Sources/GenUtils/Emit.swift b/utils/gen-unicode-data/Sources/GenUtils/Emit.swift index cb3d2f18e11a9..a791b92d6006c 100644 --- a/utils/gen-unicode-data/Sources/GenUtils/Emit.swift +++ b/utils/gen-unicode-data/Sources/GenUtils/Emit.swift @@ -20,6 +20,10 @@ func _eytzingerize(_ collection: C, result: inout [C.Element], so return sourceIndex } +/* + Takes a sorted collection and reorders it to an array-encoded binary search tree, as originally developed by Michaël Eytzinger in the 16th century. + This allows binary searching the array later to touch roughly 4x fewer cachelines, significantly speeding it up. + */ public func eytzingerize(_ collection: C, dummy: C.Element) -> [C.Element] where C.Element: Comparable, C.Index == Int { var result = Array(repeating: dummy, count: collection.count + 1) _ = _eytzingerize(collection, result: &result, sourceIndex: 0, resultIndex: 1)