diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 18bd5ab0e2af..c0ccc1cff49c 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -3,7 +3,7 @@ // This file is part of [zig](https://ziglang.org/), which is MIT licensed. // The MIT license requires this copyright notice to be included in all copies // and substantial portions of the software. -const std = @import("./std.zig"); +const std = @import("std"); const builtin = @import("builtin"); const assert = std.debug.assert; const testing = std.testing; @@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 { /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { - return switch (@clz(u8, ~first_byte)) { - 0 => 1, - 2 => 2, - 3 => 3, - 4 => 4, + // The switch is optimized much better than a "smart" approach using @clz + return switch (first_byte) { + 0b0000_0000...0b0111_1111 => 1, // 0xxxxxxx: single-byte (ASCII) + 0b1100_0000...0b1101_1111 => 2, // 110xxxxx: start of a 2-byte sequence + 0b1110_0000...0b1110_1111 => 3, // 1110xxxx: start of a 3-byte sequence + 0b1111_0000...0b1111_0111 => 4, // 11110xxx: start of a 4-byte sequence else => error.Utf8InvalidStartByte, }; } @@ -106,6 +107,7 @@ const Utf8Decode3Error = error{ Utf8OverlongEncoding, Utf8EncodesSurrogateHalf, }; + pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 { assert(bytes.len == 3); assert(bytes[0] & 0b11110000 == 0b11100000); @@ -153,6 +155,50 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { return value; } +/// Returns true if the given unicode codepoint can be encoded in UTF-8, i.e. it is not a surrogate and does not exceed 0x10FFFF. +pub fn isValidCodepoint(value: u21) bool { + return switch (value) { + 0xD800...0xDFFF => false, // Surrogates range + 0x110000...0x1FFFFF => false, // Above the maximum codepoint value + else => true, + }; +} + +/// Returns the number of unicode codepoints encoded in the UTF-8 slice `s` +/// (any byte slice, not only a string literal). +/// Errors: Utf8InvalidStartByte, TruncatedInput, or any error returned by utf8Decode. 
+pub fn utf8CountCodepoints(s: []const u8) !usize { + var len: usize = 0; + + const N = @sizeOf(usize); + const MASK = 0x80 * (std.math.maxInt(usize) / 0xff); // 0x80 repeated in every byte of a usize + + var i: usize = 0; + while (i < s.len) { + // Fast path: consume N bytes at a time while no byte in the word has its high bit set (pure ASCII) + while (i + N <= s.len) : (i += N) { + const v = mem.readIntNative(usize, s[i..][0..N]); + if (v & MASK != 0) break; + len += N; + } + + if (i < s.len) { + const n = try utf8ByteSequenceLength(s[i]); + if (i + n > s.len) return error.TruncatedInput; // the sequence would run past the end of s + + switch (n) { + 1 => {}, // ASCII, no validation needed + else => _ = try utf8Decode(s[i .. i + n]), + } + + i += n; + len += 1; + } + } + + return len; +} + pub fn utf8ValidateSlice(s: []const u8) bool { var i: usize = 0; while (i < s.len) { @@ -757,3 +803,31 @@ test "utf8ToUtf16LeStringLiteral" { testing.expect(utf16[2] == 0); } } + +fn testUtf8CountCodepoints() !void { + testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij")); + testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö")); + testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは")); + // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80")); +} + +test "utf8 count codepoints" { + try testUtf8CountCodepoints(); + comptime testUtf8CountCodepoints() catch unreachable; +} + +fn testisValidCodepoint() !void { + testing.expect(isValidCodepoint('e')); + testing.expect(isValidCodepoint('ë')); + testing.expect(isValidCodepoint('は')); + testing.expect(isValidCodepoint(0xe000)); + testing.expect(isValidCodepoint(0x10ffff)); + testing.expect(!isValidCodepoint(0xd800)); + testing.expect(!isValidCodepoint(0xdfff)); + testing.expect(!isValidCodepoint(0x110000)); +} + +test "utf8 valid codepoint" { + try testisValidCodepoint(); + comptime testisValidCodepoint() catch unreachable; +} diff --git a/lib/std/unicode/UTF-8-demo.txt b/lib/std/unicode/UTF-8-demo.txt new file mode 100644 index 000000000000..6017f34d099b --- /dev/null +++ 
b/lib/std/unicode/UTF-8-demo.txt @@ -0,0 +1,212 @@ + +UTF-8 encoded sample plain-text file +‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ + +Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25 + + +The ASCII compatible UTF-8 encoding used in this plain-text file +is defined in Unicode, ISO 10646-1, and RFC 2279. + + +Using Unicode/UTF-8, you can write in emails and source code things such as + +Mathematics and sciences: + + ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫ + ⎪⎢⎜│a²+b³ ⎟⎥⎪ + ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪ + ⎪⎢⎜⎷ c₈ ⎟⎥⎪ + ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬ + ⎪⎢⎜ ∞ ⎟⎥⎪ + ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪ + ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪ + 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭ + +Linguistics and dictionaries: + + ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn + Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ] + +APL: + + ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈ + +Nicer typography in plain text files: + + ╔══════════════════════════════════════════╗ + ║ ║ + ║ • ‘single’ and “double” quotes ║ + ║ ║ + ║ • Curly apostrophes: “We’ve been here” ║ + ║ ║ + ║ • Latin-1 apostrophe and accents: '´` ║ + ║ ║ + ║ • ‚deutsche‘ „Anführungszeichen“ ║ + ║ ║ + ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║ + ║ ║ + ║ • ASCII safety test: 1lI|, 0OD, 8B ║ + ║ ╭─────────╮ ║ + ║ • the euro symbol: │ 14.95 € │ ║ + ║ ╰─────────╯ ║ + ╚══════════════════════════════════════════╝ + +Combining characters: + + STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑ + +Greek (in Polytonic): + + The Greek anthem: + + Σὲ γνωρίζω ἀπὸ τὴν κόψη + τοῦ σπαθιοῦ τὴν τρομερή, + σὲ γνωρίζω ἀπὸ τὴν ὄψη + ποὺ μὲ βία μετράει τὴ γῆ. + + ᾿Απ᾿ τὰ κόκκαλα βγαλμένη + τῶν ῾Ελλήνων τὰ ἱερά + καὶ σὰν πρῶτα ἀνδρειωμένη + χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά! 
+ + From a speech of Demosthenes in the 4th century BC: + + Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, + ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς + λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ + τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿ + εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ + πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν + οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι, + οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν + ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον + τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι + γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν + προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους + σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ + τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ + τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς + τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον. + + Δημοσθένους, Γ´ ᾿Ολυνθιακὸς + +Georgian: + + From a Unicode conference invitation: + + გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო + კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს, + ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს + ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი, + ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება + ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში, + ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში. + +Russian: + + From a Unicode conference invitation: + + Зарегистрируйтесь сейчас на Десятую Международную Конференцию по + Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии. + Конференция соберет широкий круг экспертов по вопросам глобального + Интернета и Unicode, локализации и интернационализации, воплощению и + применению Unicode в различных операционных системах и программных + приложениях, шрифтах, верстке и многоязычных компьютерных системах. 
+ +Thai (UCS Level 2): + + Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese + classic 'San Gua'): + + [----------------------------|------------------------] + ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่ + สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา + ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา + โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ + เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ + ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ + พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้ + ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ + + (The above is a two-column text. If combining characters are handled + correctly, the lines of the second column should be aligned with the + | character above.) + +Ethiopian: + + Proverbs in the Amharic language: + + ሰማይ አይታረስ ንጉሥ አይከሰስ። + ብላ ካለኝ እንደአባቴ በቆመጠኝ። + ጌጥ ያለቤቱ ቁምጥና ነው። + ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው። + የአፍ ወለምታ በቅቤ አይታሽም። + አይጥ በበላ ዳዋ ተመታ። + ሲተረጉሙ ይደረግሙ። + ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል። + ድር ቢያብር አንበሳ ያስር። + ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም። + እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም። + የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ። + ሥራ ከመፍታት ልጄን ላፋታት። + ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል። + የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ። + ተንጋሎ ቢተፉ ተመልሶ ባፉ። + ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው። + እግርህን በፍራሽህ ልክ ዘርጋ። + +Runes: + + ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ + + (Old English, which transcribed into Latin reads 'He cwaeth that he + bude thaem lande northweardum with tha Westsae.' 
and means 'He said + that he lived in the northern land near the Western Sea.') + +Braille: + + ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌ + + ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞ + ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎ + ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂ + ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙ + ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ + ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲ + + ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ + + ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹ + ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞ + ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕ + ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹ + ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎ + ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎ + ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳ + ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞ + ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ + + (The first couple of paragraphs of "A Christmas Carol" by Dickens) + +Compact font selection example text: + + ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 + abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ + –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд + ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა + +Greetings in various languages: + + Hello world, Καλημέρα κόσμε, コンニチハ + +Box drawing alignment tests: █ + ▉ + ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳ + ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳ + ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳ + ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳ + ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎ + ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏ + ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█ + ▝▀▘▙▄▟ diff --git a/lib/std/unicode/benchmark.zig b/lib/std/unicode/benchmark.zig new file mode 100644 index 000000000000..1d1205b99b3a --- /dev/null +++ b/lib/std/unicode/benchmark.zig @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2015-2020 Zig Contributors +// This 
file is part of [zig](https://ziglang.org/), which is MIT licensed. +// The MIT license requires this copyright notice to be included in all copies +// and substantial portions of the software. +const std = @import("std"); +const builtin = std.builtin; // NOTE(review): not referenced in the visible part of this file +const time = std.time; +const unicode = std.unicode; + +const Timer = time.Timer; + +const N = 1_000_000; // number of timed calls made per benchmark + +const KiB = 1024; +const MiB = 1024 * KiB; +const GiB = 1024 * MiB; // NOTE(review): not referenced in the visible part of this file + +const ResultCount = struct { + count: usize, // value returned by the last utf8CountCodepoints call + throughput: u64, // input bytes processed per second +}; + +const boxes = @embedFile("boxes.txt"); +const glasses = @embedFile("glasses.txt"); +const quickbrown = @embedFile("quickbrown.txt"); +const utf8demo = @embedFile("UTF-8-demo.txt"); + +fn benchmarkCodepointCount(buf: []const u8) !ResultCount { + var timer = try Timer.start(); + + const bytes = N * buf.len; // total bytes scanned across all N calls + + const start = timer.lap(); + var i: usize = 0; + var r: usize = undefined; // assigned on every iteration; N > 0, so it is set before being returned + while (i < N) : (i += 1) { + r = try @call( + .{ .modifier = .never_inline }, // forbids inlining this call; presumably keeps the optimizer from folding the loop away (confirm) + unicode.utf8CountCodepoints, + .{buf}, + ); + } + const end = timer.read(); + + const elapsed_s = @intToFloat(f64, end - start) / time.ns_per_s; + const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s); + + return ResultCount{ .count = r, .throughput = throughput }; +} + +pub fn main() !void { + const stdout = std.io.getStdOut().writer(); + const allocator = std.heap.page_allocator; + + const args = try std.process.argsAlloc(allocator); + defer std.process.argsFree(allocator, args); + + if (args.len > 1) { // benchmark every string passed on the command line before the built-in cases + var i: usize = 1; + while (i < args.len) : (i += 1) { + const s = args[i]; + try stdout.print("Benchmark '{}' string\n", .{s}); + const result = try benchmarkCodepointCount(s); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + } + + try stdout.print("Benchmark short ASCII strings\n", .{}); + { + const result = try benchmarkCodepointCount("abc"); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), 
result.count }); + } + + try stdout.print("Benchmark short Unicode strings\n", .{}); + { + const result = try benchmarkCodepointCount("ŌŌŌ"); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark pure ASCII strings\n", .{}); + { + const result = try benchmarkCodepointCount("hello" ** 16); // 80 bytes, all ASCII + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark pure Unicode strings\n", .{}); + { + const result = try benchmarkCodepointCount("こんにちは" ** 16); // 80 codepoints, none of them ASCII + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark mixed ASCII/Unicode strings\n", .{}); + { + const result = try benchmarkCodepointCount("Hyvää huomenta" ** 16); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark 'boxes.txt'\n", .{}); + { + const result = try benchmarkCodepointCount(boxes); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark 'glasses.txt'\n", .{}); + { + const result = try benchmarkCodepointCount(glasses); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark 'quickbrown.txt'\n", .{}); + { + const result = try benchmarkCodepointCount(quickbrown); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("Benchmark 'UTF-8-demo.txt'\n", .{}); + { + const result = try benchmarkCodepointCount(utf8demo); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } +} diff --git a/lib/std/unicode/boxes.txt b/lib/std/unicode/boxes.txt new file mode 100644 index 000000000000..f7c03dd69589 --- /dev/null +++ 
b/lib/std/unicode/boxes.txt @@ -0,0 +1,341 @@ +Single width, hollow. +┌─┐ )0lqk +│ │ )0x x +└─┘ )0mqj +┌─┐ +│ │ +└─┘ + +Single width, single fill. +┌┬┐ )0lwk +├┼┤ )0tnu +└┴┘ )0mvj +┌┬┐ +├┼┤ +└┴┘ + +Double width, hollow. +┏━┓ )0  +┃ ┃ )0  +┗━┛ )0  +╔═╗ +║ ║ +╚═╝ + +Double width, double fill. +┏┳┓ )0  +┣╋┫ )0  +┗┻┛ )0  +╔╦╗ +╠╬╣ +╚╩╝ + +Double width, single fill. +┏┯┓ )0  +┠┼┨ )0 n  +┗┷┛ )0  +╔╤╗ +╟┼╢ +╚╧╝ + +Single width, double fill. +┌┰┐ )0l k +┝╋┥ )0  +└┸┘ )0m j +┌╥┐ +╞╬╡ +└╨┘ + +Single width, mixed fill (double horizontal, single vertical). +┌┬┐ )0lwk +┝┿┥ )0  +└┴┘ )0mvj +┌┬┐ +╞╪╡ +└┴┘ + +Double width, mixed fill (double vertical, single horizontal). +┏┳┓ )0  +┠╂┨ )0  +┗┻┛ )0  +╔╦╗ +╟╫╢ +╚╩╝ + +Double horizontal, single vertical. +┍┑ +┕┙ +╒╕ +╘╛ + +Double vertical, single horizontal. +┎┒ +┖┚ +╓╖ +╙╜ + +Single width, double, triple and quadruple dash. +┌╌╌┐ ┌┄┄┐ ┌┈┈┐ +╎ ╎ ┆ ┆ ┊ ┊ +╎ ╎ ┆ ┆ ┊ ┊ +└╌╌┘ └┄┄┘ └┈┈┘ + +Double width, double, triple and quadruple dash. +┏╍╍┓ ┏┅┅┓ ┏┉┉┓ +╏ ╏ ┇ ┇ ┋ ┋ +╏ ╏ ┇ ┇ ┋ ┋ +┗╍╍┛ ┗┅┅┛ ┗┉┉┛ + +One single, two double lines meet. +┢┪ ┲┱ +┡┩ ┺┹ + +One double, two single lines meet. +┞┦ ┭┮ +┟┧ ┵┶ + +One single, three double lines meet. +╇ ╉╊ +╈ + +One double, three single lines meet. +╁ ┾┽ +╀ + +Two double, two single lines meet. +╆╅ +╄╃ + +Mixed width, starting, ending and changing width mid-character. +╷ ╻ ╶╼╸ +╽ ╿ ╺╾╴ +╹ ╵ + +Single line with vertical lines crossing + ║ ┃ │ │ │ ┃ ║ +─╫─╂─┼─🮯─┼─╂─╫─ + ║ ┃ │ │ │ ┃ ║ + +Rounded. +╭─╮ +│ │ +╰─╯ + +Diagonals. 
+╲ ╱ ╲ ╱ ╳ ╳ ╲ ╱ ╲ ╱ ╲ ╱ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳ + ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳ + ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╱ ╲ ╱ ╲ ╱ ╲ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳ + ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳ + ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╲ ╱ ╲ ╱ ╲ ╱ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳ + ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ +╱ ╲ ╱ ╲ ╳ ╳ ╱ ╲ ╱ ╲ ╱ ╲ ╳ ╳ ╳ ╳ + +╲ ╱ ╲ ╱ ╱╲ ╱╲ ╱╲ ╱╲ ╱╲ ╲╱╲╱╲╱╲╱ + ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲╱ ╲╱ ╲╱ ╱╲╱╲╱╲╱╲ + ╲╱ ╲╱ ╲╱ ╲╱ ╲╱ ╱╲ ╱╲ ╱╲ ╲╱╲╱╲╱╲╱ + ╱╲ ╱╲ ╱╲ ╱╲ ╱╲ ╲╱ ╲╱ ╲╱ ╱╲╱╲╱╲╱╲ + ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱╲ ╱╲ ╱╲ ╲╱╲╱╲╱╲╱ +╱ ╲ ╱ ╲ ╲╱ ╲╱ ╲╱ ╲╱ ╲╱ ╱╲╱╲╱╲╱╲ + + +Block elements. + +█▏ +▏▏ +█▏ 🭽▔🭶🭷🭸🭹🭺🭻▁🮀🮁🮀▁🭻🭺🭹🭸🭷🭶▔🭾 +🮋▎ ▏ ▕ +🭰🭰 🭰 🭵 +🮋▎ 🭱 ▐ ▌ ▛▀#▀▜ 🭴 +🮊▍ 🭲 ▄▞▀ ▗▄▀▘ ▌▗▄▖▐ 🭳 +🭱🭱 🭳 ▌ ▐ #▐#▌# 🭲 +🮊▍ 🭴 ▀▚▄ ▝▀▄▖ ▌▝▀▘▐ 🭱 +🮉▌ 🭵 ▐ ▌ ▙▄#▄▟ 🭰 +🭲🭲 ▕ ▏ +🮉▌ ▕ ▁▂▃▄▅▆▇█ ▖# ▗# ▏ +▐▋ ▕ ▕ ▉ ▌# ▐# ▏ +🭳🭳 ▕ 🮇 ▊ ▐# ▌# ▏ +▐▋ ▕ 🮈 ░ ▋ ▝# ▘# ▏ +🮈▊ 🭵 ▐ ▒░ ▌ 🭰 +🭴🭴 🭴 🮉 ▓▒░ ▍ ▌# ▐# 🭱 +🮈▊ 🭳 🮊 █▓▒░ ▎ ▚# ▞# 🭲 +🮇▉ 🭲 🮋 ▏ ▐# ▌# 🭳 +🭵🭵 🭱 █🮆🮅🮄▀🮃🮂▔ 🭴 +🮇▉ 🭰 🭵 +▕█ ▏ ▕ +▕▕ 🭼▁🭻🭺🭹🭸🭷🭶▔🮀🮁🮀▔🭶🭷🭸🭹🭺🭻▁🭿 +▕█ + █▔█▇🭶▇▆🭷▆▅🭸▅▄🭹▄▃🭺▃▂🭻▂▁▁▁ + ▔▔▔🮂🭶🮂🮃🭷🮃▀🭸▀🮄🭹🮄🮅🭺🮅🮆🭻🮆█▁█ + +Shades. + ████████████████████████████████ + ░░░░░ ▒▒▒▒▒ ▓▓▓▓▓ ██▓▓▓▓▓█████▒▒▒▒▒█████░░░░░█████ + ░ ░░░░░ ▒ ▒▒▒▒▒ ▓ ▓▓▓▓▓ ██▓▓▓▓▓██▓██▒▒▒▒▒██▒██░░░░░██░██ + ░░░░░ ▒▒▒▒▒ ▓▓▓▓▓ ██▓▓▓▓▓█████▒▒▒▒▒█████░░░░░█████ + ████████████████████████████████ + +Hatchings and Checkerboards + +🮘🮘🮘🮘 🮙🮙🮙🮙 🮘🮙🮘🮙 🮕🮕🮕🮕 🮖🮖🮖🮖 🮕🮖🮕🮖 +🮘🮘🮘🮘 🮙🮙🮙🮙 🮙🮘🮙🮘 🮕🮕🮕🮕 🮖🮖🮖🮖 🮖🮕🮖🮕 +🮘🮘🮘🮘 🮙🮙🮙🮙 🮘🮙🮘🮙 🮕🮕🮕🮕 🮖🮖🮖🮖 🮕🮖🮕🮖 +🮘🮘🮘🮘 🮙🮙🮙🮙 🮙🮘🮙🮘 🮕🮕🮕🮕 🮖🮖🮖🮖 🮖🮕🮖🮕 + + +🬇🬋🬃 🬦🬹🬓 🬞🬭🬏 🬠🬰🬐 🬁🬂🬀 🬉🬎🬄 🬇🬋🬃 + +🬭🬞🬏 🬹🬦🬓 +█▐▌ █▐▌ +🬂🬁🬀 🬎🬉🬄 + + 🬭🬭🬭 +🬭🬭🬭 🬚🬋🬩 🬕🬂🬨 🬹🬹🬹 🬝🬎🬬 🬴🬰🬸 🬛🬋🬫 +▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ +🬂🬂🬂 🬌🬋🬍 🬲🬭🬷 🬎🬎🬎 🬺🬹🬻 🬴🬰🬸 🬛🬋🬫 + 🬂🬂🬂 + + 🬞🬭🬏 +🬞🬭🬏 🬦🬋🬓 ▐🬂▌ 🬦🬹🬓 ▐🬎▌ ▐🬰▌ ▐🬋▌ +▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ +🬁🬂🬀 🬉🬋🬄 ▐🬭▌ 🬉🬎🬄 ▐🬹▌ ▐🬰▌ ▐🬋▌ + 🬁🬂🬀 + + 🬞 🬏 +🬖🬏🬇🬗 🬈🬀🬁🬅 🬤🬃🬞🬢 +🬠 🬞🬢 🬔🬓🬦🬧 🬖🬏 🬐 +🬣🬄🬁🬅 🬁 🬀 🬈🬀🬉🬘 + + +🬥 🬙 🬆 🬊 🬒 🬡 🬑 🬟 +🬇 🬃 🬐 🬠 🬃 🬇 🬃 🬇 +🬳 🬶 🬱 🬵 🬮 🬯 🬟 🬑 + + + 🬞🬻🬺🬏 🬞🬜🬪🬏 🬞🬅🬈🬏 + 🬵🬝🬀🬁🬬🬱 🬵🬆 🬊🬱 🬖🬀 🬁🬢 +🬻🬆 🬊🬺 🬜🬀 🬁🬪 🬔 🬧 +🬬🬱 🬵🬝 🬪🬏 🬞🬜 🬣 🬘 + 🬊🬺🬏🬞🬻🬆 🬊🬱 🬵🬆 🬈🬏 🬞🬅 + 🬁🬬🬝🬀 🬁🬪🬜🬀 🬁🬢🬖🬀 + + +Slope 1/3. +🭈🭆🭂🭍🭑🬽 🭈🬭🭆🬹🭂█🭍🬹🭑🬭🬽 +🭣🭧🭓🭞🭜🭘 █#########█ + 🭣🬂🭧🬎🭓█🭞🬎🭜🬂🭘 + +Slope 2/3. 
+ 🭇🬼 🬞🬏 +🭇🭄🭏🬼 🭊🭁🭌🬿 🭇🬭🭄█🭏🬭🬼 🭊🬹🭁🭌🬹🬿 +🭢🭕🭠🭗 🭥🭒🭝🭚 ▐#####▌ █####█ + 🭢🭗 🭢🬂🭕█🭠🬂🭗 🭥🬎🭒🭝🬎🭚 + 🬁🬀 +Slope 1. +◢◣ 🮞🮟 +◥◤ 🮝🮜 + + +Slope 4/3. + 🭉🬹🬾 +🭉🬾 ▐#▌ +🭃🭎 🬞🭃#🭎🬏 +🭔🭟 🬁🭔#🭟🬀 +🭤🭙 ▐#▌ + 🭤🬎🭙 + +Slope 2. + 🭋█🭀 +🭋🭀 ▐#▌ +🭅🭐 🭅#🭐 +🭖🭡 🭖#🭡 +🭦🭛 ▐#▌ + 🭦█🭛 + + +Diagonal quarters. + 🭯 🭯 🭯 🭯 + 🭯 🭫 🭮🭫🭬 🮞🭫🮟 ◢🭫◣ 🭯🭯🭯🭯 + 🭫 🭯 🭯 🮞🮜 🮝🮟 ◢◤ ◥◣ 🭮🮛🮛🭬🮚🮚🮚🮚 +🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🮛🮛🭬🮚🮚🮚🮚 + 🭩 🭭 🭭 🮝🮟 🮞🮜 ◥◣ ◢◤ 🭮🮛🮛🭬🮚🮚🮚🮚 + 🭭 🭩 🭮🭩🭬 🮝🭩🮜 ◥🭩◤ 🭭🭭🭭🭭 + 🭭 🭭 🭭 🭭 + + 🭯 🭯 🭯 + ◢◣◢◣◢◣ ◢🭫🭩🭫🭩🭫◣ ◢🭩🭩🭩◣ 🭯🭯🭯 +◢◤◥◤◥◤◥◣ 🭮🭪 🭭 🭭 🭨🭬 ◢◤🭭🭭🭭◥◣ ◢🭫🭫🭫◣ +◥◣ ◢◤ 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭮🭪 🭨🭬 +◢◤ ◥◣ 🭮🭪 🭨🭬 🭨🭬 🭮🭪 🭮🭪 🭨🭬 +◥◣ ◢◤ 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭮🭪 🭨🭬 +◢◤ ◥◣ 🭮🭪 🭯 🭯 🭨🭬 ◥◣🭯🭯🭯◢◤ ◥🭩🭩🭩◤ +◥◣◢◣◢◣◢◤ ◥🭩🭫🭩🭫🭩◤ ◥🭫🭫🭫◤ 🭭🭭🭭 + ◥◤◥◤◥◤ 🭭 🭭 🭭 + + 🮞◣🮞◣🮞◣ ◢🮟◢🮟◢🮟 ╱🮟╱🮟╱🮟 🮞╲🮞╲🮞╲ +🮞🮜◥🮜◥🮜◥◣ ◢◤🮝◤🮝◤🮝🮟 ╱╱🮝╱🮝╱🮝🮟 🮞🮜╲🮜╲🮜╲╲ +◥◣ 🮞🮜 🮝🮟 ◢◤ 🮝🮟 ╱╱ ╲╲ 🮞🮜 +🮞🮜 ◥◣ ◢◤ 🮝🮟 ╱╱ 🮝🮟 🮞🮜 ╲╲ +◥◣ 🮞🮜 🮝🮟 ◢◤ 🮝🮟 ╱╱ ╲╲ 🮞🮜 +🮞🮜 ◥◣ ◢◤ 🮝🮟 ╱╱ 🮝🮟 🮞🮜 ╲╲ +◥◣🮞◣🮞◣🮞🮜 🮝🮟◢🮟◢🮟◢◤ 🮝🮟╱🮟╱🮟╱╱ ╲╲🮞╲🮞╲🮞🮜 + ◥🮜◥🮜◥🮜 🮝◤🮝◤🮝◤ 🮝╱🮝╱🮝╱ ╲🮜╲🮜╲🮜 + + ╱◣╱◣╱◣ ◢╲◢╲◢╲ ╱╲╱╲╱╲ 🮣🮧🮧🮧🮧🮢 +╱╱◥╱◥╱◥◣ ◢◤╲◤╲◤╲╲ ╱╱╲╱╲╱╲╲ 🮣🮨🮧🮧🮧🮧🮩🮢 +◥◣ ╱╱ ╲╲ ◢◤ ╲╲ ╱╱ 🮤🮤 🮥🮥 +╱╱ ◥◣ ◢◤ ╲╲ ╱╱ ╲╲ 🮤🮤 🮥🮥 +◥◣ ╱╱ ╲╲ ◢◤ ╲╲ ╱╱ 🮤🮤 🮥🮥 +╱╱ ◥◣ ◢◤ ╲╲ ╱╱ ╲╲ 🮤🮤 🮥🮥 +◥◣╱◣╱◣╱╱ ╲╲◢╲◢╲◢◤ ╲╲╱╲╱╲╱╱ 🮡🮩🮦🮦🮦🮦🮨🮠 + ◥╱◥╱◥╱ ╲◤╲◤╲◤ ╲╱╲╱╲╱ 🮡🮦🮦🮦🮦🮠 + + ╷ ╷ 🮣─🮢 🮣─🮦─🮢 +🮣─🮢 ┌🮧┐ ╶🮭─🮬╴ │ │ │ │ │ +│ │ 🮤 🮥 │ │ 🮣─🮨─🮩─🮢 🮥─🮮─🮤 +🮡─🮠 └🮦┘ ╶🮫─🮪╴ │ │ │ │ │ │ │ + ╵ ╵ 🮡─🮠 🮡─🮠 🮡─🮧─🮠 + + ▗🮒█🮒▖ + ▗▘ ▝▖ + 🮔 🮏 + █ 🮍▒🮌 + 🮔 🮎 + ▝▖ ▗▘ + ▝🮑█🮑▘ + + + ½ + 🬤🬤🬤🬤⅓█ █ 🬗🬗🬗🬗 + 🬗🬗🬗🬗█ █ █🬤🬤🬤🬤 + 🬤🬤🬤🬤 █ █ 🬗🬗🬗🬗 + ¼ 🬗🬗🬗🬗█ █ █🬤🬤🬤🬤 +▒▒▒▒🮖🮖🮖🮖▞▞▞▞½█ █ ▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 ▎ 🮇 ▎ 🮇 +▒▒▒▒🮖🮖🮖🮖▞▞▞▞█ █ █▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 🮂🮕🮗🮖🮂 🮂🮖🮗🮕🮂 +▒▒▒▒🮖🮖🮖🮖▞▞▞▞ █ █ ▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 ▂🮕🮗🮖▂ ▂🮖🮗🮕▂ +▒▒▒▒🮖🮖🮖🮖▞▞▞▞█ █ █▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 ▎ 🮇 ▎ 🮇 + 🬘🬘🬘🬘⅔█ █ 🬣🬣🬣🬣 + 🬧🬧🬧🬧█ █ █🬔🬔🬔🬔 + 🬣🬣🬣🬣 █ █ 🬘🬘🬘🬘 + 🬔🬔🬔🬔█ █ █🬧🬧🬧🬧 + + 🮣🮢 🮣🮢 🮣🮢🮣🮢 +🮣🮠🮡🮢🮣🮨🮩🮢 🮭🮬 🮡🮩🮨🮠 🮨🮨🮨🮩🮩🮩 🮭🮭🮭🮬🮬🮬 🮮🮮🮮🮮🮮🮮 +🮡🮢🮣🮠🮡🮩🮨🮠 🮫🮪 🮣🮨🮩🮢 🮨🮨🮨🮩🮩🮩 🮭🮭🮭🮬🮬🮬 🮮🮮🮮🮮🮮🮮 + 🮡🮠 🮡🮠 🮡🮠🮡🮠 🮨🮨🮨🮩🮩🮩 🮭🮭🮭🮬🮬🮬 🮮🮮🮮🮮🮮🮮 + 🮣🮧🮢 🮣🮧🮢 🮣🮦🮢 🮭🮦🮬 🮩🮩🮩🮨🮨🮨 🮫🮫🮫🮪🮪🮪 🮮🮮🮮🮮🮮🮮 + 🮤 🮥 🮤🮮🮥 🮥 🮤 🮥 🮤 🮩🮩🮩🮨🮨🮨 🮫🮫🮫🮪🮪🮪 🮮🮮🮮🮮🮮🮮 + 🮡🮦🮠 🮡🮦🮠 🮡🮧🮠 🮫🮧🮪 🮩🮩🮩🮨🮨🮨 🮫🮫🮫🮪🮪🮪 🮮🮮🮮🮮🮮🮮 + +◤◤◤◥◥◥ 🮜🮜🮜🮝🮝🮝 +◤◤◤◥◥◥ 🮜🮜🮜🮝🮝🮝 +◤◤◤◥◥◥ 🮜🮜🮜🮝🮝🮝 +◣◣◣◢◢◢ 🮟🮟🮟🮞🮞🮞 +◣◣◣◢◢◢ 🮟🮟🮟🮞🮞🮞 +◣◣◣◢◢◢ 🮟🮟🮟🮞🮞🮞 + +References: +VT-102: http://vt100.net/docs/vt102-ug/table5-13.html +Unicode: 
http://www.unicode.org/charts/PDF/U2500.pdf + http://www.unicode.org/charts/PDF/U2580.pdf + http://www.unicode.org/charts/PDF/U25A0.pdf + http://www.unicode.org/charts/PDF/U1FB00.pdf diff --git a/lib/std/unicode/glasses.txt b/lib/std/unicode/glasses.txt new file mode 100644 index 000000000000..1d84bb5558d8 --- /dev/null +++ b/lib/std/unicode/glasses.txt @@ -0,0 +1,193 @@ +I Can Eat Glass +In various languages + +# http://kermitproject.org does not support https:// +Adopted from http://kermitproject.org/utf8.html#glass +Submit additions to the URL above and update this file. + +Permission is granted by the Kermit project (http://kermitproject.org/) +to redistribute this file, with absolutely no warranty. + +Sanskrit: काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥ +Sanskrit (standard transcription): kācaṃ śaknomyattum; nopahinasti mām. +Classical Greek: ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει. +Greek (monotonic): Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα. +Greek (polytonic): Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα. +Etruscan: (NEEDED) +Latin: Vitrum edere possum; mihi non nocet. +Old French: Je puis mangier del voirre. Ne me nuit. +French: Je peux manger du verre, ça ne me fait pas mal. +Provençal / Occitan: Pòdi manjar de veire, me nafrariá pas. +Québécois: J'peux manger d'la vitre, ça m'fa pas mal. +Walloon: Dji pou magnî do vêre, çoula m' freut nén må. +Champenois: (NEEDED) +Lorrain: (NEEDED) +Picard: Ch'peux mingi du verre, cha m'foé mie n'ma. +Corsican/Corsu: (NEEDED) +Jèrriais: (NEEDED) +Kreyòl Ayisyen (Haitï): Mwen kap manje vè, li pa blese'm. +Basque: Kristala jan dezaket, ez dit minik ematen. +Catalan / Català: Puc menjar vidre, que no em fa mal. +Spanish: Puedo comer vidrio, no me hace daño. +Aragonés: Puedo minchar beire, no me'n fa mal . +Aranés: (NEEDED) +Mallorquín: (NEEDED) +Galician: Eu podo xantar cristais e non cortarme. +European Portuguese: Posso comer vidro, não me faz mal. 
+Brazilian Portuguese (8): Posso comer vidro, não me machuca. +Caboverdiano/Kabuverdianu (Cape Verde): M' podê cumê vidru, ca ta maguâ-m'. +Papiamentu: Ami por kome glas anto e no ta hasimi daño. +Italian: Posso mangiare il vetro e non mi fa male. +Milanese: Sôn bôn de magnà el véder, el me fa minga mal. +Roman: Me posso magna' er vetro, e nun me fa male. +Napoletano: M' pozz magna' o'vetr, e nun m' fa mal. +Venetian: Mi posso magnare el vetro, no'l me fa mae. +Zeneise (Genovese): Pòsso mangiâ o veddro e o no me fà mâ. +Sicilian: Puotsu mangiari u vitru, nun mi fa mali. +Campinadese (Sardinia): (NEEDED) +Lugudorese (Sardinia): (NEEDED) +Romansch (Grischun): Jau sai mangiar vaider, senza che quai fa donn a mai. +Romany / Tsigane: (NEEDED) +Romanian: Pot să mănânc sticlă și ea nu mă rănește. +Esperanto: Mi povas manĝi vitron, ĝi ne damaĝas min. +Pictish: (NEEDED) +Breton: (NEEDED) +Cornish: Mý a yl dybry gwéder hag éf ny wra ow ankenya. +Welsh: Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi. +Manx Gaelic: Foddym gee glonney agh cha jean eh gortaghey mee. +Old Irish (Ogham): ᚛᚛ᚉᚑᚅᚔᚉᚉᚔᚋ ᚔᚈᚔ ᚍᚂᚐᚅᚑ ᚅᚔᚋᚌᚓᚅᚐ᚜ +Old Irish (Latin): Con·iccim ithi nglano. Ním·géna. +Irish: Is féidir liom gloinne a ithe. Ní dhéanann sí dochar ar bith dom. +Ulster Gaelic: Ithim-sa gloine agus ní miste damh é. +Scottish Gaelic: S urrainn dhomh gloinne ithe; cha ghoirtich i mi. +Anglo-Saxon (Runes): ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬ +Anglo-Saxon (Latin): Ic mæg glæs eotan ond hit ne hearmiað me. +Middle English: Ich canne glas eten and hit hirtiþ me nouȝt. +English: I can eat glass and it doesn't hurt me. +English (IPA): [aɪ kæn iːt glɑːs ænd ɪt dɐz nɒt hɜːt miː] (Received Pronunciation) +English (Braille): ⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑ +Jamaican: Mi kian niam glas han i neba hot mi. +Lalland Scots / Doric: Ah can eat gless, it disnae hurt us. +Glaswegian: (NEEDED) +Gothic (4): 𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸. 
+Old Norse (Runes): ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ +Old Norse (Latin): Ek get etið gler án þess að verða sár. +Norsk / Norwegian (Nynorsk): Eg kan eta glas utan å skada meg. +Norsk / Norwegian (Bokmål): Jeg kan spise glass uten å skade meg. +Føroyskt / Faroese: Eg kann eta glas, skaðaleysur. +Íslenska / Icelandic: Ég get etið gler án þess að meiða mig. +Svenska / Swedish: Jag kan äta glas utan att skada mig. +Dansk / Danish: Jeg kan spise glas, det gør ikke ondt på mig. +Sønderjysk: Æ ka æe glass uhen at det go mæ naue. +Frysk / Frisian: Ik kin glês ite, it docht me net sear. +Nederlands / Dutch: Ik kan glas eten, het doet mij geen kwaad. +Kirchröadsj/Bôchesserplat: Iech ken glaas èèse, mer 't deet miech jing pieng. +Afrikaans: Ek kan glas eet, maar dit doen my nie skade nie. +Lëtzebuergescht / Luxemburgish: Ech kan Glas iessen, daat deet mir nët wei. +Deutsch / German: Ich kann Glas essen, ohne mir zu schaden. +Ruhrdeutsch: Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut. +Langenfelder Platt: Isch kann Jlaas kimmeln, uuhne datt mich datt weh dääd. +Lausitzer Mundart ("Lusatian"): Ich koann Gloos assn und doas dudd merr ni wii. +Odenwälderisch: Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud. +Sächsisch / Saxon: 'sch kann Glos essn, ohne dass'sch mer wehtue. +Pfälzisch: Isch konn Glass fresse ohne dasses mer ebbes ausmache dud. +Schwäbisch / Swabian: I kå Glas frässa, ond des macht mr nix! +Deutsch (Voralberg): I ka glas eassa, ohne dass mar weh tuat. +Bayrisch / Bavarian: I koh Glos esa, und es duard ma ned wei. +Allemannisch: I kaun Gloos essen, es tuat ma ned weh. +Schwyzerdütsch (Zürich): Ich chan Glaas ässe, das schadt mir nöd. +Schwyzerdütsch (Luzern): Ech cha Glâs ässe, das schadt mer ned. +Plautdietsch: (NEEDED) +Hungarian: Meg tudom enni az üveget, nem lesz tőle bajom. +Suomi / Finnish: Voin syödä lasia, se ei vahingoita minua. +Sami (Northern): Sáhtán borrat lása, dat ii leat bávččas. 
+Erzian: Мон ярсан суликадо, ды зыян эйстэнзэ а ули. +Northern Karelian: Mie voin syvvä lasie ta minla ei ole kipie. +Southern Karelian: Minä voin syvvä st'oklua dai minule ei ole kibie. +Vepsian: (NEEDED) +Votian: (NEEDED) +Livonian: (NEEDED) +Estonian: Ma võin klaasi süüa, see ei tee mulle midagi. +Latvian: Es varu ēst stiklu, tas man nekaitē. +Lithuanian: Aš galiu valgyti stiklą ir jis manęs nežeidžia +Old Prussian: (NEEDED) +Sorbian (Wendish): (NEEDED) +Czech: Mohu jíst sklo, neublíží mi. +Slovak: Môžem jesť sklo. Nezraní ma. +Polska / Polish: Mogę jeść szkło i mi nie szkodzi. +Slovenian: Lahko jem steklo, ne da bi mi škodovalo. +Bosnian, Croatian, Montenegrin and Serbian (Latin): Ja mogu jesti staklo, i to mi ne šteti. +Bosnian, Montenegrin and Serbian (Cyrillic): Ја могу јести стакло, и то ми не штети. +Macedonian: Можам да јадам стакло, а не ме штета. +Russian: Я могу есть стекло, оно мне не вредит. +Belarusian (Cyrillic): Я магу есці шкло, яно мне не шкодзіць. +Belarusian (Lacinka): Ja mahu jeści škło, jano mne ne škodzić. +Ukrainian: Я можу їсти скло, і воно мені не зашкодить. +Bulgarian: Мога да ям стъкло, то не ми вреди. +Georgian: მინას ვჭამ და არა მტკივა. +Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։ +Albanian: Unë mund të ha qelq dhe nuk më gjen gjë. +Turkish: Cam yiyebilirim, bana zararı dokunmaz. +Turkish (Ottoman): جام ييه بلورم بڭا ضررى طوقونمز +Tatar: Алам да бар, пыяла, әмма бу ранит мине. +Uzbek / O’zbekcha: (Roman): Men shisha yeyishim mumkin, ammo u menga zarar keltirmaydi. +Uzbek / Ўзбекча (Cyrillic): Мен шиша ейишим мумкин, аммо у менга зарар келтирмайди. +Bangla / Bengali: আমি কাঁচ খেতে পারি, তাতে আমার কোনো ক্ষতি হয় না। +Marathi: मी काच खाऊ शकतो, मला ते दुखत नाही. +Kannada: ನನಗೆ ಹಾನಿ ಆಗದೆ, ನಾನು ಗಜನ್ನು ತಿನಬಹುದು +Hindi: मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती. +Malayalam: എനിക്ക് ഗ്ലാസ് തിന്നാം. അതെന്നെ വേദനിപ്പിക്കില്ല. +Tamil: நான் கண்ணாடி சாப்பிடுவேன், அதனால் எனக்கு ஒரு கேடும் வராது. 
+Telugu: నేను గాజు తినగలను మరియు అలా చేసినా నాకు ఏమి ఇబ్బంది లేదు +Sinhalese: මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ. +Urdu(3): میں کانچ کھا سکتا ہوں اور مجھے تکلیف نہیں ہوتی ۔ +Pashto(3): زه شيشه خوړلې شم، هغه ما نه خوږوي +Farsi / Persian(3): .من می توانم بدونِ احساس درد شيشه بخورم +Arabic(3): أنا قادر على أكل الزجاج و هذا لا يؤلمني. +Aramaic: (NEEDED) +Maltese: Nista' niekol il-ħġieġ u ma jagħmilli xejn. +Hebrew(3): אני יכול לאכול זכוכית וזה לא מזיק לי. +Yiddish(3): איך קען עסן גלאָז און עס טוט מיר נישט װײ. +Judeo-Arabic: (NEEDED) +Ladino: (NEEDED) +Gǝʼǝz: (NEEDED) +Amharic: (NEEDED) +Twi: Metumi awe tumpan, ɜnyɜ me hwee. +Hausa (Latin): Inā iya taunar gilāshi kuma in gamā lāfiyā. +Hausa (Ajami) (2): إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا +Yoruba(4): Mo lè je̩ dígí, kò ní pa mí lára. +Lingala: Nakokí kolíya biténi bya milungi, ekosála ngáí mabé tɛ́. +(Ki)Swahili: Naweza kula bilauri na sikunyui. +Malay: Saya boleh makan kaca dan ia tidak mencederakan saya. +Tagalog: Kaya kong kumain nang bubog at hindi ako masaktan. +Chamorro: Siña yo' chumocho krestat, ti ha na'lalamen yo'. +Fijian: Au rawa ni kana iloilo, ia au sega ni vakacacani kina. +Javanese: Aku isa mangan beling tanpa lara. +Burmese (Unicode 4.0): က္ယ္ဝန္‌တော္‌၊က္ယ္ဝန္‌မ မ္ယက္‌စားနုိင္‌သည္‌။ ၎က္ရောင္‌့ ထိခုိက္‌မ္ဟု မရ္ဟိပာ။ (9) +Burmese (Unicode 5.0): ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။ (9) +Vietnamese (quốc ngữ): Tôi có thể ăn thủy tinh mà không hại gì. +Vietnamese (nôm) (4): 些 𣎏 世 咹 水 晶 𦓡 空 𣎏 害 咦 +Khmer: ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ +Lao: ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ. 
+Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ +Mongolian (Cyrillic): Би шил идэй чадна, надад хортой биш +Mongolian (Classic) (5): ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ +Dzongkha: (NEEDED) +Nepali: म काँच खान सक्छू र मलाई केहि नी हुन्‍न् । +Tibetan: ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད། +Chinese: 我能吞下玻璃而不伤身体。 +Chinese (Traditional): 我能吞下玻璃而不傷身體。 +Taiwanese(6): Góa ē-tàng chia̍h po-lê, mā bē tio̍h-siong. +Japanese: 私はガラスを食べられます。それは私を傷つけません。 +Korean: 나는 유리를 먹을 수 있어요. 그래도 아프지 않아요 +Bislama: Mi save kakae glas, hemi no save katem mi. +Hawaiian: Hiki iaʻu ke ʻai i ke aniani; ʻaʻole nō lā au e ʻeha. +Marquesan: E koʻana e kai i te karahi, mea ʻā, ʻaʻe hauhau. +Inuktitut (10): ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ +Chinook Jargon: Naika məkmək kakshət labutay, pi weyk ukuk munk-sik nay. +Navajo: Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da. +Cherokee (and Cree, Chickasaw, Cree, Micmac, Ojibwa, Lakota, Náhuatl, Quechua, Aymara, and other American languages): (NEEDED) +Garifuna: (NEEDED) +Gullah: (NEEDED) +Lojban: mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi +Nórdicg: Ljœr ye caudran créneþ ý jor cẃran. \ No newline at end of file diff --git a/lib/std/unicode/quickbrown.txt b/lib/std/unicode/quickbrown.txt new file mode 100644 index 000000000000..5db944343850 --- /dev/null +++ b/lib/std/unicode/quickbrown.txt @@ -0,0 +1,126 @@ +Sentences that contain all letters commonly used in a language +-------------------------------------------------------------- + +Markus Kuhn -- 2001-09-02 + +This file is UTF-8 encoded. + + +Danish (da) +--------- + + Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen + Wolther spillede på xylofon. + (= Quiz contestants were eating strawbery with cream while Wolther + the circus clown played on xylophone.) 
+ +German (de) +----------- + + Falsches Üben von Xylophonmusik quält jeden größeren Zwerg + (= Wrongful practicing of xylophone music tortures every larger dwarf) + + Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich + (= Twelve boxing fighters hunted Eva across the dike of Sylt) + + Heizölrückstoßabdämpfung + (= fuel oil recoil absorber) + (jqvwxy missing, but all non-ASCII letters in one word) + +English (en) +------------ + + The quick brown fox jumps over the lazy dog + +Spanish (es) +------------ + + El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y + frío, añoraba a su querido cachorro. + (Contains every letter and every accent, but not every combination + of vowel + acute.) + +French (fr) +----------- + + Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à + côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce + qui lui permet de penser à la cænogenèse de l'être dont il est question + dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui, + pense-t-il, diminue çà et là la qualité de son œuvre. + + l'île exiguë + Où l'obèse jury mûr + Fête l'haï volapük, + Âne ex aéquo au whist, + Ôtez ce vœu déçu. + + Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en + canoë au delà des îles, près du mälström où brûlent les novæ. + +Irish Gaelic (ga) +----------------- + + D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh + +Hungarian (hu) +-------------- + + Árvíztűrő tükörfúrógép + (= flood-proof mirror-drilling machine, only all non-ASCII letters) + +Icelandic (is) +-------------- + + Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa + + Sævör grét áðan því úlpan var ónýt + (some ASCII letters missing) + +Japanese (jp) +------------- + + Hiragana: (Iroha) + + いろはにほへとちりぬるを + わかよたれそつねならむ + うゐのおくやまけふこえて + あさきゆめみしゑひもせす + + Katakana: + + イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム + ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン + +Hebrew (iw) +----------- + + ? 
דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה + +Polish (pl) +----------- + + Pchnąć w tę łódź jeża lub ośm skrzyń fig + (= To push a hedgehog or eight bins of figs in this boat) + +Russian (ru) +------------ + + В чащах юга жил бы цитрус? Да, но фальшивый экземпляр! + (= Would a citrus live in the bushes of south? Yes, but only a fake one!) + +Thai (th) +--------- + + [--------------------------|------------------------] + ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน + จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร + ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย + ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ + + [The copyright for the Thai example is owned by The Computer + Association of Thailand under the Royal Patronage of His Majesty the + King.] + +Please let me know if you find others! Special thanks to the people +from all over the world who contributed these sentences.