diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 18bd5ab0e2af..c0ccc1cff49c 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -3,7 +3,7 @@
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
-const std = @import("./std.zig");
+const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const testing = std.testing;
@@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
- return switch (@clz(u8, ~first_byte)) {
- 0 => 1,
- 2 => 2,
- 3 => 3,
- 4 => 4,
+ // The switch is optimized much better than a "smart" approach using @clz
+ return switch (first_byte) {
+ 0b0000_0000...0b0111_1111 => 1,
+ 0b1100_0000...0b1101_1111 => 2,
+ 0b1110_0000...0b1110_1111 => 3,
+ 0b1111_0000...0b1111_0111 => 4,
else => error.Utf8InvalidStartByte,
};
}
@@ -106,6 +107,7 @@ const Utf8Decode3Error = error{
Utf8OverlongEncoding,
Utf8EncodesSurrogateHalf,
};
+
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
assert(bytes.len == 3);
assert(bytes[0] & 0b11110000 == 0b11100000);
@@ -153,6 +155,50 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
return value;
}
+/// Tells whether `value` has a UTF-8 encoding: it must not lie in the
+/// surrogate range (0xD800...0xDFFF) and must not exceed 0x10FFFF, the
+/// maximum codepoint value.
+pub fn isValidCodepoint(value: u21) bool {
+    const is_surrogate = value >= 0xD800 and value <= 0xDFFF;
+    const is_above_maximum = value > 0x10FFFF;
+    return !(is_surrogate or is_above_maximum);
+}
+
+/// Returns the number of unicode codepoints encoded in the UTF-8 slice `s`.
+/// Multi-byte sequences are decoded along the way, so the call fails when a
+/// start byte is invalid, when a sequence is cut off by the end of `s`
+/// (error.TruncatedInput), or when `utf8Decode` rejects a sequence.
+pub fn utf8CountCodepoints(s: []const u8) !usize {
+    const word_size = @sizeOf(usize);
+    // A word in which the top bit of every byte is set (0x80 repeated).
+    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);
+
+    var count: usize = 0;
+    var pos: usize = 0;
+    while (pos < s.len) {
+        // Fast path: consume whole words for as long as no byte in them has
+        // its top bit set, i.e. while the word is pure ASCII.
+        while (pos + word_size <= s.len) {
+            const word = mem.readIntNative(usize, s[pos..][0..word_size]);
+            if (word & high_bits != 0) break;
+            count += word_size;
+            pos += word_size;
+        }
+        if (pos >= s.len) break;
+
+        // Slow path: handle the single codepoint that starts at `pos`.
+        const seq_len = try utf8ByteSequenceLength(s[pos]);
+        const end = pos + seq_len;
+        if (end > s.len) return error.TruncatedInput;
+        // A one-byte sequence is ASCII and needs no validation.
+        if (seq_len != 1) _ = try utf8Decode(s[pos..end]);
+
+        pos = end;
+        count += 1;
+    }
+
+    return count;
+}
+
pub fn utf8ValidateSlice(s: []const u8) bool {
var i: usize = 0;
while (i < s.len) {
@@ -757,3 +803,31 @@ test "utf8ToUtf16LeStringLiteral" {
testing.expect(utf16[2] == 0);
}
}
+
+// Checks utf8CountCodepoints against strings made of 1-byte, 2-byte and
+// 3-byte characters respectively.
+fn testUtf8CountCodepoints() !void {
+    const samples = [_][]const u8{ "abcdefghij", "äåéëþüúíóö", "こんにちは" };
+    for ([_]usize{ 10, 10, 5 }) |expected, i| {
+        testing.expectEqual(expected, try utf8CountCodepoints(samples[i]));
+    }
+    // NOTE(review): the check below is disabled; the reason is not recorded here.
+    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
+}
+
+test "utf8 count codepoints" {
+    // Run the checks once at runtime...
+    try testUtf8CountCodepoints();
+    // ...and once during compilation, where an error is not allowed to occur.
+    comptime {
+        testUtf8CountCodepoints() catch unreachable;
+    }
+}
+
+// Checks isValidCodepoint on accepted values and on the edges of both
+// rejected ranges.
+fn testisValidCodepoint() !void {
+    // Accepted: ordinary characters, the first value after the surrogate
+    // range and the maximum codepoint.
+    const valid = [_]u21{ 'e', 'ë', 'は', 0xe000, 0x10ffff };
+    for (valid) |codepoint| testing.expect(isValidCodepoint(codepoint));
+
+    // Rejected: both ends of the surrogate range and the first value above
+    // the maximum codepoint.
+    const invalid = [_]u21{ 0xd800, 0xdfff, 0x110000 };
+    for (invalid) |codepoint| testing.expect(!isValidCodepoint(codepoint));
+}
+
+test "utf8 valid codepoint" {
+    // Run the checks once at runtime...
+    try testisValidCodepoint();
+    // ...and once during compilation, where an error is not allowed to occur.
+    comptime {
+        testisValidCodepoint() catch unreachable;
+    }
+}
diff --git a/lib/std/unicode/UTF-8-demo.txt b/lib/std/unicode/UTF-8-demo.txt
new file mode 100644
index 000000000000..6017f34d099b
--- /dev/null
+++ b/lib/std/unicode/UTF-8-demo.txt
@@ -0,0 +1,212 @@
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫
+ ⎪⎢⎜│a²+b³ ⎟⎥⎪
+ ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪
+ ⎪⎢⎜⎷ c₈ ⎟⎥⎪
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬
+ ⎪⎢⎜ ∞ ⎟⎥⎪
+ ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪
+ ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+ STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+ ▝▀▘▙▄▟
diff --git a/lib/std/unicode/benchmark.zig b/lib/std/unicode/benchmark.zig
new file mode 100644
index 000000000000..1d1205b99b3a
--- /dev/null
+++ b/lib/std/unicode/benchmark.zig
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2015-2020 Zig Contributors
+// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
+// The MIT license requires this copyright notice to be included in all copies
+// and substantial portions of the software.
+const std = @import("std");
+const builtin = std.builtin;
+const time = std.time;
+const unicode = std.unicode;
+
+const Timer = time.Timer;
+
+// Number of times each benchmark input is run through the measured function.
+const N = 1_000_000;
+
+// Binary size units, expressed in bytes.
+const KiB = 1024;
+const MiB = 1024 * KiB;
+const GiB = 1024 * MiB;
+
+/// Outcome of one benchmark run.
+const ResultCount = struct {
+ /// Codepoint count returned by the last benchmarked call.
+ count: usize,
+ /// Bytes processed per second.
+ throughput: u64,
+};
+
+// Sample texts embedded at compile time and used as benchmark inputs.
+const boxes = @embedFile("boxes.txt");
+const glasses = @embedFile("glasses.txt");
+const quickbrown = @embedFile("quickbrown.txt");
+const utf8demo = @embedFile("UTF-8-demo.txt");
+
+/// Runs `unicode.utf8CountCodepoints` over `buf` N times and returns the
+/// count produced by the last run together with the measured throughput in
+/// bytes per second. Fails if the timer cannot be started or if `buf` is
+/// rejected by `utf8CountCodepoints`.
+fn benchmarkCodepointCount(buf: []const u8) !ResultCount {
+    const bytes = N * buf.len;
+
+    var i: usize = 0;
+    var r: usize = undefined;
+
+    // Start the timer immediately before the loop so that `read()` yields
+    // the loop duration directly. Subtracting a `lap()` value from a later
+    // `read()` would be wrong, because `lap()` also resets the timer.
+    var timer = try Timer.start();
+    while (i < N) : (i += 1) {
+        // The call is marked never_inline so it is not inlined into the loop.
+        r = try @call(
+            .{ .modifier = .never_inline },
+            unicode.utf8CountCodepoints,
+            .{buf},
+        );
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @intToFloat(f64, elapsed_ns) / time.ns_per_s;
+    const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s);
+
+    return ResultCount{ .count = r, .throughput = throughput };
+}
+
+/// Benchmarks every string passed on the command line, then a fixed set of
+/// short literals and the embedded sample files. For each input a heading
+/// line and a result line are written to standard output.
+pub fn main() !void {
+    const stdout = std.io.getStdOut().writer();
+    const allocator = std.heap.page_allocator;
+
+    const args = try std.process.argsAlloc(allocator);
+    defer std.process.argsFree(allocator, args);
+
+    // The first argument is skipped (conventionally the program name); the
+    // length check keeps the slice below in bounds when there are no others.
+    if (args.len > 1) {
+        for (args[1..]) |s| {
+            try stdout.print("Benchmark '{}' string\n", .{s});
+            try printCount(stdout, s);
+        }
+    }
+
+    try stdout.print("Benchmark short ASCII strings\n", .{});
+    try printCount(stdout, "abc");
+
+    try stdout.print("Benchmark short Unicode strings\n", .{});
+    try printCount(stdout, "ŌŌŌ");
+
+    try stdout.print("Benchmark pure ASCII strings\n", .{});
+    try printCount(stdout, "hello" ** 16);
+
+    try stdout.print("Benchmark pure Unicode strings\n", .{});
+    try printCount(stdout, "こんにちは" ** 16);
+
+    try stdout.print("Benchmark mixed ASCII/Unicode strings\n", .{});
+    try printCount(stdout, "Hyvää huomenta" ** 16);
+
+    try stdout.print("Benchmark 'boxes.txt'\n", .{});
+    try printCount(stdout, boxes);
+
+    try stdout.print("Benchmark 'glasses.txt'\n", .{});
+    try printCount(stdout, glasses);
+
+    try stdout.print("Benchmark 'quickbrown.txt'\n", .{});
+    try printCount(stdout, quickbrown);
+
+    try stdout.print("Benchmark 'UTF-8-demo.txt'\n", .{});
+    try printCount(stdout, utf8demo);
+}
+
+/// Benchmarks `buf` and writes its result line to `writer`: the throughput
+/// in MiB/s followed by the codepoint count in brackets.
+fn printCount(writer: anytype, buf: []const u8) !void {
+    const result = try benchmarkCodepointCount(buf);
+    try writer.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
+}
diff --git a/lib/std/unicode/boxes.txt b/lib/std/unicode/boxes.txt
new file mode 100644
index 000000000000..f7c03dd69589
--- /dev/null
+++ b/lib/std/unicode/boxes.txt
@@ -0,0 +1,341 @@
+Single width, hollow.
+┌─┐ )0lqk
+│ │ )0x x
+└─┘ )0mqj
+┌─┐
+│ │
+└─┘
+
+Single width, single fill.
+┌┬┐ )0lwk
+├┼┤ )0tnu
+└┴┘ )0mvj
+┌┬┐
+├┼┤
+└┴┘
+
+Double width, hollow.
+┏━┓ )0
+┃ ┃ )0
+┗━┛ )0
+╔═╗
+║ ║
+╚═╝
+
+Double width, double fill.
+┏┳┓ )0
+┣╋┫ )0
+┗┻┛ )0
+╔╦╗
+╠╬╣
+╚╩╝
+
+Double width, single fill.
+┏┯┓ )0
+┠┼┨ )0 n
+┗┷┛ )0
+╔╤╗
+╟┼╢
+╚╧╝
+
+Single width, double fill.
+┌┰┐ )0l k
+┝╋┥ )0
+└┸┘ )0m j
+┌╥┐
+╞╬╡
+└╨┘
+
+Single width, mixed fill (double horizontal, single vertical).
+┌┬┐ )0lwk
+┝┿┥ )0
+└┴┘ )0mvj
+┌┬┐
+╞╪╡
+└┴┘
+
+Double width, mixed fill (double vertical, single horizontal).
+┏┳┓ )0
+┠╂┨ )0
+┗┻┛ )0
+╔╦╗
+╟╫╢
+╚╩╝
+
+Double horizontal, single vertical.
+┍┑
+┕┙
+╒╕
+╘╛
+
+Double vertical, single horizontal.
+┎┒
+┖┚
+╓╖
+╙╜
+
+Single width, double, triple and quadruple dash.
+┌╌╌┐ ┌┄┄┐ ┌┈┈┐
+╎ ╎ ┆ ┆ ┊ ┊
+╎ ╎ ┆ ┆ ┊ ┊
+└╌╌┘ └┄┄┘ └┈┈┘
+
+Double width, double, triple and quadruple dash.
+┏╍╍┓ ┏┅┅┓ ┏┉┉┓
+╏ ╏ ┇ ┇ ┋ ┋
+╏ ╏ ┇ ┇ ┋ ┋
+┗╍╍┛ ┗┅┅┛ ┗┉┉┛
+
+One single, two double lines meet.
+┢┪ ┲┱
+┡┩ ┺┹
+
+One double, two single lines meet.
+┞┦ ┭┮
+┟┧ ┵┶
+
+One single, three double lines meet.
+╇ ╉╊
+╈
+
+One double, three single lines meet.
+╁ ┾┽
+╀
+
+Two double, two single lines meet.
+╆╅
+╄╃
+
+Mixed width, starting, ending and changing width mid-character.
+╷ ╻ ╶╼╸
+╽ ╿ ╺╾╴
+╹ ╵
+
+Single line with vertical lines crossing
+ ║ ┃ │ │ │ ┃ ║
+─╫─╂─┼─🮯─┼─╂─╫─
+ ║ ┃ │ │ │ ┃ ║
+
+Rounded.
+╭─╮
+│ │
+╰─╯
+
+Diagonals.
+╲ ╱ ╲ ╱ ╳ ╳ ╲ ╱ ╲ ╱ ╲ ╱ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳
+ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳
+ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╱ ╲ ╱ ╲ ╱ ╲ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳
+ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳
+ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╲ ╱ ╲ ╱ ╲ ╱ ╳ ╳ ╳ ╳ ╳╳╳╳╳╳╳
+ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╳ ╳ ╳ ╳ ╳ ╳ ╳ ╳
+╱ ╲ ╱ ╲ ╳ ╳ ╱ ╲ ╱ ╲ ╱ ╲ ╳ ╳ ╳ ╳
+
+╲ ╱ ╲ ╱ ╱╲ ╱╲ ╱╲ ╱╲ ╱╲ ╲╱╲╱╲╱╲╱
+ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲╱ ╲╱ ╲╱ ╱╲╱╲╱╲╱╲
+ ╲╱ ╲╱ ╲╱ ╲╱ ╲╱ ╱╲ ╱╲ ╱╲ ╲╱╲╱╲╱╲╱
+ ╱╲ ╱╲ ╱╲ ╱╲ ╱╲ ╲╱ ╲╱ ╲╱ ╱╲╱╲╱╲╱╲
+ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱╲ ╱╲ ╱╲ ╲╱╲╱╲╱╲╱
+╱ ╲ ╱ ╲ ╲╱ ╲╱ ╲╱ ╲╱ ╲╱ ╱╲╱╲╱╲╱╲
+
+
+Block elements.
+
+█▏
+▏▏
+█▏ 🭽▔🭶🭷🭸🭹🭺🭻▁🮀🮁🮀▁🭻🭺🭹🭸🭷🭶▔🭾
+🮋▎ ▏ ▕
+🭰🭰 🭰 🭵
+🮋▎ 🭱 ▐ ▌ ▛▀#▀▜ 🭴
+🮊▍ 🭲 ▄▞▀ ▗▄▀▘ ▌▗▄▖▐ 🭳
+🭱🭱 🭳 ▌ ▐ #▐#▌# 🭲
+🮊▍ 🭴 ▀▚▄ ▝▀▄▖ ▌▝▀▘▐ 🭱
+🮉▌ 🭵 ▐ ▌ ▙▄#▄▟ 🭰
+🭲🭲 ▕ ▏
+🮉▌ ▕ ▁▂▃▄▅▆▇█ ▖# ▗# ▏
+▐▋ ▕ ▕ ▉ ▌# ▐# ▏
+🭳🭳 ▕ 🮇 ▊ ▐# ▌# ▏
+▐▋ ▕ 🮈 ░ ▋ ▝# ▘# ▏
+🮈▊ 🭵 ▐ ▒░ ▌ 🭰
+🭴🭴 🭴 🮉 ▓▒░ ▍ ▌# ▐# 🭱
+🮈▊ 🭳 🮊 █▓▒░ ▎ ▚# ▞# 🭲
+🮇▉ 🭲 🮋 ▏ ▐# ▌# 🭳
+🭵🭵 🭱 █🮆🮅🮄▀🮃🮂▔ 🭴
+🮇▉ 🭰 🭵
+▕█ ▏ ▕
+▕▕ 🭼▁🭻🭺🭹🭸🭷🭶▔🮀🮁🮀▔🭶🭷🭸🭹🭺🭻▁🭿
+▕█
+ █▔█▇🭶▇▆🭷▆▅🭸▅▄🭹▄▃🭺▃▂🭻▂▁▁▁
+ ▔▔▔🮂🭶🮂🮃🭷🮃▀🭸▀🮄🭹🮄🮅🭺🮅🮆🭻🮆█▁█
+
+Shades.
+ ████████████████████████████████
+ ░░░░░ ▒▒▒▒▒ ▓▓▓▓▓ ██▓▓▓▓▓█████▒▒▒▒▒█████░░░░░█████
+ ░ ░░░░░ ▒ ▒▒▒▒▒ ▓ ▓▓▓▓▓ ██▓▓▓▓▓██▓██▒▒▒▒▒██▒██░░░░░██░██
+ ░░░░░ ▒▒▒▒▒ ▓▓▓▓▓ ██▓▓▓▓▓█████▒▒▒▒▒█████░░░░░█████
+ ████████████████████████████████
+
+Hatchings and Checkerboards
+
+🮘🮘🮘🮘 🮙🮙🮙🮙 🮘🮙🮘🮙 🮕🮕🮕🮕 🮖🮖🮖🮖 🮕🮖🮕🮖
+🮘🮘🮘🮘 🮙🮙🮙🮙 🮙🮘🮙🮘 🮕🮕🮕🮕 🮖🮖🮖🮖 🮖🮕🮖🮕
+🮘🮘🮘🮘 🮙🮙🮙🮙 🮘🮙🮘🮙 🮕🮕🮕🮕 🮖🮖🮖🮖 🮕🮖🮕🮖
+🮘🮘🮘🮘 🮙🮙🮙🮙 🮙🮘🮙🮘 🮕🮕🮕🮕 🮖🮖🮖🮖 🮖🮕🮖🮕
+
+
+🬇🬋🬃 🬦🬹🬓 🬞🬭🬏 🬠🬰🬐 🬁🬂🬀 🬉🬎🬄 🬇🬋🬃
+
+🬭🬞🬏 🬹🬦🬓
+█▐▌ █▐▌
+🬂🬁🬀 🬎🬉🬄
+
+ 🬭🬭🬭
+🬭🬭🬭 🬚🬋🬩 🬕🬂🬨 🬹🬹🬹 🬝🬎🬬 🬴🬰🬸 🬛🬋🬫
+▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐
+🬂🬂🬂 🬌🬋🬍 🬲🬭🬷 🬎🬎🬎 🬺🬹🬻 🬴🬰🬸 🬛🬋🬫
+ 🬂🬂🬂
+
+ 🬞🬭🬏
+🬞🬭🬏 🬦🬋🬓 ▐🬂▌ 🬦🬹🬓 ▐🬎▌ ▐🬰▌ ▐🬋▌
+▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌ ▐ ▌
+🬁🬂🬀 🬉🬋🬄 ▐🬭▌ 🬉🬎🬄 ▐🬹▌ ▐🬰▌ ▐🬋▌
+ 🬁🬂🬀
+
+ 🬞 🬏
+🬖🬏🬇🬗 🬈🬀🬁🬅 🬤🬃🬞🬢
+🬠 🬞🬢 🬔🬓🬦🬧 🬖🬏 🬐
+🬣🬄🬁🬅 🬁 🬀 🬈🬀🬉🬘
+
+
+🬥 🬙 🬆 🬊 🬒 🬡 🬑 🬟
+🬇 🬃 🬐 🬠 🬃 🬇 🬃 🬇
+🬳 🬶 🬱 🬵 🬮 🬯 🬟 🬑
+
+
+ 🬞🬻🬺🬏 🬞🬜🬪🬏 🬞🬅🬈🬏
+ 🬵🬝🬀🬁🬬🬱 🬵🬆 🬊🬱 🬖🬀 🬁🬢
+🬻🬆 🬊🬺 🬜🬀 🬁🬪 🬔 🬧
+🬬🬱 🬵🬝 🬪🬏 🬞🬜 🬣 🬘
+ 🬊🬺🬏🬞🬻🬆 🬊🬱 🬵🬆 🬈🬏 🬞🬅
+ 🬁🬬🬝🬀 🬁🬪🬜🬀 🬁🬢🬖🬀
+
+
+Slope 1/3.
+🭈🭆🭂🭍🭑🬽 🭈🬭🭆🬹🭂█🭍🬹🭑🬭🬽
+🭣🭧🭓🭞🭜🭘 █#########█
+ 🭣🬂🭧🬎🭓█🭞🬎🭜🬂🭘
+
+Slope 2/3.
+ 🭇🬼 🬞🬏
+🭇🭄🭏🬼 🭊🭁🭌🬿 🭇🬭🭄█🭏🬭🬼 🭊🬹🭁🭌🬹🬿
+🭢🭕🭠🭗 🭥🭒🭝🭚 ▐#####▌ █####█
+ 🭢🭗 🭢🬂🭕█🭠🬂🭗 🭥🬎🭒🭝🬎🭚
+ 🬁🬀
+Slope 1.
+◢◣ 🮞🮟
+◥◤ 🮝🮜
+
+
+Slope 4/3.
+ 🭉🬹🬾
+🭉🬾 ▐#▌
+🭃🭎 🬞🭃#🭎🬏
+🭔🭟 🬁🭔#🭟🬀
+🭤🭙 ▐#▌
+ 🭤🬎🭙
+
+Slope 2.
+ 🭋█🭀
+🭋🭀 ▐#▌
+🭅🭐 🭅#🭐
+🭖🭡 🭖#🭡
+🭦🭛 ▐#▌
+ 🭦█🭛
+
+
+Diagonal quarters.
+ 🭯 🭯 🭯 🭯
+ 🭯 🭫 🭮🭫🭬 🮞🭫🮟 ◢🭫◣ 🭯🭯🭯🭯
+ 🭫 🭯 🭯 🮞🮜 🮝🮟 ◢◤ ◥◣ 🭮🮛🮛🭬🮚🮚🮚🮚
+🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭨🭬 🭮🮛🮛🭬🮚🮚🮚🮚
+ 🭩 🭭 🭭 🮝🮟 🮞🮜 ◥◣ ◢◤ 🭮🮛🮛🭬🮚🮚🮚🮚
+ 🭭 🭩 🭮🭩🭬 🮝🭩🮜 ◥🭩◤ 🭭🭭🭭🭭
+ 🭭 🭭 🭭 🭭
+
+ 🭯 🭯 🭯
+ ◢◣◢◣◢◣ ◢🭫🭩🭫🭩🭫◣ ◢🭩🭩🭩◣ 🭯🭯🭯
+◢◤◥◤◥◤◥◣ 🭮🭪 🭭 🭭 🭨🭬 ◢◤🭭🭭🭭◥◣ ◢🭫🭫🭫◣
+◥◣ ◢◤ 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭮🭪 🭨🭬
+◢◤ ◥◣ 🭮🭪 🭨🭬 🭨🭬 🭮🭪 🭮🭪 🭨🭬
+◥◣ ◢◤ 🭨🭬 🭮🭪 🭨🭬 🭮🭪 🭮🭪 🭨🭬
+◢◤ ◥◣ 🭮🭪 🭯 🭯 🭨🭬 ◥◣🭯🭯🭯◢◤ ◥🭩🭩🭩◤
+◥◣◢◣◢◣◢◤ ◥🭩🭫🭩🭫🭩◤ ◥🭫🭫🭫◤ 🭭🭭🭭
+ ◥◤◥◤◥◤ 🭭 🭭 🭭
+
+ 🮞◣🮞◣🮞◣ ◢🮟◢🮟◢🮟 ╱🮟╱🮟╱🮟 🮞╲🮞╲🮞╲
+🮞🮜◥🮜◥🮜◥◣ ◢◤🮝◤🮝◤🮝🮟 ╱╱🮝╱🮝╱🮝🮟 🮞🮜╲🮜╲🮜╲╲
+◥◣ 🮞🮜 🮝🮟 ◢◤ 🮝🮟 ╱╱ ╲╲ 🮞🮜
+🮞🮜 ◥◣ ◢◤ 🮝🮟 ╱╱ 🮝🮟 🮞🮜 ╲╲
+◥◣ 🮞🮜 🮝🮟 ◢◤ 🮝🮟 ╱╱ ╲╲ 🮞🮜
+🮞🮜 ◥◣ ◢◤ 🮝🮟 ╱╱ 🮝🮟 🮞🮜 ╲╲
+◥◣🮞◣🮞◣🮞🮜 🮝🮟◢🮟◢🮟◢◤ 🮝🮟╱🮟╱🮟╱╱ ╲╲🮞╲🮞╲🮞🮜
+ ◥🮜◥🮜◥🮜 🮝◤🮝◤🮝◤ 🮝╱🮝╱🮝╱ ╲🮜╲🮜╲🮜
+
+ ╱◣╱◣╱◣ ◢╲◢╲◢╲ ╱╲╱╲╱╲ 🮣🮧🮧🮧🮧🮢
+╱╱◥╱◥╱◥◣ ◢◤╲◤╲◤╲╲ ╱╱╲╱╲╱╲╲ 🮣🮨🮧🮧🮧🮧🮩🮢
+◥◣ ╱╱ ╲╲ ◢◤ ╲╲ ╱╱ 🮤🮤 🮥🮥
+╱╱ ◥◣ ◢◤ ╲╲ ╱╱ ╲╲ 🮤🮤 🮥🮥
+◥◣ ╱╱ ╲╲ ◢◤ ╲╲ ╱╱ 🮤🮤 🮥🮥
+╱╱ ◥◣ ◢◤ ╲╲ ╱╱ ╲╲ 🮤🮤 🮥🮥
+◥◣╱◣╱◣╱╱ ╲╲◢╲◢╲◢◤ ╲╲╱╲╱╲╱╱ 🮡🮩🮦🮦🮦🮦🮨🮠
+ ◥╱◥╱◥╱ ╲◤╲◤╲◤ ╲╱╲╱╲╱ 🮡🮦🮦🮦🮦🮠
+
+ ╷ ╷ 🮣─🮢 🮣─🮦─🮢
+🮣─🮢 ┌🮧┐ ╶🮭─🮬╴ │ │ │ │ │
+│ │ 🮤 🮥 │ │ 🮣─🮨─🮩─🮢 🮥─🮮─🮤
+🮡─🮠 └🮦┘ ╶🮫─🮪╴ │ │ │ │ │ │ │
+ ╵ ╵ 🮡─🮠 🮡─🮠 🮡─🮧─🮠
+
+ ▗🮒█🮒▖
+ ▗▘ ▝▖
+ 🮔 🮏
+ █ 🮍▒🮌
+ 🮔 🮎
+ ▝▖ ▗▘
+ ▝🮑█🮑▘
+
+
+ ½
+ 🬤🬤🬤🬤⅓█ █ 🬗🬗🬗🬗
+ 🬗🬗🬗🬗█ █ █🬤🬤🬤🬤
+ 🬤🬤🬤🬤 █ █ 🬗🬗🬗🬗
+ ¼ 🬗🬗🬗🬗█ █ █🬤🬤🬤🬤
+▒▒▒▒🮖🮖🮖🮖▞▞▞▞½█ █ ▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 ▎ 🮇 ▎ 🮇
+▒▒▒▒🮖🮖🮖🮖▞▞▞▞█ █ █▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 🮂🮕🮗🮖🮂 🮂🮖🮗🮕🮂
+▒▒▒▒🮖🮖🮖🮖▞▞▞▞ █ █ ▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 ▂🮕🮗🮖▂ ▂🮖🮗🮕▂
+▒▒▒▒🮖🮖🮖🮖▞▞▞▞█ █ █▚▚▚▚🮕🮕🮕🮕🮐🮐🮐🮐 ▎ 🮇 ▎ 🮇
+ 🬘🬘🬘🬘⅔█ █ 🬣🬣🬣🬣
+ 🬧🬧🬧🬧█ █ █🬔🬔🬔🬔
+ 🬣🬣🬣🬣 █ █ 🬘🬘🬘🬘
+ 🬔🬔🬔🬔█ █ █🬧🬧🬧🬧
+
+ 🮣🮢 🮣🮢 🮣🮢🮣🮢
+🮣🮠🮡🮢🮣🮨🮩🮢 🮭🮬 🮡🮩🮨🮠 🮨🮨🮨🮩🮩🮩 🮭🮭🮭🮬🮬🮬 🮮🮮🮮🮮🮮🮮
+🮡🮢🮣🮠🮡🮩🮨🮠 🮫🮪 🮣🮨🮩🮢 🮨🮨🮨🮩🮩🮩 🮭🮭🮭🮬🮬🮬 🮮🮮🮮🮮🮮🮮
+ 🮡🮠 🮡🮠 🮡🮠🮡🮠 🮨🮨🮨🮩🮩🮩 🮭🮭🮭🮬🮬🮬 🮮🮮🮮🮮🮮🮮
+ 🮣🮧🮢 🮣🮧🮢 🮣🮦🮢 🮭🮦🮬 🮩🮩🮩🮨🮨🮨 🮫🮫🮫🮪🮪🮪 🮮🮮🮮🮮🮮🮮
+ 🮤 🮥 🮤🮮🮥 🮥 🮤 🮥 🮤 🮩🮩🮩🮨🮨🮨 🮫🮫🮫🮪🮪🮪 🮮🮮🮮🮮🮮🮮
+ 🮡🮦🮠 🮡🮦🮠 🮡🮧🮠 🮫🮧🮪 🮩🮩🮩🮨🮨🮨 🮫🮫🮫🮪🮪🮪 🮮🮮🮮🮮🮮🮮
+
+◤◤◤◥◥◥ 🮜🮜🮜🮝🮝🮝
+◤◤◤◥◥◥ 🮜🮜🮜🮝🮝🮝
+◤◤◤◥◥◥ 🮜🮜🮜🮝🮝🮝
+◣◣◣◢◢◢ 🮟🮟🮟🮞🮞🮞
+◣◣◣◢◢◢ 🮟🮟🮟🮞🮞🮞
+◣◣◣◢◢◢ 🮟🮟🮟🮞🮞🮞
+
+References:
+VT-102: http://vt100.net/docs/vt102-ug/table5-13.html
+Unicode: http://www.unicode.org/charts/PDF/U2500.pdf
+ http://www.unicode.org/charts/PDF/U2580.pdf
+ http://www.unicode.org/charts/PDF/U25A0.pdf
+ http://www.unicode.org/charts/PDF/U1FB00.pdf
diff --git a/lib/std/unicode/glasses.txt b/lib/std/unicode/glasses.txt
new file mode 100644
index 000000000000..1d84bb5558d8
--- /dev/null
+++ b/lib/std/unicode/glasses.txt
@@ -0,0 +1,193 @@
+I Can Eat Glass
+In various languages
+
+# http://kermitproject.org does not support https://
+Adopted from http://kermitproject.org/utf8.html#glass
+Submit additions to the URL above and update this file.
+
+Permission is granted by the Kermit project (http://kermitproject.org/)
+to redistribute this file, with absolutely no warranty.
+
+Sanskrit: काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥
+Sanskrit (standard transcription): kācaṃ śaknomyattum; nopahinasti mām.
+Classical Greek: ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει.
+Greek (monotonic): Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα.
+Greek (polytonic): Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.
+Etruscan: (NEEDED)
+Latin: Vitrum edere possum; mihi non nocet.
+Old French: Je puis mangier del voirre. Ne me nuit.
+French: Je peux manger du verre, ça ne me fait pas mal.
+Provençal / Occitan: Pòdi manjar de veire, me nafrariá pas.
+Québécois: J'peux manger d'la vitre, ça m'fa pas mal.
+Walloon: Dji pou magnî do vêre, çoula m' freut nén må.
+Champenois: (NEEDED)
+Lorrain: (NEEDED)
+Picard: Ch'peux mingi du verre, cha m'foé mie n'ma.
+Corsican/Corsu: (NEEDED)
+Jèrriais: (NEEDED)
+Kreyòl Ayisyen (Haitï): Mwen kap manje vè, li pa blese'm.
+Basque: Kristala jan dezaket, ez dit minik ematen.
+Catalan / Català: Puc menjar vidre, que no em fa mal.
+Spanish: Puedo comer vidrio, no me hace daño.
+Aragonés: Puedo minchar beire, no me'n fa mal .
+Aranés: (NEEDED)
+Mallorquín: (NEEDED)
+Galician: Eu podo xantar cristais e non cortarme.
+European Portuguese: Posso comer vidro, não me faz mal.
+Brazilian Portuguese (8): Posso comer vidro, não me machuca.
+Caboverdiano/Kabuverdianu (Cape Verde): M' podê cumê vidru, ca ta maguâ-m'.
+Papiamentu: Ami por kome glas anto e no ta hasimi daño.
+Italian: Posso mangiare il vetro e non mi fa male.
+Milanese: Sôn bôn de magnà el véder, el me fa minga mal.
+Roman: Me posso magna' er vetro, e nun me fa male.
+Napoletano: M' pozz magna' o'vetr, e nun m' fa mal.
+Venetian: Mi posso magnare el vetro, no'l me fa mae.
+Zeneise (Genovese): Pòsso mangiâ o veddro e o no me fà mâ.
+Sicilian: Puotsu mangiari u vitru, nun mi fa mali.
+Campinadese (Sardinia): (NEEDED)
+Lugudorese (Sardinia): (NEEDED)
+Romansch (Grischun): Jau sai mangiar vaider, senza che quai fa donn a mai.
+Romany / Tsigane: (NEEDED)
+Romanian: Pot să mănânc sticlă și ea nu mă rănește.
+Esperanto: Mi povas manĝi vitron, ĝi ne damaĝas min.
+Pictish: (NEEDED)
+Breton: (NEEDED)
+Cornish: Mý a yl dybry gwéder hag éf ny wra ow ankenya.
+Welsh: Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi.
+Manx Gaelic: Foddym gee glonney agh cha jean eh gortaghey mee.
+Old Irish (Ogham): ᚛᚛ᚉᚑᚅᚔᚉᚉᚔᚋ ᚔᚈᚔ ᚍᚂᚐᚅᚑ ᚅᚔᚋᚌᚓᚅᚐ᚜
+Old Irish (Latin): Con·iccim ithi nglano. Ním·géna.
+Irish: Is féidir liom gloinne a ithe. Ní dhéanann sí dochar ar bith dom.
+Ulster Gaelic: Ithim-sa gloine agus ní miste damh é.
+Scottish Gaelic: S urrainn dhomh gloinne ithe; cha ghoirtich i mi.
+Anglo-Saxon (Runes): ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
+Anglo-Saxon (Latin): Ic mæg glæs eotan ond hit ne hearmiað me.
+Middle English: Ich canne glas eten and hit hirtiþ me nouȝt.
+English: I can eat glass and it doesn't hurt me.
+English (IPA): [aɪ kæn iːt glɑːs ænd ɪt dɐz nɒt hɜːt miː] (Received Pronunciation)
+English (Braille): ⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑
+Jamaican: Mi kian niam glas han i neba hot mi.
+Lalland Scots / Doric: Ah can eat gless, it disnae hurt us.
+Glaswegian: (NEEDED)
+Gothic (4): 𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸.
+Old Norse (Runes): ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ
+Old Norse (Latin): Ek get etið gler án þess að verða sár.
+Norsk / Norwegian (Nynorsk): Eg kan eta glas utan å skada meg.
+Norsk / Norwegian (Bokmål): Jeg kan spise glass uten å skade meg.
+Føroyskt / Faroese: Eg kann eta glas, skaðaleysur.
+Íslenska / Icelandic: Ég get etið gler án þess að meiða mig.
+Svenska / Swedish: Jag kan äta glas utan att skada mig.
+Dansk / Danish: Jeg kan spise glas, det gør ikke ondt på mig.
+Sønderjysk: Æ ka æe glass uhen at det go mæ naue.
+Frysk / Frisian: Ik kin glês ite, it docht me net sear.
+Nederlands / Dutch: Ik kan glas eten, het doet mij geen kwaad.
+Kirchröadsj/Bôchesserplat: Iech ken glaas èèse, mer 't deet miech jing pieng.
+Afrikaans: Ek kan glas eet, maar dit doen my nie skade nie.
+Lëtzebuergescht / Luxemburgish: Ech kan Glas iessen, daat deet mir nët wei.
+Deutsch / German: Ich kann Glas essen, ohne mir zu schaden.
+Ruhrdeutsch: Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut.
+Langenfelder Platt: Isch kann Jlaas kimmeln, uuhne datt mich datt weh dääd.
+Lausitzer Mundart ("Lusatian"): Ich koann Gloos assn und doas dudd merr ni wii.
+Odenwälderisch: Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud.
+Sächsisch / Saxon: 'sch kann Glos essn, ohne dass'sch mer wehtue.
+Pfälzisch: Isch konn Glass fresse ohne dasses mer ebbes ausmache dud.
+Schwäbisch / Swabian: I kå Glas frässa, ond des macht mr nix!
+Deutsch (Voralberg): I ka glas eassa, ohne dass mar weh tuat.
+Bayrisch / Bavarian: I koh Glos esa, und es duard ma ned wei.
+Allemannisch: I kaun Gloos essen, es tuat ma ned weh.
+Schwyzerdütsch (Zürich): Ich chan Glaas ässe, das schadt mir nöd.
+Schwyzerdütsch (Luzern): Ech cha Glâs ässe, das schadt mer ned.
+Plautdietsch: (NEEDED)
+Hungarian: Meg tudom enni az üveget, nem lesz tőle bajom.
+Suomi / Finnish: Voin syödä lasia, se ei vahingoita minua.
+Sami (Northern): Sáhtán borrat lása, dat ii leat bávččas.
+Erzian: Мон ярсан суликадо, ды зыян эйстэнзэ а ули.
+Northern Karelian: Mie voin syvvä lasie ta minla ei ole kipie.
+Southern Karelian: Minä voin syvvä st'oklua dai minule ei ole kibie.
+Vepsian: (NEEDED)
+Votian: (NEEDED)
+Livonian: (NEEDED)
+Estonian: Ma võin klaasi süüa, see ei tee mulle midagi.
+Latvian: Es varu ēst stiklu, tas man nekaitē.
+Lithuanian: Aš galiu valgyti stiklą ir jis manęs nežeidžia
+Old Prussian: (NEEDED)
+Sorbian (Wendish): (NEEDED)
+Czech: Mohu jíst sklo, neublíží mi.
+Slovak: Môžem jesť sklo. Nezraní ma.
+Polska / Polish: Mogę jeść szkło i mi nie szkodzi.
+Slovenian: Lahko jem steklo, ne da bi mi škodovalo.
+Bosnian, Croatian, Montenegrin and Serbian (Latin): Ja mogu jesti staklo, i to mi ne šteti.
+Bosnian, Montenegrin and Serbian (Cyrillic): Ја могу јести стакло, и то ми не штети.
+Macedonian: Можам да јадам стакло, а не ме штета.
+Russian: Я могу есть стекло, оно мне не вредит.
+Belarusian (Cyrillic): Я магу есці шкло, яно мне не шкодзіць.
+Belarusian (Lacinka): Ja mahu jeści škło, jano mne ne škodzić.
+Ukrainian: Я можу їсти скло, і воно мені не зашкодить.
+Bulgarian: Мога да ям стъкло, то не ми вреди.
+Georgian: მინას ვჭამ და არა მტკივა.
+Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։
+Albanian: Unë mund të ha qelq dhe nuk më gjen gjë.
+Turkish: Cam yiyebilirim, bana zararı dokunmaz.
+Turkish (Ottoman): جام ييه بلورم بڭا ضررى طوقونمز
+Tatar: Алам да бар, пыяла, әмма бу ранит мине.
+Uzbek / O’zbekcha: (Roman): Men shisha yeyishim mumkin, ammo u menga zarar keltirmaydi.
+Uzbek / Ўзбекча (Cyrillic): Мен шиша ейишим мумкин, аммо у менга зарар келтирмайди.
+Bangla / Bengali: আমি কাঁচ খেতে পারি, তাতে আমার কোনো ক্ষতি হয় না।
+Marathi: मी काच खाऊ शकतो, मला ते दुखत नाही.
+Kannada: ನನಗೆ ಹಾನಿ ಆಗದೆ, ನಾನು ಗಜನ್ನು ತಿನಬಹುದು
+Hindi: मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती.
+Malayalam: എനിക്ക് ഗ്ലാസ് തിന്നാം. അതെന്നെ വേദനിപ്പിക്കില്ല.
+Tamil: நான் கண்ணாடி சாப்பிடுவேன், அதனால் எனக்கு ஒரு கேடும் வராது.
+Telugu: నేను గాజు తినగలను మరియు అలా చేసినా నాకు ఏమి ఇబ్బంది లేదు
+Sinhalese: මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ.
+Urdu(3): میں کانچ کھا سکتا ہوں اور مجھے تکلیف نہیں ہوتی ۔
+Pashto(3): زه شيشه خوړلې شم، هغه ما نه خوږوي
+Farsi / Persian(3): .من می توانم بدونِ احساس درد شيشه بخورم
+Arabic(3): أنا قادر على أكل الزجاج و هذا لا يؤلمني.
+Aramaic: (NEEDED)
+Maltese: Nista' niekol il-ħġieġ u ma jagħmilli xejn.
+Hebrew(3): אני יכול לאכול זכוכית וזה לא מזיק לי.
+Yiddish(3): איך קען עסן גלאָז און עס טוט מיר נישט װײ.
+Judeo-Arabic: (NEEDED)
+Ladino: (NEEDED)
+Gǝʼǝz: (NEEDED)
+Amharic: (NEEDED)
+Twi: Metumi awe tumpan, ɜnyɜ me hwee.
+Hausa (Latin): Inā iya taunar gilāshi kuma in gamā lāfiyā.
+Hausa (Ajami) (2): إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا
+Yoruba(4): Mo lè je̩ dígí, kò ní pa mí lára.
+Lingala: Nakokí kolíya biténi bya milungi, ekosála ngáí mabé tɛ́.
+(Ki)Swahili: Naweza kula bilauri na sikunyui.
+Malay: Saya boleh makan kaca dan ia tidak mencederakan saya.
+Tagalog: Kaya kong kumain nang bubog at hindi ako masaktan.
+Chamorro: Siña yo' chumocho krestat, ti ha na'lalamen yo'.
+Fijian: Au rawa ni kana iloilo, ia au sega ni vakacacani kina.
+Javanese: Aku isa mangan beling tanpa lara.
+Burmese (Unicode 4.0): က္ယ္ဝန္တော္၊က္ယ္ဝန္မ မ္ယက္စားနုိင္သည္။ ၎က္ရောင့္ ထိခုိက္မ္ဟု မရ္ဟိပာ။ (9)
+Burmese (Unicode 5.0): ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။ (9)
+Vietnamese (quốc ngữ): Tôi có thể ăn thủy tinh mà không hại gì.
+Vietnamese (nôm) (4): 些 𣎏 世 咹 水 晶 𦓡 空 𣎏 害 咦
+Khmer: ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ
+Lao: ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ.
+Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ
+Mongolian (Cyrillic): Би шил идэй чадна, надад хортой биш
+Mongolian (Classic) (5): ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ
+Dzongkha: (NEEDED)
+Nepali: म काँच खान सक्छू र मलाई केहि नी हुन्न् ।
+Tibetan: ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།
+Chinese: 我能吞下玻璃而不伤身体。
+Chinese (Traditional): 我能吞下玻璃而不傷身體。
+Taiwanese(6): Góa ē-tàng chia̍h po-lê, mā bē tio̍h-siong.
+Japanese: 私はガラスを食べられます。それは私を傷つけません。
+Korean: 나는 유리를 먹을 수 있어요. 그래도 아프지 않아요
+Bislama: Mi save kakae glas, hemi no save katem mi.
+Hawaiian: Hiki iaʻu ke ʻai i ke aniani; ʻaʻole nō lā au e ʻeha.
+Marquesan: E koʻana e kai i te karahi, mea ʻā, ʻaʻe hauhau.
+Inuktitut (10): ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ
+Chinook Jargon: Naika məkmək kakshət labutay, pi weyk ukuk munk-sik nay.
+Navajo: Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da.
+Cherokee (and Cree, Chickasaw, Cree, Micmac, Ojibwa, Lakota, Náhuatl, Quechua, Aymara, and other American languages): (NEEDED)
+Garifuna: (NEEDED)
+Gullah: (NEEDED)
+Lojban: mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi
+Nórdicg: Ljœr ye caudran créneþ ý jor cẃran.
\ No newline at end of file
diff --git a/lib/std/unicode/quickbrown.txt b/lib/std/unicode/quickbrown.txt
new file mode 100644
index 000000000000..5db944343850
--- /dev/null
+++ b/lib/std/unicode/quickbrown.txt
@@ -0,0 +1,126 @@
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+ Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+ Wolther spillede på xylofon.
+ (= Quiz contestants were eating strawberries with cream while Wolther
+ the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+ Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+ (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+ Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+ (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+ Heizölrückstoßabdämpfung
+ (= fuel oil recoil absorber)
+ (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+ The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+ El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
+ frío, añoraba a su querido cachorro.
+ (Contains every letter and every accent, but not every combination
+ of vowel + acute.)
+
+French (fr)
+-----------
+
+ Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+ côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+ qui lui permet de penser à la cænogenèse de l'être dont il est question
+ dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+ pense-t-il, diminue çà et là la qualité de son œuvre.
+
+ l'île exiguë
+ Où l'obèse jury mûr
+ Fête l'haï volapük,
+ Âne ex aéquo au whist,
+ Ôtez ce vœu déçu.
+
+ Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+ canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+ D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+ Árvíztűrő tükörfúrógép
+ (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+ Sævör grét áðan því úlpan var ónýt
+ (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+ Hiragana: (Iroha)
+
+ いろはにほへとちりぬるを
+ わかよたれそつねならむ
+ うゐのおくやまけふこえて
+ あさきゆめみしゑひもせす
+
+ Katakana:
+
+ イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+ ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+ ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+ Pchnąć w tę łódź jeża lub ośm skrzyń fig
+ (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+ В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+ (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+ [--------------------------|------------------------]
+ ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน
+ จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+ ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย
+ ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+ [The copyright for the Thai example is owned by The Computer
+ Association of Thailand under the Royal Patronage of His Majesty the
+ King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.