From 74f22eb421d7c3ac241dbe462d441b31c1cad645 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Br=C3=A1ulio=20Bezerra?= <brauliobezerra@gmail.com>
Date: Sun, 24 Sep 2017 11:31:12 -0300
Subject: [PATCH 1/3] Add grammar for char and string literals

---
 src/tokens.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 2 deletions(-)
diff --git a/src/tokens.md b/src/tokens.md
index a66357ea5..b6ce25c54 100644
--- a/src/tokens.md
+++ b/src/tokens.md
@@ -21,13 +21,24 @@ evaluated (primarily) at compile time.
 
 |                                              | Example         | `#` sets   | Characters  | Escapes             |
 |----------------------------------------------|-----------------|------------|-------------|---------------------|
-| [Character](#character-literals)             | `'H'`           | `N/A`      | All Unicode | [Quote](#quote-escapes) & [Byte](#byte-escapes) & [Unicode](#unicode-escapes) |
-| [String](#string-literals)                   | `"hello"`       | `N/A`      | All Unicode | [Quote](#quote-escapes) & [Byte](#byte-escapes) & [Unicode](#unicode-escapes) |
+| [Character](#character-literals)             | `'H'`           | `N/A`      | All Unicode | [Quote](#quote-escapes) & [ASCII](#ascii-escapes) & [Unicode](#unicode-escapes) |
+| [String](#string-literals)                   | `"hello"`       | `N/A`      | All Unicode | [Quote](#quote-escapes) & [ASCII](#ascii-escapes) & [Unicode](#unicode-escapes) |
 | [Raw](#raw-string-literals)                  | `r#"hello"#`    | `0...`     | All Unicode | `N/A`                                                      |
 | [Byte](#byte-literals)                       | `b'H'`          | `N/A`      | All ASCII   | [Quote](#quote-escapes) & [Byte](#byte-escapes)                               |
 | [Byte string](#byte-string-literals)         | `b"hello"`      | `N/A`      | All ASCII   | [Quote](#quote-escapes) & [Byte](#byte-escapes)                               |
 | [Raw byte string](#raw-byte-string-literals) | `br#"hello"#`   | `0...`     | All ASCII   | `N/A`                                                      |
 
+#### ASCII escapes
+
+|   | Name |
+|---|------|
+| `\x41` | 7-bit character code (exactly 2 digits, up to 0x7F) |
+| `\n` | Newline |
+| `\r` | Carriage return |
+| `\t` | Tab |
+| `\\` | Backslash |
+| `\0` | Null |
+
 #### Byte escapes
 
 |   | Name |
@@ -74,12 +85,45 @@ evaluated (primarily) at compile time.
 
 #### Character literals
 
+> **<sup>Lexer</sup>**  
+> CHAR_LITERAL :  
+> &nbsp;&nbsp; `'` ( ~[`'` `\` \\n \\r \\t] | QUOTE_ESCAPE | ASCII_ESCAPE | UNICODE_ESCAPE ) `'`  
+>  
+> QUOTE_ESCAPE :  
+> &nbsp;&nbsp; `\'` | `\"`  
+>  
+> ASCII_ESCAPE :  
+> &nbsp;&nbsp; &nbsp;&nbsp; `\x` OCT_DIGIT HEX_DIGIT  
+> &nbsp;&nbsp; | `\n` | `\r` | `\t` | `\\` | `\0`  
+>  
+> UNICODE_ESCAPE :  
+> &nbsp;&nbsp; &nbsp;&nbsp; `\u{` HEX_DIGIT `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT`}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT`}`  
+
 A _character literal_ is a single Unicode character enclosed within two
 `U+0027` (single-quote) characters, with the exception of `U+0027` itself,
 which must be _escaped_ by a preceding `U+005C` character (`\`).
 
 #### String literals
 
+> **<sup>Lexer</sup>**  
+> STRING_LITERAL :  
+> &nbsp;&nbsp; `"` (  
+> &nbsp;&nbsp; &nbsp;&nbsp; ~[`"` `\` _IsolatedCR_]  
+> &nbsp;&nbsp; &nbsp;&nbsp; | QUOTE_ESCAPE  
+> &nbsp;&nbsp; &nbsp;&nbsp; | ASCII_ESCAPE  
+> &nbsp;&nbsp; &nbsp;&nbsp; | UNICODE_ESCAPE  
+> &nbsp;&nbsp; &nbsp;&nbsp; | STRING_CONTINUE  
+> &nbsp;&nbsp; )<sup>\*</sup> `"`  
+>  
+> STRING_CONTINUE :  
+> &nbsp;&nbsp; `\` _followed by_ \\n  
+
+
 A _string literal_ is a sequence of any Unicode characters enclosed within two
 `U+0022` (double-quote) characters, with the exception of `U+0022` itself,
 which must be _escaped_ by a preceding `U+005C` character (`\`).
@@ -120,6 +164,14 @@ following forms:
 
 #### Raw string literals
 
+> **<sup>Lexer</sup>**  
+> RAW_STRING_LITERAL :  
+> &nbsp;&nbsp; `r` RAW_STRING_CONTENT  
+>  
+> RAW_STRING_CONTENT :  
+> &nbsp;&nbsp; &nbsp;&nbsp; `"` ( ~ _IsolatedCR_ )<sup>* (non-greedy)</sup> `"`  
+> &nbsp;&nbsp; | `#` RAW_STRING_CONTENT `#`  
+
 Raw string literals do not process any escapes. They start with the character
 `U+0072` (`r`), followed by zero or more of the character `U+0023` (`#`) and a
 `U+0022` (double-quote) character. The _raw string body_ can contain any sequence
@@ -149,6 +201,17 @@ r##"foo #"# bar"##;                // foo #"# bar
 
 #### Byte literals
 
+> **<sup>Lexer</sup>**  
+> BYTE_LITERAL :  
+> &nbsp;&nbsp; `b'` ( ASCII_FOR_CHAR | QUOTE_ESCAPE | BYTE_ESCAPE ) `'`  
+>  
+> ASCII_FOR_CHAR :  
+> &nbsp;&nbsp; _any ASCII (i.e. 0x00 to 0x7F), except_ `'`, `\`, \\n, \\r or \\t  
+>  
+> BYTE_ESCAPE :  
+> &nbsp;&nbsp; &nbsp;&nbsp; `\x` HEX_DIGIT HEX_DIGIT  
+> &nbsp;&nbsp; | `\n` | `\r` | `\t` | `\\` | `\0`  
+
 A _byte literal_ is a single ASCII character (in the `U+0000` to `U+007F`
 range) or a single _escape_ preceded by the characters `U+0062` (`b`) and
 `U+0027` (single-quote), and followed by the character `U+0027`. If the character
@@ -158,6 +221,13 @@ _number literal_.
 
 #### Byte string literals
 
+> **<sup>Lexer</sup>**  
+> BYTE_STRING_LITERAL :  
+> &nbsp;&nbsp; `b"` ( ASCII_FOR_STRING | QUOTE_ESCAPE | BYTE_ESCAPE | STRING_CONTINUE )<sup>\*</sup> `"`  
+>  
+> ASCII_FOR_STRING :  
+> &nbsp;&nbsp; _any ASCII (i.e. 0x00 to 0x7F), except_ `"`, `\` _and IsolatedCR_
+
 A non-raw _byte string literal_ is a sequence of ASCII characters and _escapes_,
 preceded by the characters `U+0062` (`b`) and `U+0022` (double-quote), and
 followed by the character `U+0022`. If the character `U+0022` is present within
@@ -183,6 +253,18 @@ following forms:
 
 #### Raw byte string literals
 
+> **<sup>Lexer</sup>**  
+> RAW_BYTE_STRING_LITERAL :  
+> &nbsp;&nbsp; `br` RAW_BYTE_STRING_CONTENT  
+>  
+> RAW_BYTE_STRING_CONTENT :  
+> &nbsp;&nbsp; &nbsp;&nbsp; `"` ASCII<sup>* (non-greedy)</sup> `"`  
+> &nbsp;&nbsp; | `#` RAW_BYTE_STRING_CONTENT `#`  
+>  
+> ASCII :  
+> &nbsp;&nbsp; _any ASCII (i.e. 0x00 to 0x7F)_  
+
+
 Raw byte string literals do not process any escapes. They start with the
 character `U+0062` (`b`), followed by `U+0072` (`r`), followed by zero or more
 of the character `U+0023` (`#`), and a `U+0022` (double-quote) character. The

From 2f8877df63edcc1436db7b60eade410b53aa0db5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Br=C3=A1ulio=20Bezerra?= <brauliobezerra@gmail.com>
Date: Tue, 26 Sep 2017 08:23:48 -0300
Subject: [PATCH 2/3] Added _ (underscore) separators to Unicode escapes

---
 src/tokens.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/tokens.md b/src/tokens.md
index b6ce25c54..5d0e21ca3 100644
--- a/src/tokens.md
+++ b/src/tokens.md
@@ -97,12 +97,12 @@ evaluated (primarily) at compile time.
 > &nbsp;&nbsp; | `\n` | `\r` | `\t` | `\\` | `\0`  
 >  
 > UNICODE_ESCAPE :  
-> &nbsp;&nbsp; &nbsp;&nbsp; `\u{` HEX_DIGIT `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT`}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT`}`  
+> &nbsp;&nbsp; &nbsp;&nbsp; `\u{` HEX_DIGIT `_`<sup>\*</sup> `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
+> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
 
 A _character literal_ is a single Unicode character enclosed within two
 `U+0027` (single-quote) characters, with the exception of `U+0027` itself,

From dbc4ab8677f39778fd28abff3289f39bdba757e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Br=C3=A1ulio=20Bezerra?= <brauliobezerra@gmail.com>
Date: Tue, 26 Sep 2017 17:34:20 -0300
Subject: [PATCH 3/3] Use a repetition notation for unicode escapes to make it
 smaller.

---
 src/tokens.md | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/tokens.md b/src/tokens.md
index 5d0e21ca3..2ad2a9dd8 100644
--- a/src/tokens.md
+++ b/src/tokens.md
@@ -97,12 +97,7 @@ evaluated (primarily) at compile time.
 > &nbsp;&nbsp; | `\n` | `\r` | `\t` | `\\` | `\0`  
 >  
 > UNICODE_ESCAPE :  
-> &nbsp;&nbsp; &nbsp;&nbsp; `\u{` HEX_DIGIT `_`<sup>\*</sup> `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
-> &nbsp;&nbsp; | `\u{` HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> HEX_DIGIT `_`<sup>\*</sup> `}`  
+> &nbsp;&nbsp; `\u{` ( HEX_DIGIT `_`<sup>\*</sup> )<sup>1..6</sup> `}`  
 
 A _character literal_ is a single Unicode character enclosed within two
 `U+0027` (single-quote) characters, with the exception of `U+0027` itself,
@@ -123,7 +118,6 @@ which must be _escaped_ by a preceding `U+005C` character (`\`).
 > STRING_CONTINUE :  
 > &nbsp;&nbsp; `\` _followed by_ \\n  
 
-
 A _string literal_ is a sequence of any Unicode characters enclosed within two
 `U+0022` (double-quote) characters, with the exception of `U+0022` itself,
 which must be _escaped_ by a preceding `U+005C` character (`\`).
@@ -264,7 +258,6 @@ following forms:
 > ASCII :  
 > &nbsp;&nbsp; _any ASCII (i.e. 0x00 to 0x7F)_  
 
-
 Raw byte string literals do not process any escapes. They start with the
 character `U+0062` (`b`), followed by `U+0072` (`r`), followed by zero or more
 of the character `U+0023` (`#`), and a `U+0022` (double-quote) character. The