Skip to content

Commit b16c69a

Browse files
cpmsmith authored and amaanq committed
feat: support HTML entities in JSX text/attributes
JSX text and attributes support HTML character references (a.k.a. entities) and do not support ECMAScript string escape sequences. Although the [spec] calls this behavior "historical" and threatens to change it, it _is_ in the spec, and the spec is pretty stable at this point.

In changing this, I landed back on an idea that @maxbrunsfeld suggested in a [PR review] some time ago: having separate `string` and `jsx_string` nodes, and aliasing `jsx_string` to `string` for consumers' convenience. At that time, having two different node types was deemed unnecessary, but this change adds a second, more substantive difference between the two, so I've brought the idea back. I have also stopped allowing unescaped newlines in JS string literals, which are invalid in both JS and TS.

[spec]: https://facebook.github.io/jsx/#sec-jsx-string-characters
[PR review]: #140 (comment)
1 parent c2c2260 commit b16c69a

File tree

2 files changed

+72
-26
lines changed

2 files changed

+72
-26
lines changed

grammar.js

Lines changed: 38 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -607,10 +607,15 @@ module.exports = grammar({
607607

608608
// Should not contain new lines and should not start or end with a space
609609
jsx_text: _ => choice(
610-
/[^{}<>\n ]([^{}<>\n]*[^{}<>\n ])?/,
610+
/[^{}<>\n& ]([^{}<>\n&]*[^{}<>\n& ])?/,
611611
/\/\/[^\n]*/,
612612
),
613613

614+
// An entity can be named, numeric (decimal), or numeric (hexadecimal). The
615+
// longest entity name is 31 characters long, and the HTML spec says that
616+
// no more will ever be added.
617+
html_character_reference: _ => /&(#([xX][0-9a-fA-F]{1,6}|[0-9]{1,5})|[A-Za-z]{1,30});/,
618+
614619
jsx_expression: $ => seq(
615620
'{',
616621
optional(choice(
@@ -623,6 +628,7 @@ module.exports = grammar({
623628

624629
_jsx_child: $ => choice(
625630
$.jsx_text,
631+
$.html_character_reference,
626632
$._jsx_element,
627633
$.jsx_expression,
628634
),
@@ -682,8 +688,36 @@ module.exports = grammar({
682688
)),
683689
),
684690

691+
_jsx_string: $ => choice(
692+
seq(
693+
'"',
694+
repeat(choice(
695+
alias($.unescaped_double_jsx_string_fragment, $.string_fragment),
696+
$.html_character_reference,
697+
)),
698+
'"',
699+
),
700+
seq(
701+
'\'',
702+
repeat(choice(
703+
alias($.unescaped_single_jsx_string_fragment, $.string_fragment),
704+
$.html_character_reference,
705+
)),
706+
'\'',
707+
),
708+
),
709+
710+
// Workaround to https://github.com/tree-sitter/tree-sitter/issues/1156
711+
// We give names to the token() constructs containing a regexp
712+
// so as to obtain a node in the CST.
713+
//
714+
unescaped_double_jsx_string_fragment: _ => token.immediate(prec(1, /[^"&]+/)),
715+
716+
// same here
717+
unescaped_single_jsx_string_fragment: _ => token.immediate(prec(1, /[^'&]+/)),
718+
685719
_jsx_attribute_value: $ => choice(
686-
$.string,
720+
alias($._jsx_string, $.string),
687721
$.jsx_expression,
688722
$._jsx_element,
689723
),
@@ -909,12 +943,6 @@ module.exports = grammar({
909943
// Primitives
910944
//
911945

912-
// Here we tolerate unescaped newlines in double-quoted and
913-
// single-quoted string literals.
914-
// This is legal in typescript as jsx/tsx attribute values (as of
915-
// 2020), and perhaps will be valid in javascript as well in the
916-
// future.
917-
//
918946
string: $ => choice(
919947
seq(
920948
'"',
@@ -938,10 +966,10 @@ module.exports = grammar({
938966
// We give names to the token() constructs containing a regexp
939967
// so as to obtain a node in the CST.
940968
//
941-
unescaped_double_string_fragment: _ => token.immediate(prec(1, /[^"\\]+/)),
969+
unescaped_double_string_fragment: _ => token.immediate(prec(1, /[^"\\\r\n]+/)),
942970

943971
// same here
944-
unescaped_single_string_fragment: _ => token.immediate(prec(1, /[^'\\]+/)),
972+
unescaped_single_string_fragment: _ => token.immediate(prec(1, /[^'\\\r\n]+/)),
945973

946974
escape_sequence: _ => token.immediate(seq(
947975
'\\',

test/corpus/literals.txt

Lines changed: 34 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -108,22 +108,6 @@ world';
108108
(expression_statement
109109
(string (string_fragment) (escape_sequence) (string_fragment))))
110110

111-
============================================================
112-
Non-standard unescaped newlines legal in TSX attributes
113-
============================================================
114-
115-
"hello
116-
world";
117-
118-
'hello
119-
world';
120-
121-
---
122-
123-
(program
124-
(expression_statement (string (string_fragment)))
125-
(expression_statement (string (string_fragment))))
126-
127111
=========================================================
128112
JSX strings with unescaped newlines for TSX attributes
129113
=========================================================
@@ -151,3 +135,37 @@ JSX strings with unescaped newlines for TSX attributes
151135
(jsx_attribute (property_identifier) (string (string_fragment))))
152136
(jsx_closing_element
153137
(identifier)))))
138+
139+
===============================================
140+
JSX with HTML character references (entities)
141+
===============================================
142+
143+
<a>foo &nbsp; bar</a>;
144+
145+
<abbr title="foo &nbsp; \n bar">foo</abbr>;
146+
147+
----
148+
149+
(program
150+
(expression_statement
151+
(jsx_element
152+
(jsx_opening_element
153+
(identifier))
154+
(jsx_text)
155+
(html_character_reference)
156+
(jsx_text)
157+
(jsx_closing_element
158+
(identifier))))
159+
(expression_statement
160+
(jsx_element
161+
(jsx_opening_element
162+
(identifier)
163+
(jsx_attribute
164+
(property_identifier)
165+
(string
166+
(string_fragment)
167+
(html_character_reference)
168+
(string_fragment))))
169+
(jsx_text)
170+
(jsx_closing_element
171+
(identifier)))))

0 commit comments

Comments
 (0)