From ea62729f059059d04be1012c1a15659990a51227 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Sun, 28 Oct 2018 17:09:15 -0500 Subject: [PATCH] cpp: Fully support C++11 raw strings. See https://en.cppreference.com/w/cpp/language/string_literal for the syntax. This requires a fix in highlight.js itself. mode.terminators joins each node's begin regexps with |. This breaks if one of the begin regexps has backreferences. Backreferences count capturing parenthesized groups, and adding new groups in front will change that count. Thus far, the only language that uses backreferences is Rust (also for raw strings), which happens to be the first in the list and avoids this bug. C++ cannot as easily avoid this because, even were raw strings the first option in STRINGS, STRINGS itself is included in other lists. Rather than carefully order things, rewrite the regular expressions to fix the backreferences. --- src/highlight.js | 45 +++++++++++++++++++++- src/languages/cpp.js | 8 +--- test/markup/cpp/string-literals.expect.txt | 34 ++++++++++++++-- test/markup/cpp/string-literals.txt | 34 ++++++++++++++-- 4 files changed, 104 insertions(+), 17 deletions(-) diff --git a/src/highlight.js b/src/highlight.js index ad0bb19815..8815c80c80 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -237,6 +237,47 @@ https://highlightjs.org/ ); } + // joinRe logically computes regexps.join(separator), but fixes the + // backreferences so they continue to match. + function joinRe(regexps, separator) { + // backreferenceRe matches an open parenthesis or backreference. To avoid + // an incorrect parse, it additionally matches the following: + // - [...] elements, where the meaning of parentheses and escapes change + // - other escape sequences, so we do not misparse escape sequences as + // interesting elements + // - non-matching or lookahead parentheses, which do not capture. These + // follow the '(' with a '?'. 
+ var backreferenceRe = /\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./; + var numCaptures = 0; + var ret = ''; + for (var i = 0; i < regexps.length; i++) { + var offset = numCaptures; + var re = reStr(regexps[i]); + if (i > 0) { + ret += separator; + } + while (re.length > 0) { + var match = backreferenceRe.exec(re); + if (match == null) { + ret += re; + break; + } + ret += re.substring(0, match.index); + re = re.substring(match.index + match[0].length); + if (match[0][0] == '\\' && match[1]) { + // Adjust the backreference. + ret += '\\' + String(Number(match[1]) + offset); + } else { + ret += match[0]; + if (match[0] == '(') { + numCaptures++; + } + } + } + } + return ret; + } + function compileMode(mode, parent) { if (mode.compiled) return; @@ -302,12 +343,12 @@ https://highlightjs.org/ var terminators = mode.contains.map(function(c) { - return c.beginKeywords ? '\\.?(' + c.begin + ')\\.?' : c.begin; + return c.beginKeywords ? '\\.?(?:' + c.begin + ')\\.?' : c.begin; }) .concat([mode.terminator_end, mode.illegal]) .map(reStr) .filter(Boolean); - mode.terminators = terminators.length ? langRe(terminators.join('|'), true) : {exec: function(/*s*/) {return null;}}; + mode.terminators = terminators.length ? langRe(joinRe(terminators, '|'), true) : {exec: function(/*s*/) {return null;}}; } compileMode(language); diff --git a/src/languages/cpp.js b/src/languages/cpp.js index f91e3f4a61..7817d6da09 100644 --- a/src/languages/cpp.js +++ b/src/languages/cpp.js @@ -19,13 +19,7 @@ function(hljs) { illegal: '\\n', contains: [hljs.BACKSLASH_ESCAPE] }, - { - // TODO: This does not handle raw string literals with prefixes. Using - // a single regex with backreferences would work (note to use *? - // instead of * to make it non-greedy), but the mode.terminators - // computation in highlight.js breaks the counting. - begin: '(u8?|U|L)?R"\\(', end: '\\)"', - }, + { begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\((?:.|\n)*?\)\1"/ }, { begin: '\'\\\\?.', end: '\'', illegal: '.' 
diff --git a/test/markup/cpp/string-literals.expect.txt b/test/markup/cpp/string-literals.expect.txt index 1210b01e5e..4c5baca14f 100644 --- a/test/markup/cpp/string-literals.expect.txt +++ b/test/markup/cpp/string-literals.expect.txt @@ -10,21 +10,47 @@ // Raw string literals (multiline) auto char_multi = R"(Hello "normal" -muliline +multiline string.)"; auto utf8_multi = u8R"(Hello "utf-8" -muliline +multiline string)"; auto utf16_multi = uR"(Hello "utf-16" -muliline +multiline string)"; auto utf32_multi = UR"(Hello "utf-32" -muliline +multiline string)"; +// Raw string literals with delimiter (multiline) +auto char_multi = R"blah1(Hello +"normal" +multiline +)" +)blah" +string.)blah1"; +auto utf8_multi = u8R"blah2(Hello +"utf-8" +multiline +)" +)blah" +string)blah2"; +auto utf16_multi = uR"blah3(Hello +"utf-16" +multiline +)" +)blah" +string)blah3"; +auto utf32_multi = UR"blah4(Hello +"utf-32" +multiline +)" +)blah" +string)blah4"; + // Meta strings #include <stdio> #include "lib.h" diff --git a/test/markup/cpp/string-literals.txt b/test/markup/cpp/string-literals.txt index 68b8bd411a..9939edba3e 100644 --- a/test/markup/cpp/string-literals.txt +++ b/test/markup/cpp/string-literals.txt @@ -10,21 +10,47 @@ auto wide_char = L"Hello wchar_t string"; // Raw string literals (multiline) auto char_multi = R"(Hello "normal" -muliline +multiline string.)"; auto utf8_multi = u8R"(Hello "utf-8" -muliline +multiline string)"; auto utf16_multi = uR"(Hello "utf-16" -muliline +multiline string)"; auto utf32_multi = UR"(Hello "utf-32" -muliline +multiline string)"; +// Raw string literals with delimiter (multiline) +auto char_multi = R"blah1(Hello +"normal" +multiline +)" +)blah" +string.)blah1"; +auto utf8_multi = u8R"blah2(Hello +"utf-8" +multiline +)" +)blah" +string)blah2"; +auto utf16_multi = uR"blah3(Hello +"utf-16" +multiline +)" +)blah" +string)blah3"; +auto utf32_multi = UR"blah4(Hello +"utf-32" +multiline +)" +)blah" +string)blah4"; + // Meta strings #include #include 
"lib.h"