diff --git a/CHANGES.md b/CHANGES.md index 4a00b76655..ce1b83a012 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,11 +1,13 @@ ## Master New languages: + none. New styles: - *Night Owl* by [Carl Baxter][] Improvements: +- improve parser to properly support look-ahead regex in begin matchers (#2135) - blacklist super-common keywords from having relevance (#2179) - fix(swift): support for `@dynamicMemberLookup` and `@propertyWrapper` (#2202) - fix: `endWithParent` inside `starts` now always works (#2201) diff --git a/docs/language-guide.rst b/docs/language-guide.rst index a3cf8d806a..6e598ab8ca 100644 --- a/docs/language-guide.rst +++ b/docs/language-guide.rst @@ -258,6 +258,23 @@ Many languages share common modes and regular expressions. Such expressions are at the end under "Common regexps" and "Common modes" titles. Use them when possible. +Regular Expression Features +--------------------------- + +The goal of Highlight.js is to support whatever regex features JavaScript itself supports. You're using real regular expressions, so use them responsibly. That said, due to the design of the parser, there are some caveats. These are addressed below. + +Things we support now that we did not always: + +* look-ahead matching for `begin` (#2135) +* look-ahead matching for `illegal` (#2135) +* back-references within your regex (#1897) + +Things we currently know are still issues: + +* look-ahead matching for `end` matchers +* look-behind matching (when JS supports it) for `end` matchers + + Contributing ------------ diff --git a/src/highlight.js b/src/highlight.js index 8464b65f62..fa8e7c3895 100644 --- a/src/highlight.js +++ b/src/highlight.js @@ -313,8 +313,15 @@ https://highlightjs.org/ ); } + function reCountMatchGroups(re) { + return (new RegExp(re.toString() + '|')).exec('').length - 1; + } + // joinRe logically computes regexps.join(separator), but fixes the // backreferences so they continue to match. 
+ // it also places each individual regular expression into its own + // match group, keeping track of the sequencing of those match groups + // is currently an exercise for the caller. :-) function joinRe(regexps, separator) { // backreferenceRe matches an open parenthesis or backreference. To avoid // an incorrect parse, it additionally matches the following: @@ -327,11 +334,13 @@ https://highlightjs.org/ var numCaptures = 0; var ret = ''; for (var i = 0; i < regexps.length; i++) { + numCaptures += 1; var offset = numCaptures; var re = reStr(regexps[i]); if (i > 0) { ret += separator; } + ret += "("; while (re.length > 0) { var match = backreferenceRe.exec(re); if (match == null) { @@ -350,10 +359,75 @@ https://highlightjs.org/ } } } + ret += ")"; } return ret; } + function buildModeRegex(mode) { + + var matchIndexes = {}; + var matcherRe; + var regexes = []; + var matcher = {}; + var matchAt = 1; + + function addRule(rule, regex) { + matchIndexes[matchAt] = rule; + regexes.push([rule, regex]); + matchAt += reCountMatchGroups(regex) + 1; + } + + var term; + for (var i=0; i < mode.contains.length; i++) { + var re; + term = mode.contains[i]; + if (term.beginKeywords) { + re = '\\.?(?:' + term.begin + ')\\.?'; + } else { + re = term.begin; + } + addRule(term, re); + } + if (mode.terminator_end) + addRule("end", mode.terminator_end); + if (mode.illegal) + addRule("illegal", mode.illegal); + + var terminators = regexes.map(function(el) { return el[1] }); + matcherRe = langRe(joinRe(terminators, '|'), true); + + matcher.lastIndex = 0; + matcher.exec = function(s) { + var rule; + + if( regexes.length === 0) return null; + + matcherRe.lastIndex = matcher.lastIndex; + var match = matcherRe.exec(s); + if (!match) { return null; } + + for(var i = 0; i') + '"'); + } else if (match.type==="end") { + var processed = doEndMatch(match); + if (processed != undefined) + return processed; + } /* - Parser should not reach this point as all types of lexemes should be caught - 
earlier, but if it does due to some bug make sure it advances at least one - character forward to prevent infinite looping. + Why might we find ourselves here? Only one occasion now. An end match that was + triggered but could not be completed. When might this happen? When an `endSameAsBegin` + rule sets the end rule to a specific match. Since the overall mode termination rule that's + being used to scan the text isn't recompiled, that means that any match that LOOKS like + the end (but is not, because it is not an exact match to the beginning) will + end up here. A definite end match, but when `doEndMatch` tries to "reapply" + the end rule and fails to match, we wind up here, and just silently ignore the end. + + This causes no real harm other than stopping a few times too many. */ + mode_buffer += lexeme; - return lexeme.length || 1; + return lexeme.length; } var language = getLanguage(name); @@ -625,7 +713,7 @@ https://highlightjs.org/ match = top.terminators.exec(value); if (!match) break; - count = processLexeme(value.substring(index, match.index), match[0]); + count = processLexeme(value.substring(index, match.index), match); index = match.index + count; } processLexeme(value.substr(index)); @@ -637,12 +725,14 @@ https://highlightjs.org/ return { relevance: relevance, value: result, + illegal:false, language: name, top: top }; } catch (e) { if (e.message && e.message.indexOf('Illegal') !== -1) { return { + illegal: true, relevance: 0, value: escape(value) }; diff --git a/src/languages/abnf.js b/src/languages/abnf.js index ca7b8084ce..5b0f5d4030 100644 --- a/src/languages/abnf.js +++ b/src/languages/abnf.js @@ -52,11 +52,8 @@ function(hljs) { }; var ruleDeclarationMode = { - begin: regexes.ruleDeclaration + '\\s*=', - returnBegin: true, - end: /=/, - relevance: 0, - contains: [{className: "attribute", begin: regexes.ruleDeclaration}] + className: "attribute", + begin: regexes.ruleDeclaration + '(?=\\s*=)', }; return { diff --git a/src/languages/brainfuck.js 
b/src/languages/brainfuck.js index c9c2755d70..e4e87c4c2e 100644 --- a/src/languages/brainfuck.js +++ b/src/languages/brainfuck.js @@ -33,7 +33,7 @@ function(hljs){ }, { // this mode works as the only relevance counter - begin: /\+\+|\-\-/, returnBegin: true, + begin: /(?:\+\+|\-\-)/, contains: [LITERAL] }, LITERAL diff --git a/src/languages/coffeescript.js b/src/languages/coffeescript.js index 7cbbe52834..d1a6361749 100644 --- a/src/languages/coffeescript.js +++ b/src/languages/coffeescript.js @@ -67,7 +67,7 @@ function(hljs) { { // regex can't start with space to parse x / 2 / 3 as two divisions // regex can't start with *, and it supports an "illegal" in the main mode - begin: /\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)/ + begin: /\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W)/ } ] }, diff --git a/src/languages/livescript.js b/src/languages/livescript.js index f8731f2037..4584ef82e2 100644 --- a/src/languages/livescript.js +++ b/src/languages/livescript.js @@ -81,7 +81,7 @@ function(hljs) { { // regex can't start with space to parse x / 2 / 3 as two divisions // regex can't start with *, and it supports an "illegal" in the main mode - begin: /\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W|$)/ + begin: /\/(?![ *])(\\\/|.)*?\/[gim]*(?=\W)/ } ] }, diff --git a/src/languages/stata.js b/src/languages/stata.js index 6be20a7b88..f5e2a2411d 100644 --- a/src/languages/stata.js +++ b/src/languages/stata.js @@ -37,7 +37,7 @@ function(hljs) { className: 'built_in', variants: [ { - begin: 
'\\b(abs|acos|asin|atan|atan2|atanh|ceil|cloglog|comb|cos|digamma|exp|floor|invcloglog|invlogit|ln|lnfact|lnfactorial|lngamma|log|log10|max|min|mod|reldif|round|sign|sin|sqrt|sum|tan|tanh|trigamma|trunc|betaden|Binomial|binorm|binormal|chi2|chi2tail|dgammapda|dgammapdada|dgammapdadx|dgammapdx|dgammapdxdx|F|Fden|Ftail|gammaden|gammap|ibeta|invbinomial|invchi2|invchi2tail|invF|invFtail|invgammap|invibeta|invnchi2|invnFtail|invnibeta|invnorm|invnormal|invttail|nbetaden|nchi2|nFden|nFtail|nibeta|norm|normal|normalden|normd|npnchi2|tden|ttail|uniform|abbrev|char|index|indexnot|length|lower|ltrim|match|plural|proper|real|regexm|regexr|regexs|reverse|rtrim|string|strlen|strlower|strltrim|strmatch|strofreal|strpos|strproper|strreverse|strrtrim|strtrim|strupper|subinstr|subinword|substr|trim|upper|word|wordcount|_caller|autocode|byteorder|chop|clip|cond|e|epsdouble|epsfloat|group|inlist|inrange|irecode|matrix|maxbyte|maxdouble|maxfloat|maxint|maxlong|mi|minbyte|mindouble|minfloat|minint|minlong|missing|r|recode|replay|return|s|scalar|d|date|day|dow|doy|halfyear|mdy|month|quarter|week|year|d|daily|dofd|dofh|dofm|dofq|dofw|dofy|h|halfyearly|hofd|m|mofd|monthly|q|qofd|quarterly|tin|twithin|w|weekly|wofd|y|yearly|yh|ym|yofd|yq|yw|cholesky|colnumb|colsof|corr|det|diag|diag0cnt|el|get|hadamard|I|inv|invsym|issym|issymmetric|J|matmissing|matuniform|mreldif|nullmat|rownumb|rowsof|sweep|syminv|trace|vec|vecdiag)(?=\\(|$)' + begin: 
'\\b(abs|acos|asin|atan|atan2|atanh|ceil|cloglog|comb|cos|digamma|exp|floor|invcloglog|invlogit|ln|lnfact|lnfactorial|lngamma|log|log10|max|min|mod|reldif|round|sign|sin|sqrt|sum|tan|tanh|trigamma|trunc|betaden|Binomial|binorm|binormal|chi2|chi2tail|dgammapda|dgammapdada|dgammapdadx|dgammapdx|dgammapdxdx|F|Fden|Ftail|gammaden|gammap|ibeta|invbinomial|invchi2|invchi2tail|invF|invFtail|invgammap|invibeta|invnchi2|invnFtail|invnibeta|invnorm|invnormal|invttail|nbetaden|nchi2|nFden|nFtail|nibeta|norm|normal|normalden|normd|npnchi2|tden|ttail|uniform|abbrev|char|index|indexnot|length|lower|ltrim|match|plural|proper|real|regexm|regexr|regexs|reverse|rtrim|string|strlen|strlower|strltrim|strmatch|strofreal|strpos|strproper|strreverse|strrtrim|strtrim|strupper|subinstr|subinword|substr|trim|upper|word|wordcount|_caller|autocode|byteorder|chop|clip|cond|e|epsdouble|epsfloat|group|inlist|inrange|irecode|matrix|maxbyte|maxdouble|maxfloat|maxint|maxlong|mi|minbyte|mindouble|minfloat|minint|minlong|missing|r|recode|replay|return|s|scalar|d|date|day|dow|doy|halfyear|mdy|month|quarter|week|year|d|daily|dofd|dofh|dofm|dofq|dofw|dofy|h|halfyearly|hofd|m|mofd|monthly|q|qofd|quarterly|tin|twithin|w|weekly|wofd|y|yearly|yh|ym|yofd|yq|yw|cholesky|colnumb|colsof|corr|det|diag|diag0cnt|el|get|hadamard|I|inv|invsym|issym|issymmetric|J|matmissing|matuniform|mreldif|nullmat|rownumb|rowsof|sweep|syminv|trace|vec|vecdiag)(?=\\()' } ] }, diff --git a/src/languages/stylus.js b/src/languages/stylus.js index 7b1f423c6e..ff794862d3 100644 --- a/src/languages/stylus.js +++ b/src/languages/stylus.js @@ -122,7 +122,7 @@ function(hljs) { 'video' ]; - var TAG_END = '[\\.\\s\\n\\[\\:,]'; + var LOOKAHEAD_TAG_END = '(?=[\\.\\s\\n\\[\\:,])'; var ATTRIBUTES = [ 'align-content', @@ -365,34 +365,25 @@ function(hljs) { // class tag { - begin: '\\.[a-zA-Z][a-zA-Z0-9_-]*' + TAG_END, - returnBegin: true, - contains: [ - {className: 'selector-class', begin: '\\.[a-zA-Z][a-zA-Z0-9_-]*'} - ] + begin: 
'\\.[a-zA-Z][a-zA-Z0-9_-]*' + LOOKAHEAD_TAG_END, + className: 'selector-class' }, // id tag { - begin: '\\#[a-zA-Z][a-zA-Z0-9_-]*' + TAG_END, - returnBegin: true, - contains: [ - {className: 'selector-id', begin: '\\#[a-zA-Z][a-zA-Z0-9_-]*'} - ] + begin: '\\#[a-zA-Z][a-zA-Z0-9_-]*' + LOOKAHEAD_TAG_END, + className: 'selector-id' }, // tags { - begin: '\\b(' + TAGS.join('|') + ')' + TAG_END, - returnBegin: true, - contains: [ - {className: 'selector-tag', begin: '\\b[a-zA-Z][a-zA-Z0-9_-]*'} - ] + begin: '\\b(' + TAGS.join('|') + ')' + LOOKAHEAD_TAG_END, + className: 'selector-tag' }, // psuedo selectors { - begin: '&?:?:\\b(' + PSEUDO_SELECTORS.join('|') + ')' + TAG_END + begin: '&?:?:\\b(' + PSEUDO_SELECTORS.join('|') + ')' + LOOKAHEAD_TAG_END }, // @ keywords diff --git a/src/languages/xml.js b/src/languages/xml.js index 184da67da4..1f16442735 100644 --- a/src/languages/xml.js +++ b/src/languages/xml.js @@ -79,7 +79,7 @@ function(hljs) { ending braket. The '$' is needed for the lexeme to be recognized by hljs.subMode() that tests lexemes outside the stream. */ - begin: '|$)', end: '>', + begin: ')', end: '>', keywords: {name: 'style'}, contains: [TAG_INTERNALS], starts: { @@ -90,7 +90,7 @@ function(hljs) { { className: 'tag', // See the comment in the