Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
## Master

New languages:
none.

New styles:
- *Night Owl* by [Carl Baxter][]

Improvements:
- improve parser to properly support look-ahead regex in begin matchers (#2135)
- blacklist super-common keywords from having relevance (#2179)
- fix(swift): support for `@dynamicMemberLookup` and `@propertyWrapper` (#2202)
- fix: `endWithParent` inside `starts` now always works (#2201)
Expand Down
17 changes: 17 additions & 0 deletions docs/language-guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,23 @@ Many languages share common modes and regular expressions. Such expressions are
at the end under "Common regexps" and "Common modes" titles. Use them when possible.


Regular Expression Features
---------------------------

The goal of Highlight.js is to support whatever regex features JavaScript itself supports. You're using real regular expressions, so use them responsibly. That said, due to the design of the parser, there are some caveats. These are addressed below.

Things we support now that we did not always:

* look-ahead matching for `begin` (#2135)
* look-ahead matching for `illegal` (#2135)
* back-references within your regex (#1897)

Things we currently know are still issues:

* look-ahead matching for `end` matchers
* look-behind matching (when JS supports it) for `end` matchers


Contributing
------------

Expand Down
240 changes: 165 additions & 75 deletions src/highlight.js
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,15 @@ https://highlightjs.org/
);
}

// Counts the capturing groups in `re` (a RegExp or a regex source string).
//
// Appending an empty alternative ('|') produces a pattern that is guaranteed
// to match the empty string. The resulting match array holds one slot for the
// whole match plus one slot per capturing group, so the group count is the
// array length minus one.
function reCountMatchGroups(re) {
  var probe = new RegExp(re.toString() + '|');
  var emptyMatch = probe.exec('');
  return emptyMatch.length - 1;
}

// joinRe logically computes regexps.join(separator), but fixes the
// backreferences so they continue to match.
// It also places each individual regular expression into its own
// match group. Keeping track of the sequencing of those match groups
// is currently an exercise for the caller. :-)
function joinRe(regexps, separator) {
// backreferenceRe matches an open parenthesis or backreference. To avoid
// an incorrect parse, it additionally matches the following:
Expand All @@ -327,11 +334,13 @@ https://highlightjs.org/
var numCaptures = 0;
var ret = '';
for (var i = 0; i < regexps.length; i++) {
numCaptures += 1;
var offset = numCaptures;
var re = reStr(regexps[i]);
if (i > 0) {
ret += separator;
}
ret += "(";
while (re.length > 0) {
var match = backreferenceRe.exec(re);
if (match == null) {
Expand All @@ -350,10 +359,75 @@ https://highlightjs.org/
}
}
}
ret += ")";
}
return ret;
}

// Builds the combined "terminators" matcher for a mode.
//
// Every begin pattern in `mode.contains`, plus `mode.terminator_end` and
// `mode.illegal` when present, is joined into a single alternation. The index
// of the capture group that wraps each alternative is recorded so that a match
// can be traced back to the rule that produced it.
//
// Returns an object with a writable `lastIndex` and an `exec(s)` method.
// `exec` returns null when nothing matches (or when there are no rules);
// otherwise it returns the regex match array annotated with:
//   - type: "begin", "end" or "illegal"
//   - rule: the matching entry of mode.contains (begin matches only)
//   - extra: [mode.illegal, mode.terminator_end] (end/illegal matches only)
function buildModeRegex(mode) {
  // first capture-group index of each alternative -> its rule
  var ruleByGroup = {};
  var sources = [];
  var nextGroup = 1;

  function addRule(rule, regex) {
    ruleByGroup[nextGroup] = rule;
    sources.push(regex);
    // skip this rule's own groups plus the wrapper group joinRe adds around it
    nextGroup += reCountMatchGroups(regex) + 1;
  }

  for (var n = 0; n < mode.contains.length; n++) {
    var term = mode.contains[n];
    var beginSource = term.beginKeywords
      ? '\\.?(?:' + term.begin + ')\\.?'
      : term.begin;
    addRule(term, beginSource);
  }
  if (mode.terminator_end) {
    addRule("end", mode.terminator_end);
  }
  if (mode.illegal) {
    addRule("illegal", mode.illegal);
  }

  var combinedRe = langRe(joinRe(sources, '|'), true);

  var matcher = {
    lastIndex: 0,
    exec: function(s) {
      if (sources.length === 0) {
        return null;
      }

      // the caller positions the scan through matcher.lastIndex
      combinedRe.lastIndex = matcher.lastIndex;
      var match = combinedRe.exec(s);
      if (!match) {
        return null;
      }

      // the first participating group that starts an alternative names the rule
      var rule;
      for (var g = 0; g < match.length; g++) {
        if (match[g] != undefined && ruleByGroup["" + g] != undefined) {
          rule = ruleByGroup["" + g];
          break;
        }
      }

      if (typeof rule === "string") {
        // "end" and "illegal" are registered as plain strings
        match.type = rule;
        match.extra = [mode.illegal, mode.terminator_end];
      } else {
        match.type = "begin";
        match.rule = rule;
      }
      return match;
    }
  };

  return matcher;
}

function compileMode(mode, parent) {
if (mode.compiled)
return;
Expand Down Expand Up @@ -398,14 +472,7 @@ https://highlightjs.org/
compileMode(mode.starts, parent);
}

var terminators =
mode.contains.map(function(c) {
return c.beginKeywords ? '\\.?(?:' + c.begin + ')\\.?' : c.begin;
})
.concat([mode.terminator_end, mode.illegal])
.map(reStr)
.filter(Boolean);
mode.terminators = terminators.length ? langRe(joinRe(terminators, '|'), true) : {exec: function(/*s*/) {return null;}};
mode.terminators = buildModeRegex(mode);
}

compileMode(language);
Expand All @@ -426,19 +493,6 @@ https://highlightjs.org/
return new RegExp(value.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'), 'm');
}

// Returns the first entry of `mode.contains` whose `beginRe` passes `testRe`
// for `lexeme`, or undefined when none does.
//
// Side effect: when the matching entry has `endSameAsBegin` set, its `endRe`
// is replaced with an escaped regex built from the text its `beginRe` matched.
function subMode(lexeme, mode) {
  var total = mode.contains.length;

  for (var idx = 0; idx < total; idx++) {
    if (!testRe(mode.contains[idx].beginRe, lexeme)) {
      continue;
    }
    var candidate = mode.contains[idx];
    if (candidate.endSameAsBegin) {
      candidate.endRe = escapeRe( candidate.beginRe.exec(lexeme)[0] );
    }
    return candidate;
  }
}

function endOfMode(mode, lexeme) {
if (testRe(mode.endRe, lexeme)) {
while (mode.endsParent && mode.parent) {
Expand All @@ -451,10 +505,6 @@ https://highlightjs.org/
}
}

// Reports whether `lexeme` is illegal in `mode`.
// Always false while `ignore_illegals` is set; otherwise the result of
// testing the lexeme against the mode's `illegalRe`.
function isIllegal(lexeme, mode) {
  if (ignore_illegals) {
    return false;
  }
  return testRe(mode.illegalRe, lexeme);
}

function keywordMatch(mode, match) {
var match_str = language.case_insensitive ? match[0].toLowerCase() : match[0];
return mode.keywords.hasOwnProperty(match_str) && mode.keywords[match_str];
Expand Down Expand Up @@ -532,74 +582,112 @@ https://highlightjs.org/
top = Object.create(mode, {parent: {value: top}});
}

function processLexeme(buffer, lexeme) {

mode_buffer += buffer;
function doBeginMatch(match) {
var lexeme = match[0];
var new_mode = match.rule;

if (lexeme == null) {
processBuffer();
return 0;
if (new_mode && new_mode.endSameAsBegin) {
new_mode.endRe = escapeRe( lexeme );
}

var new_mode = subMode(lexeme, top);
if (new_mode) {
if (new_mode.skip) {
if (new_mode.skip) {
mode_buffer += lexeme;
} else {
if (new_mode.excludeBegin) {
mode_buffer += lexeme;
} else {
if (new_mode.excludeBegin) {
mode_buffer += lexeme;
}
processBuffer();
if (!new_mode.returnBegin && !new_mode.excludeBegin) {
mode_buffer = lexeme;
}
}
startNewMode(new_mode, lexeme);
return new_mode.returnBegin ? 0 : lexeme.length;
processBuffer();
if (!new_mode.returnBegin && !new_mode.excludeBegin) {
mode_buffer = lexeme;
}
}
startNewMode(new_mode, lexeme);
return new_mode.returnBegin ? 0 : lexeme.length;
}

function doEndMatch(match) {
var lexeme = match[0];
var end_mode = endOfMode(top, lexeme);
if (end_mode) {
var origin = top;
if (origin.skip) {
if (!end_mode) { return; }

var origin = top;
if (origin.skip) {
mode_buffer += lexeme;
} else {
if (!(origin.returnEnd || origin.excludeEnd)) {
mode_buffer += lexeme;
} else {
if (!(origin.returnEnd || origin.excludeEnd)) {
mode_buffer += lexeme;
}
processBuffer();
if (origin.excludeEnd) {
mode_buffer = lexeme;
}
}
do {
if (top.className) {
result += spanEndTag;
}
if (!top.skip && !top.subLanguage) {
relevance += top.relevance;
}
top = top.parent;
} while (top !== end_mode.parent);
if (end_mode.starts) {
if (end_mode.endSameAsBegin) {
end_mode.starts.endRe = end_mode.endRe;
}
startNewMode(end_mode.starts, '');
processBuffer();
if (origin.excludeEnd) {
mode_buffer = lexeme;
}
return origin.returnEnd ? 0 : lexeme.length;
}
do {
if (top.className) {
result += spanEndTag;
}
if (!top.skip && !top.subLanguage) {
relevance += top.relevance;
}
top = top.parent;
} while (top !== end_mode.parent);
if (end_mode.starts) {
if (end_mode.endSameAsBegin) {
end_mode.starts.endRe = end_mode.endRe;
}
startNewMode(end_mode.starts, '');
}
return origin.returnEnd ? 0 : lexeme.length;
}

if (isIllegal(lexeme, top))
var lastMatch = {};
function processLexeme(text_before_match, match) {

var lexeme = match && match[0];

// add non-matched text to the current mode buffer
mode_buffer += text_before_match;

if (lexeme == null) {
processBuffer();
return 0;
}

// we've found a 0 width match and we're stuck, so we need to advance
// this happens when we have badly behaved rules that have optional matchers to the degree that
// sometimes they can end up matching nothing at all
// Ref: https://github.com/highlightjs/highlight.js/issues/2140
if (lastMatch.type=="begin" && match.type=="end" && lastMatch.index == match.index && lexeme === "") {
return 1;
}
lastMatch = match;

if (match.type==="begin") {
return doBeginMatch(match);
} else if (match.type==="illegal" && !ignore_illegals) {
// illegal match, we do not continue processing
throw new Error('Illegal lexeme "' + lexeme + '" for mode "' + (top.className || '<unnamed>') + '"');
} else if (match.type==="end") {
var processed = doEndMatch(match);
if (processed != undefined)
return processed;
}

/*
Parser should not reach this point as all types of lexemes should be caught
earlier, but if it does due to some bug make sure it advances at least one
character forward to prevent infinite looping.
Why might we find ourselves here? Only one occasion now. An end match that was
triggered but could not be completed. When might this happen? When an `endSameAsBegin`
rule sets the end rule to a specific match. Since the overall mode termination rule that's
being used to scan the text isn't recompiled that means that any match that LOOKS like
the end (but is not, because it is not an exact match to the beginning) will
end up here. A definite end match, but when `doEndMatch` tries to "reapply"
the end rule and fails to match, we wind up here, and just silently ignore the end.

This causes no real harm other than stopping a few times too many.
*/

mode_buffer += lexeme;
return lexeme.length || 1;
return lexeme.length;
}

var language = getLanguage(name);
Expand All @@ -625,7 +713,7 @@ https://highlightjs.org/
match = top.terminators.exec(value);
if (!match)
break;
count = processLexeme(value.substring(index, match.index), match[0]);
count = processLexeme(value.substring(index, match.index), match);
index = match.index + count;
}
processLexeme(value.substr(index));
Expand All @@ -637,12 +725,14 @@ https://highlightjs.org/
return {
relevance: relevance,
value: result,
illegal:false,
language: name,
top: top
};
} catch (e) {
if (e.message && e.message.indexOf('Illegal') !== -1) {
return {
illegal: true,
relevance: 0,
value: escape(value)
};
Expand Down
7 changes: 2 additions & 5 deletions src/languages/abnf.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,8 @@ function(hljs) {
};

var ruleDeclarationMode = {
begin: regexes.ruleDeclaration + '\\s*=',
returnBegin: true,
end: /=/,
relevance: 0,
contains: [{className: "attribute", begin: regexes.ruleDeclaration}]
className: "attribute",
begin: regexes.ruleDeclaration + '(?=\\s*=)',
};

return {
Expand Down
2 changes: 1 addition & 1 deletion src/languages/brainfuck.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ function(hljs){
},
{
// this mode works as the only relevance counter
begin: /\+\+|\-\-/, returnBegin: true,
begin: /(?:\+\+|\-\-)/,
contains: [LITERAL]
},
LITERAL
Expand Down
Loading