Skip to content

Commit 538b7b6

Browse files
committed
Support recursion by group number
1 parent 505a9bd commit 538b7b6

File tree

4 files changed

+167
-111
lines changed

4 files changed

+167
-111
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
[![npm version][npm-version-src]][npm-version-href]
44
[![bundle][bundle-src]][bundle-href]
55

6-
This is a plugin for the [`regex`](https://github.com/slevithan/regex) library that adds support for recursive matching up to a specified max depth *N*, where *N* must be between 2 and 100. Generated regexes are native `RegExp` instances, and support all JavaScript regular expression features.
6+
This is an official plugin for the [`regex`](https://github.com/slevithan/regex) library that adds support for recursive matching up to a specified max depth *N*, where *N* must be between 2 and 100. Generated regexes are native `RegExp` instances, and support all JavaScript regular expression features except numbered backreferences (support could be added in future versions).
77

88
Recursive matching is added to a regex via one of the following:
99

1010
- `(?R=N)` — Recursively match the entire regex at this position.
11-
- `\g<name&R=N>` — Recursively match the contents of group *name* at this position.
11+
- `\g<name&R=N>`, `\g<number&R=N>` — Recursively match the contents of the group referenced by name or number at this position.
1212
- The `\g` subroutine must be called *within* the referenced group.
1313

14-
Recursive matching supports named captures/backreferences, and makes them independent per depth level. So e.g. `groups.name` on a match object is the value captured by group `name` at the top level of the recursion stack.
14+
Recursive matching supports named captures and named backreferences, which are independent per depth level. So e.g. `groups.name` on a match object is the value captured by group `name` at the top level of the recursion stack.
1515

1616
## Install and use
1717

package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"browser": "./dist/regex-recursion.min.js",
1010
"types": "./types/index.d.ts",
1111
"scripts": {
12-
"bundle:global": "esbuild src/index.js --global-name=Regex.plugins --bundle --minify --sourcemap --outfile=dist/index.min.js",
12+
"bundle:global": "esbuild src/index.js --global-name=Regex.plugins --bundle --minify --sourcemap --outfile=dist/regex-recursion.min.js",
1313
"types": "tsc src/index.js --rootDir src --declaration --allowJs --emitDeclarationOnly --outDir types",
1414
"prebuild": "rm -rf dist/* types/*",
1515
"build": "npm run bundle:global && npm run types",
@@ -18,8 +18,9 @@
1818
"prepare": "npm test"
1919
},
2020
"files": [
21+
"dist",
2122
"src",
22-
"dist"
23+
"types"
2324
],
2425
"repository": {
2526
"type": "git",

spec/recursion-spec.js

Lines changed: 123 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import {recursion} from '../src/index.js';
33

44
describe('recursion', () => {
55
it('should throw for invalid and unsupported recursion depths', () => {
6-
const values = ['-2', '0', '1', '+2', '2.5', '101', 'a', null];
6+
const values = ['-2', '0', '1', '02', '+2', '2.5', '101', 'a', null];
77
for (const value of values) {
88
expect(() => regex({plugins: [recursion]})({raw: [`a(?R=${value})?b`]})).toThrow();
99
expect(() => regex({plugins: [recursion]})({raw: [`(?<r>a\\g<r&R=${value}>?b)`]})).toThrow();
@@ -18,38 +18,8 @@ describe('recursion', () => {
1818
}
1919
});
2020

21-
it('should match global recursion', () => {
22-
expect(regex({plugins: [recursion]})`a(?R=2)?b`.exec('aabb')?.[0]).toBe('aabb');
23-
});
24-
25-
it('should match direct recursion', () => {
26-
expect('aabb').toMatch(regex({plugins: [recursion]})`^(?<r>a\g<r&R=2>?b)$`);
27-
expect('aab').not.toMatch(regex({plugins: [recursion]})`^(?<r>a\g<r&R=2>?b)$`);
28-
});
29-
30-
it('should throw for multiple direct, overlapping recursions', () => {
31-
expect(() => regex({plugins: [recursion]})`a(?R=2)?(?<r>a\g<r&R=2>?)`).toThrow();
32-
expect(() => regex({plugins: [recursion]})`(?<r>a\g<r&R=2>?\g<r&R=2>?)`).toThrow();
33-
});
34-
35-
it('should throw for multiple direct, nonoverlapping recursions', () => {
36-
expect(() => regex({plugins: [recursion]})`(?<r1>a\g<r1&R=2>?)(?<r2>a\g<r2&R=2>?)`).toThrow();
37-
});
38-
39-
it('should throw for indirect recursion', () => {
40-
expect(() => regex({plugins: [recursion]})`(?<a>(?<b>a\g<a&R=2>?)\g<b&R=2>)`).toThrow();
41-
expect(() => regex({plugins: [recursion]})`(?<a>\g<b&R=2>(?<b>a\g<a&R=2>?))`).toThrow();
42-
expect(() => regex({plugins: [recursion]})`(?<a>\g<b&R=2>)(?<b>a\g<a&R=2>?)`).toThrow();
43-
expect(() => regex({plugins: [recursion]})`\g<a&R=2>(?<a>\g<b&R=2>)(?<b>a\g<a&R=2>?)`).toThrow();
44-
expect(() => regex({plugins: [recursion]})`(?<a>\g<b&R=2>)(?<b>\g<c&R=2>)(?<c>a\g<a&R=2>?)`).toThrow();
45-
});
46-
47-
it('should not adjust named backreferences referring outside of the recursed expression', () => {
48-
expect('aababbabcc').toMatch(regex({plugins: [recursion]})`^(?<a>a)\k<a>(?<r>(?<b>b)\k<a>\k<b>\k<c>\g<r&R=2>?)(?<c>c)\k<c>$`);
49-
});
50-
5121
// Just documenting current behavior; this could be supported in the future
52-
it('should throw for numbered backreferences in interpolated regexes when using recursion', () => {
22+
it('should throw for numbered backrefs in interpolated regexes when using recursion', () => {
5323
expect(() => regex({plugins: [recursion]})`a(?R=2)?b${/()\1/}`).toThrow();
5424
expect(() => regex({plugins: [recursion]})`(?<n>a|\g<n&R=2>${/()\1/})`).toThrow();
5525
expect(() => regex({plugins: [recursion]})`(?<n>a|\g<n&R=2>)${/()\1/}`).toThrow();
@@ -62,55 +32,127 @@ describe('recursion', () => {
6232
expect(() => regex({plugins: [recursion]})`a(?R=2)?b(?(DEFINE))`).toThrow();
6333
expect(() => regex({plugins: [recursion]})`(?<n>a|\g<n&R=2>)(?(DEFINE))`).toThrow();
6434
});
65-
});
66-
67-
describe('readme examples', () => {
68-
it('should match an equal number of two different subpatterns', () => {
69-
expect(regex({plugins: [recursion]})`a(?R=50)?b`.exec('test aaaaaabbb')[0]).toBe('aaabbb');
70-
expect('aAbb').toMatch(regex({flags: 'i', plugins: [recursion]})`a(?R=2)?b`);
71-
});
72-
73-
it('should match an equal number of two different subpatterns, as the entire string', () => {
74-
const re = regex({plugins: [recursion]})`^
75-
(?<balanced>
76-
a
77-
# Recursively match just the specified group
78-
\g<balanced&R=50>?
79-
b
80-
)
81-
$`;
82-
expect(re.test('aaabbb')).toBeTrue();
83-
expect(re.test('aaabb')).toBeFalse();
84-
});
85-
86-
it('should match balanced parentheses', () => {
87-
const parens = regex({flags: 'g', plugins: [recursion]})`\(
88-
( [^\(\)] | (?R=50) )*
89-
\)`;
90-
expect('test ) (balanced ((parens))) () ((a)) ( (b)'.match(parens)).toEqual(['(balanced ((parens)))', '()', '((a))', '(b)']);
91-
});
92-
93-
it('should match balanced parentheses using an atomic group', () => {
94-
const parens = regex({flags: 'g', plugins: [recursion]})`\(
95-
( (?> [^\(\)]+ ) | (?R=50) )*
96-
\)`;
97-
expect('test ) (balanced ((parens))) () ((a)) ( (b)'.match(parens)).toEqual(['(balanced ((parens)))', '()', '((a))', '(b)']);
98-
});
99-
100-
it('should match palindromes', () => {
101-
const palindromes = regex({flags: 'gi', plugins: [recursion]})`(?<char>\w) ((?R=15)|\w?) \k<char>`;
102-
expect('Racecar, ABBA, and redivided'.match(palindromes)).toEqual(['Racecar', 'ABBA', 'edivide']);
103-
});
10435

105-
it('should match palindromes as complete words', () => {
106-
const palindromeWords = regex({flags: 'gi', plugins: [recursion]})`\b
107-
(?<palindrome>
108-
(?<char> \w )
109-
# Recurse, or match a lone unbalanced char in the center
110-
( \g<palindrome&R=15> | \w? )
111-
\k<char>
112-
)
113-
\b`;
114-
expect('Racecar, ABBA, and redivided'.match(palindromeWords)).toEqual(['Racecar', 'ABBA']);
36+
it('should not modify escaped recursion operators', () => {
37+
expect('a\\g<r&R=2>b').toMatch(regex({plugins: [recursion]})`^(?<r>a\\g<r&R=2>?b)$`);
38+
expect('a\\a\\bb').toMatch(regex({plugins: [recursion]})`^(?<r>a\\\g<r&R=2>?b)$`);
39+
});
40+
41+
describe('global', () => {
42+
it('should match global recursion', () => {
43+
expect(regex({plugins: [recursion]})`a(?R=2)?b`.exec('aabb')?.[0]).toBe('aabb');
44+
});
45+
46+
it('should have backrefs refer to their own recursion depth', () => {
47+
expect(regex({plugins: [recursion]})`(?<w>\w)0(?R=2)?1\k<w>`.exec('a0b01b1a')?.[0]).toBe('a0b01b1a');
48+
expect(regex({plugins: [recursion]})`(?<w>\w)0(?R=2)?1\k<w>`.test('a0b01a1b')).toBeFalse();
49+
});
50+
});
51+
52+
describe('subpattern by name', () => {
53+
it('should match direct recursion', () => {
54+
expect('aabb').toMatch(regex({plugins: [recursion]})`^(?<r>a\g<r&R=2>?b)$`);
55+
expect('aab').not.toMatch(regex({plugins: [recursion]})`^(?<r>a\g<r&R=2>?b)$`);
56+
});
57+
58+
it('should throw for multiple direct, overlapping recursions', () => {
59+
expect(() => regex({plugins: [recursion]})`a(?R=2)?(?<r>a\g<r&R=2>?)`).toThrow();
60+
expect(() => regex({plugins: [recursion]})`(?<r>a\g<r&R=2>?\g<r&R=2>?)`).toThrow();
61+
});
62+
63+
it('should throw for multiple direct, nonoverlapping recursions', () => {
64+
expect(() => regex({plugins: [recursion]})`(?<r1>a\g<r1&R=2>?)(?<r2>a\g<r2&R=2>?)`).toThrow();
65+
});
66+
67+
it('should throw for indirect recursion', () => {
68+
expect(() => regex({plugins: [recursion]})`(?<a>\g<b&R=2>)(?<b>a\g<a&R=2>?)`).toThrow();
69+
expect(() => regex({plugins: [recursion]})`\g<a&R=2>(?<a>\g<b&R=2>)(?<b>a\g<a&R=2>?)`).toThrow();
70+
expect(() => regex({plugins: [recursion]})`(?<a>\g<b&R=2>)(?<b>\g<c&R=2>)(?<c>a\g<a&R=2>?)`).toThrow();
71+
expect(() => regex({plugins: [recursion]})`(?<a>(?<b>a\g<a&R=2>?)\g<b&R=2>)`).toThrow();
72+
expect(() => regex({plugins: [recursion]})`(?<a>\g<b&R=2>(?<b>a\g<a&R=2>?))`).toThrow();
73+
});
74+
75+
it('should have backrefs refer to their own recursion depth', () => {
76+
expect(regex({plugins: [recursion]})`<(?<n>(?<w>\w)0\g<n&R=2>?1\k<w>)>`.exec('<a0b01b1a>')?.[0]).toBe('<a0b01b1a>');
77+
expect(regex({plugins: [recursion]})`<(?<n>(?<w>\w)0\g<n&R=2>?1\k<w>)>`.test('<a0b01a1b>')).toBeFalse();
78+
});
79+
80+
it('should not adjust named backrefs referring outside of the recursed subpattern', () => {
81+
expect('aababbabcc').toMatch(regex({plugins: [recursion]})`^(?<a>a)\k<a>(?<r>(?<b>b)\k<a>\k<b>\k<c>\g<r&R=2>?)(?<c>c)\k<c>$`);
82+
});
83+
84+
it('should throw if referencing a non-ancestor group', () => {
85+
expect(() => regex({plugins: [recursion]})`(?<a>)\g<a&R=2>?`).toThrow();
86+
expect(() => regex({plugins: [recursion]})`\g<a&R=2>?(?<a>)`).toThrow();
87+
expect(() => regex({plugins: [recursion]})`(?<a>)(?<b>\g<a&R=2>?)`).toThrow();
88+
expect(() => regex({plugins: [recursion]})`(?<b>\g<a&R=2>?)(?<a>)`).toThrow();
89+
});
90+
});
91+
92+
describe('subpattern by number', () => {
93+
it('should match direct recursion', () => {
94+
expect('aabb').toMatch(regex({plugins: [recursion]})`^(?<r>a\g<1&R=2>?b)$`);
95+
expect('aab').not.toMatch(regex({plugins: [recursion]})`^(?<r>a\g<1&R=2>?b)$`);
96+
expect('aabb').toMatch(regex({plugins: [recursion], disable: {n: true}})`^(a\g<1&R=2>?b)$`);
97+
expect('aab').not.toMatch(regex({plugins: [recursion], disable: {n: true}})`^(a\g<1&R=2>?b)$`);
98+
});
99+
100+
it('should throw if referencing a non-ancestor group', () => {
101+
expect(() => regex({plugins: [recursion]})`(?<a>)\g<1&R=2>?`).toThrow();
102+
expect(() => regex({plugins: [recursion]})`\g<1&R=2>?(?<a>)`).toThrow();
103+
expect(() => regex({plugins: [recursion]})`(?<a>)(?<b>\g<1&R=2>?)`).toThrow();
104+
expect(() => regex({plugins: [recursion]})`(?<b>\g<2&R=2>?)(?<a>)`).toThrow();
105+
});
106+
});
107+
108+
describe('readme examples', () => {
109+
it('should match an equal number of two different subpatterns', () => {
110+
expect(regex({plugins: [recursion]})`a(?R=50)?b`.exec('test aaaaaabbb')[0]).toBe('aaabbb');
111+
expect('aAbb').toMatch(regex({flags: 'i', plugins: [recursion]})`a(?R=2)?b`);
112+
});
113+
114+
it('should match an equal number of two different subpatterns, as the entire string', () => {
115+
const re = regex({plugins: [recursion]})`^
116+
(?<balanced>
117+
a
118+
# Recursively match just the specified group
119+
\g<balanced&R=50>?
120+
b
121+
)
122+
$`;
123+
expect(re.test('aaabbb')).toBeTrue();
124+
expect(re.test('aaabb')).toBeFalse();
125+
});
126+
127+
it('should match balanced parentheses', () => {
128+
const parens = regex({flags: 'g', plugins: [recursion]})`\(
129+
( [^\(\)] | (?R=50) )*
130+
\)`;
131+
expect('test ) (balanced ((parens))) () ((a)) ( (b)'.match(parens)).toEqual(['(balanced ((parens)))', '()', '((a))', '(b)']);
132+
});
133+
134+
it('should match balanced parentheses using an atomic group', () => {
135+
const parens = regex({flags: 'g', plugins: [recursion]})`\(
136+
( (?> [^\(\)]+ ) | (?R=50) )*
137+
\)`;
138+
expect('test ) (balanced ((parens))) () ((a)) ( (b)'.match(parens)).toEqual(['(balanced ((parens)))', '()', '((a))', '(b)']);
139+
});
140+
141+
it('should match palindromes', () => {
142+
const palindromes = regex({flags: 'gi', plugins: [recursion]})`(?<char>\w) ((?R=15)|\w?) \k<char>`;
143+
expect('Racecar, ABBA, and redivided'.match(palindromes)).toEqual(['Racecar', 'ABBA', 'edivide']);
144+
});
145+
146+
it('should match palindromes as complete words', () => {
147+
const palindromeWords = regex({flags: 'gi', plugins: [recursion]})`\b
148+
(?<palindrome>
149+
(?<char> \w )
150+
# Recurse, or match a lone unbalanced char in the center
151+
( \g<palindrome&R=15> | \w? )
152+
\k<char>
153+
)
154+
\b`;
155+
expect('Racecar, ABBA, and redivided'.match(palindromeWords)).toEqual(['Racecar', 'ABBA']);
156+
});
115157
});
116158
});

src/index.js

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import {Context, forEachUnescaped, getGroupContents, hasUnescaped, replaceUnescaped} from 'regex-utilities';
22

3-
const gRToken = String.raw`\\g<(?<gRName>[^>&]+)&R=(?<gRDepth>[^>]+)>`;
3+
const gRToken = String.raw`\\g<(?<gRNameOrNum>[^>&]+)&R=(?<gRDepth>[^>]+)>`;
44
const recursiveToken = String.raw`\(\?R=(?<rDepth>[^\)]+)\)|${gRToken}`;
55
const namedCapturingDelim = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`;
66
const token = new RegExp(String.raw`${namedCapturingDelim}|${recursiveToken}|\\?.`, 'gsu');
@@ -10,7 +10,9 @@ const token = new RegExp(String.raw`${namedCapturingDelim}|${recursiveToken}|\\?
1010
@returns {string}
1111
*/
1212
export function recursion(expression) {
13-
if (!hasUnescaped(expression, recursiveToken, Context.DEFAULT)) {
13+
// Keep the initial fail-check (which avoids unneeded processing) as fast as possible by testing
14+
// without the accuracy improvement of using `hasUnescaped` with default `Context`
15+
if (!(new RegExp(recursiveToken, 'su').test(expression))) {
1416
return expression;
1517
}
1618
if (hasUnescaped(expression, String.raw`\\[1-9]`, Context.DEFAULT)) {
@@ -32,53 +34,63 @@ export function recursion(expression) {
3234
throw new Error(`DEFINE groups cannot be used with recursion`);
3335
}
3436
const groupContentsStartPos = new Map();
37+
const openGroups = [];
3538
let numCharClassesOpen = 0;
39+
let numCaptures = 0;
3640
let match;
3741
token.lastIndex = 0;
3842
while ((match = token.exec(expression))) {
39-
const {0: m, groups: {captureName, rDepth, gRName, gRDepth}} = match;
43+
const {0: m, groups: {captureName, rDepth, gRNameOrNum, gRDepth}} = match;
4044
if (m === '[') {
4145
numCharClassesOpen++;
4246
} else if (!numCharClassesOpen) {
4347

44-
if (captureName) {
45-
groupContentsStartPos.set(captureName, token.lastIndex);
4648
// `(?R=N)`
47-
} else if (rDepth) {
49+
if (rDepth) {
4850
assertMaxInBounds(rDepth);
49-
const maxDepth = +rDepth;
5051
const pre = expression.slice(0, match.index);
5152
const post = expression.slice(token.lastIndex);
5253
assertNoFollowingRecursion(post);
53-
return makeRecursive(pre, post, maxDepth, false);
54-
// `\g<name&R=N>`
55-
} else if (gRName) {
54+
return makeRecursive(pre, post, +rDepth, false);
55+
// `\g<name&R=N>`, `\g<N&R=N>`
56+
} else if (gRNameOrNum) {
5657
assertMaxInBounds(gRDepth);
57-
const maxDepth = +gRDepth;
58-
const outsideOwnGroupMsg = `Recursion via \\g<${gRName}&R=${gRDepth}> must be used within the referenced group`;
59-
// Appears before (outside) the referenced group
60-
if (!groupContentsStartPos.has(gRName)) {
61-
throw new Error(outsideOwnGroupMsg);
58+
assertNoFollowingRecursion(expression.slice(token.lastIndex));
59+
if (!openGroups.some(g => g.name === gRNameOrNum || g.num === +gRNameOrNum)) {
60+
throw new Error(`Recursion via \\g<${gRNameOrNum}&R=${gRDepth}> must be used within the referenced group`);
6261
}
63-
const startPos = groupContentsStartPos.get(gRName);
62+
const startPos = groupContentsStartPos.get(gRNameOrNum);
6463
const recursiveGroupContents = getGroupContents(expression, startPos);
65-
// Appears after (outside) the referenced group
66-
if (!hasUnescaped(recursiveGroupContents, gRToken, Context.DEFAULT)) {
67-
throw new Error(outsideOwnGroupMsg)
68-
}
6964
const pre = expression.slice(startPos, match.index);
7065
const post = recursiveGroupContents.slice(pre.length + m.length);
71-
assertNoFollowingRecursion(expression.slice(token.lastIndex));
7266
return expression.slice(0, startPos) +
73-
makeRecursive(pre, post, maxDepth, true) +
67+
makeRecursive(pre, post, +gRDepth, true) +
7468
expression.slice(startPos + recursiveGroupContents.length);
69+
} else if (captureName) {
70+
numCaptures++;
71+
groupContentsStartPos.set(String(numCaptures), token.lastIndex);
72+
groupContentsStartPos.set(captureName, token.lastIndex);
73+
openGroups.push({
74+
num: numCaptures,
75+
name: captureName,
76+
});
77+
} else if (m.startsWith('(')) {
78+
const isUnnamedCapture = m === '(';
79+
if (isUnnamedCapture) {
80+
numCaptures++;
81+
groupContentsStartPos.set(String(numCaptures), token.lastIndex);
82+
}
83+
openGroups.push(isUnnamedCapture ? {num: numCaptures} : {});
84+
} else if (m === ')') {
85+
openGroups.pop();
7586
}
7687

7788
} else if (m === ']') {
7889
numCharClassesOpen--;
7990
}
8091
}
81-
throw new Error('Unexpected error; recursion was not processed');
92+
93+
return expression;
8294
}
8395

8496
/**
@@ -120,9 +132,9 @@ function makeRecursive(pre, post, maxDepth, isSubpattern) {
120132
// Depth 2: 'pre(?:pre(?:)post)post'
121133
// Depth 3: 'pre(?:pre(?:pre(?:)post)post)post'
122134
return `${pre}${
123-
repeatWithDepth(`(?:${pre}`, reps, isSubpattern ? namesInRecursed: null)
135+
repeatWithDepth(`(?:${pre}`, reps, (isSubpattern ? namesInRecursed : null))
124136
}(?:)${
125-
repeatWithDepth(`${post})`, reps, isSubpattern ? namesInRecursed: null, 'backward')
137+
repeatWithDepth(`${post})`, reps, (isSubpattern ? namesInRecursed : null), 'backward')
126138
}${post}`;
127139
}
128140

@@ -144,6 +156,7 @@ function repeatWithDepth(expression, reps, namesInRecursed, direction = 'forward
144156
String.raw`${namedCapturingDelim}|\\k<(?<backref>[^>]+)>`,
145157
({0: m, groups: {captureName, backref}}) => {
146158
if (backref && namesInRecursed && !namesInRecursed.has(backref)) {
159+
// Don't alter backrefs to groups outside the recursed subpattern
147160
return m;
148161
}
149162
const suffix = `_$${captureNum}`;

0 commit comments

Comments
 (0)