diff --git a/src/builders.ts b/src/builders.ts index c698ca9..5568761 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,6 +1,5 @@ import type { RegexFlags, RegexSequence } from './types'; -import { encodeSequence } from './encoder/encoder'; -import { ensureArray } from './utils/elements'; +import { encode } from './encoder'; /** * Generate RegExp object from elements with optional flags. @@ -10,7 +9,7 @@ import { ensureArray } from './utils/elements'; * @returns RegExp object */ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { - const pattern = encodeSequence(ensureArray(sequence)).pattern; + const pattern = encode(sequence).pattern; const flagsString = encodeFlags(flags ?? {}); return new RegExp(pattern, flagsString); } @@ -21,7 +20,7 @@ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp * @returns regex pattern string */ export function buildPattern(sequence: RegexSequence): string { - return encodeSequence(ensureArray(sequence)).pattern; + return encode(sequence).pattern; } function encodeFlags(flags: RegexFlags): string { diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index 678abcb..bccd0e2 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -37,6 +37,12 @@ test('`charClass` joins character escapes', () => { expect(charClass(word, nonDigit)).toEqualRegex(/[\w\D]/); }); +test('`charClass` throws on empty text', () => { + expect(() => charClass()).toThrowErrorMatchingInlineSnapshot( + `"\`charClass\` should receive at least one element"`, + ); +}); + test('`charRange` pattern', () => { expect(charRange('a', 'z')).toEqualRegex(/[a-z]/); expect(['x', charRange('0', '9')]).toEqualRegex(/x[0-9]/); @@ -115,16 +121,7 @@ test('`negated` character class matching', () => { }); test('`encodeCharacterClass` throws on empty text', () => { - expect(() => - buildRegExp( - // @ts-expect-error - 
negated({ - type: 'characterClass', - chars: [], - ranges: [], - }), - ), - ).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(negated({ chars: [], ranges: [] }))).toThrowErrorMatchingInlineSnapshot( `"Character class should contain at least one character or character range"`, ); }); diff --git a/src/encoder/__tests__/encoder.test.tsx b/src/constructs/__tests__/encoder.test.tsx similarity index 95% rename from src/encoder/__tests__/encoder.test.tsx rename to src/constructs/__tests__/encoder.test.tsx index afc9624..4713160 100644 --- a/src/encoder/__tests__/encoder.test.tsx +++ b/src/constructs/__tests__/encoder.test.tsx @@ -74,7 +74,11 @@ test('`buildRegExp` throws error on unknown element', () => { expect(() => // @ts-expect-error intentionally passing incorrect object buildRegExp({ type: 'unknown' }), - ).toThrowErrorMatchingInlineSnapshot(`"\`encodeNode\`: unknown element type unknown"`); + ).toThrowErrorMatchingInlineSnapshot(` + "\`encodeElement\`: unknown element: { + "type": "unknown" + }" + `); }); test('`buildPattern` throws on empty text', () => { diff --git a/src/constructs/anchors.ts b/src/constructs/anchors.ts index 718e7d6..6d61f42 100644 --- a/src/constructs/anchors.ts +++ b/src/constructs/anchors.ts @@ -1,43 +1,26 @@ -import type { EncodeResult } from '../encoder/types'; -import type { RegexConstruct } from '../types'; +import type { EncodedRegex } from '../types'; -export interface Anchor extends RegexConstruct { - type: 'anchor'; - symbol: string; -} - -export const startOfString: Anchor = { - type: 'anchor', - symbol: '^', - encode: encodeAnchor, +export const startOfString: EncodedRegex = { + precedence: 'atom', + pattern: '^', }; -export const endOfString: Anchor = { - type: 'anchor', - symbol: '$', - encode: encodeAnchor, +export const endOfString: EncodedRegex = { + precedence: 'atom', + pattern: '$', }; -export const wordBoundary: Anchor = { - type: 'anchor', - symbol: '\\b', - encode: encodeAnchor, +export const wordBoundary: 
EncodedRegex = { + precedence: 'atom', + pattern: '\\b', }; -export const nonWordBoundary: Anchor = { - type: 'anchor', - symbol: '\\B', - encode: encodeAnchor, +export const nonWordBoundary: EncodedRegex = { + precedence: 'atom', + pattern: '\\B', }; /** * @deprecated Renamed to `nonWordBoundary`. */ export const notWordBoundary = nonWordBoundary; - -function encodeAnchor(this: Anchor): EncodeResult { - return { - precedence: 'sequence', - pattern: this.symbol, - }; -} diff --git a/src/constructs/capture.ts b/src/constructs/capture.ts index 471c463..3814866 100644 --- a/src/constructs/capture.ts +++ b/src/constructs/capture.ts @@ -1,13 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; - -export interface Capture extends RegexConstruct { - type: 'capture'; - children: RegexElement[]; - options?: CaptureOptions; -} +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; export type CaptureOptions = { /** @@ -16,8 +8,7 @@ export type CaptureOptions = { name?: string; }; -export interface Reference extends RegexConstruct { - type: 'reference'; +export interface Reference extends EncodedRegex { name: string; } @@ -26,12 +17,18 @@ export interface Reference extends RegexConstruct { * - in the match results (`String.match`, `String.matchAll`, or `RegExp.exec`) * - in the regex itself, through {@link ref} */ -export function capture(sequence: RegexSequence, options?: CaptureOptions): Capture { +export function capture(sequence: RegexSequence, options?: CaptureOptions): EncodedRegex { + const name = options?.name; + if (name) { + return { + precedence: 'atom', + pattern: `(?<${name}>${encode(sequence).pattern})`, + }; + } + return { - type: 'capture', - children: ensureArray(sequence), - options, - encode: encodeCapture, + precedence: 
'atom', + pattern: `(${encode(sequence).pattern})`, }; } @@ -45,31 +42,9 @@ export function capture(sequence: RegexSequence, options?: CaptureOptions): Capt * @param name - Name of the capturing group to reference. */ export function ref(name: string): Reference { - return { - type: 'reference', - name, - encode: encodeReference, - }; -} - -function encodeCapture(this: Capture): EncodeResult { - const name = this.options?.name; - if (name) { - return { - precedence: 'atom', - pattern: `(?<${name}>${encodeSequence(this.children).pattern})`, - }; - } - return { precedence: 'atom', - pattern: `(${encodeSequence(this.children).pattern})`, - }; -} - -function encodeReference(this: Reference): EncodeResult { - return { - precedence: 'atom', - pattern: `\\k<${this.name}>`, + pattern: `\\k<${name}>`, + name, }; } diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index ed90621..b2bc758 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -1,27 +1,14 @@ -import type { EncodeResult } from '../encoder/types'; -import type { RegexConstruct } from '../types'; -import type { CharacterEscape } from './char-escape'; - -/** - * Character range from start to end (inclusive). - */ -export interface CharacterRange { - start: string; - end: string; -} - -export interface CharacterClass extends RegexConstruct { - type: 'characterClass'; - chars: string[]; - ranges?: CharacterRange[]; -} +import { encodeCharClass } from '../encoder'; +import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; export function charClass(...elements: Array): CharacterClass { + if (!elements.length) { + throw new Error('`charClass` should receive at least one element'); + } + return { - type: 'characterClass', chars: elements.map((c) => c.chars).flat(), ranges: elements.map((c) => c.ranges ?? 
[]).flat(), - encode: encodeCharacterClass, }; } @@ -39,10 +26,8 @@ export function charRange(start: string, end: string): CharacterClass { } return { - type: 'characterClass', chars: [], ranges: [{ start, end }], - encode: encodeCharacterClass, }; } @@ -54,14 +39,12 @@ export function anyOf(characters: string): CharacterClass { } return { - type: 'characterClass', chars, - encode: encodeCharacterClass, }; } -export function negated(element: CharacterClass | CharacterEscape): EncodeResult { - return encodeCharacterClass.call(element, true); +export function negated(element: CharacterClass | CharacterEscape): EncodedRegex { + return encodeCharClass(element, true); } /** @@ -69,32 +52,6 @@ export function negated(element: CharacterClass | CharacterEscape): EncodeResult */ export const inverted = negated; -export function encodeCharacterClass( - this: CharacterClass | CharacterEscape, - isNegated?: boolean, -): EncodeResult { - if (!this.chars.length && !this.ranges?.length) { - throw new Error('Character class should contain at least one character or character range'); - } - - // If passed characters includes hyphen (`-`) it need to be moved to - // first (or last) place in order to treat it as hyphen character and not a range. - // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types - const hyphen = this.chars.includes('-') ? '-' : ''; - const caret = this.chars.includes('^') ? '^' : ''; - const otherChars = this.chars.filter((c) => c !== '-' && c !== '^').join(''); - const ranges = this.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; - const negation = isNegated ? 
'^' : ''; - - let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; - if (pattern === '[^-]') pattern = '[\\^-]'; - - return { - precedence: 'atom', - pattern, - }; -} - function escapeForCharacterClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index dfbd35e..77aa2cb 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -1,18 +1,10 @@ -import type { EncodeResult } from '../encoder/types'; - -export interface CharacterEscape extends EncodeResult { - kind: 'escape'; - - // `CharacterClass` compatibility - chars: string[]; - ranges?: never; -} +import type { CharacterEscape, EncodedRegex } from '../types'; /** * Matches any single character. * Specifically this one is NOT a character escape. */ -export const any: EncodeResult = { +export const any: EncodedRegex = { precedence: 'atom', pattern: '.', }; @@ -21,42 +13,36 @@ export const digit: CharacterEscape = { precedence: 'atom', pattern: '\\d', chars: ['\\d'], - kind: 'escape', }; export const nonDigit: CharacterEscape = { precedence: 'atom', pattern: '\\D', chars: ['\\D'], - kind: 'escape', }; export const word: CharacterEscape = { precedence: 'atom', pattern: '\\w', chars: ['\\w'], - kind: 'escape', }; export const nonWord: CharacterEscape = { precedence: 'atom', pattern: '\\W', chars: ['\\W'], - kind: 'escape', }; export const whitespace: CharacterEscape = { precedence: 'atom', pattern: '\\s', chars: ['\\s'], - kind: 'escape', }; export const nonWhitespace: CharacterEscape = { precedence: 'atom', pattern: '\\S', chars: ['\\S'], - kind: 'escape', }; /** diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index d2bd3ac..40be23a 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -1,27 +1,12 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from 
'../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; -export interface ChoiceOf extends RegexConstruct { - type: 'choiceOf'; - alternatives: RegexElement[][]; -} - -export function choiceOf(...alternatives: RegexSequence[]): ChoiceOf { +export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { if (alternatives.length === 0) { throw new Error('`choiceOf` should receive at least one alternative'); } - return { - type: 'choiceOf', - alternatives: alternatives.map((c) => ensureArray(c)), - encode: encodeChoiceOf, - }; -} - -function encodeChoiceOf(this: ChoiceOf): EncodeResult { - const encodedAlternatives = this.alternatives.map((c) => encodeSequence(c)); + const encodedAlternatives = alternatives.map((c) => encode(c)); if (encodedAlternatives.length === 1) { return encodedAlternatives[0]!; } diff --git a/src/constructs/lookahead.ts b/src/constructs/lookahead.ts index 5715dad..6180033 100644 --- a/src/constructs/lookahead.ts +++ b/src/constructs/lookahead.ts @@ -1,7 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Positive lookahead assertion. 
@@ -17,22 +15,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(?=abc)/ * ``` */ -export interface Lookahead extends RegexConstruct { - type: 'lookahead'; - children: RegexElement[]; -} - -export function lookahead(sequence: RegexSequence): Lookahead { - return { - type: 'lookahead', - children: ensureArray(sequence), - encode: encodeLookahead, - }; -} - -function encodeLookahead(this: Lookahead): EncodeResult { +export function lookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?=${encodeSequence(this.children).pattern})`, + pattern: `(?=${encode(sequence).pattern})`, }; } diff --git a/src/constructs/lookbehind.ts b/src/constructs/lookbehind.ts index 0ed418e..9187bed 100644 --- a/src/constructs/lookbehind.ts +++ b/src/constructs/lookbehind.ts @@ -1,7 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Positive lookbehind assertion. 
@@ -17,22 +15,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(?<=abc)/ * ``` */ -export interface Lookbehind extends RegexConstruct { - type: 'lookbehind'; - children: RegexElement[]; -} - -export function lookbehind(sequence: RegexSequence): Lookbehind { - return { - type: 'lookbehind', - children: ensureArray(sequence), - encode: encodeLookbehind, - }; -} - -function encodeLookbehind(this: Lookbehind): EncodeResult { +export function lookbehind(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?<=${encodeSequence(this.children).pattern})`, + pattern: `(?<=${encode(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookahead.ts b/src/constructs/negative-lookahead.ts index 18b9a18..5694ca6 100644 --- a/src/constructs/negative-lookahead.ts +++ b/src/constructs/negative-lookahead.ts @@ -1,7 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Negative lookahead assertion. 
@@ -17,22 +15,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(?=abc)/ * ``` */ -export interface NegativeLookahead extends RegexConstruct { - type: 'negativeLookahead'; - children: RegexElement[]; -} - -export function negativeLookahead(sequence: RegexSequence): NegativeLookahead { - return { - type: 'negativeLookahead', - children: ensureArray(sequence), - encode: encodeNegativeLookahead, - }; -} - -function encodeNegativeLookahead(this: NegativeLookahead): EncodeResult { +export function negativeLookahead(sequence: RegexSequence): EncodedRegex { return { precedence: 'atom', - pattern: `(?!${encodeSequence(this.children).pattern})`, + pattern: `(?!${encode(sequence).pattern})`, }; } diff --git a/src/constructs/negative-lookbehind.ts b/src/constructs/negative-lookbehind.ts index f2e5fcc..b0264f3 100644 --- a/src/constructs/negative-lookbehind.ts +++ b/src/constructs/negative-lookbehind.ts @@ -1,7 +1,5 @@ -import { encodeSequence } from '../encoder/encoder'; -import type { EncodeResult } from '../encoder/types'; -import { ensureArray } from '../utils/elements'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; +import { encode } from '../encoder'; +import type { EncodedRegex, RegexSequence } from '../types'; /** * Negative lookbehind assertion. @@ -17,22 +15,9 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; * // /(? encodeElement(n)); + + if (encoded.length === 1) { + return encoded[0]!; + } + + return { + precedence: 'sequence', + pattern: encoded + .map((n) => (n.precedence === 'disjunction' ? encodeAtomic(n) : n.pattern)) + .join(''), + }; +} + +export function encodeAtomic(sequence: RegexSequence): string { + const encoded = encode(sequence); + return encoded.precedence === 'atom' ? 
encoded.pattern : `(?:${encoded.pattern})`; +} + +function encodeElement(element: RegexElement): EncodedRegex { + if (typeof element === 'string') { + return encodeText(element); + } + + if (typeof element === 'object' && element instanceof RegExp) { + return encodeRegExp(element); + } + + // EncodedRegex + if (typeof element === 'object' && 'pattern' in element) { + return element; + } + + // CharacterClass + if (typeof element === 'object' && 'chars' in element) { + return encodeCharClass(element); + } + + throw new Error(`\`encodeElement\`: unknown element: ${JSON.stringify(element, null, 2)}`); +} + +function encodeText(text: string): EncodedRegex { + if (text.length === 0) { + throw new Error('`encodeText`: received text should not be empty'); + } + + return { + // Optimize for single character case + precedence: text.length === 1 ? 'atom' : 'sequence', + pattern: escapeText(text), + }; +} + +function encodeRegExp(regexp: RegExp): EncodedRegex { + const pattern = regexp.source; + + return { + // Encode at safe precedence + precedence: isAtomicPattern(pattern) ? 'atom' : 'disjunction', + pattern, + }; +} + +// This is intended to catch only some popular atomic patterns like char classes. +function isAtomicPattern(pattern: string): boolean { + if (pattern.length === 1) { + return true; + } + + if (pattern.startsWith('[') && pattern.endsWith(']') && pattern.match(/[[\]]/g)?.length === 2) { + return true; + } + + if (pattern.startsWith('(') && pattern.endsWith(')') && pattern.match(/[()]/g)?.length === 2) { + return true; + } + + return false; +} + +export function encodeCharClass(element: CharacterClass, isNegated?: boolean): EncodedRegex { + if (!element.chars.length && !element.ranges?.length) { + throw new Error('Character class should contain at least one character or character range'); + } + + // If passed characters includes hyphen (`-`) it need to be moved to + // first (or last) place in order to treat it as hyphen character and not a range. 
+ // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types + const hyphen = element.chars.includes('-') ? '-' : ''; + const caret = element.chars.includes('^') ? '^' : ''; + const otherChars = element.chars.filter((c) => c !== '-' && c !== '^').join(''); + const ranges = element.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; + const negation = isNegated ? '^' : ''; + + let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; + if (pattern === '[^-]') pattern = '[\\^-]'; + + return { + precedence: 'atom', + pattern, + }; +} + +// Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping +function escapeText(text: string) { + return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string +} diff --git a/src/encoder/encoder.ts b/src/encoder/encoder.ts deleted file mode 100644 index bd61dad..0000000 --- a/src/encoder/encoder.ts +++ /dev/null @@ -1,102 +0,0 @@ -import type { RegexElement } from '../types'; -import { escapeText } from '../utils/text'; -import type { EncodeResult } from './types'; - -export function encodeSequence(elements: RegexElement[]): EncodeResult { - const encodedNodes = elements.map((n) => encodeNode(n)); - return concatSequence(encodedNodes); -} - -export function encodeAtom(elements: RegexElement[]): EncodeResult { - return wrapAtom(encodeSequence(elements)); -} - -function encodeNode(element: RegexElement): EncodeResult { - if (typeof element === 'string') { - return encodeText(element); - } - - if (typeof element === 'object' && element instanceof RegExp) { - return encodeRegExp(element); - } - - if (typeof element === 'object' && 'pattern' in element) { - return element; - } - - if (typeof element === 'object' && typeof element.encode !== 'function') { - throw new Error(`\`encodeNode\`: unknown element type ${element.type}`); - } - - return element.encode(); -} - -function 
encodeText(text: string): EncodeResult { - if (text.length === 0) { - throw new Error('`encodeText`: received text should not be empty'); - } - - // Optimize for single character case - if (text.length === 1) { - return { - precedence: 'atom', - pattern: escapeText(text), - }; - } - - return { - precedence: 'sequence', - pattern: escapeText(text), - }; -} - -function encodeRegExp(regexp: RegExp): EncodeResult { - const pattern = regexp.source; - - // Encode at safe precedence - return { - precedence: isAtomicPattern(pattern) ? 'atom' : 'disjunction', - pattern, - }; -} - -// This is intended to catch only some popular atomic patterns like char classes. -function isAtomicPattern(pattern: string): boolean { - if (pattern.length === 1) { - return true; - } - - if (pattern.startsWith('[') && pattern.endsWith(']') && pattern.match(/[[\]]/g)?.length === 2) { - return true; - } - - if (pattern.startsWith('(') && pattern.endsWith(')') && pattern.match(/[()]/g)?.length === 2) { - return true; - } - - return false; -} - -function concatSequence(encoded: EncodeResult[]): EncodeResult { - if (encoded.length === 1) { - return encoded[0]!; - } - - return { - precedence: 'sequence', - pattern: encoded - .map((n) => (n.precedence === 'disjunction' ? 
wrapAtom(n) : n).pattern) - .join(''), - }; -} - -function wrapAtom(encoded: EncodeResult): EncodeResult { - if (encoded.precedence === 'atom') { - return encoded; - } - - return { - precedence: 'atom', - pattern: `(?:${encoded.pattern})`, - }; -} diff --git a/src/encoder/types.ts b/src/encoder/types.ts deleted file mode 100644 index 97a3807..0000000 --- a/src/encoder/types.ts +++ /dev/null @@ -1,9 +0,0 @@ -/** - * Encoded regex pattern with information about its type (atom, sequence) - */ -export interface EncodeResult { - precedence: EncodePrecedence; - pattern: string; -} - -export type EncodePrecedence = 'atom' | 'sequence' | 'disjunction'; diff --git a/src/types.ts b/src/types.ts index a81f995..2b102d5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,5 +1,3 @@ -import type { EncodeResult } from './encoder/types'; - export type ArrayOrSingle = T[] | T; /** @@ -10,16 +8,39 @@ export type ArrayOrSingle = T[] | T; export type RegexSequence = RegexElement[] | RegexElement; /** - * Fundamental building block of a regular expression, defined as either a regex construct or a string. + * Fundamental building block of a regular expression, defined as either a regex construct, `RegExp` object or a string. + */ +export type RegexElement = RegexConstruct | RegExp | string; + +/** + * Fundamental building block of a regular expression, defined as either an encoded regex or a character class. */ -export type RegexElement = RegexConstruct | EncodeResult | string | RegExp; +export type RegexConstruct = EncodedRegex | CharacterClass; /** - * Common interface for all regex constructs like character classes, quantifiers, and anchors. 
+ * Encoded regex pattern with information about its type (atom, sequence) */ -export interface RegexConstruct { - type: string; - encode(): EncodeResult; +export interface EncodedRegex { + precedence: EncodePrecedence; + pattern: string; +} + +export type EncodePrecedence = 'atom' | 'sequence' | 'disjunction'; + +export interface CharacterEscape extends EncodedRegex { + // `CharacterClass` compatibility + chars: string[]; + ranges?: never; +} + +export interface CharacterClass { + chars: string[]; + ranges?: CharacterRange[]; +} + +export interface CharacterRange { + start: string; + end: string; } /** diff --git a/src/utils/elements.ts b/src/utils/elements.ts deleted file mode 100644 index c9eb283..0000000 --- a/src/utils/elements.ts +++ /dev/null @@ -1,5 +0,0 @@ -import type { RegexElement, RegexSequence } from '../types'; - -export function ensureArray(sequence: RegexSequence): RegexElement[] { - return Array.isArray(sequence) ? sequence : [sequence]; -} diff --git a/src/utils/text.ts b/src/utils/text.ts deleted file mode 100644 index 9187463..0000000 --- a/src/utils/text.ts +++ /dev/null @@ -1,4 +0,0 @@ -// Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping -export function escapeText(text: string) { - return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string -} diff --git a/test-utils/utils.ts b/test-utils/utils.ts index 323da9f..b8f38c8 100644 --- a/test-utils/utils.ts +++ b/test-utils/utils.ts @@ -1,18 +1,5 @@ import { buildRegExp } from '../src/builders'; -import type { RegexConstruct, RegexElement, RegexSequence } from '../src/types'; - -export function isRegexElement(node: unknown): node is RegexElement { - return typeof node === 'string' || isRegexConstruct(node); -} - -export function isRegexConstruct(element: unknown): element is RegexConstruct { - return ( - typeof element === 'object' && - element !== null && - 'encode' in element && - typeof element.encode === 'function' 
- ); -} +import type { RegexSequence } from '../src/types'; export function wrapRegExp(regex: RegExp | RegexSequence) { if (regex instanceof RegExp) { diff --git a/website/docs/api/assertions.md b/website/docs/api/assertions.md index aacc76d..a190eab 100644 --- a/website/docs/api/assertions.md +++ b/website/docs/api/assertions.md @@ -10,8 +10,8 @@ Anchors are special characters or sequences that specify positions in the input ### Start and end of string ```ts -const startOfString: Anchor; -const endOfString: Anchor; +const startOfString: RegexConstruct; +const endOfString: RegexConstruct; ``` - `startOfString` anchor matches the start of a string (or line, if multiline mode is enabled). Regex syntax: `^`. @@ -22,8 +22,8 @@ const endOfString: Anchor; _This API was added in version 1.3.0._ ```ts -const wordBoundary: Anchor; -const nonWordBoundary: Anchor; +const wordBoundary: RegexConstruct; +const nonWordBoundary: RegexConstruct; ``` - `wordBoundary` matches the positions where a word character is not followed or preceded by another word character, effectively indicating the start or end of a word. Regex syntax: `\b`. @@ -40,7 +40,7 @@ Lookarounds in regex are used for asserting that some pattern is or isn't follow _This API was added in version 1.3.0._ ```ts -function lookahead(sequence: RegexSequence): Lookahead; +function lookahead(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?=...)`. @@ -52,7 +52,7 @@ Allows for conditional matching by checking for subsequent patterns in regexes w _This API was added in version 1.3.0._ ```ts -function negativeLookahead(sequence: RegexSequence): NegativeLookahead; +function negativeLookahead(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?!...)`. 
@@ -64,7 +64,7 @@ Allows for matches to be rejected if a specified subsequent pattern is present, _This API was added in version 1.3.0._ ```ts -function lookbehind(sequence: RegexSequence): Lookahead; +function lookbehind(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?<=...)`. @@ -76,7 +76,7 @@ Allows for conditional matching by checking for preceeding patterns in regexes w _This API was added in version 1.3.0._ ```ts -function negativeLookahead(sequence: RegexSequence): NegativeLookahead; +function negativeLookbehind(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: `(?<!...)`. diff --git a/website/docs/api/character-classes.md b/website/docs/api/character-classes.md index 732c346..71df0b5 100644 --- a/website/docs/api/character-classes.md +++ b/website/docs/api/character-classes.md @@ -5,19 +5,19 @@ title: Character Classes Character classes are a set of characters that match any one of the characters in the set. -### Common character classes +### Common character class escapes ```ts -const any: CharacterClass; -const word: CharacterClass; -const nonWord: CharacterClass; -const digit: CharacterClass; -const nonDigit: CharacterClass; -const whitespace: CharacterClass; -const nonWhitespace: CharacterClass; +const any: RegexConstruct; +const word: CharacterEscape; +const nonWord: CharacterEscape; +const digit: CharacterEscape; +const nonDigit: CharacterEscape; +const whitespace: CharacterEscape; +const nonWhitespace: CharacterEscape; ``` -- `any` matches any character except newline characters. Regex syntax: `*`. +- `any` matches any character except newline characters. Regex syntax: `.`. - `word` matches any word character (letters, digits & underscore). Regex syntax: `\w`. - `nonWord` matches any character **except** word characters (letters, digits & underscore). Regex syntax: `\W`. - `digit` matches any digit. Regex syntax: `\d`. 
@@ -71,7 +71,7 @@ Examples: ### `negated()` ```ts -function negated(element: CharacterClass): CharacterClass; +function negated(element: CharacterClass): RegexConstruct; ``` Regex syntax: `[^...]`. diff --git a/website/docs/api/constructs.md b/website/docs/api/constructs.md index 5182ed8..32a0824 100644 --- a/website/docs/api/constructs.md +++ b/website/docs/api/constructs.md @@ -10,7 +10,7 @@ These functions and objects represent available regex constructs. ```ts function choiceOf( ...alternatives: RegexSequence[], -): ChoiceOf { +): RegexConstruct { ``` Regex syntax: `a|b|c`. @@ -22,7 +22,7 @@ Example: `choiceOf("color", "colour")` matches either `color` or `colour` patter ### `regex()` ```ts -function regex(sequence: RegexSequence): Regex; +function regex(sequence: RegexSequence): RegexConstruct; ``` Regex syntax: the pattern remains unchanged when wrapped by this construct. diff --git a/website/docs/api/quantifiers.md b/website/docs/api/quantifiers.md index 101902c..53065d9 100644 --- a/website/docs/api/quantifiers.md +++ b/website/docs/api/quantifiers.md @@ -13,7 +13,7 @@ function zeroOrMore( options?: { greedy?: boolean; // default=true }, -): ZeroOrMore; +): RegexConstruct; ``` Regex syntax: @@ -31,7 +31,7 @@ function oneOrMore( options?: { greedy?: boolean; // default=true }, -): OneOrMore; +): RegexConstruct; ``` Regex syntax: @@ -49,7 +49,7 @@ function optional( options?: { greedy?: boolean; // default=true }, -): Optionally; +): RegexConstruct; ``` Regex syntax: @@ -71,7 +71,7 @@ function repeat( max?: number; greedy?: boolean; // default=true }, -): Repeat; +): RegexConstruct; ``` Regex syntax: diff --git a/website/docs/api/types.md b/website/docs/api/types.md index 69d565f..fd3266c 100644 --- a/website/docs/api/types.md +++ b/website/docs/api/types.md @@ -14,13 +14,13 @@ The sequence of regex elements forming a regular expression. 
For developer conve ### `RegexElement` ```ts -type RegexElement = RegexConstruct | RegExp | string; +type RegexElement = RegexConstruct | string | RegExp; ``` Regex elements are fundamental building blocks of a regular expression. These can be either further regex constructs, regular strings to be matched literally or `RegExp` literals (`/.../`) for including simple regexes as part of a larger structure. ### `RegexConstruct` -The common type for all regex constructs like character classes, quantifiers, and anchors. You should not need to use this type directly, it is returned by all regex construct functions. +The common type for all regex constructs like character classes, quantifiers, and captures. You should not need to use this type directly, it is returned by all regex construct functions. -Note: the shape of the `RegexConstruct` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with +Note: the shape of the `RegexConstruct` is considered private and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with it.