From aeed4a7df6ffbb9376d16029f5522d43483ca119 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sat, 7 Sep 2024 23:35:16 +0200 Subject: [PATCH 01/19] chore: make char class encoding tree shakable --- src/constructs/__tests__/encoder.test.tsx | 4 +- src/constructs/char-class.ts | 32 +++++++++++++- src/encoder.ts | 53 ++++++----------------- src/types.ts | 8 +++- 4 files changed, 52 insertions(+), 45 deletions(-) diff --git a/src/constructs/__tests__/encoder.test.tsx b/src/constructs/__tests__/encoder.test.tsx index 4713160..e4c3939 100644 --- a/src/constructs/__tests__/encoder.test.tsx +++ b/src/constructs/__tests__/encoder.test.tsx @@ -75,7 +75,7 @@ test('`buildRegExp` throws error on unknown element', () => { // @ts-expect-error intentionally passing incorrect object buildRegExp({ type: 'unknown' }), ).toThrowErrorMatchingInlineSnapshot(` - "\`encodeElement\`: unknown element: { + "Unsupported element. Received: { "type": "unknown" }" `); @@ -83,6 +83,6 @@ test('`buildRegExp` throws error on unknown element', () => { test('`buildPattern` throws on empty text', () => { expect(() => buildPattern('')).toThrowErrorMatchingInlineSnapshot( - `"\`encodeText\`: received text should not be empty"`, + `"Unsupported element. Received: """`, ); }); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index c480d9f..12f9e42 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -1,4 +1,3 @@ -import { encodeCharClass } from '../encoder'; import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; export function charClass(...elements: Array): CharacterClass { @@ -9,6 +8,7 @@ export function charClass(...elements: Array): return { chars: elements.map((c) => c.chars).flat(), ranges: elements.map((c) => c.ranges ?? []).flat(), + encode: encodeCharClass, }; } @@ -28,6 +28,7 @@ export function charRange(start: string, end: string): CharacterClass { return { chars: [], ranges: [{ start, end }], + encode: encodeCharClass, }; } @@ -40,11 +41,12 @@ export function anyOf(characters: string): CharacterClass { return { chars, + encode: encodeCharClass, }; } export function negated(element: CharacterClass | CharacterEscape): EncodedRegex { - return encodeCharClass(element, true); + return encodeCharClass.call(element, true); } /** @@ -55,3 +57,29 @@ export const inverted = negated; function escapeCharClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } + +export function encodeCharClass( + this: CharacterClass | CharacterEscape, + isNegated?: boolean, +): EncodedRegex { + if (!this.chars.length && !this.ranges?.length) { + throw new Error('Character class should contain at least one character or character range'); + } + + // If passed characters includes hyphen (`-`) it need to be moved to + // first (or last) place in order to treat it as hyphen character and not a range. + // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types + const hyphen = this.chars.includes('-') ? '-' : ''; + const caret = this.chars.includes('^') ? '^' : ''; + const otherChars = this.chars.filter((c) => c !== '-' && c !== '^').join(''); + const ranges = this.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; + const negation = isNegated ? '^' : ''; + + let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; + if (pattern === '[^-]') pattern = '[\\^-]'; + + return { + precedence: 'atom', + pattern, + }; +} diff --git a/src/encoder.ts b/src/encoder.ts index 9a7d967..011a71b 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -22,32 +22,30 @@ export function encodeAtomic(sequence: RegexSequence): string { } function encodeElement(element: RegexElement): EncodedRegex { - if (typeof element === 'string') { + if (typeof element === 'string' && element.length > 0) { return encodeText(element); } - if (typeof element === 'object' && element instanceof RegExp) { - return encodeRegExp(element); - } + if (typeof element === 'object') { + if (element instanceof RegExp) { + return encodeRegExp(element); + } - // EncodedRegex - if (typeof element === 'object' && 'pattern' in element) { - return element; - } + // EncodedRegex + if ('pattern' in element) { + return element; + } - // CharacterClass - if (typeof element === 'object' && 'chars' in element) { - return encodeCharClass(element); + // SelfEncodableRegex + if ('encode' in element) { + return element.encode(); + } } - throw new Error(`\`encodeElement\`: unknown element: ${JSON.stringify(element, null, 2)}`); + throw new Error(`Unsupported element. Received: ${JSON.stringify(element, null, 2)}`); } function encodeText(text: string): EncodedRegex { - if (text.length === 0) { - throw new Error('`encodeText`: received text should not be empty'); - } - return { // Optimize for single character case precedence: text.length === 1 ? 'atom' : 'sequence', @@ -82,29 +80,6 @@ function isAtomicPattern(pattern: string): boolean { return false; } -export function encodeCharClass(element: CharacterClass, isNegated?: boolean): EncodedRegex { - if (!element.chars.length && !element.ranges?.length) { - throw new Error('Character class should contain at least one character or character range'); - } - - // If passed characters includes hyphen (`-`) it need to be moved to - // first (or last) place in order to treat it as hyphen character and not a range. - // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types - const hyphen = element.chars.includes('-') ? '-' : ''; - const caret = element.chars.includes('^') ? '^' : ''; - const otherChars = element.chars.filter((c) => c !== '-' && c !== '^').join(''); - const ranges = element.ranges?.map(({ start, end }) => `${start}-${end}`).join('') ?? ''; - const negation = isNegated ? '^' : ''; - - let pattern = `[${negation}${ranges}${otherChars}${caret}${hyphen}]`; - if (pattern === '[^-]') pattern = '[\\^-]'; - - return { - precedence: 'atom', - pattern, - }; -} - // Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping function escapeText(text: string) { return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string diff --git a/src/types.ts b/src/types.ts index 81e23a3..10b8b6d 100644 --- a/src/types.ts +++ b/src/types.ts @@ -15,7 +15,7 @@ export type RegexElement = RegexConstruct | RegExp | string; /** * Fundamental building block of a regular expression, defined as either an encoded regex or a character class. */ -export type RegexConstruct = EncodedRegex | CharacterClass; +export type RegexConstruct = EncodedRegex | SelfEncodableRegex; /** * Encoded regex pattern with information about its type (atom, sequence) @@ -33,7 +33,11 @@ export interface CharacterEscape extends EncodedRegex { ranges?: never; } -export interface CharacterClass { +export interface SelfEncodableRegex { + encode: () => EncodedRegex; +} + +export interface CharacterClass extends SelfEncodableRegex { chars: string[]; ranges?: CharacterRange[]; } From 7b99c0b06ea4f1d0e87b618cf63a09bd72f4c6ce Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sat, 7 Sep 2024 23:44:21 +0200 Subject: [PATCH 02/19] refactor: optimize error checks & messages --- src/constructs/__tests__/char-class.test.ts | 20 +++++--------------- src/constructs/char-class.ts | 14 +++++--------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index bccd0e2..cb78d7d 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -38,9 +38,7 @@ test('`charClass` joins character escapes', () => { }); test('`charClass` throws on empty text', () => { - expect(() => charClass()).toThrowErrorMatchingInlineSnapshot( - `"\`charClass\` should receive at least one element"`, - ); + expect(() => charClass()).toThrowErrorMatchingInlineSnapshot(`"Expected at least one element"`); }); test('`charRange` pattern', () => { @@ -51,13 +49,13 @@ test('`charRange` pattern', () => { test('`charRange` throws on incorrect arguments', () => { expect(() => charRange('z', 'a')).toThrowErrorMatchingInlineSnapshot( - `"\`start\` should be before or equal to \`end\`"`, + `"\`start\` character should be before or same as \`end\` character"`, ); expect(() => charRange('aa', 'z')).toThrowErrorMatchingInlineSnapshot( - `"\`charRange\` should receive only single character \`start\` string"`, + `"Expected a single character, received "aa""`, ); expect(() => charRange('a', 'zz')).toThrowErrorMatchingInlineSnapshot( - `"\`charRange\` should receive only single character \`end\` string"`, + `"Expected a single character, received "zz""`, ); }); @@ -105,9 +103,7 @@ test('`anyOf` pattern edge cases', () => { }); test('`anyOf` throws on empty text', () => { - expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot( - `"\`anyOf\` should received at least one character"`, - ); + expect(() => anyOf('')).toThrowErrorMatchingInlineSnapshot(`"Expected at least one character"`); }); test('`negated` character class pattern', () => { @@ -119,9 +115,3 @@ test('`negated` character class matching', () => { expect(negated(anyOf('a'))).not.toMatchString('aa'); expect(negated(anyOf('a'))).toMatchGroups('aba', ['b']); }); - -test('`encodeCharacterClass` throws on empty text', () => { - expect(() => buildRegExp(negated({ chars: [], ranges: [] }))).toThrowErrorMatchingInlineSnapshot( - `"Character class should contain at least one character or character range"`, - ); -}); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index 12f9e42..fb1daeb 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -2,7 +2,7 @@ import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; export function charClass(...elements: Array): CharacterClass { if (!elements.length) { - throw new Error('`charClass` should receive at least one element'); + throw new Error('Expected at least one element'); } return { @@ -14,15 +14,15 @@ export function charClass(...elements: Array): export function charRange(start: string, end: string): CharacterClass { if (start.length !== 1) { - throw new Error('`charRange` should receive only single character `start` string'); + throw new Error(`Expected a single character, received "${start}"`); } if (end.length !== 1) { - throw new Error('`charRange` should receive only single character `end` string'); + throw new Error(`Expected a single character, received "${end}"`); } if (start > end) { - throw new Error('`start` should be before or equal to `end`'); + throw new Error('`start` character should be before or same as `end` character'); } return { @@ -36,7 +36,7 @@ export function anyOf(characters: string): CharacterClass { const chars = characters.split('').map((c) => escapeCharClass(c)); if (chars.length === 0) { - throw new Error('`anyOf` should received at least one character'); + throw new Error('Expected at least one character'); } return { @@ -62,10 +62,6 @@ export function encodeCharClass( this: CharacterClass | CharacterEscape, isNegated?: boolean, ): EncodedRegex { - if (!this.chars.length && !this.ranges?.length) { - throw new Error('Character class should contain at least one character or character range'); - } - // If passed characters includes hyphen (`-`) it need to be moved to // first (or last) place in order to treat it as hyphen character and not a range. // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes#types From 8b576156c33e5e13a1c7c6c5c810cdfe0a7d6127 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sat, 7 Sep 2024 23:50:38 +0200 Subject: [PATCH 03/19] refactor: simplify atomic checks --- src/encoder.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/encoder.ts b/src/encoder.ts index 011a71b..08b5c0b 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -1,4 +1,4 @@ -import type { CharacterClass, EncodedRegex, RegexElement, RegexSequence } from './types'; +import type { EncodedRegex, RegexElement, RegexSequence } from './types'; export function encode(sequence: RegexSequence): EncodedRegex { const elements = Array.isArray(sequence) ? sequence : [sequence]; @@ -69,11 +69,13 @@ function isAtomicPattern(pattern: string): boolean { return true; } - if (pattern.startsWith('[') && pattern.endsWith(']') && pattern.match(/[[\]]/g)?.length === 2) { + // Simple char class: [...] + if (pattern.match(/^\[[^[\]]*\]$/)) { return true; } - if (pattern.startsWith('(') && pattern.endsWith(')') && pattern.match(/[()]/g)?.length === 2) { + // Simple group: (...) + if (pattern.match(/^\([^()]*\)$/)) { return true; } From b035c62c2a45da596996c13472b16883b16a0872 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 00:02:12 +0200 Subject: [PATCH 04/19] refactor: improve unicode detection --- src/__tests__/builder.test.ts | 10 +++++++--- src/builders.ts | 25 ++++++++++++------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 7bb2a6d..970e665 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -42,14 +42,18 @@ test('`regexBuilder` throws when using unicode-aware features without `unicode` `"Expected a valid unicode code point but received 1193046"`, ); expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( - `"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + `"Pattern "\\u{12345}" requires the "unicode" flag to be set."`, ); expect(() => buildRegExp(unicodeProperty('Emoji_Presentation')), ).toThrowErrorMatchingInlineSnapshot( - `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + `"Pattern "\\p{Emoji_Presentation}" requires the "unicode" flag to be set."`, ); expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( - `"The pattern "\\P{Letter}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + `"Pattern "\\P{Letter}" requires the "unicode" flag to be set."`, ); }); + +test('`regexBuilder` does not throws on tricky unicode mode-like patterns', () => { + expect(() => buildRegExp(/\\u{1234}/)).not.toThrow(); +}); diff --git a/src/builders.ts b/src/builders.ts index 482392f..01ed08b 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -10,17 +10,9 @@ import { encode } from './encoder'; */ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { const pattern = encode(sequence).pattern; - const flagsString = encodeFlags(flags ?? {}); - - if (!flags?.unicode) { - const unicodeModePattern = getUnicodeModePattern(pattern); - if (unicodeModePattern) { - throw new Error( - `The pattern "${unicodeModePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`, - ); - } - } + ensureUnicodeFlagIfNeeded(pattern, flags); + const flagsString = encodeFlags(flags ?? {}); return new RegExp(pattern, flagsString); } @@ -47,9 +39,16 @@ function encodeFlags(flags: RegexFlags): string { return result; } -const unicodeModePatterns = /(?:\\u|\\p|\\P)\{.+?\}/; +// Matches unicode mode patterns: \u{...}, \p{...}, \P{...}, but avoids valid \\u{...}, etc +const unicodeModePatterns = /(? Date: Sun, 8 Sep 2024 00:06:16 +0200 Subject: [PATCH 05/19] refactor: improve errors --- src/builders.ts | 2 +- src/constructs/choice-of.ts | 2 +- src/constructs/repeat.ts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/builders.ts b/src/builders.ts index 01ed08b..f65c7fb 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -49,6 +49,6 @@ function ensureUnicodeFlagIfNeeded(pattern: string, flags: RegexFlags | undefine const match = pattern.match(unicodeModePatterns); if (match) { - throw new Error(`Pattern "${match?.[0]}" requires the "unicode" flag to be set.`); + throw new Error(`Pattern "${match?.[0]}" requires "unicode" flag to be set.`); } } diff --git a/src/constructs/choice-of.ts b/src/constructs/choice-of.ts index 40be23a..a899c94 100644 --- a/src/constructs/choice-of.ts +++ b/src/constructs/choice-of.ts @@ -3,7 +3,7 @@ import type { EncodedRegex, RegexSequence } from '../types'; export function choiceOf(...alternatives: RegexSequence[]): EncodedRegex { if (alternatives.length === 0) { - throw new Error('`choiceOf` should receive at least one alternative'); + throw new Error('Expected at least one alternative'); } const encodedAlternatives = alternatives.map((c) => encode(c)); diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 2fc30a9..25e7d07 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -6,7 +6,7 @@ export type RepeatOptions = number | { min: number; max?: number; greedy?: boole export function repeat(sequence: RegexSequence, options: RepeatOptions): EncodedRegex { const elements = Array.isArray(sequence) ? sequence : [sequence]; if (elements.length === 0) { - throw new Error('`repeat` should receive at least one element'); + throw new Error('Expected at least one element'); } if (typeof options === 'number') { From bb0b6e60e7f1634b31a7143cb6fdf62ba7ead0ba Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 00:09:10 +0200 Subject: [PATCH 06/19] refactor: add additional length checks --- src/encoder.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/encoder.ts b/src/encoder.ts index 08b5c0b..ec0c983 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -2,6 +2,10 @@ import type { EncodedRegex, RegexElement, RegexSequence } from './types'; export function encode(sequence: RegexSequence): EncodedRegex { const elements = Array.isArray(sequence) ? sequence : [sequence]; + if (elements.length === 0) { + throw new Error('Expected at least one element'); + } + const encoded = elements.map((n) => encodeElement(n)); if (encoded.length === 1) { @@ -46,6 +50,10 @@ function encodeElement(element: RegexElement): EncodedRegex { } function encodeText(text: string): EncodedRegex { + if (text.length === 0) { + throw new Error('Expected at least one character'); + } + return { // Optimize for single character case precedence: text.length === 1 ? 'atom' : 'sequence', From 988f09be6416c59714964bb887288906f8177454 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 10:05:37 +0200 Subject: [PATCH 07/19] refactor: ensure elements --- src/constructs/quantifiers.ts | 10 +++++++--- src/constructs/repeat.ts | 6 ++---- src/encoder.ts | 7 ++----- src/utils.ts | 10 ++++++++++ 4 files changed, 21 insertions(+), 12 deletions(-) create mode 100644 src/utils.ts diff --git a/src/constructs/quantifiers.ts b/src/constructs/quantifiers.ts index 0fcab70..70e0869 100644 --- a/src/constructs/quantifiers.ts +++ b/src/constructs/quantifiers.ts @@ -1,27 +1,31 @@ import { encodeAtomic } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; +import { ensureElements } from '../utils'; export interface QuantifierOptions { greedy?: boolean; } export function zeroOrMore(sequence: RegexSequence, options?: QuantifierOptions): EncodedRegex { + const elements = ensureElements(sequence); return { precedence: 'sequence', - pattern: `${encodeAtomic(sequence)}*${options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtomic(elements)}*${options?.greedy === false ? '?' : ''}`, }; } export function oneOrMore(sequence: RegexSequence, options?: QuantifierOptions): EncodedRegex { + const elements = ensureElements(sequence); return { precedence: 'sequence', - pattern: `${encodeAtomic(sequence)}+${options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtomic(elements)}+${options?.greedy === false ? '?' : ''}`, }; } export function optional(sequence: RegexSequence, options?: QuantifierOptions): EncodedRegex { + const elements = ensureElements(sequence); return { precedence: 'sequence', - pattern: `${encodeAtomic(sequence)}?${options?.greedy === false ? '?' : ''}`, + pattern: `${encodeAtomic(elements)}?${options?.greedy === false ? '?' : ''}`, }; } diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 25e7d07..ddb42e3 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -1,13 +1,11 @@ import { encodeAtomic } from '../encoder'; import type { EncodedRegex, RegexSequence } from '../types'; +import { ensureElements } from '../utils'; export type RepeatOptions = number | { min: number; max?: number; greedy?: boolean }; export function repeat(sequence: RegexSequence, options: RepeatOptions): EncodedRegex { - const elements = Array.isArray(sequence) ? sequence : [sequence]; - if (elements.length === 0) { - throw new Error('Expected at least one element'); - } + const elements = ensureElements(sequence); if (typeof options === 'number') { return { diff --git a/src/encoder.ts b/src/encoder.ts index ec0c983..3fabd09 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -1,11 +1,8 @@ import type { EncodedRegex, RegexElement, RegexSequence } from './types'; +import { ensureElements } from './utils'; export function encode(sequence: RegexSequence): EncodedRegex { - const elements = Array.isArray(sequence) ? sequence : [sequence]; - if (elements.length === 0) { - throw new Error('Expected at least one element'); - } - + const elements = ensureElements(sequence); const encoded = elements.map((n) => encodeElement(n)); if (encoded.length === 1) { diff --git a/src/utils.ts b/src/utils.ts new file mode 100644 index 0000000..8815078 --- /dev/null +++ b/src/utils.ts @@ -0,0 +1,10 @@ +import type { RegexElement, RegexSequence } from './types'; + +export function ensureElements(sequence: RegexSequence): RegexElement[] { + const elements = Array.isArray(sequence) ? sequence : [sequence]; + if (elements.length === 0) { + throw new Error('Expected at least one element'); + } + + return elements; +} From 9f6684f376960f08f4051775e277ac783404aa5e Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 10:14:04 +0200 Subject: [PATCH 08/19] refactor: extract ensure text --- src/constructs/char-class.ts | 14 ++++++-------- src/encoder.ts | 25 +++++-------------------- src/utils.ts | 6 ++++++ 3 files changed, 17 insertions(+), 28 deletions(-) diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index fb1daeb..3c1e199 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -1,4 +1,5 @@ import type { CharacterClass, CharacterEscape, EncodedRegex } from '../types'; +import { ensureText } from '../utils'; export function charClass(...elements: Array): CharacterClass { if (!elements.length) { @@ -32,15 +33,11 @@ export function charRange(start: string, end: string): CharacterClass { }; } -export function anyOf(characters: string): CharacterClass { - const chars = characters.split('').map((c) => escapeCharClass(c)); - - if (chars.length === 0) { - throw new Error('Expected at least one character'); - } +export function anyOf(chars: string): CharacterClass { + ensureText(chars); return { - chars, + chars: chars.split('').map(escapeChar), encode: encodeCharClass, }; } @@ -54,7 +51,8 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex */ export const inverted = negated; -function escapeCharClass(text: string): string { +/** Escape chars for usage inside char class */ +function escapeChar(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } diff --git a/src/encoder.ts b/src/encoder.ts index 3fabd09..1991d2e 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -1,5 +1,5 @@ import type { EncodedRegex, RegexElement, RegexSequence } from './types'; -import { ensureElements } from './utils'; +import { ensureElements, ensureText } from './utils'; export function encode(sequence: RegexSequence): EncodedRegex { const elements = ensureElements(sequence); @@ -47,9 +47,7 @@ function encodeElement(element: RegexElement): EncodedRegex { } function encodeText(text: string): EncodedRegex { - if (text.length === 0) { - throw new Error('Expected at least one character'); - } + ensureText(text); return { // Optimize for single character case @@ -68,23 +66,10 @@ function encodeRegExp(regexp: RegExp): EncodedRegex { }; } -// This is intended to catch only some popular atomic patterns like char classes. +// This is intended to catch only some popular atomic patterns like char classes and groups. function isAtomicPattern(pattern: string): boolean { - if (pattern.length === 1) { - return true; - } - - // Simple char class: [...] - if (pattern.match(/^\[[^[\]]*\]$/)) { - return true; - } - - // Simple group: (...) - if (pattern.match(/^\([^()]*\)$/)) { - return true; - } - - return false; + // Simple char, char class [...] or group (...) + return pattern.length === 1 || /^\[[^[\]]*\]$/.test(pattern) || /^\([^()]*\)$/.test(pattern); } // Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping diff --git a/src/utils.ts b/src/utils.ts index 8815078..5bc9232 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -8,3 +8,9 @@ export function ensureElements(sequence: RegexSequence): RegexElement[] { return elements; } + +export function ensureText(text: string): void { + if (text.length === 0) { + throw new Error('Expected at least one character'); + } +} From 27b3067815924831e186d591ba7d9e11118bd23d Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 10:19:41 +0200 Subject: [PATCH 09/19] refactor: lazy encodable regex --- src/encoder.ts | 12 ++++++------ src/types.ts | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/encoder.ts b/src/encoder.ts index 1991d2e..c2aaeda 100644 --- a/src/encoder.ts +++ b/src/encoder.ts @@ -23,21 +23,21 @@ export function encodeAtomic(sequence: RegexSequence): string { } function encodeElement(element: RegexElement): EncodedRegex { - if (typeof element === 'string' && element.length > 0) { + if (typeof element === 'string') { return encodeText(element); } - if (typeof element === 'object') { - if (element instanceof RegExp) { - return encodeRegExp(element); - } + if (element instanceof RegExp) { + return encodeRegExp(element); + } + if (typeof element === 'object') { // EncodedRegex if ('pattern' in element) { return element; } - // SelfEncodableRegex + // LazyEncodableRegex if ('encode' in element) { return element.encode(); } diff --git a/src/types.ts b/src/types.ts index 10b8b6d..f6fd401 100644 --- a/src/types.ts +++ b/src/types.ts @@ -15,7 +15,7 @@ export type RegexElement = RegexConstruct | RegExp | string; /** * Fundamental building block of a regular expression, defined as either an encoded regex or a character class. */ -export type RegexConstruct = EncodedRegex | SelfEncodableRegex; +export type RegexConstruct = EncodedRegex | LazyEncodableRegex; /** * Encoded regex pattern with information about its type (atom, sequence) @@ -33,11 +33,11 @@ export interface CharacterEscape extends EncodedRegex { ranges?: never; } -export interface SelfEncodableRegex { +export interface LazyEncodableRegex { encode: () => EncodedRegex; } -export interface CharacterClass extends SelfEncodableRegex { +export interface CharacterClass extends LazyEncodableRegex { chars: string[]; ranges?: CharacterRange[]; } From 1a30bc6702955121c3716b80f732281a3acf56f0 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 10:24:26 +0200 Subject: [PATCH 10/19] chore: update snapshots --- src/__tests__/builder.test.ts | 6 +++--- src/constructs/__tests__/choice-of.test.ts | 2 +- src/constructs/__tests__/encoder.test.tsx | 2 +- src/constructs/__tests__/repeat.test.tsx | 4 +--- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 970e665..71a304f 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -42,15 +42,15 @@ test('`regexBuilder` throws when using unicode-aware features without `unicode` `"Expected a valid unicode code point but received 1193046"`, ); expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( - `"Pattern "\\u{12345}" requires the "unicode" flag to be set."`, + `"Pattern "\\u{12345}" requires "unicode" flag to be set."`, ); expect(() => buildRegExp(unicodeProperty('Emoji_Presentation')), ).toThrowErrorMatchingInlineSnapshot( - `"Pattern "\\p{Emoji_Presentation}" requires the "unicode" flag to be set."`, + `"Pattern "\\p{Emoji_Presentation}" requires "unicode" flag to be set."`, ); expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( - `"Pattern "\\P{Letter}" requires the "unicode" flag to be set."`, + `"Pattern "\\P{Letter}" requires "unicode" flag to be set."`, ); }); diff --git a/src/constructs/__tests__/choice-of.test.ts b/src/constructs/__tests__/choice-of.test.ts index cc2a3a6..3d3a2e5 100644 --- a/src/constructs/__tests__/choice-of.test.ts +++ b/src/constructs/__tests__/choice-of.test.ts @@ -34,6 +34,6 @@ test('`choiceOf` pattern using nested regex', () => { test('`choiceOf` throws on empty options', () => { expect(() => choiceOf()).toThrowErrorMatchingInlineSnapshot( - `"\`choiceOf\` should receive at least one alternative"`, + `"Expected at least one alternative"`, ); }); diff --git a/src/constructs/__tests__/encoder.test.tsx b/src/constructs/__tests__/encoder.test.tsx index e4c3939..9ba7c06 100644 --- a/src/constructs/__tests__/encoder.test.tsx +++ b/src/constructs/__tests__/encoder.test.tsx @@ -83,6 +83,6 @@ test('`buildRegExp` throws error on unknown element', () => { test('`buildPattern` throws on empty text', () => { expect(() => buildPattern('')).toThrowErrorMatchingInlineSnapshot( - `"Unsupported element. Received: """`, + `"Expected at least one character"`, ); }); diff --git a/src/constructs/__tests__/repeat.test.tsx b/src/constructs/__tests__/repeat.test.tsx index af61fa4..c7158aa 100644 --- a/src/constructs/__tests__/repeat.test.tsx +++ b/src/constructs/__tests__/repeat.test.tsx @@ -16,9 +16,7 @@ test('`repeat` pattern optimizes grouping for atoms', () => { }); test('`repeat` throws on no children', () => { - expect(() => repeat([], 1)).toThrowErrorMatchingInlineSnapshot( - `"\`repeat\` should receive at least one element"`, - ); + expect(() => repeat([], 1)).toThrowErrorMatchingInlineSnapshot(`"Expected at least one element"`); }); test('greedy `repeat` quantifier pattern', () => { From 220a5f7f3e701e759247c9e244c513805cd18857 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 10:24:57 +0200 Subject: [PATCH 11/19] chore: fix lint --- src/constructs/__tests__/char-class.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index cb78d7d..5c44e61 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -1,6 +1,5 @@ import { anyOf, - buildRegExp, charClass, charRange, digit, From eb78700473150f31f247a50725298d8e3a4685c4 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 22:06:01 +0200 Subject: [PATCH 12/19] chore: tweaks --- src/patterns/hex-color.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/patterns/hex-color.ts b/src/patterns/hex-color.ts index 67f8efa..6253529 100644 --- a/src/patterns/hex-color.ts +++ b/src/patterns/hex-color.ts @@ -1,11 +1,9 @@ import { buildRegExp } from '../builders'; import { endOfString, startOfString, wordBoundary } from '../constructs/anchors'; -import { charClass, charRange } from '../constructs/char-class'; -import { digit } from '../constructs/char-escape'; import { choiceOf } from '../constructs/choice-of'; import { repeat } from '../constructs/repeat'; -const hexDigit = charClass(digit, charRange('a', 'f')); +const hexDigit = /[0-9a-f]/; /** Find hex color strings in a text. */ export const hexColorFinder = buildRegExp( From 0960fd7630084d28b0336eef4b28e54b6d4cf7f5 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 22:27:16 +0200 Subject: [PATCH 13/19] refactor: improve char range --- src/constructs/__tests__/char-class.test.ts | 18 +++++++++++++----- src/constructs/char-class.ts | 10 +++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index 5c44e61..cc90f18 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -46,15 +46,23 @@ test('`charRange` pattern', () => { expect([charRange('A', 'F'), 'x']).toEqualRegex(/[A-F]x/); }); +test('`charRange` works both ways', () => { + expect(charRange('a', 'z')).toEqualRegex(/[a-z]/); + expect(charRange('z', 'a')).toEqualRegex(/[a-z]/); +}); + test('`charRange` throws on incorrect arguments', () => { - expect(() => charRange('z', 'a')).toThrowErrorMatchingInlineSnapshot( - `"\`start\` character should be before or same as \`end\` character"`, - ); expect(() => charRange('aa', 'z')).toThrowErrorMatchingInlineSnapshot( - `"Expected a single character, received "aa""`, + `"Expected a single characters, received "aa" & "z""`, ); expect(() => charRange('a', 'zz')).toThrowErrorMatchingInlineSnapshot( - `"Expected a single character, received "zz""`, + `"Expected a single characters, received "a" & "zz""`, + ); + expect(() => charRange('', 'z')).toThrowErrorMatchingInlineSnapshot( + `"Expected a single characters, received "" & "z""`, + ); + expect(() => charRange('a', '')).toThrowErrorMatchingInlineSnapshot( + `"Expected a single characters, received "a" & """`, ); }); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index 3c1e199..81eefb6 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -14,16 +14,12 @@ export function charClass(...elements: Array): } export function charRange(start: string, end: string): CharacterClass { - if (start.length !== 1) { - throw new Error(`Expected a single character, received "${start}"`); - } - - if (end.length !== 1) { - throw new Error(`Expected a single character, received "${end}"`); + if (start.length !== 1 || end.length !== 1) { + throw new Error(`Expected a single characters, received "${start}" & "${end}"`); } if (start > end) { - throw new Error('`start` character should be before or same as `end` character'); + [start, end] = [end, start]; } return { From 509901a40011b73a2a90ba1a45f4a3fe4c52e6ce Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 22:40:24 +0200 Subject: [PATCH 14/19] refactor: group unicode back again --- ...cape-unicode.test.tsx => unicode.test.tsx} | 0 src/constructs/char-class.ts | 2 +- src/constructs/char-escape.ts | 51 ------------------ src/constructs/unicode.ts | 52 +++++++++++++++++++ src/index.ts | 3 +- 5 files changed, 54 insertions(+), 54 deletions(-) rename src/constructs/__tests__/{char-escape-unicode.test.tsx => unicode.test.tsx} (100%) create mode 100644 src/constructs/unicode.ts diff --git a/src/constructs/__tests__/char-escape-unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx similarity index 100% rename from src/constructs/__tests__/char-escape-unicode.test.tsx rename to src/constructs/__tests__/unicode.test.tsx diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index 81eefb6..2c1a5e3 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -52,7 +52,7 @@ function escapeChar(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } -export function encodeCharClass( +function encodeCharClass( this: CharacterClass | CharacterEscape, isNegated?: boolean, ): EncodedRegex { diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index fcf6be5..77aa2cb 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -59,54 +59,3 @@ export const notWord = nonWord; * @deprecated Renamed to `nonWhitespace`. */ export const notWhitespace = nonWhitespace; - -/** - * Unicode character code point escape. - * - * Regex pattern: - * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. - * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. - * - * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). - * - * @param codePoint The code point of the character to escape. - * @returns A character class representing the unicode escape. - */ -export function char(codePoint: number): CharacterEscape { - if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { - throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); - } - - let escape = - codePoint < 0x10000 - ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) - : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} - -/** - * Unicode property escape matching a set of characters specified by a Unicode property. - * - * Regex pattern: `\p{Property}` or `\p{Property=Value}` - * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape - * - * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). - * - * @param property Unicode property name. - * @param value Unicode property value (optional). - * @returns A character class representing the unicode property escape. - */ -export function unicodeProperty(property: string, value?: string): CharacterEscape { - const escape = `\\p{${property}${value ? `=${value}` : ''}}`; - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts new file mode 100644 index 0000000..ccc5d18 --- /dev/null +++ b/src/constructs/unicode.ts @@ -0,0 +1,52 @@ +import type { CharacterEscape } from '../types'; + +/** + * Unicode character code point escape. + * + * Regex pattern: + * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. + * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. + * + * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param codePoint The code point of the character to escape. + * @returns A character class representing the unicode escape. + */ +export function char(codePoint: number): CharacterEscape { + if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { + throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); + } + + let escape = + codePoint < 0x10000 + ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) + : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} + +/** + * Unicode property escape matching a set of characters specified by a Unicode property. + * + * Regex pattern: `\p{Property}` or `\p{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param property Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the unicode property escape. + */ +export function unicodeProperty(property: string, value?: string): CharacterEscape { + const escape = `\\p{${property}${value ? `=${value}` : ''}}`; + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} diff --git a/src/index.ts b/src/index.ts index 30d6677..de50286 100644 --- a/src/index.ts +++ b/src/index.ts @@ -28,8 +28,6 @@ export { notDigit, notWhitespace, notWord, - char, - unicodeProperty, } from './constructs/char-escape'; export { choiceOf } from './constructs/choice-of'; export { lookahead } from './constructs/lookahead'; @@ -39,3 +37,4 @@ export { negativeLookbehind } from './constructs/negative-lookbehind'; export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; +export { char, unicodeProperty } from './constructs/unicode'; From 5f11fb322eb79c55cf87ef5cc3a35bb1649edb2c Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 22:42:10 +0200 Subject: [PATCH 15/19] refactor: rename char back to unicodeChar --- src/__tests__/builder.test.ts | 10 +-- src/constructs/__tests__/unicode.test.tsx | 100 ++++++++++++---------- src/constructs/unicode.ts | 2 +- src/index.ts | 2 +- 4 files changed, 60 insertions(+), 54 deletions(-) diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 71a304f..ee09984 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -1,4 +1,4 @@ -import { buildRegExp, char, unicodeProperty } from '..'; +import { buildRegExp, unicodeChar, unicodeProperty } from '..'; test('`regexBuilder` flags', () => { expect(buildRegExp('a').flags).toBe(''); @@ -34,14 +34,14 @@ test('`regexBuilder` flags', () => { }); test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { - expect(() => buildRegExp(char(0x1234))).not.toThrow(); - expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(unicodeChar(0x1234))).not.toThrow(); + expect(() => buildRegExp(unicodeChar(0x12345), { unicode: true })).not.toThrow(); expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); - expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(unicodeChar(0x123456))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1193046"`, ); - expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(unicodeChar(0x12345))).toThrowErrorMatchingInlineSnapshot( `"Pattern "\\u{12345}" requires "unicode" flag to be set."`, ); expect(() => diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx index e7c940e..b4239ea 100644 --- a/src/constructs/__tests__/unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -1,10 +1,10 @@ import { buildRegExp, - char, charClass, endOfString, type RegexSequence, startOfString, + unicodeChar, unicodeProperty, } from '../..'; @@ -14,79 +14,85 @@ function u(sequence: RegexSequence) { test('`char` pattern', () => { // eslint-disable-next-line no-control-regex - expect(char(0)).toEqualRegex(/\u0000/); + expect(unicodeChar(0)).toEqualRegex(/\u0000/); // eslint-disable-next-line no-control-regex - expect(char(0x1)).toEqualRegex(/\u0001/); + expect(unicodeChar(0x1)).toEqualRegex(/\u0001/); // eslint-disable-next-line no-control-regex - expect(char(0x12)).toEqualRegex(/\u0012/); - expect(char(0x123)).toEqualRegex(/\u0123/); - expect(char(0x1234)).toEqualRegex(/\u1234/); + expect(unicodeChar(0x12)).toEqualRegex(/\u0012/); + expect(unicodeChar(0x123)).toEqualRegex(/\u0123/); + expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); // eslint-disable-next-line no-control-regex - expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u')); + expect(u(unicodeChar(0))).toEqualRegex(new RegExp('\\u0000', 'u')); // eslint-disable-next-line no-control-regex - expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); - expect(u(char(0x12))).toEqualRegex( + expect(u(unicodeChar(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); + expect(u(unicodeChar(0x12))).toEqualRegex( // eslint-disable-next-line no-control-regex new RegExp('\\u0012', 'u'), ); - expect(char(0x0123)).toEqualRegex(/\u0123/); - expect(char(0x1234)).toEqualRegex(/\u1234/); + expect(unicodeChar(0x0123)).toEqualRegex(/\u0123/); + expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); - expect(u(char(0x0123))).toEqualRegex(/\u0123/u); - expect(u(char(0x1234))).toEqualRegex(/\u1234/u); - expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); - expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); + expect(u(unicodeChar(0x0123))).toEqualRegex(/\u0123/u); + expect(u(unicodeChar(0x1234))).toEqualRegex(/\u1234/u); + expect(u(unicodeChar(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); + expect(u(unicodeChar(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); }); test('`char` matching', () => { - expect(char(0)).toMatchString('\u{0}'); - expect(char(0x1)).toMatchString('\u{1}'); - expect(char(0x12)).toMatchString('\u{12}}'); - expect(char(0x123)).toMatchString('\u{123}'); - expect(char(0x1234)).toMatchString('\u{1234}}'); - - expect(char('a'.codePointAt(0)!)).toMatchString('a'); - expect(char('ą'.codePointAt(0)!)).toMatchString('ą'); - expect(char('©'.codePointAt(0)!)).toMatchString('©'); - - expect(u(char(0))).toMatchString('\u{0}'); - expect(u(char(0))).not.toMatchString('a'); - expect(u(char(0x1))).toMatchString('\u{1}'); - expect(u(char(0x12))).toMatchString('\u{12}'); - expect(u(char(0x123))).toMatchString('\u{123}'); - expect(u(char(0x1234))).toMatchString('\u{1234}'); - expect(u(char(0x12345))).toMatchString('\u{12345}'); - expect(u(char(0x103456))).toMatchString('\u{103456}'); - - expect(u(char('a'.codePointAt(0)!))).toMatchString('a'); - expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą'); - expect(u(char('©'.codePointAt(0)!))).toMatchString('©'); - expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎'); - expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); + expect(unicodeChar(0)).toMatchString('\u{0}'); + expect(unicodeChar(0x1)).toMatchString('\u{1}'); + expect(unicodeChar(0x12)).toMatchString('\u{12}}'); + expect(unicodeChar(0x123)).toMatchString('\u{123}'); + expect(unicodeChar(0x1234)).toMatchString('\u{1234}}'); + + expect(unicodeChar('a'.codePointAt(0)!)).toMatchString('a'); + expect(unicodeChar('ą'.codePointAt(0)!)).toMatchString('ą'); + expect(unicodeChar('©'.codePointAt(0)!)).toMatchString('©'); + + expect(u(unicodeChar(0))).toMatchString('\u{0}'); + expect(u(unicodeChar(0))).not.toMatchString('a'); + expect(u(unicodeChar(0x1))).toMatchString('\u{1}'); + expect(u(unicodeChar(0x12))).toMatchString('\u{12}'); + expect(u(unicodeChar(0x123))).toMatchString('\u{123}'); + expect(u(unicodeChar(0x1234))).toMatchString('\u{1234}'); + expect(u(unicodeChar(0x12345))).toMatchString('\u{12345}'); + expect(u(unicodeChar(0x103456))).toMatchString('\u{103456}'); + + expect(u(unicodeChar('a'.codePointAt(0)!))).toMatchString('a'); + expect(u(unicodeChar('ą'.codePointAt(0)!))).toMatchString('ą'); + expect(u(unicodeChar('©'.codePointAt(0)!))).toMatchString('©'); + expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('😎'); + expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); }); test('`char` nesting matching', () => { - expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('a'); - expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('ą'); - expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).not.toMatchString('b'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).toMatchString('a'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).toMatchString('ą'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).not.toMatchString('b'); }); test('`char` edge cases handling', () => { - expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(NaN))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received NaN"`, ); - expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(1.5))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1.5"`, ); - expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(-1))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received -1"`, ); - expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot( + expect(() => u(unicodeChar(0x110000))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1114112"`, ); - expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u); + expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); }); test('`unicodeProperty` pattern', () => { diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index ccc5d18..8140f11 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -12,7 +12,7 @@ import type { CharacterEscape } from '../types'; * @param codePoint The code point of the character to escape. * @returns A character class representing the unicode escape. */ -export function char(codePoint: number): CharacterEscape { +export function unicodeChar(codePoint: number): CharacterEscape { if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); } diff --git a/src/index.ts b/src/index.ts index de50286..923918f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,4 +37,4 @@ export { negativeLookbehind } from './constructs/negative-lookbehind'; export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; -export { char, unicodeProperty } from './constructs/unicode'; +export { unicodeChar, unicodeProperty } from './constructs/unicode'; From e0978a101f0fbb6bcb2228278e971523f32525c9 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 22:46:57 +0200 Subject: [PATCH 16/19] refactor: fix grammar --- src/constructs/__tests__/char-class.test.ts | 8 ++++---- src/constructs/char-class.ts | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/constructs/__tests__/char-class.test.ts b/src/constructs/__tests__/char-class.test.ts index cc90f18..a7ace7f 100644 --- a/src/constructs/__tests__/char-class.test.ts +++ b/src/constructs/__tests__/char-class.test.ts @@ -53,16 +53,16 @@ test('`charRange` works both ways', () => { test('`charRange` throws on incorrect arguments', () => { expect(() => charRange('aa', 'z')).toThrowErrorMatchingInlineSnapshot( - `"Expected a single characters, received "aa" & "z""`, + `"Expected single characters, but received "aa" & "z""`, ); expect(() => charRange('a', 'zz')).toThrowErrorMatchingInlineSnapshot( - `"Expected a single characters, received "a" & "zz""`, + `"Expected single characters, but received "a" & "zz""`, ); expect(() => charRange('', 'z')).toThrowErrorMatchingInlineSnapshot( - `"Expected a single characters, received "" & "z""`, + `"Expected single characters, but received "" & "z""`, ); expect(() => charRange('a', '')).toThrowErrorMatchingInlineSnapshot( - `"Expected a single characters, received "a" & """`, + `"Expected single characters, but received "a" & """`, ); }); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index 2c1a5e3..5724503 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -15,7 +15,7 @@ export function charClass(...elements: Array): export function charRange(start: string, end: string): CharacterClass { if (start.length !== 1 || end.length !== 1) { - throw new Error(`Expected a single characters, received "${start}" & "${end}"`); + throw new Error(`Expected single characters, but received "${start}" & "${end}"`); } if (start > end) { From a15cbbf42091786f6a2bb249e337203dacd5c4fd Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Sun, 8 Sep 2024 22:50:50 +0200 Subject: [PATCH 17/19] refactor: self code review --- src/constructs/__tests__/unicode.test.tsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx index b4239ea..a079b5a 100644 --- a/src/constructs/__tests__/unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -12,7 +12,7 @@ function u(sequence: RegexSequence) { return buildRegExp(sequence, { unicode: true }); } -test('`char` pattern', () => { +test('`unicodeChar` pattern', () => { // eslint-disable-next-line no-control-regex expect(unicodeChar(0)).toEqualRegex(/\u0000/); // eslint-disable-next-line no-control-regex @@ -39,7 +39,7 @@ test('`char` pattern', () => { expect(u(unicodeChar(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); }); -test('`char` matching', () => { +test('`unicodeChar` matching', () => { expect(unicodeChar(0)).toMatchString('\u{0}'); expect(unicodeChar(0x1)).toMatchString('\u{1}'); expect(unicodeChar(0x12)).toMatchString('\u{12}}'); @@ -66,7 +66,7 @@ test('`char` matching', () => { expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); }); -test('`char` nesting matching', () => { +test('`unicodeChar` nesting matching', () => { expect( u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), ).toMatchString('a'); @@ -78,7 +78,7 @@ test('`char` nesting matching', () => { ).not.toMatchString('b'); }); -test('`char` edge cases handling', () => { +test('`unicodeChar` edge cases handling', () => { expect(() => u(unicodeChar(NaN))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received NaN"`, ); From 19bd5be53da3eb23f03ce8d8053b5a25ae0260e8 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 9 Sep 2024 08:57:38 +0200 Subject: [PATCH 18/19] refactor: re-expose `char` --- src/constructs/__tests__/unicode.test.tsx | 5 +++++ src/constructs/unicode.ts | 5 +++++ src/index.ts | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx index a079b5a..ca9994c 100644 --- a/src/constructs/__tests__/unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -1,5 +1,6 @@ import { buildRegExp, + char, charClass, endOfString, type RegexSequence, @@ -95,6 +96,10 @@ test('`unicodeChar` edge cases handling', () => { expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); }); +test('"char" alias', () => { + expect(char('a'.codePointAt(0)!)).toEqualRegex(/\u0061/); +}); + test('`unicodeProperty` pattern', () => { expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex( /\p{General_Category=Letter}/u, diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index 8140f11..c3874ea 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -29,6 +29,11 @@ export function unicodeChar(codePoint: number): CharacterEscape { }; } +/** + * Alias for `unicodeChar`. + */ +export const char = unicodeChar; + /** * Unicode property escape matching a set of characters specified by a Unicode property. * diff --git a/src/index.ts b/src/index.ts index 923918f..048ae92 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,4 +37,4 @@ export { negativeLookbehind } from './constructs/negative-lookbehind'; export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; -export { unicodeChar, unicodeProperty } from './constructs/unicode'; +export { char, unicodeChar, unicodeProperty } from './constructs/unicode'; From db8c2326e9f26f32f20dbb3559b8e8aab0d79a9e Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 9 Sep 2024 09:10:02 +0200 Subject: [PATCH 19/19] chore: tweak docs --- website/docs/api/unicode.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index fc1648b..abccb33 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -10,13 +10,15 @@ JavaScript `RegExp` object offers [Unicode-aware](https://developer.mozilla.org/ ### Character escapes ```ts -function char(codePoint: number): CharacterEscape; +function unicodeChar(codePoint: number): CharacterEscape; ``` +Alias: `char`. + Regex syntax: - `\uXXXX`: 4-digit hex escape for code points below 0x10000. -- `\u{X}`: Unicode code point escape for code points above 0xFFFF. +- `\u{XXXXXX}`: Unicode code point escape for code points above 0xFFFF. Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).