diff --git a/README.md b/README.md index 5fde5d3..658a891 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ A user-friendly regular expression builder for TypeScript and JavaScript. +[API docs](./docs/API.md) | [Examples](./docs/Examples.md) + ## Goal Regular expressions are a powerful tool for matching simple and complex text patterns, yet they are notorious for their hard-to-parse syntax. @@ -68,11 +70,11 @@ Terminology: Most of the regex constructs accept a regex sequence as their argument. Examples of sequences: +- single element (construct): `capture('abc')` +- single element (string): `'Hello'` - array of elements: `['USD', oneOrMore(digit)]` -- single construct: `capture('abc')` -- single string: `'Hello'` -Regex constructs can be composed into a tree: +Regex constructs can be composed into a tree structure: ```ts const currencyAmount = buildRegExp([ @@ -88,28 +90,25 @@ const currencyAmount = buildRegExp([ ]); ``` +A comprehensive API document is available [here](./docs/API.md). + ### Regex Builders -| Builder | Regex Pattern | Description | +| Builder | Regex Syntax | Description | | ---------------------------------------- | ------------- | ----------------------------------- | | `buildRegExp(...)` | `/.../` | Create `RegExp` instance | | `buildRegExp(..., { ignoreCase: true })` | `/.../i` | Create `RegExp` instance with flags | ### Regex Constructs -| Regex Construct | Regex Pattern | Notes | +| Construct | Regex Syntax | Notes | | ------------------- | ------------- | ------------------------------- | | `capture(...)` | `(...)` | Create a capture group | | `choiceOf(x, y, z)` | `x\|y\|z` | Match one of provided sequences | -Notes: - -- `capture` accepts a sequence of elements -- `choiceOf()` accepts a variable number of sequences - ### Quantifiers -| Regex Construct | Regex Pattern | Description | +| Quantifier | Regex Syntax | Description | | -------------------------------- | ------------- | ------------------------------------------------- | | `zeroOrMore(x)` | 
`x*` | Zero or more occurence of a pattern | | `oneOrMore(x)` | `x+` | One or more occurence of a pattern | @@ -118,11 +117,9 @@ Notes: | `repeat(x, { min: n, })` | `x{n,}` | Pattern repeats at least given number of times | | `repeat(x, { min: n, max: n2 })` | `x{n1,n2}` | Pattern repeats between n1 and n2 number of times | -All quantifiers accept sequence of elements - ### Character classes -| Regex Construct | Regex Pattern | Description | +| Character class | Regex Syntax | Description | | --------------------- | ------------- | ------------------------------------------- | | `any` | `.` | Any character | | `word` | `\w` | Word characters | @@ -133,25 +130,13 @@ All quantifiers accept sequence of elements | `charClass(...)` | `[...]` | Concatenation of multiple character classes | | `inverted(...)` | `[^...]` | Negation of a given character class | -Notes: - -- `any`, `word`, `digit`, `whitespace` are objects, no need to call them -- `anyOf` accepts a single string of characters to match -- `charRange` accepts exactly **two single character** strings representing range start and end (inclusive) -- `charClass` accepts a variable number of character classes to join into a single class -- `inverted` accepts a single character class to be inverted - ### Anchors -| Regex Construct | Regex Pattern | Description | +| Anchor | Regex Syntax | Description | | --------------- | ------------- | ---------------------------------------------------------------- | | `startOfString` | `^` | Match start of the string (or start of a line in multiline mode) | | `endOfString` | `$` | Match end of the string (or end of a line in multiline mode) | -Notes: - -- `startOfString`, `endOfString` are objects, no need to call them. - ## Examples See [Examples document](./docs/Examples.md). 
diff --git a/docs/API.md b/docs/API.md index 18b6cfc..70e4e2a 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1,86 +1,133 @@ # API +## Types + +### `RegexSequence` + +The sequence of regex elements forming a regular expression. For developer convenience it also accepts a single element instead of an array. + +### `RegexElement` + +Fundamental building blocks of a regular expression, defined as either a regex construct or a string. + +### `RegexConstruct` + +The common type for all regex constructs like character classes, quantifiers, and anchors. You should not need to use this type directly, it is returned by all regex construct functions. + +Note: the shape of the `RegexConstruct` is considered private, and may change in a breaking way without a major release. We will focus on maintaining the compatibility of regexes built with this library's public API. + + ## Builder -### `buildRegExp()` function +### `buildRegExp()` ```ts -function buildRegExp(sequence: RegexSequence): RegExp; - function buildRegExp( - sequence: RegexSequence, - flags: { - global?: boolean; - ignoreCase?: boolean; - multiline?: boolean; - hasIndices?: boolean; - sticky?: boolean; - }, + sequence: RegexSequence, + flags?: { + global?: boolean; + ignoreCase?: boolean; + multiline?: boolean; + hasIndices?: boolean; + }, ): RegExp; ``` +The `buildRegExp` is a top-level function responsible for building a JavaScript-native `RegExp` object from the passed regex sequence. + +It optionally accepts a list of regex flags: + +- `global` - find all matches in a string, instead of just the first one. +- `ignoreCase` - perform case-insensitive matching. +- `multiline` - treat the start and end of each line in a string as the beginning and end of the string. +- `hasIndices` - provide the start and end indices of each captured group in a match. + ## Constructs -### `capture()` +These functions and objects represent available regex constructs. 
-Captures, also known as capturing groups, are used to extract and store parts of the matched string for later use. +### `capture()` ```ts function capture( - sequence: RegexSequence + sequence: RegexSequence ): Capture ``` +Regex syntax: `(...)`. + +Captures, also known as capturing groups, are used to extract and store parts of the matched string for later use. + ### `choiceOf()` ```ts function choiceOf( - ...alternatives: RegexSequence[] + ...alternatives: RegexSequence[] ): ChoiceOf { ``` -The `choiceOf` (alternation) construct is used to match one out of several possible sequences. It functions similarly to a logical OR operator in programming. It can match simple string options as well as complex patterns. +Regex syntax: `a|b|c`. + +The `choiceOf` (disjunction) construct is used to match one out of several possible sequences. It functions similarly to a logical OR operator in programming. It can match simple string options as well as complex patterns. Example: `choiceOf("color", "colour")` matches either `color` or `colour` pattern. ## Quantifiers +Quantifiers in regex define the number of occurrences to match for a pattern. + ### `zeroOrMore()` ```ts function zeroOrMore( - sequence: RegexSequence, + sequence: RegexSequence, ): ZeroOrMore ``` +Regex syntax: `x*`. + +The `zeroOrMore` quantifier matches zero or more occurrences of the given pattern, allowing a flexible number of repetitions of that element. + ### `oneOrMore()` ```ts function oneOrMore( - sequence: RegexSequence, + sequence: RegexSequence, ): OneOrMore ``` +Regex syntax: `x+`. + +The `oneOrMore` quantifier matches one or more occurrences of the given pattern, allowing a flexible number of repetitions of that element. + ### `optionally()` ```ts function optionally( - sequence: RegexSequence, + sequence: RegexSequence, ): Optionally ``` +Regex syntax: `x?`. + +The `optionally` quantifier matches zero or one occurrence of the given pattern, making it optional. 
+ ### `repeat()` ```ts function repeat( - options: number | { min: number; max?: number }, - sequence: RegexSequence, + sequence: RegexSequence, + count: number | { min: number; max?: number }, ): Repeat ``` +Regex syntax: `x{n}`, `x{min,}`, `x{min,max}`. + +The `repeat` quantifier in regex matches either exactly `count` times or between `min` and `max` times. If only `min` is provided, it matches at least `min` times. + ## Character classes -Character classes are a set of characters that match any one of the characters in the set. +Character classes are a set of characters that match any one of the characters in the set. ### Common character classess @@ -91,80 +138,88 @@ const digit: CharacterClass; const whitespace: CharacterClass; ``` -* `any` matches any character except newline characters. -* `word` matches any word character (alphanumeric & underscore). -* `digit` matches any digit. -* `whitespace` matches any whitespace character (spaces, tabs, line breaks). +- `any` matches any character except newline characters. Regex syntax: `.`. +- `word` matches any word character (alphanumeric & underscore). Regex syntax: `\w`. +- `digit` matches any digit. Regex syntax: `\d`. +- `whitespace` matches any whitespace character (spaces, tabs, line breaks). Regex syntax: `\s`. ### `anyOf()` ```ts function anyOf( - characters: string, + characters: string, ): CharacterClass ``` +Regex syntax: `[abc]`. + The `anyOf` class matches any character present in the `character` string. Example: `anyOf('aeiou')` will match either `a`, `e`, `i` `o` or `u` characters. -### `characterRange()` +### `charRange()` ```ts -function characterRange( - start: string, - end: string, +function charRange( + start: string, + end: string, ): CharacterClass ``` -The `characterRange` class matches any character present in the range from `start` to `end` (inclusive). +Regex syntax: `[a-z]`. + +The `charRange` class matches any character present in the range from `start` to `end` (inclusive). 
Examples: -* `characterRange('a', 'z')` will match all lowercase characters from `a` to `z`. -* `characterRange('A', 'Z')` will match all uppercase characters from `a` to `z`. -* `characterRange('0', '9')` will match all digit characters from `0` to `9`. -### `characterClass()` +- `charRange('a', 'z')` will match all lowercase characters from `a` to `z`. +- `charRange('A', 'Z')` will match all uppercase characters from `A` to `Z`. +- `charRange('0', '9')` will match all digit characters from `0` to `9`. + +### `charClass()` ```ts -function characterClass( - ...elements: CharacterClass[], +function charClass( + ...elements: CharacterClass[], ): CharacterClass ``` -The `characterClass` construct creates a new character class that includes all passed character classes. +Regex syntax: `[...]`. -Example: -* `characterClass(characterRange('a', 'f'), digit)` will match all lowercase hex digits (`0` to `9` and `a` to `f`). -* `characterClass(characterRange('a', 'z'), digit, anyOf("._-"))` will match any digit, lowercase latin lettet from `a` to `z`, and either of `.`, `_`, and `-` characters. +The `charClass` construct creates a new character class that includes all passed character classes. + +Examples: + +- `charClass(charRange('a', 'f'), digit)` will match all lowercase hex digits (`0` to `9` and `a` to `f`). +- `charClass(charRange('a', 'z'), digit, anyOf("._-"))` will match any digit, lowercase Latin letter from `a` to `z`, and either of `.`, `_`, and `-` characters. ### `inverted()` ```ts function inverted( - element: CharacterClass, + element: CharacterClass, ): CharacterClass ``` +Regex syntax: `[^...]`. + The `inverted` construct creates a new character class that matches any character that is not present in the passed character class. Examples: -* `inverted(digit)` matches any character that is not a digit -* `inverted(anyOf('aeiou'))` matches any character that is not a lowercase vowel. 
- +- `inverted(digit)` matches any character that is not a digit +- `inverted(anyOf('aeiou'))` matches any character that is not a lowercase vowel. ## Anchors Anchors are special characters or sequences that specify positions in the input string, rather than matching specific characters. -### Line start and end +### Start and end of string ```ts -const startOfString: Anchor; // Regex: ^ -const endOfString: Anchor; // Regex: $ +const startOfString: Anchor; +const endOfString: Anchor; ``` -The `startOfString` (regex: `^`) matches the start of a string (or line, if multiline mode is enabled). - -The `endOfString` (regex: `$`) matches the end of a string (or line, if multiline mode is enabled). \ No newline at end of file +- `startOfString` anchor matches the start of a string (or line, if multiline mode is enabled). Regex syntax: `^`. +- `endOfString` anchor matches the end of a string (or line, if multiline mode is enabled). Regex syntax: `$`. diff --git a/docs/Examples.md b/docs/Examples.md index 7333a45..10c891a 100644 --- a/docs/Examples.md +++ b/docs/Examples.md @@ -1,5 +1,21 @@ # Regex Examples +## JavaScript number + +```ts +const optionalSign = optionally(anyOf('+-')); +const exponent = [anyOf('eE'), optionalSign, oneOrMore(digit)]; + +const regex = buildRegExp([ + optionalSign, + choiceOf( + [oneOrMore(digit), optionally(['.', zeroOrMore(digit)])], // leading digit + ['.', oneOrMore(digit)], // leading dot + ), + optionally(exponent), // exponent +]); +``` + ## IPv4 address validation ```ts @@ -9,7 +25,7 @@ const octet = choiceOf( [charRange('1', '9'), digit], ['1', repeat(digit, 2)], ['2', charRange('0', '4'), digit], - ['25', charRange('0', '5')] + ['25', charRange('0', '5')], ); // Match diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 7377430..d9e18f8 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -16,9 +16,6 @@ test('`regexBuilder` flags', () => { expect(buildRegExp('a', { hasIndices: 
true }).flags).toBe('d'); expect(buildRegExp('a', { hasIndices: false }).flags).toBe(''); - expect(buildRegExp('a', { sticky: true }).flags).toBe('y'); - expect(buildRegExp('a', { sticky: false }).flags).toBe(''); - expect( buildRegExp('a', { global: true, // diff --git a/src/__tests__/examples.test.ts b/src/__tests__/examples.test.ts index de4b072..0155b4d 100644 --- a/src/__tests__/examples.test.ts +++ b/src/__tests__/examples.test.ts @@ -1,13 +1,56 @@ import { + anyOf, buildRegExp, charRange, choiceOf, digit, endOfString, + oneOrMore, + optionally, repeat, startOfString, + zeroOrMore, } from '../index'; +test('example: validate JavaScript number', () => { + const optionalSign = optionally(anyOf('+-')); + const exponent = [anyOf('eE'), optionalSign, oneOrMore(digit)]; + + const regex = buildRegExp([ + startOfString, + optionalSign, + choiceOf( + [oneOrMore(digit), optionally(['.', zeroOrMore(digit)])], // leading digit + ['.', oneOrMore(digit)], // leading dot + ), + optionally(exponent), // exponent + endOfString, + ]); + + expect(regex).toMatchString('0'); + expect(regex).toMatchString('-1'); + expect(regex).toMatchString('+1'); + expect(regex).toMatchString('1.0'); + expect(regex).toMatchString('1.1234'); + expect(regex).toMatchString('1.'); + expect(regex).toMatchString('.1'); + expect(regex).toMatchString('-.1234'); + expect(regex).toMatchString('+.5'); + expect(regex).toMatchString('1e21'); + expect(regex).toMatchString('1e-21'); + expect(regex).toMatchString('+1e+42'); + expect(regex).toMatchString('-1e-42'); + + expect(regex).not.toMatchString(''); + expect(regex).not.toMatchString('a'); + expect(regex).not.toMatchString('1a'); + expect(regex).not.toMatchString('1.0.'); + expect(regex).not.toMatchString('.1.1'); + expect(regex).not.toMatchString('.'); + + expect(regex).toHavePattern(/^[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?$/); +}); + test('example: IPv4 address validator', () => { const octet = choiceOf( [digit], diff --git a/src/builders.ts 
b/src/builders.ts index b550299..6b48f82 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -31,7 +31,6 @@ function encodeFlags(flags: RegexFlags): string { if (flags.ignoreCase) result += 'i'; if (flags.multiline) result += 'm'; if (flags.hasIndices) result += 'd'; - if (flags.sticky) result += 'y'; return result; } diff --git a/src/constructs/repeat.ts b/src/constructs/repeat.ts index 26a9d48..bf35e2f 100644 --- a/src/constructs/repeat.ts +++ b/src/constructs/repeat.ts @@ -5,13 +5,13 @@ import type { RegexConstruct, RegexElement, RegexSequence } from '../types'; export interface Repeat extends RegexConstruct { type: 'repeat'; - options: RepeatOptions; + count: RepeatCount; children: RegexElement[]; } -export type RepeatOptions = number | { min: number; max?: number }; +export type RepeatCount = number | { min: number; max?: number }; -export function repeat(sequence: RegexSequence, options: RepeatOptions): Repeat { +export function repeat(sequence: RegexSequence, count: RepeatCount): Repeat { const children = ensureArray(sequence); if (children.length === 0) { @@ -21,7 +21,7 @@ export function repeat(sequence: RegexSequence, options: RepeatOptions): Repeat return { type: 'repeat', children, - options, + count: count, encode: encodeRepeat, }; } @@ -29,15 +29,15 @@ export function repeat(sequence: RegexSequence, options: RepeatOptions): Repeat function encodeRepeat(this: Repeat): EncodeResult { const atomicNodes = encodeAtom(this.children); - if (typeof this.options === 'number') { + if (typeof this.count === 'number') { return { precedence: 'sequence', - pattern: `${atomicNodes.pattern}{${this.options}}`, + pattern: `${atomicNodes.pattern}{${this.count}}`, }; } return { precedence: 'sequence', - pattern: `${atomicNodes.pattern}{${this.options.min},${this.options?.max ?? ''}}`, + pattern: `${atomicNodes.pattern}{${this.count.min},${this.count?.max ?? 
''}}`, }; } diff --git a/src/types.ts b/src/types.ts index 6d57ad5..c3b8cef 100644 --- a/src/types.ts +++ b/src/types.ts @@ -23,18 +23,15 @@ export interface RegexConstruct { } export interface RegexFlags { - /** Global search. */ + /** Find all matches in a string, instead of just the first one. */ global?: boolean; - /** Case-insensitive search. */ + /** Perform case-insensitive matching. */ ignoreCase?: boolean; - /** Allows ^ and $ to match newline characters. */ + /** Treat the start and end of each line in a string as the beginning and end of the string. */ multiline?: boolean; - /** Generate indices for substring matches. */ + /** Generate the start and end indices of each captured group in a match. */ hasIndices?: boolean; - - /** Perform a "sticky" search that matches starting at the current position in the target string. */ - sticky?: boolean; }